Diff 170403

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 805 Lines • ▼ Show 20 Lines	if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);		addPass(&EarlyIfConverterID);

TargetPassConfig::addILPOpts();		TargetPassConfig::addILPOpts();
return false;		return false;
}		}

bool GCNPassConfig::addInstSelector() {		bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();		AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);		addPass(&SIFixSGPRCopiesID);
		addPass(createSILowerI1CopiesPass());
return false;		return false;
}		}

bool GCNPassConfig::addIRTranslator() {		bool GCNPassConfig::addIRTranslator() {
addPass(new IRTranslator());		addPass(new IRTranslator());
return false;		return false;
}		}

▲ Show 20 Lines • Show All 90 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Show First 20 Lines • Show All 177 Lines • ▼ Show 20 Lines	const TargetRegisterClass *DstRC =
TRI.getPhysRegClass(DstReg);		TRI.getPhysRegClass(DstReg);

return std::make_pair(SrcRC, DstRC);		return std::make_pair(SrcRC, DstRC);
}		}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,		static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,		const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {		const SIRegisterInfo &TRI) {
return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);		return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
		TRI.hasVGPRs(SrcRC);
		arsenmUnsubmitted Not Done Reply Inline Actions Could we avoid still having VReg_1 by checking for i1 source instructions? arsenm: Could we avoid still having VReg_1 by checking for i1 source instructions?
		nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions I don't know what you mean. At this point in the compilation we can't link things back to IR anymore, so VReg_1 is actually a convenient way of indicating that a value was originally an i1. nhaehnle: I don't know what you mean. At this point in the compilation we can't link things back to IR…
}		}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,		static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,		const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {		const SIRegisterInfo &TRI) {
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);		return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
		TRI.hasVGPRs(DstRC);
}		}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,		static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
const SIRegisterInfo *TRI,		const SIRegisterInfo *TRI,
const SIInstrInfo *TII) {		const SIInstrInfo *TII) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();		MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
auto &Src = MI.getOperand(1);		auto &Src = MI.getOperand(1);
unsigned DstReg = MI.getOperand(0).getReg();		unsigned DstReg = MI.getOperand(0).getReg();
▲ Show 20 Lines • Show All 513 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SILowerI1Copies.cpp

	//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//			//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
	//			//
	// The LLVM Compiler Infrastructure			// The LLVM Compiler Infrastructure
	//			//
	// This file is distributed under the University of Illinois Open Source			// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.			// License. See LICENSE.TXT for details.
	//			//
	/// i1 values are usually inserted by the CFG Structurize pass and they are
	/// unique in that they can be copied from VALU to SALU registers.
	/// This is not possible for any other value type. Since there are no
	/// MOV instructions for i1, we need to use V_CMP_* and V_CNDMASK to move the i1.
	///
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
				// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
				// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
				// and a wave-level control flow graph.
				//
				// Before this pass, values that are semantically i1 and are defined and used
				// within the same basic block are already represented as lane masks in scalar
				// registers. However, values that cross basic blocks are always transferred
				// between basic blocks in vreg_1 virtual registers and are lowered by this
				// pass.
				//
				// The only instructions that use or define vreg_1 virtual registers are COPY,
				// PHI, and IMPLICIT_DEF.
				//
				//===----------------------------------------------------------------------===//

	#define DEBUG_TYPE "si-i1-copies"
	#include "AMDGPU.h"			#include "AMDGPU.h"
	#include "AMDGPUSubtarget.h"			#include "AMDGPUSubtarget.h"
	#include "SIInstrInfo.h"			#include "SIInstrInfo.h"
	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"			#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
	#include "Utils/AMDGPULaneDominator.h"
	#include "llvm/CodeGen/LiveIntervals.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"			#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"			#include "llvm/CodeGen/MachineInstrBuilder.h"
				#include "llvm/CodeGen/MachineDominators.h"
				#include "llvm/CodeGen/MachinePostDominators.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"			#include "llvm/CodeGen/MachineRegisterInfo.h"
				#include "llvm/CodeGen/MachineSSAUpdater.h"
	#include "llvm/IR/Function.h"			#include "llvm/IR/Function.h"
	#include "llvm/IR/LLVMContext.h"			#include "llvm/IR/LLVMContext.h"
	#include "llvm/Support/Debug.h"			#include "llvm/Support/Debug.h"
	#include "llvm/Target/TargetMachine.h"			#include "llvm/Target/TargetMachine.h"

				#define DEBUG_TYPE "si-i1-copies"

	using namespace llvm;			using namespace llvm;

				static unsigned createLaneMaskReg(MachineFunction &MF);
				static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

	namespace {			namespace {

	class SILowerI1Copies : public MachineFunctionPass {			class SILowerI1Copies : public MachineFunctionPass {
	public:			public:
	static char ID;			static char ID;

				private:
				MachineFunction *MF = nullptr;
				MachineDominatorTree *DT = nullptr;
				MachinePostDominatorTree *PDT = nullptr;
				MachineRegisterInfo *MRI = nullptr;
				const SIInstrInfo *TII = nullptr;

				DenseSet<unsigned> ConstrainRegs;

	public:			public:
	SILowerI1Copies() : MachineFunctionPass(ID) {			SILowerI1Copies() : MachineFunctionPass(ID) {
	initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());			initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
	}			}

	bool runOnMachineFunction(MachineFunction &MF) override;			bool runOnMachineFunction(MachineFunction &MF) override;

	StringRef getPassName() const override { return "SI Lower i1 Copies"; }			StringRef getPassName() const override { return "SI Lower i1 Copies"; }

	void getAnalysisUsage(AnalysisUsage &AU) const override {			void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();			AU.setPreservesCFG();
				AU.addRequired<MachineDominatorTree>();
				AU.addRequired<MachinePostDominatorTree>();
	MachineFunctionPass::getAnalysisUsage(AU);			MachineFunctionPass::getAnalysisUsage(AU);
	}			}

				private:
				void lowerCopiesFromI1();
				void lowerPhis();
				void lowerCopiesToI1();
				bool isConstantLaneMask(unsigned Reg, bool &Val) const;
				void buildMergeLaneMasks(MachineBasicBlock &MBB,
				MachineBasicBlock::iterator I, const DebugLoc &DL,
				unsigned DstReg, unsigned PrevReg, unsigned CurReg);

				/// Returns true if \p Reg is the physical register VCC or EXEC, or a
				/// virtual register whose register class is SReg_64 or SReg_64_XEXEC.
				/// Every other physical register yields false.
				bool isLaneMaskReg(unsigned Reg) const {
				// The two physical registers that are accepted as lane masks.
				if (Reg == AMDGPU::VCC \|\| Reg == AMDGPU::EXEC)
				return true;
				rampitecUnsubmitted Done Reply Inline Actions TII->getRegisterInfo().getRegSizeInBits(Reg, MRI) == ST.getWavefrontSize(); rampitec:* ``` TII->getRegisterInfo().getRegSizeInBits(Reg, *MRI) == ST.getWavefrontSize(); ```

				arsenmUnsubmitted Done Reply Inline Actions Should this worry about sub registers? Can't this just use TRI.getRegClass instead of worrying about physical registers itself arsenm: Should this worry about sub registers? Can't this just use TRI.getRegClass instead of worrying…
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions Simplified to using TRI instead of listing register classes explicitly. I don't think subregisters come into it at this point. At least the cases I've tried all ended up scalarized and I was unable to get subregisters to appear. Anyway, I'm adding some assertions about subregisters just in case. nhaehnle: Simplified to using TRI instead of listing register classes explicitly. I don't think…
				// MRI->getRegClass() is only queried for virtual registers below.
				if (!TargetRegisterInfo::isVirtualRegister(Reg))
				return false;

				const TargetRegisterClass *RC = MRI->getRegClass(Reg);
				return RC == &AMDGPU::SReg_64RegClass \|\|
				RC == &AMDGPU::SReg_64_XEXECRegClass;
				arsenmUnsubmitted Done Reply Inline Actions Could also be SGPR_64? I would prefer avoiding listing the register classes in another place arsenm: Could also be SGPR_64? I would prefer avoiding listing the register classes in another place
				}
				};

				/// Helper class that determines the relationship between incoming values of a
				/// phi in the control flow graph to determine where an incoming value can
				/// simply be taken as a scalar lane mask as-is, and where it needs to be
				/// merged with another, previously defined lane mask.
				///
				/// The approach is as follows:
				/// - Determine all basic blocks which, starting from the incoming blocks,
				/// a wave may reach before entering the def block (the block containing the
				/// phi).
				/// - If an incoming block has no predecessors in this set, we can take the
				/// incoming value as a scalar lane mask as-is.
				/// -- A special case of this is when the def block has a self-loop.
				/// - Otherwise, the incoming value needs to be merged with a previously
				/// defined lane mask.
				/// - If there is a path into the set of reachable blocks that does _not_ go
				/// through an incoming block where we can take the scalar lane mask as-is,
				/// we need to invent an available value for the SSAUpdater. Choices are
				/// 0 and undef, with differing consequences for how to merge values etc.
				///
				/// TODO: We could use region analysis to quickly skip over SESE regions during
				/// the traversal.
				///
				class PhiIncomingAnalysis {
				MachinePostDominatorTree &PDT;

				// For each reachable basic block, whether it is a source in the induced
				// subgraph of the CFG.
				DenseMap<MachineBasicBlock *, bool> ReachableMap;
				SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
				SmallVector<MachineBasicBlock *, 4> Stack;
				SmallVector<MachineBasicBlock *, 4> Predecessors;

				public:
				PhiIncomingAnalysis(MachinePostDominatorTree &PDT)
				: PDT(PDT) {}

				/// Returns whether \p MBB is a source in the induced subgraph of reachable
				/// blocks.
				///
				/// \p MBB must be a block recorded by the most recent call to analyze();
				/// querying any other block is a programming error and is caught by an
				/// assertion instead of silently dereferencing the end iterator.
				bool isSource(MachineBasicBlock &MBB) const {
				  auto It = ReachableMap.find(&MBB);
				  assert(It != ReachableMap.end() &&
				         "isSource() queried for a block not recorded by analyze()");
				  return It->second;
				}

				ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

				/// Recompute the reachability information for a phi located in
				/// \p DefBlock whose incoming values arrive from \p IncomingBlocks.
				///
				/// Results of a previous call are discarded. Afterwards isSource() may be
				/// queried for every recorded block, and predecessors() lists the
				/// unrecorded predecessors of recorded blocks that also have a recorded
				/// predecessor.
				void analyze(MachineBasicBlock &DefBlock,
				ArrayRef<MachineBasicBlock *> IncomingBlocks) {
				// Stack doubles as scratch storage below and must start out empty.
				assert(Stack.empty());
				ReachableMap.clear();
				ReachableOrdered.clear();
				Predecessors.clear();

				// Insert the def block first, so that it acts as an end point for the
				// traversal.
				ReachableMap.try_emplace(&DefBlock, false);
				ReachableOrdered.push_back(&DefBlock);

				// Record every incoming block and seed the worklist with the successors
				// of those incoming blocks that end in a divergent terminator.
				for (MachineBasicBlock *MBB : IncomingBlocks) {
				if (MBB == &DefBlock) {
				ReachableMap[&DefBlock] = true; // self-loop on DefBlock
				continue;
				}

				ReachableMap.try_emplace(MBB, false);
				ReachableOrdered.push_back(MBB);

				// If this block has a divergent terminator and the def block is its
				// post-dominator, the wave may first visit the other successors.
				bool Divergent = false;
				for (MachineInstr &MI : MBB->terminators()) {
				if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO \|\|
				MI.getOpcode() == AMDGPU::SI_IF \|\|
				MI.getOpcode() == AMDGPU::SI_ELSE \|\|
				MI.getOpcode() == AMDGPU::SI_LOOP) {
				Divergent = true;
				break;
				}
				arsenmUnsubmitted Not Done Reply Inline Actions Separate function? arsenm: Separate function?
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions I don't think that helps. nhaehnle: I don't think that helps.
				}

				if (Divergent && PDT.dominates(&DefBlock, MBB)) {
				for (MachineBasicBlock *Succ : MBB->successors())
				Stack.push_back(Succ);
				}
				}

				// Depth-first walk over successors. A block that is already recorded
				// (which includes the def block) is not expanded again.
				while (!Stack.empty()) {
				MachineBasicBlock *MBB = Stack.pop_back_val();
				if (!ReachableMap.try_emplace(MBB, false).second)
				continue;
				ReachableOrdered.push_back(MBB);

				for (MachineBasicBlock *Succ : MBB->successors())
				Stack.push_back(Succ);
				}

				// Classify every recorded block by looking at its predecessors. Stack
				// is reused here to collect the unrecorded predecessors of one block.
				for (MachineBasicBlock *MBB : ReachableOrdered) {
				bool HaveReachablePred = false;
				for (MachineBasicBlock *Pred : MBB->predecessors()) {
				if (ReachableMap.count(Pred)) {
				HaveReachablePred = true;
				} else {
				Stack.push_back(Pred);
				}
				}
				// No recorded predecessor at all: the block is a source.
				if (!HaveReachablePred)
				ReachableMap[MBB] = true;
				// Mixed predecessors: remember each unrecorded one exactly once.
				if (HaveReachablePred) {
				for (MachineBasicBlock *UnreachablePred : Stack) {
				if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
				Predecessors.push_back(UnreachablePred);
				}
				}
				Stack.clear();
				}
				}
				};

				/// Helper class that detects loops which require us to lower an i1 COPY into
				/// bitwise manipulation.
				///
				/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
				/// between loops with the same header. Consider this example:
				///
				/// A-+-+
				/// \| \| \|
				/// B-+ \|
				/// \| \|
				/// C---+
				///
				/// A is the header of a loop containing A, B, and C as far as LoopInfo is
				/// concerned. However, an i1 COPY in B that is used in C must be lowered to
				/// bitwise operations to combine results from different loop iterations when
				/// B has a divergent branch (since by default we will compile this code such
				/// that threads in a wave are merged at the entry of C).
				///
				/// The following rule is implemented to determine whether bitwise operations
				/// are required: use the bitwise lowering for a def in block B if a backward
				/// edge to B is reachable without going through the nearest common
				/// post-dominator of B and all uses of the def.
				///
				/// TODO: This rule is conservative because it does not check whether the
				/// relevant branches are actually divergent.
				///
				/// The class is designed to cache the CFG traversal so that it can be re-used
				/// for multiple defs within the same basic block.
				///
				/// TODO: We could use region analysis to quickly skip over SESE regions during
				/// the traversal.
				///
				class LoopFinder {
				MachineDominatorTree &DT;
				MachinePostDominatorTree &PDT;

				// All visited / reachable blocks, tagged by level (level 0 is the def block,
				// level 1 are all blocks reachable including but not going through the def
				// block's IPDOM, etc.).
				DenseMap<MachineBasicBlock *, unsigned> Visited;

				// Nearest common dominator of all visited blocks by level (level 0 is the
				// def block). Used for seeding the SSAUpdater.
				SmallVector<MachineBasicBlock *, 4> CommonDominators;

				// Post-dominator of all visited blocks.
				MachineBasicBlock *VisitedPostDom = nullptr;

				// Level at which a loop was found: 0 is not possible; 1 = a backward edge is
				// reachable without going through the IPDOM of the def block (if the IPDOM
				// itself has an edge to the def block, the loop level is 2), etc.
				unsigned FoundLoopLevel = ~0u;

				MachineBasicBlock *DefBlock = nullptr;
				SmallVector<MachineBasicBlock *, 4> Stack;
				SmallVector<MachineBasicBlock *, 4> NextLevel;

				public:
				LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
				: DT(DT), PDT(PDT) {}

				/// Reset all cached traversal state and make \p MBB the def block for
				/// subsequent findLoop() / addLoopEntries() queries.
				void initialize(MachineBasicBlock &MBB) {
				  DefBlock = &MBB;

				  // Scalar state goes back to its "nothing visited, no loop found" values.
				  FoundLoopLevel = ~0u;
				  VisitedPostDom = nullptr;

				  // Drop everything accumulated for the previous def block.
				  NextLevel.clear();
				  Stack.clear();
				  CommonDominators.clear();
				  Visited.clear();
				}

				/// Walk up the post-dominator tree from the def block towards \p PostDom,
				/// extending the cached traversal level by level as needed, and report
				/// whether a backward edge to the def block was found on the way.
				///
				/// Returns the level at which the loop was found, or 0 if \p PostDom is
				/// reached without the recorded loop level matching any level on the way.
				unsigned findLoop(MachineBasicBlock *PostDom) {
				  // Make sure level 0 (the def block itself) has been traversed.
				  if (!VisitedPostDom)
				    advanceLevel();

				  MachineDomTreeNode *Node = PDT.getNode(DefBlock);
				  for (unsigned Depth = 1; Node->getBlock() != PostDom; ++Depth) {
				    // The traversal has only been cached up to this node; extend it.
				    if (Node->getBlock() == VisitedPostDom)
				      advanceLevel();

				    Node = Node->getIDom();
				    if (FoundLoopLevel == Depth)
				      return Depth;
				  }

				  return 0;
				}

				/// Add undef values dominating the loop and the optionally given additional
				/// blocks, so that the SSA updater doesn't have to search all the way to the
				/// function entry.
				void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
				ArrayRef<MachineBasicBlock *> Blocks = {}) {
				assert(LoopLevel < CommonDominators.size());

				MachineBasicBlock *Dom = CommonDominators[LoopLevel];
				for (MachineBasicBlock *MBB : Blocks)
				Dom = DT.findNearestCommonDominator(Dom, MBB);

				if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
				SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
				} else {
				// The dominator is part of the loop or the given blocks, so add the
				// undef value to unreachable predecessors instead.
				for (MachineBasicBlock *Pred : Dom->predecessors()) {
				if (!inLoopLevel(*Pred, LoopLevel, Blocks))
				SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
				}
				}
				}

				private:
				/// Returns true if \p MBB was visited at a level not greater than
				/// \p LoopLevel, or if it is one of the explicitly listed \p Blocks.
				bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
				                 ArrayRef<MachineBasicBlock *> Blocks) const {
				  auto VisitedIt = Visited.find(&MBB);
				  const bool VisitedInLevel =
				      VisitedIt != Visited.end() && VisitedIt->second <= LoopLevel;

				  return VisitedInLevel || llvm::find(Blocks, &MBB) != Blocks.end();
				}

				/// Extend the cached traversal by one level.
				///
				/// The first call seeds the traversal with the def block. Each later
				/// call moves VisitedPostDom to its immediate post-dominator and resumes
				/// the blocks that were deferred in NextLevel and are post-dominated by
				/// the new VisitedPostDom. Every block taken off the worklist is tagged
				/// with the current level in Visited, and the nearest common dominator of
				/// everything visited so far is appended to CommonDominators.
				void advanceLevel() {
				MachineBasicBlock *VisitedDom;

				if (!VisitedPostDom) {
				// First level: start from the def block itself.
				VisitedPostDom = DefBlock;
				VisitedDom = DefBlock;
				Stack.push_back(DefBlock);
				} else {
				// NOTE(review): assumes VisitedPostDom has an immediate post-dominator
				// node with a block -- confirm against callers.
				VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
				VisitedDom = CommonDominators.back();

				// Move deferred blocks that the new VisitedPostDom post-dominates back
				// onto the worklist.
				for (unsigned i = 0; i < NextLevel.size();) {
				if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
				Stack.push_back(NextLevel[i]);

				// Swap-remove entry i; the order of NextLevel is not preserved, and i
				// is not advanced so the moved element is examined next.
				NextLevel[i] = NextLevel.back();
				NextLevel.pop_back();
				} else {
				i++;
				}
				}
				}

				// The index the new CommonDominators entry will get is this level.
				unsigned Level = CommonDominators.size();
				while (!Stack.empty()) {
				MachineBasicBlock *MBB = Stack.pop_back_val();
				// Blocks not post-dominated by VisitedPostDom are additionally queued
				// for a later level.
				if (!PDT.dominates(VisitedPostDom, MBB))
				NextLevel.push_back(MBB);

				Visited[MBB] = Level;
				VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

				for (MachineBasicBlock *Succ : MBB->successors()) {
				if (Succ == DefBlock) {
				// Edge back to the def block: record the smallest level at which such
				// an edge was seen. An edge leaving VisitedPostDom itself counts for
				// the next level.
				if (MBB == VisitedPostDom)
				FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
				else
				FoundLoopLevel = std::min(FoundLoopLevel, Level);
				continue;
				}

				// First sighting of Succ: insert it with a placeholder level of ~0u.
				// Successors of VisitedPostDom are deferred to a later level; all
				// others are traversed in this level.
				if (Visited.try_emplace(Succ, ~0u).second) {
				if (MBB == VisitedPostDom)
				NextLevel.push_back(Succ);
				else
				Stack.push_back(Succ);
				}
				}
				}

				CommonDominators.push_back(VisitedDom);
				}
	};			};

	} // End anonymous namespace.			} // End anonymous namespace.

	INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,			INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE,
				"SI Lower i1 Copies", false, false)
				INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
				INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
				INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE,
	"SI Lower i1 Copies", false, false)			"SI Lower i1 Copies", false, false)

	char SILowerI1Copies::ID = 0;			char SILowerI1Copies::ID = 0;

	char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;			char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

	FunctionPass *llvm::createSILowerI1CopiesPass() {			FunctionPass *llvm::createSILowerI1CopiesPass() {
	return new SILowerI1Copies();			return new SILowerI1Copies();
	}			}

	bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {			static unsigned createLaneMaskReg(MachineFunction &MF) {
	MachineRegisterInfo &MRI = MF.getRegInfo();			MachineRegisterInfo &MRI = MF.getRegInfo();
				return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
				}

				static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
				MachineFunction &MF = *MBB.getParent();
	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();			const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
	const SIInstrInfo *TII = ST.getInstrInfo();			const SIInstrInfo *TII = ST.getInstrInfo();
	const TargetRegisterInfo *TRI = &TII->getRegisterInfo();			unsigned UndefReg = createLaneMaskReg(MF);
				BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
	std::vector<unsigned> I1Defs;			UndefReg);
				return UndefReg;
				}

	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();			/// Lower all instructions that def or use vreg_1 registers.
	BI != BE; ++BI) {			///
				/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
				/// occur around inline assembly. We do this first, before vreg_1 registers
				/// are changed to scalar mask registers.
				///
				/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
				/// all others, because phi lowering looks through copies and can therefore
				/// often make copy lowering unnecessary.
				bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
				MF = &TheMF;
				MRI = &MF->getRegInfo();
				DT = &getAnalysis<MachineDominatorTree>();
				PDT = &getAnalysis<MachinePostDominatorTree>();

				const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
				TII = ST.getInstrInfo();

				lowerCopiesFromI1();
				lowerPhis();
				lowerCopiesToI1();

				for (unsigned Reg : ConstrainRegs)
				MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
				ConstrainRegs.clear();

	MachineBasicBlock &MBB = *BI;			return true;
	MachineBasicBlock::iterator I, Next;
	for (I = MBB.begin(); I != MBB.end(); I = Next) {
	Next = std::next(I);
	MachineInstr &MI = *I;

	if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
	unsigned Reg = MI.getOperand(0).getReg();
	const TargetRegisterClass *RC = MRI.getRegClass(Reg);
	if (RC == &AMDGPU::VReg_1RegClass)
	MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
	continue;
	}			}

				void SILowerI1Copies::lowerCopiesFromI1() {
				SmallVector<MachineInstr *, 4> DeadCopies;

				for (MachineBasicBlock &MBB : *MF) {
				for (MachineInstr &MI : MBB) {
	if (MI.getOpcode() != AMDGPU::COPY)			if (MI.getOpcode() != AMDGPU::COPY)
	continue;			continue;

	const MachineOperand &Dst = MI.getOperand(0);			unsigned DstReg = MI.getOperand(0).getReg();
	const MachineOperand &Src = MI.getOperand(1);			unsigned SrcReg = MI.getOperand(1).getReg();
				if (!TargetRegisterInfo::isVirtualRegister(SrcReg) \|\|
	if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) \|\|			MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
	!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
	continue;			continue;

	const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());			if (isLaneMaskReg(DstReg) \|\|
	const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());			(TargetRegisterInfo::isVirtualRegister(DstReg) &&
				MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
				continue;

				// Copy into a 32-bit vector register.
				LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
	DebugLoc DL = MI.getDebugLoc();			DebugLoc DL = MI.getDebugLoc();
	MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
	if (DstRC == &AMDGPU::VReg_1RegClass &&
	TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
	I1Defs.push_back(Dst.getReg());

	if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {			assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
	if (DefInst->getOperand(1).isImm()) {
	I1Defs.push_back(Dst.getReg());

	int64_t Val = DefInst->getOperand(1).getImm();			ConstrainRegs.insert(SrcReg);
	assert(Val == 0 \|\| Val == -1);			BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
				.addImm(0)
				.addImm(-1)
				.addReg(SrcReg);
				DeadCopies.push_back(&MI);
				}

				for (MachineInstr *MI : DeadCopies)
				MI->eraseFromParent();
				DeadCopies.clear();
				}
				}

	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))			void SILowerI1Copies::lowerPhis() {
	.add(Dst)			MachineSSAUpdater SSAUpdater(*MF);
	.addImm(Val);			LoopFinder LF(*DT, *PDT);
	MI.eraseFromParent();			PhiIncomingAnalysis PIA(*PDT);
				SmallVector<MachineInstr *, 4> DeadPhis;
				SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
				SmallVector<unsigned, 4> IncomingRegs;
				SmallVector<unsigned, 4> IncomingUpdated;

				for (MachineBasicBlock &MBB : *MF) {
				LF.initialize(MBB);

				for (MachineInstr &MI : MBB.phis()) {
				if (!MI.isPHI())
				continue;
				arsenmUnsubmitted Not Done Reply Inline Actions Why is this necessary with phis() arsenm: Why is this necessary with phis()
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions You're right, it's not. nhaehnle: You're right, it's not.

				unsigned DstReg = MI.getOperand(0).getReg();
				if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
				continue;

				LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

				MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);

				// Collect incoming values.
				alex-tUnsubmitted Not Done Reply Inline Actions How do you suppose this assert can be hit? Given well formed PHI node. alex-t: How do you suppose this assert can be hit? Given well formed PHI node.
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions Yeah, that's why it's an assert. Admittedly not the most important one. nhaehnle: Yeah, that's why it's an assert. Admittedly not the most important one.
				for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
				assert(i + 1 < MI.getNumOperands());
				unsigned IncomingReg = MI.getOperand(i).getReg();
				alex-tUnsubmitted Not Done Reply Inline Actions Are you sure that IncomingDef will never be null? What if IncomingReg is the subreg? alex-t: Are you sure that IncomingDef will never be null? What if IncomingReg is the subreg?
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions Neither of those should ever happen since the input is in machine SSA after isel. nhaehnle: Neither of those should ever happen since the input is in machine SSA after isel.
				MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
				MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

				if (IncomingDef->getOpcode() == AMDGPU::COPY) {
				IncomingReg = IncomingDef->getOperand(1).getReg();
				assert(isLaneMaskReg(IncomingReg));
				} else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
	continue;			continue;
				} else {
				assert(IncomingDef->isPHI());
	}			}

				IncomingBlocks.push_back(IncomingMBB);
				IncomingRegs.push_back(IncomingReg);
	}			}

	unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);			// Phis in a loop that are observed outside the loop receive a simple but
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)			// conservatively correct treatment.
	.add(Src);			MachineBasicBlock *PostDomBound = &MBB;
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))			for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
	.add(Dst)			PostDomBound =
	.addImm(0)			PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
	.addImm(-1)			}
	.addReg(TmpSrc);
	MI.eraseFromParent();			unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
	} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&
	SrcRC == &AMDGPU::VReg_1RegClass) {			SSAUpdater.Initialize(DstReg);
	if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&
	DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&			if (FoundLoopLevel) {
	DefInst->getOperand(1).getImm() == 0 &&			LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
	DefInst->getOperand(2).getImm() != 0 &&
	DefInst->getOperand(3).isReg() &&			for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
	TargetRegisterInfo::isVirtualRegister(			IncomingUpdated.push_back(createLaneMaskReg(*MF));
	DefInst->getOperand(3).getReg()) &&			SSAUpdater.AddAvailableValue(IncomingBlocks[i],
	TRI->getCommonSubClass(			IncomingUpdated.back());
	MRI.getRegClass(DefInst->getOperand(3).getReg()),			}
	&AMDGPU::SGPR_64RegClass) &&
	AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {			for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))			MachineBasicBlock &IMBB = *IncomingBlocks[i];
	.add(Dst)			buildMergeLaneMasks(IMBB, IMBB.getFirstInstrTerminator(), {},
	.addReg(AMDGPU::EXEC)			IncomingUpdated[i],
	.add(DefInst->getOperand(3));			SSAUpdater.GetValueInMiddleOfBlock(&IMBB),
				IncomingRegs[i]);
				}
	} else {			} else {
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))			// The phi is not observed from outside a loop. Use a more accurate
	.add(Dst)			// lowering.
	.add(Src)			PIA.analyze(MBB, IncomingBlocks);

				for (MachineBasicBlock *MBB : PIA.predecessors())
				SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

				for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
				MachineBasicBlock &IMBB = *IncomingBlocks[i];
				if (PIA.isSource(IMBB)) {
				IncomingUpdated.push_back(0);
				SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
				} else {
				IncomingUpdated.push_back(createLaneMaskReg(*MF));
				SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
				}
				}

				for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
				if (!IncomingUpdated[i])
				continue;

				MachineBasicBlock &IMBB = *IncomingBlocks[i];
				buildMergeLaneMasks(IMBB, IMBB.getFirstInstrTerminator(), {},
				IncomingUpdated[i],
				SSAUpdater.GetValueInMiddleOfBlock(&IMBB),
				IncomingRegs[i]);
				}
				}

				unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
				if (NewReg != DstReg) {
				MRI->replaceRegWith(NewReg, DstReg);

				// Ensure that DstReg has a single def and mark the old PHI node for
				// deletion.
				MI.getOperand(0).setReg(NewReg);
				DeadPhis.push_back(&MI);
				}

				IncomingBlocks.clear();
				IncomingRegs.clear();
				IncomingUpdated.clear();
				}

				for (MachineInstr *MI : DeadPhis)
				MI->eraseFromParent();
				DeadPhis.clear();
				}
				}

				void SILowerI1Copies::lowerCopiesToI1() {
				MachineSSAUpdater SSAUpdater(*MF);
				LoopFinder LF(*DT, *PDT);
				SmallVector<MachineInstr *, 4> DeadCopies;

				for (MachineBasicBlock &MBB : *MF) {
				LF.initialize(MBB);

				for (MachineInstr &MI : MBB) {
				if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
				MI.getOpcode() != AMDGPU::COPY)
				continue;

				unsigned DstReg = MI.getOperand(0).getReg();
				if (!TargetRegisterInfo::isVirtualRegister(DstReg) \|\|
				MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
				continue;

				if (MRI->use_empty(DstReg)) {
				DeadCopies.push_back(&MI);
				continue;
				}

				LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

				MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
				if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
				continue;

				DebugLoc DL = MI.getDebugLoc();
				unsigned SrcReg = MI.getOperand(1).getReg();

				if (!TargetRegisterInfo::isVirtualRegister(SrcReg) \|\|
				!isLaneMaskReg(SrcReg)) {
				assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
				unsigned TmpReg = createLaneMaskReg(*MF);
				BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
				.addReg(SrcReg)
	.addImm(0);			.addImm(0);
				MI.getOperand(1).setReg(TmpReg);
				SrcReg = TmpReg;
				}

				// Defs in a loop that are observed outside the loop must be transformed
				// into appropriate bit manipulation.
				MachineBasicBlock *PostDomBound = &MBB;
				for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
				PostDomBound =
				PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
				}

				unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
				if (FoundLoopLevel) {
				SSAUpdater.Initialize(DstReg);
				SSAUpdater.AddAvailableValue(&MBB, DstReg);
				LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

				buildMergeLaneMasks(MBB, MI, DL, DstReg,
				SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
				DeadCopies.push_back(&MI);
	}			}
	MI.eraseFromParent();			}

				for (MachineInstr *MI : DeadCopies)
				MI->eraseFromParent();
				DeadCopies.clear();
	}			}
	}			}

				bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
				const MachineInstr *MI;
				for (;;) {
				MI = MRI->getUniqueVRegDef(Reg);
				if (MI->getOpcode() != AMDGPU::COPY)
				break;

				Reg = MI->getOperand(1).getReg();
				if (!TargetRegisterInfo::isVirtualRegister(Reg))
				return false;
				if (!isLaneMaskReg(Reg))
				return false;
	}			}

	for (unsigned Reg : I1Defs)			if (MI->getOpcode() != AMDGPU::S_MOV_B64)
	MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);			return false;

				if (!MI->getOperand(1).isImm())
				return false;

				int64_t Imm = MI->getOperand(1).getImm();
				if (Imm == 0) {
				Val = false;
				return true;
				}
				if (Imm == -1) {
				Val = true;
				return true;
				}

	return false;			return false;
	}			}

				// Emit, before iterator I in MBB, the instructions that define DstReg as a
				// merge of two lane masks: the bits of CurReg for lanes set in EXEC combined
				// with the bits of PrevReg for lanes not set in EXEC.
				//
				// Operands that isConstantLaneMask() recognises as all-zeros or all-ones are
				// folded so that the masking and/or the final OR can be skipped.
				void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
				MachineBasicBlock::iterator I,
				const DebugLoc &DL, unsigned DstReg,
				unsigned PrevReg, unsigned CurReg) {
				bool PrevVal;
				bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
				bool CurVal;
				bool CurConstant = isConstantLaneMask(CurReg, CurVal);

				// Both inputs constant: the result is one of CurReg, EXEC or ~EXEC.
				if (PrevConstant && CurConstant) {
				if (PrevVal == CurVal) {
				// Same constant on both sides, so a plain copy suffices.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
				.addReg(CurReg);
				} else if (CurVal) {
				// Prev is all-zeros and Cur is all-ones: the result is EXEC.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
				.addReg(AMDGPU::EXEC);
				} else {
				// Prev is all-ones and Cur is all-zeros: the result is EXEC ^ -1.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
				.addReg(AMDGPU::EXEC)
				.addImm(-1);
				}
				return;
				}

				// 0 means "no masked register was produced for this operand".
				unsigned PrevMaskedReg = 0;
				unsigned CurMaskedReg = 0;
				if (!PrevConstant) {
				if (CurConstant && CurVal) {
				// Cur is all-ones: PrevReg is OR'ed with EXEC below, so it is used
				// without masking.
				PrevMaskedReg = PrevReg;
				} else {
				rampitecUnsubmitted Not Done Reply Inline Actions MI can be null technically. rampitec: MI can be null technically.
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions How? This pass runs on Machine SSA form. nhaehnle: How? This pass runs on Machine SSA form.
				rampitecUnsubmitted Not Done Reply Inline Actions As far as I understand you may have two defs in case of a superreg. rampitec: As far as I understand you may have two defs in case of a superreg.
				arsenmUnsubmitted Not Done Reply Inline Actions You can't have 2 defs of a super register in SSA. The case I would worry about is if the register is undef or an argument, which I'm not sure are an issue this early arsenm: You can't have 2 defs of a super register in SSA. The case I would worry about is if the…
				// PrevMaskedReg = PrevReg & ~EXEC
				PrevMaskedReg = createLaneMaskReg(*MF);
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
				.addReg(PrevReg)
				.addReg(AMDGPU::EXEC);
				}
				}
				if (!CurConstant) {
				// TODO: check whether CurReg is already masked by EXEC
				if (PrevConstant && PrevVal) {
				// Prev is all-ones: CurReg is OR'ed with ~EXEC below, so it is used
				// without masking.
				CurMaskedReg = CurReg;
				} else {
				// CurMaskedReg = CurReg & EXEC
				CurMaskedReg = createLaneMaskReg(*MF);
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
				.addReg(CurReg)
				.addReg(AMDGPU::EXEC);
				}
				}

				arsenmUnsubmitted Done Reply Inline Actions .addRegs on new line arsenm: .addRegs on new line
				nhaehnleAuthorUnsubmitted Not Done Reply Inline Actions clang-format disagreed :/ But yeah, I'm changing it back. nhaehnle: clang-format disagreed :/ But yeah, I'm changing it back.
				// Combine the two halves, dropping whichever side is a known constant.
				if (PrevConstant && !PrevVal) {
				// Prev contributes nothing: the result is the masked Cur.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
				.addReg(CurMaskedReg);
				} else if (CurConstant && !CurVal) {
				// Cur contributes nothing: the result is the masked Prev.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
				.addReg(PrevMaskedReg);
				} else if (PrevConstant && PrevVal) {
				// Prev is all-ones: the result is Cur | ~EXEC.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
				.addReg(CurMaskedReg)
				.addReg(AMDGPU::EXEC);
				} else {
				// General case. CurMaskedReg is still 0 only when Cur is the constant
				// all-ones, in which case EXEC itself is the Cur contribution.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
				.addReg(PrevMaskedReg)
				.addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
				}
				}

lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h

This file was deleted.

	//===- AMDGPULaneDominator.h ------------------------------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
	#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H

	namespace llvm {

	class MachineBasicBlock;

	namespace AMDGPU {

	bool laneDominates(MachineBasicBlock MBBA, MachineBasicBlock MBBB);

	} // end namespace AMDGPU
	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H

lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp

This file was deleted.

	//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// MBB A lane-dominates MBB B if
	// 1. A dominates B in the usual sense, i.e. every path from the entry to B
	// goes through A, and
	// 2. whenever B executes, every active lane during that execution of B was
	// also active during the most recent execution of A.
	//
	// The simplest example where A dominates B but does not lane-dominate it is
	// where A is a loop:
	//
	// \|
	// +--+
	// A \|
	// +--+
	// \|
	// B
	//
	// Unfortunately, the second condition is not fully captured by the control
	// flow graph when it is unstructured (as may happen when branch conditions are
	// uniform).
	//
	// The following replacement of the second condition is a conservative
	// approximation. It is an equivalent condition when the CFG is fully
	// structured:
	//
	// 2'. every cycle in the CFG that contains A also contains B.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPULaneDominator.h"

	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"

	namespace llvm {

	namespace AMDGPU {

	// Given machine basic blocks A and B where A dominates B, check whether
	// A lane-dominates B.
	//
	// The check is conservative, i.e. there can be false-negatives.
	bool laneDominates(MachineBasicBlock A, MachineBasicBlock B) {
	// Check whether A is reachable from itself without going through B.
	DenseSet<MachineBasicBlock *> Reachable;
	SmallVector<MachineBasicBlock *, 8> Stack;

	Stack.push_back(A);
	do {
	MachineBasicBlock *MBB = Stack.back();
	Stack.pop_back();

	for (MachineBasicBlock *Succ : MBB->successors()) {
	if (Succ == A)
	return false;
	if (Succ != B && Reachable.insert(Succ).second)
	Stack.push_back(Succ);
	}
	} while (!Stack.empty());

	return true;
	}

	} // namespace AMDGPU

	} // namespace llvm

lib/Target/AMDGPU/Utils/CMakeLists.txt

	add_llvm_library(LLVMAMDGPUUtils			add_llvm_library(LLVMAMDGPUUtils
	AMDGPUBaseInfo.cpp			AMDGPUBaseInfo.cpp
	AMDKernelCodeTUtils.cpp			AMDKernelCodeTUtils.cpp
	AMDGPUAsmUtils.cpp			AMDGPUAsmUtils.cpp
	AMDGPULaneDominator.cpp
	)			)

test/CodeGen/AMDGPU/i1-copy-from-loop.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s

	; SI-LABEL: {{^}}i1_copy_from_loop:			; SI-LABEL: {{^}}i1_copy_from_loop:
	;			;
	; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
	; only contain the lanes that were active during the last loop iteration.
	;
	; SI: ; %for.body			; SI: ; %for.body
	; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,			; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
	; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]			; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
	; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]			; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
	; SI: [[ENDIF]]:			; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
	; SI-NOT: [[VREG]]
				; SI: ; %Flow1
				; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec

				; SI: ; %Flow
				; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
				; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
				; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]

	; SI: ; %for.end			; SI: ; %for.end
	; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]			; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]

	define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {			define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
	entry:			entry:
	br label %for.body			br label %for.body

	for.body:			for.body:
	%i = phi i32 [0, %entry], [%i.inc, %end.loop]			%i = phi i32 [0, %entry], [%i.inc, %end.loop]
	%cc = icmp ult i32 %i, 4			%cc = icmp ult i32 %i, 4
	br i1 %cc, label %mid.loop, label %for.end			br i1 %cc, label %mid.loop, label %for.end
	Show All 26 Lines

test/CodeGen/AMDGPU/i1-copy-phi.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s

	; SI-LABEL: {{^}}br_i1_phi:			; SI-LABEL: {{^}}br_i1_phi:
	; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
	; SI: s_and_saveexec_b64			; SI: ; %bb
	; SI: v_mov_b32_e32 [[REG]], -1{{$}}			; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], 0
	; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
	; SI: s_and_saveexec_b64			; SI: ; %bb2
	; SI: s_endpgm			; SI: s_mov_b64 [[TMP]], exec

				; SI: ; %bb3
				; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]

	define amdgpu_kernel void @br_i1_phi(i32 %arg) {			define amdgpu_kernel void @br_i1_phi(i32 %arg) {
	bb:			bb:
	%tidig = call i32 @llvm.amdgcn.workitem.id.x()			%tidig = call i32 @llvm.amdgcn.workitem.id.x()
	%cmp = trunc i32 %tidig to i1			%cmp = trunc i32 %tidig to i1
	br i1 %cmp, label %bb2, label %bb3			br i1 %cmp, label %bb2, label %bb3

	bb2: ; preds = %bb			bb2: ; preds = %bb
	br label %bb3			br label %bb3
	Show All 17 Lines

test/CodeGen/AMDGPU/inline-asm.ll

	Show First 20 Lines • Show All 192 Lines • ▼ Show 20 Lines
	; CHECK: use v[0:1]			; CHECK: use v[0:1]
	define amdgpu_kernel void @i64_imm_input_phys_vgpr() {			define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
	entry:			entry:
	call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)			call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)
	ret void			ret void
	}			}

	; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:			; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
	; CHECK: v_mov_b32_e32 v0, -1{{$}}			; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
				; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
	; CHECK: ; use v0			; CHECK: ; use v0
	define amdgpu_kernel void @i1_imm_input_phys_vgpr() {			define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
	entry:			entry:
	call void asm sideeffect "; use $0 ", "{v0}"(i1 true)			call void asm sideeffect "; use $0 ", "{v0}"(i1 true)
	ret void			ret void
	}			}

	; CHECK-LABEL: {{^}}i1_input_phys_vgpr:			; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
	; CHECK: {{buffer\|flat}}_load_ubyte [[LOAD:v[0-9]+]]			; CHECK: {{buffer\|flat}}_load_ubyte [[LOAD:v[0-9]+]]
	; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]]			; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]]
	; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]			; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
	; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc			; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
	; CHECK: ; use v0			; CHECK: ; use v0
				; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
				; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
				; CHECK: {{buffer\|flat}}_store_byte [[STORE]],
	define amdgpu_kernel void @i1_input_phys_vgpr() {			define amdgpu_kernel void @i1_input_phys_vgpr() {
	entry:			entry:
	%val = load i1, i1 addrspace(1)* undef			%val = load i1, i1 addrspace(1)* undef
	call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)			%cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
				store i1 %cc, i1 addrspace(1)* undef
	ret void			ret void
	}			}

	; FIXME: Should be scheduled to shrink vcc			; FIXME: Should be scheduled to shrink vcc
	; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:			; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
	; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0			; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
	; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc			; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
	; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1			; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1
	Show All 38 Lines

test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s		; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s		; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; FIXME: Enable for VI.		; FIXME: Enable for VI.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone		declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone		declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone		declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone

; GCN-LABEL: {{^}}test_div_fmas_f32:		; GCN-LABEL: {{^}}test_div_fmas_f32:
▲ Show 20 Lines • Show All 128 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
%and = and i1 %cmp0, %cmp1		%and = and i1 %cmp0, %cmp1

%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone		%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
store float %result, float addrspace(1)* %gep.out, align 4		store float %result, float addrspace(1)* %gep.out, align 4
ret void		ret void
}		}

; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:		; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc

; SI: buffer_load_dword [[LOAD:v[0-9]+]]		; SI: ; %entry
; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]		; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc		; SI: s_mov_b64 vcc, 0
		; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]

		; SI: ; %bb
		; SI: buffer_load_dword [[LOAD:v[0-9]+]],
		; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
		; SI: s_and_b64 vcc, vcc, exec

; SI: BB9_2:		; SI: ; %exit
; SI: s_or_b64 exec, exec, [[SAVE]]		; SI: s_or_b64 exec, exec, [[SAVE]]
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}		; SI-NOT: vcc
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword		; SI: buffer_store_dword
; SI: s_endpgm		; SI: s_endpgm

define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {		define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:		entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone		%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2		%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid		%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1		%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2		%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2

Show All 18 Lines

test/CodeGen/AMDGPU/loop_break.ll

	Show All 16 Lines
	; OPT: Flow:			; OPT: Flow:
	; OPT: call i64 @llvm.amdgcn.if.break(			; OPT: call i64 @llvm.amdgcn.if.break(
	; OPT: call i1 @llvm.amdgcn.loop(i64			; OPT: call i1 @llvm.amdgcn.loop(i64
	; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1			; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1

	; OPT: bb9:			; OPT: bb9:
	; OPT: call void @llvm.amdgcn.end.cf(i64			; OPT: call void @llvm.amdgcn.end.cf(i64

	; TODO: Can remove exec fixes in return block
	; GCN-LABEL: {{^}}break_loop:			; GCN-LABEL: {{^}}break_loop:
	; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}			; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1			; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
	; GCN: v_cmp_lt_i32_e32 vcc, -1			; GCN: v_cmp_lt_i32_e32 vcc, -1
	; GCN: s_and_b64 vcc, exec, vcc			; GCN: s_and_b64 vcc, exec, vcc
				; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
	; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]			; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]

	; GCN: ; %bb.2: ; %bb4			; GCN: ; %bb4
	; GCN: buffer_load_dword			; GCN: buffer_load_dword
	; GCN: v_cmp_ge_i32_e32 vcc,			; GCN: v_cmp_ge_i32_e32 vcc,
				; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec
	; GCN: [[FLOW]]:			; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
	; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]			; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
	; GCN: s_andn2_b64 exec, exec, [[MASK]]
				; GCN: [[FLOW]]: ; %Flow
				; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
				; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]]
				; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]]
				; GCN: s_andn2_b64 exec, exec, [[TMP1]]
	; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]			; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]

	; GCN: ; %bb.4: ; %bb9			; GCN: ; %bb.4: ; %bb9
	; GCN-NEXT: s_endpgm			; GCN-NEXT: s_endpgm
	define amdgpu_kernel void @break_loop(i32 %arg) #0 {			define amdgpu_kernel void @break_loop(i32 %arg) #0 {
	bb:			bb:
	%id = call i32 @llvm.amdgcn.workitem.id.x()			%id = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp = sub i32 %id, %arg			%tmp = sub i32 %id, %arg
	br label %bb1			br label %bb1
	▲ Show 20 Lines • Show All 281 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

	Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines
	; IR: br label %Flow2			; IR: br label %Flow2

	; IR: UnifiedReturnBlock:			; IR: UnifiedReturnBlock:
	; IR: call void @llvm.amdgcn.end.cf(i64 %14)			; IR: call void @llvm.amdgcn.end.cf(i64 %14)
	; IR: ret void			; IR: ret void


	; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:			; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
	; GCN: v_cmp_lt_i32_e32 vcc, 1
				; GCN: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
				; GCN: v_cmp_lt_i32_e32 vcc, 1,
				; GCN: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
	; GCN: s_and_saveexec_b64			; GCN: s_and_saveexec_b64
	; GCN: s_xor_b64			; GCN: s_xor_b64

				; GCN: ; %LeafBlock1
				; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
				; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
				; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec

				; GCN: ; %Flow
				; GCN-NEXT: s_or_saveexec_b64
				; GCN-NEXT: s_xor_b64

	; FIXME: Why is this compare essentially repeated?			; FIXME: Why is this compare essentially repeated?
	; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]			; GCN: ; %LeafBlock
	; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc			; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
	; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]			; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
	; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc			; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
				; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
				; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
				; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
				; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
				; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]

	; GCN: ; %Flow4			; GCN: ; %Flow4
	; GCN-NEXT: s_or_b64 exec, exec			; GCN-NEXT: s_or_b64 exec, exec,
	; GCN: v_cmp_ne_u32_e32 vcc, 0			; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
				; GCN-NEXT: s_xor_b64

	; GCN: ; %exit1			; GCN: ; %exit1
	; GCN: ds_write_b32			; GCN: ds_write_b32
				; GCN: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec

	; GCN: %Flow5			; GCN: ; %Flow5
	; GCN-NEXT: s_or_b64 exec, exec			; GCN-NEXT: s_or_b64 exec, exec,
	; GCN: v_cmp_ne_u32_e32 vcc, 0			; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
	; GCN-NEXT: s_and_saveexec_b64

	; GCN: ; %exit0			; GCN: ; %exit0
	; GCN: buffer_store_dword			; GCN: buffer_store_dword

	; GCN: ; %UnifiedReturnBlock			; GCN: ; %UnifiedReturnBlock
	; GCN-NEXT: s_endpgm			; GCN-NEXT: s_endpgm
	define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {			define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
	entry:			entry:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%tmp1 = add i32 0, %tmp			%tmp1 = add i32 0, %tmp
	%tmp2 = zext i32 %tmp1 to i64			%tmp2 = zext i32 %tmp1 to i64
	▲ Show 20 Lines • Show All 614 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/multilevel-break.ll

	Show All 15 Lines
	; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1			; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1
	; OPT-NEXT: call i1 @llvm.amdgcn.loop(i64			; OPT-NEXT: call i1 @llvm.amdgcn.loop(i64
	; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1			; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1
	;			;
	; OPT: Flow1:			; OPT: Flow1:

	; GCN-LABEL: {{^}}multi_else_break:			; GCN-LABEL: {{^}}multi_else_break:

				; GCN: ; %main_body
				; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}			; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
				; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}			; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
	; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc			; GCN: s_or_b64 [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
				; GCN: s_or_b64 [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
	; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}			; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
	; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
				; FIXME: duplicate comparison
				; GCN: ; %ENDIF
				; GCN-DAG: v_cmp_eq_u32_e32 vcc,
				; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
				; GCN-DAG: s_andn2_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec
				; GCN-DAG: s_andn2_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec
				; GCN-DAG: s_and_b64 [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
				; GCN-DAG: s_and_b64 [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
				; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
				; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]

				; GCN: ; %Flow
				; GCN: s_or_b64 exec, exec, [[SAVE_EXEC]]
				; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
				; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
				; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
				; GCN: s_andn2_b64 exec, exec, [[TMP0]]
				; GCN: s_cbranch_execnz [[INNER_LOOP]]

				; GCN: ; %Flow2
				; GCN: s_or_b64 exec, exec, [[TMP0]]
				; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
				; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
				; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
				; GCN: s_andn2_b64 exec, exec, [[TMP1]]
				; GCN: s_cbranch_execnz [[OUTER_LOOP]]

	; Ensure extra or eliminated			; GCN: ; %IF
	; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]			; GCN-NEXT: s_endpgm
	; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
	; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
	; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
	; GCN-NEXT: v_mov_b32_e32
	; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
	; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]

	; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
	; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1

	; Ensure copy is eliminated
	; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
	; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
	; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
	; GCN-NEXT: s_mov_b64
	; GCN-NEXT: v_mov_b32_e32
	; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
	; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
	define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {			define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
	main_body:			main_body:
	br label %LOOP.outer			br label %LOOP.outer

	LOOP.outer: ; preds = %ENDIF, %main_body			LOOP.outer: ; preds = %ENDIF, %main_body
	%tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]			%tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]
	br label %LOOP			br label %LOOP

	Show All 13 Lines

	; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(			; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
	; OPT: llvm.amdgcn.if.break			; OPT: llvm.amdgcn.if.break
	; OPT: llvm.amdgcn.loop			; OPT: llvm.amdgcn.loop
	; OPT: llvm.amdgcn.if.break			; OPT: llvm.amdgcn.if.break
	; OPT: llvm.amdgcn.end.cf			; OPT: llvm.amdgcn.end.cf

	; GCN-LABEL: {{^}}multi_if_break_loop:			; GCN-LABEL: {{^}}multi_if_break_loop:
	; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}			; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}			; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
				; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]

	; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]			; GCN: ; %LeafBlock1
	; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]			; GCN: s_mov_b64
				; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}

				; GCN: ; %case1
				; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
				; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]]
				; GCN: s_orn2_b64 [[BREAK]], vcc, exec

				; GCN: ; %Flow3
				; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]]

				; GCN: s_mov_b64 [[BREAK]], -1{{$}}

				; GCN: [[FLOW]]: ; %Flow

				; GCN: ; %case0
				; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
				; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec
				; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]]
				; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
				; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]

				; GCN: ; %Flow4
				; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
				; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
				; GCN: s_andn2_b64 exec, exec, [[LEFT]]
	; GCN-NEXT: s_cbranch_execnz			; GCN-NEXT: s_cbranch_execnz

	define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {			define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
	bb:			bb:
	%id = call i32 @llvm.amdgcn.workitem.id.x()			%id = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp = sub i32 %id, %arg			%tmp = sub i32 %id, %arg
	br label %bb1			br label %bb1

	Show All 28 Lines

test/CodeGen/AMDGPU/select-opt.ll

Show First 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
store i64 %select, i64 addrspace(1)* %out		store i64 %select, i64 addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}regression:		; GCN-LABEL: {{^}}regression:
; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0		; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}		; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}		; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}

define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {		define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
entry:		entry:
%cmp0 = fcmp oeq float %c0, 1.0		%cmp0 = fcmp oeq float %c0, 1.0
br i1 %cmp0, label %if0, label %endif		br i1 %cmp0, label %if0, label %endif

if0:		if0:
%cmp1 = fcmp oeq float %c1, 0.0		%cmp1 = fcmp oeq float %c1, 0.0
Show All 14 Lines

test/CodeGen/AMDGPU/sgpr-control-flow.ll

Show First 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	else:
br label %endif		br label %endif

endif:		endif:
%tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]		%tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]
store i32 %tmp4, i32 addrspace(1)* %out		store i32 %tmp4, i32 addrspace(1)* %out
ret void		ret void
}		}

; FIXME: Should write to different SGPR pairs instead of copying to
; VALU for i1 phi.

; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:		; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:

		; SI: ; %else
; SI: buffer_load_dword [[AVAL:v[0-9]+]]		; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]		; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]

; SI: BB{{[0-9]+}}_2:		; SI: ; %if
; SI: buffer_load_dword [[AVAL:v[0-9]+]]		; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]		; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]		; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec
		; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]		; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
; SI: buffer_store_dword [[RESULT]]		; SI: ; %endif
		; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
		; SI: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {		define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:		entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0		%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = icmp eq i32 %tid, 0		%tmp1 = icmp eq i32 %tid, 0
br i1 %tmp1, label %if, label %else		br i1 %tmp1, label %if, label %else

if:		if:
%gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid		%gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid
Show All 20 Lines

test/CodeGen/AMDGPU/si-annotate-cf.ll

; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s		; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s		; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s

; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:		; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:

; SI: [[LOOP_LABEL:[A-Z0-9]+]]:		; SI: [[LOOP_LABEL:[A-Z0-9]+]]:
; Lowered break instruction:		; Lowered break instruction:
; SI: s_or_b64		; SI: s_or_b64
; Lowered Loop instruction:		; Lowered Loop instruction:
; SI: s_andn2_b64		; SI: s_andn2_b64
Show All 11 Lines	ENDLOOP:
ret void		ret void

ENDIF:		ENDIF:
br i1 %1, label %ENDLOOP, label %ENDIF		br i1 %1, label %ENDLOOP, label %ENDIF
}		}


; FUNC-LABEL: {{^}}phi_cond_outside_loop:		; FUNC-LABEL: {{^}}phi_cond_outside_loop:
; FIXME: This could be folded into the s_or_b64 instruction
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
; SI: [[LOOP_LABEL:[A-Z0-9]+]]
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}

; SI_IF_BREAK instruction:		; SI: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]		; SI: s_mov_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0

; SI_LOOP instruction:		; SI: ; %else
; SI: s_andn2_b64 exec, exec, [[BREAK]]		; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]],
		; SI: s_and_b64 [[PHI]], [[TMP]], exec

		; SI: ; %endif

		; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
		; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
		; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
		; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]]
		; SI: s_andn2_b64 exec, exec, [[LEFT]]
; SI: s_cbranch_execnz [[LOOP_LABEL]]		; SI: s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm		; SI: s_endpgm

define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {		define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
entry:		entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0		%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%0 = icmp eq i32 %tid , 0		%0 = icmp eq i32 %tid , 0
br i1 %0, label %if, label %else		br i1 %0, label %if, label %else

if:		if:
Show All 35 Lines	sw.epilog:
ret void		ret void
}		}

declare float @llvm.fabs.f32(float) nounwind readnone		declare float @llvm.fabs.f32(float) nounwind readnone

; This broke the old AMDIL cfg structurizer		; This broke the old AMDIL cfg structurizer
; FUNC-LABEL: {{^}}loop_land_info_assert:		; FUNC-LABEL: {{^}}loop_land_info_assert:
; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}		; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
; SI: s_and_b64 vcc, exec, [[CMP4]]		; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]		; SI: s_mov_b64 vcc, [[CMP4M]]
; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]		; SI-NEXT: s_cbranch_vccnz [[CONVEX_EXIT:BB[0-9_]+]]
; SI-NEXT: BB{{[0-9_]+}}:		; SI-NEXT: s_branch [[FOR_COND_PREHDR:BB[0-9_]+]]
; SI-NEXT: buffer_store_dword
		; SI: ; %if.else
		; SI: buffer_store_dword

; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:		; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:

; SI: [[BR1]]:		; SI: [[CONVEX_EXIT]]:
; SI-NEXT: s_and_b64 vcc, exec,		; SI: s_mov_b64 vcc,
; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]		; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
; SI: s_branch [[INFLOOP]]		; SI: s_branch [[INFLOOP]]
; SI-NEXT: [[BR2]]:		; SI-NEXT: [[FOR_COND_PREHDR]]:
; SI: s_cbranch_vccz [[ENDPGM]]		; SI: s_cbranch_vccz [[ENDPGM]]

; SI: [[ENDPGM]]:		; SI: [[ENDPGM]]:
; SI-NEXT: s_endpgm		; SI-NEXT: s_endpgm
define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {		define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
entry:		entry:
%cmp = icmp sgt i32 %c0, 0		%cmp = icmp sgt i32 %c0, 0
br label %while.cond.outer		br label %while.cond.outer
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/valu-i1.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s \| FileCheck -check-prefix=SI %s

	declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone			declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

	; SI-LABEL: {{^}}test_if:			; SI-LABEL: {{^}}test_if:
	; Make sure the i1 values created by the cfg structurizer pass are			; Make sure the i1 values created by the cfg structurizer pass are
	; moved using VALU instructions			; moved using VALU instructions


	; waitcnt should be inserted after exec modification			; waitcnt should be inserted after exec modification
	; SI: v_cmp_lt_i32_e32 vcc, 0,			; SI: v_cmp_lt_i32_e32 vcc, 0,
	; SI: v_mov_b32_e32 {{v[0-9]+}}, 0			; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
				; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
	; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc			; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
	; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]			; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
	; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]			; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
	; SI-NEXT: s_cbranch_execz [[FLOW_BB]]			; SI-NEXT: s_cbranch_execz [[FLOW_BB]]

	; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3			; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
	; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1			; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
	; SI: v_mov_b32_e32 v{{[0-9]}}, -1
	; SI: s_and_saveexec_b64			; SI: s_and_saveexec_b64
	; SI-NEXT: ; mask branch			; SI-NEXT: ; mask branch

	; v_mov should be after exec modification			; v_mov should be after exec modification
	; SI: [[FLOW_BB]]:			; SI: [[FLOW_BB]]:
	; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]			; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
	; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
	; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]			; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
	; SI-NEXT: ; mask branch			; SI-NEXT: ; mask branch
	;			;
	define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {			define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
	entry:			entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone			%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
	switch i32 %tid, label %default [			switch i32 %tid, label %default [
	i32 0, label %case0			i32 0, label %case0
	▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines
	; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]			; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

	; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20			; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
	; SI: buffer_store_dword			; SI: buffer_store_dword

	; SI: [[LABEL_FLOW]]:			; SI: [[LABEL_FLOW]]:
	; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]			; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
	; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]			; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
	; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10			; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
	; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]			; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
	; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]			; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
				; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
	; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]			; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

	; SI: [[LABEL_EXIT]]:			; SI: [[LABEL_EXIT]]:
	; SI-NOT: [[COND_STATE]]			; SI-NOT: [[COND_STATE]]
	; SI: s_endpgm			; SI: s_endpgm

	define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {			define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
	bb:			bb:
	Show All 34 Lines

test/CodeGen/AMDGPU/waitcnt-looptest.ll

	; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 \| FileCheck --check-prefix=GCN %s			; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 \| FileCheck --check-prefix=GCN %s

	; Check that the waitcnt insertion algorithm correctly propagates wait counts			; Check that the waitcnt insertion algorithm correctly propagates wait counts
	; from before a loop to the loop header.			; from before a loop to the loop header.

	; GCN-LABEL: {{^}}testKernel			; GCN-LABEL: {{^}}testKernel
	; GCN: BB0_1:			; GCN: BB0_1:
	; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cmp_eq_f32_e64			; GCN-NEXT: v_cmp_eq_f32_e32
	; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cmp_eq_f32_e32			; GCN-NEXT: v_cmp_eq_f32_e32
	; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cmp_eq_f32_e32			; GCN-NEXT: v_cmp_eq_f32_e32

	@data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 
0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4			@data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 
0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4
	@data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 
0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4			@data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 
0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4

	▲ Show 20 Lines • Show All 129 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 170403

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

lib/Target/AMDGPU/SILowerI1Copies.cpp

lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h

lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp

lib/Target/AMDGPU/Utils/CMakeLists.txt

test/CodeGen/AMDGPU/i1-copy-from-loop.ll

test/CodeGen/AMDGPU/i1-copy-phi.ll

test/CodeGen/AMDGPU/inline-asm.ll

test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

test/CodeGen/AMDGPU/loop_break.ll

test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

test/CodeGen/AMDGPU/multilevel-break.ll

test/CodeGen/AMDGPU/select-opt.ll

test/CodeGen/AMDGPU/sgpr-control-flow.ll

test/CodeGen/AMDGPU/si-annotate-cf.ll

test/CodeGen/AMDGPU/valu-i1.ll

test/CodeGen/AMDGPU/waitcnt-looptest.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 170403

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

lib/Target/AMDGPU/SILowerI1Copies.cpp

lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h

lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp

lib/Target/AMDGPU/Utils/CMakeLists.txt

test/CodeGen/AMDGPU/i1-copy-from-loop.ll

test/CodeGen/AMDGPU/i1-copy-phi.ll

test/CodeGen/AMDGPU/inline-asm.ll

test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

test/CodeGen/AMDGPU/loop_break.ll

test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

test/CodeGen/AMDGPU/multilevel-break.ll

test/CodeGen/AMDGPU/select-opt.ll

test/CodeGen/AMDGPU/sgpr-control-flow.ll

test/CodeGen/AMDGPU/si-annotate-cf.ll

test/CodeGen/AMDGPU/valu-i1.ll

test/CodeGen/AMDGPU/waitcnt-looptest.ll

AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
ClosedPublic