Diff 171905

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 811 Lines • ▼ Show 20 Lines	if (EnableEarlyIfConversion)
addPass(&EarlyIfConverterID);		addPass(&EarlyIfConverterID);

TargetPassConfig::addILPOpts();		TargetPassConfig::addILPOpts();
return false;		return false;
}		}

bool GCNPassConfig::addInstSelector() {		bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();		AMDGPUPassConfig::addInstSelector();
addPass(createSILowerI1CopiesPass());
addPass(&SIFixSGPRCopiesID);		addPass(&SIFixSGPRCopiesID);
		addPass(createSILowerI1CopiesPass());
return false;		return false;
}		}

bool GCNPassConfig::addIRTranslator() {		bool GCNPassConfig::addIRTranslator() {
addPass(new IRTranslator());		addPass(new IRTranslator());
return false;		return false;
}		}

▲ Show 20 Lines • Show All 90 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Show First 20 Lines • Show All 177 Lines • ▼ Show 20 Lines	const TargetRegisterClass *DstRC =
TRI.getPhysRegClass(DstReg);		TRI.getPhysRegClass(DstReg);

return std::make_pair(SrcRC, DstRC);		return std::make_pair(SrcRC, DstRC);
}		}

static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,		static bool isVGPRToSGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,		const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {		const SIRegisterInfo &TRI) {
return TRI.isSGPRClass(DstRC) && TRI.hasVGPRs(SrcRC);		return SrcRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(DstRC) &&
		TRI.hasVGPRs(SrcRC);
}		}

static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,		static bool isSGPRToVGPRCopy(const TargetRegisterClass *SrcRC,
const TargetRegisterClass *DstRC,		const TargetRegisterClass *DstRC,
const SIRegisterInfo &TRI) {		const SIRegisterInfo &TRI) {
return TRI.isSGPRClass(SrcRC) && TRI.hasVGPRs(DstRC);		return DstRC != &AMDGPU::VReg_1RegClass && TRI.isSGPRClass(SrcRC) &&
		TRI.hasVGPRs(DstRC);
}		}

static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,		static bool tryChangeVGPRtoSGPRinCopy(MachineInstr &MI,
const SIRegisterInfo *TRI,		const SIRegisterInfo *TRI,
const SIInstrInfo *TII) {		const SIInstrInfo *TII) {
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();		MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
auto &Src = MI.getOperand(1);		auto &Src = MI.getOperand(1);
unsigned DstReg = MI.getOperand(0).getReg();		unsigned DstReg = MI.getOperand(0).getReg();
▲ Show 20 Lines • Show All 513 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp

	//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//			//===-- SILowerI1Copies.cpp - Lower I1 Copies -----------------------------===//
	//			//
	// The LLVM Compiler Infrastructure			// The LLVM Compiler Infrastructure
	//			//
	// This file is distributed under the University of Illinois Open Source			// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.			// License. See LICENSE.TXT for details.
	//			//
	/// i1 values are usually inserted by the CFG Structurize pass and they are
	/// unique in that they can be copied from VALU to SALU registers.
	/// This is not possible for any other value type. Since there are no
	/// MOV instructions for i1, we to use V_CMP_* and V_CNDMASK to move the i1.
	///
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	//			//
				// This pass lowers all occurrences of i1 values (with a vreg_1 register class)
				// to lane masks (64-bit scalar registers). The pass assumes machine SSA form
				// and a wave-level control flow graph.
				//
				// Before this pass, values that are semantically i1 and are defined and used
				// within the same basic block are already represented as lane masks in scalar
				// registers. However, values that cross basic blocks are always transferred
				// between basic blocks in vreg_1 virtual registers and are lowered by this
				// pass.
				//
				// The only instructions that use or define vreg_1 virtual registers are COPY,
				// PHI, and IMPLICIT_DEF.
				//
				//===----------------------------------------------------------------------===//

	#define DEBUG_TYPE "si-i1-copies"
	#include "AMDGPU.h"			#include "AMDGPU.h"
	#include "AMDGPUSubtarget.h"			#include "AMDGPUSubtarget.h"
	#include "SIInstrInfo.h"
	#include "MCTargetDesc/AMDGPUMCTargetDesc.h"			#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
	#include "Utils/AMDGPULaneDominator.h"			#include "SIInstrInfo.h"
	#include "llvm/CodeGen/LiveIntervals.h"			#include "llvm/CodeGen/MachineDominators.h"
	#include "llvm/CodeGen/MachineFunctionPass.h"			#include "llvm/CodeGen/MachineFunctionPass.h"
	#include "llvm/CodeGen/MachineInstrBuilder.h"			#include "llvm/CodeGen/MachineInstrBuilder.h"
				#include "llvm/CodeGen/MachinePostDominators.h"
	#include "llvm/CodeGen/MachineRegisterInfo.h"			#include "llvm/CodeGen/MachineRegisterInfo.h"
				#include "llvm/CodeGen/MachineSSAUpdater.h"
	#include "llvm/IR/Function.h"			#include "llvm/IR/Function.h"
	#include "llvm/IR/LLVMContext.h"			#include "llvm/IR/LLVMContext.h"
	#include "llvm/Support/Debug.h"			#include "llvm/Support/Debug.h"
	#include "llvm/Target/TargetMachine.h"			#include "llvm/Target/TargetMachine.h"

				#define DEBUG_TYPE "si-i1-copies"

	using namespace llvm;			using namespace llvm;

				static unsigned createLaneMaskReg(MachineFunction &MF);
				static unsigned insertUndefLaneMask(MachineBasicBlock &MBB);

	namespace {			namespace {

	class SILowerI1Copies : public MachineFunctionPass {			class SILowerI1Copies : public MachineFunctionPass {
	public:			public:
	static char ID;			static char ID;

				private:
				MachineFunction *MF = nullptr;
				MachineDominatorTree *DT = nullptr;
				MachinePostDominatorTree *PDT = nullptr;
				MachineRegisterInfo *MRI = nullptr;
				const GCNSubtarget *ST = nullptr;
				const SIInstrInfo *TII = nullptr;

				DenseSet<unsigned> ConstrainRegs;

	public:			public:
	SILowerI1Copies() : MachineFunctionPass(ID) {			SILowerI1Copies() : MachineFunctionPass(ID) {
	initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());			initializeSILowerI1CopiesPass(*PassRegistry::getPassRegistry());
	}			}

	bool runOnMachineFunction(MachineFunction &MF) override;			bool runOnMachineFunction(MachineFunction &MF) override;

	StringRef getPassName() const override { return "SI Lower i1 Copies"; }			StringRef getPassName() const override { return "SI Lower i1 Copies"; }

	void getAnalysisUsage(AnalysisUsage &AU) const override {			void getAnalysisUsage(AnalysisUsage &AU) const override {
	AU.setPreservesCFG();			AU.setPreservesCFG();
				AU.addRequired<MachineDominatorTree>();
				AU.addRequired<MachinePostDominatorTree>();
	MachineFunctionPass::getAnalysisUsage(AU);			MachineFunctionPass::getAnalysisUsage(AU);
	}			}

				private:
				void lowerCopiesFromI1();
				void lowerPhis();
				void lowerCopiesToI1();
				bool isConstantLaneMask(unsigned Reg, bool &Val) const;
				void buildMergeLaneMasks(MachineBasicBlock &MBB,
				MachineBasicBlock::iterator I, const DebugLoc &DL,
				unsigned DstReg, unsigned PrevReg, unsigned CurReg);
				MachineBasicBlock::iterator
				getSaluInsertionAtEnd(MachineBasicBlock &MBB) const;

				/// Returns true if \p Reg is a scalar register whose size in bits equals
				/// the subtarget's wavefront size.
				bool isLaneMaskReg(unsigned Reg) const {
				  const auto &RegInfo = TII->getRegisterInfo();
				  // Anything that is not an SGPR is rejected before its size is queried.
				  if (!RegInfo.isSGPRReg(*MRI, Reg))
				    return false;
				  return RegInfo.getRegSizeInBits(Reg, *MRI) == ST->getWavefrontSize();
				}
				};

				/// Helper class that determines the relationship between incoming values of a
				/// phi in the control flow graph to determine where an incoming value can
				/// simply be taken as a scalar lane mask as-is, and where it needs to be
				/// merged with another, previously defined lane mask.
				///
				/// The approach is as follows:
				/// - Determine all basic blocks which, starting from the incoming blocks,
				/// a wave may reach before entering the def block (the block containing the
				/// phi).
				/// - If an incoming block has no predecessors in this set, we can take the
				/// incoming value as a scalar lane mask as-is.
				/// -- A special case of this is when the def block has a self-loop.
				/// - Otherwise, the incoming value needs to be merged with a previously
				/// defined lane mask.
				/// - If there is a path into the set of reachable blocks that does _not_ go
				/// through an incoming block where we can take the scalar lane mask as-is,
				/// we need to invent an available value for the SSAUpdater. Choices are
				/// 0 and undef, with differing consequences for how to merge values etc.
				///
				/// TODO: We could use region analysis to quickly skip over SESE regions during
				/// the traversal.
				///
				class PhiIncomingAnalysis {
				MachinePostDominatorTree &PDT;

				// For each reachable basic block, whether it is a source in the induced
				// subgraph of the CFG.
				DenseMap<MachineBasicBlock *, bool> ReachableMap;
				SmallVector<MachineBasicBlock *, 4> ReachableOrdered;
				SmallVector<MachineBasicBlock *, 4> Stack;
				SmallVector<MachineBasicBlock *, 4> Predecessors;

				public:
				PhiIncomingAnalysis(MachinePostDominatorTree &PDT) : PDT(PDT) {}

				/// Returns whether \p MBB is a source in the induced subgraph of reachable
				/// blocks.
				bool isSource(MachineBasicBlock &MBB) const {
				return ReachableMap.find(&MBB)->second;
				}

				ArrayRef<MachineBasicBlock *> predecessors() const { return Predecessors; }

				void analyze(MachineBasicBlock &DefBlock,
				ArrayRef<MachineBasicBlock *> IncomingBlocks) {
				assert(Stack.empty());
				ReachableMap.clear();
				ReachableOrdered.clear();
				Predecessors.clear();

				// Insert the def block first, so that it acts as an end point for the
				// traversal.
				ReachableMap.try_emplace(&DefBlock, false);
				ReachableOrdered.push_back(&DefBlock);

				for (MachineBasicBlock *MBB : IncomingBlocks) {
				if (MBB == &DefBlock) {
				ReachableMap[&DefBlock] = true; // self-loop on DefBlock
				continue;
				}

				ReachableMap.try_emplace(MBB, false);
				ReachableOrdered.push_back(MBB);

				// If this block has a divergent terminator and the def block is its
				// post-dominator, the wave may first visit the other successors.
				bool Divergent = false;
				for (MachineInstr &MI : MBB->terminators()) {
				if (MI.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO \|\|
				MI.getOpcode() == AMDGPU::SI_IF \|\|
				MI.getOpcode() == AMDGPU::SI_ELSE \|\|
				MI.getOpcode() == AMDGPU::SI_LOOP) {
				Divergent = true;
				break;
				}
				}

				if (Divergent && PDT.dominates(&DefBlock, MBB)) {
				for (MachineBasicBlock *Succ : MBB->successors())
				Stack.push_back(Succ);
				}
				}

				while (!Stack.empty()) {
				MachineBasicBlock *MBB = Stack.pop_back_val();
				if (!ReachableMap.try_emplace(MBB, false).second)
				continue;
				ReachableOrdered.push_back(MBB);

				for (MachineBasicBlock *Succ : MBB->successors())
				Stack.push_back(Succ);
				}

				for (MachineBasicBlock *MBB : ReachableOrdered) {
				bool HaveReachablePred = false;
				for (MachineBasicBlock *Pred : MBB->predecessors()) {
				if (ReachableMap.count(Pred)) {
				HaveReachablePred = true;
				} else {
				Stack.push_back(Pred);
				}
				}
				if (!HaveReachablePred)
				ReachableMap[MBB] = true;
				if (HaveReachablePred) {
				for (MachineBasicBlock *UnreachablePred : Stack) {
				if (llvm::find(Predecessors, UnreachablePred) == Predecessors.end())
				Predecessors.push_back(UnreachablePred);
				}
				}
				Stack.clear();
				}
				}
				};

				/// Helper class that detects loops which require us to lower an i1 COPY into
				/// bitwise manipulation.
				///
				/// Unfortunately, we cannot use LoopInfo because LoopInfo does not distinguish
				/// between loops with the same header. Consider this example:
				///
				///   A-+-+
				///   | | |
				///   B-+ |
				///   |   |
				///   C---+
				///
				/// A is the header of a loop containing A, B, and C as far as LoopInfo is
				/// concerned. However, an i1 COPY in B that is used in C must be lowered to
				/// bitwise operations to combine results from different loop iterations when
				/// B has a divergent branch (since by default we will compile this code such
				/// that threads in a wave are merged at the entry of C).
				///
				/// The following rule is implemented to determine whether bitwise operations
				/// are required: use the bitwise lowering for a def in block B if a backward
				/// edge to B is reachable without going through the nearest common
				/// post-dominator of B and all uses of the def.
				///
				/// TODO: This rule is conservative because it does not check whether the
				/// relevant branches are actually divergent.
				///
				/// The class is designed to cache the CFG traversal so that it can be re-used
				/// for multiple defs within the same basic block.
				///
				/// TODO: We could use region analysis to quickly skip over SESE regions during
				/// the traversal.
				///
				class LoopFinder {
				MachineDominatorTree &DT;
				MachinePostDominatorTree &PDT;

				// All visited / reachable blocks, tagged by level (level 0 is the def block,
				// level 1 are all blocks reachable including but not going through the def
				// block's IPDOM, etc.).
				DenseMap<MachineBasicBlock *, unsigned> Visited;

				// Nearest common dominator of all visited blocks by level (level 0 is the
				// def block). Used for seeding the SSAUpdater.
				SmallVector<MachineBasicBlock *, 4> CommonDominators;

				// Post-dominator of all visited blocks.
				MachineBasicBlock *VisitedPostDom = nullptr;

				// Level at which a loop was found: 0 is not possible; 1 = a backward edge is
				// reachable without going through the IPDOM of the def block (if the IPDOM
				// itself has an edge to the def block, the loop level is 2), etc.
				unsigned FoundLoopLevel = ~0u;

				MachineBasicBlock *DefBlock = nullptr;
				SmallVector<MachineBasicBlock *, 4> Stack;
				SmallVector<MachineBasicBlock *, 4> NextLevel;

				public:
				LoopFinder(MachineDominatorTree &DT, MachinePostDominatorTree &PDT)
				: DT(DT), PDT(PDT) {}

				/// Resets all cached traversal state and makes \p MBB the def block for
				/// subsequent findLoop() queries.
				void initialize(MachineBasicBlock &MBB) {
				  DefBlock = &MBB;

				  // Forget everything that was computed for the previous def block.
				  VisitedPostDom = nullptr;
				  FoundLoopLevel = ~0u;
				  NextLevel.clear();
				  Stack.clear();
				  CommonDominators.clear();
				  Visited.clear();
				}

				/// Check whether a backward edge can be reached without going through the
				/// given \p PostDom of the def block.
				///
				/// Return the level of \p PostDom if a loop was found, or 0 otherwise.
				unsigned findLoop(MachineBasicBlock *PostDom) {
				  // The first query after initialize() has to explore level 0.
				  if (VisitedPostDom == nullptr)
				    advanceLevel();

				  // Climb the post-dominator tree from the def block towards PostDom,
				  // exploring a further level whenever the walk catches up with the
				  // post-dominator that bounds the blocks visited so far.
				  unsigned Depth = 0;
				  for (MachineDomTreeNode *Node = PDT.getNode(DefBlock);
				       Node->getBlock() != PostDom; Node = Node->getIDom()) {
				    if (Node->getBlock() == VisitedPostDom)
				      advanceLevel();

				    ++Depth;
				    if (Depth == FoundLoopLevel)
				      return Depth;
				  }

				  return 0;
				}

				/// Add undef values dominating the loop and the optionally given additional
				/// blocks, so that the SSA updater doesn't have to search all the way to the
				/// function entry.
				void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater,
				ArrayRef<MachineBasicBlock *> Blocks = {}) {
				assert(LoopLevel < CommonDominators.size());

				MachineBasicBlock *Dom = CommonDominators[LoopLevel];
				for (MachineBasicBlock *MBB : Blocks)
				Dom = DT.findNearestCommonDominator(Dom, MBB);

				if (!inLoopLevel(*Dom, LoopLevel, Blocks)) {
				SSAUpdater.AddAvailableValue(Dom, insertUndefLaneMask(*Dom));
				} else {
				// The dominator is part of the loop or the given blocks, so add the
				// undef value to unreachable predecessors instead.
				for (MachineBasicBlock *Pred : Dom->predecessors()) {
				if (!inLoopLevel(*Pred, LoopLevel, Blocks))
				SSAUpdater.AddAvailableValue(Pred, insertUndefLaneMask(*Pred));
				}
				}
				}

				private:
				/// Returns true if \p MBB was visited at a level no greater than
				/// \p LoopLevel, or if it is one of \p Blocks.
				bool inLoopLevel(MachineBasicBlock &MBB, unsigned LoopLevel,
				                 ArrayRef<MachineBasicBlock *> Blocks) const {
				  const auto VisitedIt = Visited.find(&MBB);
				  const bool VisitedInLevel =
				      VisitedIt != Visited.end() && VisitedIt->second <= LoopLevel;

				  return VisitedInLevel || llvm::find(Blocks, &MBB) != Blocks.end();
				}

				/// Explores one more level of the traversal. The first call after
				/// initialize() starts at the def block; each later call moves
				/// VisitedPostDom to its immediate post-dominator and resumes from the
				/// blocks that had been deferred in NextLevel. Every block popped from the
				/// work list is tagged with the current level in Visited, FoundLoopLevel is
				/// lowered when an edge back to the def block is seen, and the nearest
				/// common dominator of the blocks visited so far is appended to
				/// CommonDominators.
				void advanceLevel() {
				MachineBasicBlock *VisitedDom;

				if (!VisitedPostDom) {
				// First level: the traversal starts from the def block itself.
				VisitedPostDom = DefBlock;
				VisitedDom = DefBlock;
				Stack.push_back(DefBlock);
				} else {
				// Later levels: widen the bound to the next post-dominator and continue
				// from the common dominator recorded for the previous level.
				VisitedPostDom = PDT.getNode(VisitedPostDom)->getIDom()->getBlock();
				VisitedDom = CommonDominators.back();

				// Move every deferred block that the new bound post-dominates onto the
				// work list. Removal swaps in the last element, so the index only
				// advances when the current element is kept.
				for (unsigned i = 0; i < NextLevel.size();) {
				if (PDT.dominates(VisitedPostDom, NextLevel[i])) {
				Stack.push_back(NextLevel[i]);

				NextLevel[i] = NextLevel.back();
				NextLevel.pop_back();
				} else {
				i++;
				}
				}
				}

				// The level being explored equals the number of levels already recorded.
				unsigned Level = CommonDominators.size();
				while (!Stack.empty()) {
				MachineBasicBlock *MBB = Stack.pop_back_val();
				// Blocks not post-dominated by the current bound are queued for a later
				// level as well as being processed now.
				if (!PDT.dominates(VisitedPostDom, MBB))
				NextLevel.push_back(MBB);

				Visited[MBB] = Level;
				VisitedDom = DT.findNearestCommonDominator(VisitedDom, MBB);

				for (MachineBasicBlock *Succ : MBB->successors()) {
				if (Succ == DefBlock) {
				// An edge back to the def block. When it leaves the bounding
				// post-dominator itself it is attributed to the next level.
				if (MBB == VisitedPostDom)
				FoundLoopLevel = std::min(FoundLoopLevel, Level + 1);
				else
				FoundLoopLevel = std::min(FoundLoopLevel, Level);
				continue;
				}

				// try_emplace marks the successor as seen (with a placeholder level) so
				// that it is queued only once.
				if (Visited.try_emplace(Succ, ~0u).second) {
				// Successors of the bounding post-dominator are deferred to a later
				// level; all others are explored in this one.
				if (MBB == VisitedPostDom)
				NextLevel.push_back(Succ);
				else
				Stack.push_back(Succ);
				}
				}
				}

				CommonDominators.push_back(VisitedDom);
				}
	};			};

	} // End anonymous namespace.			} // End anonymous namespace.

	INITIALIZE_PASS(SILowerI1Copies, DEBUG_TYPE,			INITIALIZE_PASS_BEGIN(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
	"SI Lower i1 Copies", false, false)			false)
				INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
				INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
				INITIALIZE_PASS_END(SILowerI1Copies, DEBUG_TYPE, "SI Lower i1 Copies", false,
				false)

	char SILowerI1Copies::ID = 0;			char SILowerI1Copies::ID = 0;

	char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;			char &llvm::SILowerI1CopiesID = SILowerI1Copies::ID;

	FunctionPass *llvm::createSILowerI1CopiesPass() {			FunctionPass *llvm::createSILowerI1CopiesPass() {
	return new SILowerI1Copies();			return new SILowerI1Copies();
	}			}

	bool SILowerI1Copies::runOnMachineFunction(MachineFunction &MF) {			static unsigned createLaneMaskReg(MachineFunction &MF) {
	MachineRegisterInfo &MRI = MF.getRegInfo();			MachineRegisterInfo &MRI = MF.getRegInfo();
				return MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
				}

				static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
				MachineFunction &MF = *MBB.getParent();
	const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();			const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
	const SIInstrInfo *TII = ST.getInstrInfo();			const SIInstrInfo *TII = ST.getInstrInfo();
	const TargetRegisterInfo *TRI = &TII->getRegisterInfo();			unsigned UndefReg = createLaneMaskReg(MF);
				BuildMI(MBB, MBB.getFirstTerminator(), {}, TII->get(AMDGPU::IMPLICIT_DEF),
	std::vector<unsigned> I1Defs;			UndefReg);
				return UndefReg;
				}

	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();			/// Lower all instructions that def or use vreg_1 registers.
	BI != BE; ++BI) {			///
				/// In a first pass, we lower COPYs from vreg_1 to vector registers, as can
				/// occur around inline assembly. We do this first, before vreg_1 registers
				/// are changed to scalar mask registers.
				///
				/// Then we lower all defs of vreg_1 registers. Phi nodes are lowered before
				/// all others, because phi lowering looks through copies and can therefore
				/// often make copy lowering unnecessary.
				bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
				MF = &TheMF;
				MRI = &MF->getRegInfo();
				DT = &getAnalysis<MachineDominatorTree>();
				PDT = &getAnalysis<MachinePostDominatorTree>();

				ST = &MF->getSubtarget<GCNSubtarget>();
				TII = ST->getInstrInfo();

				lowerCopiesFromI1();
				lowerPhis();
				lowerCopiesToI1();

				for (unsigned Reg : ConstrainRegs)
				MRI->constrainRegClass(Reg, &AMDGPU::SReg_64_XEXECRegClass);
				ConstrainRegs.clear();

	MachineBasicBlock &MBB = *BI;			return true;
	MachineBasicBlock::iterator I, Next;
	for (I = MBB.begin(); I != MBB.end(); I = Next) {
	Next = std::next(I);
	MachineInstr &MI = *I;

	if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF) {
	unsigned Reg = MI.getOperand(0).getReg();
	const TargetRegisterClass *RC = MRI.getRegClass(Reg);
	if (RC == &AMDGPU::VReg_1RegClass)
	MRI.setRegClass(Reg, &AMDGPU::SReg_64RegClass);
	continue;
	}			}

				void SILowerI1Copies::lowerCopiesFromI1() {
				SmallVector<MachineInstr *, 4> DeadCopies;

				for (MachineBasicBlock &MBB : *MF) {
				for (MachineInstr &MI : MBB) {
	if (MI.getOpcode() != AMDGPU::COPY)			if (MI.getOpcode() != AMDGPU::COPY)
	continue;			continue;

	const MachineOperand &Dst = MI.getOperand(0);			unsigned DstReg = MI.getOperand(0).getReg();
	const MachineOperand &Src = MI.getOperand(1);			unsigned SrcReg = MI.getOperand(1).getReg();
				if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
	if (!TargetRegisterInfo::isVirtualRegister(Src.getReg()) ||			MRI->getRegClass(SrcReg) != &AMDGPU::VReg_1RegClass)
	!TargetRegisterInfo::isVirtualRegister(Dst.getReg()))
	continue;			continue;

	const TargetRegisterClass *DstRC = MRI.getRegClass(Dst.getReg());			if (isLaneMaskReg(DstReg) ||
	const TargetRegisterClass *SrcRC = MRI.getRegClass(Src.getReg());			(TargetRegisterInfo::isVirtualRegister(DstReg) &&
				MRI->getRegClass(DstReg) == &AMDGPU::VReg_1RegClass))
				continue;

				// Copy into a 32-bit vector register.
				LLVM_DEBUG(dbgs() << "Lower copy from i1: " << MI);
	DebugLoc DL = MI.getDebugLoc();			DebugLoc DL = MI.getDebugLoc();
	MachineInstr *DefInst = MRI.getUniqueVRegDef(Src.getReg());
	if (DstRC == &AMDGPU::VReg_1RegClass &&
	TRI->getCommonSubClass(SrcRC, &AMDGPU::SGPR_64RegClass)) {
	I1Defs.push_back(Dst.getReg());

	if (DefInst->getOpcode() == AMDGPU::S_MOV_B64) {			assert(TII->getRegisterInfo().getRegSizeInBits(DstReg, *MRI) == 32);
	if (DefInst->getOperand(1).isImm()) {			assert(!MI.getOperand(0).getSubReg());
	I1Defs.push_back(Dst.getReg());
				ConstrainRegs.insert(SrcReg);
				BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64), DstReg)
				.addImm(0)
				.addImm(-1)
				.addReg(SrcReg);
				DeadCopies.push_back(&MI);
				}

				for (MachineInstr *MI : DeadCopies)
				MI->eraseFromParent();
				DeadCopies.clear();
				}
				}

				void SILowerI1Copies::lowerPhis() {
				MachineSSAUpdater SSAUpdater(*MF);
				LoopFinder LF(*DT, *PDT);
				PhiIncomingAnalysis PIA(*PDT);
				SmallVector<MachineInstr *, 4> DeadPhis;
				SmallVector<MachineBasicBlock *, 4> IncomingBlocks;
				SmallVector<unsigned, 4> IncomingRegs;
				SmallVector<unsigned, 4> IncomingUpdated;

	int64_t Val = DefInst->getOperand(1).getImm();			for (MachineBasicBlock &MBB : *MF) {
	assert(Val == 0 || Val == -1);			LF.initialize(MBB);

	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_MOV_B32_e32))			for (MachineInstr &MI : MBB.phis()) {
	.add(Dst)			unsigned DstReg = MI.getOperand(0).getReg();
	.addImm(Val);			if (MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
	MI.eraseFromParent();
	continue;			continue;

				LLVM_DEBUG(dbgs() << "Lower PHI: " << MI);

				MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);

				// Collect incoming values.
				for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
				assert(i + 1 < MI.getNumOperands());
				unsigned IncomingReg = MI.getOperand(i).getReg();
				MachineBasicBlock *IncomingMBB = MI.getOperand(i + 1).getMBB();
				MachineInstr *IncomingDef = MRI->getUniqueVRegDef(IncomingReg);

				if (IncomingDef->getOpcode() == AMDGPU::COPY) {
				IncomingReg = IncomingDef->getOperand(1).getReg();
				assert(isLaneMaskReg(IncomingReg));
				assert(!IncomingDef->getOperand(1).getSubReg());
				} else if (IncomingDef->getOpcode() == AMDGPU::IMPLICIT_DEF) {
				continue;
				} else {
				assert(IncomingDef->isPHI());
				}

				IncomingBlocks.push_back(IncomingMBB);
				IncomingRegs.push_back(IncomingReg);
	}			}

				// Phis in a loop that are observed outside the loop receive a simple but
				// conservatively correct treatment.
				MachineBasicBlock *PostDomBound = &MBB;
				for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
				PostDomBound =
				PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
	}			}

	unsigned int TmpSrc = MRI.createVirtualRegister(&AMDGPU::SReg_64_XEXECRegClass);			unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::COPY), TmpSrc)
	.add(Src);			SSAUpdater.Initialize(DstReg);
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CNDMASK_B32_e64))
	.add(Dst)			if (FoundLoopLevel) {
	.addImm(0)			LF.addLoopEntries(FoundLoopLevel, SSAUpdater, IncomingBlocks);
	.addImm(-1)
	.addReg(TmpSrc);			for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
	MI.eraseFromParent();			IncomingUpdated.push_back(createLaneMaskReg(*MF));
	} else if (TRI->getCommonSubClass(DstRC, &AMDGPU::SGPR_64RegClass) &&			SSAUpdater.AddAvailableValue(IncomingBlocks[i],
	SrcRC == &AMDGPU::VReg_1RegClass) {			IncomingUpdated.back());
	if (DefInst->getOpcode() == AMDGPU::V_CNDMASK_B32_e64 &&			}
	DefInst->getOperand(1).isImm() && DefInst->getOperand(2).isImm() &&
	DefInst->getOperand(1).getImm() == 0 &&			for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
	DefInst->getOperand(2).getImm() != 0 &&			MachineBasicBlock &IMBB = *IncomingBlocks[i];
	DefInst->getOperand(3).isReg() &&			buildMergeLaneMasks(
	TargetRegisterInfo::isVirtualRegister(			IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
	DefInst->getOperand(3).getReg()) &&			SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
	TRI->getCommonSubClass(			}
	MRI.getRegClass(DefInst->getOperand(3).getReg()),			} else {
	&AMDGPU::SGPR_64RegClass) &&			// The phi is not observed from outside a loop. Use a more accurate
	AMDGPU::laneDominates(DefInst->getParent(), &MBB)) {			// lowering.
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::S_AND_B64))			PIA.analyze(MBB, IncomingBlocks);
	.add(Dst)
	.addReg(AMDGPU::EXEC)			for (MachineBasicBlock *MBB : PIA.predecessors())
	.add(DefInst->getOperand(3));			SSAUpdater.AddAvailableValue(MBB, insertUndefLaneMask(*MBB));

				for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
				MachineBasicBlock &IMBB = *IncomingBlocks[i];
				if (PIA.isSource(IMBB)) {
				IncomingUpdated.push_back(0);
				SSAUpdater.AddAvailableValue(&IMBB, IncomingRegs[i]);
	} else {			} else {
	BuildMI(MBB, &MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64))			IncomingUpdated.push_back(createLaneMaskReg(*MF));
	.add(Dst)			SSAUpdater.AddAvailableValue(&IMBB, IncomingUpdated.back());
	.add(Src)			}
				}

				for (unsigned i = 0; i < IncomingRegs.size(); ++i) {
				if (!IncomingUpdated[i])
				continue;

				MachineBasicBlock &IMBB = *IncomingBlocks[i];
				buildMergeLaneMasks(
				IMBB, getSaluInsertionAtEnd(IMBB), {}, IncomingUpdated[i],
				SSAUpdater.GetValueInMiddleOfBlock(&IMBB), IncomingRegs[i]);
				}
				}

				unsigned NewReg = SSAUpdater.GetValueInMiddleOfBlock(&MBB);
				if (NewReg != DstReg) {
				MRI->replaceRegWith(NewReg, DstReg);

				// Ensure that DstReg has a single def and mark the old PHI node for
				// deletion.
				MI.getOperand(0).setReg(NewReg);
				DeadPhis.push_back(&MI);
				}

				IncomingBlocks.clear();
				IncomingRegs.clear();
				IncomingUpdated.clear();
				}

				for (MachineInstr *MI : DeadPhis)
				MI->eraseFromParent();
				DeadPhis.clear();
				}
				}

				void SILowerI1Copies::lowerCopiesToI1() {
				MachineSSAUpdater SSAUpdater(*MF);
				LoopFinder LF(*DT, *PDT);
				SmallVector<MachineInstr *, 4> DeadCopies;

				for (MachineBasicBlock &MBB : *MF) {
				LF.initialize(MBB);

				for (MachineInstr &MI : MBB) {
				if (MI.getOpcode() != AMDGPU::IMPLICIT_DEF &&
				MI.getOpcode() != AMDGPU::COPY)
				continue;

				unsigned DstReg = MI.getOperand(0).getReg();
				if (!TargetRegisterInfo::isVirtualRegister(DstReg) ||
				MRI->getRegClass(DstReg) != &AMDGPU::VReg_1RegClass)
				continue;

				if (MRI->use_empty(DstReg)) {
				DeadCopies.push_back(&MI);
				continue;
				}

				LLVM_DEBUG(dbgs() << "Lower Other: " << MI);

				MRI->setRegClass(DstReg, &AMDGPU::SReg_64RegClass);
				if (MI.getOpcode() == AMDGPU::IMPLICIT_DEF)
				continue;

				DebugLoc DL = MI.getDebugLoc();
				unsigned SrcReg = MI.getOperand(1).getReg();
				assert(!MI.getOperand(1).getSubReg());

				if (!TargetRegisterInfo::isVirtualRegister(SrcReg) ||
				!isLaneMaskReg(SrcReg)) {
				assert(TII->getRegisterInfo().getRegSizeInBits(SrcReg, *MRI) == 32);
				unsigned TmpReg = createLaneMaskReg(*MF);
				BuildMI(MBB, MI, DL, TII->get(AMDGPU::V_CMP_NE_U32_e64), TmpReg)
				.addReg(SrcReg)
	.addImm(0);			.addImm(0);
				MI.getOperand(1).setReg(TmpReg);
				SrcReg = TmpReg;
	}			}
	MI.eraseFromParent();
				// Defs in a loop that are observed outside the loop must be transformed
				// into appropriate bit manipulation.
				MachineBasicBlock *PostDomBound = &MBB;
				for (MachineInstr &Use : MRI->use_instructions(DstReg)) {
				PostDomBound =
				PDT->findNearestCommonDominator(PostDomBound, Use.getParent());
				}

				unsigned FoundLoopLevel = LF.findLoop(PostDomBound);
				if (FoundLoopLevel) {
				SSAUpdater.Initialize(DstReg);
				SSAUpdater.AddAvailableValue(&MBB, DstReg);
				LF.addLoopEntries(FoundLoopLevel, SSAUpdater);

				buildMergeLaneMasks(MBB, MI, DL, DstReg,
				SSAUpdater.GetValueInMiddleOfBlock(&MBB), SrcReg);
				DeadCopies.push_back(&MI);
				}
				}

				for (MachineInstr *MI : DeadCopies)
				MI->eraseFromParent();
				DeadCopies.clear();
	}			}
	}			}

				bool SILowerI1Copies::isConstantLaneMask(unsigned Reg, bool &Val) const {
				const MachineInstr *MI;
				for (;;) {
				MI = MRI->getUniqueVRegDef(Reg);
				if (MI->getOpcode() != AMDGPU::COPY)
				break;

				Reg = MI->getOperand(1).getReg();
				if (!TargetRegisterInfo::isVirtualRegister(Reg))
				return false;
				if (!isLaneMaskReg(Reg))
				return false;
	}			}

	for (unsigned Reg : I1Defs)			if (MI->getOpcode() != AMDGPU::S_MOV_B64)
	MRI.setRegClass(Reg, &AMDGPU::VGPR_32RegClass);			return false;

				if (!MI->getOperand(1).isImm())
				return false;

				int64_t Imm = MI->getOperand(1).getImm();
				if (Imm == 0) {
				Val = false;
				return true;
				}
				if (Imm == -1) {
				Val = true;
				return true;
				}

	return false;			return false;
	}			}

				/// Scans every operand of \p MI for the SCC register. \p Use is set when SCC
				/// appears as a use operand and \p Def when it appears as any other kind of
				/// register operand; both are cleared first.
				static void instrDefsUsesSCC(const MachineInstr &MI, bool &Def, bool &Use) {
				  Def = false;
				  Use = false;

				  for (const MachineOperand &Op : MI.operands()) {
				    // Only register operands that name SCC are of interest.
				    if (!Op.isReg() || Op.getReg() != AMDGPU::SCC)
				      continue;

				    (Op.isUse() ? Use : Def) = true;
				  }
				}

				/// Return a point at the end of the given \p MBB to insert SALU instructions
				/// for lane mask calculation. Take terminators and SCC into account.
				MachineBasicBlock::iterator
				SILowerI1Copies::getSaluInsertionAtEnd(MachineBasicBlock &MBB) const {
				  auto InsertionPt = MBB.getFirstTerminator();

				  // Walk the terminators until one of them touches SCC. Only a use that is
				  // seen before any def forces the insertion point to move.
				  bool TerminatorsUseSCC = false;
				  for (auto I = InsertionPt, E = MBB.end(); I != E; ++I) {
				    bool DefsSCC;
				    instrDefsUsesSCC(*I, DefsSCC, TerminatorsUseSCC);
				    if (TerminatorsUseSCC || DefsSCC)
				      break;
				  }

				  if (!TerminatorsUseSCC)
				    return InsertionPt;

				  // A terminator reads SCC: search backwards for the closest instruction
				  // that defines SCC and return the position in front of it.
				  while (InsertionPt != MBB.begin()) {
				    --InsertionPt;

				    bool DefSCC, UseSCC;
				    instrDefsUsesSCC(*InsertionPt, DefSCC, UseSCC);
				    if (DefSCC)
				      return InsertionPt;
				  }

				  // We should have at least seen an IMPLICIT_DEF or COPY
				  llvm_unreachable("SCC used by terminator but no def in block");
				}

				/// Emits, before \p I in \p MBB, the instructions that define \p DstReg from
				/// the lane masks \p PrevReg and \p CurReg, using EXEC to select between
				/// them. Operands that isConstantLaneMask() recognises as all-zeros or
				/// all-ones are folded so that fewer instructions are emitted.
				///
				/// \param MBB     Block that receives the new instructions.
				/// \param I       Insertion point inside \p MBB.
				/// \param DL      Debug location attached to every emitted instruction.
				/// \param DstReg  Register defined by the final instruction.
				/// \param PrevReg Lane mask that is combined with EXEC via S_ANDN2_B64.
				/// \param CurReg  Lane mask that is combined with EXEC via S_AND_B64.
				void SILowerI1Copies::buildMergeLaneMasks(MachineBasicBlock &MBB,
				MachineBasicBlock::iterator I,
				const DebugLoc &DL, unsigned DstReg,
				unsigned PrevReg, unsigned CurReg) {
				bool PrevVal;
				bool PrevConstant = isConstantLaneMask(PrevReg, PrevVal);
				bool CurVal;
				bool CurConstant = isConstantLaneMask(CurReg, CurVal);

				// Both inputs are constants: a single instruction is enough.
				if (PrevConstant && CurConstant) {
				if (PrevVal == CurVal) {
				// Same constant on both sides: forward it unchanged.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(CurReg);
				} else if (CurVal) {
				// Prev is all-zeros and Cur is all-ones: the result is EXEC.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg).addReg(AMDGPU::EXEC);
				} else {
				// Prev is all-ones and Cur is all-zeros: the result is EXEC xor -1.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_XOR_B64), DstReg)
				.addReg(AMDGPU::EXEC)
				.addImm(-1);
				}
				return;
				}

				// A value of 0 in either variable below means that no register was
				// prepared for that side (the side is a constant).
				unsigned PrevMaskedReg = 0;
				unsigned CurMaskedReg = 0;
				if (!PrevConstant) {
				if (CurConstant && CurVal) {
				// Cur is all-ones: PrevReg is used without masking.
				PrevMaskedReg = PrevReg;
				} else {
				PrevMaskedReg = createLaneMaskReg(*MF);
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ANDN2_B64), PrevMaskedReg)
				.addReg(PrevReg)
				.addReg(AMDGPU::EXEC);
				}
				}
				if (!CurConstant) {
				// TODO: check whether CurReg is already masked by EXEC
				if (PrevConstant && PrevVal) {
				// Prev is all-ones: CurReg is used without masking.
				CurMaskedReg = CurReg;
				} else {
				CurMaskedReg = createLaneMaskReg(*MF);
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_AND_B64), CurMaskedReg)
				.addReg(CurReg)
				.addReg(AMDGPU::EXEC);
				}
				}

				// Combine the two sides. At most one of them is a constant here.
				if (PrevConstant && !PrevVal) {
				// Prev is all-zeros: only the masked current value remains.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
				.addReg(CurMaskedReg);
				} else if (CurConstant && !CurVal) {
				// Cur is all-zeros: only the masked previous value remains.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), DstReg)
				.addReg(PrevMaskedReg);
				} else if (PrevConstant && PrevVal) {
				// Prev is all-ones: combine the current value with EXEC via S_ORN2_B64.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ORN2_B64), DstReg)
				.addReg(CurMaskedReg)
				.addReg(AMDGPU::EXEC);
				} else {
				// General case. CurMaskedReg is still 0 when Cur is the all-ones
				// constant, in which case EXEC takes its place.
				BuildMI(MBB, I, DL, TII->get(AMDGPU::S_OR_B64), DstReg)
				.addReg(PrevMaskedReg)
				.addReg(CurMaskedReg ? CurMaskedReg : (unsigned)AMDGPU::EXEC);
				}
				}

llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h

	//===- AMDGPULaneDominator.h ------------------------------------- C++ --===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//

	#ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H
	#define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H

	namespace llvm {

	class MachineBasicBlock;

	namespace AMDGPU {

	/// Conservatively check whether \p MBBA lane-dominates \p MBBB.
	///
	/// The blocks are passed by pointer: MachineBasicBlock is only
	/// forward-declared here, and the implementation stores the arguments in a
	/// container of MachineBasicBlock pointers and compares them against
	/// successor pointers.
	///
	/// \returns true if the check succeeds, false otherwise.
	bool laneDominates(MachineBasicBlock *MBBA, MachineBasicBlock *MBBB);

	} // end namespace AMDGPU
	} // end namespace llvm

	#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPULANEDOMINATOR_H

llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp

	//===-- AMDGPULaneDominator.cpp - Determine Lane Dominators ---------------===//
	//
	// The LLVM Compiler Infrastructure
	//
	// This file is distributed under the University of Illinois Open Source
	// License. See LICENSE.TXT for details.
	//
	//===----------------------------------------------------------------------===//
	//
	// MBB A lane-dominates MBB B if
	// 1. A dominates B in the usual sense, i.e. every path from the entry to B
	// goes through A, and
	// 2. whenever B executes, every active lane during that execution of B was
	// also active during the most recent execution of A.
	//
	// The simplest example where A dominates B but does not lane-dominate it is
	// where A is a loop:
	//
	//     |
	//   +--+
	//   A  |
	//   +--+
	//     |
	//     B
	//
	// Unfortunately, the second condition is not fully captured by the control
	// flow graph when it is unstructured (as may happen when branch conditions are
	// uniform).
	//
	// The following replacement of the second condition is a conservative
	// approximation. It is an equivalent condition when the CFG is fully
	// structured:
	//
	// 2'. every cycle in the CFG that contains A also contains B.
	//
	//===----------------------------------------------------------------------===//

	#include "AMDGPULaneDominator.h"

	#include "llvm/ADT/DenseSet.h"
	#include "llvm/ADT/SmallVector.h"
	#include "llvm/CodeGen/MachineBasicBlock.h"

	namespace llvm {

	namespace AMDGPU {

	// Given machine basic blocks A and B where A dominates B, check whether
	// A lane-dominates B.
	//
	// The check is conservative, i.e. there can be false-negatives.
	bool laneDominates(MachineBasicBlock A, MachineBasicBlock B) {
	// Check whether A is reachable from itself without going through B.
	DenseSet<MachineBasicBlock *> Reachable;
	SmallVector<MachineBasicBlock *, 8> Stack;

	Stack.push_back(A);
	do {
	MachineBasicBlock *MBB = Stack.back();
	Stack.pop_back();

	for (MachineBasicBlock *Succ : MBB->successors()) {
	if (Succ == A)
	return false;
	if (Succ != B && Reachable.insert(Succ).second)
	Stack.push_back(Succ);
	}
	} while (!Stack.empty());

	return true;
	}

	} // namespace AMDGPU

	} // namespace llvm

llvm/trunk/lib/Target/AMDGPU/Utils/CMakeLists.txt

	add_llvm_library(LLVMAMDGPUUtils			add_llvm_library(LLVMAMDGPUUtils
	AMDGPUBaseInfo.cpp			AMDGPUBaseInfo.cpp
	AMDKernelCodeTUtils.cpp			AMDKernelCodeTUtils.cpp
	AMDGPUAsmUtils.cpp			AMDGPUAsmUtils.cpp
	AMDGPULaneDominator.cpp
	)			)

llvm/trunk/test/CodeGen/AMDGPU/add_i1.ll

	Show All 15 Lines
	define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {			define amdgpu_kernel void @add_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
	%a = load volatile i1, i1 addrspace(1)* %in			%a = load volatile i1, i1 addrspace(1)* %in
	%add = add i1 %a, 1			%add = add i1 %a, 1
	store i1 %add, i1 addrspace(1)* %out			store i1 %add, i1 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}add_i1_cf:			; GCN-LABEL: {{^}}add_i1_cf:
	; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}			; GCN: ; %endif
	; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc			; GCN: s_not_b64
	define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {			define amdgpu_kernel void @add_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
	entry:			entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%d_cmp = icmp ult i32 %tid, 16			%d_cmp = icmp ult i32 %tid, 16
	br i1 %d_cmp, label %if, label %else			br i1 %d_cmp, label %if, label %else

	if:			if:
	%0 = load volatile i1, i1 addrspace(1)* %a			%0 = load volatile i1, i1 addrspace(1)* %a
	Show All 14 Lines

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-from-loop.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -mtriple=amdgcn-- -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s

	; SI-LABEL: {{^}}i1_copy_from_loop:			; SI-LABEL: {{^}}i1_copy_from_loop:
	;			;
	; Cannot use an SGPR mask to copy %cc out of the loop, since the mask would
	; only contain the lanes that were active during the last loop iteration.
	;
	; SI: ; %for.body			; SI: ; %for.body
	; SI: v_cmp_gt_u32_e64 [[SREG:s\[[0-9]+:[0-9]+\]]], 4,			; SI: v_cmp_gt_u32_e64 [[CC_SREG:s\[[0-9]+:[0-9]+\]]], 4,
	; SI: v_cndmask_b32_e64 [[VREG:v[0-9]+]], 0, -1, [[SREG]]			; SI-DAG: s_andn2_b64 [[CC_ACCUM:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
	; SI-NEXT: s_cbranch_vccnz [[ENDIF:BB[0-9_]+]]			; SI-DAG: s_and_b64 [[CC_MASK:s\[[0-9]+:[0-9]+\]]], [[CC_SREG]], exec
	; SI: [[ENDIF]]:			; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], [[CC_MASK]]
	; SI-NOT: [[VREG]]
				; SI: ; %Flow1
				; SI: s_or_b64 [[CC_ACCUM]], [[CC_ACCUM]], exec

				; SI: ; %Flow
				; SI-DAG: s_andn2_b64 [[LCSSA_ACCUM:s\[[0-9]+:[0-9]+\]]], [[LCSSA_ACCUM]], exec
				; SI-DAG: s_and_b64 [[CC_MASK2:s\[[0-9]+:[0-9]+\]]], [[CC_ACCUM]], exec
				; SI: s_or_b64 [[LCSSA_ACCUM]], [[LCSSA_ACCUM]], [[CC_MASK2]]

	; SI: ; %for.end			; SI: ; %for.end
	; SI: v_cmp_ne_u32_e32 vcc, 0, [[VREG]]			; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[LCSSA_ACCUM]]

	define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {			define amdgpu_ps void @i1_copy_from_loop(<4 x i32> inreg %rsrc, i32 %tid) {
	entry:			entry:
	br label %for.body			br label %for.body

	for.body:			for.body:
	%i = phi i32 [0, %entry], [%i.inc, %end.loop]			%i = phi i32 [0, %entry], [%i.inc, %end.loop]
	%cc = icmp ult i32 %i, 4			%cc = icmp ult i32 %i, 4
	br i1 %cc, label %mid.loop, label %for.end			br i1 %cc, label %mid.loop, label %for.end
	Show All 26 Lines

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=GCN %s

				; GCN-LABEL: {{^}}test_dont_clobber_scc:

				; GCN: ; %entry
				; GCN: s_cmp_eq_u32 s0, 0
				; GCN: s_cbranch_scc1 [[PREEXIT:BB[0-9_]+]]

				; GCN: ; %blocka
				; GCN: s_xor_b64 s[{{[0-9:]+}}], exec, -1
				; GCN: s_cmp_eq_u32 s1, 0
				; GCN: s_cbranch_scc1 [[EXIT:BB[0-9_]+]]

				; GCN: [[PREEXIT]]:
				; GCN: [[EXIT]]:

				; Both branch conditions compare an inreg argument against zero, and %exit
				; merges an i1 phi that is true from %entry and false from %blocka and
				; %blockb.
				; NOTE(review): judging by the test name and the check lines above, the
				; point is that the lane mask code emitted for the phi must not clobber SCC
				; between the scalar compare and the scalar branch -- confirm.
				define amdgpu_vs float @test_dont_clobber_scc(i32 inreg %uni, i32 inreg %uni2) #0 {
				entry:
				%cc.uni = icmp eq i32 %uni, 0
				br i1 %cc.uni, label %exit, label %blocka

				blocka:
				call void asm sideeffect "; dummy a", ""()
				%cc.uni2 = icmp eq i32 %uni2, 0
				br i1 %cc.uni2, label %exit, label %blockb

				blockb:
				call void asm sideeffect "; dummy b", ""()
				br label %exit

				exit:
				%cc.phi = phi i1 [ true, %entry ], [ false, %blocka ], [ false, %blockb ]
				call void asm sideeffect "; dummy exit", ""()
				%r = select i1 %cc.phi, float 1.0, float 2.0
				ret float %r
				}

				attributes #0 = { nounwind }

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -march=amdgcn -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s
	; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s \| FileCheck -check-prefix=SI %s

	; SI-LABEL: {{^}}br_i1_phi:			; SI-LABEL: {{^}}br_i1_phi:
	; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
	; SI: s_and_saveexec_b64			; SI: ; %bb
	; SI: v_mov_b32_e32 [[REG]], -1{{$}}			; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], 0
	; SI: v_cmp_ne_u32_e32 vcc, 0, [[REG]]
	; SI: s_and_saveexec_b64			; SI: ; %bb2
	; SI: s_endpgm			; SI: s_mov_b64 [[TMP]], exec

				; SI: ; %bb3
				; SI: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[TMP]]

	define amdgpu_kernel void @br_i1_phi(i32 %arg) {			define amdgpu_kernel void @br_i1_phi(i32 %arg) {
	bb:			bb:
	%tidig = call i32 @llvm.amdgcn.workitem.id.x()			%tidig = call i32 @llvm.amdgcn.workitem.id.x()
	%cmp = trunc i32 %tidig to i1			%cmp = trunc i32 %tidig to i1
	br i1 %cmp, label %bb2, label %bb3			br i1 %cmp, label %bb2, label %bb3

	bb2: ; preds = %bb			bb2: ; preds = %bb
	br label %bb3			br label %bb3
	Show All 17 Lines

llvm/trunk/test/CodeGen/AMDGPU/inline-asm.ll

	Show First 20 Lines • Show All 192 Lines • ▼ Show 20 Lines
	; CHECK: use v[0:1]			; CHECK: use v[0:1]
	define amdgpu_kernel void @i64_imm_input_phys_vgpr() {			define amdgpu_kernel void @i64_imm_input_phys_vgpr() {
	entry:			entry:
	call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)			call void asm sideeffect "; use $0 ", "{v[0:1]}"(i64 123456)
	ret void			ret void
	}			}

	; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:			; CHECK-LABEL: {{^}}i1_imm_input_phys_vgpr:
	; CHECK: v_mov_b32_e32 v0, -1{{$}}			; CHECK: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], -1
				; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, [[MASK]]
	; CHECK: ; use v0			; CHECK: ; use v0
	define amdgpu_kernel void @i1_imm_input_phys_vgpr() {			define amdgpu_kernel void @i1_imm_input_phys_vgpr() {
	entry:			entry:
	call void asm sideeffect "; use $0 ", "{v0}"(i1 true)			call void asm sideeffect "; use $0 ", "{v0}"(i1 true)
	ret void			ret void
	}			}

	; CHECK-LABEL: {{^}}i1_input_phys_vgpr:			; CHECK-LABEL: {{^}}i1_input_phys_vgpr:
	; CHECK: {{buffer\|flat}}_load_ubyte [[LOAD:v[0-9]+]]			; CHECK: {{buffer\|flat}}_load_ubyte [[LOAD:v[0-9]+]]
	; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]]			; CHECK: v_and_b32_e32 [[LOAD]], 1, [[LOAD]]
	; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]			; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, [[LOAD]]
	; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc			; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
	; CHECK: ; use v0			; CHECK: ; use v0
				; CHECK: v_cmp_ne_u32_e32 vcc, 0, v1
				; CHECK: v_cndmask_b32_e64 [[STORE:v[0-9]+]], 0, 1, vcc
				; CHECK: {{buffer\|flat}}_store_byte [[STORE]],
	define amdgpu_kernel void @i1_input_phys_vgpr() {			define amdgpu_kernel void @i1_input_phys_vgpr() {
	entry:			entry:
	%val = load i1, i1 addrspace(1)* undef			%val = load i1, i1 addrspace(1)* undef
	call void asm sideeffect "; use $0 ", "{v0}"(i1 %val)			%cc = call i1 asm sideeffect "; use $1, def $0 ", "={v1}, {v0}"(i1 %val)
				store i1 %cc, i1 addrspace(1)* undef
	ret void			ret void
	}			}

	; FIXME: Should be scheduled to shrink vcc			; FIXME: Should be scheduled to shrink vcc
	; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:			; CHECK-LABEL: {{^}}i1_input_phys_vgpr_x2:
	; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0			; CHECK: v_cmp_eq_u32_e32 vcc, 1, v0
	; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc			; CHECK: v_cndmask_b32_e64 v0, 0, -1, vcc
	; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1			; CHECK: v_cmp_eq_u32_e32 vcc, 1, v1
	Show All 38 Lines

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,SI %s		; RUN: llc -amdgpu-scalarize-global-loads=false -march=amdgcn -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,SI %s
; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -strict-whitespace -check-prefixes=GCN,VI %s		; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s \| FileCheck -enable-var-scope -check-prefixes=GCN,VI %s

; FIXME: Enable for VI.		; FIXME: Enable for VI.

declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone		declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone		declare float @llvm.amdgcn.div.fmas.f32(float, float, float, i1) nounwind readnone
declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone		declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind readnone

; GCN-LABEL: {{^}}test_div_fmas_f32:		; GCN-LABEL: {{^}}test_div_fmas_f32:
▲ Show 20 Lines • Show All 128 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
%and = and i1 %cmp0, %cmp1		%and = and i1 %cmp0, %cmp1

%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone		%result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %and) nounwind readnone
store float %result, float addrspace(1)* %gep.out, align 4		store float %result, float addrspace(1)* %gep.out, align 4
ret void		ret void
}		}

; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:		; GCN-LABEL: {{^}}test_div_fmas_f32_i1_phi_vcc:
; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], vcc

; SI: buffer_load_dword [[LOAD:v[0-9]+]]		; SI: ; %entry
; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]		; SI: v_cmp_eq_u32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, {{v[0-9]+}}
; SI: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc		; SI: s_mov_b64 vcc, 0
		; SI: s_and_saveexec_b64 [[SAVE:s\[[0-9]+:[0-9]+\]]], [[CMP]]

		; SI: ; %bb
		; SI: buffer_load_dword [[LOAD:v[0-9]+]],
		; SI: v_cmp_ne_u32_e32 vcc, 0, [[LOAD]]
		; SI: s_and_b64 vcc, vcc, exec

; SI: BB9_2:		; SI: ; %exit
; SI: s_or_b64 exec, exec, [[SAVE]]		; SI: s_or_b64 exec, exec, [[SAVE]]
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}		; SI-NOT: vcc
; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}		; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
; SI: buffer_store_dword		; SI: buffer_store_dword
; SI: s_endpgm		; SI: s_endpgm

define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {		define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
entry:		entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone		%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
%gep.out = getelementptr float, float addrspace(1)* %out, i32 2		%gep.out = getelementptr float, float addrspace(1)* %out, i32 2
%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid		%gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1		%gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2		%gep.c = getelementptr float, float addrspace(1)* %gep.a, i32 2

Show All 18 Lines

llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll

	Show All 16 Lines
	; OPT: Flow:			; OPT: Flow:
	; OPT: call i64 @llvm.amdgcn.if.break(			; OPT: call i64 @llvm.amdgcn.if.break(
	; OPT: call i1 @llvm.amdgcn.loop(i64			; OPT: call i1 @llvm.amdgcn.loop(i64
	; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1			; OPT: br i1 %{{[0-9]+}}, label %bb9, label %bb1

	; OPT: bb9:			; OPT: bb9:
	; OPT: call void @llvm.amdgcn.end.cf(i64			; OPT: call void @llvm.amdgcn.end.cf(i64

	; TODO: Can remove exec fixes in return block
	; GCN-LABEL: {{^}}break_loop:			; GCN-LABEL: {{^}}break_loop:
	; GCN: s_mov_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}			; GCN: s_mov_b64 [[OUTER_MASK:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1			; GCN: [[LOOP_ENTRY:BB[0-9]+_[0-9]+]]: ; %bb1
	; GCN: v_cmp_lt_i32_e32 vcc, -1			; GCN: v_cmp_lt_i32_e32 vcc, -1
	; GCN: s_and_b64 vcc, exec, vcc			; GCN: s_and_b64 vcc, exec, vcc
				; GCN: s_or_b64 [[INNER_MASK:s\[[0-9]+:[0-9]+\]]], [[INNER_MASK]], exec
	; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]			; GCN: s_cbranch_vccnz [[FLOW:BB[0-9]+_[0-9]+]]

	; GCN: ; %bb.2: ; %bb4			; GCN: ; %bb4
	; GCN: buffer_load_dword			; GCN: buffer_load_dword
	; GCN: v_cmp_ge_i32_e32 vcc,			; GCN: v_cmp_ge_i32_e32 vcc,
				; GCN: s_andn2_b64 [[INNER_MASK]], [[INNER_MASK]], exec
	; GCN: [[FLOW]]:			; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
	; GCN: s_or_b64 [[MASK]], vcc, [[MASK]]			; GCN: s_or_b64 [[INNER_MASK]], [[INNER_MASK]], [[TMP0]]
	; GCN: s_andn2_b64 exec, exec, [[MASK]]
				; GCN: [[FLOW]]: ; %Flow
				; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[INNER_MASK]]
				; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[OUTER_MASK]]
				; GCN: s_mov_b64 [[OUTER_MASK]], [[TMP1]]
				; GCN: s_andn2_b64 exec, exec, [[TMP1]]
	; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]			; GCN-NEXT: s_cbranch_execnz [[LOOP_ENTRY]]

	; GCN: ; %bb.4: ; %bb9			; GCN: ; %bb.4: ; %bb9
	; GCN-NEXT: s_endpgm			; GCN-NEXT: s_endpgm
	define amdgpu_kernel void @break_loop(i32 %arg) #0 {			define amdgpu_kernel void @break_loop(i32 %arg) #0 {
	bb:			bb:
	%id = call i32 @llvm.amdgcn.workitem.id.x()			%id = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp = sub i32 %id, %arg			%tmp = sub i32 %id, %arg
	br label %bb1			br label %bb1
	▲ Show 20 Lines • Show All 281 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

	Show First 20 Lines • Show All 53 Lines • ▼ Show 20 Lines
	; IR: br label %Flow2			; IR: br label %Flow2

	; IR: UnifiedReturnBlock:			; IR: UnifiedReturnBlock:
	; IR: call void @llvm.amdgcn.end.cf(i64 %14)			; IR: call void @llvm.amdgcn.end.cf(i64 %14)
	; IR: ret void			; IR: ret void


	; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:			; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
	; GCN: v_cmp_lt_i32_e32 vcc, 1
				; GCN: s_mov_b64 [[EXIT1:s\[[0-9]+:[0-9]+\]]], 0
				; GCN: v_cmp_lt_i32_e32 vcc, 1,
				; GCN: s_mov_b64 [[EXIT0:s\[[0-9]+:[0-9]+\]]], 0
	; GCN: s_and_saveexec_b64			; GCN: s_and_saveexec_b64
	; GCN: s_xor_b64			; GCN: s_xor_b64

				; GCN: ; %LeafBlock1
				; GCN-NEXT: s_mov_b64 [[EXIT0]], exec
				; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 2,
				; GCN-NEXT: s_and_b64 [[EXIT1]], vcc, exec

				; GCN: ; %Flow
				; GCN-NEXT: s_or_saveexec_b64
				; GCN-NEXT: s_xor_b64

	; FIXME: Why is this compare essentially repeated?			; FIXME: Why is this compare essentially repeated?
	; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]			; GCN: ; %LeafBlock
	; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc			; GCN-DAG: v_cmp_eq_u32_e32 vcc, 1,
	; GCN: v_cmp_ne_u32_e32 vcc, 1, [[REG]]			; GCN-DAG: v_cmp_ne_u32_e64 [[TMP1:s\[[0-9]+:[0-9]+\]]], 1,
	; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc			; GCN-DAG: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec
				; GCN-DAG: s_andn2_b64 [[EXIT1]], [[EXIT1]], exec
				; GCN-DAG: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], vcc, exec
				; GCN-DAG: s_and_b64 [[TMP1]], [[TMP1]], exec
				; GCN-DAG: s_or_b64 [[EXIT0]], [[EXIT0]], [[TMP0]]
				; GCN-DAG: s_or_b64 [[EXIT1]], [[EXIT1]], [[TMP1]]

	; GCN: ; %Flow4			; GCN: ; %Flow4
	; GCN-NEXT: s_or_b64 exec, exec			; GCN-NEXT: s_or_b64 exec, exec,
	; GCN: v_cmp_ne_u32_e32 vcc, 0			; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT1]]
				; GCN-NEXT: s_xor_b64

	; GCN: ; %exit1			; GCN: ; %exit1
	; GCN: ds_write_b32			; GCN: ds_write_b32
				; GCN: s_andn2_b64 [[EXIT0]], [[EXIT0]], exec

	; GCN: %Flow5			; GCN: ; %Flow5
	; GCN-NEXT: s_or_b64 exec, exec			; GCN-NEXT: s_or_b64 exec, exec,
	; GCN: v_cmp_ne_u32_e32 vcc, 0			; GCN-NEXT: s_and_saveexec_b64 {{s\[[0-9]+:[0-9]+\]}}, [[EXIT0]]
	; GCN-NEXT: s_and_saveexec_b64

	; GCN: ; %exit0			; GCN: ; %exit0
	; GCN: buffer_store_dword			; GCN: buffer_store_dword

	; GCN: ; %UnifiedReturnBlock			; GCN: ; %UnifiedReturnBlock
	; GCN-NEXT: s_endpgm			; GCN-NEXT: s_endpgm
	define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {			define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
	entry:			entry:
	%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1			%tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
	%tmp1 = add i32 0, %tmp			%tmp1 = add i32 0, %tmp
	%tmp2 = zext i32 %tmp1 to i64			%tmp2 = zext i32 %tmp1 to i64
	▲ Show 20 Lines • Show All 614 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll

	Show All 15 Lines
	; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1			; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1
	; OPT-NEXT: call i1 @llvm.amdgcn.loop(i64			; OPT-NEXT: call i1 @llvm.amdgcn.loop(i64
	; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1			; OPT-NEXT: call i64 @llvm.amdgcn.if.break(i1
	;			;
	; OPT: Flow1:			; OPT: Flow1:

	; GCN-LABEL: {{^}}multi_else_break:			; GCN-LABEL: {{^}}multi_else_break:

				; GCN: ; %main_body
				; GCN: s_mov_b64 [[LEFT_OUTER:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}			; GCN: [[OUTER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP.outer{{$}}
				; GCN: s_mov_b64 [[LEFT_INNER:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}			; GCN: [[INNER_LOOP:BB[0-9]+_[0-9]+]]: ; %LOOP{{$}}
	; GCN: s_and_saveexec_b64 [[SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], vcc			; GCN: s_or_b64 [[BREAK_OUTER:s\[[0-9]+:[0-9]+\]]], [[BREAK_OUTER]], exec
				; GCN: s_or_b64 [[BREAK_INNER:s\[[0-9]+:[0-9]+\]]], [[BREAK_INNER]], exec
	; GCN: BB{{[0-9]+}}_{{[0-9]+}}: ; %Flow{{$}}			; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
	; GCN-NEXT: ; in Loop: Header=[[INNER_LOOP]] Depth=2
				; FIXME: duplicate comparison
				; GCN: ; %ENDIF
				; GCN-DAG: v_cmp_eq_u32_e32 vcc,
				; GCN-DAG: v_cmp_ne_u32_e64 [[TMP51NEG:s\[[0-9]+:[0-9]+\]]],
				; GCN-DAG: s_andn2_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], exec
				; GCN-DAG: s_andn2_b64 [[BREAK_INNER]], [[BREAK_INNER]], exec
				; GCN-DAG: s_and_b64 [[TMP_EQ:s\[[0-9]+:[0-9]+\]]], vcc, exec
				; GCN-DAG: s_and_b64 [[TMP_NE:s\[[0-9]+:[0-9]+\]]], [[TMP51NEG]], exec
				; GCN-DAG: s_or_b64 [[BREAK_OUTER]], [[BREAK_OUTER]], [[TMP_EQ]]
				; GCN-DAG: s_or_b64 [[BREAK_INNER]], [[BREAK_INNER]], [[TMP_NE]]

				; GCN: ; %Flow
				; GCN: s_or_b64 exec, exec, [[SAVE_EXEC]]
				; GCN: s_and_b64 [[TMP0:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_INNER]]
				; GCN: s_or_b64 [[TMP0]], [[TMP0]], [[LEFT_INNER]]
				; GCN: s_mov_b64 [[LEFT_INNER]], [[TMP0]]
				; GCN: s_andn2_b64 exec, exec, [[TMP0]]
				; GCN: s_cbranch_execnz [[INNER_LOOP]]

				; GCN: ; %Flow2
				; GCN: s_or_b64 exec, exec, [[TMP0]]
				; GCN: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[BREAK_OUTER]]
				; GCN: s_or_b64 [[TMP1]], [[TMP1]], [[LEFT_OUTER]]
				; GCN: s_mov_b64 [[LEFT_OUTER]], [[TMP1]]
				; GCN: s_andn2_b64 exec, exec, [[TMP1]]
				; GCN: s_cbranch_execnz [[OUTER_LOOP]]

	; Ensure extra or eliminated			; GCN: ; %IF
	; GCN-NEXT: s_or_b64 exec, exec, [[SAVE_BREAK]]			; GCN-NEXT: s_endpgm
	; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
	; GCN-NEXT: s_or_b64 [[OR_BREAK:s\[[0-9]+:[0-9]+\]]], vcc, s{{\[[0-9]+:[0-9]+\]}}
	; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
	; GCN-NEXT: v_mov_b32_e32
	; GCN-NEXT: s_andn2_b64 exec, exec, [[OR_BREAK]]
	; GCN-NEXT: s_cbranch_execnz [[INNER_LOOP]]

	; GCN: ; %bb.{{[0-9]+}}: ; %Flow2{{$}}
	; GCN-NEXT: ; in Loop: Header=[[OUTER_LOOP]] Depth=1

	; Ensure copy is eliminated
	; GCN-NEXT: s_or_b64 exec, exec, [[OR_BREAK]]
	; GCN-NEXT: s_and_b64 [[MASKED2_SAVE_BREAK:s\[[0-9]+:[0-9]+\]]], exec, vcc
	; GCN-NEXT: s_or_b64 [[OUTER_OR_BREAK:s\[[0-9]+:[0-9]+\]]], [[MASKED2_SAVE_BREAK]], s{{\[[0-9]+:[0-9]+\]}}
	; GCN-NEXT: s_mov_b64
	; GCN-NEXT: v_mov_b32_e32
	; GCN-NEXT: s_andn2_b64 exec, exec, [[OUTER_OR_BREAK]]
	; GCN-NEXT: s_cbranch_execnz [[OUTER_LOOP]]
	define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {			define amdgpu_vs void @multi_else_break(<4 x float> %vec, i32 %ub, i32 %cont) {
	main_body:			main_body:
	br label %LOOP.outer			br label %LOOP.outer

	LOOP.outer: ; preds = %ENDIF, %main_body			LOOP.outer: ; preds = %ENDIF, %main_body
	%tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]			%tmp43 = phi i32 [ 0, %main_body ], [ %tmp47, %ENDIF ]
	br label %LOOP			br label %LOOP

	Show All 13 Lines

	; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(			; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
	; OPT: llvm.amdgcn.if.break			; OPT: llvm.amdgcn.if.break
	; OPT: llvm.amdgcn.loop			; OPT: llvm.amdgcn.loop
	; OPT: llvm.amdgcn.if.break			; OPT: llvm.amdgcn.if.break
	; OPT: llvm.amdgcn.end.cf			; OPT: llvm.amdgcn.end.cf

	; GCN-LABEL: {{^}}multi_if_break_loop:			; GCN-LABEL: {{^}}multi_if_break_loop:
	; GCN: s_mov_b64 [[BREAK_REG:s\[[0-9]+:[0-9]+\]]], 0{{$}}			; GCN: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0{{$}}

	; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}			; GCN: [[LOOP:BB[0-9]+_[0-9]+]]: ; %bb1{{$}}
				; GCN: s_mov_b64 [[OLD_LEFT:s\[[0-9]+:[0-9]+\]]], [[LEFT]]

	; GCN: s_or_b64 [[BREAK_REG]], vcc, [[BREAK_REG]]			; GCN: ; %LeafBlock1
	; GCN: s_andn2_b64 exec, exec, [[BREAK_REG]]			; GCN: s_mov_b64
				; GCN: s_mov_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], -1{{$}}

				; GCN: ; %case1
				; GCN: buffer_load_dword [[LOAD2:v[0-9]+]],
				; GCN: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD2]]
				; GCN: s_orn2_b64 [[BREAK]], vcc, exec

				; GCN: ; %Flow3
				; GCN: s_branch [[FLOW:BB[0-9]+_[0-9]+]]

				; GCN: s_mov_b64 [[BREAK]], -1{{$}}

				; GCN: [[FLOW]]: ; %Flow

				; GCN: ; %case0
				; GCN: buffer_load_dword [[LOAD1:v[0-9]+]],
				; GCN-DAG: s_andn2_b64 [[BREAK]], [[BREAK]], exec
				; GCN-DAG: v_cmp_ge_i32_e32 vcc, {{v[0-9]+}}, [[LOAD1]]
				; GCN-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], vcc, exec
				; GCN: s_or_b64 [[BREAK]], [[BREAK]], [[TMP]]

				; GCN: ; %Flow4
				; GCN: s_and_b64 [[BREAK]], exec, [[BREAK]]
				; GCN: s_or_b64 [[LEFT]], [[BREAK]], [[OLD_LEFT]]
				; GCN: s_andn2_b64 exec, exec, [[LEFT]]
	; GCN-NEXT: s_cbranch_execnz			; GCN-NEXT: s_cbranch_execnz

	define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {			define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
	bb:			bb:
	%id = call i32 @llvm.amdgcn.workitem.id.x()			%id = call i32 @llvm.amdgcn.workitem.id.x()
	%tmp = sub i32 %id, %arg			%tmp = sub i32 %id, %arg
	br label %bb1			br label %bb1

	Show All 28 Lines

llvm/trunk/test/CodeGen/AMDGPU/select-opt.ll

Show First 20 Lines • Show All 131 Lines • ▼ Show 20 Lines	define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
store i64 %select, i64 addrspace(1)* %out		store i64 %select, i64 addrspace(1)* %out
ret void		ret void
}		}

; GCN-LABEL: {{^}}regression:		; GCN-LABEL: {{^}}regression:
; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0		; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0
; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}		; GCN: v_cmp_neq_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}		; GCN: v_cmp_eq_f32_e32 vcc, 0, v{{[0-9]+}}
; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}

define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {		define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
entry:		entry:
%cmp0 = fcmp oeq float %c0, 1.0		%cmp0 = fcmp oeq float %c0, 1.0
br i1 %cmp0, label %if0, label %endif		br i1 %cmp0, label %if0, label %endif

if0:		if0:
%cmp1 = fcmp oeq float %c1, 0.0		%cmp1 = fcmp oeq float %c1, 0.0
Show All 14 Lines

llvm/trunk/test/CodeGen/AMDGPU/sgpr-control-flow.ll

Show First 20 Lines • Show All 94 Lines • ▼ Show 20 Lines	else:
br label %endif		br label %endif

endif:		endif:
%tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]		%tmp4 = phi i32 [%tmp2, %if], [%tmp3, %else]
store i32 %tmp4, i32 addrspace(1)* %out		store i32 %tmp4, i32 addrspace(1)* %out
ret void		ret void
}		}

; FIXME: Should write to different SGPR pairs instead of copying to
; VALU for i1 phi.

; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:		; SI-LABEL: {{^}}sgpr_if_else_valu_cmp_phi_br:

		; SI: ; %else
; SI: buffer_load_dword [[AVAL:v[0-9]+]]		; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_gt_i32_e32 [[CMP_IF:vcc]], 0, [[AVAL]]		; SI: v_cmp_gt_i32_e64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP:v[0-9]+]], 0, -1, [[CMP_IF]]

; SI: BB{{[0-9]+}}_2:		; SI: ; %if
; SI: buffer_load_dword [[AVAL:v[0-9]+]]		; SI: buffer_load_dword [[AVAL:v[0-9]+]]
; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]		; SI: v_cmp_eq_u32_e32 [[CMP_ELSE:vcc]], 0, [[AVAL]]
; SI: v_cndmask_b32_e64 [[V_CMP]], 0, -1, [[CMP_ELSE]]		; SI-DAG: s_andn2_b64 [[PHI]], [[PHI]], exec
		; SI-DAG: s_and_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP_ELSE]], exec
; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]		; SI: s_or_b64 [[PHI]], [[PHI]], [[TMP]]
; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
; SI: buffer_store_dword [[RESULT]]		; SI: ; %endif
		; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[PHI]]
		; SI: buffer_store_dword [[RESULT]],
define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {		define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
entry:		entry:
%tid = call i32 @llvm.amdgcn.workitem.id.x() #0		%tid = call i32 @llvm.amdgcn.workitem.id.x() #0
%tmp1 = icmp eq i32 %tid, 0		%tmp1 = icmp eq i32 %tid, 0
br i1 %tmp1, label %if, label %else		br i1 %tmp1, label %if, label %else

if:		if:
%gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid		%gep.if = getelementptr i32, i32 addrspace(1)* %a, i32 %tid
Show All 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll

; RUN: llc < %s -march=amdgcn -mcpu=verde -asm-verbose=0 -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s		; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s
; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -asm-verbose=0 -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s		; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs \| FileCheck --check-prefix=SI --check-prefix=FUNC %s

; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:		; FUNC-LABEL: {{^}}break_inserted_outside_of_loop:

; SI: [[LOOP_LABEL:[A-Z0-9]+]]:		; SI: [[LOOP_LABEL:[A-Z0-9]+]]:
; Lowered break instruction:		; Lowered break instruction:
; SI: s_or_b64		; SI: s_or_b64
; Lowered Loop instruction:		; Lowered Loop instruction:
; SI: s_andn2_b64		; SI: s_andn2_b64
Show All 11 Lines	ENDLOOP:
ret void		ret void

ENDIF:		ENDIF:
br i1 %1, label %ENDLOOP, label %ENDIF		br i1 %1, label %ENDLOOP, label %ENDIF
}		}


; FUNC-LABEL: {{^}}phi_cond_outside_loop:		; FUNC-LABEL: {{^}}phi_cond_outside_loop:
; FIXME: This could be folded into the s_or_b64 instruction
; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0
; SI: [[LOOP_LABEL:[A-Z0-9]+]]
; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}

; SI_IF_BREAK instruction:		; SI: s_mov_b64 [[LEFT:s\[[0-9]+:[0-9]+\]]], 0
; SI: s_or_b64 [[BREAK:s\[[0-9]+:[0-9]+\]]], vcc, [[ZERO]]		; SI: s_mov_b64 [[PHI:s\[[0-9]+:[0-9]+\]]], 0

; SI_LOOP instruction:		; SI: ; %else
; SI: s_andn2_b64 exec, exec, [[BREAK]]		; SI: v_cmp_eq_u32_e64 [[TMP:s\[[0-9]+:[0-9]+\]]],
		; SI: s_and_b64 [[PHI]], [[TMP]], exec

		; SI: ; %endif

		; SI: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]: ; %loop
		; SI: s_mov_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[LEFT]]
		; SI: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]], exec, [[PHI]]
		; SI: s_or_b64 [[LEFT]], [[TMP1]], [[TMP]]
		; SI: s_andn2_b64 exec, exec, [[LEFT]]
; SI: s_cbranch_execnz [[LOOP_LABEL]]		; SI: s_cbranch_execnz [[LOOP_LABEL]]
; SI: s_endpgm		; SI: s_endpgm

define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {		define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
entry:		entry:
%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0		%tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
%0 = icmp eq i32 %tid , 0		%0 = icmp eq i32 %tid , 0
br i1 %0, label %if, label %else		br i1 %0, label %if, label %else

if:		if:
Show All 35 Lines	sw.epilog:
ret void		ret void
}		}

declare float @llvm.fabs.f32(float) nounwind readnone		declare float @llvm.fabs.f32(float) nounwind readnone

; This broke the old AMDIL cfg structurizer		; This broke the old AMDIL cfg structurizer
; FUNC-LABEL: {{^}}loop_land_info_assert:		; FUNC-LABEL: {{^}}loop_land_info_assert:
; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}		; SI: v_cmp_lt_i32_e64 [[CMP4:s\[[0-9:]+\]]], s{{[0-9]+}}, 4{{$}}
; SI: s_and_b64 vcc, exec, [[CMP4]]		; SI: s_and_b64 [[CMP4M:s\[[0-9]+:[0-9]+\]]], exec, [[CMP4]]
; SI-NEXT: s_cbranch_vccnz [[BR1:BB[0-9_]+]]		; SI: s_mov_b64 vcc, [[CMP4M]]
; SI-NEXT: s_branch [[BR2:BB[0-9_]+]]		; SI-NEXT: s_cbranch_vccnz [[CONVEX_EXIT:BB[0-9_]+]]
; SI-NEXT: BB{{[0-9_]+}}:		; SI-NEXT: s_branch [[FOR_COND_PREHDR:BB[0-9_]+]]
; SI-NEXT: buffer_store_dword
		; SI: ; %if.else
		; SI: buffer_store_dword

; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:		; SI: [[INFLOOP:BB[0-9]+_[0-9]+]]:

; SI: [[BR1]]:		; SI: [[CONVEX_EXIT]]:
; SI-NEXT: s_and_b64 vcc, exec,		; SI: s_mov_b64 vcc,
; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]		; SI-NEXT: s_cbranch_vccnz [[ENDPGM:BB[0-9]+_[0-9]+]]
; SI: s_branch [[INFLOOP]]		; SI: s_branch [[INFLOOP]]
; SI-NEXT: [[BR2]]:		; SI-NEXT: [[FOR_COND_PREHDR]]:
; SI: s_cbranch_vccz [[ENDPGM]]		; SI: s_cbranch_vccz [[ENDPGM]]

; SI: [[ENDPGM]]:		; SI: [[ENDPGM]]:
; SI-NEXT: s_endpgm		; SI-NEXT: s_endpgm
define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {		define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
entry:		entry:
%cmp = icmp sgt i32 %c0, 0		%cmp = icmp sgt i32 %c0, 0
br label %while.cond.outer		br label %while.cond.outer
▲ Show 20 Lines • Show All 43 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/AMDGPU/sub_i1.ll

	Show All 15 Lines
	define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {			define amdgpu_kernel void @sub_var_imm_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) {
	%a = load volatile i1, i1 addrspace(1)* %in			%a = load volatile i1, i1 addrspace(1)* %in
	%sub = sub i1 %a, 1			%sub = sub i1 %a, 1
	store i1 %sub, i1 addrspace(1)* %out			store i1 %sub, i1 addrspace(1)* %out
	ret void			ret void
	}			}

	; GCN-LABEL: {{^}}sub_i1_cf:			; GCN-LABEL: {{^}}sub_i1_cf:
	; GCN: v_cmp_ne_u32_e32 vcc, 0, {{v[0-9]+}}			; GCN: ; %endif
	; GCN-NEXT: s_not_b64 s{{\[[0-9]+:[0-9]+\]}}, vcc			; GCN: s_not_b64
	define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {			define amdgpu_kernel void @sub_i1_cf(i1 addrspace(1)* %out, i1 addrspace(1)* %a, i1 addrspace(1)* %b) {
	entry:			entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x()			%tid = call i32 @llvm.amdgcn.workitem.id.x()
	%d_cmp = icmp ult i32 %tid, 16			%d_cmp = icmp ult i32 %tid, 16
	br i1 %d_cmp, label %if, label %else			br i1 %d_cmp, label %if, label %else

	if:			if:
	%0 = load volatile i1, i1 addrspace(1)* %a			%0 = load volatile i1, i1 addrspace(1)* %a
	Show All 14 Lines

llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll

	; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s \| FileCheck -check-prefix=SI %s			; RUN: llc -march=amdgcn -verify-machineinstrs -enable-misched -asm-verbose < %s \| FileCheck -check-prefix=SI %s

	declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone			declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone

	; SI-LABEL: {{^}}test_if:			; SI-LABEL: {{^}}test_if:
	; Make sure the i1 values created by the cfg structurizer pass are			; Make sure the i1 values created by the cfg structurizer pass are
	; moved using VALU instructions			; moved using VALU instructions


	; waitcnt should be inserted after exec modification			; waitcnt should be inserted after exec modification
	; SI: v_cmp_lt_i32_e32 vcc, 0,			; SI: v_cmp_lt_i32_e32 vcc, 0,
	; SI: v_mov_b32_e32 {{v[0-9]+}}, 0			; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
				; SI-NEXT: s_mov_b64 {{s\[[0-9]+:[0-9]+\]}}, 0
	; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc			; SI-NEXT: s_and_saveexec_b64 [[SAVE1:s\[[0-9]+:[0-9]+\]]], vcc
	; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]			; SI-NEXT: s_xor_b64 [[SAVE2:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE1]]
	; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]			; SI-NEXT: ; mask branch [[FLOW_BB:BB[0-9]+_[0-9]+]]
	; SI-NEXT: s_cbranch_execz [[FLOW_BB]]			; SI-NEXT: s_cbranch_execz [[FLOW_BB]]

	; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3			; SI-NEXT: BB{{[0-9]+}}_1: ; %LeafBlock3
	; SI-NOT: s_mov_b64 s[{{[0-9]:[0-9]}}], -1			; SI: s_mov_b64 s[{{[0-9]:[0-9]}}], -1
	; SI: v_mov_b32_e32 v{{[0-9]}}, -1
	; SI: s_and_saveexec_b64			; SI: s_and_saveexec_b64
	; SI-NEXT: ; mask branch			; SI-NEXT: ; mask branch

	; v_mov should be after exec modification			; v_mov should be after exec modification
	; SI: [[FLOW_BB]]:			; SI: [[FLOW_BB]]:
	; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]			; SI-NEXT: s_or_saveexec_b64 [[SAVE3:s\[[0-9]+:[0-9]+\]]], [[SAVE2]]
	; SI-NEXT: v_mov_b32_e32 v{{[0-9]+}}
	; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]			; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
	; SI-NEXT: ; mask branch			; SI-NEXT: ; mask branch
	;			;
	define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {			define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
	entry:			entry:
	%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone			%tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
	switch i32 %tid, label %default [			switch i32 %tid, label %default [
	i32 0, label %case0			i32 0, label %case0
	▲ Show 20 Lines • Show All 179 Lines • ▼ Show 20 Lines
	; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]			; SI: s_cbranch_execz [[LABEL_FLOW:BB[0-9]+_[0-9]+]]

	; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20			; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
	; SI: buffer_store_dword			; SI: buffer_store_dword

	; SI: [[LABEL_FLOW]]:			; SI: [[LABEL_FLOW]]:
	; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]			; SI-NEXT: ; in Loop: Header=[[LABEL_LOOP]]
	; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]			; SI-NEXT: s_or_b64 exec, exec, [[ORNEG2]]
	; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10			; SI-NEXT: s_and_b64 [[TMP1:s\[[0-9]+:[0-9]+\]]],
	; SI-NEXT: s_or_b64 [[COND_STATE]], vcc, [[COND_STATE]]			; SI-NEXT: s_or_b64 [[TMP2:s\[[0-9]+:[0-9]+\]]], [[TMP1]], [[COND_STATE]]
	; SI-NEXT: s_andn2_b64 exec, exec, [[COND_STATE]]			; SI-NEXT: s_mov_b64 [[COND_STATE]], [[TMP2]]
				; SI-NEXT: s_andn2_b64 exec, exec, [[TMP2]]
	; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]			; SI-NEXT: s_cbranch_execnz [[LABEL_LOOP]]

	; SI: [[LABEL_EXIT]]:			; SI: [[LABEL_EXIT]]:
	; SI-NOT: [[COND_STATE]]			; SI-NOT: [[COND_STATE]]
	; SI: s_endpgm			; SI: s_endpgm

	define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {			define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
	bb:			bb:
	Show All 34 Lines

llvm/trunk/test/CodeGen/AMDGPU/waitcnt-looptest.ll

	; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 \| FileCheck --check-prefix=GCN %s			; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 \| FileCheck --check-prefix=GCN %s

	; Check that the waitcnt insertion algorithm correctly propagates wait counts			; Check that the waitcnt insertion algorithm correctly propagates wait counts
	; from before a loop to the loop header.			; from before a loop to the loop header.

	; GCN-LABEL: {{^}}testKernel			; GCN-LABEL: {{^}}testKernel
	; GCN: BB0_1:			; GCN: BB0_1:
	; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cmp_eq_f32_e64			; GCN-NEXT: v_cmp_eq_f32_e32
	; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cmp_eq_f32_e32			; GCN-NEXT: v_cmp_eq_f32_e32
	; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)			; GCN: s_waitcnt vmcnt(0) lgkmcnt(0)
	; GCN-NEXT: v_cmp_eq_f32_e32			; GCN-NEXT: v_cmp_eq_f32_e32

	@data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 
0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4			@data_generic = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 
0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4
	@data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 
0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4			@data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 
0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4

	▲ Show 20 Lines • Show All 129 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 171905

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp

llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h

llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp

llvm/trunk/lib/Target/AMDGPU/Utils/CMakeLists.txt

llvm/trunk/test/CodeGen/AMDGPU/add_i1.ll

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-from-loop.ll

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll

llvm/trunk/test/CodeGen/AMDGPU/inline-asm.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll

llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll

llvm/trunk/test/CodeGen/AMDGPU/select-opt.ll

llvm/trunk/test/CodeGen/AMDGPU/sgpr-control-flow.ll

llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll

llvm/trunk/test/CodeGen/AMDGPU/sub_i1.ll

llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll

llvm/trunk/test/CodeGen/AMDGPU/waitcnt-looptest.ll

This is an archive of the discontinued LLVM Phabricator instance.

AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 171905

llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

llvm/trunk/lib/Target/AMDGPU/SIFixSGPRCopies.cpp

llvm/trunk/lib/Target/AMDGPU/SILowerI1Copies.cpp

llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.h

llvm/trunk/lib/Target/AMDGPU/Utils/AMDGPULaneDominator.cpp

llvm/trunk/lib/Target/AMDGPU/Utils/CMakeLists.txt

llvm/trunk/test/CodeGen/AMDGPU/add_i1.ll

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-from-loop.ll

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi-uniform-branch.ll

llvm/trunk/test/CodeGen/AMDGPU/i1-copy-phi.ll

llvm/trunk/test/CodeGen/AMDGPU/inline-asm.ll

llvm/trunk/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll

llvm/trunk/test/CodeGen/AMDGPU/loop_break.ll

llvm/trunk/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll

llvm/trunk/test/CodeGen/AMDGPU/multilevel-break.ll

llvm/trunk/test/CodeGen/AMDGPU/select-opt.ll

llvm/trunk/test/CodeGen/AMDGPU/sgpr-control-flow.ll

llvm/trunk/test/CodeGen/AMDGPU/si-annotate-cf.ll

llvm/trunk/test/CodeGen/AMDGPU/sub_i1.ll

llvm/trunk/test/CodeGen/AMDGPU/valu-i1.ll

llvm/trunk/test/CodeGen/AMDGPU/waitcnt-looptest.ll

AMDGPU: Rewrite SILowerI1Copies to always stay on SALU
ClosedPublic