Diff 108720

include/llvm/IR/IntrinsicsAMDGPU.td

	Show First 20 Lines • Show All 741 Lines • ▼ Show 20 Lines


	// Copies the source value to the destination value, with the guarantee that			// Copies the source value to the destination value, with the guarantee that
	// the source value is computed as if the entire program were executed in WQM.			// the source value is computed as if the entire program were executed in WQM.
	def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],			def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
	[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]			[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;			>;

				// Copies the active channels of the source value to the destination value,
				// with the guarantee that the source value is computed as if the entire
				// program were executed in Whole Wavefront Mode, i.e. with all channels
				// enabled, with a few exceptions: - Phi nodes which require WWM return an
				// undefined value.
				def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
				[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
				>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// CI+ Intrinsics			// CI+ Intrinsics
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def int_amdgcn_s_dcache_inv_vol :			def int_amdgcn_s_dcache_inv_vol :
	GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,			GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
	Intrinsic<[], [], []>;			Intrinsic<[], [], []>;

	▲ Show 20 Lines • Show All 77 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPU.h

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	FunctionPass *createSIShrinkInstructionsPass();			FunctionPass *createSIShrinkInstructionsPass();
	FunctionPass *createSILoadStoreOptimizerPass();			FunctionPass *createSILoadStoreOptimizerPass();
	FunctionPass *createSIWholeQuadModePass();			FunctionPass *createSIWholeQuadModePass();
	FunctionPass *createSIFixControlFlowLiveIntervalsPass();			FunctionPass *createSIFixControlFlowLiveIntervalsPass();
	FunctionPass *createSIFixSGPRCopiesPass();			FunctionPass *createSIFixSGPRCopiesPass();
	FunctionPass *createSIDebuggerInsertNopsPass();			FunctionPass *createSIDebuggerInsertNopsPass();
	FunctionPass *createSIInsertWaitsPass();			FunctionPass *createSIInsertWaitsPass();
	FunctionPass *createSIInsertWaitcntsPass();			FunctionPass *createSIInsertWaitcntsPass();
				FunctionPass *createSIFixWWMLivenessPass();
	FunctionPass *createAMDGPUCodeGenPreparePass();			FunctionPass *createAMDGPUCodeGenPreparePass();
	FunctionPass *createAMDGPUMachineCFGStructurizerPass();			FunctionPass *createAMDGPUMachineCFGStructurizerPass();

	void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);			void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
	extern char &AMDGPUMachineCFGStructurizerID;			extern char &AMDGPUMachineCFGStructurizerID;

	void initializeAMDGPUAlwaysInlinePass(PassRegistry&);			void initializeAMDGPUAlwaysInlinePass(PassRegistry&);

	Show All 33 Lines
	extern char &SILowerControlFlowID;			extern char &SILowerControlFlowID;

	void initializeSIInsertSkipsPass(PassRegistry &);			void initializeSIInsertSkipsPass(PassRegistry &);
	extern char &SIInsertSkipsPassID;			extern char &SIInsertSkipsPassID;

	void initializeSIOptimizeExecMaskingPass(PassRegistry &);			void initializeSIOptimizeExecMaskingPass(PassRegistry &);
	extern char &SIOptimizeExecMaskingID;			extern char &SIOptimizeExecMaskingID;

				void initializeSIFixWWMLivenessPass(PassRegistry &);
				extern char &SIFixWWMLivenessID;

	// Passes common to R600 and SI			// Passes common to R600 and SI
	FunctionPass *createAMDGPUPromoteAlloca();			FunctionPass *createAMDGPUPromoteAlloca();
	void initializeAMDGPUPromoteAllocaPass(PassRegistry&);			void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
	extern char &AMDGPUPromoteAllocaID;			extern char &AMDGPUPromoteAllocaID;

	Pass *createAMDGPUStructurizeCFGPass();			Pass *createAMDGPUStructurizeCFGPass();
	FunctionPass *createAMDGPUISelDag(TargetMachine &TM,			FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
	CodeGenOpt::Level OptLevel);			CodeGenOpt::Level OptLevel);
	▲ Show 20 Lines • Show All 106 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 145 Lines • ▼ Show 20 Lines	extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowPass(*PR);		initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);		initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);		initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);		initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);		initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);		initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);		initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);		initializeSIOptimizeExecMaskingPass(*PR);
		initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);		initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);		initializeAMDGPUAAWrapperPassPass(*PR);
}		}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {		static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<AMDGPUTargetObjectFile>();		return llvm::make_unique<AMDGPUTargetObjectFile>();
}		}

▲ Show 20 Lines • Show All 595 Lines • ▼ Show 20 Lines	void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// FIXME: We have to disable the verifier here because of PHIElimination +		// FIXME: We have to disable the verifier here because of PHIElimination +
// TwoAddressInstructions disabling it.		// TwoAddressInstructions disabling it.

// This must be run immediately after phi elimination and before		// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of		// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.		// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);		insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

		// This must be run after SILowerControlFlow, since it needs to use the
		// machine-level CFG, but before register allocation.
		insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

TargetPassConfig::addFastRegAlloc(RegAllocPass);		TargetPassConfig::addFastRegAlloc(RegAllocPass);
}		}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {		void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// This needs to be run directly before register allocation because earlier		// This needs to be run directly before register allocation because earlier
// passes might recompute live intervals.		// passes might recompute live intervals.
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);		insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);

// This must be run immediately after phi elimination and before		// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of		// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.		// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);		insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

		// This must be run after SILowerControlFlow, since it needs to use the
		// machine-level CFG, but before register allocation.
		insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);		TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}		}

void GCNPassConfig::addPostRegAlloc() {		void GCNPassConfig::addPostRegAlloc() {
addPass(&SIFixVGPRCopiesID);		addPass(&SIFixVGPRCopiesID);
addPass(&SIOptimizeExecMaskingID);		addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();		TargetPassConfig::addPostRegAlloc();
}		}
Show All 29 Lines

lib/Target/AMDGPU/CMakeLists.txt

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp		R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp		R600Packetizer.cpp
R600RegisterInfo.cpp		R600RegisterInfo.cpp
SIAnnotateControlFlow.cpp		SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp		SIDebuggerInsertNops.cpp
SIFixControlFlowLiveIntervals.cpp		SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp		SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp		SIFixVGPRCopies.cpp
		SIFixWWMLiveness.cpp
SIFoldOperands.cpp		SIFoldOperands.cpp
SIFrameLowering.cpp		SIFrameLowering.cpp
SIInsertSkips.cpp		SIInsertSkips.cpp
SIInsertWaits.cpp		SIInsertWaits.cpp
SIInsertWaitcnts.cpp		SIInsertWaitcnts.cpp
SIInstrInfo.cpp		SIInstrInfo.cpp
SIISelLowering.cpp		SIISelLowering.cpp
SILoadStoreOptimizer.cpp		SILoadStoreOptimizer.cpp
Show All 21 Lines

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Show First 20 Lines • Show All 562 Lines • ▼ Show 20 Lines	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {		I != E; ++I) {
MachineInstr &MI = *I;		MachineInstr &MI = *I;

switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
default:		default:
continue;		continue;
case AMDGPU::COPY:		case AMDGPU::COPY:
case AMDGPU::WQM: {		case AMDGPU::WQM:
		case AMDGPU::WWM: {
// If the destination register is a physical register there isn't really		// If the destination register is a physical register there isn't really
// much we can do to fix this.		// much we can do to fix this.
if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))		if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
continue;		continue;

const TargetRegisterClass SrcRC, DstRC;		const TargetRegisterClass SrcRC, DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);		std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {		if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
▲ Show 20 Lines • Show All 116 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIFixWWMLiveness.cpp

This file was added.

				//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				/// \file
				/// \brief Computations in WWM can overwrite values in inactive channels for
				/// variables that the register allocator thinks are dead. This pass adds fake
				/// uses of those variables to WWM instructions to make sure that they aren't
				/// overwritten.
				///
				/// As an example, consider this snippet:
				/// %vgpr0 = V_MOV_B32_e32 0.0
				/// if (...) {
				/// %vgpr1 = ...
				/// %vgpr2 = WWM %vgpr1<kill>
				/// ... = %vgpr2<kill>
				/// %vgpr0 = V_MOV_B32_e32 1.0
				/// }
				/// ... = %vgpr0
				///
				/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
				/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
				/// writing %vgpr1 would only write to channels that would be clobbered by the
				/// second write to %vgpr0 anyways. But if %vgpr1 is written with WWM enabled,
				/// it would clobber even the inactive channels for which the if-condition is
				/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
				/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
				/// same register.
				///
				/// In general, we need to figure out which registers might have inactive
				/// channels that are used later but get accidentally clobbered by a WWM
				/// instruction. We approximate this using two conditions:
				///
				/// 1. A definition of the variable reaches the WWM instruction.
				/// 2. The variable would be live at the WWM instruction if all its defs were
				/// partial defs (i.e. considered as a use), ignoring normal uses.
				///
				/// If a register matches both conditions, then we add an implicit use of it to
				/// the WWM instruction. Condition #2 is the heart of the matter: every
				/// definition is really a partial definition, since every VALU instruction is
				/// implicitly predicated. We can usually ignore this, but WWM forces us not
				/// to. Condition #1 prevents false positives if the variable is undefined at
				/// the WWM instruction anyways. This is overly conservative in certain cases,
				/// especially in uniform control flow, but this is a workaround anyways until
				/// LLVM gains the notion of predicated uses and definitions of variables.
				///
				//===----------------------------------------------------------------------===//

				#include "AMDGPU.h"
				#include "AMDGPUSubtarget.h"
				#include "SIInstrInfo.h"
				#include "SIRegisterInfo.h"
				#include "llvm/ADT/DepthFirstIterator.h"
				#include "llvm/ADT/SparseBitVector.h"
				#include "llvm/CodeGen/LiveIntervalAnalysis.h"
				#include "llvm/CodeGen/MachineFunctionPass.h"
				#include "llvm/CodeGen/Passes.h"
				#include "llvm/Target/TargetRegisterInfo.h"

				using namespace llvm;

				#define DEBUG_TYPE "si-fix-wwm-liveness"

				namespace {

				/// Machine function pass that visits every EXIT_WWM instruction and adds
				/// implicit register uses to it (see runOnWWMInstruction), so that VGPRs whose
				/// inactive channels may still be needed are kept live across the WWM region.
				class SIFixWWMLiveness : public MachineFunctionPass {
				private:
				  // All three pointers are (re)assigned at the top of runOnMachineFunction.
				  // LIS stays null when the LiveIntervals analysis is not available.
				  LiveIntervals *LIS = nullptr;
				  const SIRegisterInfo *TRI = nullptr;
				  MachineRegisterInfo *MRI = nullptr;

				public:
				  static char ID;

				  SIFixWWMLiveness() : MachineFunctionPass(ID) {
				    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
				  }

				  /// Runs the fix-up on every EXIT_WWM instruction in \p MF.
				  /// \returns true if any instruction was modified.
				  bool runOnMachineFunction(MachineFunction &MF) override;

				  /// Adds the required implicit uses to a single EXIT_WWM instruction.
				  /// \returns true if at least one operand was added.
				  bool runOnWWMInstruction(MachineInstr &MI);

				  /// Records in \p Regs every VGPR that appears as an explicit register def
				  /// of \p MI.
				  void addDefs(const MachineInstr &MI, SparseBitVector<> &Regs);

				  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }

				  void getAnalysisUsage(AnalysisUsage &AU) const override {
				    // Should preserve the same set that TwoAddressInstructions does.
				    AU.addPreserved<SlotIndexes>();
				    AU.addPreserved<LiveIntervals>();
				    AU.addPreservedID(LiveVariablesID);
				    AU.addPreservedID(MachineLoopInfoID);
				    AU.addPreservedID(MachineDominatorsID);
				    AU.setPreservesCFG();
				    MachineFunctionPass::getAnalysisUsage(AU);
				  }
				};

				} // End anonymous namespace.

				// Register the pass under the DEBUG_TYPE command-line name. The two trailing
				// 'false' arguments are the macro's CFG-only and is-analysis flags.
				INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
				"SI fix WWM liveness", false, false)

				// The address of ID (not its value) is what identifies the pass.
				char SIFixWWMLiveness::ID = 0;

				// Exported handle used by the target pass configuration (insertPass) to refer
				// to this pass.
				char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;

				/// Factory for the pass; returns a newly allocated SIFixWWMLiveness instance.
				FunctionPass *llvm::createSIFixWWMLivenessPass() {
				return new SIFixWWMLiveness();
				}

				/// Sets the bit in \p Regs for each explicit def operand of \p MI that is a
				/// register classified as a VGPR by the register info.
				void SIFixWWMLiveness::addDefs(const MachineInstr &MI,
				                               SparseBitVector<> &Regs) {
				  for (const MachineOperand &Def : MI.defs()) {
				    // Skip anything that is not a register operand.
				    if (!Def.isReg())
				      continue;

				    const unsigned DefReg = Def.getReg();
				    if (!TRI->isVGPR(*MRI, DefReg))
				      continue;

				    Regs.set(DefReg);
				  }
				}

				/// Adds an implicit use to \p WWM for every VGPR that is defined both
				/// "after" and "before" it, as approximated by two CFG walks:
				///
				///  - LiveOut: VGPR defs from \p WWM (inclusive) to the end of its block, plus
				///    all defs in the blocks visited by a depth-first walk over successors.
				///  - Reachable: VGPR defs preceding \p WWM in its block, plus all defs in the
				///    blocks visited by an inverse depth-first walk over predecessors.
				///
				/// In both walks the initial increment skips the block containing \p WWM.
				/// NOTE(review): that block is therefore never rescanned in full, even if it
				/// lies on a cycle -- confirm this is the intended approximation.
				///
				/// \returns true if at least one implicit use was added.
				bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
				  MachineBasicBlock *MBB = WWM.getParent();

				  // Compute the registers that are live out of WWM by figuring out which defs
				  // are reachable from WWM.
				  SparseBitVector<> LiveOut;

				  for (auto II = MachineBasicBlock::iterator(WWM), IE = MBB->end(); II != IE;
				       ++II) {
				    addDefs(*II, LiveOut);
				  }

				  for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB), E = df_end(MBB);
				       I != E; ++I) {
				    for (const MachineInstr &MI : **I) {
				      addDefs(MI, LiveOut);
				    }
				  }

				  // Compute the registers whose definitions reach WWM.
				  SparseBitVector<> Reachable;

				  for (auto II = ++MachineBasicBlock::reverse_iterator(WWM), IE = MBB->rend();
				       II != IE; ++II) {
				    addDefs(*II, Reachable);
				  }

				  for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
				                                         E = idf_end(MBB);
				       I != E; ++I) {
				    for (const MachineInstr &MI : **I) {
				      addDefs(MI, Reachable);
				    }
				  }

				  // Find the intersection, and add implicit uses.
				  LiveOut &= Reachable;

				  bool Modified = false;
				  for (unsigned Reg : LiveOut) {
				    WWM.addOperand(
				        MachineOperand::CreateReg(Reg, /*isDef=*/false, /*isImp=*/true));
				    if (LIS) {
				      // The new use changes the register's live range, so rebuild it.
				      // FIXME: is there a better way to update the live interval?
				      LIS->removeInterval(Reg);
				      LIS->createAndComputeVirtRegInterval(Reg);
				    }
				    Modified = true;
				  }

				  return Modified;
				}

				/// Caches the per-function analysis/register-info pointers and runs
				/// runOnWWMInstruction on every EXIT_WWM instruction in \p MF.
				/// \returns true if any instruction received new operands.
				bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
				  bool Modified = false;

				  // This doesn't actually need LiveIntervals, but we can preserve them.
				  LIS = getAnalysisIfAvailable<LiveIntervals>();

				  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
				  const SIInstrInfo *TII = ST.getInstrInfo();

				  TRI = &TII->getRegisterInfo();
				  MRI = &MF.getRegInfo();

				  for (MachineBasicBlock &MBB : MF) {
				    for (MachineInstr &MI : MBB) {
				      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
				        Modified |= runOnWWMInstruction(MI);
				      }
				    }
				  }

				  return Modified;
				}

lib/Target/AMDGPU/SIISelLowering.cpp

Show First 20 Lines • Show All 3,289 Lines • ▼ Show 20 Lines	SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);		return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}		}
case Intrinsic::amdgcn_wqm: {		case Intrinsic::amdgcn_wqm: {
SDValue Src = Op.getOperand(1);		SDValue Src = Op.getOperand(1);
return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),		return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
0);		0);
}		}
		case Intrinsic::amdgcn_wwm: {
		SDValue Src = Op.getOperand(1);
		return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
		0);
		}
default:		default:
return Op;		return Op;
}		}
}		}

SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,		SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();		unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
▲ Show 20 Lines • Show All 2,443 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,139 Lines • ▼ Show 20 Lines	else
MIB.add(MI.getOperand(2));		MIB.add(MI.getOperand(2));

Bundler.append(MIB);		Bundler.append(MIB);
llvm::finalizeBundle(MBB, Bundler.begin());		llvm::finalizeBundle(MBB, Bundler.begin());

MI.eraseFromParent();		MI.eraseFromParent();
break;		break;
}		}
		case AMDGPU::EXIT_WWM: {
		// This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
		// is exited.
		MI.setDesc(get(AMDGPU::S_MOV_B64));
		break;
		}
}		}
return true;		return true;
}		}

bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,		bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,		MachineOperand &Src0,
unsigned Src0OpName,		unsigned Src0OpName,
MachineOperand &Src1,		MachineOperand &Src1,
▲ Show 20 Lines • Show All 1,493 Lines • ▼ Show 20 Lines
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {		unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
default: return AMDGPU::INSTRUCTION_LIST_END;		default: return AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;		case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
case AMDGPU::COPY: return AMDGPU::COPY;		case AMDGPU::COPY: return AMDGPU::COPY;
case AMDGPU::PHI: return AMDGPU::PHI;		case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;		case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;		case AMDGPU::WQM: return AMDGPU::WQM;
		case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32:		case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?		return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;		AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
case AMDGPU::S_ADD_I32:		case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;		case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;		case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
case AMDGPU::S_SUB_I32:		case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;		case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
▲ Show 20 Lines • Show All 1,289 Lines • ▼ Show 20 Lines	const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
// For target instructions, getOpRegClass just returns the virtual register		// For target instructions, getOpRegClass just returns the virtual register
// class associated with the operand, so we need to find an equivalent VGPR		// class associated with the operand, so we need to find an equivalent VGPR
// register class in order to move the instruction to the VALU.		// register class in order to move the instruction to the VALU.
case AMDGPU::COPY:		case AMDGPU::COPY:
case AMDGPU::PHI:		case AMDGPU::PHI:
case AMDGPU::REG_SEQUENCE:		case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:		case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:		case AMDGPU::WQM:
		case AMDGPU::WWM:
if (RI.hasVGPRs(NewDstRC))		if (RI.hasVGPRs(NewDstRC))
return nullptr;		return nullptr;

NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);		NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
if (!NewDstRC)		if (!NewDstRC)
return nullptr;		return nullptr;
return NewDstRC;		return NewDstRC;
default:		default:
▲ Show 20 Lines • Show All 390 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
let usesCustomInserter = 1;		let usesCustomInserter = 1;
}		}

// 64-bit vector move instruction. This is mainly used by the SIFoldOperands		// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.		// pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),		def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;		(ins VSrc_b64:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy		// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// after the WQM pass processes them.		// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;		def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

		// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
		// that the @earlyclobber is respected. The @earlyclobber is to make sure that
		// the instruction that defines $src0 (which is run in WWM) doesn't
		// accidentally clobber inactive channels of $vdst.
		let Constraints = "@earlyclobber $vdst" in {
		def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
		}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]		} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

		def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$exec), (ins SReg_64:$src0)> {
		let hasSideEffects = 0;
		let mayLoad = 0;
		let mayStore = 0;
		}

let usesCustomInserter = 1, SALU = 1 in {		let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),		def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;		[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1		} // End let usesCustomInserter = 1, SALU = 1

def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),		def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0)> {		(ins SSrc_b64:$src0)> {
let SALU = 1;		let SALU = 1;
▲ Show 20 Lines • Show All 1,180 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIWholeQuadMode.cpp

//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//		//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//		//
// The LLVM Compiler Infrastructure		// The LLVM Compiler Infrastructure
//		//
// This file is distributed under the University of Illinois Open Source		// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.		// License. See LICENSE.TXT for details.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
/// \file		/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel		/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.		/// shaders, and whole wavefront mode for all programs.
///		///
/// Whole quad mode is required for derivative computations, but it interferes		/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the		/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is		/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but		/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.		/// disabled around stores and atomics.
///		///
/// When necessary, this pass creates a function prolog		/// When necessary, this pass creates a function prolog
///		///
/// S_MOV_B64 LiveMask, EXEC		/// S_MOV_B64 LiveMask, EXEC
/// S_WQM_B64 EXEC, EXEC		/// S_WQM_B64 EXEC, EXEC
///		///
/// to enter WQM at the top of the function and surrounds blocks of Exact		/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by		/// instructions by
///		///
/// S_AND_SAVEEXEC_B64 Tmp, LiveMask		/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
/// ...		/// ...
/// S_MOV_B64 EXEC, Tmp		/// S_MOV_B64 EXEC, Tmp
///		///
		/// We also compute when a sequence of instructions requires Whole Wavefront
		/// Mode (WWM) and insert instructions to save and restore it:
		///
		/// S_OR_SAVEEXEC_B64 Tmp, -1
		/// ...
		/// S_MOV_B64 EXEC, Tmp
		///
/// In order to avoid excessive switching during sequences of Exact		/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM		/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative		/// (aka which instructions produce values that lead to derivative
/// computations).		/// computations).
///		///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.		/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///		///
/// There is room for improvement given better control flow analysis:		/// There is room for improvement given better control flow analysis:
Show All 9 Lines
///		///
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//

#include "AMDGPU.h"		#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"		#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"		#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"		#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"		#include "llvm/ADT/DenseMap.h"
		#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"		#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"		#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveInterval.h"		#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/LiveIntervalAnalysis.h"		#include "llvm/CodeGen/LiveIntervalAnalysis.h"
#include "llvm/CodeGen/MachineBasicBlock.h"		#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"		#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"		#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"		#include "llvm/CodeGen/MachineInstr.h"
Show All 14 Lines
using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "si-wqm"		#define DEBUG_TYPE "si-wqm"

namespace {		namespace {

enum {		enum {
StateWQM = 0x1,		StateWQM = 0x1,
StateExact = 0x2,		StateWWM = 0x2,
		StateExact = 0x4,
};		};

struct PrintState {		struct PrintState {
public:		public:
int State;		int State;

explicit PrintState(int State) : State(State) {}		explicit PrintState(int State) : State(State) {}
};		};

static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {		static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
if (PS.State & StateWQM)		if (PS.State & StateWQM)
OS << "WQM";		OS << "WQM";
if (PS.State & StateExact) {		if (PS.State & StateWWM) {
if (PS.State & StateWQM)		if (PS.State & StateWQM)
OS << '\|';		OS << '\|';
		OS << "WWM";
		}
		if (PS.State & StateExact) {
		if (PS.State & (StateWQM \| StateWWM))
		OS << '\|';
OS << "Exact";		OS << "Exact";
}		}

return OS;		return OS;
}		}

struct InstrInfo {		struct InstrInfo {
char Needs = 0;		char Needs = 0;
		char Disabled = 0;
char OutNeeds = 0;		char OutNeeds = 0;
};		};

struct BlockInfo {		struct BlockInfo {
char Needs = 0;		char Needs = 0;
char InNeeds = 0;		char InNeeds = 0;
char OutNeeds = 0;		char OutNeeds = 0;
};		};

struct WorkItem {		struct WorkItem {
MachineBasicBlock *MBB = nullptr;		MachineBasicBlock *MBB = nullptr;
MachineInstr *MI = nullptr;		MachineInstr *MI = nullptr;

WorkItem() = default;		WorkItem() = default;
WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}		WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem(MachineInstr *MI) : MI(MI) {}		WorkItem(MachineInstr *MI) : MI(MI) {}
};		};

class SIWholeQuadMode : public MachineFunctionPass {		class SIWholeQuadMode : public MachineFunctionPass {
private:		private:
		CallingConv::ID CallingConv;
const SIInstrInfo *TII;		const SIInstrInfo *TII;
const SIRegisterInfo *TRI;		const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;
LiveIntervals *LIS;		LiveIntervals *LIS;

DenseMap<const MachineInstr *, InstrInfo> Instructions;		DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;		DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;		SmallVector<MachineInstr *, 1> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;		SmallVector<MachineInstr *, 4> LowerToCopyInstrs;

void printInfo();		void printInfo();

void markInstruction(MachineInstr &MI, char Flag,		void markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist);		std::vector<WorkItem> &Worklist);
void markUsesWQM(const MachineInstr &MI, std::vector<WorkItem> &Worklist);		void markInstructionUses(const MachineInstr &MI, char Flag,
		std::vector<WorkItem> &Worklist);
char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);		char scanInstructions(MachineFunction &MF, std::vector<WorkItem> &Worklist);
void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);		void propagateInstruction(MachineInstr &MI, std::vector<WorkItem> &Worklist);
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);		void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);		char analyzeFunction(MachineFunction &MF);

bool requiresCorrectState(const MachineInstr &MI) const;		bool requiresCorrectState(const MachineInstr &MI) const;

MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,		MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before);		MachineBasicBlock::iterator Before);
MachineBasicBlock::iterator		MachineBasicBlock::iterator
prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,		prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
MachineBasicBlock::iterator Last, bool PreferLast,		MachineBasicBlock::iterator Last, bool PreferLast,
bool SaveSCC);		bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);		unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);		unsigned SavedWQM);
		void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
		unsigned SaveOrig);
		void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
		unsigned SavedOrig);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);		void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

void lowerLiveMaskQueries(unsigned LiveMaskReg);		void lowerLiveMaskQueries(unsigned LiveMaskReg);
void lowerCopyInstrs();		void lowerCopyInstrs();

public:		public:
static char ID;		static char ID;

▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	for (const auto &BII : Blocks) {
}		}
}		}
}		}

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,		void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];		InstrInfo &II = Instructions[&MI];

assert(Flag == StateWQM \|\| Flag == StateExact);		assert(!(Flag & StateExact) && Flag != 0);

// Ignore if the instruction is already marked. The typical case is that we		// Remove any disabled states from the flag. The user that required it gets
// mark an instruction WQM multiple times, but for atomics it can happen that		// an undefined value in the helper lanes. For example, this can happen if
// Flag is StateWQM, but Needs is already set to StateExact. In this case,		// the result of an atomic is used by an instruction that requires WQM, where
// letting the atomic run in StateExact is correct as per the relevant specs.		// ignoring the request for WQM is correct as per the relevant specs.
if (II.Needs)		Flag &= ~II.Disabled;

		// Ignore if the flag is already encompassed by the existing needs, or we
		// just disabled everything.
		if ((II.Needs & Flag) == Flag)
return;		return;

II.Needs = Flag;		II.Needs \|= Flag;
Worklist.push_back(&MI);		Worklist.push_back(&MI);
		nhaehnleUnsubmitted Done Reply Inline Actions I think you can mask out the Flag first. nhaehnle: I think you can mask out the Flag first.
}		}

/// Mark all instructions defining the uses in \p MI as WQM.		/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markUsesWQM(const MachineInstr &MI,		void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
for (const MachineOperand &Use : MI.uses()) {		for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() \|\| !Use.isUse())		if (!Use.isReg() \|\| !Use.isUse())
continue;		continue;

unsigned Reg = Use.getReg();		unsigned Reg = Use.getReg();

// Handle physical registers that we need to track; this is mostly relevant		// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,		// for VCC, which can appear as the (implicit) input of a uniform branch,
// e.g. when a loop counter is stored in a VGPR.		// e.g. when a loop counter is stored in a VGPR.
if (!TargetRegisterInfo::isVirtualRegister(Reg)) {		if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
if (Reg == AMDGPU::EXEC)		if (Reg == AMDGPU::EXEC)
continue;		continue;

for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {		for (MCRegUnitIterator RegUnit(Reg, TRI); RegUnit.isValid(); ++RegUnit) {
LiveRange &LR = LIS->getRegUnit(*RegUnit);		LiveRange &LR = LIS->getRegUnit(*RegUnit);
const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();		const VNInfo *Value = LR.Query(LIS->getInstructionIndex(MI)).valueIn();
if (!Value)		if (!Value)
continue;		continue;

// Since we're in machine SSA, we do not need to track physical		// Since we're in machine SSA, we do not need to track physical
// registers across basic blocks.		// registers across basic blocks.
if (Value->isPHIDef())		if (Value->isPHIDef())
continue;		continue;

markInstruction(*LIS->getInstructionFromIndex(Value->def), StateWQM,		markInstruction(*LIS->getInstructionFromIndex(Value->def), Flag,
Worklist);		Worklist);
}		}

continue;		continue;
}		}

for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))		for (MachineInstr &DefMI : MRI->def_instructions(Use.getReg()))
markInstruction(DefMI, StateWQM, Worklist);		markInstruction(DefMI, Flag, Worklist);
}		}
}		}

// Scan instructions to determine which ones require an Exact execmask and		// Scan instructions to determine which ones require an Exact execmask and
// which ones seed WQM requirements.		// which ones seed WQM requirements.
char SIWholeQuadMode::scanInstructions(MachineFunction &MF,		char SIWholeQuadMode::scanInstructions(MachineFunction &MF,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
char GlobalFlags = 0;		char GlobalFlags = 0;
bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");		bool WQMOutputs = MF.getFunction()->hasFnAttribute("amdgpu-ps-wqm-outputs");

for (auto BI = MF.begin(), BE = MF.end(); BI != BE; ++BI) {		// We need to visit the basic blocks in reverse post-order so that we visit
MachineBasicBlock &MBB = *BI;		// defs before uses, in particular so that we don't accidentally mark an
		// instruction as needing e.g. WQM before visiting it and realizing it needs
		// WQM disabled.
		ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
		for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
		MachineBasicBlock &MBB = **BI;
		BlockInfo &BBI = Blocks[&MBB];

for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {		for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
MachineInstr &MI = *II;		MachineInstr &MI = *II;
		InstrInfo &III = Instructions[&MI];
unsigned Opcode = MI.getOpcode();		unsigned Opcode = MI.getOpcode();
char Flags = 0;		char Flags = 0;

if (TII->isDS(Opcode)) {		if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
Flags = StateWQM;		Flags = StateWQM;
} else if (TII->isWQM(Opcode)) {		} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels		// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been		// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.		// computed for derivatives.
markUsesWQM(MI, Worklist);		markInstructionUses(MI, StateWQM, Worklist);
GlobalFlags \|= StateWQM;		GlobalFlags \|= StateWQM;
continue;		continue;
} else if (Opcode == AMDGPU::WQM) {		} else if (Opcode == AMDGPU::WQM) {
// The WQM intrinsic requires its output to have all the helper lanes		// The WQM intrinsic requires its output to have all the helper lanes
// correct, so we need it to be in WQM.		// correct, so we need it to be in WQM.
Flags = StateWQM;		Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);		LowerToCopyInstrs.push_back(&MI);
		} else if (Opcode == AMDGPU::WWM) {
		// The WWM intrinsic doesn't make the same guarantee, and it also needs
		// to be executed in WQM or Exact so that its copy doesn't clobber
		// inactive lanes.
		markInstructionUses(MI, StateWWM, Worklist);
		GlobalFlags \|= StateWWM;
		LowerToCopyInstrs.push_back(&MI);
		continue;
} else if (TII->isDisableWQM(MI)) {		} else if (TII->isDisableWQM(MI)) {
Flags = StateExact;		BBI.Needs \|= StateExact;
		if (!(BBI.InNeeds & StateExact)) {
		BBI.InNeeds \|= StateExact;
		Worklist.push_back(&MBB);
		}
		GlobalFlags \|= StateExact;
		III.Disabled = StateWQM \| StateWWM;
		continue;
} else {		} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {		if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);		LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {		} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical		// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are		// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.		// only used, outputs are only defined.
for (const MachineOperand &MO : MI.defs()) {		for (const MachineOperand &MO : MI.defs()) {
Show All 25 Lines
void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,		void SIWholeQuadMode::propagateInstruction(MachineInstr &MI,
std::vector<WorkItem>& Worklist) {		std::vector<WorkItem>& Worklist) {
MachineBasicBlock *MBB = MI.getParent();		MachineBasicBlock *MBB = MI.getParent();
InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references		InstrInfo II = Instructions[&MI]; // take a copy to prevent dangling references
BlockInfo &BI = Blocks[MBB];		BlockInfo &BI = Blocks[MBB];

// Control flow-type instructions and stores to temporary memory that are		// Control flow-type instructions and stores to temporary memory that are
// followed by WQM computations must themselves be in WQM.		// followed by WQM computations must themselves be in WQM.
if ((II.OutNeeds & StateWQM) && !II.Needs &&		if ((II.OutNeeds & StateWQM) && !(II.Disabled & StateWQM) &&
(MI.isTerminator() \|\| (TII->usesVM_CNT(MI) && MI.mayStore()))) {		(MI.isTerminator() \|\| (TII->usesVM_CNT(MI) && MI.mayStore()))) {
Instructions[&MI].Needs = StateWQM;		Instructions[&MI].Needs = StateWQM;
II.Needs = StateWQM;		II.Needs = StateWQM;
}		}

// Propagate to block level		// Propagate to block level
BI.Needs \|= II.Needs;		if (II.Needs & StateWQM) {
if ((BI.InNeeds \| II.Needs) != BI.InNeeds) {		BI.Needs \|= StateWQM;
BI.InNeeds \|= II.Needs;		if (!(BI.InNeeds & StateWQM)) {
		BI.InNeeds \|= StateWQM;
Worklist.push_back(MBB);		Worklist.push_back(MBB);
}		}
		}

// Propagate backwards within block		// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {		if (MachineInstr *PrevMI = MI.getPrevNode()) {
char InNeeds = II.Needs \| II.OutNeeds;		char InNeeds = (II.Needs & ~StateWWM) \| II.OutNeeds;
if (!PrevMI->isPHI()) {		if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];		InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds \| InNeeds) != PrevII.OutNeeds) {		if ((PrevII.OutNeeds \| InNeeds) != PrevII.OutNeeds) {
PrevII.OutNeeds \|= InNeeds;		PrevII.OutNeeds \|= InNeeds;
Worklist.push_back(PrevMI);		Worklist.push_back(PrevMI);
}		}
}		}
}		}

// Propagate WQM flag to instruction inputs		// Propagate WQM flag to instruction inputs
assert(II.Needs != (StateWQM \| StateExact));		assert(!(II.Needs & StateExact));

if (II.Needs == StateWQM)		if (II.Needs != 0)
markUsesWQM(MI, Worklist);		markInstructionUses(MI, II.Needs, Worklist);
}		}

void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,		void SIWholeQuadMode::propagateBlock(MachineBasicBlock &MBB,
std::vector<WorkItem>& Worklist) {		std::vector<WorkItem>& Worklist) {
BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.		BlockInfo BI = Blocks[&MBB]; // Make a copy to prevent dangling references.

// Propagate through instructions		// Propagate through instructions
if (!MBB.empty()) {		if (!MBB.empty()) {
▲ Show 20 Lines • Show All 175 Lines • ▼ Show 20 Lines	if (SavedWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);
}		}

LIS->InsertMachineInstrInMaps(*MI);		LIS->InsertMachineInstrInMaps(*MI);
}		}

		void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
		MachineBasicBlock::iterator Before,
		unsigned SaveOrig)
		{
		MachineInstr *MI;

		assert(SaveOrig);
		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
		SaveOrig)
		.addImm(-1);
		LIS->InsertMachineInstrInMaps(*MI);
		}

		void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
		MachineBasicBlock::iterator Before,
		unsigned SavedOrig)
		{
		MachineInstr *MI;

		assert(SavedOrig);
		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
		.addReg(SavedOrig);
		LIS->InsertMachineInstrInMaps(*MI);
		}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,		void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {		bool isEntry) {
auto BII = Blocks.find(&MBB);		auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())		if (BII == Blocks.end())
return;		return;

const BlockInfo &BI = BII->second;		const BlockInfo &BI = BII->second;

if (!(BI.InNeeds & StateWQM))
return;

// This is a non-entry block that is WQM throughout, so no need to do		// This is a non-entry block that is WQM throughout, so no need to do
// anything.		// anything.
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)		if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;		return;

DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");		DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

unsigned SavedWQMReg = 0;		unsigned SavedWQMReg = 0;
		unsigned SavedNonWWMReg = 0;
bool WQMFromExec = isEntry;		bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;		char State = (isEntry \|\| !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
		char NonWWMState = 0;

auto II = MBB.getFirstNonPHI(), IE = MBB.end();		auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)		if (isEntry)
++II; // Skip the instruction that saves LiveMask		++II; // Skip the instruction that saves LiveMask

MachineBasicBlock::iterator First = IE;		// This stores the first instruction where it's safe to switch from WQM to
		// Exact or vice versa.
		MachineBasicBlock::iterator FirstWQM = IE;

		// This stores the first instruction where it's safe to switch from WWM to
		// Exact/WQM or to switch to WWM. It must always be the same as, or after,
		// FirstWQM since if it's safe to switch to/from WWM, it must be safe to
		// switch to/from WQM as well.
		MachineBasicBlock::iterator FirstWWM = IE;
for (;;) {		for (;;) {
MachineBasicBlock::iterator Next = II;		MachineBasicBlock::iterator Next = II;
char Needs = 0;		char Needs = StateExact \| StateWQM; // WWM is disabled by default
char OutNeeds = 0;		char OutNeeds = 0;

if (First == IE)		if (FirstWQM == IE)
First = II;		FirstWQM = II;

		if (FirstWWM == IE)
		FirstWWM = II;

		// First, figure out the allowed states (Needs) based on the propagated
		// flags.
if (II != IE) {		if (II != IE) {
MachineInstr &MI = *II;		MachineInstr &MI = *II;

if (requiresCorrectState(MI)) {		if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);		auto III = Instructions.find(&MI);
if (III != Instructions.end()) {		if (III != Instructions.end()) {
Needs = III->second.Needs;		if (III->second.Needs & StateWWM)
		Needs = StateWWM;
		else if (III->second.Needs & StateWQM)
		Needs = StateWQM;
		else
		Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;		OutNeeds = III->second.OutNeeds;
}		}
		} else {
		// If the instruction doesn't actually need a correct EXEC, then we can
		// safely leave WWM enabled.
		Needs = StateExact \| StateWQM \| StateWWM;
}		}

if (MI.isTerminator() && !Needs && OutNeeds == StateExact)		if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;		Needs = StateExact;

if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)		if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
MI.getOperand(3).setImm(1);		MI.getOperand(3).setImm(1);

++Next;		++Next;
} else {		} else {
// End of basic block		// End of basic block
if (BI.OutNeeds & StateWQM)		if (BI.OutNeeds & StateWQM)
Needs = StateWQM;		Needs = StateWQM;
else if (BI.OutNeeds == StateExact)		else if (BI.OutNeeds == StateExact)
Needs = StateExact;		Needs = StateExact;
		else
		Needs = StateWQM \| StateExact;
		}

		// Now, transition if necessary.
		if (!(Needs & State)) {
		MachineBasicBlock::iterator First;
		if (State == StateWWM \|\| Needs == StateWWM) {
		// We must switch to or from WWM
		First = FirstWWM;
		} else {
		// We only need to switch to/from WQM, so we can use FirstWQM
		First = FirstWQM;
}		}

if (Needs) {
if (Needs != State) {
MachineBasicBlock::iterator Before =		MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,		prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact \|\| WQMFromExec);		Needs == StateExact \|\| WQMFromExec);

if (Needs == StateExact) {		if (State == StateWWM) {
		assert(SavedNonWWMReg);
		fromWWM(MBB, Before, SavedNonWWMReg);
		State = NonWWMState;
		}

		if (Needs == StateWWM) {
		NonWWMState = State;
		SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
		toWWM(MBB, Before, SavedNonWWMReg);
		State = StateWWM;
		} else {
		if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM))		if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

toExact(MBB, Before, SavedWQMReg, LiveMaskReg);		toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
} else {		State = StateExact;
		} else if (State == StateExact && (Needs & StateWQM) &&
		!(Needs & StateExact)) {
assert(WQMFromExec == (SavedWQMReg == 0));		assert(WQMFromExec == (SavedWQMReg == 0));

toWQM(MBB, Before, SavedWQMReg);		toWQM(MBB, Before, SavedWQMReg);

if (SavedWQMReg) {		if (SavedWQMReg) {
LIS->createAndComputeVirtRegInterval(SavedWQMReg);		LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;		SavedWQMReg = 0;
}		}
		State = StateWQM;
		} else {
		// We can get here if we transitioned from WWM to a non-WWM state that
		// already matches our needs, but we shouldn't need to do anything.
		assert(Needs & State);
		}
}		}

State = Needs;
}		}

First = IE;		if (Needs != (StateExact \| StateWQM \| StateWWM)) {
		if (Needs != (StateExact \| StateWQM))
		FirstWQM = IE;
		FirstWWM = IE;
}		}

if (II == IE)		if (II == IE)
break;		break;
II = Next;		II = Next;
}		}
}		}

Show All 11 Lines
}		}

void SIWholeQuadMode::lowerCopyInstrs() {		void SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs)		for (MachineInstr *MI : LowerToCopyInstrs)
MI->setDesc(TII->get(AMDGPU::COPY));		MI->setDesc(TII->get(AMDGPU::COPY));
}		}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {		bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
return false;

Instructions.clear();		Instructions.clear();
Blocks.clear();		Blocks.clear();
LiveMaskQueries.clear();		LiveMaskQueries.clear();
LowerToCopyInstrs.clear();		LowerToCopyInstrs.clear();
		CallingConv = MF.getFunction()->getCallingConv();

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

TII = ST.getInstrInfo();		TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();		LIS = &getAnalysis<LiveIntervals>();

char GlobalFlags = analyzeFunction(MF);		char GlobalFlags = analyzeFunction(MF);
		unsigned LiveMaskReg = 0;
if (!(GlobalFlags & StateWQM)) {		if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(AMDGPU::EXEC);		lowerLiveMaskQueries(AMDGPU::EXEC);
		if (!(GlobalFlags & StateWWM))
return !LiveMaskQueries.empty();		return !LiveMaskQueries.empty();
}		} else {

// Store a copy of the original live mask when required		// Store a copy of the original live mask when required
unsigned LiveMaskReg = 0;
{
MachineBasicBlock &Entry = MF.front();		MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();		MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

if (GlobalFlags & StateExact \|\| !LiveMaskQueries.empty()) {		if (GlobalFlags & StateExact \|\| !LiveMaskQueries.empty()) {
LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),		MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
TII->get(AMDGPU::COPY), LiveMaskReg)		TII->get(AMDGPU::COPY), LiveMaskReg)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);
LIS->InsertMachineInstrInMaps(*MI);		LIS->InsertMachineInstrInMaps(*MI);
}		}

		lowerLiveMaskQueries(LiveMaskReg);

if (GlobalFlags == StateWQM) {		if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.		// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),		BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);

lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();		lowerCopyInstrs();
// EntryMI may become invalid here		// EntryMI may become invalid here
return true;		return true;
}		}
}		}

DEBUG(printInfo());		DEBUG(printInfo());

lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();		lowerCopyInstrs();

// Handle the general case		// Handle the general case
for (auto BII : Blocks)		for (auto BII : Blocks)
processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());		processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());

// Physical registers like SCC aren't tracked by default anyway, so just		// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining		// removing the ranges we computed is the simplest option for maintaining
// the analysis results.		// the analysis results.
LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));		LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

return true;		return true;
}		}

test/CodeGen/AMDGPU/fix-wwm-liveness.mir

This file was added.

				# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s \| FileCheck %s
				#CHECK: %exec = EXIT_WWM killed %19, implicit %21

				---
				name: test_wwm_liveness
				alignment: 0
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				tracksRegLiveness: true
				registers:
				- { id: 0, class: sreg_64, preferred-register: '' }
				- { id: 1, class: sgpr_32, preferred-register: '' }
				- { id: 2, class: sgpr_32, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: vgpr_32, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64, preferred-register: '%vcc' }
				- { id: 9, class: sreg_64, preferred-register: '' }
				- { id: 10, class: sreg_32_xm0, preferred-register: '' }
				- { id: 11, class: sreg_64, preferred-register: '' }
				- { id: 12, class: sreg_32_xm0, preferred-register: '' }
				- { id: 13, class: sreg_32_xm0, preferred-register: '' }
				- { id: 14, class: sreg_32_xm0, preferred-register: '' }
				- { id: 15, class: sreg_128, preferred-register: '' }
				- { id: 16, class: vgpr_32, preferred-register: '' }
				- { id: 17, class: vgpr_32, preferred-register: '' }
				- { id: 18, class: vgpr_32, preferred-register: '' }
				- { id: 19, class: sreg_64, preferred-register: '' }
				- { id: 20, class: sreg_64, preferred-register: '' }
				- { id: 21, class: vgpr_32, preferred-register: '' }
				- { id: 22, class: sreg_64, preferred-register: '' }
				- { id: 23, class: sreg_64, preferred-register: '' }
				liveins:
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 0
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				maxCallFrameSize: 4294967295
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				savePoint: ''
				restorePoint: ''
				fixedStack:
				stack:
				constants:
				body: \|
				bb.0:
				successors: %bb.1(0x40000000), %bb.2(0x40000000)

				%21 = V_MOV_B32_e32 0, implicit %exec
				%5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit %exec
				%6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit %exec
				%8 = V_CMP_GT_U32_e64 32, killed %6, implicit %exec
				%22 = COPY %exec, implicit-def %exec
				%23 = S_AND_B64 %22, %8, implicit-def dead %scc
				%0 = S_XOR_B64 %23, %22, implicit-def dead %scc
				%exec = S_MOV_B64_term killed %23
				SI_MASK_BRANCH %bb.2, implicit %exec
				S_BRANCH %bb.1

				bb.1:
				successors: %bb.2(0x80000000)

				%13 = S_MOV_B32 61440
				%14 = S_MOV_B32 -1
				%15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
				%19 = COPY %exec
				%exec = S_MOV_B64 -1
				%16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
				%17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
				%exec = EXIT_WWM killed %19
				%21 = V_MOV_B32_e32 1, implicit %exec
				early-clobber %18 = WWM killed %17, implicit %exec
				BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit %exec :: (store 4)

				bb.2:
				%exec = S_OR_B64 %exec, killed %0, implicit-def %scc
				%vgpr0 = COPY killed %21
				SI_RETURN_TO_EPILOG killed %vgpr0

				...

test/CodeGen/AMDGPU/wqm.ll

Show First 20 Lines • Show All 102 Lines • ▼ Show 20 Lines	main_body:
%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)		%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
%out = fadd float %src0, %src1		%out = fadd float %src0, %src1
%out.0 = bitcast float %out to i32		%out.0 = bitcast float %out to i32
%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)		%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
%out.2 = bitcast i32 %out.1 to float		%out.2 = bitcast i32 %out.1 to float
ret float %out.2		ret float %out.2
}		}

		; Check that WWM is triggered by the wwm intrinsic.
		;
		;CHECK-LABEL: {{^}}test_wwm1:
		;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
		;CHECK: buffer_load_dword
		;CHECK: buffer_load_dword
		;CHECK: v_add_f32_e32
		define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
		main_body:
		%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
		%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
		%out = fadd float %src0, %src1
		%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
		ret float %out.0
		}

		; Check that we don't leave WWM on for computations that don't require WWM,
		; since that will lead to clobbering things that aren't supposed to be clobbered
		; in cases like this.
		;
		;CHECK-LABEL: {{^}}test_wwm2:
		;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
		;CHECK: buffer_load_dword
		;CHECK: v_add_f32_e32
		;CHECK: s_mov_b64 exec, [[ORIG]]
		;CHECK: v_add_f32_e32
		define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
		main_body:
		; use mbcnt to make sure the branch is divergent
		%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
		%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
		%cc = icmp uge i32 %hi, 32
		br i1 %cc, label %endif, label %if

		if:
		%src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
		%out = fadd float %src, %src
		%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
		%out.1 = fadd float %src, %out.0
		br label %endif

		endif:
		%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
		ret float %out.2
		}

		; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
		; write could clobber disabled channels in the non-WWM one.
		;
		;CHECK-LABEL: {{^}}test_wwm3:
		;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
		;CHECK: buffer_load_dword
		;CHECK: v_add_f32_e32
		;CHECK: s_mov_b64 exec, [[ORIG]]
		;CHECK-NEXT: v_mov_b32_e32
		define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
		main_body:
		; use mbcnt to make sure the branch is divergent
		%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
		%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
		%cc = icmp uge i32 %hi, 32
		br i1 %cc, label %endif, label %if

		if:
		%src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
		%out = fadd float %src, %src
		%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
		br label %endif

		endif:
		%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
		ret float %out.1
		}

		; Make sure the transition from Exact to WWM then WQM works properly.
		;
		;CHECK-LABEL: {{^}}test_wwm4:
		;CHECK: buffer_load_dword
		;CHECK: buffer_store_dword
		;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
		;CHECK: buffer_load_dword
		;CHECK: v_add_f32_e32
		;CHECK: s_mov_b64 exec, [[ORIG]]
		;CHECK: s_wqm_b64 exec, exec
		define amdgpu_ps float @test_wwm4(i32 inreg %idx0, i32 inreg %idx1) {
		main_body:
		%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
		call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
		%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
		%temp = fadd float %src1, %src1
		%temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
		%out = fadd float %temp.0, %temp.0
		%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
		ret float %out.0
		}

		; Check that WWM is turned on correctly across basic block boundaries.
		;
		; %src0 is loaded in main_body and %src1 in the conditionally executed %if
		; block; their sum is the operand of llvm.amdgcn.wwm. The directives below
		; expect one s_or_saveexec_b64 / s_mov_b64 exec pair before the %if block
		; label and a second, independent pair after it.
		;
		; NOTE(review): the SI-CHECK / VI-CHECK lines are only honored if FileCheck
		; is invoked with exactly those prefixes; if the RUN lines (not visible in
		; this hunk) use the prefixes CHECK, SI and VI, these lines are silently
		; ignored and the load opcodes go unchecked. Confirm against the RUN lines.
		;
		;CHECK-LABEL: {{^}}test_wwm5:
		;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
		;SI-CHECK: buffer_load_dword
		;VI-CHECK: flat_load_dword
		;CHECK: s_mov_b64 exec, [[ORIG]]
		;CHECK: %if
		;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
		;SI-CHECK: buffer_load_dword
		;VI-CHECK: flat_load_dword
		;CHECK: v_add_f32_e32
		;CHECK: s_mov_b64 exec, [[ORIG2]]
		define amdgpu_ps float @test_wwm5() {
		main_body:
		; First volatile load; its value is consumed by the fadd in the %if block.
		%src0 = load volatile float, float addrspace(1)* undef
		; use mbcnt to make sure the branch is divergent
		%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
		%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
		; Skip the %if block when the mbcnt result is 32 or greater.
		%cc = icmp uge i32 %hi, 32
		br i1 %cc, label %endif, label %if

		if:
		; Second volatile load; the sum of both loads goes through the wwm intrinsic.
		%src1 = load volatile float, float addrspace(1)* undef
		%out = fadd float %src0, %src1
		%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
		br label %endif

		endif:
		; Merge the wwm result from %if with 0.0 on the path that skipped it.
		%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
		ret float %out.1
		}

; Check a case of one branch of an if-else requiring WQM, the other requiring		; Check a case of one branch of an if-else requiring WQM, the other requiring
; exact.		; exact.
;		;
; Note: In this particular case, the save-and-restore could be avoided if the		; Note: In this particular case, the save-and-restore could be avoided if the
; analysis understood that the two branches of the if-else are mutually		; analysis understood that the two branches of the if-else are mutually
; exclusive.		; exclusive.
;		;
;CHECK-LABEL: {{^}}test_control_flow_0:		;CHECK-LABEL: {{^}}test_control_flow_0:
▲ Show 20 Lines • Show All 406 Lines • ▼ Show 20 Lines
declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3		declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3
declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3		declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3		declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3		declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3		declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
declare void @llvm.AMDGPU.kill(float) #1		declare void @llvm.AMDGPU.kill(float) #1
declare float @llvm.amdgcn.wqm.f32(float) #3		declare float @llvm.amdgcn.wqm.f32(float) #3
declare i32 @llvm.amdgcn.wqm.i32(i32) #3		declare i32 @llvm.amdgcn.wqm.i32(i32) #3
		declare float @llvm.amdgcn.wwm.f32(float) #3
		declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
		declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3

attributes #1 = { nounwind }		attributes #1 = { nounwind }
attributes #2 = { nounwind readonly }		attributes #2 = { nounwind readonly }
attributes #3 = { nounwind readnone }		attributes #3 = { nounwind readnone }
attributes #4 = { "amdgpu-ps-wqm-outputs" }		attributes #4 = { "amdgpu-ps-wqm-outputs" }

test/CodeGen/AMDGPU/wqm.mir

This file was added.

				# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s

				---
				# Check for awareness that s_or_saveexec_b64 clobbers SCC
				#
				# The body defines SCC with S_CMP_LT_I32 and reads it in S_CSELECT_B32; the
				# selected value flows, via V_ADD_I32_e32, into a WWM pseudo. The directives
				# below require S_OR_SAVEEXEC_B64 to be emitted before the compare, i.e. not
				# between the instruction that defines SCC and the one that reads it.
				#
				#CHECK: S_OR_SAVEEXEC_B64
				#CHECK: S_CMP_LT_I32
				#CHECK: S_CSELECT_B32
				name: test_wwm_scc
				alignment: 0
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				tracksRegLiveness: true
				registers:
				  - { id: 0, class: sgpr_32, preferred-register: '' }
				  - { id: 1, class: sgpr_32, preferred-register: '' }
				  - { id: 2, class: sgpr_32, preferred-register: '' }
				  - { id: 3, class: vgpr_32, preferred-register: '' }
				  - { id: 4, class: vgpr_32, preferred-register: '' }
				  - { id: 5, class: sgpr_32, preferred-register: '' }
				  - { id: 6, class: vgpr_32, preferred-register: '' }
				  - { id: 7, class: vgpr_32, preferred-register: '' }
				  - { id: 8, class: sreg_32_xm0, preferred-register: '' }
				  - { id: 9, class: sreg_32, preferred-register: '' }
				  - { id: 10, class: sreg_32, preferred-register: '' }
				  - { id: 11, class: vgpr_32, preferred-register: '' }
				  - { id: 12, class: vgpr_32, preferred-register: '' }
				liveins:
				  - { reg: '%sgpr0', virtual-reg: '%0' }
				  - { reg: '%sgpr1', virtual-reg: '%1' }
				  - { reg: '%sgpr2', virtual-reg: '%2' }
				  - { reg: '%vgpr0', virtual-reg: '%3' }
				frameInfo:
				  isFrameAddressTaken: false
				  isReturnAddressTaken: false
				  hasStackMap: false
				  hasPatchPoint: false
				  stackSize: 0
				  offsetAdjustment: 0
				  maxAlignment: 0
				  adjustsStack: false
				  hasCalls: false
				  stackProtector: ''
				  maxCallFrameSize: 4294967295
				  hasOpaqueSPAdjustment: false
				  hasVAStart: false
				  hasMustTailInVarArgFunc: false
				  savePoint: ''
				  restorePoint: ''
				fixedStack:
				stack:
				constants:
				body: |
				  bb.0:
				    liveins: %sgpr0, %sgpr1, %sgpr2, %vgpr0

				    %3 = COPY %vgpr0
				    %2 = COPY %sgpr2
				    %1 = COPY %sgpr1
				    %0 = COPY %sgpr0
				    S_CMP_LT_I32 0, %0, implicit-def %scc
				    %12 = V_ADD_I32_e32 %3, %3, implicit-def %vcc, implicit %exec
				    %5 = S_CSELECT_B32 %2, %1, implicit %scc
				    %11 = V_ADD_I32_e32 %5, %12, implicit-def %vcc, implicit %exec
				    %vgpr0 = WWM %11, implicit %exec
				    SI_RETURN_TO_EPILOG %vgpr0

				...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] refactor WQM pass in preparation for WWM (NFCI)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 108720

include/llvm/IR/IntrinsicsAMDGPU.td

lib/Target/AMDGPU/AMDGPU.h

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

lib/Target/AMDGPU/CMakeLists.txt

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

lib/Target/AMDGPU/SIFixWWMLiveness.cpp

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIInstructions.td

lib/Target/AMDGPU/SIWholeQuadMode.cpp

test/CodeGen/AMDGPU/fix-wwm-liveness.mir

test/CodeGen/AMDGPU/wqm.ll

test/CodeGen/AMDGPU/wqm.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] refactor WQM pass in preparation for WWM (NFCI)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 108720

include/llvm/IR/IntrinsicsAMDGPU.td

lib/Target/AMDGPU/AMDGPU.h

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

lib/Target/AMDGPU/CMakeLists.txt

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

lib/Target/AMDGPU/SIFixWWMLiveness.cpp

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIInstructions.td

lib/Target/AMDGPU/SIWholeQuadMode.cpp

test/CodeGen/AMDGPU/fix-wwm-liveness.mir

test/CodeGen/AMDGPU/wqm.ll

test/CodeGen/AMDGPU/wqm.mir

[AMDGPU] refactor WQM pass in preparation for WWM (NFCI)
ClosedPublic