Diff 108735

include/llvm/IR/IntrinsicsAMDGPU.td

	Show First 20 Lines • Show All 741 Lines • ▼ Show 20 Lines


	// Copies the source value to the destination value, with the guarantee that			// Copies the source value to the destination value, with the guarantee that
	// the source value is computed as if the entire program were executed in WQM.			// the source value is computed as if the entire program were executed in WQM.
	def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],			def int_amdgcn_wqm : Intrinsic<[llvm_any_ty],
	[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]			[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
	>;			>;

				// Copies the active channels of the source value to the destination value,
				// with the guarantee that the source value is computed as if the entire
				// program were executed in Whole Wavefront Mode, i.e. with all channels
				// enabled, with a few exceptions:
				//   - Phi nodes which require WWM return an undefined value.
				def int_amdgcn_wwm : Intrinsic<[llvm_any_ty],
				[LLVMMatchType<0>], [IntrNoMem, IntrSpeculatable]
				>;

	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//
	// CI+ Intrinsics			// CI+ Intrinsics
	//===----------------------------------------------------------------------===//			//===----------------------------------------------------------------------===//

	def int_amdgcn_s_dcache_inv_vol :			def int_amdgcn_s_dcache_inv_vol :
	GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,			GCCBuiltin<"__builtin_amdgcn_s_dcache_inv_vol">,
	Intrinsic<[], [], []>;			Intrinsic<[], [], []>;

	▲ Show 20 Lines • Show All 77 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPU.h

	Show First 20 Lines • Show All 42 Lines • ▼ Show 20 Lines
	FunctionPass *createSIShrinkInstructionsPass();			FunctionPass *createSIShrinkInstructionsPass();
	FunctionPass *createSILoadStoreOptimizerPass();			FunctionPass *createSILoadStoreOptimizerPass();
	FunctionPass *createSIWholeQuadModePass();			FunctionPass *createSIWholeQuadModePass();
	FunctionPass *createSIFixControlFlowLiveIntervalsPass();			FunctionPass *createSIFixControlFlowLiveIntervalsPass();
	FunctionPass *createSIFixSGPRCopiesPass();			FunctionPass *createSIFixSGPRCopiesPass();
	FunctionPass *createSIDebuggerInsertNopsPass();			FunctionPass *createSIDebuggerInsertNopsPass();
	FunctionPass *createSIInsertWaitsPass();			FunctionPass *createSIInsertWaitsPass();
	FunctionPass *createSIInsertWaitcntsPass();			FunctionPass *createSIInsertWaitcntsPass();
				FunctionPass *createSIFixWWMLivenessPass();
	FunctionPass *createAMDGPUCodeGenPreparePass();			FunctionPass *createAMDGPUCodeGenPreparePass();
	FunctionPass *createAMDGPUMachineCFGStructurizerPass();			FunctionPass *createAMDGPUMachineCFGStructurizerPass();

	void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);			void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&);
	extern char &AMDGPUMachineCFGStructurizerID;			extern char &AMDGPUMachineCFGStructurizerID;

	void initializeAMDGPUAlwaysInlinePass(PassRegistry&);			void initializeAMDGPUAlwaysInlinePass(PassRegistry&);

	Show All 33 Lines
	extern char &SILowerControlFlowID;			extern char &SILowerControlFlowID;

	void initializeSIInsertSkipsPass(PassRegistry &);			void initializeSIInsertSkipsPass(PassRegistry &);
	extern char &SIInsertSkipsPassID;			extern char &SIInsertSkipsPassID;

	void initializeSIOptimizeExecMaskingPass(PassRegistry &);			void initializeSIOptimizeExecMaskingPass(PassRegistry &);
	extern char &SIOptimizeExecMaskingID;			extern char &SIOptimizeExecMaskingID;

				void initializeSIFixWWMLivenessPass(PassRegistry &);
				extern char &SIFixWWMLivenessID;

	// Passes common to R600 and SI			// Passes common to R600 and SI
	FunctionPass *createAMDGPUPromoteAlloca();			FunctionPass *createAMDGPUPromoteAlloca();
	void initializeAMDGPUPromoteAllocaPass(PassRegistry&);			void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
	extern char &AMDGPUPromoteAllocaID;			extern char &AMDGPUPromoteAllocaID;

	Pass *createAMDGPUStructurizeCFGPass();			Pass *createAMDGPUStructurizeCFGPass();
	FunctionPass *createAMDGPUISelDag(TargetMachine &TM,			FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
	CodeGenOpt::Level OptLevel);			CodeGenOpt::Level OptLevel);
	▲ Show 20 Lines • Show All 106 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

Show First 20 Lines • Show All 145 Lines • ▼ Show 20 Lines	extern "C" void LLVMInitializeAMDGPUTarget() {
initializeSIAnnotateControlFlowPass(*PR);		initializeSIAnnotateControlFlowPass(*PR);
initializeSIInsertWaitsPass(*PR);		initializeSIInsertWaitsPass(*PR);
initializeSIInsertWaitcntsPass(*PR);		initializeSIInsertWaitcntsPass(*PR);
initializeSIWholeQuadModePass(*PR);		initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);		initializeSILowerControlFlowPass(*PR);
initializeSIInsertSkipsPass(*PR);		initializeSIInsertSkipsPass(*PR);
initializeSIDebuggerInsertNopsPass(*PR);		initializeSIDebuggerInsertNopsPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);		initializeSIOptimizeExecMaskingPass(*PR);
		initializeSIFixWWMLivenessPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);		initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);		initializeAMDGPUAAWrapperPassPass(*PR);
}		}

static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {		static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return llvm::make_unique<AMDGPUTargetObjectFile>();		return llvm::make_unique<AMDGPUTargetObjectFile>();
}		}

▲ Show 20 Lines • Show All 595 Lines • ▼ Show 20 Lines	void GCNPassConfig::addFastRegAlloc(FunctionPass *RegAllocPass) {
// FIXME: We have to disable the verifier here because of PHIElimination +		// FIXME: We have to disable the verifier here because of PHIElimination +
// TwoAddressInstructions disabling it.		// TwoAddressInstructions disabling it.

// This must be run immediately after phi elimination and before		// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of		// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.		// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);		insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

		// This must be run after SILowerControlFlow, since it needs to use the
		// machine-level CFG, but before register allocation.
		insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

TargetPassConfig::addFastRegAlloc(RegAllocPass);		TargetPassConfig::addFastRegAlloc(RegAllocPass);
}		}

void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {		void GCNPassConfig::addOptimizedRegAlloc(FunctionPass *RegAllocPass) {
// This needs to be run directly before register allocation because earlier		// This needs to be run directly before register allocation because earlier
// passes might recompute live intervals.		// passes might recompute live intervals.
insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);		insertPass(&MachineSchedulerID, &SIFixControlFlowLiveIntervalsID);

// This must be run immediately after phi elimination and before		// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of		// TwoAddressInstructions, otherwise the processing of the tied operand of
// SI_ELSE will introduce a copy of the tied operand source after the else.		// SI_ELSE will introduce a copy of the tied operand source after the else.
insertPass(&PHIEliminationID, &SILowerControlFlowID, false);		insertPass(&PHIEliminationID, &SILowerControlFlowID, false);

		// This must be run after SILowerControlFlow, since it needs to use the
		// machine-level CFG, but before register allocation.
		insertPass(&SILowerControlFlowID, &SIFixWWMLivenessID, false);

TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);		TargetPassConfig::addOptimizedRegAlloc(RegAllocPass);
}		}

void GCNPassConfig::addPostRegAlloc() {		void GCNPassConfig::addPostRegAlloc() {
addPass(&SIFixVGPRCopiesID);		addPass(&SIFixVGPRCopiesID);
addPass(&SIOptimizeExecMaskingID);		addPass(&SIOptimizeExecMaskingID);
TargetPassConfig::addPostRegAlloc();		TargetPassConfig::addPostRegAlloc();
}		}
Show All 29 Lines

lib/Target/AMDGPU/CMakeLists.txt

Show First 20 Lines • Show All 74 Lines • ▼ Show 20 Lines	add_llvm_target(AMDGPUCodeGen
R600OptimizeVectorRegisters.cpp		R600OptimizeVectorRegisters.cpp
R600Packetizer.cpp		R600Packetizer.cpp
R600RegisterInfo.cpp		R600RegisterInfo.cpp
SIAnnotateControlFlow.cpp		SIAnnotateControlFlow.cpp
SIDebuggerInsertNops.cpp		SIDebuggerInsertNops.cpp
SIFixControlFlowLiveIntervals.cpp		SIFixControlFlowLiveIntervals.cpp
SIFixSGPRCopies.cpp		SIFixSGPRCopies.cpp
SIFixVGPRCopies.cpp		SIFixVGPRCopies.cpp
		SIFixWWMLiveness.cpp
SIFoldOperands.cpp		SIFoldOperands.cpp
SIFrameLowering.cpp		SIFrameLowering.cpp
SIInsertSkips.cpp		SIInsertSkips.cpp
SIInsertWaits.cpp		SIInsertWaits.cpp
SIInsertWaitcnts.cpp		SIInsertWaitcnts.cpp
SIInstrInfo.cpp		SIInstrInfo.cpp
SIISelLowering.cpp		SIISelLowering.cpp
SILoadStoreOptimizer.cpp		SILoadStoreOptimizer.cpp
Show All 21 Lines

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

Show First 20 Lines • Show All 562 Lines • ▼ Show 20 Lines	for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();		for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {		I != E; ++I) {
MachineInstr &MI = *I;		MachineInstr &MI = *I;

switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
default:		default:
continue;		continue;
case AMDGPU::COPY:		case AMDGPU::COPY:
case AMDGPU::WQM: {		case AMDGPU::WQM:
		case AMDGPU::WWM: {
// If the destination register is a physical register there isn't really		// If the destination register is a physical register there isn't really
// much we can do to fix this.		// much we can do to fix this.
if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))		if (!TargetRegisterInfo::isVirtualRegister(MI.getOperand(0).getReg()))
continue;		continue;

const TargetRegisterClass SrcRC, DstRC;		const TargetRegisterClass SrcRC, DstRC;
std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);		std::tie(SrcRC, DstRC) = getCopyRegClasses(MI, *TRI, MRI);
if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {		if (isVGPRToSGPRCopy(SrcRC, DstRC, *TRI)) {
▲ Show 20 Lines • Show All 116 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIFixWWMLiveness.cpp

This file was added.

				//===-- SIFixWWMLiveness.cpp - Fix WWM live intervals ---------===//
				//
				// The LLVM Compiler Infrastructure
				//
				// This file is distributed under the University of Illinois Open Source
				// License. See LICENSE.TXT for details.
				//
				//===----------------------------------------------------------------------===//
				//
				/// \file
				/// \brief Computations in WWM can overwrite values in inactive channels for
				/// variables that the register allocator thinks are dead. This pass adds fake
				/// uses of those variables to WWM instructions to make sure that they aren't
				/// overwritten.
				///
				/// As an example, consider this snippet:
				/// %vgpr0 = V_MOV_B32_e32 0.0
				/// if (...) {
				/// %vgpr1 = ...
				/// %vgpr2 = WWM %vgpr1<kill>
				/// ... = %vgpr2<kill>
				/// %vgpr0 = V_MOV_B32_e32 1.0
				/// }
				/// ... = %vgpr0
				///
				/// The live intervals of %vgpr0 don't overlap with those of %vgpr1. Normally,
				/// we can safely allocate %vgpr0 and %vgpr1 in the same register, since
				nhaehnleUnsubmitted Not Done Reply Inline Actions don't nhaehnle:* *don't
				/// writing %vgpr1 would only write to channels that would be clobbered by the
				/// second write to %vgpr0 anyway. But if %vgpr1 is written with WWM enabled,
				/// it would clobber even the inactive channels for which the if-condition is
				/// false, for which %vgpr0 is supposed to be 0. This pass adds an implicit use
				/// of %vgpr0 to the WWM instruction to make sure they aren't allocated to the
				/// same register.
				///
				/// In general, we need to figure out which registers have inactive channels
				/// that are eventually used and that might be accidentally clobbered by a WWM
				/// instruction. We approximate this using two conditions:
				///
				/// 1. A definition of the variable reaches the WWM instruction.
				/// 2. The variable would be live at the WWM instruction if all its defs were
				/// partial defs (i.e. considered as a use), ignoring normal uses.
				///
				/// If a register matches both conditions, then we add an implicit use of it to
				/// the WWM instruction. Condition #2 is the heart of the matter: every
				/// definition is really a partial definition, since every VALU instruction is
				/// implicitly predicated. We can usually ignore this, but WWM forces us not
				/// to. Condition #1 prevents false positives if the variable is undefined at
				/// the WWM instruction anyway. This is overly conservative in certain cases,
				/// especially in uniform control flow, but this is a workaround anyway until
				/// LLVM gains the notion of predicated uses and definitions of variables.
				///
				//===----------------------------------------------------------------------===//

				#include "AMDGPU.h"
				#include "AMDGPUSubtarget.h"
				#include "SIInstrInfo.h"
				#include "SIRegisterInfo.h"
				#include "llvm/ADT/DepthFirstIterator.h"
				#include "llvm/ADT/SparseBitVector.h"
				#include "llvm/CodeGen/LiveIntervalAnalysis.h"
				#include "llvm/CodeGen/MachineFunctionPass.h"
				#include "llvm/CodeGen/Passes.h"
				#include "llvm/Target/TargetRegisterInfo.h"

				using namespace llvm;

				#define DEBUG_TYPE "si-fix-wwm-liveness"

				namespace {

				/// Machine function pass that adds implicit register uses to EXIT_WWM
				/// instructions (see runOnMachineFunction and runOnWWMInstruction).
				class SIFixWWMLiveness : public MachineFunctionPass {
				private:
				  // Only set when the LiveIntervals analysis is available; used to recompute
				  // the interval of each register that gains an implicit use.
				  LiveIntervals *LIS = nullptr;
				  // Both are (re)assigned at the start of every runOnMachineFunction call.
				  const SIRegisterInfo *TRI = nullptr;
				  MachineRegisterInfo *MRI = nullptr;

				public:
				  static char ID;

				  SIFixWWMLiveness() : MachineFunctionPass(ID) {
				    initializeSIFixWWMLivenessPass(*PassRegistry::getPassRegistry());
				  }

				  /// Processes every EXIT_WWM instruction in \p MF; returns true if any
				  /// instruction was changed.
				  bool runOnMachineFunction(MachineFunction &MF) override;

				  /// Adds the implicit uses required by the single EXIT_WWM instruction
				  /// \p MI; returns true if any operand was added.
				  bool runOnWWMInstruction(MachineInstr &MI);

				  /// Sets in \p Regs the bit of every VGPR defined by \p MI.
				  void addDefs(const MachineInstr &MI, SparseBitVector<> &Regs);

				  StringRef getPassName() const override { return "SI Fix WWM Liveness"; }

				  void getAnalysisUsage(AnalysisUsage &AU) const override {
				    // Should preserve the same set that TwoAddressInstructions does.
				    AU.addPreserved<SlotIndexes>();
				    AU.addPreserved<LiveIntervals>();
				    AU.addPreservedID(LiveVariablesID);
				    AU.addPreservedID(MachineLoopInfoID);
				    AU.addPreservedID(MachineDominatorsID);
				    AU.setPreservesCFG();
				    MachineFunctionPass::getAnalysisUsage(AU);
				  }
				};
				nhaehnleUnsubmitted Not Done Reply Inline Actions Duplicate function call. nhaehnle: Duplicate function call.

				} // End anonymous namespace.

				// Registers the pass with the pass registry under the DEBUG_TYPE argument
				// name ("si-fix-wwm-liveness"). The two trailing `false` arguments are the
				// macro's CFG-only and is-analysis flags.
				INITIALIZE_PASS(SIFixWWMLiveness, DEBUG_TYPE,
				"SI fix WWM liveness", false, false)

				// Storage for the pass identifier passed to MachineFunctionPass(ID).
				char SIFixWWMLiveness::ID = 0;

				// Public handle on the pass ID (declared extern in AMDGPU.h); GCNPassConfig
				// passes its address to insertPass() to schedule the pass.
				char &llvm::SIFixWWMLivenessID = SIFixWWMLiveness::ID;

				/// Factory declared in AMDGPU.h: returns a newly allocated SIFixWWMLiveness
				/// pass instance.
				FunctionPass *llvm::createSIFixWWMLivenessPass() {
				return new SIFixWWMLiveness();
				}

				/// Records in \p Regs every register defined by \p MI for which
				/// TRI->isVGPR() holds. Non-register def operands are ignored.
				void SIFixWWMLiveness::addDefs(const MachineInstr &MI,
				                               SparseBitVector<> &Regs) {
				  // Review note (nhaehnle): upper case variable names.
				  for (const MachineOperand &Def : MI.defs()) {
				    if (!Def.isReg())
				      continue;

				    const unsigned DefReg = Def.getReg();
				    if (!TRI->isVGPR(*MRI, DefReg))
				      continue;

				    Regs.set(DefReg);
				  }
				}

				/// Adds implicit uses to the EXIT_WWM instruction \p WWM.
				///
				/// A register receives an implicit use when addDefs() records it both
				///  - for an instruction from \p WWM to the end of its block, or in any other
				///    block visited by the depth-first walk starting at that block, and
				///  - for an instruction before \p WWM in its block, or in any other block
				///    visited by the inverse depth-first walk starting at that block.
				///
				/// If LiveIntervals is available, the interval of each such register is
				/// recomputed.
				///
				/// \returns true if at least one operand was added.
				bool SIFixWWMLiveness::runOnWWMInstruction(MachineInstr &WWM) {
				  MachineBasicBlock *MBB = WWM.getParent();

				  // Compute the registers that are live out of WWM by figuring out which defs
				  // are reachable from WWM.
				  SparseBitVector<> LiveOut;

				  for (auto II = MachineBasicBlock::iterator(WWM), IE = MBB->end(); II != IE;
				       ++II) {
				    addDefs(*II, LiveOut);
				  }

				  // The walk starts at MBB itself; skip it, since only its tail from WWM
				  // onwards counts and that was scanned above.
				  for (df_iterator<MachineBasicBlock *> I = ++df_begin(MBB),
				                                        E = df_end(MBB);
				       I != E; ++I) {
				    for (const MachineInstr &MI : **I) {
				      addDefs(MI, LiveOut);
				    }
				  }

				  // Compute the registers whose definitions reach WWM.
				  SparseBitVector<> Reachable;

				  // Start one step past WWM in reverse order, i.e. at its predecessor.
				  for (auto II = ++MachineBasicBlock::reverse_iterator(WWM),
				            IE = MBB->rend();
				       II != IE; ++II) {
				    addDefs(*II, Reachable);
				  }

				  // Same as above, but following predecessor edges; MBB itself is skipped.
				  for (idf_iterator<MachineBasicBlock *> I = ++idf_begin(MBB),
				                                         E = idf_end(MBB);
				       I != E; ++I) {
				    for (const MachineInstr &MI : **I) {
				      addDefs(MI, Reachable);
				    }
				  }

				  // Find the intersection, and add implicit uses.
				  LiveOut &= Reachable;

				  bool Modified = false;
				  for (unsigned Reg : LiveOut) {
				    WWM.addOperand(MachineOperand::CreateReg(Reg, false, /*isImp=*/true));
				    // Review note (nhaehnle): variable names are upper case in LLVM style.
				    if (LIS) {
				      // FIXME: is there a better way to update the live interval?
				      LIS->removeInterval(Reg);
				      LIS->createAndComputeVirtRegInterval(Reg);
				    }
				    Modified = true;
				  }

				  return Modified;
				}

				/// Caches the per-function state (LIS, TRI, MRI) and runs
				/// runOnWWMInstruction() on every EXIT_WWM instruction in \p MF.
				///
				/// \returns true if any instruction was modified.
				bool SIFixWWMLiveness::runOnMachineFunction(MachineFunction &MF) {
				  // Upper-case local name per LLVM style, as requested in review (nhaehnle).
				  bool Modified = false;

				  // This doesn't actually need LiveIntervals, but we can preserve them.
				  LIS = getAnalysisIfAvailable<LiveIntervals>();

				  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
				  const SIInstrInfo *TII = ST.getInstrInfo();

				  TRI = &TII->getRegisterInfo();
				  MRI = &MF.getRegInfo();

				  for (MachineBasicBlock &MBB : MF) {
				    for (MachineInstr &MI : MBB) {
				      if (MI.getOpcode() == AMDGPU::EXIT_WWM) {
				        Modified |= runOnWWMInstruction(MI);
				      }
				    }
				  }

				  return Modified;
				}

lib/Target/AMDGPU/SIISelLowering.cpp

Show First 20 Lines • Show All 3,289 Lines • ▼ Show 20 Lines	SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
Op.getOperand(1), Op.getOperand(2));		Op.getOperand(1), Op.getOperand(2));
return DAG.getNode(ISD::BITCAST, DL, VT, Node);		return DAG.getNode(ISD::BITCAST, DL, VT, Node);
}		}
case Intrinsic::amdgcn_wqm: {		case Intrinsic::amdgcn_wqm: {
SDValue Src = Op.getOperand(1);		SDValue Src = Op.getOperand(1);
return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),		return SDValue(DAG.getMachineNode(AMDGPU::WQM, DL, Src.getValueType(), Src),
0);		0);
}		}
		case Intrinsic::amdgcn_wwm: {
		SDValue Src = Op.getOperand(1);
		return SDValue(DAG.getMachineNode(AMDGPU::WWM, DL, Src.getValueType(), Src),
		0);
		arsenmUnsubmitted Not Done Reply Inline Actions Shouldn't this have a chain? arsenm: Shouldn't this have a chain?
		cwabbottAuthorUnsubmitted Not Done Reply Inline Actions No, because it's really just a copy instruction that happens to also enable WWM for its sources -- it doesn't have any side-effects. cwabbott: No, because it's really just a copy instruction that happens to also enable WWM for its sources…
		}
default:		default:
return Op;		return Op;
}		}
}		}

SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,		SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {		SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();		unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
▲ Show 20 Lines • Show All 2,443 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstrInfo.cpp

Show First 20 Lines • Show All 1,139 Lines • ▼ Show 20 Lines	else
MIB.add(MI.getOperand(2));		MIB.add(MI.getOperand(2));

Bundler.append(MIB);		Bundler.append(MIB);
llvm::finalizeBundle(MBB, Bundler.begin());		llvm::finalizeBundle(MBB, Bundler.begin());

MI.eraseFromParent();		MI.eraseFromParent();
break;		break;
}		}
		case AMDGPU::EXIT_WWM: {
		// This only gets its own opcode so that SIFixWWMLiveness can tell when WWM
		// is exited.
		MI.setDesc(get(AMDGPU::S_MOV_B64));
		break;
		}
}		}
return true;		return true;
}		}

bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,		bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI,
MachineOperand &Src0,		MachineOperand &Src0,
unsigned Src0OpName,		unsigned Src0OpName,
MachineOperand &Src1,		MachineOperand &Src1,
▲ Show 20 Lines • Show All 1,493 Lines • ▼ Show 20 Lines
unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {		unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) {
switch (MI.getOpcode()) {		switch (MI.getOpcode()) {
default: return AMDGPU::INSTRUCTION_LIST_END;		default: return AMDGPU::INSTRUCTION_LIST_END;
case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;		case AMDGPU::REG_SEQUENCE: return AMDGPU::REG_SEQUENCE;
case AMDGPU::COPY: return AMDGPU::COPY;		case AMDGPU::COPY: return AMDGPU::COPY;
case AMDGPU::PHI: return AMDGPU::PHI;		case AMDGPU::PHI: return AMDGPU::PHI;
case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;		case AMDGPU::INSERT_SUBREG: return AMDGPU::INSERT_SUBREG;
case AMDGPU::WQM: return AMDGPU::WQM;		case AMDGPU::WQM: return AMDGPU::WQM;
		case AMDGPU::WWM: return AMDGPU::WWM;
case AMDGPU::S_MOV_B32:		case AMDGPU::S_MOV_B32:
return MI.getOperand(1).isReg() ?		return MI.getOperand(1).isReg() ?
AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;		AMDGPU::COPY : AMDGPU::V_MOV_B32_e32;
case AMDGPU::S_ADD_I32:		case AMDGPU::S_ADD_I32:
case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;		case AMDGPU::S_ADD_U32: return AMDGPU::V_ADD_I32_e32;
case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;		case AMDGPU::S_ADDC_U32: return AMDGPU::V_ADDC_U32_e32;
case AMDGPU::S_SUB_I32:		case AMDGPU::S_SUB_I32:
case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;		case AMDGPU::S_SUB_U32: return AMDGPU::V_SUB_I32_e32;
▲ Show 20 Lines • Show All 1,289 Lines • ▼ Show 20 Lines	const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
// For target instructions, getOpRegClass just returns the virtual register		// For target instructions, getOpRegClass just returns the virtual register
// class associated with the operand, so we need to find an equivalent VGPR		// class associated with the operand, so we need to find an equivalent VGPR
// register class in order to move the instruction to the VALU.		// register class in order to move the instruction to the VALU.
case AMDGPU::COPY:		case AMDGPU::COPY:
case AMDGPU::PHI:		case AMDGPU::PHI:
case AMDGPU::REG_SEQUENCE:		case AMDGPU::REG_SEQUENCE:
case AMDGPU::INSERT_SUBREG:		case AMDGPU::INSERT_SUBREG:
case AMDGPU::WQM:		case AMDGPU::WQM:
		case AMDGPU::WWM:
if (RI.hasVGPRs(NewDstRC))		if (RI.hasVGPRs(NewDstRC))
return nullptr;		return nullptr;

NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);		NewDstRC = RI.getEquivalentVGPRClass(NewDstRC);
if (!NewDstRC)		if (!NewDstRC)
return nullptr;		return nullptr;
return NewDstRC;		return NewDstRC;
default:		default:
▲ Show 20 Lines • Show All 390 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIInstructions.td

Show First 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	def V_CNDMASK_B64_PSEUDO : VOP3Common <(outs VReg_64:$vdst),
let usesCustomInserter = 1;		let usesCustomInserter = 1;
}		}

// 64-bit vector move instruction. This is mainly used by the SIFoldOperands		// 64-bit vector move instruction. This is mainly used by the SIFoldOperands
// pass to enable folding of inline immediates.		// pass to enable folding of inline immediates.
def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),		def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
(ins VSrc_b64:$src0)>;		(ins VSrc_b64:$src0)>;

// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy		// Pseudoinstruction for @llvm.amdgcn.wqm. It is turned into a copy after the
// after the WQM pass processes them.		// WQM pass processes it.
def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;		def WQM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;

		// Pseudoinstruction for @llvm.amdgcn.wwm. It is turned into a copy post-RA, so
		// that the @earlyclobber is respected. The @earlyclobber is to make sure that
		// the instruction that defines $src0 (which is run in WWM) doesn't
		// accidentally clobber inactive channels of $vdst.
		let Constraints = "@earlyclobber $vdst" in {
		def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
		}

} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]		} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]

		// Pseudoinstruction marking the point where WWM is exited. It has its own
		// opcode only so that SIFixWWMLiveness can find it; SIInstrInfo later rewrites
		// it to S_MOV_B64.
		// Review (arsenm, done): the output is named $sdst to match the other pseudos.
		def EXIT_WWM : SPseudoInstSI <(outs SReg_64:$sdst), (ins SReg_64:$src0)> {
		  let hasSideEffects = 0;
		  let mayLoad = 0;
		  let mayStore = 0;
		}

let usesCustomInserter = 1, SALU = 1 in {		let usesCustomInserter = 1, SALU = 1 in {
def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),		def GET_GROUPSTATICSIZE : PseudoInstSI <(outs SReg_32:$sdst), (ins),
[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;		[(set SReg_32:$sdst, (int_amdgcn_groupstaticsize))]>;
} // End let usesCustomInserter = 1, SALU = 1		} // End let usesCustomInserter = 1, SALU = 1

def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),		def S_MOV_B64_term : PseudoInstSI<(outs SReg_64:$dst),
(ins SSrc_b64:$src0)> {		(ins SSrc_b64:$src0)> {
let SALU = 1;		let SALU = 1;
▲ Show 20 Lines • Show All 1,180 Lines • Show Last 20 Lines

lib/Target/AMDGPU/SIWholeQuadMode.cpp

//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//		//===-- SIWholeQuadMode.cpp - enter and suspend whole quad mode -----------===//
//		//
// The LLVM Compiler Infrastructure		// The LLVM Compiler Infrastructure
//		//
// This file is distributed under the University of Illinois Open Source		// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.		// License. See LICENSE.TXT for details.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
/// \file		/// \file
/// \brief This pass adds instructions to enable whole quad mode for pixel		/// \brief This pass adds instructions to enable whole quad mode for pixel
/// shaders.		/// shaders, and whole wavefront mode for all programs.
///		///
/// Whole quad mode is required for derivative computations, but it interferes		/// Whole quad mode is required for derivative computations, but it interferes
/// with shader side effects (stores and atomics). This pass is run on the		/// with shader side effects (stores and atomics). This pass is run on the
/// scheduled machine IR but before register coalescing, so that machine SSA is		/// scheduled machine IR but before register coalescing, so that machine SSA is
/// available for analysis. It ensures that WQM is enabled when necessary, but		/// available for analysis. It ensures that WQM is enabled when necessary, but
/// disabled around stores and atomics.		/// disabled around stores and atomics.
///		///
/// When necessary, this pass creates a function prolog		/// When necessary, this pass creates a function prolog
///		///
/// S_MOV_B64 LiveMask, EXEC		/// S_MOV_B64 LiveMask, EXEC
/// S_WQM_B64 EXEC, EXEC		/// S_WQM_B64 EXEC, EXEC
///		///
/// to enter WQM at the top of the function and surrounds blocks of Exact		/// to enter WQM at the top of the function and surrounds blocks of Exact
/// instructions by		/// instructions by
///		///
/// S_AND_SAVEEXEC_B64 Tmp, LiveMask		/// S_AND_SAVEEXEC_B64 Tmp, LiveMask
/// ...		/// ...
/// S_MOV_B64 EXEC, Tmp		/// S_MOV_B64 EXEC, Tmp
///		///
		/// We also compute when a sequence of instructions requires Whole Wavefront
		/// Mode (WWM) and insert instructions to save and restore it:
		///
		/// S_OR_SAVEEXEC_B64 Tmp, -1
		/// ...
		/// S_MOV_B64 EXEC, Tmp
		///
/// In order to avoid excessive switching during sequences of Exact		/// In order to avoid excessive switching during sequences of Exact
/// instructions, the pass first analyzes which instructions must be run in WQM		/// instructions, the pass first analyzes which instructions must be run in WQM
/// (aka which instructions produce values that lead to derivative		/// (aka which instructions produce values that lead to derivative
/// computations).		/// computations).
///		///
/// Basic blocks are always exited in WQM as long as some successor needs WQM.		/// Basic blocks are always exited in WQM as long as some successor needs WQM.
///		///
/// There is room for improvement given better control flow analysis:		/// There is room for improvement given better control flow analysis:
Show All 40 Lines
using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "si-wqm"		#define DEBUG_TYPE "si-wqm"

namespace {		namespace {

enum {		enum {
StateWQM = 0x1,		StateWQM = 0x1,
StateExact = 0x2,		StateWWM = 0x2,
		StateExact = 0x4,
};		};

struct PrintState {		struct PrintState {
public:		public:
int State;		int State;

explicit PrintState(int State) : State(State) {}		explicit PrintState(int State) : State(State) {}
};		};

static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {		static raw_ostream &operator<<(raw_ostream &OS, const PrintState &PS) {
if (PS.State & StateWQM)		if (PS.State & StateWQM)
OS << "WQM";		OS << "WQM";
if (PS.State & StateExact) {		if (PS.State & StateWWM) {
if (PS.State & StateWQM)		if (PS.State & StateWQM)
OS << '\|';		OS << '\|';
		OS << "WWM";
		}
		if (PS.State & StateExact) {
		if (PS.State & (StateWQM \| StateWWM))
		OS << '\|';
OS << "Exact";		OS << "Exact";
}		}

return OS;		return OS;
}		}

struct InstrInfo {		struct InstrInfo {
char Needs = 0;		char Needs = 0;
Show All 13 Lines	struct WorkItem {

WorkItem() = default;		WorkItem() = default;
WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}		WorkItem(MachineBasicBlock *MBB) : MBB(MBB) {}
WorkItem(MachineInstr *MI) : MI(MI) {}		WorkItem(MachineInstr *MI) : MI(MI) {}
};		};

class SIWholeQuadMode : public MachineFunctionPass {		class SIWholeQuadMode : public MachineFunctionPass {
private:		private:
		CallingConv::ID CallingConv;
		nhaehnleUnsubmitted Not Done Reply Inline Actions LLVM style is upper-case variable names. nhaehnle: LLVM style is upper-case variable names.
const SIInstrInfo *TII;		const SIInstrInfo *TII;
const SIRegisterInfo *TRI;		const SIRegisterInfo *TRI;
MachineRegisterInfo *MRI;		MachineRegisterInfo *MRI;
LiveIntervals *LIS;		LiveIntervals *LIS;

DenseMap<const MachineInstr *, InstrInfo> Instructions;		DenseMap<const MachineInstr *, InstrInfo> Instructions;
DenseMap<MachineBasicBlock *, BlockInfo> Blocks;		DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;		SmallVector<MachineInstr *, 1> LiveMaskQueries;
Show All 17 Lines	private:
MachineBasicBlock::iterator		MachineBasicBlock::iterator
prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,		prepareInsertion(MachineBasicBlock &MBB, MachineBasicBlock::iterator First,
MachineBasicBlock::iterator Last, bool PreferLast,		MachineBasicBlock::iterator Last, bool PreferLast,
bool SaveSCC);		bool SaveSCC);
void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toExact(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SaveWQM, unsigned LiveMaskReg);		unsigned SaveWQM, unsigned LiveMaskReg);
void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,		void toWQM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
unsigned SavedWQM);		unsigned SavedWQM);
		void toWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
		unsigned SaveOrig);
		void fromWWM(MachineBasicBlock &MBB, MachineBasicBlock::iterator Before,
		unsigned SavedOrig);
void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);		void processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg, bool isEntry);

void lowerLiveMaskQueries(unsigned LiveMaskReg);		void lowerLiveMaskQueries(unsigned LiveMaskReg);
void lowerCopyInstrs();		void lowerCopyInstrs();

public:		public:
static char ID;		static char ID;

▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	for (const auto &BII : Blocks) {
}		}
}		}
}		}

void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,		void SIWholeQuadMode::markInstruction(MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
InstrInfo &II = Instructions[&MI];		InstrInfo &II = Instructions[&MI];

assert(Flag == StateWQM);		assert(!(Flag & StateExact) && Flag != 0);

// Remove any disabled states from the flag. The user that required it gets		// Remove any disabled states from the flag. The user that required it gets
// an undefined value in the helper lanes. For example, this can happen if		// an undefined value in the helper lanes. For example, this can happen if
// the result of an atomic is used by instruction that requires WQM, where		// the result of an atomic is used by instruction that requires WQM, where
// ignoring the request for WQM is correct as per the relevant specs.		// ignoring the request for WQM is correct as per the relevant specs.
Flag &= ~II.Disabled;		Flag &= ~II.Disabled;

// Ignore if the flag is already encompassed by the existing needs, or we		// Ignore if the flag is already encompassed by the existing needs, or we
// just disabled everything.		// just disabled everything.
if ((II.Needs & Flag) == Flag)		if ((II.Needs & Flag) == Flag)
return;		return;

II.Needs \|= Flag;		II.Needs \|= Flag;
Worklist.push_back(&MI);		Worklist.push_back(&MI);
}		}

/// Mark all instructions defining the uses in \p MI with \p Flag.		/// Mark all instructions defining the uses in \p MI with \p Flag.
void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,		void SIWholeQuadMode::markInstructionUses(const MachineInstr &MI, char Flag,
std::vector<WorkItem> &Worklist) {		std::vector<WorkItem> &Worklist) {
assert(Flag == StateWQM);
for (const MachineOperand &Use : MI.uses()) {		for (const MachineOperand &Use : MI.uses()) {
if (!Use.isReg() \|\| !Use.isUse())		if (!Use.isReg() \|\| !Use.isUse())
continue;		continue;

unsigned Reg = Use.getReg();		unsigned Reg = Use.getReg();

// Handle physical registers that we need to track; this is mostly relevant		// Handle physical registers that we need to track; this is mostly relevant
// for VCC, which can appear as the (implicit) input of a uniform branch,		// for VCC, which can appear as the (implicit) input of a uniform branch,
▲ Show 20 Lines • Show All 42 Lines • ▼ Show 20 Lines	for (auto BI = RPOT.begin(), BE = RPOT.end(); BI != BE; ++BI) {
BlockInfo &BBI = Blocks[&MBB];		BlockInfo &BBI = Blocks[&MBB];

for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {		for (auto II = MBB.begin(), IE = MBB.end(); II != IE; ++II) {
MachineInstr &MI = *II;		MachineInstr &MI = *II;
InstrInfo &III = Instructions[&MI];		InstrInfo &III = Instructions[&MI];
unsigned Opcode = MI.getOpcode();		unsigned Opcode = MI.getOpcode();
char Flags = 0;		char Flags = 0;

if (TII->isDS(Opcode)) {		if (TII->isDS(Opcode) && CallingConv == CallingConv::AMDGPU_PS) {
Flags = StateWQM;		Flags = StateWQM;
} else if (TII->isWQM(Opcode)) {		} else if (TII->isWQM(Opcode)) {
// Sampling instructions don't need to produce results for all pixels		// Sampling instructions don't need to produce results for all pixels
// in a quad, they just require all inputs of a quad to have been		// in a quad, they just require all inputs of a quad to have been
// computed for derivatives.		// computed for derivatives.
markInstructionUses(MI, StateWQM, Worklist);		markInstructionUses(MI, StateWQM, Worklist);
GlobalFlags \|= StateWQM;		GlobalFlags \|= StateWQM;
continue;		continue;
} else if (Opcode == AMDGPU::WQM) {		} else if (Opcode == AMDGPU::WQM) {
// The WQM intrinsic requires its output to have all the helper lanes		// The WQM intrinsic requires its output to have all the helper lanes
// correct, so we need it to be in WQM.		// correct, so we need it to be in WQM.
Flags = StateWQM;		Flags = StateWQM;
LowerToCopyInstrs.push_back(&MI);		LowerToCopyInstrs.push_back(&MI);
		} else if (Opcode == AMDGPU::WWM) {
		// The WWM intrinsic doesn't make the same guarantee, and it also needs
		// to be executed in WQM or Exact so that its copy doesn't clobber
		// inactive lanes.
		markInstructionUses(MI, StateWWM, Worklist);
		GlobalFlags \|= StateWWM;
		LowerToCopyInstrs.push_back(&MI);
		continue;
} else if (TII->isDisableWQM(MI)) {		} else if (TII->isDisableWQM(MI)) {
BBI.Needs \|= StateExact;		BBI.Needs \|= StateExact;
if (!(BBI.InNeeds & StateExact)) {		if (!(BBI.InNeeds & StateExact)) {
BBI.InNeeds \|= StateExact;		BBI.InNeeds \|= StateExact;
Worklist.push_back(&MBB);		Worklist.push_back(&MBB);
}		}
GlobalFlags \|= StateExact;		GlobalFlags \|= StateExact;
III.Disabled = StateWQM;		III.Disabled = StateWQM \| StateWWM;
continue;		continue;
} else {		} else {
if (Opcode == AMDGPU::SI_PS_LIVE) {		if (Opcode == AMDGPU::SI_PS_LIVE) {
LiveMaskQueries.push_back(&MI);		LiveMaskQueries.push_back(&MI);
} else if (WQMOutputs) {		} else if (WQMOutputs) {
// The function is in machine SSA form, which means that physical		// The function is in machine SSA form, which means that physical
// VGPRs correspond to shader inputs and outputs. Inputs are		// VGPRs correspond to shader inputs and outputs. Inputs are
// only used, outputs are only defined.		// only used, outputs are only defined.
▲ Show 20 Lines • Show All 43 Lines • ▼ Show 20 Lines	if (II.Needs & StateWQM) {
if (!(BI.InNeeds & StateWQM)) {		if (!(BI.InNeeds & StateWQM)) {
BI.InNeeds \|= StateWQM;		BI.InNeeds \|= StateWQM;
Worklist.push_back(MBB);		Worklist.push_back(MBB);
}		}
}		}

// Propagate backwards within block		// Propagate backwards within block
if (MachineInstr *PrevMI = MI.getPrevNode()) {		if (MachineInstr *PrevMI = MI.getPrevNode()) {
char InNeeds = II.Needs \| II.OutNeeds;		char InNeeds = (II.Needs & ~StateWWM) \| II.OutNeeds;
if (!PrevMI->isPHI()) {		if (!PrevMI->isPHI()) {
InstrInfo &PrevII = Instructions[PrevMI];		InstrInfo &PrevII = Instructions[PrevMI];
if ((PrevII.OutNeeds \| InNeeds) != PrevII.OutNeeds) {		if ((PrevII.OutNeeds \| InNeeds) != PrevII.OutNeeds) {
PrevII.OutNeeds \|= InNeeds;		PrevII.OutNeeds \|= InNeeds;
Worklist.push_back(PrevMI);		Worklist.push_back(PrevMI);
}		}
}		}
}		}
▲ Show 20 Lines • Show All 189 Lines • ▼ Show 20 Lines	if (SavedWQM) {
MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);
}		}

LIS->InsertMachineInstrInMaps(*MI);		LIS->InsertMachineInstrInMaps(*MI);
}		}

		void SIWholeQuadMode::toWWM(MachineBasicBlock &MBB,
		MachineBasicBlock::iterator Before,
		unsigned SaveOrig)
		{
		MachineInstr *MI;

		assert(SaveOrig);
		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::S_OR_SAVEEXEC_B64),
		SaveOrig)
		.addImm(-1);
		LIS->InsertMachineInstrInMaps(*MI);
		}

		nhaehnleUnsubmitted Not Done Reply Inline Actions This could be an S_OR_SAVEEXEC_B64. nhaehnle: This could be an S_OR_SAVEEXEC_B64.
		cwabbottAuthorUnsubmitted Not Done Reply Inline Actions This is a little tricky, since S_OR_SAVEEXEC_B64 clobbers SCC while the S_MOV_B64 doesn't, so we need to be more careful about where we put it. But I didn't like how I was handling prepareInsertion() in the face of WWM anyways, so I've refactored it to make using S_OR_SAVEEXEC_B64 possible. cwabbott: This is a little tricky, since S_OR_SAVEEXEC_B64 clobbers SCC while the S_MOV_B64 doesn't, so…
		arsenmUnsubmitted Not Done Reply Inline Actions I think this pass should probably not be producing the save exec instructions directly. We re-form these later and this may be a problem with -O0 arsenm: I think this pass should probably not be producing the save exec instructions directly. We re…
		cwabbottAuthorUnsubmitted Not Done Reply Inline Actions The existing pass already uses S_AND_SAVEEXEC_B64, so I don't see a reason not to use it here; I was just trying to be consistent. If we want to let that pass handle it, it should probably be a separate change. Also, because of that, any problem with -O0 is probably an already-existing bug (although I doubt this pass got much testing with -O0 before), so I don't want to block anything on that. I can try adding an -O0 line to the test though. cwabbott: The existing pass already uses S_AND_SAVEEXEC_B64, so I don't see a reason not to use it here…
		void SIWholeQuadMode::fromWWM(MachineBasicBlock &MBB,
		MachineBasicBlock::iterator Before,
		unsigned SavedOrig)
		{
		arsenmUnsubmitted Done Reply Inline Actions Formatting arsenm: Formatting
		MachineInstr *MI;

		assert(SavedOrig);
		MI = BuildMI(MBB, Before, DebugLoc(), TII->get(AMDGPU::EXIT_WWM), AMDGPU::EXEC)
		.addReg(SavedOrig);
		LIS->InsertMachineInstrInMaps(*MI);
		}

void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,		void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
bool isEntry) {		bool isEntry) {
auto BII = Blocks.find(&MBB);		auto BII = Blocks.find(&MBB);
if (BII == Blocks.end())		if (BII == Blocks.end())
return;		return;

const BlockInfo &BI = BII->second;		const BlockInfo &BI = BII->second;

if (!(BI.InNeeds & StateWQM))
return;

// This is a non-entry block that is WQM throughout, so no need to do		// This is a non-entry block that is WQM throughout, so no need to do
// anything.		// anything.
if (!isEntry && !(BI.Needs & StateExact) && BI.OutNeeds != StateExact)		if (!isEntry && BI.Needs == StateWQM && BI.OutNeeds != StateExact)
return;		return;

DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");		DEBUG(dbgs() << "\nProcessing block BB#" << MBB.getNumber() << ":\n");

unsigned SavedWQMReg = 0;		unsigned SavedWQMReg = 0;
		unsigned SavedNonWWMReg = 0;
bool WQMFromExec = isEntry;		bool WQMFromExec = isEntry;
char State = isEntry ? StateExact : StateWQM;		char State = (isEntry \|\| !(BI.InNeeds & StateWQM)) ? StateExact : StateWQM;
		char NonWWMState = 0;

auto II = MBB.getFirstNonPHI(), IE = MBB.end();		auto II = MBB.getFirstNonPHI(), IE = MBB.end();
if (isEntry)		if (isEntry)
++II; // Skip the instruction that saves LiveMask		++II; // Skip the instruction that saves LiveMask

MachineBasicBlock::iterator First = IE;		// This stores the first instruction where it's safe to switch from WQM to
		// Exact or vice versa.
		MachineBasicBlock::iterator FirstWQM = IE;

		// This stores the first instruction where it's safe to switch from WWM to
		// Exact/WQM or to switch to WWM. It must always be the same as, or after,
		// FirstWQM since if it's safe to switch to/from WWM, it must be safe to
		// switch to/from WQM as well.
		MachineBasicBlock::iterator FirstWWM = IE;
for (;;) {		for (;;) {
MachineBasicBlock::iterator Next = II;		MachineBasicBlock::iterator Next = II;
char Needs = StateExact \| StateWQM;		char Needs = StateExact \| StateWQM; // WWM is disabled by default
char OutNeeds = 0;		char OutNeeds = 0;

if (First == IE)		if (FirstWQM == IE)
First = II;		FirstWQM = II;

		if (FirstWWM == IE)
		FirstWWM = II;

		// First, figure out the allowed states (Needs) based on the propagated
		// flags.
if (II != IE) {		if (II != IE) {
MachineInstr &MI = *II;		MachineInstr &MI = *II;

if (requiresCorrectState(MI)) {		if (requiresCorrectState(MI)) {
auto III = Instructions.find(&MI);		auto III = Instructions.find(&MI);
if (III != Instructions.end()) {		if (III != Instructions.end()) {
if (III->second.Needs & StateWQM)		if (III->second.Needs & StateWWM)
		Needs = StateWWM;
		else if (III->second.Needs & StateWQM)
Needs = StateWQM;		Needs = StateWQM;
else		else
Needs &= ~III->second.Disabled;		Needs &= ~III->second.Disabled;
OutNeeds = III->second.OutNeeds;		OutNeeds = III->second.OutNeeds;
}		}
		} else {
		// If the instruction doesn't actually need a correct EXEC, then we can
		// safely leave WWM enabled.
		Needs = StateExact \| StateWQM \| StateWWM;
}		}

if (MI.isTerminator() && OutNeeds == StateExact)		if (MI.isTerminator() && OutNeeds == StateExact)
Needs = StateExact;		Needs = StateExact;

if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)		if (MI.getOpcode() == AMDGPU::SI_ELSE && BI.OutNeeds == StateExact)
MI.getOperand(3).setImm(1);		MI.getOperand(3).setImm(1);

++Next;		++Next;
} else {		} else {
// End of basic block		// End of basic block
if (BI.OutNeeds & StateWQM)		if (BI.OutNeeds & StateWQM)
Needs = StateWQM;		Needs = StateWQM;
else if (BI.OutNeeds == StateExact)		else if (BI.OutNeeds == StateExact)
Needs = StateExact;		Needs = StateExact;
else		else
Needs = StateWQM \| StateExact;		Needs = StateWQM \| StateExact;
}		}

		// Now, transition if necessary.
if (!(Needs & State)) {		if (!(Needs & State)) {
		MachineBasicBlock::iterator First;
		if (State == StateWWM \|\| Needs == StateWWM) {
		// We must switch to or from WWM
		First = FirstWWM;
		} else {
		// We only need to switch to/from WQM, so we can use FirstWQM
		First = FirstWQM;
		}

MachineBasicBlock::iterator Before =		MachineBasicBlock::iterator Before =
prepareInsertion(MBB, First, II, Needs == StateWQM,		prepareInsertion(MBB, First, II, Needs == StateWQM,
Needs == StateExact \|\| WQMFromExec);		Needs == StateExact \|\| WQMFromExec);

if (Needs == StateExact) {		if (State == StateWWM) {
		assert(SavedNonWWMReg);
		fromWWM(MBB, Before, SavedNonWWMReg);
		State = NonWWMState;
		}

		if (Needs == StateWWM) {
		NonWWMState = State;
		SavedNonWWMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
		toWWM(MBB, Before, SavedNonWWMReg);
		State = StateWWM;
		} else {
		if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
if (!WQMFromExec && (OutNeeds & StateWQM))		if (!WQMFromExec && (OutNeeds & StateWQM))
SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		SavedWQMReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);

toExact(MBB, Before, SavedWQMReg, LiveMaskReg);		toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
State = StateExact;		State = StateExact;
} else {		} else if (State == StateExact && (Needs & StateWQM) &&
assert(Needs == StateWQM);		!(Needs & StateExact)) {
assert(WQMFromExec == (SavedWQMReg == 0));		assert(WQMFromExec == (SavedWQMReg == 0));

toWQM(MBB, Before, SavedWQMReg);		toWQM(MBB, Before, SavedWQMReg);

if (SavedWQMReg) {		if (SavedWQMReg) {
LIS->createAndComputeVirtRegInterval(SavedWQMReg);		LIS->createAndComputeVirtRegInterval(SavedWQMReg);
SavedWQMReg = 0;		SavedWQMReg = 0;
}		}
State = StateWQM;		State = StateWQM;
		} else {
		// We can get here if we transitioned from WWM to a non-WWM state that
		// already matches our needs, but we shouldn't need to do anything.
		assert(Needs & State);
		}
}		}

First = IE;
}		}

		if (Needs != (StateExact \| StateWQM \| StateWWM)) {
if (Needs != (StateExact \| StateWQM))		if (Needs != (StateExact \| StateWQM))
First = IE;		FirstWQM = IE;
		FirstWWM = IE;
		}

if (II == IE)		if (II == IE)
break;		break;
II = Next;		II = Next;
}		}
}		}

void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {		void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
for (MachineInstr *MI : LiveMaskQueries) {		for (MachineInstr *MI : LiveMaskQueries) {
const DebugLoc &DL = MI->getDebugLoc();		const DebugLoc &DL = MI->getDebugLoc();
unsigned Dest = MI->getOperand(0).getReg();		unsigned Dest = MI->getOperand(0).getReg();
MachineInstr *Copy =		MachineInstr *Copy =
BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)		BuildMI(*MI->getParent(), MI, DL, TII->get(AMDGPU::COPY), Dest)
.addReg(LiveMaskReg);		.addReg(LiveMaskReg);

LIS->ReplaceMachineInstrInMaps(MI, Copy);		LIS->ReplaceMachineInstrInMaps(MI, Copy);
MI->eraseFromParent();		MI->eraseFromParent();
}		}
}		}

void SIWholeQuadMode::lowerCopyInstrs() {		void SIWholeQuadMode::lowerCopyInstrs() {
for (MachineInstr *MI : LowerToCopyInstrs)		for (MachineInstr *MI : LowerToCopyInstrs)
MI->setDesc(TII->get(AMDGPU::COPY));		MI->setDesc(TII->get(AMDGPU::COPY));
		nhaehnleUnsubmitted Not Done Reply Inline Actions This should be simpler with MI->setDesc on the previous patch :) nhaehnle: This should be simpler with MI->setDesc on the previous patch :)
}		}

bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {		bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction()->getCallingConv() != CallingConv::AMDGPU_PS)
arsenmUnsubmitted Not Done Reply Inline Actions Why remove this? It should definitely exit early for compute arsenm: Why remove this? It should definitely exit early for compute
cwabbottAuthorUnsubmitted Not Done Reply Inline Actions Because OpenGL compute shaders (and geometry shaders, and tessellation shaders etc.) can all use WWM for doing non-uniform wavefront reductions, so we at least need to scan the program for WWM instructions. ROCm will probably want it at some point too. cwabbott: Because OpenGL compute shaders (and geometry shaders, and tessellation shaders etc.) can all use…
return false;

Instructions.clear();		Instructions.clear();
Blocks.clear();		Blocks.clear();
LiveMaskQueries.clear();		LiveMaskQueries.clear();
LowerToCopyInstrs.clear();		LowerToCopyInstrs.clear();
		CallingConv = MF.getFunction()->getCallingConv();

const SISubtarget &ST = MF.getSubtarget<SISubtarget>();		const SISubtarget &ST = MF.getSubtarget<SISubtarget>();

TII = ST.getInstrInfo();		TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();		TRI = &TII->getRegisterInfo();
MRI = &MF.getRegInfo();		MRI = &MF.getRegInfo();
LIS = &getAnalysis<LiveIntervals>();		LIS = &getAnalysis<LiveIntervals>();

char GlobalFlags = analyzeFunction(MF);		char GlobalFlags = analyzeFunction(MF);
		unsigned LiveMaskReg = 0;
if (!(GlobalFlags & StateWQM)) {		if (!(GlobalFlags & StateWQM)) {
lowerLiveMaskQueries(AMDGPU::EXEC);		lowerLiveMaskQueries(AMDGPU::EXEC);
		if (!(GlobalFlags & StateWWM))
return !LiveMaskQueries.empty();		return !LiveMaskQueries.empty();
}		} else {

// Store a copy of the original live mask when required		// Store a copy of the original live mask when required
unsigned LiveMaskReg = 0;
{
MachineBasicBlock &Entry = MF.front();		MachineBasicBlock &Entry = MF.front();
MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();		MachineBasicBlock::iterator EntryMI = Entry.getFirstNonPHI();

if (GlobalFlags & StateExact \|\| !LiveMaskQueries.empty()) {		if (GlobalFlags & StateExact \|\| !LiveMaskQueries.empty()) {
LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);		LiveMaskReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),		MachineInstr *MI = BuildMI(Entry, EntryMI, DebugLoc(),
TII->get(AMDGPU::COPY), LiveMaskReg)		TII->get(AMDGPU::COPY), LiveMaskReg)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);
LIS->InsertMachineInstrInMaps(*MI);		LIS->InsertMachineInstrInMaps(*MI);
}		}

		lowerLiveMaskQueries(LiveMaskReg);

if (GlobalFlags == StateWQM) {		if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.		// For a shader that needs only WQM, we can just set it once.
BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),		BuildMI(Entry, EntryMI, DebugLoc(), TII->get(AMDGPU::S_WQM_B64),
AMDGPU::EXEC)		AMDGPU::EXEC)
.addReg(AMDGPU::EXEC);		.addReg(AMDGPU::EXEC);

lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();		lowerCopyInstrs();
// EntryMI may become invalid here		// EntryMI may become invalid here
return true;		return true;
}		}
}		}

DEBUG(printInfo());		DEBUG(printInfo());

lowerLiveMaskQueries(LiveMaskReg);
lowerCopyInstrs();		lowerCopyInstrs();

// Handle the general case		// Handle the general case
for (auto BII : Blocks)		for (auto BII : Blocks)
processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());		processBlock(BII.first, LiveMaskReg, BII.first == &MF.begin());

// Physical registers like SCC aren't tracked by default anyway, so just		// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining		// removing the ranges we computed is the simplest option for maintaining
// the analysis results.		// the analysis results.
LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));		LIS->removeRegUnit(*MCRegUnitIterator(AMDGPU::SCC, TRI));

return true;		return true;
}		}

test/CodeGen/AMDGPU/fix-wwm-liveness.mir

This file was added.

				# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fix-wwm-liveness -o - %s \| FileCheck %s
				#CHECK: %exec = EXIT_WWM killed %19, implicit %21

				---
				name: test_wwm_liveness
				alignment: 0
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				tracksRegLiveness: true
				registers:
				- { id: 0, class: sreg_64, preferred-register: '' }
				- { id: 1, class: sgpr_32, preferred-register: '' }
				- { id: 2, class: sgpr_32, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: vgpr_32, preferred-register: '' }
				- { id: 6, class: vgpr_32, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_64, preferred-register: '%vcc' }
				- { id: 9, class: sreg_64, preferred-register: '' }
				- { id: 10, class: sreg_32_xm0, preferred-register: '' }
				- { id: 11, class: sreg_64, preferred-register: '' }
				- { id: 12, class: sreg_32_xm0, preferred-register: '' }
				- { id: 13, class: sreg_32_xm0, preferred-register: '' }
				- { id: 14, class: sreg_32_xm0, preferred-register: '' }
				- { id: 15, class: sreg_128, preferred-register: '' }
				- { id: 16, class: vgpr_32, preferred-register: '' }
				- { id: 17, class: vgpr_32, preferred-register: '' }
				- { id: 18, class: vgpr_32, preferred-register: '' }
				- { id: 19, class: sreg_64, preferred-register: '' }
				- { id: 20, class: sreg_64, preferred-register: '' }
				- { id: 21, class: vgpr_32, preferred-register: '' }
				- { id: 22, class: sreg_64, preferred-register: '' }
				- { id: 23, class: sreg_64, preferred-register: '' }
				liveins:
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 0
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				maxCallFrameSize: 4294967295
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				savePoint: ''
				restorePoint: ''
				fixedStack:
				stack:
				constants:
				body: \|
				bb.0:
				successors: %bb.1(0x40000000), %bb.2(0x40000000)

				%21 = V_MOV_B32_e32 0, implicit %exec
				%5 = V_MBCNT_LO_U32_B32_e64 -1, 0, implicit %exec
				%6 = V_MBCNT_HI_U32_B32_e32 -1, killed %5, implicit %exec
				%8 = V_CMP_GT_U32_e64 32, killed %6, implicit %exec
				%22 = COPY %exec, implicit-def %exec
				%23 = S_AND_B64 %22, %8, implicit-def dead %scc
				%0 = S_XOR_B64 %23, %22, implicit-def dead %scc
				%exec = S_MOV_B64_term killed %23
				SI_MASK_BRANCH %bb.2, implicit %exec
				S_BRANCH %bb.1

				bb.1:
				successors: %bb.2(0x80000000)

				%13 = S_MOV_B32 61440
				%14 = S_MOV_B32 -1
				%15 = REG_SEQUENCE undef %12, 1, undef %10, 2, killed %14, 3, killed %13, 4
				%19 = COPY %exec
				%exec = S_MOV_B64 -1
				%16 = BUFFER_LOAD_DWORD_OFFSET %15, 0, 0, 0, 0, 0, implicit %exec :: (volatile load 4)
				%17 = V_ADD_F32_e32 1065353216, killed %16, implicit %exec
				%exec = EXIT_WWM killed %19
				%21 = V_MOV_B32_e32 1, implicit %exec
				early-clobber %18 = WWM killed %17, implicit %exec
				BUFFER_STORE_DWORD_OFFSET killed %18, killed %15, 0, 0, 0, 0, 0, implicit %exec :: (store 4)

				bb.2:
				%exec = S_OR_B64 %exec, killed %0, implicit-def %scc
				%vgpr0 = COPY killed %21
				SI_RETURN_TO_EPILOG killed %vgpr0

				...

test/CodeGen/AMDGPU/wqm.ll

	Show First 20 Lines • Show All 95 Lines • ▼ Show 20 Lines
	;CHECK: s_wqm_b64 exec, exec			;CHECK: s_wqm_b64 exec, exec
	;CHECK: buffer_load_dword			;CHECK: buffer_load_dword
	;CHECK: buffer_load_dword			;CHECK: buffer_load_dword
	;CHECK: v_add_f32_e32			;CHECK: v_add_f32_e32
	define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {			define amdgpu_ps float @test6(i32 inreg %idx0, i32 inreg %idx1) {
	main_body:			main_body:
	%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)			%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
	%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)			%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
	%out = fadd float %src0, %src1			%out = fadd float %src0, %src1
				arsenmUnsubmitted Not Done Reply Inline Actions An -O0 run line would be nice arsenm: An -O0 run line would be nice
				cwabbottAuthorUnsubmitted Not Done Reply Inline Actions I tried adding that, but one of the pre-existing functions in the test assert-fails. Something about it wanting to use scratch with -O0, but it isn't set up correctly. I'd be inclined to punt on this, since it's not related to WWM. cwabbott: I tried adding that, but one of the pre-existing functions in the test assert-fails. Something…
	%out.0 = bitcast float %out to i32			%out.0 = bitcast float %out to i32
	%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)			%out.1 = call i32 @llvm.amdgcn.wqm.i32(i32 %out.0)
	%out.2 = bitcast i32 %out.1 to float			%out.2 = bitcast i32 %out.1 to float
	ret float %out.2			ret float %out.2
	}			}

				; Check that WWM is triggered by the wwm intrinsic.
				;
				;CHECK-LABEL: {{^}}test_wwm1:
				;CHECK: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1
				;CHECK: buffer_load_dword
				;CHECK: buffer_load_dword
				;CHECK: v_add_f32_e32
				define amdgpu_ps float @test_wwm1(i32 inreg %idx0, i32 inreg %idx1) {
				main_body:
				%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
				%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
				%out = fadd float %src0, %src1
				%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
				arsenmUnsubmitted Done Reply Inline Actions Need some additional tests with wwm using different types arsenm: Need some additional tests with wwm using different types
				ret float %out.0
				}

				; Check that we don't leave WWM on for computations that don't require WWM,
				; since that will lead to clobbering things that aren't supposed to be clobbered
				; in cases like this.
				;
				;CHECK-LABEL: {{^}}test_wwm2:
				;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
				;CHECK: buffer_load_dword
				;CHECK: v_add_f32_e32
				;CHECK: s_mov_b64 exec, [[ORIG]]
				;CHECK: v_add_f32_e32
				define amdgpu_ps float @test_wwm2(i32 inreg %idx) {
				main_body:
				; use mbcnt to make sure the branch is divergent
				%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
				%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
				%cc = icmp uge i32 %hi, 32
				br i1 %cc, label %endif, label %if

				if:
				%src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
				%out = fadd float %src, %src
				%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
				%out.1 = fadd float %src, %out.0
				br label %endif

				endif:
				%out.2 = phi float [ %out.1, %if ], [ 0.0, %main_body ]
				ret float %out.2
				}

				; Check that WWM writes aren't coalesced with non-WWM writes, since the WWM
				; write could clobber disabled channels in the non-WWM one.
				;
				;CHECK-LABEL: {{^}}test_wwm3:
				;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
				;CHECK: buffer_load_dword
				;CHECK: v_add_f32_e32
				;CHECK: s_mov_b64 exec, [[ORIG]]
				;CHECK-NEXT: v_mov_b32_e32
				define amdgpu_ps float @test_wwm3(i32 inreg %idx) {
				main_body:
				; use mbcnt to make sure the branch is divergent
				%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
				%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
				%cc = icmp uge i32 %hi, 32
				br i1 %cc, label %endif, label %if

				if:
				%src = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
				%out = fadd float %src, %src
				%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
				br label %endif

				endif:
				%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
				ret float %out.1
				}

				; Make sure the transition from Exact to WWM then WQM works properly.
				nhaehnleUnsubmitted Not Done Reply Inline Actions Maybe I have jetlag blindness, but those two tests seem to be the same. Also, there should be a test where the wwm computation does use some value from the predecessor block (e.g. use %hi as the offset in the load). nhaehnle: Maybe I have jetlag blindness, but those two tests seem to be the same. Also, there should be…
				cwabbottAuthorUnsubmitted Not Done Reply Inline Actions They're not, since the second one lacks the iadd after the llvm.amdgcn.wwm intrinsic. Instead, the intrinsic result is passed directly to the phi. Without the early clobber on the WWM intrinsic, we would coalesce the resulting copy with the other phi sources, and the WWM write would overwrite the inactive channels that should be 0. There's a similar thing going on with the above, but this is easier to verify in a less fragile way -- we just need to make sure that a v_mov_b32 gets emitted after the WWM computation. I'm currently jetlagged too, but I think that's why I added this test. Good point about the additional test though. cwabbott: They're not, since the second one lacks the iadd after the llvm.amdgcn.wwm intrinsic.
				nhaehnleUnsubmitted Not Done Reply Inline Actions Wow, that's subtle stuff. Thanks for the explanation! nhaehnle: Wow, that's subtle stuff. Thanks for the explanation!
				;
				;CHECK-LABEL: {{^}}test_wwm4:
				;CHECK: buffer_load_dword
				;CHECK: buffer_store_dword
				;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
				;CHECK: buffer_load_dword
				;CHECK: v_add_f32_e32
				;CHECK: s_mov_b64 exec, [[ORIG]]
				;CHECK: s_wqm_b64 exec, exec
				define amdgpu_ps float @test_wwm4(i32 inreg %idx0, i32 inreg %idx1) {
				main_body:
				%src0 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
				call void @llvm.amdgcn.buffer.store.f32(float %src0, <4 x i32> undef, i32 %idx0, i32 0, i1 0, i1 0)
				%src1 = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> undef, i32 %idx1, i32 0, i1 0, i1 0)
				%temp = fadd float %src1, %src1
				%temp.0 = call float @llvm.amdgcn.wwm.f32(float %temp)
				%out = fadd float %temp.0, %temp.0
				%out.0 = call float @llvm.amdgcn.wqm.f32(float %out)
				ret float %out.0
				}

				; Check that WWM is turned on correctly across basic block boundaries.
				;
				; The expected shape is two separate save/force-all-ones/restore sequences
				; on exec: one before the %if label and one after it, the second also
				; covering a v_add_f32_e32.
				;
				; NOTE(review): FileCheck only honors the prefixes named on the RUN lines,
				; which are outside this excerpt. If those prefixes are CHECK plus SI / VI,
				; the SI-CHECK and VI-CHECK lines below are never matched -- confirm
				; against the RUN lines.
				;
				;CHECK-LABEL: {{^}}test_wwm5:
				;CHECK: s_or_saveexec_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], -1
				;SI-CHECK: buffer_load_dword
				;VI-CHECK: flat_load_dword
				;CHECK: s_mov_b64 exec, [[ORIG]]
				;CHECK: %if
				;CHECK: s_or_saveexec_b64 [[ORIG2:s\[[0-9]+:[0-9]+\]]], -1
				;SI-CHECK: buffer_load_dword
				;VI-CHECK: flat_load_dword
				;CHECK: v_add_f32_e32
				;CHECK: s_mov_b64 exec, [[ORIG2]]
				define amdgpu_ps float @test_wwm5() {
				main_body:
				; Loaded in the entry block but only used by the fadd in %if, whose result
				; feeds llvm.amdgcn.wwm.
				%src0 = load volatile float, float addrspace(1)* undef
				; use mbcnt to make sure the branch is divergent
				%lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
				%hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
				%cc = icmp uge i32 %hi, 32
				br i1 %cc, label %endif, label %if

				if:
				; Second volatile load plus the fadd combining both loads; the sum is the
				; operand of llvm.amdgcn.wwm.
				%src1 = load volatile float, float addrspace(1)* undef
				%out = fadd float %src0, %src1
				%out.0 = call float @llvm.amdgcn.wwm.f32(float %out)
				br label %endif

				endif:
				; Merge the wwm result from %if with the constant 0.0 coming straight from
				; %main_body.
				%out.1 = phi float [ %out.0, %if ], [ 0.0, %main_body ]
				ret float %out.1
				}

	; Check a case of one branch of an if-else requiring WQM, the other requiring			; Check a case of one branch of an if-else requiring WQM, the other requiring
	; exact.			; exact.
	;			;
	; Note: In this particular case, the save-and-restore could be avoided if the			; Note: In this particular case, the save-and-restore could be avoided if the
	; analysis understood that the two branches of the if-else are mutually			; analysis understood that the two branches of the if-else are mutually
	; exclusive.			; exclusive.
	;			;
	;CHECK-LABEL: {{^}}test_control_flow_0:			;CHECK-LABEL: {{^}}test_control_flow_0:
	▲ Show 20 Lines • Show All 406 Lines • ▼ Show 20 Lines
	declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3			declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3
	declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3			declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
	declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3			declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
	declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3			declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
	declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3			declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
	declare void @llvm.AMDGPU.kill(float) #1			declare void @llvm.AMDGPU.kill(float) #1
	declare float @llvm.amdgcn.wqm.f32(float) #3			declare float @llvm.amdgcn.wqm.f32(float) #3
	declare i32 @llvm.amdgcn.wqm.i32(i32) #3			declare i32 @llvm.amdgcn.wqm.i32(i32) #3
				declare float @llvm.amdgcn.wwm.f32(float) #3
				declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #3
				declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #3

	attributes #1 = { nounwind }			attributes #1 = { nounwind }
	attributes #2 = { nounwind readonly }			attributes #2 = { nounwind readonly }
	attributes #3 = { nounwind readnone }			attributes #3 = { nounwind readnone }
	attributes #4 = { "amdgpu-ps-wqm-outputs" }			attributes #4 = { "amdgpu-ps-wqm-outputs" }

test/CodeGen/AMDGPU/wqm.mir

This file was added.

				# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-wqm -o - %s | FileCheck %s

				---
				# Check for awareness that s_or_saveexec_b64 clobbers SCC
				#
				#CHECK: S_OR_SAVEEXEC_B64
				#CHECK: S_CMP_LT_I32
				#CHECK: S_CSELECT_B32
				name: test_wwm_scc
				alignment: 0
				exposesReturnsTwice: false
				legalized: false
				regBankSelected: false
				selected: false
				tracksRegLiveness: true
				registers:
				- { id: 0, class: sgpr_32, preferred-register: '' }
				- { id: 1, class: sgpr_32, preferred-register: '' }
				- { id: 2, class: sgpr_32, preferred-register: '' }
				- { id: 3, class: vgpr_32, preferred-register: '' }
				- { id: 4, class: vgpr_32, preferred-register: '' }
				- { id: 5, class: sgpr_32, preferred-register: '' }
				- { id: 6, class: vgpr_32, preferred-register: '' }
				- { id: 7, class: vgpr_32, preferred-register: '' }
				- { id: 8, class: sreg_32_xm0, preferred-register: '' }
				- { id: 9, class: sreg_32, preferred-register: '' }
				- { id: 10, class: sreg_32, preferred-register: '' }
				- { id: 11, class: vgpr_32, preferred-register: '' }
				- { id: 12, class: vgpr_32, preferred-register: '' }
				liveins:
				- { reg: '%sgpr0', virtual-reg: '%0' }
				- { reg: '%sgpr1', virtual-reg: '%1' }
				- { reg: '%sgpr2', virtual-reg: '%2' }
				- { reg: '%vgpr0', virtual-reg: '%3' }
				frameInfo:
				isFrameAddressTaken: false
				isReturnAddressTaken: false
				hasStackMap: false
				hasPatchPoint: false
				stackSize: 0
				offsetAdjustment: 0
				maxAlignment: 0
				adjustsStack: false
				hasCalls: false
				stackProtector: ''
				maxCallFrameSize: 4294967295
				hasOpaqueSPAdjustment: false
				hasVAStart: false
				hasMustTailInVarArgFunc: false
				savePoint: ''
				restorePoint: ''
				fixedStack:
				stack:
				constants:
				arsenmUnsubmitted Not Done Reply Inline Actions You can delete all of this arsenm: You can delete all of this
				body: |
				bb.0:
				liveins: %sgpr0, %sgpr1, %sgpr2, %vgpr0

				%3 = COPY %vgpr0
				%2 = COPY %sgpr2
				%1 = COPY %sgpr1
				%0 = COPY %sgpr0
				; S_CMP_LT_I32 defines SCC and S_CSELECT_B32 below reads it. A VALU add
				; whose result reaches the WWM pseudo sits between the two, and the check
				; lines at the top of this test expect S_OR_SAVEEXEC_B64 to be placed
				; ahead of S_CMP_LT_I32.
				S_CMP_LT_I32 0, %0, implicit-def %scc
				%12 = V_ADD_I32_e32 %3, %3, implicit-def %vcc, implicit %exec
				%5 = S_CSELECT_B32 %2, %1, implicit %scc
				; %11 combines the SCC-selected scalar with the earlier vector add and is
				; the operand of WWM.
				%11 = V_ADD_I32_e32 %5, %12, implicit-def %vcc, implicit %exec
				%vgpr0 = WWM %11, implicit %exec
				SI_RETURN_TO_EPILOG %vgpr0

				...

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add support for Whole Wavefront Mode
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 108735

include/llvm/IR/IntrinsicsAMDGPU.td

lib/Target/AMDGPU/AMDGPU.h

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

lib/Target/AMDGPU/CMakeLists.txt

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

lib/Target/AMDGPU/SIFixWWMLiveness.cpp

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIInstructions.td

lib/Target/AMDGPU/SIWholeQuadMode.cpp

test/CodeGen/AMDGPU/fix-wwm-liveness.mir

test/CodeGen/AMDGPU/wqm.ll

test/CodeGen/AMDGPU/wqm.mir

This is an archive of the discontinued LLVM Phabricator instance.

[AMDGPU] Add support for Whole Wavefront Mode
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 108735

include/llvm/IR/IntrinsicsAMDGPU.td

lib/Target/AMDGPU/AMDGPU.h

lib/Target/AMDGPU/AMDGPUTargetMachine.cpp

lib/Target/AMDGPU/CMakeLists.txt

lib/Target/AMDGPU/SIFixSGPRCopies.cpp

lib/Target/AMDGPU/SIFixWWMLiveness.cpp

lib/Target/AMDGPU/SIISelLowering.cpp

lib/Target/AMDGPU/SIInstrInfo.cpp

lib/Target/AMDGPU/SIInstructions.td

lib/Target/AMDGPU/SIWholeQuadMode.cpp

test/CodeGen/AMDGPU/fix-wwm-liveness.mir

test/CodeGen/AMDGPU/wqm.ll

test/CodeGen/AMDGPU/wqm.mir

[AMDGPU] Add support for Whole Wavefront Mode
ClosedPublic