[AMDGPU] SIInsertWaitcnts: add hidden options that force s_waitcnt emission

Adds the hidden options -amdgpu-waitcnt-forcezero, -amdgpu-waitcnt-forceexp,
-amdgpu-waitcnt-forcelgkm and -amdgpu-waitcnt-forcevm, copies them into the
new SIInsertWaitcnts members ForceZero / ForceSwaitcnt[] at the start of the
pass, consults them through the new ForceEmit() helper when deciding whether
to emit an s_waitcnt, and adds test/CodeGen/AMDGPU/waitcnt-debug.mir to check
the emitted S_WAITCNT immediates for each option.

NOTE(review): InstCnt in insertWaitcntInBlock is only referenced inside
DEBUG(); presumably it is left unused in builds where DEBUG() compiles away --
confirm that this does not produce an unused-variable warning.

Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -54,6 +54,26 @@
 
 using namespace llvm;
 
+static cl::opt<unsigned> ForceZeroFlag(
+  "amdgpu-waitcnt-forcezero",
+  cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"),
+  cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> ForceExpFlag(
+  "amdgpu-waitcnt-forceexp",
+  cl::desc("Force emit a s_waitcnt expcnt(0) before the first <n> instrs"),
+  cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> ForceLgkmFlag(
+  "amdgpu-waitcnt-forcelgkm",
+  cl::desc("Force emit a s_waitcnt lgkmcnt(0) before the first <n> instrs"),
+  cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> ForceVmFlag(
+  "amdgpu-waitcnt-forcevm",
+  cl::desc("Force emit a s_waitcnt vmcnt(0) before the first <n> instrs"),
+  cl::init(0), cl::Hidden);
+
 namespace {
 
 // Class of object that encapsulates latest instruction counter score
@@ -373,6 +393,9 @@
 
   std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
 
+  bool ForceZero = false;
+  int32_t ForceSwaitcnt[NUM_INST_CNTS];
+
 public:
   static char ID;
 
@@ -397,6 +420,14 @@
         llvm::make_unique<BlockWaitcntBrackets>(*Bracket));
   }
 
+  bool ForceEmit() const {
+    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+         T = (enum InstCounterType)(T + 1))
+      if (ForceSwaitcnt[T] > 0)
+        return true;
+    return false;
+  }
+
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
                                            BlockWaitcntBrackets *ScoreBrackets);
@@ -1023,9 +1054,6 @@
       } // End of for loop that looks at all dest operands.
     }
 
-  // TODO: Tie force zero to a compiler triage option.
-  bool ForceZero = false;
-
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
   // occurs before the instruction. Doing it here prevents any additional
   // S_WAITCNTs from being emitted if the instruction was marked as
@@ -1058,7 +1086,7 @@
   }
 
   // Does this operand processing indicate s_wait counter update?
-  if (EmitSwaitcnt) {
+  if (EmitSwaitcnt || ForceEmit()) {
     int CntVal[NUM_INST_CNTS];
 
     bool UseDefaultWaitcntStrategy = true;
@@ -1099,7 +1127,7 @@
     }
 
     // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitSwaitcnt != 0) {
+    if (EmitSwaitcnt != 0 || ForceEmit()) {
       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
       if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
@@ -1135,11 +1163,31 @@
           CompilerGeneratedWaitcntSet.insert(SWaitInst);
         }
 
+        if (!EmitSwaitcnt) {
+          for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+               T = (enum InstCounterType)(T + 1)) {
+            if (ForceSwaitcnt[T] > 0 ) {
+              DEBUG(dbgs() << "ForceSwaitcnt[" << T << "]: "
+                           << ForceSwaitcnt[T] << '\n';);
+            }
+          }
+        }
+
         const MachineOperand &Op =
             MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
-                IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
+                IV,
+                (ForceSwaitcnt[VM_CNT] > 0) ? 0 : CntVal[VM_CNT],
+                (ForceSwaitcnt[EXP_CNT] > 0) ? 0 : CntVal[EXP_CNT],
+                (ForceSwaitcnt[LGKM_CNT] > 0) ? 0 : CntVal[LGKM_CNT]));
         SWaitInst->addOperand(MF, Op);
 
+        if (!EmitSwaitcnt) {
+          for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+               T = (enum InstCounterType)(T + 1)) {
+            --ForceSwaitcnt[T];
+          }
+        }
+
         if (CntVal[EXP_CNT] == 0) {
           ScoreBrackets->setMixedExpTypes(false);
         }
@@ -1512,13 +1560,15 @@
 // Generate s_waitcnt instructions where needed.
 void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                             MachineBasicBlock &Block) {
+  static int32_t InstCnt = 0;
+
   // Initialize the state information.
   mergeInputScoreBrackets(Block);
 
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
 
   DEBUG({
-    dbgs() << "Block" << Block.getNumber();
+    dbgs() << "*** Block" << Block.getNumber() << " ***";
     ScoreBrackets->dump();
   });
 
@@ -1591,7 +1641,7 @@
       DEBUG({ SWaitInst->print(dbgs() << '\n'); });
     }
     DEBUG({
-      Inst.print(dbgs());
+      dbgs() << "Instr" << ++InstCnt << ": " << Inst;
       ScoreBrackets->dump();
     });
 
@@ -1696,6 +1746,11 @@
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   AMDGPUASI = ST->getAMDGPUAS();
 
+  ForceZero = ForceZeroFlag;
+  ForceSwaitcnt[VM_CNT] = ForceVmFlag;
+  ForceSwaitcnt[EXP_CNT] = ForceExpFlag;
+  ForceSwaitcnt[LGKM_CNT] = ForceLgkmFlag;
+
   HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
   HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
   HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
Index: test/CodeGen/AMDGPU/waitcnt-debug.mir
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/waitcnt-debug.mir
@@ -0,0 +1,41 @@
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcelgkm=1 -o - %s | FileCheck -check-prefixes=GCN,LGKM %s
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forceexp=2 -o - %s | FileCheck -check-prefixes=GCN,EXP %s
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcevm=3 -o - %s | FileCheck -check-prefixes=GCN,VM %s
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcezero=1 -amdgpu-waitcnt-forcevm=2 -o - %s | FileCheck -check-prefixes=GCN,ZERO %s
+
+# check that the waitcnt pass options that force insertion of waitcnt instructions are working as expected
+
+...
+# GCN-LABEL: name: waitcnt-debug
+# LGKM: S_WAITCNT 127
+# LGKM-NEXT: S_NOP 0
+# LGKM-NEXT: S_NOP 0
+
+# EXP: S_WAITCNT 3855
+# EXP-NEXT: S_NOP 0
+# EXP-NEXT: S_WAITCNT 3855
+# EXP-NEXT: S_NOP 0
+
+# VM: S_WAITCNT 3952
+# VM-NEXT: S_NOP 0
+# VM-NEXT: S_WAITCNT 3952
+# VM-NEXT: S_NOP 0
+# VM-NEXT: S_WAITCNT 3952
+# VM-NEXT: S_NOP 0
+
+# ZERO: S_WAITCNT 0
+# ZERO-NEXT: S_WAITCNT 0
+# ZERO-NEXT: S_NOP 0
+# ZERO-NEXT: S_WAITCNT 0
+# ZERO-NEXT: S_NOP 0
+
+name: waitcnt-debug
+liveins:
+body: |
+  bb.0:
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+    S_NOP 0
+...
+