Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -40,6 +40,7 @@ #include "llvm/IR/DebugLoc.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/DebugCounter.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include @@ -50,9 +51,21 @@ #include #include +using namespace llvm; + #define DEBUG_TYPE "si-insert-waitcnts" -using namespace llvm; +DEBUG_COUNTER(ForceExpCounter, DEBUG_TYPE"-forceexp", + "Force emit s_waitcnt expcnt(0) instrs"); +DEBUG_COUNTER(ForceLgkmCounter, DEBUG_TYPE"-forcelgkm", + "Force emit s_waitcnt lgkmcnt(0) instrs"); +DEBUG_COUNTER(ForceVMCounter, DEBUG_TYPE"-forcevm", + "Force emit s_waitcnt vmcnt(0) instrs"); + +static cl::opt ForceEmitZeroFlag( + "amdgpu-waitcnt-forcezero", + cl::desc("Force all waitcnt instrs to be emitted as s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)"), + cl::init(0), cl::Hidden); namespace { @@ -338,7 +351,7 @@ MachineInstr *getWaitcnt() const { return LfWaitcnt; } void print() { - DEBUG(dbgs() << " iteration " << IterCnt << '\n';); + DEBUG(dbgs() << " iteration " << IterCnt << '\n'); } private: @@ -373,6 +386,9 @@ std::vector> KillWaitBrackets; + bool ForceEmitZeroWaitcnt; + bool ForceEmitWaitcnt[NUM_INST_CNTS]; + public: static char ID; @@ -397,10 +413,45 @@ llvm::make_unique(*Bracket)); } + bool isForceEmitWaitcnt() const { + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) + if (ForceEmitWaitcnt[T]) + return true; + return false; + } + + void setForceEmitWaitcnt() { +// For non-debug builds, ForceEmitWaitcnt has been initialized to false; +// For debug builds, get the debug counter info and adjust if need be +#ifndef NDEBUG + if (DebugCounter::isCounterSet(ForceExpCounter) && + DebugCounter::shouldExecute(ForceExpCounter)) { + ForceEmitWaitcnt[EXP_CNT] = true; + } else 
{ + ForceEmitWaitcnt[EXP_CNT] = false; + } + + if (DebugCounter::isCounterSet(ForceLgkmCounter) && + DebugCounter::shouldExecute(ForceLgkmCounter)) { + ForceEmitWaitcnt[LGKM_CNT] = true; + } else { + ForceEmitWaitcnt[LGKM_CNT] = false; + } + + if (DebugCounter::isCounterSet(ForceVMCounter) && + DebugCounter::shouldExecute(ForceVMCounter)) { + ForceEmitWaitcnt[VM_CNT] = true; + } else { + ForceEmitWaitcnt[VM_CNT] = false; + } +#endif // NDEBUG + } + bool mayAccessLDSThroughFlat(const MachineInstr &MI) const; - void generateSWaitCntInstBefore(MachineInstr &MI, + void generateWaitcntInstBefore(MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets); - void updateEventWaitCntAfter(MachineInstr &Inst, + void updateEventWaitcntAfter(MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets); void mergeInputScoreBrackets(MachineBasicBlock &Block); MachineBasicBlock *loopBottom(const MachineLoop *Loop); @@ -824,17 +875,21 @@ /// and if so what the value of each counter is. /// The "score bracket" is bound by the lower bound and upper bound /// scores (*_score_LB and *_score_ub respectively). -void SIInsertWaitcnts::generateSWaitCntInstBefore( +void SIInsertWaitcnts::generateWaitcntInstBefore( MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) { // To emit, or not to emit - that's the question! // Start with an assumption that there is no need to emit. - unsigned int EmitSwaitcnt = 0; + unsigned int EmitWaitcnt = 0; // No need to wait before phi. If a phi-move exists, then the wait should // has been inserted before the move. If a phi-move does not exist, then // wait should be inserted before the real use. The same is true for // sc-merge. It is not a coincident that all these cases correspond to the // instructions that are skipped in the assembling loop. bool NeedLineMapping = false; // TODO: Check on this. + setForceEmitWaitcnt(); + + bool IsForceEmitWaitcnt = isForceEmitWaitcnt(); + if (MI.isDebugValue() && // TODO: any other opcode? 
!NeedLineMapping) { @@ -849,7 +904,7 @@ ScoreBrackets->clearWaitAtBeginning(); for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; T = (enum InstCounterType)(T + 1)) { - EmitSwaitcnt |= CNT_MASK(T); + EmitWaitcnt |= CNT_MASK(T); ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); } } @@ -859,7 +914,7 @@ else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC || MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) { - EmitSwaitcnt |= + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } @@ -873,7 +928,7 @@ T = (enum InstCounterType)(T + 1)) { if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) { ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T)); - EmitSwaitcnt |= CNT_MASK(T); + EmitWaitcnt |= CNT_MASK(T); } } } @@ -884,7 +939,7 @@ AMDGPU::SendMsg::ID_GS_DONE)) { if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) { ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); - EmitSwaitcnt |= CNT_MASK(VM_CNT); + EmitWaitcnt |= CNT_MASK(VM_CNT); } } #if 0 // TODO: the following blocks of logic when we have fence. 
@@ -902,11 +957,11 @@ case SCMEM_LDS: if (group_is_multi_wave || context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); // LDS may have to wait for VM_CNT after buffer load to LDS if (target_info->HasBufferLoadToLDS()) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } } @@ -914,9 +969,9 @@ case SCMEM_GDS: if (group_is_multi_wave || fence_is_global) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT)); } break; @@ -926,9 +981,9 @@ case SCMEM_RING: case SCMEM_SCATTER: if (group_is_multi_wave || fence_is_global) { - EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT, + EmitWaitcnt |= ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT)); } break; @@ -949,13 +1004,13 @@ if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) { // Export and GDS are tracked individually, either may trigger a waitcnt // for EXEC. 
- EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS)); - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK)); } @@ -970,7 +1025,7 @@ if (ScoreBrackets->getScoreUB(EXP_CNT) > ScoreBrackets->getScoreLB(EXP_CNT)) { ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT)); - EmitSwaitcnt |= CNT_MASK(EXP_CNT); + EmitWaitcnt |= CNT_MASK(EXP_CNT); } } #endif @@ -988,7 +1043,7 @@ continue; unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS; // VM_CNT is only relevant to vgpr or LDS. - EmitSwaitcnt |= ScoreBrackets->updateByWait( + EmitWaitcnt |= ScoreBrackets->updateByWait( VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT)); } @@ -1000,10 +1055,10 @@ for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) { if (TRI->isVGPR(MRIA, Op.getReg())) { // VM_CNT is only relevant to vgpr or LDS. 
-            EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            EmitWaitcnt |= ScoreBrackets->updateByWait(
                 VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
           }
-          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EmitWaitcnt |= ScoreBrackets->updateByWait(
               LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
         }
       }
@@ -1022,9 +1077,9 @@
         if (AS != AMDGPUASI.LOCAL_ADDRESS)
           continue;
         unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
       }
     }
@@ -1035,31 +1090,29 @@
           ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
       for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
         if (TRI->isVGPR(MRIA, Def.getReg())) {
-          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EmitWaitcnt |= ScoreBrackets->updateByWait(
               VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
-          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EmitWaitcnt |= ScoreBrackets->updateByWait(
               EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
         }
-        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EmitWaitcnt |= ScoreBrackets->updateByWait(
             LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
       }
     } // End of for loop that looks at all dest operands.
   }
-
-  // TODO: Tie force zero to a compiler triage option.
-  bool ForceZero = false;
+  bool ForceZero = ForceEmitZeroWaitcnt; // Per-instruction; never sticky.
 
   // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
   // occurs before the instruction. Doing it here prevents any additional
   // S_WAITCNTs from being emitted if the instruction was marked as
   // requiring a WAITCNT beforehand.
   if (MI.getOpcode() == AMDGPU::S_BARRIER &&
       !ST->hasAutoWaitcntBeforeBarrier()) {
-    EmitSwaitcnt |=
+    EmitWaitcnt |=
         ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
-    EmitSwaitcnt |= ScoreBrackets->updateByWait(
+    EmitWaitcnt |= ScoreBrackets->updateByWait(
         EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
-    EmitSwaitcnt |= ScoreBrackets->updateByWait(
+    EmitWaitcnt |= ScoreBrackets->updateByWait(
         LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
   }
 
@@ -1075,17 +1128,17 @@
       // block, so if we only wait on LGKM here, we might end up with
       // another s_waitcnt inserted right after this if there are non-LGKM
       // instructions still outstanding.
       ForceZero = true;
-      EmitSwaitcnt = true;
+      EmitWaitcnt = true;
     }
   }
 
   // Does this operand processing indicate s_wait counter update?
-  if (EmitSwaitcnt) {
+  if (EmitWaitcnt || IsForceEmitWaitcnt) {
     int CntVal[NUM_INST_CNTS];
 
     bool UseDefaultWaitcntStrategy = true;
     if (ForceZero) {
       // Force all waitcnts to 0.
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
@@ -1100,7 +1153,7 @@
     if (UseDefaultWaitcntStrategy) {
       for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
            T = (enum InstCounterType)(T + 1)) {
-        if (EmitSwaitcnt & CNT_MASK(T)) {
+        if (EmitWaitcnt & CNT_MASK(T)) {
           int Delta =
               ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
           int MaxDelta = ScoreBrackets->getWaitCountMax(T);
@@ -1110,7 +1163,7 @@
               ScoreBrackets->setScoreLB(
                   T, ScoreBrackets->getScoreUB(T) - MaxDelta);
             }
-            EmitSwaitcnt &= ~CNT_MASK(T);
+            EmitWaitcnt &= ~CNT_MASK(T);
           }
           CntVal[T] = Delta;
         } else {
@@ -1122,7 +1175,7 @@
     }
 
     // If we are not waiting on any counter we can skip the wait altogether.
-    if (EmitSwaitcnt != 0) {
+    if (EmitWaitcnt != 0 || IsForceEmitWaitcnt) {
       MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
       int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
       if (!OldWaitcnt ||
@@ -1145,13 +1198,15 @@
           }
           ScoreBracket->setRevisitLoop(true);
           DEBUG(dbgs() << "set-revisit: Block"
-                       << ContainingLoop->getHeader()->getNumber() << '\n';);
+                       << ContainingLoop->getHeader()->getNumber() << '\n');
         }
       }
 
       // Update an existing waitcount, or make a new one.
-      unsigned Enc = AMDGPU::encodeWaitcnt(IV, CntVal[VM_CNT],
-                                           CntVal[EXP_CNT], CntVal[LGKM_CNT]);
+      unsigned Enc = AMDGPU::encodeWaitcnt(IV,
+                      ForceEmitWaitcnt[VM_CNT] ? 0 : CntVal[VM_CNT],
+                      ForceEmitWaitcnt[EXP_CNT] ? 0 : CntVal[EXP_CNT],
+                      ForceEmitWaitcnt[LGKM_CNT] ? 0 : CntVal[LGKM_CNT]);
       // We don't remove waitcnts that existed prior to the waitcnt
       // pass. Check if the waitcnt to-be-inserted can be avoided
       // or if the prev waitcnt can be updated.
@@ -1177,6 +1232,10 @@
         }
         if (insertSWaitInst) {
           if (OldWaitcnt && OldWaitcnt->getOpcode() == AMDGPU::S_WAITCNT) {
+            if (ForceZero)
+              DEBUG(dbgs() << "Force emit s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)\n");
+            if (IsForceEmitWaitcnt)
+              DEBUG(dbgs() << "Force emit a s_waitcnt due to debug counter\n");
             OldWaitcnt->getOperand(0).setImm(Enc);
             if (!OldWaitcnt->getParent())
               MI.getParent()->insert(MI, OldWaitcnt);
@@ -1234,7 +1293,7 @@
   return false;
 }
 
-void SIInsertWaitcnts::updateEventWaitCntAfter(
+void SIInsertWaitcnts::updateEventWaitcntAfter(
     MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
   // Now look at the instruction opcode. If it is a memory access
   // instruction, update the upper-bound of the appropriate counter's
@@ -1588,7 +1647,7 @@
   BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
 
   DEBUG({
-    dbgs() << "Block" << Block.getNumber();
+    dbgs() << "*** Block" << Block.getNumber() << " ***";
     ScoreBrackets->dump();
   });
 
@@ -1631,9 +1690,9 @@
 
     // Generate an s_waitcnt instruction to be placed before
     // cur_Inst, if needed.
- generateSWaitCntInstBefore(Inst, ScoreBrackets); + generateWaitcntInstBefore(Inst, ScoreBrackets); - updateEventWaitCntAfter(Inst, ScoreBrackets); + updateEventWaitcntAfter(Inst, ScoreBrackets); #if 0 // TODO: implement resource type check controlled by options with ub = LB. // If this instruction generates a S_SETVSKIP because it is an @@ -1688,7 +1747,7 @@ if (ContainingLoop && loopBottom(ContainingLoop) == &Block) { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->print(); - DEBUG(dbgs() << '\n';); + DEBUG(dbgs() << '\n'); // The iterative waitcnt insertion algorithm aims for optimal waitcnt // placement and doesn't always guarantee convergence for a loop. Each @@ -1754,6 +1813,11 @@ const SIMachineFunctionInfo *MFI = MF.getInfo(); AMDGPUASI = ST->getAMDGPUAS(); + ForceEmitZeroWaitcnt = ForceEmitZeroFlag; + for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS; + T = (enum InstCounterType)(T + 1)) + ForceEmitWaitcnt[T] = false; + HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV); HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV); HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV); @@ -1803,7 +1867,7 @@ (!BlockWaitcntProcessedSet.count(&MBB))) { BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true); DEBUG(dbgs() << "set-revisit: Block" - << ContainingLoop->getHeader()->getNumber() << '\n';); + << ContainingLoop->getHeader()->getNumber() << '\n'); } // Walk over the instructions. 
@@ -1833,7 +1897,7 @@ } LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); WaitcntData->incIterCnt(); - DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n';); + DEBUG(dbgs() << "revisit: Block" << EntryBB->getNumber() << '\n'); continue; } else { LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get(); Index: test/CodeGen/AMDGPU/waitcnt-debug.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/waitcnt-debug.mir @@ -0,0 +1,41 @@ +# REQUIRES: asserts +# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -debug-counter=si-insert-waitcnts-forcelgkm-count=1 -o - %s | FileCheck -check-prefixes=GCN,LGKM %s +# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -debug-counter=si-insert-waitcnts-forceexp-count=2 -o - %s | FileCheck -check-prefixes=GCN,EXP %s +# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -debug-counter=si-insert-waitcnts-forcevm-count=3 -o - %s | FileCheck -check-prefixes=GCN,VM %s +# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -amdgpu-waitcnt-forcezero=1 -debug-counter=si-insert-waitcnts-forcevm-count=2 -o - %s | FileCheck -check-prefixes=GCN,ZERO %s + +# check that the waitcnt pass options that force insertion of waitcnt instructions are working as expected + +... +# GCN-LABEL: name: waitcnt-debug +# LGKM: S_WAITCNT 127 +# LGKM-NEXT: S_NOP 0 +# LGKM-NEXT: S_NOP 0 + +# EXP: S_WAITCNT 3855 +# EXP-NEXT: S_NOP 0 +# EXP-NEXT: S_WAITCNT 3855 +# EXP-NEXT: S_NOP 0 + +# VM: S_WAITCNT 3952 +# VM-NEXT: S_NOP 0 +# VM-NEXT: S_WAITCNT 3952 +# VM-NEXT: S_NOP 0 +# VM-NEXT: S_WAITCNT 3952 +# VM-NEXT: S_NOP 0 + +# ZERO: S_WAITCNT 0 +# ZERO-NEXT: S_WAITCNT 0 +# ZERO-NEXT: S_NOP 0 +# ZERO-NEXT: S_WAITCNT 0 +# ZERO-NEXT: S_NOP 0 + +name: waitcnt-debug +liveins: +body: | + bb.0: + S_NOP 0 + S_NOP 0 + S_NOP 0 + S_NOP 0 +...