diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -355,6 +356,8 @@
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+  MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;

   struct BlockInfo {
@@ -381,6 +384,9 @@
     (void)ForceVMCounter;
   }

+  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
+  bool isPreheaderToFlush(MachineBasicBlock &MBB,
+                          WaitcntBrackets &ScoreBrackets);
   bool runOnMachineFunction(MachineFunction &MF) override;

   StringRef getPassName() const override {
@@ -389,6 +395,7 @@

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -431,14 +438,18 @@
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr);
+                                 MachineInstr *OldWaitcntInstr,
+                                 bool FlushVmCnt);
+  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                               WaitcntBrackets &ScoreBrackets,
+                               MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
   bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                MachineInstr &OldWaitcntInstr,
-                               AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
+                               AMDGPU::Waitcnt &Wait,
+                               MachineBasicBlock::iterator It);
 };

 } // end anonymous namespace
@@ -792,6 +803,7 @@

 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
@@ -804,19 +816,19 @@
   return new SIInsertWaitcnts();
 }

-/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// Combine consecutive waitcnt instructions that precede \p It and follow
 /// \p OldWaitcntInstr and apply any extra waits from waitcnts that were added
 /// by previous passes. Currently this pass conservatively assumes that these
 /// preexisting waitcnts are required for correctness.
 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                                MachineInstr &OldWaitcntInstr,
                                                AMDGPU::Waitcnt &Wait,
-                                               const MachineInstr *MI) {
+                                               MachineBasicBlock::iterator It) {
   bool Modified = false;
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;

   for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
-       &*II != MI; II = NextI, ++NextI) {
+       II != It; II = NextI, ++NextI) {
     if (II->isMetaInstruction())
       continue;
@@ -936,9 +948,10 @@
 /// and if so what the value of each counter is.
 /// The "score bracket" is bound by the lower bound and upper bound
 /// scores (*_score_LB and *_score_ub respectively).
-bool SIInsertWaitcnts::generateWaitcntInstBefore(
-    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
-    MachineInstr *OldWaitcntInstr) {
+bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
+                                                 WaitcntBrackets &ScoreBrackets,
+                                                 MachineInstr *OldWaitcntInstr,
+                                                 bool FlushVmCnt) {
   setForceEmitWaitcnt();

   if (MI.isMetaInstruction())
@@ -1182,11 +1195,18 @@
   if (ForceEmitWaitcnt[VS_CNT])
     Wait.VsCnt = 0;

+  if (FlushVmCnt) {
+    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+    if (UB - LB != 0)
+      Wait.VmCnt = 0;
+  }
+
   if (OldWaitcntInstr) {
     // Try to merge the required wait with preexisting waitcnt instructions.
     // Also erase redundant waitcnt.
-    Modified =
-        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
+    Modified = applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait,
+                                       MI.getIterator());
   } else {
     // Update waitcnt brackets after determining the required wait.
     ScoreBrackets.applyWaitcnt(Wait);
@@ -1226,6 +1246,46 @@
   return Modified;
 }

+// Add a waitcnt to flush the vmcnt counter at the end of the given block if
+// needed.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                                               WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr *OldWaitcntInstr) {
+  AMDGPU::Waitcnt Wait;
+  bool Modified = false;
+
+  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+  if (UB - LB == 0)
+    return false;
+
+  Wait.VmCnt = 0;
+  if (OldWaitcntInstr) {
+    // Try to merge the required wait with preexisting waitcnt instructions.
+    // Also erase redundant waitcnt.
+    Modified = applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait,
+                                       Block.end());
+  } else {
+    // Update waitcnt brackets after determining the required wait.
+    ScoreBrackets.applyWaitcnt(Wait);
+  }
+
+  // Build new waitcnt instructions unless no wait is needed.
+  if (Wait.hasWaitExceptVsCnt()) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    auto SWaitInst = BuildMI(Block, Block.end(), Block.back().getDebugLoc(),
+                             TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    TrackedWaitcntSet.insert(SWaitInst);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcntBlockEnd\n"
+                      << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
+
 // This is a flat memory operation. Check to see if it has memory tokens other
 // than LDS. Other address spaces supported by flat memory operations involve
 // global memory.
@@ -1488,8 +1548,12 @@
       continue;
     }

+    bool FlushVmCnt = (Block.getFirstTerminator() == Inst) &&
+                      isPreheaderToFlush(Block, ScoreBrackets);
+
     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
-    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
+                                          FlushVmCnt);
     OldWaitcntInstr = nullptr;

     // Restore vccz if it's not known to be correct already.
@@ -1574,9 +1638,110 @@
     ++Iter;
   }

+  if (Block.getFirstTerminator() == Block.end() &&
+      isPreheaderToFlush(Block, ScoreBrackets))
+    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
   return Modified;
 }

+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
+                                          WaitcntBrackets &ScoreBrackets) {
+  if (PreheadersToFlush.count(&MBB))
+    return PreheadersToFlush[&MBB];
+
+  auto UpdateCache = [&](bool Val) {
+    PreheadersToFlush[&MBB] = Val;
+    return Val;
+  };
+
+  // A preheader has at least one successor.
+  if (MBB.succ_begin() == MBB.succ_end())
+    return UpdateCache(false);
+
+  // Retrieve the loop that contains the successor, if any.
+  MachineBasicBlock *Succ = *MBB.succ_begin();
+  MachineLoop *Loop = MLI->getLoopFor(Succ);
+  if (!Loop)
+    return UpdateCache(false);
+
+  if (Loop->getLoopPreheader() == &MBB &&
+      shouldFlushVmCnt(Loop, ScoreBrackets))
+    return UpdateCache(true);
+
+  return UpdateCache(false);
+}
+
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+//    vgpr containing a value that is loaded outside of the loop. (Only on
+//    targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+//    loop, and at least one use of a vgpr containing a value that is loaded
+//    outside of the loop. On targets with no vscnt counter, the loop must also
+//    not contain any vmem store.
+bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
+                                        WaitcntBrackets &Brackets) {
+  if (!ML->getLoopPreheader())
+    return false;
+
+  bool HasVMemLoad = false;
+  bool HasVMemStore = false;
+  bool UsesVgprLoadedOutside = false;
+  DenseSet<Register> VgprUse;
+  DenseSet<Register> VgprDef;
+
+  for (MachineBasicBlock *const MBB : ML->blocks()) {
+    for (MachineInstr &MI : *MBB) {
+      if (SIInstrInfo::isVMEM(MI)) {
+        if (MI.mayLoad())
+          HasVMemLoad = true;
+        if (MI.mayStore())
+          HasVMemStore = true;
+      }
+      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+        MachineOperand &Op = MI.getOperand(I);
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+          continue;
+        RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+        // Vgpr use
+        if (Op.isUse()) {
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprDef.contains(RegNo))
+              return false;
+            VgprUse.insert(RegNo);
+            // If at least one of Op's registers is in the score brackets, the
+            // value is likely loaded outside of the loop.
+            if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
+              UsesVgprLoadedOutside = true;
+              break;
+            }
+          }
+        }
+        // VMem load vgpr def
+        else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprUse.contains(RegNo))
+              return false;
+            VgprDef.insert(RegNo);
+          }
+      }
+    }
+  }
+
+  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+    return true;
+  bool Flush = HasVMemLoad && UsesVgprLoadedOutside;
+  if (!ST->hasVscnt())
+    Flush = Flush && !HasVMemStore;
+  return Flush;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1584,6 +1749,7 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();

   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -126,10 +126,11 @@
 ; GCN-O0-NEXT: Insert fentry calls
 ; GCN-O0-NEXT: Insert XRay ops
 ; GCN-O0-NEXT: SI Memory Legalizer
+; GCN-O0-NEXT: MachineDominator Tree Construction
+; GCN-O0-NEXT: Machine Natural Loop Construction
 ; GCN-O0-NEXT: MachinePostDominator Tree Construction
 ; GCN-O0-NEXT: SI insert wait instructions
 ; GCN-O0-NEXT: Insert required mode register values
-; GCN-O0-NEXT: MachineDominator Tree Construction
 ; GCN-O0-NEXT: SI Final Branch Preparation
 ; GCN-O0-NEXT: Post RA hazard recognizer
 ; GCN-O0-NEXT: Branch relaxation pass
@@ -378,11 +379,12 @@
 ; GCN-O1-NEXT: Insert fentry calls
 ; GCN-O1-NEXT: Insert XRay ops
 ; GCN-O1-NEXT: SI Memory Legalizer
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Machine Natural Loop Construction
 ; GCN-O1-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-NEXT: SI insert wait instructions
 ; GCN-O1-NEXT: Insert required mode register values
 ; GCN-O1-NEXT: SI Insert Hard Clauses
-; GCN-O1-NEXT: MachineDominator Tree Construction
 ; GCN-O1-NEXT: SI Final Branch Preparation
 ; GCN-O1-NEXT: SI peephole optimizations
 ; GCN-O1-NEXT: Post RA hazard recognizer
@@ -665,11 +667,12 @@
 ; GCN-O1-OPTS-NEXT: Insert fentry calls
 ; GCN-O1-OPTS-NEXT: Insert XRay ops
 ; GCN-O1-OPTS-NEXT: SI Memory Legalizer
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI insert wait instructions
 ; GCN-O1-OPTS-NEXT: Insert required mode register values
 ; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
-; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT: SI peephole optimizations
 ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
@@ -954,11 +957,12 @@
 ; GCN-O2-NEXT: Insert fentry calls
 ; GCN-O2-NEXT: Insert XRay ops
 ; GCN-O2-NEXT: SI Memory Legalizer
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Machine Natural Loop Construction
 ; GCN-O2-NEXT: MachinePostDominator Tree Construction
 ; GCN-O2-NEXT: SI insert wait instructions
 ; GCN-O2-NEXT: Insert required mode register values
 ; GCN-O2-NEXT: SI Insert Hard Clauses
-; GCN-O2-NEXT: MachineDominator Tree Construction
 ; GCN-O2-NEXT: SI Final Branch Preparation
 ; GCN-O2-NEXT: SI peephole optimizations
 ; GCN-O2-NEXT: Post RA hazard recognizer
@@ -1255,11 +1259,12 @@
 ; GCN-O3-NEXT: Insert fentry calls
 ; GCN-O3-NEXT: Insert XRay ops
 ; GCN-O3-NEXT: SI Memory Legalizer
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Machine Natural Loop Construction
 ; GCN-O3-NEXT: MachinePostDominator Tree Construction
 ; GCN-O3-NEXT: SI insert wait instructions
 ; GCN-O3-NEXT: Insert required mode register values
 ; GCN-O3-NEXT: SI Insert Hard Clauses
-; GCN-O3-NEXT: MachineDominator Tree Construction
 ; GCN-O3-NEXT: SI Final Branch Preparation
 ; GCN-O3-NEXT: SI peephole optimizations
 ; GCN-O3-NEXT: Post RA hazard recognizer
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -0,0 +1,503 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
+
+--- |
+
+  @spill = external addrspace(1) global i32
+
+  define amdgpu_cs void @waitcnt_vm_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_noterm() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_noterm_wait() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_load() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_use() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_use_in_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_nowait() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_interval() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_interval2() {
+    ret void
+  }
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_noterm
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
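+
+# For reference, decoding the immediates used in the checks above:
+#   gfx9:  3952 = 0xf70 -> vmcnt[3:0] = 0, expcnt[6:4] = 7, lgkmcnt[11:8] = 15,
+#          i.e. s_waitcnt vmcnt(0) with the other counters left at their
+#          no-wait maxima.
+#   gfx10: 16240 = 0x3f70 -> the same vmcnt(0) wait, with gfx10's wider 6-bit
+#          lgkmcnt field (lgkmcnt[13:8] = 63).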
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm_wait
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_WAITCNT 3952
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_load
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_load
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    renamable $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr7, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_store
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
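+
+# The cases above pin down when a preheader flush is chosen. In rough terms,
+# shouldFlushVmCnt requires the loop to read a vgpr whose value is still in
+# flight from a load outside the loop, and additionally either (a) the loop
+# stores through vmem without loading (profitable only where stores count
+# against vmcnt, i.e. targets without vscnt), or (b) the loop loads through
+# vmem without consuming the loaded values inside the loop (and, without
+# vscnt, performs no vmem store).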
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_use
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr2, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_store
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
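+
+# waitcnt_vm_loop2_store exercises the target split directly: gfx9 has no
+# vscnt counter, so the in-loop vmem store keeps vmcnt busy and a preheader
+# flush would be wasted, while gfx10 tracks the store with vscnt and can
+# still hoist the vmcnt(0) wait into bb.0.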
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_use_in_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr4 = V_ADD_U32_e32 renamable $vgpr5, renamable $vgpr1, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_nowait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.3:
+
+# GFX10-LABEL: waitcnt_vm_loop2_nowait
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.3:
+name: waitcnt_vm_loop2_nowait
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr4, renamable $vgpr5, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr4, renamable $vgpr5, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr4, renamable $vgpr5, implicit $exec
+
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.2, %bb.3
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_interval
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_interval
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_interval
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
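+
+# The interval tests above and below use wide defs (GLOBAL_LOAD_DWORDX4,
+# IMAGE_SAMPLE_V4_V2) so that the heuristic must reason about every register
+# in a RegInterval: the flush is taken when the loop reads only lanes loaded
+# outside of it, and rejected when $vgpr7, written by the in-loop
+# IMAGE_SAMPLE, is read back ($vgpr11 = COPY $vgpr7).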
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_interval2
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_interval2
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_interval2
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    $vgpr11 = COPY $vgpr7
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...