diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -355,17 +356,38 @@
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+
+  MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
 
+  struct LoopInfo {
+    std::vector<std::pair<MachineInstr *, AMDGPU::Waitcnt>> PendingWaits;
+    std::vector<std::pair<MachineInstr *, AMDGPU::Waitcnt>>
+        PendingWaitsFlushed;
+    bool HasVMEMStore = false;
+    bool HasVMEMLoad = false;
+  };
+
   struct BlockInfo {
     MachineBasicBlock *MBB;
+    MachineLoop *OuterLoop = nullptr;
     std::unique_ptr<WaitcntBrackets> Incoming;
+    std::unique_ptr<WaitcntBrackets> IncomingFlushed;
     bool Dirty = true;
 
     explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
   };
 
+  void getBlockBrackets(BlockInfo &BI,
+                        std::unique_ptr<WaitcntBrackets> &Brackets,
+                        HardwareLimits &Limits, RegisterEncoding &Encoding,
+                        bool Flushed);
+  void mergeNewStateIntoSuccessors(
+      BlockInfo &BI, std::unique_ptr<WaitcntBrackets> &Brackets, bool &Repeat,
+      MapVector<MachineBasicBlock *, BlockInfo>::iterator BII, bool Flushed);
+  bool generateLoopsPendingWaitcnts();
+
   MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
+  MapVector<MachineLoop *, LoopInfo> LoopInfos;
 
   // ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
   // because of amdgpu-waitcnt-forcezero flag
@@ -389,6 +411,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -431,11 +454,14 @@
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr);
+                                 MachineInstr *OldWaitcntInstr, bool Flushed,
+                                 MachineLoop *Loop);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
-                            WaitcntBrackets &ScoreBrackets);
+                            WaitcntBrackets &ScoreBrackets, bool Flushed,
+                            MachineLoop *Loop);
+  void insertWaitcntInPreheader(MachineLoop *ML);
   bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                MachineInstr &OldWaitcntInstr,
                                AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
@@ -784,6 +810,7 @@
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
@@ -928,9 +955,11 @@
 /// and if so what the value of each counter is.
 /// The "score bracket" is bound by the lower bound and upper bound
 /// scores (*_score_LB and *_score_ub respectively).
-bool SIInsertWaitcnts::generateWaitcntInstBefore(
-    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
-    MachineInstr *OldWaitcntInstr) {
+bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
+                                                 WaitcntBrackets &ScoreBrackets,
+                                                 MachineInstr *OldWaitcntInstr,
+                                                 bool Flushed,
+                                                 MachineLoop *Loop) {
   setForceEmitWaitcnt();
 
   if (MI.isMetaInstruction())
@@ -1110,6 +1139,12 @@
         RegInterval Interval =
             ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
         const bool IsVGPR = TRI->isVectorRegister(*MRI, Op.getReg());
+        if (IsVGPR && Loop) {
+          if (MI.mayLoad())
+            LoopInfos[Loop].HasVMEMLoad = true;
+          if (MI.mayStore())
+            LoopInfos[Loop].HasVMEMStore = true;
+        }
         for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
           if (IsVGPR) {
             // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
@@ -1184,19 +1219,28 @@
   // instruction was modified to handle the required wait.
   if (Wait.hasWaitExceptVsCnt()) {
     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                         .addImm(Enc);
-    TrackedWaitcntSet.insert(SWaitInst);
-    Modified = true;
+    if (Loop) {
+      // Inside a candidate loop, do not emit the wait yet; record it so that
+      // generateLoopsPendingWaitcnts() can decide between the flushed and
+      // non-flushed variant.
+      std::pair<MachineInstr *, AMDGPU::Waitcnt> P(&MI, Wait);
+      if (Flushed)
+        LoopInfos[Loop].PendingWaitsFlushed.push_back(P);
+      else
+        LoopInfos[Loop].PendingWaits.push_back(P);
+    } else {
+      auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
+                               MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                           .addImm(Enc);
+      TrackedWaitcntSet.insert(SWaitInst);
+      Modified = true;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                      << "Old Instr: " << MI
-                      << "New Instr: " << *SWaitInst << '\n');
+      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
+                        << "Old Instr: " << MI << "New Instr: " << *SWaitInst
+                        << '\n');
+    }
   }
 
   if (Wait.hasWaitVsCnt()) {
     assert(ST->hasVscnt());
+    assert(!Flushed);
 
     auto SWaitInst =
         BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
@@ -1434,7 +1478,8 @@
 // Generate s_waitcnt instructions where needed.
 bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
                                             MachineBasicBlock &Block,
-                                            WaitcntBrackets &ScoreBrackets) {
+                                            WaitcntBrackets &ScoreBrackets,
+                                            bool Flushed, MachineLoop *Loop) {
   bool Modified = false;
 
   LLVM_DEBUG({
@@ -1477,11 +1522,12 @@
     }
 
     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
-    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
+                                          Flushed, Loop);
     OldWaitcntInstr = nullptr;
 
    // Restore vccz if it's not known to be correct already. Skip this in the
    // flushed pass so the restore is not emitted twice for the same block.
-    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst);
+    bool RestoreVCCZ = !VCCZCorrect && readsVCCZ(Inst) && !Flushed;
 
     // Don't examine operands unless we need to track vccz correctness.
     if (ST->hasReadVCCZBug() || !ST->partialVCCWritesUpdateVCCZ()) {
@@ -1565,6 +1611,158 @@
   return Modified;
 }
 
+// Insert an s_waitcnt vmcnt(0) in the preheader of the given loop.
+void SIInsertWaitcnts::insertWaitcntInPreheader(MachineLoop *ML) {
+  MachineBasicBlock *Preheader = ML->getLoopPreheader();
+  assert(Preheader);
+  AMDGPU::Waitcnt Wait;
+  Wait.VmCnt = 0;
+  unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+  auto Terminator = Preheader->getFirstTerminator();
+  if (Terminator != Preheader->end()) {
+    auto SWaitInst = BuildMI(*Preheader, Terminator, Terminator->getDebugLoc(),
+                             TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    LLVM_DEBUG(dbgs() << "insertWaitcntInPreheader\n"
+                      << "Old Instr: " << *Terminator
+                      << "New Instr: " << *SWaitInst << '\n');
+  } else {
+    // The preheader falls through into the loop header; append the wait at
+    // the end of the block.
+    auto SWaitInst =
+        BuildMI(*Preheader, Preheader->end(), Preheader->back().getDebugLoc(),
+                TII->get(AMDGPU::S_WAITCNT))
+            .addImm(Enc);
+    LLVM_DEBUG(dbgs() << "insertWaitcntInPreheader\n"
+                      << "Old Instr: " << Preheader->back()
+                      << "New Instr: " << *SWaitInst << '\n');
+  }
+}
+
+// Get the brackets associated with the given BlockInfo.
+void SIInsertWaitcnts::getBlockBrackets(
+    BlockInfo &BI, std::unique_ptr<WaitcntBrackets> &Brackets,
+    HardwareLimits &Limits, RegisterEncoding &Encoding, bool Flushed) {
+  std::unique_ptr<WaitcntBrackets> &Incoming =
+      (Flushed ? BI.IncomingFlushed : BI.Incoming);
+  if (Incoming) {
+    if (!Brackets)
+      Brackets = std::make_unique<WaitcntBrackets>(*Incoming);
+    else
+      *Brackets = *Incoming;
+  } else {
+    if (!Brackets)
+      Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
+    else
+      *Brackets = WaitcntBrackets(ST, Limits, Encoding);
+  }
+}
+
+// Merge the given brackets into BI's successors.
+void SIInsertWaitcnts::mergeNewStateIntoSuccessors(
+    BlockInfo &BI, std::unique_ptr<WaitcntBrackets> &Brackets, bool &Repeat,
+    MapVector<MachineBasicBlock *, BlockInfo>::iterator BII, bool Flushed) {
+  auto getFlushedBrackets = [&]() {
+    AMDGPU::Waitcnt Wait;
+    Wait.VmCnt = 0;
+    WaitcntBrackets FlushedBrackets(*Brackets);
+    FlushedBrackets.applyWaitcnt(Wait);
+    return FlushedBrackets;
+  };
+
+  if (Brackets->hasPending()) {
+    for (MachineBasicBlock *Succ : BI.MBB->successors()) {
+      auto SuccBII = BlockInfos.find(Succ);
+      BlockInfo &SuccBI = SuccBII->second;
+      auto mergeInto = [&](std::unique_ptr<WaitcntBrackets> &Incoming,
+                           bool getFlushed) {
+        if (!Incoming) {
+          SuccBI.Dirty = true;
+          if (SuccBII <= BII)
+            Repeat = true;
+          Incoming = std::make_unique<WaitcntBrackets>(
+              getFlushed ? getFlushedBrackets() : *Brackets);
+        } else if (Incoming->merge(getFlushed ? getFlushedBrackets()
+                                              : *Brackets)) {
+          SuccBI.Dirty = true;
+          if (SuccBII <= BII)
+            Repeat = true;
+        }
+      };
+      if (SuccBI.OuterLoop && SuccBI.OuterLoop->getLoopPreheader() == BI.MBB) {
+        assert(!Flushed);
+        // Current block is a loop preheader, merge the brackets into both
+        // the flushed and non-flushed brackets of the loop header.
+        mergeInto(SuccBI.Incoming, false);
+        mergeInto(SuccBI.IncomingFlushed, true);
+      } else if (BI.OuterLoop && !BI.OuterLoop->contains(SuccBI.MBB)) {
+        assert(!SuccBI.OuterLoop);
+        // Current block is a loop exit, always merge the brackets into the
+        // successor's non-flushed brackets.
+        mergeInto(SuccBI.Incoming, false);
+      } else {
+        // Current block and successor are either both inside the same loop
+        // or both outside of any loop; merge into the successor's
+        // corresponding brackets.
+        std::unique_ptr<WaitcntBrackets> &Incoming =
+            Flushed ? SuccBI.IncomingFlushed : SuccBI.Incoming;
+        mergeInto(Incoming, false);
+      }
+    }
+  }
+}
+
+// For each loop, emit the waitcnts recorded in either its flushed or its
+// non-flushed pending list, flushing vmcnt in the preheader when profitable.
+bool SIInsertWaitcnts::generateLoopsPendingWaitcnts() {
+  bool Modified = false;
+  for (MachineLoop *const ML : *MLI) {
+    LoopInfo &LI = LoopInfos[ML];
+    auto PendingWaits = LI.PendingWaits;
+    // If the loop contains no load, at least one store, and at least one
+    // waitcnt vmcnt would be generated if we don't flush, flush vmcnt in the
+    // preheader and emit the flushed waits instead.
+    if (!LI.HasVMEMLoad && LI.HasVMEMStore && !LI.PendingWaits.empty()) {
+      for (auto W : LI.PendingWaits)
+        if (W.second.hasWaitVmCnt()) {
+          PendingWaits = LI.PendingWaitsFlushed;
+          insertWaitcntInPreheader(ML);
+          Modified = true;
+          break;
+        }
+    }
+    for (auto PendingWait : PendingWaits) {
+      MachineInstr *MI = PendingWait.first;
+      AMDGPU::Waitcnt Wait = PendingWait.second;
+      MachineBasicBlock *MBB = MI->getParent();
+      auto I = MI->getIterator();
+      // Try to find a preexisting waitcnt immediately before MI.
+      MachineInstr *OldWaitcntInstr = nullptr;
+      if (I != MBB->instr_begin()) {
+        I--;
+        while (I->getOpcode() == AMDGPU::S_WAITCNT) {
+          OldWaitcntInstr = &(*I);
+          if (I == MBB->instr_begin())
+            break;
+          I--;
+        }
+      }
+      if (OldWaitcntInstr) {
+        WaitcntBrackets Brackets(ST, {}, {});
+        applyPreexistingWaitcnt(Brackets, *OldWaitcntInstr, Wait, MI);
+      }
+      if (Wait.hasWaitExceptVsCnt()) {
+        // Encode only now, after a preexisting waitcnt may have been folded
+        // into Wait.
+        unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+        auto SWaitInst = BuildMI(*MBB, MI->getIterator(), MI->getDebugLoc(),
+                                 TII->get(AMDGPU::S_WAITCNT))
+                             .addImm(Enc);
+        LLVM_DEBUG(dbgs() << "generateLoopsPendingWaitcnts\n"
+                          << "Old Instr: " << *MI
+                          << "New Instr: " << *SWaitInst << '\n');
+        Modified = true;
+      }
+    }
+  }
+  return Modified;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1572,6 +1770,7 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
@@ -1597,6 +1796,7 @@
 
   TrackedWaitcntSet.clear();
   BlockInfos.clear();
+  LoopInfos.clear();
   bool Modified = false;
 
   if (!MFI->isEntryFunction()) {
@@ -1620,10 +1820,25 @@
     Modified = true;
   }
 
+  for (MachineLoop *const ML : *MLI)
+    LoopInfos.insert({ML, LoopInfo()});
+
   // Keep iterating over the blocks in reverse post order, inserting and
   // updating s_waitcnt where needed, until a fix point is reached.
-  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
-    BlockInfos.insert({MBB, BlockInfo(MBB)});
+  for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF)) {
+    BlockInfo BI(MBB);
+    // Before GFX10, there is no VS_CNT counter and stores count against
+    // VM_CNT, so we try to hoist waitcnts out of loops that contain no load
+    // and at least one store. The OuterLoop field of BlockInfo is only used
+    // for that optimization, so there is no need to set it on GFX10+
+    // targets.
+    if (ST->getGeneration() < AMDGPUSubtarget::GFX10) {
+      for (MachineLoop *const ML : *MLI) {
+        if (ML->contains(MBB) && ML->getLoopPreheader())
+          BI.OuterLoop = ML;
+      }
+    }
+    BlockInfos.insert({MBB, std::move(BI)});
+  }
 
   std::unique_ptr<WaitcntBrackets> Brackets;
   bool Repeat;
@@ -1636,47 +1851,27 @@
       if (!BI.Dirty)
         continue;
 
-      if (BI.Incoming) {
-        if (!Brackets)
-          Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
-        else
-          *Brackets = *BI.Incoming;
-      } else {
-        if (!Brackets)
-          Brackets = std::make_unique<WaitcntBrackets>(ST, Limits, Encoding);
-        else
-          *Brackets = WaitcntBrackets(ST, Limits, Encoding);
-      }
-
-      Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
       BI.Dirty = false;
 
-      if (Brackets->hasPending()) {
-        BlockInfo *MoveBracketsToSucc = nullptr;
-        for (MachineBasicBlock *Succ : BI.MBB->successors()) {
-          auto SuccBII = BlockInfos.find(Succ);
-          BlockInfo &SuccBI = SuccBII->second;
-          if (!SuccBI.Incoming) {
-            SuccBI.Dirty = true;
-            if (SuccBII <= BII)
-              Repeat = true;
-            if (!MoveBracketsToSucc) {
-              MoveBracketsToSucc = &SuccBI;
-            } else {
-              SuccBI.Incoming = std::make_unique<WaitcntBrackets>(*Brackets);
-            }
-          } else if (SuccBI.Incoming->merge(*Brackets)) {
-            SuccBI.Dirty = true;
-            if (SuccBII <= BII)
-              Repeat = true;
-          }
-        }
-        if (MoveBracketsToSucc)
-          MoveBracketsToSucc->Incoming = std::move(Brackets);
+      // Process the block with non-flushed brackets.
+      getBlockBrackets(BI, Brackets, Limits, Encoding, false);
+      Modified |=
+          insertWaitcntInBlock(MF, *BI.MBB, *Brackets, false, BI.OuterLoop);
+      mergeNewStateIntoSuccessors(BI, Brackets, Repeat, BII, false);
+
+      // If the block is in a loop and we still haven't found any load,
+      // process the block with flushed brackets as well.
+      if (BI.OuterLoop && !LoopInfos[BI.OuterLoop].HasVMEMLoad) {
+        getBlockBrackets(BI, Brackets, Limits, Encoding, true);
+        Modified |=
+            insertWaitcntInBlock(MF, *BI.MBB, *Brackets, true, BI.OuterLoop);
+        mergeNewStateIntoSuccessors(BI, Brackets, Repeat, BII, true);
       }
     }
   } while (Repeat);
 
+  Modified |= generateLoopsPendingWaitcnts();
+
   if (ST->hasScalarStores()) {
     SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
     bool HaveScalarStores = false;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -520,6 +520,8 @@
     return VsCnt != ~0u;
   }
 
+  bool hasWaitVmCnt() const { return VmCnt != ~0u; }
+
   bool dominates(const Waitcnt &Other) const {
     return VmCnt <= Other.VmCnt && ExpCnt <= Other.ExpCnt &&
            LgkmCnt <= Other.LgkmCnt && VsCnt <= Other.VsCnt;
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -127,10 +127,11 @@
 ; GCN-O0-NEXT: Insert fentry calls
 ; GCN-O0-NEXT: Insert XRay ops
 ; GCN-O0-NEXT: SI Memory Legalizer
+; GCN-O0-NEXT: MachineDominator Tree Construction
+; GCN-O0-NEXT: Machine Natural Loop Construction
 ; GCN-O0-NEXT: MachinePostDominator Tree Construction
 ; GCN-O0-NEXT: SI insert wait instructions
 ; GCN-O0-NEXT: Insert required mode register values
-; GCN-O0-NEXT: MachineDominator Tree Construction
 ; GCN-O0-NEXT: SI Final Branch Preparation
 ; GCN-O0-NEXT: Post RA hazard recognizer
 ; GCN-O0-NEXT: Branch relaxation pass
@@ -377,11 +378,12 @@
 ; GCN-O1-NEXT: Insert fentry calls
 ; GCN-O1-NEXT: Insert XRay ops
 ; GCN-O1-NEXT: SI Memory Legalizer
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Machine Natural Loop Construction
 ; GCN-O1-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-NEXT: SI insert wait instructions
 ; GCN-O1-NEXT: Insert required mode register values
 ; GCN-O1-NEXT: SI Insert Hard Clauses
-; GCN-O1-NEXT: MachineDominator Tree Construction
 ; GCN-O1-NEXT: SI Final Branch Preparation
 ; GCN-O1-NEXT: SI peephole optimizations
 ; GCN-O1-NEXT: Post RA hazard recognizer
@@ -662,11 +664,12 @@
 ; GCN-O1-OPTS-NEXT: Insert fentry calls
 ; GCN-O1-OPTS-NEXT: Insert XRay ops
 ; GCN-O1-OPTS-NEXT: SI Memory Legalizer
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI insert wait instructions
 ; GCN-O1-OPTS-NEXT: Insert required mode register values
 ; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
-; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT: SI peephole optimizations
 ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
@@ -949,11 +952,12 @@
 ; GCN-O2-NEXT: Insert fentry calls
 ; GCN-O2-NEXT: Insert XRay ops
 ; GCN-O2-NEXT: SI Memory Legalizer
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Machine Natural Loop Construction
 ; GCN-O2-NEXT: MachinePostDominator Tree Construction
 ; GCN-O2-NEXT: SI insert wait instructions
 ; GCN-O2-NEXT: Insert required mode register values
 ; GCN-O2-NEXT: SI Insert Hard Clauses
-; GCN-O2-NEXT: MachineDominator Tree Construction
 ; GCN-O2-NEXT: SI Final Branch Preparation
 ; GCN-O2-NEXT: SI peephole optimizations
 ; GCN-O2-NEXT: Post RA hazard recognizer
@@ -1249,11 +1253,12 @@
 ; GCN-O3-NEXT: Insert fentry calls
 ; GCN-O3-NEXT: Insert XRay ops
 ; GCN-O3-NEXT: SI Memory Legalizer
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Machine Natural Loop Construction
 ; GCN-O3-NEXT: MachinePostDominator Tree Construction
 ; GCN-O3-NEXT: SI insert wait instructions
 ; GCN-O3-NEXT: Insert required mode register values
 ; GCN-O3-NEXT: SI Insert Hard Clauses
-; GCN-O3-NEXT: MachineDominator Tree Construction
 ; GCN-O3-NEXT: SI Final Branch Preparation
 ; GCN-O3-NEXT: SI peephole optimizations
 ; GCN-O3-NEXT: Post RA hazard recognizer
diff --git a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
--- a/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
+++ b/llvm/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -159,8 +159,8 @@
 ; GCN-NEXT: .LBB1_2: ; %Flow
 ; GCN-NEXT: ; in Loop: Header=BB1_3 Depth=1
 ; GCN-NEXT: s_and_b64 vcc, exec, s[0:1]
-; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_mov_b64 vcc, vcc
+; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_cbranch_vccnz .LBB1_6
 ; GCN-NEXT: .LBB1_3: ; %bb14
 ; GCN-NEXT: ; =>This Loop Header: Depth=1
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -0,0 +1,173 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+
+--- |
+
+  @spill = external addrspace(1) global i32
+
+  define amdgpu_cs void @waitcnt_vm_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_noterm() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_load() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_use() {
+    ret void
+  }
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop
+# GFX9-LABEL: bb.0
+# GFX9: S_WAITCNT 3952
+# GFX9-NEXT: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+name: waitcnt_vm_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+name: waitcnt_vm_loop_noterm
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1
+# GFX9: S_WAITCNT 3952
+name: waitcnt_vm_loop_load
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    renamable $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr7, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1
+# GFX9: S_WAITCNT 3952
+name: waitcnt_vm_loop_no_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9-LABEL: bb.2
+# GFX9: S_WAITCNT 3953
+name: waitcnt_vm_loop_no_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr2, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr0, implicit $exec
+    S_ENDPGM 0
+
+...