diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -365,6 +366,8 @@
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+  MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;

   struct BlockInfo {
@@ -391,6 +394,9 @@
     (void)ForceVMCounter;
   }

+  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
+  bool isPreheaderToFlush(MachineBasicBlock &MBB,
+                          WaitcntBrackets &ScoreBrackets);
   bool runOnMachineFunction(MachineFunction &MF) override;

   StringRef getPassName() const override {
@@ -399,6 +405,7 @@

   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -441,7 +448,11 @@
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr);
+                                 MachineInstr *OldWaitcntInstr,
+                                 bool FlushVmCnt);
+  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                               WaitcntBrackets &ScoreBrackets,
+                               MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
@@ -794,6 +805,7 @@

 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
@@ -938,9 +950,10 @@
 /// and if so what the value of each counter is.
 /// The "score bracket" is bound by the lower bound and upper bound
 /// scores (*_score_LB and *_score_ub respectively).
-bool SIInsertWaitcnts::generateWaitcntInstBefore(
-    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
-    MachineInstr *OldWaitcntInstr) {
+bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
+                                                 WaitcntBrackets &ScoreBrackets,
+                                                 MachineInstr *OldWaitcntInstr,
+                                                 bool FlushVmCnt) {
   setForceEmitWaitcnt();

   if (MI.isMetaInstruction())
@@ -1180,6 +1193,13 @@
   if (ForceEmitWaitcnt[VS_CNT])
     Wait.VsCnt = 0;

+  if (FlushVmCnt) {
+    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+    if (UB - LB != 0)
+      Wait.VmCnt = 0;
+  }
+
   if (OldWaitcntInstr) {
     // Try to merge the required wait with preexisting waitcnt instructions.
     // Also erase redundant waitcnt.
@@ -1224,6 +1244,46 @@
   return Modified;
 }

+// Add a waitcnt to flush the vmcnt counter at the end of the given block if
+// needed.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                                               WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr *OldWaitcntInstr) {
+  AMDGPU::Waitcnt Wait;
+  bool Modified = false;
+
+  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+  if (UB - LB == 0)
+    return false;
+
+  Wait.VmCnt = 0;
+
+  if (OldWaitcntInstr)
+    // Try to merge the required wait with preexisting waitcnt instructions.
+    // Also erase redundant waitcnt.
+    Modified =
+        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, nullptr);
+  else
+    // Update waitcnt brackets after determining the required wait.
+    ScoreBrackets.applyWaitcnt(Wait);
+
+  // Build new waitcnt instructions unless no wait is needed.
+  if (Wait.hasWaitExceptVsCnt()) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    auto SWaitInst = BuildMI(Block, Block.end(), Block.back().getDebugLoc(),
+                             TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    TrackedWaitcntSet.insert(SWaitInst);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcntBlockEnd\n"
+                      << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
+
 // This is a flat memory operation. Check to see if it has memory tokens other
 // than LDS. Other address spaces supported by flat memory operations involve
 // global memory.
@@ -1487,8 +1547,12 @@
       continue;
     }

+    bool FlushVmCnt = (Block.getFirstTerminator() == Inst) &&
+                      isPreheaderToFlush(Block, ScoreBrackets);
+
     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
-    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
+                                          FlushVmCnt);
     OldWaitcntInstr = nullptr;

     // Restore vccz if it's not known to be correct already.
@@ -1573,9 +1637,101 @@
     ++Iter;
   }

+  if (Block.getFirstTerminator() == Block.end() &&
+      isPreheaderToFlush(Block, ScoreBrackets))
+    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
   return Modified;
 }

+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
+                                          WaitcntBrackets &ScoreBrackets) {
+  if (PreheadersToFlush.count(&MBB))
+    return PreheadersToFlush[&MBB];
+
+  for (MachineLoop *const LI : *MLI) {
+    if (LI->getLoopPreheader() != &MBB)
+      continue;
+    if (shouldFlushVmCnt(LI, ScoreBrackets)) {
+      PreheadersToFlush[&MBB] = true;
+      return true;
+    }
+  }
+  PreheadersToFlush[&MBB] = false;
+  return false;
+}
+
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+//    vgpr containing a value that is loaded outside of the loop. (Only on
+//    targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+//    loop, and at least one use of a vgpr containing a value that is loaded
+//    outside of the loop. On targets with no vscnt counter, the loop must also
+//    not contain any vmem store.
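+//
+// Illustrative sketch of situation 2 (annotation added for this write-up, not
+// part of the upstream comment): with a value loaded in the preheader and only
+// consumed inside the loop, flushing vmcnt once before entry replaces a wait
+// on every iteration.
+//   preheader:  v0 = global_load ...   ; vmcnt now pending on v0
+//   loop:       v1 = global_load ...   ; v1 is never read inside the loop
+//               v2 = v_add v0, v2      ; use of v0, loaded outside the loop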
+bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
+                                        WaitcntBrackets &Brackets) {
+  if (!ML->getLoopPreheader())
+    return false;
+
+  bool HasVMemLoad = false;
+  bool HasVMemStore = false;
+  bool UsesVgprLoadedOutside = false;
+  DenseSet<Register> VgprUse;
+  DenseSet<Register> VgprDef;
+
+  for (MachineBasicBlock *const MBB : ML->blocks()) {
+    for (MachineInstr &MI : *MBB) {
+      if (SIInstrInfo::isVMEM(MI)) {
+        if (MI.mayLoad())
+          HasVMemLoad = true;
+        if (MI.mayStore())
+          HasVMemStore = true;
+      }
+      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+        MachineOperand &Op = MI.getOperand(I);
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+          continue;
+        RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+        // Vgpr use
+        if (Op.isUse()) {
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprDef.contains(RegNo))
+              return false;
+            VgprUse.insert(RegNo);
+            // If at least one of Op's registers is in the score brackets, the
+            // value is likely loaded outside of the loop.
+            if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
+              UsesVgprLoadedOutside = true;
+              break;
+            }
+          }
+        }
+        // VMem load vgpr def
+        else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprUse.contains(RegNo))
+              return false;
+            VgprDef.insert(RegNo);
+          }
+      }
+    }
+  }
+  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+    return true;
+  bool Flush = HasVMemLoad && UsesVgprLoadedOutside;
+  if (!ST->hasVscnt())
+    Flush = Flush && !HasVMemStore;
+  return Flush;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1583,6 +1739,7 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();

   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
diff --git a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
--- a/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/llvm/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -2380,8 +2380,8 @@
 ; VI-NEXT: v_mov_b32_e32 v2, -1
 ; VI-NEXT: v_mov_b32_e32 v3, s3
 ; VI-NEXT: s_mov_b64 s[4:5], exec
-; VI-NEXT: .LBB40_1: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: s_waitcnt vmcnt(0)
+; VI-NEXT: .LBB40_1: ; =>This Inner Loop Header: Depth=1
 ; VI-NEXT: v_readfirstlane_b32 s8, v0
 ; VI-NEXT: v_readfirstlane_b32 s9, v1
 ; VI-NEXT: v_readfirstlane_b32 s10, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -129,10 +129,11 @@
 ; GCN-O0-NEXT: Insert fentry calls
 ; GCN-O0-NEXT: Insert XRay ops
 ; GCN-O0-NEXT: SI Memory Legalizer
+; GCN-O0-NEXT: MachineDominator Tree Construction
+; GCN-O0-NEXT: Machine Natural Loop Construction
 ; GCN-O0-NEXT: MachinePostDominator Tree Construction
 ; GCN-O0-NEXT: SI insert wait instructions
 ; GCN-O0-NEXT: Insert required mode register values
-; GCN-O0-NEXT: MachineDominator Tree Construction
 ; GCN-O0-NEXT: SI Final Branch Preparation
 ; GCN-O0-NEXT: Post RA hazard recognizer
 ; GCN-O0-NEXT: Branch relaxation pass
@@ -378,11 +379,12 @@
 ; GCN-O1-NEXT: Insert fentry calls
 ; GCN-O1-NEXT: Insert XRay ops
 ; GCN-O1-NEXT: SI Memory Legalizer
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Machine Natural Loop Construction
 ; GCN-O1-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-NEXT: SI insert wait instructions
 ; GCN-O1-NEXT: Insert required mode register values
 ; GCN-O1-NEXT: SI Insert Hard Clauses
-; GCN-O1-NEXT: MachineDominator Tree Construction
 ; GCN-O1-NEXT: SI Final Branch Preparation
 ; GCN-O1-NEXT: SI peephole optimizations
 ; GCN-O1-NEXT: Post RA hazard recognizer
@@ -662,11 +664,12 @@
 ; GCN-O1-OPTS-NEXT: Insert fentry calls
 ; GCN-O1-OPTS-NEXT: Insert XRay ops
 ; GCN-O1-OPTS-NEXT: SI Memory Legalizer
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI insert wait instructions
 ; GCN-O1-OPTS-NEXT: Insert required mode register values
 ; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
-; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT: SI peephole optimizations
 ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
@@ -948,11 +951,12 @@
 ; GCN-O2-NEXT: Insert fentry calls
 ; GCN-O2-NEXT: Insert XRay ops
 ; GCN-O2-NEXT: SI Memory Legalizer
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Machine Natural Loop Construction
 ; GCN-O2-NEXT: MachinePostDominator Tree Construction
 ; GCN-O2-NEXT: SI insert wait instructions
 ; GCN-O2-NEXT: Insert required mode register values
 ; GCN-O2-NEXT: SI Insert Hard Clauses
-; GCN-O2-NEXT: MachineDominator Tree Construction
 ; GCN-O2-NEXT: SI Final Branch Preparation
 ; GCN-O2-NEXT: SI peephole optimizations
 ; GCN-O2-NEXT: Post RA hazard recognizer
@@ -1247,11 +1251,12 @@
 ; GCN-O3-NEXT: Insert fentry calls
 ; GCN-O3-NEXT: Insert XRay ops
 ; GCN-O3-NEXT: SI Memory Legalizer
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Machine Natural Loop Construction
 ; GCN-O3-NEXT: MachinePostDominator Tree Construction
 ; GCN-O3-NEXT: SI insert wait instructions
 ; GCN-O3-NEXT: Insert required mode register values
 ; GCN-O3-NEXT: SI Insert Hard Clauses
-; GCN-O3-NEXT: MachineDominator Tree Construction
 ; GCN-O3-NEXT: SI Final Branch Preparation
 ; GCN-O3-NEXT: SI peephole optimizations
 ; GCN-O3-NEXT: Post RA hazard recognizer
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -0,0 +1,469 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
+
+--- |
+
+  @spill = external addrspace(1) global i32
+
+  define amdgpu_cs void @waitcnt_vm_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_noterm() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_load() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_use() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_use_in_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_nowait() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_interval() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop2_interval2() {
+    ret void
+  }
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_noterm
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_load
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_load
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    renamable $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr7, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
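+# Note added for this write-up (not in the upstream test): the S_WAITCNT
+# immediates checked in this file encode vmcnt(0) with the other counters at
+# their maxima, i.e. a wait on vector memory only. On gfx9, 3952 (0xf70) packs
+# vmcnt in imm[3:0] and imm[15:14], expcnt(7) in imm[6:4], and lgkmcnt(15) in
+# imm[11:8]; on gfx10, 16240 (0x3f70) is the same wait with the wider 6-bit
+# lgkmcnt field in imm[13:8].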
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_store
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_use
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr2, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_store
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_use_in_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr4 = V_ADD_U32_e32 renamable $vgpr5, renamable $vgpr1, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_nowait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.3:
+
+# GFX10-LABEL: waitcnt_vm_loop2_nowait
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.3:
+name: waitcnt_vm_loop2_nowait
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr4, renamable $vgpr5, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr4, renamable $vgpr5, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr4, renamable $vgpr5, implicit $exec
+
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.2, %bb.3
+
+    $vgpr3 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    renamable $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr4, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_interval
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_interval
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_interval
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# GFX9-LABEL: waitcnt_vm_loop2_interval2
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_interval2
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_interval2
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    $vgpr11 = COPY $vgpr7
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...