diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -30,6 +30,7 @@
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -365,8 +366,12 @@
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, MachineLoop *> PreheaderLoop;
+  MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
 
+  bool shouldFlush(MachineBasicBlock *MBB);
+
   struct BlockInfo {
     MachineBasicBlock *MBB;
     std::unique_ptr<WaitcntBrackets> Incoming;
@@ -399,6 +404,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -439,6 +445,9 @@
   bool mayAccessVMEMThroughFlat(const MachineInstr &MI) const;
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
+  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                               WaitcntBrackets &ScoreBrackets,
+                               MachineInstr *OldWaitcntInstr);
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
                                  MachineInstr *OldWaitcntInstr);
@@ -794,6 +803,7 @@
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
@@ -806,10 +816,11 @@
   return new SIInsertWaitcnts();
 }
 
-/// Combine consecutive waitcnt instructions that precede \p MI and follow
-/// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
-/// by previous passes. Currently this pass conservatively assumes that these
-/// preexisting waitcnt are required for correctness.
+/// Combine consecutive waitcnt instructions that precede \p MI (or the end of
+/// the block if \p MI is null) and follow \p OldWaitcntInstr, and apply any
+/// extra wait from waitcnt that were added by previous passes. Currently this
+/// pass conservatively assumes that these preexisting waitcnt are required for
+/// correctness.
 bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                                MachineInstr &OldWaitcntInstr,
                                                AMDGPU::Waitcnt &Wait,
@@ -817,8 +828,9 @@
   bool Modified = false;
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
+  auto EndBlockIt = OldWaitcntInstr.getParent()->end();
   for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
-       &*II != MI; II = NextI, ++NextI) {
+       II != EndBlockIt && &*II != MI; II = NextI, ++NextI) {
     if (II->isMetaInstruction())
       continue;
 
@@ -928,6 +940,48 @@
   return true;
 }
 
+/// Generate an s_waitcnt instruction to be placed at the end of \p Block.
+/// We may need to flush vmcnt in loop preheaders (by inserting a wait at the
+/// end of the block) to avoid generating waits inside the loop body. These
+/// flush waits are inserted before terminator instructions in
+/// generateWaitcntInstBefore; this function handles the case in which
+/// \p Block has no terminator instruction.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                                               WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr *OldWaitcntInstr) {
+  AMDGPU::Waitcnt Wait;
+  bool Modified = false;
+
+  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+  if (UB - LB != 0 && shouldFlush(&Block)) {
+    Wait.VmCnt = 0;
+    if (OldWaitcntInstr) {
+      // Try to merge the required wait with preexisting waitcnt instructions.
+      // Also erase redundant waitcnt.
+      Modified = applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr,
+                                         Wait, nullptr);
+    } else {
+      // Update waitcnt brackets after determining the required wait.
+      ScoreBrackets.applyWaitcnt(Wait);
+    }
+  }
+
+  // Build new waitcnt instructions unless no wait is needed.
+  if (Wait.hasWaitExceptVsCnt()) {
+    unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
+    auto SWaitInst = BuildMI(Block, Block.end(), Block.back().getDebugLoc(),
+                             TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(Enc);
+    TrackedWaitcntSet.insert(SWaitInst);
+    Modified = true;
+
+    LLVM_DEBUG(dbgs() << "generateWaitcntBlockEnd\n"
+                      << "New Instr: " << *SWaitInst << '\n');
+  }
+
+  return Modified;
+}
+
 /// Generate s_waitcnt instruction to be placed before cur_Inst.
 /// Instructions of a given type are returned in order,
 /// but instructions of different types can complete out of order.
@@ -1180,6 +1234,13 @@
   if (ForceEmitWaitcnt[VS_CNT])
     Wait.VsCnt = 0;
 
+  // If MI is the first terminator of a loop preheader that should flush vmcnt,
+  // wait for all outstanding VMEM operations here rather than inside the loop.
+  if (MI.getParent()->getFirstTerminator() == MI) {
+    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+    if (UB - LB != 0 && shouldFlush(MI.getParent()))
+      Wait.VmCnt = 0;
+  }
+
   if (OldWaitcntInstr) {
     // Try to merge the required wait with preexisting waitcnt instructions.
     // Also erase redundant waitcnt.
@@ -1573,9 +1634,49 @@
     ++Iter;
   }
 
+  // Generate an s_waitcnt instruction to be placed at the end of Block, if
+  // needed.
+  Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
   return Modified;
 }
 
+// Determine if we should flush VMCNT in the given block. It may be worth
+// waiting for VMCNT in loop preheaders to avoid unnecessarily waiting in each
+// iteration inside loops. This function returns true if MBB is such a
+// preheader.
+bool SIInsertWaitcnts::shouldFlush(MachineBasicBlock *MBB) {
+  // We only flush on pre-GFX10 targets, which have no separate VSCNT counter
+  // for VMEM stores.
+  if (ST->hasVscnt())
+    return false;
+  if (!PreheaderLoop.count(MBB))
+    return false;
+  MachineLoop *Loop = PreheaderLoop[MBB];
+
+  // We flush in the block if it is a preheader and its associated loop
+  // contains at least one VMEM store and a use of a value loaded by VMEM, but
+  // doesn't contain a VMEM load.
+  bool HasVMemLoad = false;
+  bool HasVMemStore = false;
+  bool HasVMemLoadUse = false;
+  for (MachineBasicBlock *LoopMBB : Loop->blocks()) {
+    for (MachineInstr &MI : *LoopMBB) {
+      if (SIInstrInfo::isVMEM(MI)) {
+        if (MI.mayLoad())
+          HasVMemLoad = true;
+        if (MI.mayStore())
+          HasVMemStore = true;
+      }
+      // Check whether MI uses a VGPR whose defining instruction is a load.
+      for (MachineOperand &Op : MI.uses()) {
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+          continue;
+        for (const MachineOperand &Def :
+             make_range(MRI->def_begin(Op.getReg()), MRI->def_end()))
+          if (Def.getParent()->mayLoad())
+            HasVMemLoadUse = true;
+      }
+    }
+  }
+  return HasVMemStore && HasVMemLoadUse && !HasVMemLoad;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1583,6 +1684,7 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
@@ -1608,6 +1710,13 @@
   BlockInfos.clear();
   bool Modified = false;
 
+  // Record the loop associated with each loop preheader.
+  for (MachineLoop *ML : *MLI) {
+    MachineBasicBlock *Preheader = ML->getLoopPreheader();
+    if (Preheader)
+      PreheaderLoop[Preheader] = ML;
+  }
+
   if (!MFI->isEntryFunction()) {
     // Wait for any outstanding memory operations that the input registers may
     // depend on. We can't track them and it's better to do the wait after the
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -0,0 +1,173 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+
+--- |
+
+  @spill = external addrspace(1) global i32
+
+  define amdgpu_cs void @waitcnt_vm_loop() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_noterm() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_load() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_store() {
+    ret void
+  }
+
+  define amdgpu_cs void @waitcnt_vm_loop_no_use() {
+    ret void
+  }
+
+...
+---
+
+# The loop contains a VMEM store and uses the value loaded by VMEM in the
+# preheader, but no VMEM load: flush vmcnt before the preheader terminator
+# and do not wait again inside the loop.
+# GFX9-LABEL: name: waitcnt_vm_loop
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-NEXT: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
name: waitcnt_vm_loop
body: |
  bb.0:
    successors: %bb.1

    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    S_BRANCH %bb.1

  bb.1:
    successors: %bb.1, %bb.2

    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
    S_BRANCH %bb.2

  bb.2:
    S_ENDPGM 0

...
+---
+
+# Same as waitcnt_vm_loop, but the preheader has no terminator: the flush
+# wait is inserted at the end of bb.0.
+# GFX9-LABEL: name: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+name: waitcnt_vm_loop_noterm
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop also contains a VMEM load, so the vmcnt wait cannot be hoisted out
+# of the loop: do not flush in the preheader and keep the wait in the loop
+# body.
+# GFX9-LABEL: name: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+name: waitcnt_vm_loop_load
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    renamable $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr7, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr7, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop has no VMEM store, so there is nothing to gain from flushing in
+# the preheader: the wait for the preheader load stays in the loop body.
+# GFX9-LABEL: name: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 3952
+name: waitcnt_vm_loop_no_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop does not use the value loaded in the preheader, so do not flush:
+# the vmcnt wait is only needed at the use in bb.2.
+# GFX9-LABEL: name: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9: S_BRANCH %bb.1
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 395{{2|3}}
+# GFX9-LABEL: bb.2:
+# GFX9: S_WAITCNT 3953
+name: waitcnt_vm_loop_no_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    renamable $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed renamable $vgpr0, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact renamable $vgpr5, renamable $vgpr6, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr2, renamable $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed renamable $sgpr3, renamable $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr1 = V_ADD_U32_e32 renamable $vgpr0, renamable $vgpr0, implicit $exec
+    S_ENDPGM 0
+
+...