diff --git a/llvm/include/llvm/CodeGen/MachineBasicBlock.h b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
--- a/llvm/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/llvm/include/llvm/CodeGen/MachineBasicBlock.h
@@ -737,6 +737,15 @@
   /// other block.
   bool isLayoutSuccessor(const MachineBasicBlock *MBB) const;
 
+  /// Return the successor of this block if it has a single successor.
+  /// Otherwise return a null pointer.
+  ///
+  const MachineBasicBlock *getSingleSuccessor() const;
+  MachineBasicBlock *getSingleSuccessor() {
+    return const_cast<MachineBasicBlock *>(
+        static_cast<const MachineBasicBlock *>(this)->getSingleSuccessor());
+  }
+
   /// Return the fallthrough block if the block can implicitly
   /// transfer control to the block after it by falling off the end of
   /// it. This should return null if it can reach the block after
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -915,6 +915,10 @@
   return std::next(I) == MachineFunction::const_iterator(MBB);
 }
 
+const MachineBasicBlock *MachineBasicBlock::getSingleSuccessor() const {
+  return Successors.size() == 1 ? Successors[0] : nullptr;
+}
+
 MachineBasicBlock *MachineBasicBlock::getFallThrough() {
   MachineFunction::iterator Fallthrough = getIterator();
   ++Fallthrough;
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -31,6 +31,7 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/Sequence.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Support/DebugCounter.h"
@@ -355,6 +356,8 @@
   DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseMap<const Value *, MachineBasicBlock *> SLoadAddresses;
+  DenseMap<MachineBasicBlock *, bool> PreheadersToFlush;
+  MachineLoopInfo *MLI;
   MachinePostDominatorTree *PDT;
 
   struct BlockInfo {
@@ -381,6 +384,9 @@
     (void)ForceVMCounter;
   }
 
+  bool shouldFlushVmCnt(MachineLoop *ML, WaitcntBrackets &Brackets);
+  bool isPreheaderToFlush(MachineBasicBlock &MBB,
+                          WaitcntBrackets &ScoreBrackets);
   bool runOnMachineFunction(MachineFunction &MF) override;
 
   StringRef getPassName() const override {
@@ -389,6 +395,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    AU.addRequired<MachineLoopInfo>();
     AU.addRequired<MachinePostDominatorTree>();
     MachineFunctionPass::getAnalysisUsage(AU);
   }
@@ -431,14 +438,23 @@
   bool mayAccessLDSThroughFlat(const MachineInstr &MI) const;
   bool generateWaitcntInstBefore(MachineInstr &MI,
                                  WaitcntBrackets &ScoreBrackets,
-                                 MachineInstr *OldWaitcntInstr);
+                                 MachineInstr *OldWaitcntInstr,
+                                 bool FlushVmCnt);
+  bool generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                               WaitcntBrackets &ScoreBrackets,
+                               MachineInstr *OldWaitcntInstr);
+  bool generateWaitcnt(AMDGPU::Waitcnt Wait,
+                       MachineBasicBlock::instr_iterator It,
+                       MachineBasicBlock &Block, WaitcntBrackets &ScoreBrackets,
+                       MachineInstr *OldWaitcntInstr);
   void updateEventWaitcntAfter(MachineInstr &Inst,
                                WaitcntBrackets *ScoreBrackets);
   bool insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block,
                             WaitcntBrackets &ScoreBrackets);
   bool applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
                                MachineInstr &OldWaitcntInstr,
-                               AMDGPU::Waitcnt &Wait, const MachineInstr *MI);
+                               AMDGPU::Waitcnt &Wait,
+                               MachineBasicBlock::instr_iterator It);
 };
 
 } // end anonymous namespace
@@ -798,6 +814,7 @@
 INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                       false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
                     false)
@@ -810,53 +827,53 @@
   return new SIInsertWaitcnts();
 }
 
-/// Combine consecutive waitcnt instructions that precede \p MI and follow
+/// Combine consecutive waitcnt instructions that precede \p It and follow
 /// \p OldWaitcntInstr and apply any extra wait from waitcnt that were added
 /// by previous passes. Currently this pass conservatively assumes that these
 /// preexisting waitcnt are required for correctness.
-bool SIInsertWaitcnts::applyPreexistingWaitcnt(WaitcntBrackets &ScoreBrackets,
-                                               MachineInstr &OldWaitcntInstr,
-                                               AMDGPU::Waitcnt &Wait,
-                                               const MachineInstr *MI) {
+bool SIInsertWaitcnts::applyPreexistingWaitcnt(
+    WaitcntBrackets &ScoreBrackets, MachineInstr &OldWaitcntInstr,
+    AMDGPU::Waitcnt &Wait, MachineBasicBlock::instr_iterator It) {
   bool Modified = false;
   MachineInstr *WaitcntInstr = nullptr;
   MachineInstr *WaitcntVsCntInstr = nullptr;
-  for (auto II = OldWaitcntInstr.getIterator(), NextI = std::next(II);
-       &*II != MI; II = NextI, ++NextI) {
-    if (II->isMetaInstruction())
+
+  for (auto &II :
+       make_early_inc_range(make_range(OldWaitcntInstr.getIterator(), It))) {
+    if (II.isMetaInstruction())
      continue;
 
-    if (II->getOpcode() == AMDGPU::S_WAITCNT) {
+    if (II.getOpcode() == AMDGPU::S_WAITCNT) {
       // Conservatively update required wait if this waitcnt was added in an
       // earlier pass. In this case it will not exist in the tracked waitcnt
       // set.
-      if (!TrackedWaitcntSet.count(&*II)) {
-        unsigned IEnc = II->getOperand(0).getImm();
+      if (!TrackedWaitcntSet.count(&II)) {
+        unsigned IEnc = II.getOperand(0).getImm();
         AMDGPU::Waitcnt OldWait = AMDGPU::decodeWaitcnt(IV, IEnc);
         Wait = Wait.combined(OldWait);
       }
 
       // Merge consecutive waitcnt of the same type by erasing multiples.
       if (!WaitcntInstr) {
-        WaitcntInstr = &*II;
+        WaitcntInstr = &II;
       } else {
-        II->eraseFromParent();
+        II.eraseFromParent();
         Modified = true;
       }
 
     } else {
-      assert(II->getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
-      assert(II->getOperand(0).getReg() == AMDGPU::SGPR_NULL);
-      if (!TrackedWaitcntSet.count(&*II)) {
+      assert(II.getOpcode() == AMDGPU::S_WAITCNT_VSCNT);
+      assert(II.getOperand(0).getReg() == AMDGPU::SGPR_NULL);
+      if (!TrackedWaitcntSet.count(&II)) {
         unsigned OldVSCnt =
-            TII->getNamedOperand(*II, AMDGPU::OpName::simm16)->getImm();
+            TII->getNamedOperand(II, AMDGPU::OpName::simm16)->getImm();
         Wait.VsCnt = std::min(Wait.VsCnt, OldVSCnt);
       }
 
       if (!WaitcntVsCntInstr) {
-        WaitcntVsCntInstr = &*II;
+        WaitcntVsCntInstr = &II;
       } else {
-        II->eraseFromParent();
+        II.eraseFromParent();
         Modified = true;
       }
     }
@@ -876,9 +893,14 @@
       Wait.LgkmCnt = ~0u;
       Wait.ExpCnt = ~0u;
 
-      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                        << "Old Instr: " << *MI << "New Instr: " << *WaitcntInstr
-                        << '\n');
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: " << *WaitcntInstr
+                              << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It
+                              << "New Instr: " << *WaitcntInstr << '\n');
+
     } else {
       WaitcntInstr->eraseFromParent();
       Modified = true;
@@ -899,9 +921,13 @@
       ScoreBrackets.applyWaitcnt(Wait);
       Wait.VsCnt = ~0u;
 
-      LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                        << "Old Instr: " << *MI
-                        << "New Instr: " << *WaitcntVsCntInstr << '\n');
+      LLVM_DEBUG(It == OldWaitcntInstr.getParent()->end()
+                     ? dbgs() << "applyPreexistingWaitcnt\n"
+                              << "New Instr at block end: "
+                              << *WaitcntVsCntInstr << '\n'
+                     : dbgs() << "applyPreexistingWaitcnt\n"
+                              << "Old Instr: " << *It
+                              << "New Instr: " << *WaitcntVsCntInstr << '\n');
     } else {
       WaitcntVsCntInstr->eraseFromParent();
       Modified = true;
@@ -942,16 +968,18 @@
 /// and if so what the value of each counter is.
 /// The "score bracket" is bound by the lower bound and upper bound
 /// scores (*_score_LB and *_score_ub respectively).
-bool SIInsertWaitcnts::generateWaitcntInstBefore(
-    MachineInstr &MI, WaitcntBrackets &ScoreBrackets,
-    MachineInstr *OldWaitcntInstr) {
+/// If FlushVmCnt is true, that means that we want to generate a s_waitcnt to
+/// flush the vmcnt counter here.
+bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI,
+                                                 WaitcntBrackets &ScoreBrackets,
+                                                 MachineInstr *OldWaitcntInstr,
+                                                 bool FlushVmCnt) {
   setForceEmitWaitcnt();
 
   if (MI.isMetaInstruction())
     return false;
 
   AMDGPU::Waitcnt Wait;
-  bool Modified = false;
 
   // FIXME: This should have already been handled by the memory legalizer.
   // Removing this currently doesn't affect any lit tests, but we need to
@@ -1188,19 +1216,56 @@
   if (ForceEmitWaitcnt[VS_CNT])
     Wait.VsCnt = 0;
 
-  if (OldWaitcntInstr) {
+  if (FlushVmCnt) {
+    unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+    unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+    if (UB - LB != 0)
+      Wait.VmCnt = 0;
+  }
+
+  return generateWaitcnt(Wait, MI.getIterator(), *MI.getParent(), ScoreBrackets,
+                         OldWaitcntInstr);
+}
+
+// Add a waitcnt to flush the vmcnt counter at the end of the given block if
+// needed.
+bool SIInsertWaitcnts::generateWaitcntBlockEnd(MachineBasicBlock &Block,
+                                               WaitcntBrackets &ScoreBrackets,
+                                               MachineInstr *OldWaitcntInstr) {
+  AMDGPU::Waitcnt Wait;
+
+  unsigned UB = ScoreBrackets.getScoreUB(VM_CNT);
+  unsigned LB = ScoreBrackets.getScoreLB(VM_CNT);
+  if (UB - LB == 0)
+    return false;
+
+  Wait.VmCnt = 0;
+
+  return generateWaitcnt(Wait, Block.instr_end(), Block, ScoreBrackets,
+                         OldWaitcntInstr);
+}
+
+bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait,
+                                       MachineBasicBlock::instr_iterator It,
+                                       MachineBasicBlock &Block,
+                                       WaitcntBrackets &ScoreBrackets,
+                                       MachineInstr *OldWaitcntInstr) {
+  bool Modified = false;
+  const DebugLoc &DL = Block.findDebugLoc(It);
+
+  if (OldWaitcntInstr)
     // Try to merge the required wait with preexisting waitcnt instructions.
     // Also erase redundant waitcnt.
     Modified =
-        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, &MI);
-  } else {
-    // Update waitcnt brackets after determining the required wait.
+        applyPreexistingWaitcnt(ScoreBrackets, *OldWaitcntInstr, Wait, It);
+  else
     ScoreBrackets.applyWaitcnt(Wait);
-  }
 
   // ExpCnt can be merged into VINTERP.
-  if (Wait.ExpCnt != ~0u && SIInstrInfo::isVINTERP(MI)) {
-    MachineOperand *WaitExp = TII->getNamedOperand(MI, AMDGPU::OpName::waitexp);
+  if (Wait.ExpCnt != ~0u && It != Block.instr_end() &&
+      SIInstrInfo::isVINTERP(*It)) {
+    MachineOperand *WaitExp =
+        TII->getNamedOperand(*It, AMDGPU::OpName::waitexp);
     if (Wait.ExpCnt < WaitExp->getImm()) {
       WaitExp->setImm(Wait.ExpCnt);
       Modified = true;
@@ -1208,40 +1273,36 @@
     Wait.ExpCnt = ~0u;
 
     LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                      << "Update Instr: " << MI);
+                      << "Update Instr: " << *It);
   }
 
   // Build new waitcnt instructions unless no wait is needed or the old waitcnt
   // instruction was modified to handle the required wait.
   if (Wait.hasWaitExceptVsCnt()) {
     unsigned Enc = AMDGPU::encodeWaitcnt(IV, Wait);
-    auto SWaitInst = BuildMI(*MI.getParent(), MI.getIterator(),
-                             MI.getDebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-                         .addImm(Enc);
+    auto SWaitInst =
+        BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT)).addImm(Enc);
     TrackedWaitcntSet.insert(SWaitInst);
     Modified = true;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                      << "Old Instr: " << MI
-                      << "New Instr: " << *SWaitInst << '\n');
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
+               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
+               dbgs() << "New Instr: " << *SWaitInst << '\n');
   }
 
   if (Wait.hasWaitVsCnt()) {
     assert(ST->hasVscnt());
 
-    auto SWaitInst =
-        BuildMI(*MI.getParent(), MI.getIterator(), MI.getDebugLoc(),
-                TII->get(AMDGPU::S_WAITCNT_VSCNT))
-            .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
-            .addImm(Wait.VsCnt);
+    auto SWaitInst = BuildMI(Block, It, DL, TII->get(AMDGPU::S_WAITCNT_VSCNT))
+                         .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+                         .addImm(Wait.VsCnt);
     TrackedWaitcntSet.insert(SWaitInst);
     Modified = true;
 
-    LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
-                      << "Old Instr: " << MI
-                      << "New Instr: " << *SWaitInst << '\n');
+    LLVM_DEBUG(dbgs() << "generateWaitcnt\n";
               if (It != Block.instr_end()) dbgs() << "Old Instr: " << *It;
               dbgs() << "New Instr: " << *SWaitInst << '\n');
   }
-
   return Modified;
 }
@@ -1514,8 +1575,12 @@
       continue;
     }
 
+    bool FlushVmCnt = Block.getFirstTerminator() == Inst &&
+                      isPreheaderToFlush(Block, ScoreBrackets);
+
     // Generate an s_waitcnt instruction to be placed before Inst, if needed.
-    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
+    Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr,
+                                          FlushVmCnt);
     OldWaitcntInstr = nullptr;
 
     // Restore vccz if it's not known to be correct already.
@@ -1600,9 +1665,101 @@
     ++Iter;
   }
 
+  if (Block.getFirstTerminator() == Block.end() &&
+      isPreheaderToFlush(Block, ScoreBrackets))
+    Modified |= generateWaitcntBlockEnd(Block, ScoreBrackets, OldWaitcntInstr);
+
   return Modified;
 }
 
+// Return true if the given machine basic block is a preheader of a loop in
+// which we want to flush the vmcnt counter, and false otherwise.
+bool SIInsertWaitcnts::isPreheaderToFlush(MachineBasicBlock &MBB,
+                                          WaitcntBrackets &ScoreBrackets) {
+  if (PreheadersToFlush.count(&MBB))
+    return PreheadersToFlush[&MBB];
+
+  auto UpdateCache = [&](bool val) {
+    PreheadersToFlush[&MBB] = val;
+    return val;
+  };
+
+  MachineBasicBlock *Succ = MBB.getSingleSuccessor();
+  if (!Succ)
+    return UpdateCache(false);
+
+  MachineLoop *Loop = MLI->getLoopFor(Succ);
+  if (!Loop)
+    return UpdateCache(false);
+
+  if (Loop->getLoopPreheader() == &MBB && shouldFlushVmCnt(Loop, ScoreBrackets))
+    return UpdateCache(true);
+
+  return UpdateCache(false);
+}
+
+// Return true if it is better to flush the vmcnt counter in the preheader of
+// the given loop. We currently decide to flush in two situations:
+// 1. The loop contains vmem store(s), no vmem load and at least one use of a
+//    vgpr containing a value that is loaded outside of the loop. (Only on
+//    targets with no vscnt counter).
+// 2. The loop contains vmem load(s), but the loaded values are not used in the
+//    loop, and at least one use of a vgpr containing a value that is loaded
+//    outside of the loop.
+bool SIInsertWaitcnts::shouldFlushVmCnt(MachineLoop *ML,
+                                        WaitcntBrackets &Brackets) {
+  bool HasVMemLoad = false;
+  bool HasVMemStore = false;
+  bool UsesVgprLoadedOutside = false;
+  DenseSet<Register> VgprUse;
+  DenseSet<Register> VgprDef;
+
+  for (MachineBasicBlock *MBB : ML->blocks()) {
+    for (MachineInstr &MI : *MBB) {
+      if (SIInstrInfo::isVMEM(MI)) {
+        if (MI.mayLoad())
+          HasVMemLoad = true;
+        if (MI.mayStore())
+          HasVMemStore = true;
+      }
+      for (unsigned I = 0; I < MI.getNumOperands(); I++) {
+        MachineOperand &Op = MI.getOperand(I);
+        if (!Op.isReg() || !TRI->isVectorRegister(*MRI, Op.getReg()))
+          continue;
+        RegInterval Interval = Brackets.getRegInterval(&MI, TII, MRI, TRI, I);
+        // Vgpr use
+        if (Op.isUse()) {
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprDef.contains(RegNo))
+              return false;
+            VgprUse.insert(RegNo);
+            // If at least one of Op's registers is in the score brackets, the
+            // value is likely loaded outside of the loop.
+            if (Brackets.getRegScore(RegNo, VM_CNT) > 0) {
+              UsesVgprLoadedOutside = true;
+              break;
+            }
+          }
+        }
+        // VMem load vgpr def
+        else if (SIInstrInfo::isVMEM(MI) && MI.mayLoad() && Op.isDef())
+          for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+            // If we find a register that is loaded inside the loop, 1. and 2.
+            // are invalidated and we can exit.
+            if (VgprUse.contains(RegNo))
+              return false;
+            VgprDef.insert(RegNo);
+          }
+      }
+    }
+  }
+  if (!ST->hasVscnt() && HasVMemStore && !HasVMemLoad && UsesVgprLoadedOutside)
+    return true;
+  return HasVMemLoad && UsesVgprLoadedOutside;
+}
+
 bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
   ST = &MF.getSubtarget<GCNSubtarget>();
   TII = ST->getInstrInfo();
@@ -1610,6 +1767,7 @@
   MRI = &MF.getRegInfo();
   IV = AMDGPU::getIsaVersion(ST->getCPU());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+  MLI = &getAnalysis<MachineLoopInfo>();
   PDT = &getAnalysis<MachinePostDominatorTree>();
 
   ForceEmitZeroWaitcnts = ForceEmitZeroFlag;
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -126,10 +126,11 @@
 ; GCN-O0-NEXT: Insert fentry calls
 ; GCN-O0-NEXT: Insert XRay ops
 ; GCN-O0-NEXT: SI Memory Legalizer
+; GCN-O0-NEXT: MachineDominator Tree Construction
+; GCN-O0-NEXT: Machine Natural Loop Construction
 ; GCN-O0-NEXT: MachinePostDominator Tree Construction
 ; GCN-O0-NEXT: SI insert wait instructions
 ; GCN-O0-NEXT: Insert required mode register values
-; GCN-O0-NEXT: MachineDominator Tree Construction
 ; GCN-O0-NEXT: SI Final Branch Preparation
 ; GCN-O0-NEXT: Post RA hazard recognizer
 ; GCN-O0-NEXT: Branch relaxation pass
@@ -378,11 +379,12 @@
 ; GCN-O1-NEXT: Insert fentry calls
 ; GCN-O1-NEXT: Insert XRay ops
 ; GCN-O1-NEXT: SI Memory Legalizer
+; GCN-O1-NEXT: MachineDominator Tree Construction
+; GCN-O1-NEXT: Machine Natural Loop Construction
 ; GCN-O1-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-NEXT: SI insert wait instructions
 ; GCN-O1-NEXT: Insert required mode register values
 ; GCN-O1-NEXT: SI Insert Hard Clauses
-; GCN-O1-NEXT: MachineDominator Tree Construction
 ; GCN-O1-NEXT: SI Final Branch Preparation
 ; GCN-O1-NEXT: SI peephole optimizations
 ; GCN-O1-NEXT: Post RA hazard recognizer
@@ -665,11 +667,12 @@
 ; GCN-O1-OPTS-NEXT: Insert fentry calls
 ; GCN-O1-OPTS-NEXT: Insert XRay ops
 ; GCN-O1-OPTS-NEXT: SI Memory Legalizer
+; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
+; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction
 ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI insert wait instructions
 ; GCN-O1-OPTS-NEXT: Insert required mode register values
 ; GCN-O1-OPTS-NEXT: SI Insert Hard Clauses
-; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction
 ; GCN-O1-OPTS-NEXT: SI Final Branch Preparation
 ; GCN-O1-OPTS-NEXT: SI peephole optimizations
 ; GCN-O1-OPTS-NEXT: Post RA hazard recognizer
@@ -954,11 +957,12 @@
 ; GCN-O2-NEXT: Insert fentry calls
 ; GCN-O2-NEXT: Insert XRay ops
 ; GCN-O2-NEXT: SI Memory Legalizer
+; GCN-O2-NEXT: MachineDominator Tree Construction
+; GCN-O2-NEXT: Machine Natural Loop Construction
 ; GCN-O2-NEXT: MachinePostDominator Tree Construction
 ; GCN-O2-NEXT: SI insert wait instructions
 ; GCN-O2-NEXT: Insert required mode register values
 ; GCN-O2-NEXT: SI Insert Hard Clauses
-; GCN-O2-NEXT: MachineDominator Tree Construction
 ; GCN-O2-NEXT: SI Final Branch Preparation
 ; GCN-O2-NEXT: SI peephole optimizations
 ; GCN-O2-NEXT: Post RA hazard recognizer
@@ -1255,11 +1259,12 @@
 ; GCN-O3-NEXT: Insert fentry calls
 ; GCN-O3-NEXT: Insert XRay ops
 ; GCN-O3-NEXT: SI Memory Legalizer
+; GCN-O3-NEXT: MachineDominator Tree Construction
+; GCN-O3-NEXT: Machine Natural Loop Construction
 ; GCN-O3-NEXT: MachinePostDominator Tree Construction
 ; GCN-O3-NEXT: SI insert wait instructions
 ; GCN-O3-NEXT: Insert required mode register values
 ; GCN-O3-NEXT: SI Insert Hard Clauses
-; GCN-O3-NEXT: MachineDominator Tree Construction
 ; GCN-O3-NEXT: SI Final Branch Preparation
 ; GCN-O3-NEXT: SI peephole optimizations
 ; GCN-O3-NEXT: Post RA hazard recognizer
diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/waitcnt-vmcnt-loop.mir
@@ -0,0 +1,537 @@
+# RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX9 %s
+# RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck -check-prefix=GFX10 %s
+
+---
+
+# The loop contains a store and a use of a value loaded outside of the loop.
+# We expect the waitcnt for the use to be hoisted on GFX9, but not on GFX10+
+# because we have the vscnt counter.
+
+# GFX9-LABEL: waitcnt_vm_loop
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# Same as before, but the loop preheader has no terminator.
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_noterm
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# Same as before but there is a preexisting waitcnt in the preheader.
+
+# GFX9-LABEL: waitcnt_vm_loop_noterm_wait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+name: waitcnt_vm_loop_noterm_wait
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_WAITCNT 3952
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop contains a store, a load, and uses values loaded both inside and
+# outside the loop.
+# We do not expect the waitcnt to be hoisted out of the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop_load
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_load
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_load
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr7 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr7, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr7, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop contains a use of a value loaded outside of the loop, and no store
+# nor load.
+# We do not expect the waitcnt to be hoisted out of the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop_no_store
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_store
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop contains a store, no load, and doesn't use any value loaded inside
+# or outside of the loop. There is only one use of the loaded value in the
+# exit block.
+# We don't expect any s_waitcnt vmcnt in the loop body or preheader, but expect
+# one in the exit block.
+
+# GFX9-LABEL: waitcnt_vm_loop_no_use
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop_no_use
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop_no_use
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = V_ADD_U32_e32 $vgpr2, $vgpr2, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec
+    S_ENDPGM 0
+
+...
+---
+
+# The loop loads a value that is not used in the loop, and uses a value loaded
+# outside of the loop.
+# We expect the waitcnt to be hoisted out of the loop to wait a single time
+# before the loop is executed and avoid waiting for the load to complete on
+# each iteration.
+
+# GFX9-LABEL: waitcnt_vm_loop2
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# Same as before with an additional store in the loop. We still expect the
+# waitcnt instructions to be hoisted.
+
+# GFX9-LABEL: waitcnt_vm_loop2_store
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_store
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_store
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    BUFFER_STORE_DWORD_OFFEN_exact $vgpr5, $vgpr6, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# Same as loop2 but the value loaded inside the loop is also used in the loop.
+# We do not expect the waitcnt to be hoisted out of the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_use_in_loop
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_use_in_loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr4 = V_ADD_U32_e32 $vgpr5, $vgpr1, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop contains a use of a value loaded outside of the loop, but we already
+# waited for that load to complete. The loop also loads a value that is not used
+# in the loop. We do not expect any waitcnt in the loop.
+
+# GFX9-LABEL: waitcnt_vm_loop2_nowait
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.3:
+
+# GFX10-LABEL: waitcnt_vm_loop2_nowait
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.3:
+name: waitcnt_vm_loop2_nowait
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+
+    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
+    $vgpr3 = V_ADD_U32_e32 $vgpr4, $vgpr5, implicit $exec
+
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.2, %bb.3
+
+    $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec
+    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr4, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.2, implicit killed $scc
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_ENDPGM 0
+
+...
+---
+
+# Similar test case but for register intervals.
+
+# GFX9-LABEL: waitcnt_vm_loop2_reginterval
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_reginterval
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_reginterval
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# Similar test case but for register intervals.
+
+# GFX9-LABEL: waitcnt_vm_loop2_reginterval2
+# GFX9-LABEL: bb.0:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.1:
+# GFX9: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_loop2_reginterval2
+# GFX10-LABEL: bb.0:
+# GFX10-NOT: S_WAITCNT 16
+# GFX10-LABEL: bb.1:
+# GFX10: S_WAITCNT 16
+# GFX10-LABEL: bb.2:
+name: waitcnt_vm_loop2_reginterval2
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0_vgpr1_vgpr2_vgpr3 = GLOBAL_LOAD_DWORDX4 $vgpr10_vgpr11, 0, 0, implicit $exec
+
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr10 = COPY $vgpr0
+
+    $vgpr4_vgpr5_vgpr6_vgpr7 = IMAGE_SAMPLE_V4_V2 $vgpr20_vgpr21, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, $sgpr0_sgpr1_sgpr2_sgpr3, 15, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s128), align 4, addrspace 4)
+    $vgpr11 = COPY $vgpr7
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
+---
+
+# The loop loads a value that is not used in the loop, but uses a value loaded
+# outside of it. We expect the s_waitcnt instruction to be hoisted.
+# A s_waitcnt vmcnt(0) is generated to flush in the preheader, but for this
+# specific test case, it would be better to use vmcnt(1) instead. This is
+# currently not implemented.
+
+# GFX9-LABEL: waitcnt_vm_zero
+# GFX9-LABEL: bb.0:
+# GFX9: S_WAITCNT 3952
+# GFX9-LABEL: bb.1:
+# GFX9-NOT: S_WAITCNT 39
+# GFX9-LABEL: bb.2:
+
+# GFX10-LABEL: waitcnt_vm_zero
+# GFX10-LABEL: bb.0:
+# GFX10: S_WAITCNT 16240
+# GFX10-LABEL: bb.1:
+# GFX10-NOT: S_WAITCNT 16240
+# GFX10-LABEL: bb.2:
+
+name: waitcnt_vm_zero
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr0 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec
+    $vgpr1 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr1, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.1, %bb.2
+
+    $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr3, implicit $exec
+    $vgpr2 = BUFFER_LOAD_FORMAT_X_IDXEN killed $vgpr3, $sgpr4_sgpr5_sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec
+    S_CMP_LG_U32 killed $sgpr3, $sgpr4, implicit-def $scc
+    S_CBRANCH_SCC1 %bb.1, implicit killed $scc
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_ENDPGM 0
+
+...
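
A note on the raw S_WAITCNT immediates in the checks: the waitcnt_vm_zero comment says the preheader flush is a vmcnt(0) wait, which is what 3952 (GFX9) and 16240 (GFX10) encode. Below is a minimal standalone sketch of that encoding, assuming the s_waitcnt immediate field layouts from the GFX9/GFX10 ISA manuals; the encodeGfx9/encodeGfx10 helpers are hypothetical illustrations written for this note, not LLVM's AMDGPU::encodeWaitcnt API.

// Standalone sketch, not part of the patch. Assumed layout: vmcnt is split
// across bits [3:0] and [15:14]; expcnt sits in [6:4]; lgkmcnt is [11:8]
// (4 bits) on GFX9 and [13:8] (6 bits) on GFX10.
#include <cassert>

static unsigned encodeGfx9(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return (Vm & 0xf) | ((Exp & 0x7) << 4) | ((Lgkm & 0xf) << 8) |
         ((Vm >> 4) << 14);
}

static unsigned encodeGfx10(unsigned Vm, unsigned Exp, unsigned Lgkm) {
  return (Vm & 0xf) | ((Exp & 0x7) << 4) | ((Lgkm & 0x3f) << 8) |
         ((Vm >> 4) << 14);
}

int main() {
  // A wait of vmcnt(0) leaves expcnt and lgkmcnt at their no-wait maximums,
  // so only the vmcnt field of the encoding is zeroed; the two immediates
  // differ only in the width of the lgkmcnt field.
  assert(encodeGfx9(0, 7, 15) == 3952);   // s_waitcnt vmcnt(0) on gfx9
  assert(encodeGfx10(0, 7, 63) == 16240); // s_waitcnt vmcnt(0) on gfx10
  return 0;
}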