Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -345,7 +345,7 @@
   void incIterCnt() { IterCnt++; }
   void resetIterCnt() { IterCnt = 0; }
-  int32_t getIterCnt() { return IterCnt; }
+  unsigned getIterCnt() { return IterCnt; }
 
   void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
   MachineInstr *getWaitcnt() const { return LfWaitcnt; }
 
@@ -1205,7 +1205,7 @@
         }
         ScoreBracket->setRevisitLoop(true);
         LLVM_DEBUG(dbgs()
-                       << "set-revisit: Block"
+                       << "set-revisit2: Block"
                        << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
     }
@@ -1639,10 +1639,9 @@
   }
 }
 
-/// Return true if the given basic block is a "bottom" block of a loop. This
-/// differs from MachineLoop::getBottomBlock in that it works even if the loop
-/// is discontiguous. This also handles multiple back-edges for the same
-/// "header" block of a loop.
+/// Return true if the given basic block is a "bottom" block of a loop.
+/// This works even if the loop is discontiguous. This also handles
+/// multiple back-edges for the same "header" block of a loop.
 bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
                                     const MachineBasicBlock *Block) {
   for (MachineBasicBlock *MBB : Loop->blocks()) {
@@ -1776,11 +1775,12 @@
     LLVM_DEBUG(dbgs() << '\n';);
 
     // The iterative waitcnt insertion algorithm aims for optimal waitcnt
-    // placement and doesn't always guarantee convergence for a loop. Each
-    // loop should take at most 2 iterations for it to converge naturally.
-    // When this max is reached and result doesn't converge, we force
-    // convergence by inserting a s_waitcnt at the end of loop footer.
-    if (WaitcntData->getIterCnt() > 2) {
+    // placement, but doesn't guarantee convergence for a loop. Each
+    // loop should take at most (n+1) iterations for it to converge naturally,
+    // where n is the number of bottom blocks. If this threshold is reached and
+    // the result hasn't converged, then we force convergence by inserting
+    // a s_waitcnt at the end of loop footer.
+    if (WaitcntData->getIterCnt() > (countNumBottomBlocks(ContainingLoop) + 1)) {
       // To ensure convergence, need to make wait events at loop footer be no
       // more than those from the previous iteration.
       // As a simplification, instead of tracking individual scores and
@@ -1792,16 +1792,16 @@
         if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
           ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
           HasPending = true;
+          break;
         }
       }
 
       if (HasPending) {
         if (!SWaitInst) {
-          SWaitInst = Block.getParent()->CreateMachineInstr(
-              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+          SWaitInst = BuildMI(Block, Block.getFirstNonPHI(),
+                              DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                          .addImm(0);
           TrackedWaitcntSet.insert(SWaitInst);
-          const MachineOperand &Op = MachineOperand::CreateImm(0);
-          SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
           OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
           OutputTransformAdd(SWaitInst, context);
@@ -1898,7 +1898,7 @@
       if ((std::count(BlockWaitcntProcessedSet.begin(),
                       BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
         BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-        LLVM_DEBUG(dbgs() << "set-revisit: Block"
+        LLVM_DEBUG(dbgs() << "set-revisit1: Block"
                           << ContainingLoop->getHeader()->getNumber() << '\n';);
       }
     }
@@ -1906,7 +1906,7 @@
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
-    // Flag that waitcnts have been processed at least once.
+    // Record that waitcnts have been processed at least once for this block.
    BlockWaitcntProcessedSet.push_back(&MBB);
 
     // See if we want to revisit the loop. If a loop has multiple back-edges,
@@ -2004,8 +2004,12 @@
     // TODO: Could insert earlier and schedule more liberally with operations
     // that only use caller preserved registers.
     MachineBasicBlock &EntryBB = MF.front();
-    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-        .addImm(0);
+    auto SWaitInst = BuildMI(EntryBB, EntryBB.getFirstNonPHI(),
+                             DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+                         .addImm(0);
+
+    LLVM_DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                      << "New Instr: " << *SWaitInst << '\n');
 
     Modified = true;
   }
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
@@ -57,3 +57,37 @@
     EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
     S_ENDPGM
 ...
+---
+
+# GCN-LABEL: name: waitcnt-multiple-back-edges{{$}}
+# GCN: bb.0:
+# GCN: S_WAITCNT 0
+# GCN-NEXT: S_BRANCH %bb.2
+
+name: waitcnt-multiple-back-edges
+body: |
+  bb.0:
+    S_BRANCH %bb.2
+
+  bb.1:
+    S_BRANCH %bb.2
+
+  bb.2:
+    S_CBRANCH_VCCZ %bb.1, implicit $vcc
+
+  bb.3:
+    S_CBRANCH_VCCNZ %bb.5, implicit $vcc
+
+  bb.4:
+    BUFFER_ATOMIC_ADD_OFFSET renamable $vgpr0, renamable $sgpr12_sgpr13_sgpr14_sgpr15, 0, 4, 0, implicit $exec
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.5:
+    S_CBRANCH_SCC0 %bb.2, implicit $scc
+    S_BRANCH %bb.6
+
+  bb.6:
+    S_CBRANCH_SCC1 %bb.0, implicit $scc
+    S_ENDPGM
+...
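
Note on the convergence threshold (commentary, not part of the patch): the new check calls countNumBottomBlocks(ContainingLoop), whose definition lies outside the hunks shown here. As a rough illustration only, a helper with that behavior could count the loop blocks that branch back to the loop header, i.e. the back-edges the patch's comment refers to as "bottom" blocks. The sketch below is an assumption about its shape, not the in-tree implementation.

#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
using namespace llvm;

// Sketch only: count the blocks of Loop that have the loop header as a
// successor, i.e. the number of "bottom" blocks carrying back-edges.
static unsigned countNumBottomBlocksSketch(const MachineLoop *Loop) {
  unsigned Count = 0;
  for (MachineBasicBlock *MBB : Loop->blocks())
    for (MachineBasicBlock *Succ : MBB->successors())
      if (Succ == Loop->getHeader()) {
        ++Count;
        break; // Count each bottom block once, even with multiple successors.
      }
  return Count;
}

The new waitcnt-multiple-back-edges MIR test exercises this case: several blocks branch back to the same header, so the convergence threshold now scales with the number of bottom blocks instead of being fixed at 2 iterations.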