Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -115,11 +115,11 @@ (w) = (enum WaitEventType)((w) + 1)) // This is a per-basic-block object that maintains current score brackets -// of each wait-counter, and a per-register scoreboard for each wait-couner. +// of each wait counter, and a per-register scoreboard for each wait counter. // We also maintain the latest score for every event type that can change the // waitcnt in order to know if there are multiple types of events within // the brackets. When multiple types of event happen in the bracket, -// wait-count may get decreased out of order, therefore we need to put in +// wait count may get decreased out of order, therefore we need to put in // "s_waitcnt 0" before use. class BlockWaitcntBrackets { public: @@ -690,7 +690,7 @@ setScoreLB(T, getScoreUB(T)); } else if (counterOutOfOrder(T)) { // Counter can get decremented out-of-order when there - // are multiple types event in the brack. Also emit an s_wait counter + // are multiple types event in the bracket. Also emit an s_wait counter // with a conservative value of 0 for the counter. NeedWait = CNT_MASK(T); setScoreLB(T, getScoreUB(T)); @@ -1301,27 +1301,37 @@ } } +// Merge the score brackets of the Block's predecessors; +// this merged score bracket is used when adding waitcnts to the Block void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) { BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get(); int32_t MaxPending[NUM_INST_CNTS] = {0}; int32_t MaxFlat[NUM_INST_CNTS] = {0}; bool MixedExpTypes = false; - // Clear the score bracket state. - ScoreBrackets->clear(); - - // Compute the number of pending elements on block entry. + // For single basic block loops, we need to retain the Block's + // score bracket to have accurate Pred info. 
So, make a copy of Block's + // score bracket, clear() it (which retains several important bits of info), + // populate, and then replace en masse. For non-single basic block loops, + // just clear Block's current score bracket and repopulate in-place. + bool IsSelfPred; + std::unique_ptr<BlockWaitcntBrackets> S; + + IsSelfPred = (std::find(Block.pred_begin(), Block.pred_end(), &Block)) + != Block.pred_end(); + if (IsSelfPred) { + S = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + ScoreBrackets = S.get(); + } - // IMPORTANT NOTE: If iterative handling of loops is added, the code will - // need to handle single BBs with backedges to themselves. This means that - // they will need to retain and not clear their initial state. + ScoreBrackets->clear(); // See if there are any uninitialized predecessors. If so, emit an // s_waitcnt 0 at the beginning of the block. - for (MachineBasicBlock *pred : Block.predecessors()) { + for (MachineBasicBlock *Pred : Block.predecessors()) { BlockWaitcntBrackets *PredScoreBrackets = - BlockWaitcntBracketsMap[pred].get(); - bool Visited = BlockVisitedSet.count(pred); + BlockWaitcntBracketsMap[Pred].get(); + bool Visited = BlockVisitedSet.count(Pred); if (!Visited || PredScoreBrackets->getWaitAtBeginning()) { continue; } @@ -1550,6 +1560,12 @@ } } } + + // If this is a single basic block loop, update the score brackets. Not + // needed for other blocks, as we did this in-place. + if (IsSelfPred) { + BlockWaitcntBracketsMap[&Block] = llvm::make_unique<BlockWaitcntBrackets>(*ScoreBrackets); + } } /// Return the "bottom" block of a loop. 
This differs from Index: test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/waitcnt-loop-single-basic-block.mir @@ -0,0 +1,26 @@ +# RUN: llc -o - %s -march=amdgcn -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s + +# Check that the waitcnt propagates info in the case of a single basic block loop + +# GCN-LABEL: waitcnt-loop-single-basic-block +# GCN: bb.0 +# GCN: S_WAITCNT 3952 +# GCN-NEXT: GLOBAL_STORE_DWORD +# GCN: S_WAITCNT 3953 +# GCN-NEXT: GLOBAL_STORE_DWORD + +... +name: waitcnt-loop-single-basic-block +body: | + bb.0: + S_BRANCH %bb.1 + bb.1: + GLOBAL_STORE_DWORD $vgpr7_vgpr8, $vgpr11, 0, 0, 0, implicit $exec + $vgpr21 = GLOBAL_LOAD_DWORD $vgpr4_vgpr5, 0, 0, 0, implicit $exec + $vgpr10 = GLOBAL_LOAD_DWORD $vgpr10_vgpr11, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD $vgpr14_vgpr15, $vgpr21, 0, 0, 0, implicit $exec + $vgpr11 = GLOBAL_LOAD_DWORD $vgpr11_vgpr12, 0, 0, 0, implicit $exec + S_CBRANCH_SCC1 %bb.1, implicit $scc + bb.2: + S_ENDPGM +...