Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -367,7 +367,7 @@
   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
       BlockWaitcntBracketsMap;
 
-  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+  std::vector<MachineBasicBlock *> BlockWaitcntProcessedSet;
 
   DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
 
@@ -403,7 +403,8 @@
   void updateEventWaitCntAfter(MachineInstr &Inst,
                                BlockWaitcntBrackets *ScoreBrackets);
   void mergeInputScoreBrackets(MachineBasicBlock &Block);
-  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+  bool isLoopBottom(const MachineLoop *Loop, const MachineBasicBlock *Block);
+  unsigned countNumBottomBlocks(const MachineLoop *Loop);
   void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
   void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
   bool isWaitcntStronger(unsigned LHS, unsigned RHS);
@@ -1568,15 +1569,29 @@
   }
 }
 
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
-  MachineBasicBlock *Bottom = Loop->getHeader();
-  for (MachineBasicBlock *MBB : Loop->blocks())
-    if (MBB->getNumber() > Bottom->getNumber())
-      Bottom = MBB;
-  return Bottom;
+/// Return true if the given basic block is a "bottom" block of a loop. This
+/// differs from MachineLoop::getBottomBlock in that it works even if the loop
+/// is discontiguous. This also handles multiple back-edges for the same
+/// "header" block of a loop.
+bool SIInsertWaitcnts::isLoopBottom(const MachineLoop *Loop,
+                                    const MachineBasicBlock *Block) {
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    if (MBB == Block && MBB->isSuccessor(Loop->getHeader())) {
+      return true;
+    }
+  }
+  return false;
+}
+
+/// Count the number of "bottom" basic blocks of a loop.
+unsigned SIInsertWaitcnts::countNumBottomBlocks(const MachineLoop *Loop) {
+  unsigned Count = 0;
+  for (MachineBasicBlock *MBB : Loop->blocks()) {
+    if (MBB->isSuccessor(Loop->getHeader())) {
+      Count++;
+    }
+  }
+  return Count;
 }
 
 // Generate s_waitcnt instructions where needed.
@@ -1685,7 +1700,7 @@
 
   // Check if we need to force convergence at loop footer.
   MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
-  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+  if (ContainingLoop && isLoopBottom(ContainingLoop, &Block)) {
     LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
     WaitcntData->print();
     DEBUG(dbgs() << '\n';);
@@ -1773,6 +1788,7 @@
   TrackedWaitcntSet.clear();
   BlockVisitedSet.clear();
   VCCZBugHandledSet.clear();
+  LoopWaitcntDataMap.clear();
 
   // Walk over the blocks in reverse post-dominator order, inserting
   // s_waitcnt where needed.
@@ -1799,21 +1815,30 @@
     // If we are walking into the block from before the loop, then guarantee
     // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
-    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
-        (!BlockWaitcntProcessedSet.count(&MBB))) {
-      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
-      DEBUG(dbgs() << "set-revisit: Block"
-                   << ContainingLoop->getHeader()->getNumber() << '\n';);
+    if (ContainingLoop && ContainingLoop->getHeader() == &MBB) {
+      unsigned Count = countNumBottomBlocks(ContainingLoop);
+
+      // If the loop has multiple back-edges, and so more than one "bottom"
+      // basic block, we have to guarantee a re-walk over every block.
+      if ((std::count(BlockWaitcntProcessedSet.begin(),
+                      BlockWaitcntProcessedSet.end(), &MBB) < Count)) {
+        BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+        DEBUG(dbgs() << "set-revisit: Block"
+                     << ContainingLoop->getHeader()->getNumber() << '\n';);
+      }
     }
 
     // Walk over the instructions.
     insertWaitcntInBlock(MF, MBB);
 
     // Flag that waitcnts have been processed at least once.
-    BlockWaitcntProcessedSet.insert(&MBB);
+    BlockWaitcntProcessedSet.push_back(&MBB);
 
-    // See if we want to revisit the loop.
-    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+    // See if we want to revisit the loop. If a loop has multiple back-edges,
+    // we shouldn't revisit the same "bottom" basic block.
+    if (ContainingLoop && isLoopBottom(ContainingLoop, &MBB) &&
+        std::count(BlockWaitcntProcessedSet.begin(),
+                   BlockWaitcntProcessedSet.end(), &MBB) == 1) {
       MachineBasicBlock *EntryBB = ContainingLoop->getHeader();
       BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
       if (EntrySB && EntrySB->getRevisitLoop()) {
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-back-edge-loop.mir
@@ -0,0 +1,59 @@
+# RUN: llc -o - %s -march=amdgcn -mcpu=fiji -run-pass=si-insert-waitcnts -verify-machineinstrs | FileCheck -check-prefix=GCN %s
+
+# GCN-LABEL: waitcnt-back-edge-loop
+# GCN: bb.2
+# GCN: S_WAITCNT 112
+# GCN: $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+
+---
+name: waitcnt-back-edge-loop
+body: |
+  bb.0:
+    successors: %bb.1
+
+    $vgpr1 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
+    $vgpr2 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr1_vgpr2
+    $vgpr4 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr0 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $sgpr0_sgpr1 = V_CMP_EQ_U32_e64 3, killed $sgpr4, implicit $exec
+    $vgpr3 = V_CNDMASK_B32_e64 -1082130432, 1065353216, killed $sgpr0_sgpr1, implicit $exec
+    $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec
+    S_BRANCH %bb.1
+
+  bb.3:
+    successors: %bb.1
+
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+
+  bb.1:
+    successors: %bb.5, %bb.2
+
+    $vgpr5 = V_CVT_I32_F32_e32 killed $vgpr5, implicit $exec
+    V_CMP_NE_U32_e32 0, $vgpr5, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.5, implicit killed $vcc
+
+  bb.2:
+    successors: %bb.4, %bb.3
+
+    V_CMP_EQ_U32_e32 9, killed $vgpr5, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    S_CBRANCH_VCCZ %bb.3, implicit killed $vcc
+
+  bb.4:
+    successors: %bb.3, %bb.1
+
+    $vgpr5 = FLAT_LOAD_DWORD $vgpr1_vgpr2, 0, 0, 0, implicit $exec, implicit $flat_scr :: (volatile load 4 from `float addrspace(1)* null`, addrspace 1)
+    $vgpr4 = V_CVT_I32_F32_e32 $vgpr5, implicit $exec
+    V_CMP_EQ_U32_e32 2, killed $vgpr4, implicit-def $vcc, implicit $exec
+    $vcc = S_AND_B64 $exec, killed $vcc, implicit-def dead $scc
+    $vgpr4 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec
+    S_CBRANCH_VCCZ %bb.1, implicit killed $vcc
+    S_BRANCH %bb.3
+
+  bb.5:
+
+    $vgpr4 = V_MAC_F32_e32 killed $vgpr0, killed $vgpr3, killed $vgpr4, implicit $exec
+    EXP_DONE 12, killed $vgpr4, undef $vgpr0, undef $vgpr0, undef $vgpr0, 0, 0, 15, implicit $exec
+    S_ENDPGM
+...
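
As a quick reference for the change above: isLoopBottom() treats a block as a "bottom" of its loop when the loop header is among its successors (i.e. it carries a back-edge), and countNumBottomBlocks() counts those blocks so the pass knows how many passes over the header to expect. What follows is a minimal standalone sketch of that idea on a toy adjacency-list CFG shaped roughly like the loop in waitcnt-back-edge-loop.mir. ToyLoop, Block, and the hard-coded graph are illustrative assumptions for this sketch only, not LLVM's MachineLoop/MachineBasicBlock API, and the code is not part of the patch.

// Standalone illustration (not LLVM code) of the bottom-block counting idea.
#include <algorithm>
#include <cstdio>
#include <vector>

using Block = unsigned;

struct ToyLoop {
  Block Header;                               // loop header block
  std::vector<Block> Blocks;                  // all blocks in the loop
  std::vector<std::vector<Block>> Successors; // successor lists, indexed by block
};

// A block is a "bottom" block if it belongs to the loop and has the header
// among its successors, i.e. it ends a back-edge.
static bool isLoopBottom(const ToyLoop &L, Block B) {
  if (std::find(L.Blocks.begin(), L.Blocks.end(), B) == L.Blocks.end())
    return false;
  const std::vector<Block> &Succs = L.Successors[B];
  return std::find(Succs.begin(), Succs.end(), L.Header) != Succs.end();
}

// A loop with several back-edges has several bottom blocks.
static unsigned countNumBottomBlocks(const ToyLoop &L) {
  unsigned Count = 0;
  for (Block B : L.Blocks)
    if (isLoopBottom(L, B))
      ++Count;
  return Count;
}

int main() {
  // Rough shape of the loop in waitcnt-back-edge-loop.mir: bb.1 is the
  // header, and both bb.3 and bb.4 branch back to it.
  ToyLoop L;
  L.Header = 1;
  L.Blocks = {1, 2, 3, 4};
  L.Successors = {
      /*bb.0*/ {1},
      /*bb.1*/ {5, 2},
      /*bb.2*/ {4, 3},
      /*bb.3*/ {1},
      /*bb.4*/ {3, 1},
  };
  std::printf("bottom blocks: %u\n", countNumBottomBlocks(L)); // prints 2
  return 0;
}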
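
Design note, as the patch reads: switching BlockWaitcntProcessedSet from a DenseSet to a std::vector lets the pass count visits instead of merely recording them. std::count() over the vector gives how many times the loop header has already been processed; the header keeps setting the revisit flag while that count is below countNumBottomBlocks(ContainingLoop), so a loop with two back-edges, like the one in the test above, gets a re-walk for each bottom block. The "== 1" guard at a bottom block keeps the same bottom block from triggering another revisit of the loop on later walks.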