Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -874,13 +874,13 @@
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
+  addPass(createSIMemoryLegalizerPass());
   if (EnableSIInsertWaitcntsPass)
     addPass(createSIInsertWaitcntsPass());
   else
     addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
-  addPass(createSIMemoryLegalizerPass());
   addPass(createSIDebuggerInsertNopsPass());
   addPass(&BranchRelaxationPassID);
 }
Index: llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -361,7 +361,7 @@
   AMDGPUAS AMDGPUASI;
 
   DenseSet<MachineBasicBlock *> BlockVisitedSet;
-  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+  DenseSet<MachineInstr *> TrackedWaitcntSet;
   DenseSet<MachineInstr *> VCCZBugHandledSet;
 
   DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
@@ -1114,7 +1114,7 @@
         BlockWaitcntBrackets *ScoreBracket =
             BlockWaitcntBracketsMap[TBB].get();
         if (!ScoreBracket) {
-          assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+          assert(!BlockVisitedSet.count(TBB));
           BlockWaitcntBracketsMap[TBB] =
               llvm::make_unique<BlockWaitcntBrackets>();
           ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
@@ -1132,7 +1132,7 @@
       } else {
         SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
                                           MI.getDebugLoc());
-        CompilerGeneratedWaitcntSet.insert(SWaitInst);
+        TrackedWaitcntSet.insert(SWaitInst);
       }
 
       const MachineOperand &Op =
@@ -1267,7 +1267,7 @@
   for (MachineBasicBlock *pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[pred].get();
-    bool Visited = BlockVisitedSet.find(pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1306,7 +1306,7 @@
   for (MachineBasicBlock *Pred : Block.predecessors()) {
     BlockWaitcntBrackets *PredScoreBrackets =
         BlockWaitcntBracketsMap[Pred].get();
-    bool Visited = BlockVisitedSet.find(Pred) != BlockVisitedSet.end();
+    bool Visited = BlockVisitedSet.count(Pred);
     if (!Visited || PredScoreBrackets->getWaitAtBeginning()) {
       continue;
     }
@@ -1354,7 +1354,7 @@
 
   // Set the register scoreboard.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1468,7 +1468,7 @@
   // sequencing predecessors, because changes to EXEC require waitcnts due to
   // the delayed nature of these operations.
   for (MachineBasicBlock *Pred : Block.predecessors()) {
-    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end()) {
+    if (!BlockVisitedSet.count(Pred)) {
       continue;
     }
 
@@ -1530,8 +1530,7 @@
     if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
       // TODO: Register the old waitcnt and optimize the following waitcnts.
       // Leaving the previously existing waitcnts is conservatively correct.
-      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
-          CompilerGeneratedWaitcntSet.end())
+      if (!TrackedWaitcntSet.count(&Inst))
        ++Iter;
      else {
        ScoreBrackets->setWaitcnt(&Inst);
@@ -1550,7 +1549,7 @@
 
    bool VCCZBugWorkAround = false;
    if (readsVCCZ(Inst) &&
-        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+        (!VCCZBugHandledSet.count(&Inst))) {
      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
              ScoreBrackets->getScoreUB(LGKM_CNT) &&
          ScoreBrackets->hasPendingSMEM()) {
@@ -1564,11 +1563,29 @@
 
    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
 
    if (SWaitInst) {
-      Block.insert(Inst, SWaitInst);
-      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
-        DEBUG(dbgs() << "insertWaitcntInBlock\n"
-                     << "Old Instr: " << Inst << '\n'
-                     << "New Instr: " << *SWaitInst << '\n';);
+      // We don't (yet) track waitcnts that existed prior to the waitcnt
+      // pass (we just skip over them); because the waitcnt pass is ignorant
+      // of them, it may insert a redundant waitcnt. To avoid this, check
+      // the prev instr. If it and the to-be-inserted waitcnt are the
+      // same, keep the prev waitcnt and skip the insertion. We assume that
+      // whoever inserted the prev waitcnt, e.g., the memory legalizer,
+      // really wants it there.
+      bool insertSWaitInst = true;
+      if (Iter != Block.begin()) {
+        MachineInstr *MIPrevInst = &*std::prev(Iter);
+        if (MIPrevInst &&
+            MIPrevInst->getOpcode() == AMDGPU::S_WAITCNT &&
+            MIPrevInst->getOperand(0).getImm() == SWaitInst->getOperand(0).getImm()) {
+          insertSWaitInst = false;
+        }
+      }
+      if (insertSWaitInst) {
+        Block.insert(Inst, SWaitInst);
+        if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+          DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                       << "Old Instr: " << Inst << '\n'
+                       << "New Instr: " << *SWaitInst << '\n';);
+        }
      }
    }
@@ -1656,7 +1673,7 @@
    if (!SWaitInst) {
      SWaitInst = Block.getParent()->CreateMachineInstr(
          TII->get(AMDGPU::S_WAITCNT), DebugLoc());
-      CompilerGeneratedWaitcntSet.insert(SWaitInst);
+      TrackedWaitcntSet.insert(SWaitInst);
      const MachineOperand &Op = MachineOperand::CreateImm(0);
      SWaitInst->addOperand(MF, Op);
 #if 0 // TODO: Format the debug output
@@ -1712,6 +1729,10 @@
  RegisterEncoding.SGPRL =
      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
 
+  TrackedWaitcntSet.clear();
+  BlockVisitedSet.clear();
+  VCCZBugHandledSet.clear();
+
  // Walk over the blocks in reverse post-dominator order, inserting
  // s_waitcnt where needed.
  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
@@ -1738,8 +1759,7 @@
    // at least 1 re-walk over the loop to propagate the information, even if
    // no S_WAITCNT instructions were generated.
    if (ContainingLoop && ContainingLoop->getHeader() == &MBB && J < I &&
-        (BlockWaitcntProcessedSet.find(&MBB) ==
-         BlockWaitcntProcessedSet.end())) {
+        (!BlockWaitcntProcessedSet.count(&MBB))) {
      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
      DEBUG(dbgs() << "set-revisit: block"
                   << ContainingLoop->getHeader()->getNumber() << '\n';);
Index: llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
+++ llvm/trunk/test/CodeGen/AMDGPU/waitcnt-no-redundant.mir
@@ -0,0 +1,24 @@
+# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s
+
+# Check that the waitcnt pass does *not* insert a redundant waitcnt instr.
+# In this testcase, ensure that the pass does not insert redundant S_WAITCNT 127
+# or S_WAITCNT 3952
+
+...
+# CHECK-LABEL: name: waitcnt-no-redundant
+# CHECK: DS_READ_B64
+# CHECK-NEXT: S_WAITCNT 127
+# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP
+# CHECK-NEXT: S_WAITCNT 3952
+# CHECK-NEXT: BUFFER_WBINVL1_VOL
+
+name: waitcnt-no-redundant
+body: |
+  bb.0:
+    renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec
+    S_WAITCNT 127
+    FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr
+    S_WAITCNT 3952
+    BUFFER_WBINVL1_VOL implicit $exec
+    S_ENDPGM
+...
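
Reviewer note, not part of the patch: the standalone sketch below only illustrates the de-duplication rule that the new insertSWaitInst logic applies, using a hypothetical ToyInst list in place of MachineInstr/MachineBasicBlock. ToyInst, shouldInsertWaitcnt, and everything else in it are assumptions made for the example; none of it is LLVM API.

// Illustrative sketch only: a toy model of "skip inserting an S_WAITCNT when
// the previous instruction is already an identical S_WAITCNT". ToyInst and
// shouldInsertWaitcnt are hypothetical stand-ins, not LLVM APIs.
#include <iostream>
#include <iterator>
#include <list>
#include <string>

struct ToyInst {
  std::string Opcode; // e.g. "S_WAITCNT", "FLAT_ATOMIC_CMPSWAP"
  int Imm;            // immediate operand; only meaningful for "S_WAITCNT"
};

// Decide whether a waitcnt with immediate Imm still needs to be inserted
// before the instruction at It. Mirrors the patch: if the instruction just
// before It is an S_WAITCNT with the same immediate (e.g. one placed by the
// memory legalizer), keep that one and skip the insertion.
bool shouldInsertWaitcnt(const std::list<ToyInst> &Block,
                         std::list<ToyInst>::const_iterator It, int Imm) {
  if (It == Block.begin())
    return true; // nothing precedes us, so nothing can be redundant
  const ToyInst &Prev = *std::prev(It);
  return !(Prev.Opcode == "S_WAITCNT" && Prev.Imm == Imm);
}

int main() {
  // Shape of the new test case: a pre-existing S_WAITCNT 127 already sits
  // right before the atomic, so no second waitcnt should be added there.
  std::list<ToyInst> Block = {{"DS_READ_B64", 0},
                              {"S_WAITCNT", 127},
                              {"FLAT_ATOMIC_CMPSWAP", 0}};
  auto It = std::next(Block.begin(), 2); // position of FLAT_ATOMIC_CMPSWAP
  std::cout << (shouldInsertWaitcnt(Block, It, 127) ? "insert" : "skip")
            << '\n'; // prints "skip"
  return 0;
}

The real pass performs the same comparison on MachineInstr operands via getOperand(0).getImm(); the toy list just makes the control flow of the check visible in isolation.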