Index: lib/Target/AMDGPU/AMDGPUTargetMachine.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -874,13 +874,13 @@ // cases. addPass(&PostRAHazardRecognizerID); + addPass(createSIMemoryLegalizerPass()); if (EnableSIInsertWaitcntsPass) addPass(createSIInsertWaitcntsPass()); else addPass(createSIInsertWaitsPass()); addPass(createSIShrinkInstructionsPass()); addPass(&SIInsertSkipsPassID); - addPass(createSIMemoryLegalizerPass()); addPass(createSIDebuggerInsertNopsPass()); addPass(&BranchRelaxationPassID); } Index: lib/Target/AMDGPU/SIInsertWaitcnts.cpp =================================================================== --- lib/Target/AMDGPU/SIInsertWaitcnts.cpp +++ lib/Target/AMDGPU/SIInsertWaitcnts.cpp @@ -361,7 +361,7 @@ AMDGPUAS AMDGPUASI; DenseSet BlockVisitedSet; - DenseSet CompilerGeneratedWaitcntSet; + DenseSet TrackedWaitcntSet; DenseSet VCCZBugHandledSet; DenseMap> @@ -1132,7 +1132,7 @@ } else { SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT), MI.getDebugLoc()); - CompilerGeneratedWaitcntSet.insert(SWaitInst); + TrackedWaitcntSet.insert(SWaitInst); } const MachineOperand &Op = @@ -1530,8 +1530,8 @@ if (Inst.getOpcode() == AMDGPU::S_WAITCNT) { // TODO: Register the old waitcnt and optimize the following waitcnts. // Leaving the previously existing waitcnts is conservatively correct. - if (CompilerGeneratedWaitcntSet.find(&Inst) == - CompilerGeneratedWaitcntSet.end()) + if (TrackedWaitcntSet.find(&Inst) == + TrackedWaitcntSet.end()) ++Iter; else { ScoreBrackets->setWaitcnt(&Inst); @@ -1564,11 +1564,29 @@ MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets); if (SWaitInst) { - Block.insert(Inst, SWaitInst); - if (ScoreBrackets->getWaitcnt() != SWaitInst) { - DEBUG(dbgs() << "insertWaitcntInBlock\n" - << "Old Instr: " << Inst << '\n' - << "New Instr: " << *SWaitInst << '\n';); + // We don't (yet) track waitcnts that existed prior to the waitcnt + // pass (we just skip over them); because the waitcnt pass is ignorant + // of them, it may insert a redundant waitcnt. To avoid this, check + // the prev instr. If it and the to-be-inserted waitcnt are the + // same, keep the prev waitcnt and skip the insertion. We assume that + // whomever. e.g., for memory model, inserted the prev waitcnt really + // wants it there. + bool insertSWaitInst = true; + if (Iter != Block.begin()) { + MachineInstr *MIPrevInst = &*std::prev(Iter); + if (MIPrevInst && + MIPrevInst->getOpcode() == AMDGPU::S_WAITCNT && + MIPrevInst->getOperand(0).getImm() == SWaitInst->getOperand(0).getImm()) { + insertSWaitInst = false; + } + } + if (insertSWaitInst) { + Block.insert(Inst, SWaitInst); + if (ScoreBrackets->getWaitcnt() != SWaitInst) { + DEBUG(dbgs() << "insertWaitcntInBlock\n" + << "Old Instr: " << Inst << '\n' + << "New Instr: " << *SWaitInst << '\n';); + } } } @@ -1656,7 +1674,7 @@ if (!SWaitInst) { SWaitInst = Block.getParent()->CreateMachineInstr( TII->get(AMDGPU::S_WAITCNT), DebugLoc()); - CompilerGeneratedWaitcntSet.insert(SWaitInst); + TrackedWaitcntSet.insert(SWaitInst); const MachineOperand &Op = MachineOperand::CreateImm(0); SWaitInst->addOperand(MF, Op); #if 0 // TODO: Format the debug output @@ -1712,6 +1730,10 @@ RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1; + TrackedWaitcntSet.clear(); + BlockVisitedSet.clear(); + VCCZBugHandledSet.clear(); + // Walk over the blocks in reverse post-dominator order, inserting // s_waitcnt where needed. ReversePostOrderTraversal RPOT(&MF); Index: test/CodeGen/AMDGPU/waitcnt-no-redundant.mir =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/waitcnt-no-redundant.mir @@ -0,0 +1,24 @@ +# RUN: llc -mtriple=amdgcn -verify-machineinstrs -run-pass si-insert-waitcnts -o - %s | FileCheck %s + +# Check that the waitcnt pass does *not* insert a redundant waitcnt instr. +# In this testcase, ensure that pass does not insert redundant S_WAITCNT 127 +# or S_WAITCNT 3952 + +... +# CHECK-LABEL: name: waitcnt-no-redundant +# CHECK: DS_READ_B64 +# CHECK-NEXT: S_WAITCNT 127 +# CHECK-NEXT: FLAT_ATOMIC_CMPSWAP +# CHECK-NEXT: S_WAITCNT 3952 +# CHECK-NEXT: BUFFER_WBINVL1_VOL + +name: waitcnt-no-redundant +body: | + bb.0: + renamable $vgpr0_vgpr1 = DS_READ_B64 killed renamable $vgpr0, 0, 0, implicit $m0, implicit $exec + S_WAITCNT 127 + FLAT_ATOMIC_CMPSWAP killed renamable $vgpr0_vgpr1, killed renamable $vgpr3_vgpr4, 0, 0, implicit $exec, implicit $flat_scr + S_WAITCNT 3952 + BUFFER_WBINVL1_VOL implicit $exec + S_ENDPGM +...