diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -207,9 +207,18 @@ return NoHazard; } -static void insertNoopInBundle(MachineInstr *MI, const SIInstrInfo &TII) { - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) - .addImm(0); +static void insertNoopsInBundle(MachineInstr *MI, const SIInstrInfo &TII, + unsigned Quantity) { + while (Quantity > 0) { + unsigned Arg; + if (Quantity >= 8) + Arg = 7; + else + Arg = Quantity - 1; + Quantity -= Arg + 1; + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII.get(AMDGPU::S_NOP)) + .addImm(Arg); + } } void GCNHazardRecognizer::processBundle() { @@ -220,11 +229,11 @@ CurrCycleInstr = &*MI; unsigned WaitStates = PreEmitNoopsCommon(CurrCycleInstr); - if (IsHazardRecognizerMode) + if (IsHazardRecognizerMode) { fixHazards(CurrCycleInstr); - for (unsigned i = 0; i < WaitStates; ++i) - insertNoopInBundle(CurrCycleInstr, TII); + insertNoopsInBundle(CurrCycleInstr, TII, WaitStates); + } // It’s unnecessary to track more than MaxLookAhead instructions. Since we // include the bundled MI directly after, only add a maximum of diff --git a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir --- a/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir +++ b/llvm/test/CodeGen/AMDGPU/hazard-in-bundle.mir @@ -64,3 +64,21 @@ } S_ENDPGM 0 ... + +# GCN-LABEL: name: vmem_vcc_hazard_in_bundle +# GCN: S_LOAD_DWORDX2_IMM +# GCN-NEXT: S_NOP 3 +# GCN: BUFFER_LOAD_DWORD_OFFEN +--- +name: vmem_vcc_hazard_in_bundle +body: | + bb.0: + BUNDLE { + $sgpr0_sgpr1_sgpr2_sgpr3 = IMPLICIT_DEF + $vgpr0 = IMPLICIT_DEF + $vgpr1 = V_ADDC_U32_e32 $vgpr0, $vgpr0, implicit-def $vcc, implicit $vcc, implicit $exec + $sgpr0_sgpr1 = S_LOAD_DWORDX2_IMM $sgpr10_sgpr11, 0, 0, 0 + $vgpr1 = BUFFER_LOAD_DWORD_OFFEN $vgpr0, $sgpr0_sgpr1_sgpr2_sgpr3, $vcc_lo, 0, 0, 0, 0, 0, 0, implicit $exec + } + S_ENDPGM 0 +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.p.ll @@ -11,7 +11,6 @@ ; LOOP: s_mov_b32 m0, 0{{$}} ; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: ; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 -; GFX8-NEXT: s_nop 0 ; LOOP-NEXT: ds_gws_sema_p gds ; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.release.all.ll @@ -16,7 +16,6 @@ ; LOOP: s_mov_b32 m0, 0{{$}} ; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: ; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 -; GFX8-NEXT: s_nop 0 ; LOOP-NEXT: ds_gws_sema_release_all gds ; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.gws.sema.v.ll @@ -11,7 +11,6 @@ ; LOOP: s_mov_b32 m0, 0{{$}} ; LOOP: [[LOOP:BB[0-9]+_[0-9]+]]: ; LOOP-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_TRAPSTS, 8, 1), 0 -; GFX8-NEXT: s_nop 0 ; LOOP-NEXT: ds_gws_sema_v gds ; LOOP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; LOOP-NEXT: s_getreg_b32 [[GETREG:s[0-9]+]], hwreg(HW_REG_TRAPSTS, 8, 1)