diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -138,67 +138,71 @@ ScheduleHazardRecognizer::HazardType GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) { MachineInstr *MI = SU->getInstr(); + // If we are not in "HazardRecognizerMode" and therefore not being run from + // the scheduler, track possible stalls from hazards but don't insert noops. + auto HazardType = IsHazardRecognizerMode ? NoopHazard : Hazard; + if (MI->isBundle()) return NoHazard; if (SIInstrInfo::isSMRD(*MI) && checkSMRDHazards(MI) > 0) - return NoopHazard; + return HazardType; // FIXME: Should flat be considered vmem? if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI)) && checkVMEMHazards(MI) > 0) - return NoopHazard; + return HazardType; if (ST.hasNSAtoVMEMBug() && checkNSAtoVMEMHazard(MI) > 0) - return NoopHazard; + return HazardType; if (checkFPAtomicToDenormModeHazard(MI) > 0) - return NoopHazard; + return HazardType; if (ST.hasNoDataDepHazard()) return NoHazard; if (SIInstrInfo::isVALU(*MI) && checkVALUHazards(MI) > 0) - return NoopHazard; + return HazardType; if (SIInstrInfo::isDPP(*MI) && checkDPPHazards(MI) > 0) - return NoopHazard; + return HazardType; if (isDivFMas(MI->getOpcode()) && checkDivFMasHazards(MI) > 0) - return NoopHazard; + return HazardType; if (isRWLane(MI->getOpcode()) && checkRWLaneHazards(MI) > 0) - return NoopHazard; + return HazardType; if (isSGetReg(MI->getOpcode()) && checkGetRegHazards(MI) > 0) - return NoopHazard; + return HazardType; if (isSSetReg(MI->getOpcode()) && checkSetRegHazards(MI) > 0) - return NoopHazard; + return HazardType; if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0) - return NoopHazard; + return HazardType; if (ST.hasReadM0MovRelInterpHazard() && (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) && checkReadM0Hazards(MI) > 0) - return NoopHazard; + return HazardType; if (ST.hasReadM0SendMsgHazard() && isSendMsgTraceDataOrGDS(TII, *MI) && checkReadM0Hazards(MI) > 0) - return NoopHazard; + return HazardType; if (SIInstrInfo::isMAI(*MI) && checkMAIHazards(MI) > 0) - return NoopHazard; + return HazardType; if ((SIInstrInfo::isVMEM(*MI) || SIInstrInfo::isFLAT(*MI) || SIInstrInfo::isDS(*MI)) && checkMAILdStHazards(MI) > 0) - return NoopHazard; + return HazardType; if (MI->isInlineAsm() && checkInlineAsmHazards(MI) > 0) - return NoopHazard; + return HazardType; return NoHazard; } @@ -312,15 +316,19 @@ void GCNHazardRecognizer::AdvanceCycle() { // When the scheduler detects a stall, it will call AdvanceCycle() without // emitting any instructions. - if (!CurrCycleInstr) + if (!CurrCycleInstr) { + EmittedInstrs.push_front(nullptr); return; + } // Do not track non-instructions which do not affect the wait states. // If included, these instructions can lead to buffer overflow such that // detectable hazards are missed. if (CurrCycleInstr->isImplicitDef() || CurrCycleInstr->isDebugInstr() || - CurrCycleInstr->isKill()) + CurrCycleInstr->isKill()) { + CurrCycleInstr = nullptr; return; + } if (CurrCycleInstr->isBundle()) { processBundle(); diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -602,35 +602,6 @@ } } - // Combine adjacent s_nops to use the immediate operand encoding how long - // to wait. - // - // s_nop N - // s_nop M - // => - // s_nop (N + M) - if (MI.getOpcode() == AMDGPU::S_NOP && - MI.getNumOperands() == 1 && // Don't merge with implicit operands - Next != MBB.end() && - (*Next).getOpcode() == AMDGPU::S_NOP && - (*Next).getNumOperands() == 1) { - - MachineInstr &NextMI = *Next; - // The instruction encodes the amount to wait with an offset of 1, - // i.e. 0 is wait 1 cycle. Convert both to cycles and then convert back - // after adding. - uint8_t Nop0 = MI.getOperand(0).getImm() + 1; - uint8_t Nop1 = NextMI.getOperand(0).getImm() + 1; - - // Make sure we don't overflow the bounds. - if (Nop0 + Nop1 <= 8) { - NextMI.getOperand(0).setImm(Nop0 + Nop1 - 1); - MI.eraseFromParent(); - } - - continue; - } - // FIXME: We also need to consider movs of constant operands since // immediate operands are not folded if they have more than one use, and // the operand folding pass is unaware if the immediate will be free since diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -8,7 +8,6 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx16 s[8:23], s[2:3], 0x0 ; GFX9-NEXT: s_lshl_b32 m0, s4, 1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_movrels_b64 s[0:1], s[8:9] ; GFX9-NEXT: s_movrels_b64 s[2:3], s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -887,8 +887,8 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 @@ -992,8 +992,8 @@ ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_mov_b64 s[6:7], s[10:11] -; GFX7-NEXT: s_nop 1 ; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: s_nop 0 ; GFX7-NEXT: v_div_fmas_f32 v0, v1, v2, v3 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:8 ; GFX7-NEXT: s_endpgm @@ -1026,8 +1026,8 @@ ; GFX8-NEXT: s_addc_u32 s1, s5, 0 ; GFX8-NEXT: s_and_b32 s2, 1, s2 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 -; GFX8-NEXT: s_nop 3 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -48,10 +48,10 @@ ; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s3 ; GFX8-NEXT: v_mov_b32_e32 v4, s2 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v4, v2 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_dpp v5, v3 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[4:5] ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -69,8 +69,8 @@ ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -98,8 +98,8 @@ ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -130,8 +130,8 @@ ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -162,8 +162,8 @@ ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -415,7 +415,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -467,7 +466,6 @@ ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -526,8 +524,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -578,8 +576,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -654,7 +652,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -706,7 +703,6 @@ ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -765,8 +761,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -817,8 +813,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -893,7 +889,6 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -945,7 +940,6 @@ ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1004,8 +998,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1056,8 +1050,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_add_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1134,8 +1128,8 @@ ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1167,8 +1161,8 @@ ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 2 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1201,8 +1195,8 @@ ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_nop 2 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1235,8 +1229,8 @@ ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v0, 5, s[2:3] ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_nop 2 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1619,7 +1613,6 @@ ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -1649,7 +1642,6 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -1682,7 +1674,6 @@ ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm @@ -1715,7 +1706,6 @@ ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s2, v0 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm @@ -1968,7 +1958,6 @@ ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -2020,7 +2009,6 @@ ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -2079,8 +2067,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2131,8 +2119,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_sub_nc_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2705,7 +2693,6 @@ ; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -2756,7 +2743,6 @@ ; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -2814,8 +2800,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2865,8 +2851,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_and_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2941,7 +2927,6 @@ ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -2993,7 +2978,6 @@ ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -3052,8 +3036,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3104,8 +3088,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_or_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3180,7 +3164,6 @@ ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -3232,7 +3215,6 @@ ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -3291,8 +3273,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3343,8 +3325,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_xor_b32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3418,7 +3400,6 @@ ; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -3469,7 +3450,6 @@ ; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -3527,8 +3507,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3578,8 +3558,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_max_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -3838,7 +3818,6 @@ ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -3889,7 +3868,6 @@ ; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -3947,8 +3925,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3998,8 +3976,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_min_i32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -4259,7 +4237,6 @@ ; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -4311,7 +4288,6 @@ ; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -4370,8 +4346,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -4422,8 +4398,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_max_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -4679,7 +4655,6 @@ ; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm @@ -4730,7 +4705,6 @@ ; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm @@ -4788,8 +4762,8 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 ; GFX1064-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1064-NEXT: s_nop 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) +; GFX1064-NEXT: s_nop 0 ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -4839,8 +4813,8 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 ; GFX1032-NEXT: v_min_u32_e32 v0, s3, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 -; GFX1032-NEXT: s_nop 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) +; GFX1032-NEXT: s_nop 0 ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -223,7 +223,6 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_not_b64 exec, exec ; GFX8-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_add_u32_dpp v2, vcc, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 @@ -280,7 +279,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_not_b64 exec, exec ; GFX9-NEXT: s_or_saveexec_b64 s[10:11], -1 -; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:1 row_mask:0xf bank_mask:0xf bound_ctrl:0 ; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: v_add_u32_dpp v2, v2, v2 row_shr:2 row_mask:0xf bank_mask:0xf bound_ctrl:0 diff --git a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll --- a/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/global-saddr-atomics.ll @@ -185,8 +185,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 3 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 @@ -199,9 +199,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 -; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -225,8 +225,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 3 ; GFX9-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 @@ -239,9 +239,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 -; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v0, v1, s[0:1] offset:42 glc ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: buffer_gl0_inv @@ -266,8 +266,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 3 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 @@ -280,9 +280,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 -; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -306,8 +306,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s0, v2 ; GFX9-NEXT: v_readfirstlane_b32 s1, v3 -; GFX9-NEXT: s_nop 4 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_nop 3 ; GFX9-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1 @@ -320,9 +320,9 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_readfirstlane_b32 s0, v2 ; GFX10-NEXT: v_readfirstlane_b32 s1, v3 -; GFX10-NEXT: s_nop 4 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-NEXT: s_nop 2 ; GFX10-NEXT: global_atomic_swap v0, v1, s[0:1] offset:42 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.update.dpp.ll @@ -30,9 +30,7 @@ ; GFX8-OPT: v_add_u32_e32 [[REG:v[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}} ; GFX8-NOOPT: v_add_u32_e64 [[REG:v[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{[0-9]+}} ; GFX8-NOOPT: v_mov_b32_e32 v{{[0-9]+}}, 0 -; GFX8-NOOPT: s_nop 1 -; GFX8-OPT: s_nop 0 -; GFX8-OPT-NEXT: s_nop 0 +; GFX8: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp {{v[0-9]+}}, [[REG]] quad_perm:[1,0,3,2] row_mask:0xf bank_mask:0xf @0 = internal unnamed_addr addrspace(3) global [448 x i32] undef, align 4 define weak_odr amdgpu_kernel void @dpp_test1(i32* %arg) local_unnamed_addr { diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -12,7 +12,6 @@ ; GCN-NEXT: global_load_dwordx4 v[4:7], v16, s[2:3] offset:16 ; GCN-NEXT: global_load_dwordx4 v[8:11], v16, s[2:3] offset:32 ; GCN-NEXT: global_load_dwordx4 v[12:15], v16, s[2:3] offset:48 -; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(3) ; GCN-NEXT: global_store_dwordx4 v16, v[0:3], s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(3) @@ -52,7 +51,6 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx2 s[18:19], s[0:1], 0x2c -; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dwordx4 s[0:3], s[16:17], 0x0 ; GCN-NEXT: s_load_dwordx4 s[4:7], s[16:17], 0x10 @@ -124,7 +122,6 @@ ; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56 ; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen offset:60 -; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(15) ; GCN-NEXT: buffer_store_dword v3, v1, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(15) @@ -191,7 +188,6 @@ ; GCN-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: global_load_dwordx2 v[8:9], v0, s[2:3] -; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: global_load_dwordx4 v[0:3], v[8:9], off ; GCN-NEXT: global_load_dwordx4 v[4:7], v[8:9], off offset:16 @@ -225,7 +221,6 @@ ; GCN-NEXT: global_load_short_d16_hi v5, v[0:1], off ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16_hi v2, v[0:1], off offset:64 -; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: global_store_dword v[3:4], v5, off ; GCN-NEXT: s_waitcnt vmcnt(1) @@ -254,7 +249,6 @@ ; GCN-NEXT: global_load_short_d16 v5, v[0:1], off ; GCN-NEXT: s_nop 0 ; GCN-NEXT: global_load_short_d16 v2, v[0:1], off offset:64 -; GCN-NEXT: s_nop 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: global_store_dword v[3:4], v5, off ; GCN-NEXT: s_waitcnt vmcnt(1) diff --git a/llvm/test/CodeGen/AMDGPU/nop-fold.mir b/llvm/test/CodeGen/AMDGPU/nop-fold.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/nop-fold.mir +++ /dev/null @@ -1,137 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -run-pass=si-shrink-instructions %s -o - | FileCheck %s - ---- - -name: merge_2_nop -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: merge_2_nop - ; CHECK: S_NOP 1 - S_NOP 0 - S_NOP 0 - -... - ---- - -name: merge_3_nop -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: merge_3_nop - ; CHECK: S_NOP 2 - S_NOP 0 - S_NOP 0 - S_NOP 0 - - -... - ---- - -name: merge_7_nop -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: merge_7_nop - ; CHECK: S_NOP 6 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - -... - ---- - -name: merge_8_nop -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: merge_8_nop - ; CHECK: S_NOP 7 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - -... ---- - -name: merge_9_nop -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: merge_9_nop - ; CHECK: S_NOP 7 - ; CHECK: S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - S_NOP 0 - -... - ---- - -name: no_merge_impdef0 -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: no_merge_impdef0 - ; CHECK: S_NOP 0, implicit-def $sgpr0 - ; CHECK: S_NOP 0 - S_NOP 0, implicit-def $sgpr0 - S_NOP 0 - -... - ---- - -name: no_merge_impdef1 -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: no_merge_impdef1 - ; CHECK: S_NOP 0 - ; CHECK: S_NOP 0, implicit-def $sgpr0 - S_NOP 0 - S_NOP 0, implicit-def $sgpr0 - -... - ---- - -name: no_merge_impdef_both -tracksRegLiveness: true -body: | - bb.0: - - ; CHECK-LABEL: name: no_merge_impdef_both - ; CHECK: S_NOP 0 - ; CHECK: S_NOP 0, implicit-def $sgpr0 - S_NOP 0 - S_NOP 0, implicit-def $sgpr0 - -... diff --git a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir --- a/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir +++ b/llvm/test/CodeGen/AMDGPU/power-sched-no-instr-sunit.mir @@ -3,7 +3,6 @@ # GCN-LABEL: name: test # GCN: V_MFMA_F32_32X32X1F32 # GCN: S_BARRIER -# GCN: S_NOP 0 # GCN: V_ACCVGPR_READ_B32 # GCN: BUFFER_STORE_DWORD_OFFEN ---