diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -930,10 +930,12 @@ return false; }; - auto IsExpiredFn = [] (MachineInstr *MI, int) { + auto IsExpiredFn = [](MachineInstr *MI, int) { return MI && (SIInstrInfo::isVALU(*MI) || (MI->getOpcode() == AMDGPU::S_WAITCNT && - !MI->getOperand(0).getImm())); + !MI->getOperand(0).getImm()) || + (MI->getOpcode() == AMDGPU::S_WAITCNT_DEPCTR && + MI->getOperand(0).getImm() == 0xffe3)); }; if (::getWaitStatesSince(IsHazardFn, MI, IsExpiredFn) == @@ -941,7 +943,9 @@ return false; const SIInstrInfo *TII = ST.getInstrInfo(); - BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), TII->get(AMDGPU::V_NOP_e32)); + BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), + TII->get(AMDGPU::S_WAITCNT_DEPCTR)) + .addImm(0xffe3); return true; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -1075,7 +1075,7 @@ ; GFX10_W32-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W32-NEXT: s_cselect_b32 s4, 1, 0 ; GFX10_W32-NEXT: BB13_2: ; %exit -; GFX10_W32-NEXT: v_nop +; GFX10_W32-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W32-NEXT: s_or_b32 exec_lo, exec_lo, s5 ; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 @@ -1113,7 +1113,7 @@ ; GFX10_W64-NEXT: s_cmp_lg_u32 s0, 0 ; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 ; GFX10_W64-NEXT: BB13_2: ; %exit -; GFX10_W64-NEXT: v_nop +; GFX10_W64-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10_W64-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.end.cf.i32.ll @@ -18,7 +18,7 @@ ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off ; GCN-NEXT: BB0_2: ; %bb -; GCN-NEXT: v_nop +; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_or_b32 exec_lo, exec_lo, s0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: global_store_dword v[0:1], v0, off diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -124,7 +124,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB0_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -156,7 +156,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB0_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -298,7 +298,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -334,7 +334,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -520,7 +520,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB2_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -572,7 +572,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB2_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -759,7 +759,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB3_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -811,7 +811,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB3_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -998,7 +998,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB4_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -1050,7 +1050,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB4_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -1194,7 +1194,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB5_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 @@ -1228,7 +1228,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB5_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v2 @@ -1406,7 +1406,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB6_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -1449,7 +1449,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB6_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -1675,7 +1675,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB8_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1708,7 +1708,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB8_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1851,7 +1851,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -1887,7 +1887,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v0, s0, v0 @@ -2073,7 +2073,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB10_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -2125,7 +2125,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB10_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -2271,7 +2271,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB11_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 @@ -2307,7 +2307,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB11_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v1, 5, v0 @@ -2487,7 +2487,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB12_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -2530,7 +2530,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB12_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mul_lo_u32 v3, s3, v0 @@ -2808,7 +2808,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB14_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -2859,7 +2859,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB14_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -3046,7 +3046,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB15_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -3098,7 +3098,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB15_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -3285,7 +3285,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB16_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -3337,7 +3337,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB16_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -3521,7 +3521,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB17_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -3572,7 +3572,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB17_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -3719,7 +3719,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB18_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -3754,7 +3754,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB18_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -3941,7 +3941,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB19_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -3992,7 +3992,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB19_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -4139,7 +4139,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB20_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -4174,7 +4174,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB20_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -4364,7 +4364,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB21_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -4416,7 +4416,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB21_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -4560,7 +4560,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB22_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -4595,7 +4595,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB22_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -4782,7 +4782,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB23_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1064-NEXT: v_mov_b32_e32 v0, v1 @@ -4833,7 +4833,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB23_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v0 ; GFX1032-NEXT: v_mov_b32_e32 v0, v1 @@ -4977,7 +4977,7 @@ ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB24_2: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 @@ -5012,7 +5012,7 @@ ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB24_2: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -130,7 +130,7 @@ ; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s12, 5 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1064-NEXT: BB0_3: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 @@ -164,7 +164,7 @@ ; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s10, 5 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1032-NEXT: BB0_3: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 @@ -364,7 +364,7 @@ ; GFX1064-NEXT: v_mov_b32_e32 v0, s12 ; GFX1064-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1064-NEXT: BB1_3: -; GFX1064-NEXT: v_nop +; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[30:31] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v0 @@ -418,7 +418,7 @@ ; GFX1032-NEXT: v_mov_b32_e32 v0, s10 ; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc ; GFX1032-NEXT: BB1_3: -; GFX1032-NEXT: v_nop +; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s9 ; GFX1032-NEXT: s_waitcnt vmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s4, v0 diff --git a/llvm/test/CodeGen/AMDGPU/cc-update.ll b/llvm/test/CodeGen/AMDGPU/cc-update.ll --- a/llvm/test/CodeGen/AMDGPU/cc-update.ll +++ b/llvm/test/CodeGen/AMDGPU/cc-update.ll @@ -386,7 +386,7 @@ ; GFX1010-NEXT: ; implicit-def: $vcc_hi ; GFX1010-NEXT: s_waitcnt vmcnt(0) ; GFX1010-NEXT: buffer_store_dword v0, off, s[0:3], s6 ; 4-byte Folded Spill -; GFX1010-NEXT: v_nop +; GFX1010-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1010-NEXT: s_mov_b32 s6, 0x20000 ; GFX1010-NEXT: ;;#ASMSTART ; GFX1010-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -29,7 +29,7 @@ ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0 ; GCN-NEXT: buffer_store_dword v0, v0, s[8:11], 0 offen -; GCN-NEXT: v_nop +; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GCN-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; GCN-NEXT: s_cbranch_execnz BB0_2 ; GCN-NEXT: ; %bb.3: ; in Loop: Header=BB0_1 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -59,7 +59,7 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10: image_gather4_c_b_cl v[40:43], v[32:39], s[4:11], s[12:15] dmask:0x1 -; GFX10-NEXT: v_nop +; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+4 diff --git a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir --- a/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vmem-to-salu-hazard.mir @@ -2,7 +2,7 @@ # GCN-LABEL: name: vmem_write_sgpr # GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr @@ -16,7 +16,7 @@ ... # GCN-LABEL: name: vmem_write_exec # GCN: BUFFER_STORE_DWORD_OFFEN_exact -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_exec @@ -35,7 +35,7 @@ # GCN-NEXT: S_MOV_B32 # GCN-NEXT: S_MOV_B32 # GCN-NEXT: S_MOV_B32 -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_chain @@ -54,7 +54,7 @@ ... # GCN-LABEL: name: vmem_smem_write_sgpr # GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_LOAD_DWORD_IMM --- name: vmem_smem_write_sgpr @@ -69,7 +69,7 @@ # GCN-LABEL: name: vmem_snop_write_sgpr # GCN: BUFFER_LOAD_DWORD_OFFEN # GCN-NEXT: S_NOP -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_snop_write_sgpr @@ -115,7 +115,7 @@ # GCN-LABEL: name: vmem_swait_any_write_sgpr # GCN: BUFFER_LOAD_DWORD_OFFEN # GCN-NEXT: S_WAITCNT -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_swait_any_write_sgpr @@ -130,7 +130,7 @@ ... # GCN-LABEL: name: vmem_write_exec_impread # GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN: V_NOP +# GCN: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B64 --- name: vmem_write_exec_impread @@ -144,7 +144,7 @@ ... # GCN-LABEL: name: vmem_write_exec_expread # GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B64 --- name: vmem_write_exec_expread @@ -157,7 +157,7 @@ ... # GCN-LABEL: name: ds_write_m0 # GCN: DS_READ_B32 -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: ds_write_m0 @@ -171,7 +171,7 @@ ... # GCN-LABEL: name: vmem_write_sgpr_fall_through # GCN: BUFFER_LOAD_DWORD_OFFEN -# GCN: V_NOP +# GCN: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_fall_through @@ -189,7 +189,7 @@ # GCN-LABEL: name: vmem_write_sgpr_branch # GCN: BUFFER_LOAD_DWORD_OFFEN # GCN-NEXT: S_BRANCH -# GCN: V_NOP +# GCN: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_branch @@ -209,7 +209,7 @@ # GCN: BUFFER_LOAD_DWORD_OFFEN # GCN-NEXT: S_BRANCH # GCN: bb.2: -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_branch_around @@ -237,7 +237,7 @@ # GCN: S_WAITCNT # GCN: V_ADD_I32 # GCN: bb.2: -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_cbranch_around @@ -262,7 +262,7 @@ ... # GCN-LABEL: name: vmem_write_sgpr_branch_backedge # GCN: $vgpr0 = IMPLICIT_DEF -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_write_sgpr_branch_backedge @@ -280,7 +280,7 @@ ... # GCN-LABEL: name: ds_write_exec # GCN: DS_WRITE_B32_gfx9 -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: ds_write_exec @@ -293,7 +293,7 @@ ... # GCN-LABEL: name: vmem_scratch_exec # GCN: SCRATCH_LOAD_DWORD -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_scratch_exec @@ -305,7 +305,7 @@ ... # GCN-LABEL: name: vmem_flat_exec # GCN: FLAT_LOAD_DWORD -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_flat_exec @@ -318,7 +318,7 @@ ... # GCN-LABEL: name: vmem_global_exec # GCN: GLOBAL_LOAD_DWORD -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_global_exec @@ -331,7 +331,7 @@ ... # GCN-LABEL: name: vmem_global_atomic_exec # GCN: GLOBAL_ATOMIC_ADD_RTN -# GCN-NEXT: V_NOP +# GCN-NEXT: S_WAITCNT_DEPCTR 65507 # GCN-NEXT: S_MOV_B32 --- name: vmem_global_atomic_exec diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -1059,7 +1059,7 @@ ; GFX1064-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032-NEXT: s_or_saveexec_b32 [[COPY_EXEC0:s[0-9]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_nop +; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC0]] @@ -1082,7 +1082,7 @@ ; GFX1064: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GFX1032: s_or_saveexec_b32 [[COPY_EXEC1:s[0-9]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: v_nop +; GCN-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GFX1032-NEXT: s_mov_b32 exec_lo, [[COPY_EXEC1]] ; GCN-NEXT: s_waitcnt vmcnt(0)