diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp --- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp +++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp @@ -135,6 +135,21 @@ if ((OrderingAddrSpace == InstrAddrSpace) && isPowerOf2_32(uint32_t(InstrAddrSpace))) this->IsCrossAddressSpaceOrdering = false; + + // Limit the scope to the maximum supported by the instruction's address + // space. + switch (InstrAddrSpace) { + case SIAtomicAddrSpace::SCRATCH: + if (Scope > SIAtomicScope::SINGLETHREAD) + this->Scope = SIAtomicScope::SINGLETHREAD; + break; + case SIAtomicAddrSpace::LDS: + if (Scope > SIAtomicScope::WORKGROUP) + this->Scope = SIAtomicScope::WORKGROUP; + break; + default: + break; + } } public: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -30,17 +30,16 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -59,18 +58,17 @@ ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v2, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB0_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -88,18 +86,17 @@ ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB0_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_nop 0 +; GFX9-NEXT: s_nop 1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -122,7 +119,6 @@ ; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB0_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -153,7 +149,6 @@ ; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB0_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -192,14 +187,13 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB1_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s1, v0 @@ -225,10 +219,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB1_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -257,10 +250,9 @@ ; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB1_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -295,7 +287,6 @@ ; GFX1064-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] @@ -330,7 +321,6 @@ ; GFX1032-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB1_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -356,10 +346,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -401,18 +390,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -451,18 +440,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -516,7 +505,6 @@ ; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB2_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -568,7 +556,6 @@ ; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB2_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -595,10 +582,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -640,18 +626,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB3_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -690,18 +676,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB3_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -755,7 +741,6 @@ ; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB3_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -807,7 +792,6 @@ ; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -834,10 +818,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -879,18 +862,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB4_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -929,18 +912,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB4_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -994,7 +977,6 @@ ; GFX1064-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB4_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1046,7 +1028,6 @@ ; GFX1032-NEXT: ds_add_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -1084,12 +1065,12 @@ ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -1099,7 +1080,6 @@ ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1119,12 +1099,12 @@ ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -1132,8 +1112,7 @@ ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: s_nop 2 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1152,12 +1131,12 @@ ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB5_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1165,8 +1144,7 @@ ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, v[1:2] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_nop 1 +; GFX9-NEXT: s_nop 2 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1190,7 +1168,6 @@ ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB5_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] @@ -1223,7 +1200,6 @@ ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB5_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 @@ -1266,10 +1242,9 @@ ; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB6_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 @@ -1310,10 +1285,9 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1353,10 +1327,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1401,7 +1374,6 @@ ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB6_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1443,7 +1415,6 @@ ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB6_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -1478,10 +1449,9 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1493,10 +1463,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1507,10 +1476,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -1528,7 +1496,6 @@ ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -1544,7 +1511,6 @@ ; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -1573,18 +1539,17 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB8_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1603,18 +1568,18 @@ ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v2, v1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1632,18 +1597,18 @@ ; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1666,7 +1631,6 @@ ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB8_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -1697,7 +1661,6 @@ ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB8_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -1736,14 +1699,13 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s1 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[6:7] -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s1, v0 @@ -1769,10 +1731,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -1801,10 +1762,9 @@ ; GFX9-NEXT: s_mul_i32 s3, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v2, s3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v1, v2 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1839,7 +1799,6 @@ ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[0:1] @@ -1874,7 +1833,6 @@ ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s0 @@ -1900,10 +1858,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -1945,18 +1902,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB10_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1995,18 +1952,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB10_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2060,7 +2017,6 @@ ; GFX1064-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB10_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2112,7 +2068,6 @@ ; GFX1032-NEXT: ds_sub_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB10_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -2150,12 +2105,12 @@ ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB11_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -2165,7 +2120,6 @@ ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2185,12 +2139,12 @@ ; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB11_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -2200,7 +2154,6 @@ ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2219,12 +2172,12 @@ ; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 ; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB11_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -2234,7 +2187,6 @@ ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2258,7 +2210,6 @@ ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB11_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] @@ -2293,7 +2244,6 @@ ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 @@ -2338,10 +2288,9 @@ ; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 @@ -2382,10 +2331,9 @@ ; GFX8-NEXT: v_add_u32_e32 v2, vcc, s7, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -2425,10 +2373,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB12_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -2473,7 +2420,6 @@ ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB12_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2515,7 +2461,6 @@ ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB12_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 @@ -2550,10 +2495,9 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2565,10 +2509,9 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2579,10 +2522,9 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -2600,7 +2542,6 @@ ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -2616,7 +2557,6 @@ ; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm entry: @@ -2635,10 +2575,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_and_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2680,18 +2619,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_and_rtn_b32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB14_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2730,18 +2669,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_and_rtn_b32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB14_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_and_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2795,7 +2734,6 @@ ; GFX1064-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB14_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -2847,7 +2785,6 @@ ; GFX1032-NEXT: ds_and_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB14_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -2874,10 +2811,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_or_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -2919,18 +2855,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_or_rtn_b32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB15_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2969,18 +2905,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_or_rtn_b32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB15_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_or_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3034,7 +2970,6 @@ ; GFX1064-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB15_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3086,7 +3021,6 @@ ; GFX1032-NEXT: ds_or_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB15_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -3113,10 +3047,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_xor_rtn_b32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3158,18 +3091,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_xor_rtn_b32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB16_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3208,18 +3141,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_xor_rtn_b32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB16_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3273,7 +3206,6 @@ ; GFX1064-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB16_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3325,7 +3257,6 @@ ; GFX1032-NEXT: ds_xor_rtn_b32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB16_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -3352,10 +3283,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3397,18 +3327,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB17_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3447,18 +3377,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB17_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_max_i32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3514,7 +3444,6 @@ ; GFX1064-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB17_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3568,7 +3497,6 @@ ; GFX1032-NEXT: ds_max_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB17_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -3604,12 +3532,12 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB18_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, 1 @@ -3622,7 +3550,6 @@ ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -3640,12 +3567,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB18_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 @@ -3658,7 +3585,6 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3675,12 +3601,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB18_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, 1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 @@ -3693,7 +3619,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3715,7 +3640,6 @@ ; GFX1064-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB18_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] @@ -3749,7 +3673,6 @@ ; GFX1032-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB18_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 @@ -3779,10 +3702,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -3824,18 +3746,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB19_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -3874,18 +3796,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB19_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_min_i32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -3941,7 +3863,6 @@ ; GFX1064-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB19_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -3995,7 +3916,6 @@ ; GFX1032-NEXT: ds_min_rtn_i32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB19_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -4031,12 +3951,12 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB20_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_bfrev_b32_e32 v1, -2 @@ -4049,7 +3969,6 @@ ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4067,12 +3986,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB20_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 @@ -4085,7 +4004,6 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4102,12 +4020,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB20_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_bfrev_b32_e32 v0, -2 ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 @@ -4120,7 +4038,6 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4142,7 +4059,6 @@ ; GFX1064-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB20_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4176,7 +4092,6 @@ ; GFX1032-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB20_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 @@ -4206,10 +4121,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4251,18 +4165,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB21_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4301,18 +4215,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB21_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_max_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4366,7 +4280,6 @@ ; GFX1064-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB21_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4418,7 +4331,6 @@ ; GFX1032-NEXT: ds_max_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB21_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -4454,12 +4366,12 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB22_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 @@ -4471,7 +4383,6 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s5 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4489,12 +4400,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB22_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 @@ -4506,7 +4417,6 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4523,12 +4433,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB22_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -4540,7 +4450,6 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4562,7 +4471,6 @@ ; GFX1064-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB22_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] @@ -4596,7 +4504,6 @@ ; GFX1032-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB22_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 @@ -4626,10 +4533,9 @@ ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_u32 v0, v1, v0 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -4671,18 +4577,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v3, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u32 v0, v0, v3 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB23_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4721,18 +4627,18 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mov_b32_e32 v0, local_var32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v3, s4 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u32 v0, v0, v3 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB23_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: v_min_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4786,7 +4692,6 @@ ; GFX1064-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB23_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] @@ -4838,7 +4743,6 @@ ; GFX1032-NEXT: ds_min_rtn_u32 v0, v7, v4 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB23_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 @@ -4874,12 +4778,12 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v0, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] -; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_wbinvl1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: BB24_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 @@ -4891,7 +4795,6 @@ ; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -4909,12 +4812,12 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] -; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: BB24_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s5, v1 ; GFX8-NEXT: v_readfirstlane_b32 s4, v0 ; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -4926,7 +4829,6 @@ ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -4943,12 +4845,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var64@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] -; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: BB24_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s5, v1 ; GFX9-NEXT: v_readfirstlane_b32 s4, v0 ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc @@ -4960,7 +4862,6 @@ ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -4982,7 +4883,6 @@ ; GFX1064-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv -; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB24_2: ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] @@ -5016,7 +4916,6 @@ ; GFX1032-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv -; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB24_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 diff --git a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll --- a/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll +++ b/llvm/test/CodeGen/AMDGPU/atomicrmw-nand.ll @@ -13,10 +13,9 @@ ; GCN-NEXT: v_mov_b32_e32 v2, v1 ; GCN-NEXT: v_not_b32_e32 v1, v2 ; GCN-NEXT: v_or_b32_e32 v1, -5, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: ds_cmpst_rtn_b32 v1, v0, v2, v1 -; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: buffer_wbinvl1_vol +; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v2 ; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] ; GCN-NEXT: s_andn2_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll --- a/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll +++ b/llvm/test/CodeGen/AMDGPU/local-atomics-fp.ll @@ -34,7 +34,7 @@ ; HAS-ATOMICS-DAG: v_mov_b32_e32 [[V0:v[0-9]+]], 0x42280000 ; HAS-ATOMICS: ds_add_rtn_f32 [[V2:v[0-9]+]], [[V1:v[0-9]+]], [[V0]] offset:32 ; HAS-ATOMICS: ds_add_f32 [[V3:v[0-9]+]], [[V0]] offset:64 -; HAS-ATOMICS: s_waitcnt vmcnt(0) lgkmcnt(0) +; HAS-ATOMICS: s_waitcnt lgkmcnt(0) ; HAS-ATOMICS: ds_add_rtn_f32 {{v[0-9]+}}, {{v[0-9]+}}, [[V2]] define amdgpu_kernel void @lds_ds_fadd(float addrspace(1)* %out, float addrspace(3)* %ptrf, i32 %idx) { %idx.add = add nuw i32 %idx, 4 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -147,7 +147,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v1, v0 ; GFX6-NEXT: s_endpgm @@ -160,7 +159,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_read_b32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm @@ -174,7 +172,6 @@ ; GFX10-WGP-NEXT: ds_read_b32 v0, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v1, v0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -186,8 +183,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: ds_read_b32 v0, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v1, v0 ; GFX10-CU-NEXT: s_endpgm ; @@ -217,10 +212,9 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v1, v0 ; GFX6-NEXT: s_endpgm @@ -231,10 +225,9 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm @@ -250,7 +243,6 @@ ; GFX10-WGP-NEXT: ds_read_b32 v0, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v1, v0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -260,12 +252,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_read_b32 v0, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v1, v0 ; GFX10-CU-NEXT: s_endpgm ; @@ -275,7 +264,7 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -407,7 +396,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -418,7 +407,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -439,8 +428,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -451,7 +439,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -469,7 +457,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -480,7 +468,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -501,8 +489,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -513,7 +500,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -587,8 +574,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_atomicrmw: @@ -599,8 +585,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_atomicrmw: @@ -613,7 +598,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_acquire_atomicrmw: @@ -624,9 +608,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_atomicrmw: @@ -637,7 +618,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: @@ -654,7 +635,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -665,7 +646,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -686,8 +667,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -698,7 +678,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -716,10 +696,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_atomicrmw: @@ -729,10 +708,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_atomicrmw: @@ -747,7 +725,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_acq_rel_atomicrmw: @@ -756,13 +733,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_atomicrmw: @@ -772,9 +745,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: @@ -791,10 +764,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_atomicrmw: @@ -804,10 +776,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_atomicrmw: @@ -822,7 +793,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_seq_cst_atomicrmw: @@ -831,13 +801,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_atomicrmw: @@ -847,9 +813,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: @@ -867,8 +833,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -880,8 +845,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -895,7 +859,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -907,9 +870,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -921,7 +881,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -940,10 +900,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -954,10 +913,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -973,7 +931,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -983,13 +940,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1000,9 +953,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1021,10 +974,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1035,10 +987,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1054,7 +1005,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1064,13 +1014,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1081,9 +1027,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1166,8 +1112,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -1179,8 +1124,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -1194,7 +1138,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -1206,9 +1149,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_monotonic_cmpxchg: @@ -1221,7 +1161,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1240,7 +1180,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1252,7 +1192,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -1275,8 +1215,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -1289,7 +1228,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -1309,10 +1248,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -1323,10 +1261,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -1342,7 +1279,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -1352,13 +1288,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_monotonic_cmpxchg: @@ -1370,9 +1302,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1391,10 +1323,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -1405,10 +1336,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -1424,7 +1354,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -1434,13 +1363,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_monotonic_cmpxchg: @@ -1452,9 +1377,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1474,8 +1399,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -1487,8 +1411,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -1502,7 +1425,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -1514,9 +1436,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_acquire_acquire_cmpxchg: @@ -1529,7 +1448,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1548,10 +1467,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_release_acquire_cmpxchg: @@ -1562,10 +1480,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_release_acquire_cmpxchg: @@ -1581,7 +1498,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_release_acquire_cmpxchg: @@ -1591,13 +1507,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_release_acquire_cmpxchg: @@ -1609,9 +1521,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1630,10 +1542,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -1644,10 +1555,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -1663,7 +1573,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -1673,13 +1582,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_acq_rel_acquire_cmpxchg: @@ -1691,9 +1596,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1712,10 +1617,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -1726,10 +1630,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -1745,7 +1648,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -1755,13 +1657,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_acquire_cmpxchg: @@ -1773,9 +1671,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1794,10 +1692,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -1808,10 +1705,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -1827,7 +1723,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -1837,13 +1732,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_agent_seq_cst_seq_cst_cmpxchg: @@ -1855,9 +1746,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1877,8 +1768,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1891,8 +1781,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1906,7 +1795,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1918,9 +1806,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1934,7 +1820,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -1956,10 +1842,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1971,10 +1856,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1990,7 +1874,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2001,12 +1884,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2019,9 +1899,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2043,10 +1923,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2058,10 +1937,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2077,7 +1955,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2088,12 +1965,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2106,9 +1980,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2131,8 +2005,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2145,8 +2018,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2160,7 +2032,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2172,9 +2043,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2188,7 +2057,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2210,10 +2079,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2225,10 +2093,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2244,7 +2111,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2255,12 +2121,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2273,9 +2136,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2297,10 +2160,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2312,10 +2174,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2331,7 +2192,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2342,12 +2202,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2360,9 +2217,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2384,10 +2241,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2399,10 +2255,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2418,7 +2273,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2429,12 +2283,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2447,9 +2298,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2471,10 +2322,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2486,10 +2336,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2505,7 +2354,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2516,12 +2364,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2534,9 +2379,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -147,7 +147,6 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v1, v0 ; GFX6-NEXT: s_endpgm @@ -160,7 +159,6 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: ds_read_b32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm @@ -174,7 +172,6 @@ ; GFX10-WGP-NEXT: ds_read_b32 v0, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v1, v0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -186,8 +183,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: ds_read_b32 v0, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v1, v0 ; GFX10-CU-NEXT: s_endpgm ; @@ -217,10 +212,9 @@ ; GFX6-NEXT: s_mov_b32 m0, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_read_b32 v0, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_write_b32 v1, v0 ; GFX6-NEXT: s_endpgm @@ -231,10 +225,9 @@ ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_read_b32 v0, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_write_b32 v1, v0 ; GFX7-NEXT: s_endpgm @@ -250,7 +243,6 @@ ; GFX10-WGP-NEXT: ds_read_b32 v0, v0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v1, v0 ; GFX10-WGP-NEXT: s_endpgm ; @@ -260,12 +252,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_read_b32 v0, v0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v1, v0 ; GFX10-CU-NEXT: s_endpgm ; @@ -275,7 +264,7 @@ ; SKIP-CACHE-INV-NEXT: s_mov_b32 m0, -1 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_read_b32 v0, v0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 @@ -407,7 +396,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -418,7 +407,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -439,8 +428,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -451,7 +439,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -469,7 +457,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v1, s0 ; GFX6-NEXT: v_mov_b32_e32 v0, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -480,7 +468,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s1 ; GFX7-NEXT: v_mov_b32_e32 v1, s0 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -501,8 +489,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s0 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -513,7 +500,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -587,8 +574,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_atomicrmw: @@ -599,8 +585,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_atomicrmw: @@ -613,7 +598,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_acquire_atomicrmw: @@ -624,9 +608,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_atomicrmw: @@ -637,7 +618,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: @@ -654,7 +635,7 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -665,7 +646,7 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -686,8 +667,7 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -698,7 +678,7 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -716,10 +696,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_atomicrmw: @@ -729,10 +708,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_atomicrmw: @@ -747,7 +725,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_acq_rel_atomicrmw: @@ -756,13 +733,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_atomicrmw: @@ -772,9 +745,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: @@ -791,10 +764,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_atomicrmw: @@ -804,10 +776,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_atomicrmw: @@ -822,7 +793,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_seq_cst_atomicrmw: @@ -831,13 +801,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_atomicrmw: @@ -847,9 +813,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v0, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { entry: @@ -867,8 +833,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -880,8 +845,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -895,7 +859,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -907,9 +870,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -921,7 +881,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -940,10 +900,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -954,10 +913,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -973,7 +931,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -983,13 +940,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1000,9 +953,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1021,10 +974,9 @@ ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1035,10 +987,9 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1054,7 +1005,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1064,13 +1014,9 @@ ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1081,9 +1027,9 @@ ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_wrxchg_rtn_b32 v1, v0, v1 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1166,8 +1112,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -1179,8 +1124,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -1194,7 +1138,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -1206,9 +1149,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_monotonic_cmpxchg: @@ -1221,7 +1161,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1240,7 +1180,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX6-NEXT: s_endpgm ; @@ -1252,7 +1192,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX7-NEXT: s_endpgm ; @@ -1275,8 +1215,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_endpgm ; @@ -1289,7 +1228,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -1309,10 +1248,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -1323,10 +1261,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -1342,7 +1279,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -1352,13 +1288,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_monotonic_cmpxchg: @@ -1370,9 +1302,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1391,10 +1323,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -1405,10 +1336,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -1424,7 +1354,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -1434,13 +1363,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_monotonic_cmpxchg: @@ -1452,9 +1377,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1474,8 +1399,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acquire_acquire_cmpxchg: @@ -1487,8 +1411,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acquire_acquire_cmpxchg: @@ -1502,7 +1425,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_acquire_acquire_cmpxchg: @@ -1514,9 +1436,6 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_acquire_acquire_cmpxchg: @@ -1529,7 +1448,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1548,10 +1467,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_release_acquire_cmpxchg: @@ -1562,10 +1480,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_release_acquire_cmpxchg: @@ -1581,7 +1498,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_release_acquire_cmpxchg: @@ -1591,13 +1507,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_release_acquire_cmpxchg: @@ -1609,9 +1521,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1630,10 +1542,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -1644,10 +1555,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -1663,7 +1573,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -1673,13 +1582,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_acq_rel_acquire_cmpxchg: @@ -1691,9 +1596,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1712,10 +1617,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -1726,10 +1630,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -1745,7 +1648,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -1755,13 +1657,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_acquire_cmpxchg: @@ -1773,9 +1671,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1794,10 +1692,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_endpgm ; ; GFX7-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -1808,10 +1705,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_endpgm ; ; GFX10-WGP-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -1827,7 +1723,6 @@ ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -1837,13 +1732,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: local_system_seq_cst_seq_cst_cmpxchg: @@ -1855,9 +1746,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_b32 v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { entry: @@ -1877,8 +1768,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1891,8 +1781,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1906,7 +1795,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -1918,9 +1806,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -1934,7 +1820,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -1956,10 +1842,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -1971,10 +1856,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -1990,7 +1874,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2001,12 +1884,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2019,9 +1899,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2043,10 +1923,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2058,10 +1937,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2077,7 +1955,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2088,12 +1965,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2106,9 +1980,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2131,8 +2005,7 @@ ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2145,8 +2018,7 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2160,7 +2032,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2172,9 +2043,7 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2188,7 +2057,7 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2210,10 +2079,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2225,10 +2093,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2244,7 +2111,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2255,12 +2121,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2273,9 +2136,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2297,10 +2160,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2312,10 +2174,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2331,7 +2192,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2342,12 +2202,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2360,9 +2217,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2384,10 +2241,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2399,10 +2255,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2418,7 +2273,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2429,12 +2283,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2447,9 +2298,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) { @@ -2471,10 +2322,9 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX6-NEXT: buffer_wbinvl1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: ds_write_b32 v0, v1 ; GFX6-NEXT: s_endpgm ; @@ -2486,10 +2336,9 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s1 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: ds_write_b32 v0, v1 ; GFX7-NEXT: s_endpgm ; @@ -2505,7 +2354,6 @@ ; GFX10-WGP-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: buffer_gl1_inv ; GFX10-WGP-NEXT: ds_write_b32 v0, v1 ; GFX10-WGP-NEXT: s_endpgm ; @@ -2516,12 +2364,9 @@ ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: buffer_gl0_inv -; GFX10-CU-NEXT: buffer_gl1_inv +; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: ds_write_b32 v0, v1 ; GFX10-CU-NEXT: s_endpgm ; @@ -2534,9 +2379,9 @@ ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s3 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_cmpst_rtn_b32 v1, v0, v1, v2 offset:16 -; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: ds_write_b32 v0, v1 ; SKIP-CACHE-INV-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in, i32 %old) {