diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp @@ -521,8 +521,15 @@ case AtomicRMWInst::Sub: { // The new value we will be contributing to the atomic operation is the // old value times the number of active lanes. - Value *const Ctpop = B.CreateIntCast( - B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot), Ty, false); + + // Mark the ctpop as convergent so it does not get moved into the + // single-lane basic block. This currently saves us one instruction. + // Another way to save this instruction is reusing the saved exec register + // from the inserted control flow (output of s_saveexec). This is + // currently hard to do, though it might work once GlobalISel is used. + CallInst *const CtpopCall = B.CreateUnaryIntrinsic(Intrinsic::ctpop, Ballot); + CtpopCall->addAttribute(AttributeList::FunctionIndex, Attribute::Convergent); + Value *const Ctpop = B.CreateIntCast(CtpopCall, Ty, false); NewV = B.CreateMul(V, Ctpop); break; } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -13,11 +13,11 @@ ; GCN-LABEL: add_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: 
v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -29,11 +29,11 @@ ; GCN-LABEL: add_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -107,11 +107,11 @@ ; GCN-LABEL: sub_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -123,11 +123,11 @@ ; GCN-LABEL: sub_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; 
GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -10,11 +10,11 @@ ; GCN-LABEL: add_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { @@ -26,11 +26,11 @@ ; GCN-LABEL: add_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 
s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: {{flat|buffer|global}}_atomic_add v[[value]] @@ -62,11 +62,11 @@ ; GCN-LABEL: add_i64_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_add_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -79,11 +79,11 @@ ; GCN-LABEL: add_i64_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: 
{{flat|buffer|global}}_atomic_add_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { entry: @@ -108,11 +108,11 @@ ; GCN-LABEL: sub_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, i32 addrspace(1)* %inout) { @@ -124,11 +124,11 @@ ; GCN-LABEL: sub_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: {{flat|buffer|global}}_atomic_sub v[[value]] @@ -160,11 +160,11 @@ ; GCN-LABEL: sub_i64_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: 
v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_hi_u32_u24{{(_e[0-9]+)?}} v[[value_hi:[0-9]+]], s[[popcount]], 5 ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value_lo:[0-9]+]], s[[popcount]], 5 ; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}[[value_lo]]:[[value_hi]]{{\]}} @@ -177,11 +177,11 @@ ; GCN-LABEL: sub_i64_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: {{flat|buffer|global}}_atomic_sub_x2 v{{\[}}{{[0-9]+}}:{{[0-9]+}}{{\]}} define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -17,25 +17,24 @@ ; ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; 
GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB0_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 +; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 +; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -49,14 +48,13 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -64,7 +62,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB0_2: -; GFX8-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 @@ -79,21 +77,20 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB0_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u32 v1, v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB0_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 @@ -107,16 +104,15 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB0_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 
; GFX1064-NEXT: ds_add_rtn_u32 v1, v2, v1 @@ -125,7 +121,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB0_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -139,16 +135,15 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB0_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u32 v1, v2, v1 @@ -157,7 +152,7 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB0_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -177,17 +172,16 @@ ; ; GFX7LESS-LABEL: add_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: 
s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -214,13 +208,12 @@ ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s1, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB1_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s1, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -231,7 +224,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB1_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -247,13 +240,12 @@ ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s1, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB1_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s1, s0, s1 ; GFX9-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -263,7 +255,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB1_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: 
s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -278,17 +270,16 @@ ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s1, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB1_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s1, s0, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -298,7 +289,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB1_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 @@ -313,17 +304,16 @@ ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB1_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: 
s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1076,26 +1066,25 @@ ; ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB5_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB5_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -1114,15 +1103,14 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz 
BB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1130,7 +1118,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB5_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: v_mov_b32_e32 v1, s2 @@ -1148,22 +1136,21 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB5_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB5_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 @@ -1180,17 +1167,16 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 
s[2:3], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB5_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] @@ -1199,7 +1185,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB5_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 ; GFX1064-NEXT: v_mad_u64_u32 v[0:1], s[2:3], v0, 5, s[2:3] @@ -1214,17 +1200,16 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB5_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 ; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_add_rtn_u64 v[1:2], v3, v[1:2] @@ -1254,22 
+1239,21 @@ ; ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB6_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 ; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 @@ -1301,13 +1285,12 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz BB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 @@ -1345,13 +1328,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz BB6_2 ; GFX9-NEXT: ; 
%bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s3, s6 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -1387,19 +1369,18 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB6_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1064-NEXT: s_mul_i32 s6, s2, s6 +; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: s_add_i32 s8, s8, s7 ; GFX1064-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064-NEXT: v_mov_b32_e32 v2, s8 @@ -1430,19 +1411,18 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB6_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1032-NEXT: s_mul_i32 s5, s2, s5 +; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_add_i32 s7, s7, s6 ; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: v_mov_b32_e32 v2, s7 @@ -1569,25 +1549,24 @@ ; ; 
GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB8_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo -; GFX7LESS-NEXT: v_mul_u32_u24_e64 v2, s2, 5 +; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 +; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB8_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -1602,14 +1581,13 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB8_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) @@ -1617,7 +1595,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB8_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 @@ -1633,21 +1611,20 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB8_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u32 v1, v2, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB8_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 @@ -1662,16 +1639,15 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB8_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; 
GFX1064-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u32 v1, v2, v1 @@ -1680,7 +1656,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB8_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 @@ -1695,16 +1671,15 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB8_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 ; GFX1032-NEXT: v_mov_b32_e32 v2, local_var32@abs32@lo -; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u32 v1, v2, v1 @@ -1713,7 +1688,7 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: BB8_2: ; GFX1032-NEXT: v_nop -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s3 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 @@ -1734,17 +1709,16 @@ ; ; GFX7LESS-LABEL: sub_i32_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dword s2, s[0:1], 0xb ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; 
GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB9_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s3, s[6:7] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s3, s2, s3 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1771,13 +1745,12 @@ ; GFX8-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s1, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_mul_i32 s1, s0, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1788,7 +1761,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB9_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v1 @@ -1804,13 +1777,12 @@ ; GFX9-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s1, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s1, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s1, s0, s1 ; GFX9-NEXT: 
v_mov_b32_e32 v1, local_var32@abs32@lo @@ -1820,7 +1792,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB9_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s0, v1 @@ -1835,17 +1807,16 @@ ; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1064-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s1, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s1, s[2:3] -; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s1, s0, s1 +; GFX1064-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1064-NEXT: v_mov_b32_e32 v2, s1 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1855,7 +1826,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB9_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mul_lo_u32 v0, s0, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s0, v1 @@ -1870,17 +1841,16 @@ ; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX1032-NEXT: s_load_dword s0, s[0:1], 0x2c ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, 
vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s2, s0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, local_var32@abs32@lo ; GFX1032-NEXT: v_mov_b32_e32 v2, s2 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2151,26 +2121,25 @@ ; ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB11_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX7LESS-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX7LESS-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: BB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -2189,15 +2158,14 @@ ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[2:3], exec +; GFX8-NEXT: 
s_bcnt1_i32_b64 s4, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB11_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX8-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX8-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX8-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2205,7 +2173,7 @@ ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: BB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: v_readfirstlane_b32 s3, v2 ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -2224,22 +2192,21 @@ ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[2:3], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_cbranch_execz BB11_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX9-NEXT: v_mul_u32_u24_e64 v1, s2, 5 +; GFX9-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX9-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: BB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: v_readfirstlane_b32 s3, v2 ; GFX9-NEXT: 
v_readfirstlane_b32 s2, v1 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v0 @@ -2257,17 +2224,16 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s4, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_cbranch_execz BB11_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3] +; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s4, 5 +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s4, 5 ; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo -; GFX1064-NEXT: v_mul_hi_u32_u24_e64 v2, s2, 5 -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s2, 5 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] @@ -2276,7 +2242,7 @@ ; GFX1064-NEXT: buffer_gl1_inv ; GFX1064-NEXT: BB11_2: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX1064-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v1 ; GFX1064-NEXT: v_mul_u32_u24_e32 v1, 5, v0 ; GFX1064-NEXT: v_readfirstlane_b32 s3, v2 @@ -2293,17 +2259,16 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s3, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB11_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: v_mul_hi_u32_u24_e64 v2, s3, 5 ; 
GFX1032-NEXT: v_mul_u32_u24_e64 v1, s3, 5 +; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1032-NEXT: ds_sub_rtn_u64 v[1:2], v3, v[1:2] @@ -2335,22 +2300,21 @@ ; ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz BB12_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v1 ; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX7LESS-NEXT: v_add_i32_e32 v2, vcc, s7, v1 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, s6 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 @@ -2382,13 +2346,12 @@ ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz BB12_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_hi_u32 v1, s2, v1 @@ -2426,13 +2389,12 @@ ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; 
GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz BB12_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s3, s6 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 @@ -2468,19 +2430,18 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz BB12_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1064-NEXT: s_mul_i32 s6, s2, s6 +; GFX1064-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1064-NEXT: s_add_i32 s8, s8, s7 ; GFX1064-NEXT: v_mov_b32_e32 v1, s6 ; GFX1064-NEXT: v_mov_b32_e32 v2, s8 @@ -2511,19 +2472,18 @@ ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s5, exec_lo ; GFX1032-NEXT: ; implicit-def: $vcc_hi ; GFX1032-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB12_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: 
s_mul_hi_u32 s7, s2, s5 ; GFX1032-NEXT: s_mul_i32 s5, s2, s5 +; GFX1032-NEXT: v_mov_b32_e32 v3, local_var64@abs32@lo ; GFX1032-NEXT: s_add_i32 s7, s7, s6 ; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: v_mov_b32_e32 v2, s7 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -19,19 +19,18 @@ ; GFX7-NEXT: s_and_saveexec_b64 s[8:9], s[10:11] ; GFX7-NEXT: s_cbranch_execz BB0_4 ; GFX7-NEXT: ; %bb.1: -; GFX7-NEXT: s_mov_b64 s[10:11], exec ; GFX7-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 ; GFX7-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GFX7-NEXT: s_bcnt1_i32_b64 s12, exec ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7-NEXT: ; implicit-def: $vgpr1 -; GFX7-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX7-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX7-NEXT: s_cbranch_execz BB0_3 ; GFX7-NEXT: ; %bb.2: -; GFX7-NEXT: s_bcnt1_i32_b64 s10, s[10:11] -; GFX7-NEXT: v_mul_u32_u24_e64 v1, s10, 5 +; GFX7-NEXT: v_mul_u32_u24_e64 v1, s12, 5 ; GFX7-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX7-NEXT: BB0_3: -; GFX7-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX7-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -54,17 +53,16 @@ ; GFX8-NEXT: ; %bb.1: ; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX8-NEXT: s_mov_b64 s[10:11], exec +; GFX8-NEXT: s_bcnt1_i32_b64 s12, exec ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX8-NEXT: s_cbranch_execz BB0_3 ; GFX8-NEXT: ; %bb.2: -; GFX8-NEXT: s_bcnt1_i32_b64 s10, s[10:11] -; GFX8-NEXT: v_mul_u32_u24_e64 v1, s10, 5 +; GFX8-NEXT: 
v_mul_u32_u24_e64 v1, s12, 5 ; GFX8-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX8-NEXT: BB0_3: -; GFX8-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX8-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s4, v1 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -87,17 +85,16 @@ ; GFX9-NEXT: ; %bb.1: ; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 -; GFX9-NEXT: s_mov_b64 s[10:11], exec +; GFX9-NEXT: s_bcnt1_i32_b64 s12, exec ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[10:11], vcc ; GFX9-NEXT: s_cbranch_execz BB0_3 ; GFX9-NEXT: ; %bb.2: -; GFX9-NEXT: s_bcnt1_i32_b64 s10, s[10:11] -; GFX9-NEXT: v_mul_u32_u24_e64 v1, s10, 5 +; GFX9-NEXT: v_mul_u32_u24_e64 v1, s12, 5 ; GFX9-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX9-NEXT: BB0_3: -; GFX9-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX9-NEXT: s_or_b64 exec, exec, s[10:11] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s4, v1 ; GFX9-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -119,19 +116,18 @@ ; GFX1064-NEXT: s_cbranch_execz BB0_4 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1064-NEXT: s_mov_b64 s[10:11], exec +; GFX1064-NEXT: s_bcnt1_i32_b64 s12, exec ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32_e64 v0, exec_hi, v0 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX1064-NEXT: s_and_saveexec_b64 s[12:13], vcc +; GFX1064-NEXT: s_and_saveexec_b64 s[30:31], vcc ; GFX1064-NEXT: s_cbranch_execz BB0_3 ; GFX1064-NEXT: ; %bb.2: -; GFX1064-NEXT: s_bcnt1_i32_b64 s10, s[10:11] -; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s10, 5 +; GFX1064-NEXT: v_mul_u32_u24_e64 v1, s12, 5 ; GFX1064-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1064-NEXT: BB0_3: ; GFX1064-NEXT: v_nop -; GFX1064-NEXT: s_or_b64 exec, exec, s[12:13] +; GFX1064-NEXT: s_or_b64 exec, 
exec, s[30:31] ; GFX1064-NEXT: s_waitcnt vmcnt(0) ; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u32_u24 v0, v0, 5, s4 @@ -154,13 +150,12 @@ ; GFX1032-NEXT: s_cbranch_execz BB0_4 ; GFX1032-NEXT: ; %bb.1: ; GFX1032-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 -; GFX1032-NEXT: s_mov_b32 s10, exec_lo +; GFX1032-NEXT: s_bcnt1_i32_b32 s10, exec_lo ; GFX1032-NEXT: ; implicit-def: $vgpr1 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s9, vcc_lo ; GFX1032-NEXT: s_cbranch_execz BB0_3 ; GFX1032-NEXT: ; %bb.2: -; GFX1032-NEXT: s_bcnt1_i32_b32 s10, s10 ; GFX1032-NEXT: v_mul_u32_u24_e64 v1, s10, 5 ; GFX1032-NEXT: buffer_atomic_add v1, off, s[4:7], 0 glc ; GFX1032-NEXT: BB0_3: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -12,11 +12,11 @@ ; GCN-LABEL: add_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -28,11 +28,11 @@ ; GCN-LABEL: add_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, 
v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -77,11 +77,11 @@ ; GCN-LABEL: sub_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -93,11 +93,11 @@ ; GCN-LABEL: sub_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: 
s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]] diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -12,11 +12,11 @@ ; GCN-LABEL: add_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_add v[[value]] define amdgpu_kernel void @add_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -28,11 +28,11 @@ ; GCN-LABEL: add_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: 
v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_add v[[value]] @@ -90,11 +90,11 @@ ; GCN-LABEL: sub_i32_constant: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: v_mul_u32_u24{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[popcount]], 5 ; GCN: buffer_atomic_sub v[[value]] define amdgpu_kernel void @sub_i32_constant(i32 addrspace(1)* %out, <4 x i32> %inout) { @@ -106,11 +106,11 @@ ; GCN-LABEL: sub_i32_uniform: ; GCN: v_mbcnt_lo_u32_b32{{(_e[0-9]+)?}} v[[mbcnt:[0-9]+]], exec_lo, 0 -; GCN64: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] +; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], exec_lo +; GCN64-DAG: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], exec +; GCN64-DAG: v_mbcnt_hi_u32_b32{{(_e[0-9]+)?}} v[[mbcnt]], exec_hi, v[[mbcnt]] ; GCN: v_cmp_eq_u32{{(_e[0-9]+)?}} vcc{{(_lo)?}}, 0, v[[mbcnt]] ; GCN: s_and_saveexec_b{{32|64}} s[[exec:\[?[0-9:]+\]?]], vcc -; GCN32: s_bcnt1_i32_b32 s[[popcount:[0-9]+]], s -; GCN64: s_bcnt1_i32_b64 s[[popcount:[0-9]+]], s ; GCN: s_mul_i32 s[[scalar_value:[0-9]+]], s{{[0-9]+}}, s[[popcount]] ; GCN: v_mov_b32{{(_e[0-9]+)?}} v[[value:[0-9]+]], s[[scalar_value]] ; GCN: buffer_atomic_sub v[[value]]