diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1162,9 +1162,6 @@ addPass(createAMDGPUAtomicOptimizerPass()); } - if (TM->getOptLevel() > CodeGenOpt::None) - addPass(createSinkingPass()); - // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. addPass(&AMDGPUUnifyDivergentExitNodesID); diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -29,17 +29,16 @@ ; ; GCN-LABEL: atomic_add: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN-NEXT: s_bcnt1_i32_b64 s4, exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB0_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_add v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_add v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB0_2: ; GCN-NEXT: s_endpgm .entry: @@ -72,15 +71,14 @@ ; ; GCN-LABEL: atomic_add_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN-NEXT: s_bcnt1_i32_b64 s6, exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB1_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_add v1, v2, s[0:3], 0 idxen glc @@ -123,17 +121,16 @@ ; ; GCN-LABEL: atomic_sub: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN-NEXT: s_bcnt1_i32_b64 s4, exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB2_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_sub v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_sub v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB2_2: ; GCN-NEXT: s_endpgm .entry: @@ -166,15 +163,14 @@ ; ; GCN-LABEL: atomic_sub_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN-NEXT: s_bcnt1_i32_b64 s6, exec ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB3_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_sub v1, v2, s[0:3], 0 idxen glc @@ -218,18 +214,17 @@ ; ; GCN-LABEL: atomic_xor: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[4:5], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN-NEXT: s_bcnt1_i32_b64 s4, exec +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN-NEXT: s_and_b32 s4, s4, 1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-NEXT: s_cbranch_execz .LBB4_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GCN-NEXT: s_and_b32 s4, s4, 1 -; GCN-NEXT: v_mov_b32_e32 v0, 0 -; GCN-NEXT: v_mov_b32_e32 v1, s4 -; GCN-NEXT: buffer_atomic_xor v1, v0, s[0:3], 0 idxen +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: buffer_atomic_xor v0, v1, s[0:3], 0 idxen ; GCN-NEXT: .LBB4_2: ; GCN-NEXT: s_endpgm .entry: @@ -264,16 +259,15 @@ ; ; GCN-LABEL: atomic_xor_and_format: ; GCN: ; %bb.0: ; %.entry -; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s7, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GCN-NEXT: s_bcnt1_i32_b64 s4, exec +; GCN-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0 +; GCN-NEXT: s_and_b32 s6, s4, 1 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB5_2 ; GCN-NEXT: ; %bb.1: -; GCN-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GCN-NEXT: s_and_b32 s6, s6, 1 ; GCN-NEXT: v_mov_b32_e32 v1, s6 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: buffer_atomic_xor v1, v2, s[0:3], 0 idxen glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/non-entry-alloca.ll @@ -22,20 +22,20 @@ ; GCN-NEXT: s_cmp_lg_u32 s6, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 -; GCN-NEXT: s_load_dword s6, s[4:5], 0xc +; GCN-NEXT: s_load_dword s7, s[4:5], 0xc +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_cmp_lg_u32 s6, 0 +; GCN-NEXT: s_cmp_lg_u32 s7, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB0_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_load_dword s6, s[4:5], 0x10 -; GCN-NEXT: s_add_u32 s7, s32, 0x1000 +; GCN-NEXT: s_load_dword s7, s[4:5], 0x10 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: v_mov_b32_e32 v3, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s6, s6, 2 -; GCN-NEXT: s_add_u32 s6, s7, s6 ; GCN-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_lshl_b32 s7, s7, 2 +; GCN-NEXT: s_add_u32 s6, s6, s7 ; GCN-NEXT: buffer_store_dword v3, v2, s[0:3], 0 offen offset:4 ; GCN-NEXT: v_mov_b32_e32 v2, s6 ; GCN-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen @@ -157,11 +157,11 @@ ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_cbranch_execz .LBB2_3 ; GCN-NEXT: ; %bb.1: ; %bb.0 +; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 ; GCN-NEXT: s_and_b64 exec, exec, vcc ; GCN-NEXT: s_cbranch_execz .LBB2_3 ; GCN-NEXT: ; %bb.2: ; %bb.1 -; GCN-NEXT: s_add_u32 s6, s32, 0x1000 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: v_mov_b32_e32 v3, s6 ; GCN-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -23,9 +23,9 @@ ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 @@ -38,10 +38,10 @@ ; GFX7LESS-NEXT: .LBB0_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mad_u32_u24 v0, v0, 5, s4 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -56,25 +56,26 @@ ; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX89-NEXT: s_cbranch_execz .LBB0_2 ; GFX89-NEXT: ; %bb.1: -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: s_mul_i32 s2, s2, 5 +; GFX89-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX89-NEXT: s_mul_i32 s6, s6, 5 ; GFX89-NEXT: s_mov_b32 s11, 0xf000 ; GFX89-NEXT: s_mov_b32 s10, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 ; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v1, s2 +; GFX89-NEXT: v_mov_b32_e32 v1, s6 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX89-NEXT: buffer_atomic_add v1, off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) ; GFX89-NEXT: buffer_wbinvl1_vol ; GFX89-NEXT: .LBB0_2: ; GFX89-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX89-NEXT: v_readfirstlane_b32 s4, v1 ; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: v_readfirstlane_b32 s2, v1 +; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s2 ; GFX89-NEXT: s_mov_b32 s3, 0xf000 ; GFX89-NEXT: s_mov_b32 s2, -1 -; GFX89-NEXT: v_mad_u32_u24 v0, v0, 5, s4 +; GFX89-NEXT: s_nop 1 ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; @@ -242,10 +243,10 @@ ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB1_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 @@ -256,12 +257,12 @@ ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB1_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -559,12 +560,13 @@ ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB2_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_add_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -613,12 +615,13 @@ ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB2_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_add_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_add_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -891,24 +894,25 @@ define amdgpu_kernel void @add_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB3_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -916,39 +920,40 @@ ; GFX7LESS-NEXT: .LBB3_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX89-LABEL: add_i64_constant: ; GFX89: ; %bb.0: ; %entry ; GFX89-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX89-NEXT: s_mov_b64 s[6:7], exec -; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX89-NEXT: s_mov_b64 s[8:9], exec +; GFX89-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX89-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX89-NEXT: s_mov_b32 s7, 0 ; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX89-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX89-NEXT: s_cbranch_execz .LBB3_2 ; GFX89-NEXT: ; %bb.1: -; GFX89-NEXT: s_waitcnt lgkmcnt(0) -; GFX89-NEXT: s_mov_b32 s8, s2 -; GFX89-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX89-NEXT: s_mul_i32 s2, s2, 5 +; GFX89-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX89-NEXT: s_mul_i32 s6, s6, 5 +; GFX89-NEXT: v_mov_b32_e32 v0, s6 ; GFX89-NEXT: s_mov_b32 s11, 0xf000 ; GFX89-NEXT: s_mov_b32 s10, -1 +; GFX89-NEXT: s_waitcnt lgkmcnt(0) +; GFX89-NEXT: s_mov_b32 s8, s2 ; GFX89-NEXT: s_mov_b32 s9, s3 -; GFX89-NEXT: v_mov_b32_e32 v0, s2 -; GFX89-NEXT: v_mov_b32_e32 v1, 0 +; GFX89-NEXT: v_mov_b32_e32 v1, s7 ; GFX89-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX89-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX89-NEXT: s_waitcnt vmcnt(0) @@ -970,20 +975,21 @@ ; GFX1064-LABEL: add_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB3_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s8, s2 ; GFX1064-NEXT: s_mov_b32 s9, s3 @@ -1008,19 +1014,20 @@ ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_mov_b32 s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB3_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: s_mul_i32 s4, s4, 5 ; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s8, s2 ; GFX1032-NEXT: s_mov_b32 s9, s3 @@ -1032,7 +1039,7 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: .LBB3_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -1045,21 +1052,22 @@ ; GFX1164-LABEL: add_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB3_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s6, s6, 5 +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 ; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 @@ -1085,19 +1093,20 @@ ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: s_mov_b32 s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB3_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_mul_i32 s5, s5, 5 +; GFX1132-NEXT: s_mul_i32 s4, s4, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 @@ -1108,7 +1117,7 @@ ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: .LBB3_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -1128,47 +1137,44 @@ define amdgpu_kernel void @add_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %additive) { ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s3, s1, s2 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX7LESS-NEXT: s_mul_i32 s2, s0, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s9, s7 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc +; GFX7LESS-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB4_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 ; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s2, v2 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1176,26 +1182,24 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[8:9], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: s_bcnt1_i32_b64 s8, exec +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v0, 0 +; GFX8-NEXT: s_mul_i32 s2, s1, s8 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s1, s6 -; GFX8-NEXT: s_mov_b32 s15, 0xf000 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc +; GFX8-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB4_2: @@ -1204,7 +1208,6 @@ ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v3, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[0:1], s0, v2, v[0:1] ; GFX8-NEXT: s_mov_b32 s7, 0xf000 @@ -1225,20 +1228,20 @@ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_mul_i32 s9, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX9-NEXT: s_mul_i32 s12, s2, s8 +; GFX9-NEXT: s_add_i32 s13, s10, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_add_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB4_2: @@ -1274,9 +1277,9 @@ ; GFX1064-NEXT: s_mul_i32 s9, s3, s8 ; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 ; GFX1064-NEXT: s_mul_i32 s8, s2, s8 -; GFX1064-NEXT: s_add_i32 s10, s10, s9 +; GFX1064-NEXT: s_add_i32 s9, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 -; GFX1064-NEXT: v_mov_b32_e32 v1, s10 +; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s8, s6 ; GFX1064-NEXT: s_mov_b32 s9, s7 @@ -1316,11 +1319,11 @@ ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s8, s3, s1 ; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s8, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1362,9 +1365,9 @@ ; GFX1164-NEXT: s_mul_i32 s9, s1, s8 ; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 ; GFX1164-NEXT: s_mul_i32 s8, s0, s8 -; GFX1164-NEXT: s_add_i32 s10, s10, s9 +; GFX1164-NEXT: s_add_i32 s9, s10, s9 ; GFX1164-NEXT: v_mov_b32_e32 v0, s8 -; GFX1164-NEXT: v_mov_b32_e32 v1, s10 +; GFX1164-NEXT: v_mov_b32_e32 v1, s9 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_mov_b32 s8, s6 ; GFX1164-NEXT: s_mov_b32 s9, s7 @@ -1408,11 +1411,11 @@ ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 ; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1132-NEXT: s_mul_i32 s3, s0, s3 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_add_i32 s9, s9, s8 +; GFX1132-NEXT: s_mul_i32 s8, s0, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 -; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX1132-NEXT: s_mov_b32 s8, s6 ; GFX1132-NEXT: s_mov_b32 s9, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1547,9 +1550,9 @@ ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB6_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] ; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 @@ -1562,11 +1565,11 @@ ; GFX7LESS-NEXT: .LBB6_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 +; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 -; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -1581,26 +1584,27 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB6_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX8-NEXT: s_mul_i32 s6, s6, 5 ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v1, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s6 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB6_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v1 -; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v1 +; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -1615,26 +1619,27 @@ ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB6_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX9-NEXT: s_mul_i32 s6, s6, 5 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v1, s2 +; GFX9-NEXT: v_mov_b32_e32 v1, s6 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub v1, off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB6_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v1 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v1 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v0 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -1806,10 +1811,10 @@ ; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mul_i32 s2, s8, s2 +; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s14, -1 ; GFX7LESS-NEXT: s_mov_b32 s12, s6 ; GFX7LESS-NEXT: s_mov_b32 s13, s7 @@ -1820,12 +1825,12 @@ ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB7_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[0:1] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v1 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s8, v0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v0 +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2127,12 +2132,13 @@ ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB8_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_mov_b32_e32 v0, v1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, v1 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; @@ -2181,12 +2187,13 @@ ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB8_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v1 +; GFX9-NEXT: v_sub_u32_e32 v0, s2, v0 ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_sub_u32_e32 v0, s4, v0 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; @@ -2459,24 +2466,25 @@ define amdgpu_kernel void @sub_i64_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %inout) { ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec +; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 +; GFX7LESS-NEXT: s_mov_b32 s7, 0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB9_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX7LESS-NEXT: s_mul_i32 s6, s6, 5 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 ; GFX7LESS-NEXT: s_mov_b32 s10, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: s_mov_b32 s8, s2 ; GFX7LESS-NEXT: s_mov_b32 s9, s3 ; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s7 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) @@ -2484,114 +2492,117 @@ ; GFX7LESS-NEXT: .LBB9_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s2, -1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v1 +; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v2 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s5 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GFX7LESS-NEXT: s_mov_b32 s2, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: s_mov_b64 s[8:9], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: s_mov_b32 s7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB9_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s8, s2 -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX8-NEXT: s_mul_i32 s6, s6, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_mov_b32 s11, 0xf000 ; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mov_b32 s8, s2 ; GFX8-NEXT: s_mov_b32 s9, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB9_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s3, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX8-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, s5 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s4, v0 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s3 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v0 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[6:7], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_mov_b64 s[8:9], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX9-NEXT: s_mov_b32 s7, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB9_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s8, s2 -; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] -; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] +; GFX9-NEXT: s_mul_i32 s6, s6, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_mov_b32 s11, 0xf000 ; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_mov_b32 s8, s2 ; GFX9-NEXT: s_mov_b32 s9, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB9_2: ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s4, v0 -; GFX9-NEXT: v_readfirstlane_b32 s5, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX9-NEXT: v_mul_hi_u32_u24_e32 v1, 5, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s4, v0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v2, s3 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v0 +; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 -; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_constant: ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: s_mov_b64 s[6:7], exec -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1064-NEXT: s_mov_b64 s[8:9], exec +; GFX1064-NEXT: s_mov_b32 s7, 0 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_cbranch_execz .LBB9_2 ; GFX1064-NEXT: ; %bb.1: -; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s6, s6, 5 +; GFX1064-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX1064-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: s_mul_i32 s6, s6, 5 ; GFX1064-NEXT: s_mov_b32 s10, -1 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_mov_b32 s8, s2 ; GFX1064-NEXT: s_mov_b32 s9, s3 @@ -2619,19 +2630,20 @@ ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s5, exec_lo +; GFX1032-NEXT: s_mov_b32 s4, exec_lo +; GFX1032-NEXT: s_mov_b32 s5, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s6, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB9_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s5, s5, 5 +; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1032-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: s_mul_i32 s4, s4, 5 ; GFX1032-NEXT: s_mov_b32 s10, -1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s4 +; GFX1032-NEXT: v_mov_b32_e32 v1, s5 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mov_b32 s8, s2 ; GFX1032-NEXT: s_mov_b32 s9, s3 @@ -2643,7 +2655,7 @@ ; GFX1032-NEXT: buffer_gl1_inv ; GFX1032-NEXT: .LBB9_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2659,21 +2671,22 @@ ; GFX1164-LABEL: sub_i64_constant: ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[6:7], exec +; GFX1164-NEXT: s_mov_b64 s[8:9], exec +; GFX1164-NEXT: s_mov_b32 s7, 0 +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1164-NEXT: s_cbranch_execz .LBB9_2 ; GFX1164-NEXT: ; %bb.1: -; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s6, s6, 5 +; GFX1164-NEXT: s_bcnt1_i32_b64 s6, s[8:9] ; GFX1164-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: s_mul_i32 s6, s6, 5 ; GFX1164-NEXT: s_mov_b32 s10, -1 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: s_mov_b32 s8, s2 ; GFX1164-NEXT: s_mov_b32 s9, s3 @@ -2702,19 +2715,20 @@ ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s5, 0 +; GFX1132-NEXT: s_mov_b32 s5, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s4, 0 +; GFX1132-NEXT: s_mov_b32 s6, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB9_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 +; GFX1132-NEXT: s_bcnt1_i32_b32 s4, s4 ; GFX1132-NEXT: s_mov_b32 s11, 0x31016000 -; GFX1132-NEXT: s_mul_i32 s5, s5, 5 +; GFX1132-NEXT: s_mul_i32 s4, s4, 5 ; GFX1132-NEXT: s_mov_b32 s10, -1 -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, 0 +; GFX1132-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mov_b32 s8, s2 ; GFX1132-NEXT: s_mov_b32 s9, s3 @@ -2725,7 +2739,7 @@ ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: buffer_gl1_inv ; GFX1132-NEXT: .LBB9_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s6 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 @@ -2748,47 +2762,44 @@ define amdgpu_kernel void @sub_i64_uniform(i64 addrspace(1)* %out, i64 addrspace(1)* %inout, i64 %subitive) { ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[8:9], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s8, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s9, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, exec +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, exec_hi, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s3, s1, s2 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 +; GFX7LESS-NEXT: s_mul_i32 s2, s0, s2 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s3, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB10_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_mov_b32 s15, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s14, -1 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s12, s6 -; GFX7LESS-NEXT: s_mov_b32 s13, s7 -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX7LESS-NEXT: s_mul_i32 s7, s1, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s0, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s0, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: s_mov_b32 s11, 0xf000 +; GFX7LESS-NEXT: s_mov_b32 s10, -1 +; GFX7LESS-NEXT: s_mov_b32 s8, s6 +; GFX7LESS-NEXT: s_mov_b32 s9, s7 ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc +; GFX7LESS-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX7LESS-NEXT: s_waitcnt vmcnt(0) ; GFX7LESS-NEXT: buffer_wbinvl1 ; GFX7LESS-NEXT: .LBB10_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s3, v1 ; GFX7LESS-NEXT: s_waitcnt expcnt(0) ; GFX7LESS-NEXT: v_mul_lo_u32 v0, s1, v2 ; GFX7LESS-NEXT: v_mul_hi_u32 v1, s0, v2 ; GFX7LESS-NEXT: v_mul_lo_u32 v2, s0, v2 +; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 ; GFX7LESS-NEXT: v_mov_b32_e32 v3, s3 ; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s2, v2 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc +; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX7LESS-NEXT: s_endpgm ; @@ -2796,31 +2807,28 @@ ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 -; GFX8-NEXT: s_mov_b64 s[8:9], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s8, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s9, v0 +; GFX8-NEXT: s_bcnt1_i32_b64 s8, exec +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s0, v0, 0 +; GFX8-NEXT: s_mul_i32 s2, s1, s8 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s2, v1 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB10_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s12, s6 -; GFX8-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[8:9], s0, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s1, s6 -; GFX8-NEXT: s_mov_b32 s15, 0xf000 -; GFX8-NEXT: s_mov_b32 s14, -1 -; GFX8-NEXT: s_mov_b32 s13, s7 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: s_mov_b32 s11, 0xf000 +; GFX8-NEXT: s_mov_b32 s10, -1 +; GFX8-NEXT: s_mov_b32 s8, s6 +; GFX8-NEXT: s_mov_b32 s9, s7 ; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc +; GFX8-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_wbinvl1_vol ; GFX8-NEXT: .LBB10_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] -; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mul_lo_u32 v4, s1, v2 ; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s0, v2, 0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -2828,9 +2836,9 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 +; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: s_mov_b32 s7, 0xf000 ; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; @@ -2846,20 +2854,20 @@ ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_cbranch_execz .LBB10_2 ; GFX9-NEXT: ; %bb.1: +; GFX9-NEXT: s_bcnt1_i32_b64 s8, s[8:9] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s12, s6 -; GFX9-NEXT: s_bcnt1_i32_b64 s6, s[8:9] -; GFX9-NEXT: s_mov_b32 s13, s7 -; GFX9-NEXT: s_mul_i32 s7, s3, s6 -; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 -; GFX9-NEXT: s_mul_i32 s6, s2, s6 -; GFX9-NEXT: s_mov_b32 s15, 0xf000 -; GFX9-NEXT: s_mov_b32 s14, -1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-NEXT: s_mul_i32 s9, s3, s8 +; GFX9-NEXT: s_mul_hi_u32 s10, s2, s8 +; GFX9-NEXT: s_mul_i32 s12, s2, s8 +; GFX9-NEXT: s_add_i32 s13, s10, s9 +; GFX9-NEXT: v_mov_b32_e32 v0, s12 +; GFX9-NEXT: s_mov_b32 s11, 0xf000 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s8, s6 +; GFX9-NEXT: s_mov_b32 s9, s7 +; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[12:15], 0 glc +; GFX9-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[8:11], 0 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_wbinvl1_vol ; GFX9-NEXT: .LBB10_2: @@ -2871,8 +2879,8 @@ ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[0:1], s3, v2, v[4:5] ; GFX9-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 -; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -2897,9 +2905,9 @@ ; GFX1064-NEXT: s_mul_i32 s9, s3, s8 ; GFX1064-NEXT: s_mul_hi_u32 s10, s2, s8 ; GFX1064-NEXT: s_mul_i32 s8, s2, s8 -; GFX1064-NEXT: s_add_i32 s10, s10, s9 +; GFX1064-NEXT: s_add_i32 s9, s10, s9 ; GFX1064-NEXT: v_mov_b32_e32 v0, s8 -; GFX1064-NEXT: v_mov_b32_e32 v1, s10 +; GFX1064-NEXT: v_mov_b32_e32 v1, s9 ; GFX1064-NEXT: s_mov_b32 s10, -1 ; GFX1064-NEXT: s_mov_b32 s8, s6 ; GFX1064-NEXT: s_mov_b32 s9, s7 @@ -2942,11 +2950,11 @@ ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s8, s3, s1 ; GFX1032-NEXT: s_mul_hi_u32 s9, s2, s1 -; GFX1032-NEXT: s_mul_i32 s1, s2, s1 +; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_add_i32 s9, s9, s8 -; GFX1032-NEXT: v_mov_b32_e32 v0, s1 +; GFX1032-NEXT: s_mul_i32 s8, s2, s1 +; GFX1032-NEXT: v_mov_b32_e32 v0, s8 ; GFX1032-NEXT: v_mov_b32_e32 v1, s9 -; GFX1032-NEXT: s_mov_b32 s10, -1 ; GFX1032-NEXT: s_mov_b32 s8, s6 ; GFX1032-NEXT: s_mov_b32 s9, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2991,9 +2999,9 @@ ; GFX1164-NEXT: s_mul_i32 s9, s1, s8 ; GFX1164-NEXT: s_mul_hi_u32 s10, s0, s8 ; GFX1164-NEXT: s_mul_i32 s8, s0, s8 -; GFX1164-NEXT: s_add_i32 s10, s10, s9 +; GFX1164-NEXT: s_add_i32 s9, s10, s9 ; GFX1164-NEXT: v_mov_b32_e32 v0, s8 -; GFX1164-NEXT: v_mov_b32_e32 v1, s10 +; GFX1164-NEXT: v_mov_b32_e32 v1, s9 ; GFX1164-NEXT: s_mov_b32 s10, -1 ; GFX1164-NEXT: s_mov_b32 s8, s6 ; GFX1164-NEXT: s_mov_b32 s9, s7 @@ -3039,11 +3047,11 @@ ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s8, s1, s3 ; GFX1132-NEXT: s_mul_hi_u32 s9, s0, s3 -; GFX1132-NEXT: s_mul_i32 s3, s0, s3 +; GFX1132-NEXT: s_mov_b32 s10, -1 ; GFX1132-NEXT: s_add_i32 s9, s9, s8 +; GFX1132-NEXT: s_mul_i32 s8, s0, s3 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s9 -; GFX1132-NEXT: s_mov_b32 s10, -1 +; GFX1132-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 ; GFX1132-NEXT: s_mov_b32 s8, s6 ; GFX1132-NEXT: s_mov_b32 s9, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -19,25 +19,25 @@ ; ; GFX7LESS-LABEL: add_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB0_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB0_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 @@ -49,24 +49,24 @@ ; GFX8-LABEL: add_i32_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB0_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_add_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB0_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mad_u32_u24 v0, v0, 5, s2 @@ -1074,25 +1074,27 @@ ; ; GFX7LESS-LABEL: add_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB4_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB4_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -1109,24 +1111,26 @@ ; GFX8-LABEL: add_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: s_mov_b32 s3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB4_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB4_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 @@ -1142,23 +1146,25 @@ ; GFX9-LABEL: add_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB4_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB4_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 @@ -1175,6 +1181,7 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s7, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1183,12 +1190,13 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB4_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mul_i32 s6, s4, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB4_2: @@ -1206,25 +1214,27 @@ ; GFX1032-LABEL: add_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB4_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_mul_i32 s2, s2, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB4_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_mad_u64_u32 v[0:1], s2, v2, 5, s[2:3] @@ -1238,8 +1248,9 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s7, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -1247,13 +1258,14 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB4_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mul_i32 s6, s4, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB4_2: @@ -1272,26 +1284,27 @@ ; GFX1132-LABEL: add_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: s_mov_b32 s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB4_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_mul_i32 s2, s2, 5 +; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB4_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -1313,84 +1326,79 @@ ; ; GFX7LESS-LABEL: add_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, exec +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, exec_hi, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s5, s3, s4 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX7LESS-NEXT: s_mul_i32 s4, s2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s5, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX7LESS-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB5_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] +; GFX7LESS-NEXT: ds_add_rtn_u64 v[2:3], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB5_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v4 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v4 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v4 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 -; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s5 +; GFX7LESS-NEXT: v_add_i32_e32 v0, vcc, s4, v2 ; GFX7LESS-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: add_i64_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, exec +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v2 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v0, 0 +; GFX8-NEXT: s_mul_i32 s4, s3, s6 +; GFX8-NEXT: ; implicit-def: $vgpr2_vgpr3 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, s4, v1 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB5_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] +; GFX8-NEXT: ds_add_rtn_u64 v[2:3], v2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB5_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_readfirstlane_b32 s4, v0 -; GFX8-NEXT: v_readfirstlane_b32 s5, v1 +; GFX8-NEXT: v_readfirstlane_b32 s4, v2 +; GFX8-NEXT: v_readfirstlane_b32 s5, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mul_lo_u32 v3, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v2, v[0:1] -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v1 -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: v_mul_lo_u32 v2, s3, v4 +; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[2:3], s2, v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v2, v1 +; GFX8-NEXT: s_nop 1 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: add_i64_uniform: @@ -1408,11 +1416,11 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s3, s6 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_add_i32 s7, s8, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -1424,12 +1432,11 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s2, v2, v[0:1] -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 ; GFX9-NEXT: v_mad_u64_u32 v[1:2], s[2:3], s3, v2, v[1:2] -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: s_nop 2 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: add_i64_uniform: @@ -1449,9 +1456,9 @@ ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1064-NEXT: s_mul_i32 s6, s2, s6 -; GFX1064-NEXT: s_add_i32 s8, s8, s7 +; GFX1064-NEXT: s_add_i32 s7, s8, s7 ; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s8 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] @@ -1485,9 +1492,9 @@ ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1032-NEXT: s_mul_i32 s5, s2, s5 ; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: s_mul_i32 s6, s2, s5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1525,9 +1532,9 @@ ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1164-NEXT: s_mul_i32 s6, s2, s6 -; GFX1164-NEXT: s_add_i32 s8, s8, s7 +; GFX1164-NEXT: s_add_i32 s7, s8, s7 ; GFX1164-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164-NEXT: v_mov_b32_e32 v1, s8 +; GFX1164-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1164-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] @@ -1561,14 +1568,14 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB5_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1132-NEXT: s_mul_i32 s5, s2, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 +; GFX1132-NEXT: s_mul_i32 s6, s2, s5 +; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6 +; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] @@ -1677,25 +1684,25 @@ ; ; GFX7LESS-LABEL: sub_i32_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec +; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s5, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX7LESS-NEXT: ; implicit-def: $vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB7_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 ; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v2, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, s4 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB7_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v1 ; GFX7LESS-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -1708,24 +1715,24 @@ ; GFX8-LABEL: sub_i32_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0 +; GFX8-NEXT: s_mov_b64 s[4:5], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, s5, v0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz .LBB7_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[2:3] -; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] +; GFX8-NEXT: s_mul_i32 s4, s4, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 -; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: ds_sub_rtn_u32 v1, v1, v2 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB7_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v1 ; GFX8-NEXT: v_mul_u32_u24_e32 v0, 5, v0 @@ -2743,25 +2750,27 @@ ; ; GFX7LESS-LABEL: sub_i64_constant: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[4:5], exec +; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s4, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s5, v0 +; GFX7LESS-NEXT: s_mov_b32 s3, 0 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 ; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB11_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX7LESS-NEXT: s_mul_i32 s4, s4, 5 -; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX7LESS-NEXT: s_mul_i32 s2, s2, 5 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2 +; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB11_2: -; GFX7LESS-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: v_readfirstlane_b32 s2, v0 ; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v1 @@ -2778,24 +2787,26 @@ ; GFX8-LABEL: sub_i64_constant: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[4:5], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX8-NEXT: s_mov_b64 s[6:7], exec +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX8-NEXT: s_mov_b32 s3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB11_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX8-NEXT: s_mul_i32 s4, s4, 5 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 +; GFX8-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX8-NEXT: s_mul_i32 s2, s2, 5 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB11_2: -; GFX8-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_readfirstlane_b32 s2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s3, v1 @@ -2812,23 +2823,25 @@ ; GFX9-LABEL: sub_i64_constant: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX9-NEXT: s_mov_b64 s[4:5], exec -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 +; GFX9-NEXT: s_mov_b64 s[6:7], exec +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 +; GFX9-NEXT: s_mov_b32 s3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 ; GFX9-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_cbranch_execz .LBB11_2 ; GFX9-NEXT: ; %bb.1: -; GFX9-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX9-NEXT: s_mul_i32 s4, s4, 5 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[6:7] +; GFX9-NEXT: s_mul_i32 s2, s2, 5 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: .LBB11_2: -; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_readfirstlane_b32 s2, v0 ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 @@ -2846,6 +2859,7 @@ ; GFX1064: ; %bb.0: ; %entry ; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; GFX1064-NEXT: s_mov_b64 s[4:5], exec +; GFX1064-NEXT: s_mov_b32 s7, 0 ; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 ; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2854,12 +2868,13 @@ ; GFX1064-NEXT: s_cbranch_execz .LBB11_2 ; GFX1064-NEXT: ; %bb.1: ; GFX1064-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1064-NEXT: v_mov_b32_e32 v1, 0 -; GFX1064-NEXT: s_mul_i32 s4, s4, 5 -; GFX1064-NEXT: v_mov_b32_e32 v0, s4 +; GFX1064-NEXT: v_mov_b32_e32 v3, 0 +; GFX1064-NEXT: s_mul_i32 s6, s4, 5 +; GFX1064-NEXT: v_mov_b32_e32 v0, s6 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: buffer_gl0_inv ; GFX1064-NEXT: .LBB11_2: @@ -2880,25 +2895,27 @@ ; GFX1032-LABEL: sub_i64_constant: ; GFX1032: ; %bb.0: ; %entry ; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX1032-NEXT: s_mov_b32 s3, exec_lo +; GFX1032-NEXT: s_mov_b32 s2, exec_lo +; GFX1032-NEXT: s_mov_b32 s3, 0 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr0_vgpr1 -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; GFX1032-NEXT: s_and_saveexec_b32 s2, vcc_lo +; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo ; GFX1032-NEXT: s_cbranch_execz .LBB11_2 ; GFX1032-NEXT: ; %bb.1: -; GFX1032-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1032-NEXT: v_mov_b32_e32 v1, 0 -; GFX1032-NEXT: s_mul_i32 s3, s3, 5 -; GFX1032-NEXT: v_mov_b32_e32 v0, s3 +; GFX1032-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1032-NEXT: v_mov_b32_e32 v3, 0 +; GFX1032-NEXT: s_mul_i32 s2, s2, 5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s2 +; GFX1032-NEXT: v_mov_b32_e32 v1, s3 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1032-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: buffer_gl0_inv ; GFX1032-NEXT: .LBB11_2: ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 -; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1032-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 @@ -2915,8 +2932,9 @@ ; GFX1164: ; %bb.0: ; %entry ; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX1164-NEXT: s_mov_b64 s[4:5], exec -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: s_mov_b32 s7, 0 ; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s4, 0 +; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, s5, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0_vgpr1 @@ -2924,13 +2942,14 @@ ; GFX1164-NEXT: s_cbranch_execz .LBB11_2 ; GFX1164-NEXT: ; %bb.1: ; GFX1164-NEXT: s_bcnt1_i32_b64 s4, s[4:5] -; GFX1164-NEXT: v_mov_b32_e32 v1, 0 -; GFX1164-NEXT: s_mul_i32 s4, s4, 5 +; GFX1164-NEXT: v_mov_b32_e32 v3, 0 +; GFX1164-NEXT: s_mul_i32 s6, s4, 5 ; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1164-NEXT: v_mov_b32_e32 v0, s4 +; GFX1164-NEXT: v_mov_b32_e32 v0, s6 +; GFX1164-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: buffer_gl0_inv ; GFX1164-NEXT: .LBB11_2: @@ -2952,26 +2971,27 @@ ; GFX1132-LABEL: sub_i64_constant: ; GFX1132: ; %bb.0: ; %entry ; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, exec_lo -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s3, 0 +; GFX1132-NEXT: s_mov_b32 s3, 0 +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, s2, 0 +; GFX1132-NEXT: s_mov_b32 s4, exec_lo ; GFX1132-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 ; GFX1132-NEXT: s_cbranch_execz .LBB11_2 ; GFX1132-NEXT: ; %bb.1: -; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v0, s3 +; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: s_mul_i32 s2, s2, 5 +; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s2 +; GFX1132-NEXT: v_mov_b32_e32 v1, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v1, v[0:1] +; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: buffer_gl0_inv ; GFX1132-NEXT: .LBB11_2: -; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 ; GFX1132-NEXT: v_mul_u32_u24_e32 v0, 5, v2 ; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 @@ -2996,85 +3016,79 @@ ; ; GFX7LESS-LABEL: sub_i64_uniform: ; GFX7LESS: ; %bb.0: ; %entry -; GFX7LESS-NEXT: s_mov_b64 s[6:7], exec ; GFX7LESS-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 -; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s6, 0 -; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v2, s7, v0 -; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 -; GFX7LESS-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0 +; GFX7LESS-NEXT: s_bcnt1_i32_b64 s4, exec +; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v4, exec_hi, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) +; GFX7LESS-NEXT: s_mul_i32 s5, s3, s4 +; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 +; GFX7LESS-NEXT: s_mul_i32 s4, s2, s4 +; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s5, v0 +; GFX7LESS-NEXT: v_mov_b32_e32 v0, s4 +; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 +; GFX7LESS-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX7LESS-NEXT: s_cbranch_execz .LBB12_2 ; GFX7LESS-NEXT: ; %bb.1: -; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[6:7] -; GFX7LESS-NEXT: v_mov_b32_e32 v3, 0 -; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mul_i32 s7, s3, s6 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 -; GFX7LESS-NEXT: v_mul_hi_u32 v0, s2, v0 -; GFX7LESS-NEXT: s_mul_i32 s6, s2, s6 -; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, s7, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v0, s6 +; GFX7LESS-NEXT: v_mov_b32_e32 v2, 0 ; GFX7LESS-NEXT: s_mov_b32 m0, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX7LESS-NEXT: ds_sub_rtn_u64 v[2:3], v2, v[0:1] ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) ; GFX7LESS-NEXT: .LBB12_2: ; GFX7LESS-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX7LESS-NEXT: s_mov_b32 s7, 0xf000 -; GFX7LESS-NEXT: s_mov_b32 s6, -1 ; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0) -; GFX7LESS-NEXT: s_mov_b32 s4, s0 -; GFX7LESS-NEXT: s_mov_b32 s5, s1 -; GFX7LESS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX7LESS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v2 -; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v2 -; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s4, v2 +; GFX7LESS-NEXT: v_readfirstlane_b32 s5, v3 +; GFX7LESS-NEXT: v_mul_lo_u32 v0, s3, v4 +; GFX7LESS-NEXT: v_mul_hi_u32 v1, s2, v4 +; GFX7LESS-NEXT: v_mul_lo_u32 v2, s2, v4 +; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000 ; GFX7LESS-NEXT: v_add_i32_e32 v1, vcc, v1, v0 -; GFX7LESS-NEXT: v_mov_b32_e32 v3, s1 -; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s0, v2 +; GFX7LESS-NEXT: v_mov_b32_e32 v3, s5 +; GFX7LESS-NEXT: v_sub_i32_e32 v0, vcc, s4, v2 ; GFX7LESS-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX7LESS-NEXT: s_mov_b32 s2, -1 +; GFX7LESS-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX7LESS-NEXT: s_endpgm ; ; GFX8-LABEL: sub_i64_uniform: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: s_mov_b64 s[6:7], exec -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, s7, v0 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: s_bcnt1_i32_b64 s6, exec +; GFX8-NEXT: v_mov_b32_e32 v1, s6 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v4, exec_hi, v0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[4:5], s2, v1, 0 +; GFX8-NEXT: s_mul_i32 s4, s3, s6 ; GFX8-NEXT: ; implicit-def: $vgpr0_vgpr1 +; GFX8-NEXT: v_add_u32_e32 v3, vcc, s4, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_cbranch_execz .LBB12_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: s_bcnt1_i32_b64 s8, s[6:7] -; GFX8-NEXT: v_mov_b32_e32 v0, s8 -; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u64_u32 v[0:1], s[6:7], s2, v0, 0 -; GFX8-NEXT: s_mul_i32 s6, s3, s8 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, s6, v1 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] +; GFX8-NEXT: ds_sub_rtn_u64 v[0:1], v0, v[2:3] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: .LBB12_2: ; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mul_lo_u32 v5, s3, v4 +; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[2:3], s2, v4, 0 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s4, s0 -; GFX8-NEXT: s_mov_b32 s5, s1 -; GFX8-NEXT: v_mul_lo_u32 v4, s3, v2 -; GFX8-NEXT: v_mad_u64_u32 v[2:3], s[0:1], s2, v2, 0 -; GFX8-NEXT: v_readfirstlane_b32 s0, v0 -; GFX8-NEXT: v_readfirstlane_b32 s1, v1 -; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v4 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s0, v2 -; GFX8-NEXT: s_mov_b32 s7, 0xf000 -; GFX8-NEXT: s_mov_b32 s6, -1 +; GFX8-NEXT: v_readfirstlane_b32 s2, v0 +; GFX8-NEXT: v_readfirstlane_b32 s3, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v3, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_sub_u32_e32 v0, vcc, s2, v2 ; GFX8-NEXT: v_subb_u32_e32 v1, vcc, v3, v1, vcc -; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: sub_i64_uniform: @@ -3092,11 +3106,11 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_mul_i32 s7, s3, s6 ; GFX9-NEXT: s_mul_hi_u32 s8, s2, s6 -; GFX9-NEXT: s_add_i32 s8, s8, s7 ; GFX9-NEXT: s_mul_i32 s6, s2, s6 +; GFX9-NEXT: s_add_i32 s7, s8, s7 ; GFX9-NEXT: v_mov_b32_e32 v0, s6 -; GFX9-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) @@ -3104,18 +3118,16 @@ ; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX9-NEXT: s_mov_b32 s4, s0 -; GFX9-NEXT: s_mov_b32 s5, s1 ; GFX9-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] -; GFX9-NEXT: v_readfirstlane_b32 s0, v0 -; GFX9-NEXT: v_readfirstlane_b32 s1, v1 +; GFX9-NEXT: v_readfirstlane_b32 s2, v0 +; GFX9-NEXT: v_readfirstlane_b32 s3, v1 +; GFX9-NEXT: v_mov_b32_e32 v2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-NEXT: v_mov_b32_e32 v2, s1 -; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s0, v3 -; GFX9-NEXT: s_mov_b32 s7, 0xf000 -; GFX9-NEXT: s_mov_b32 s6, -1 +; GFX9-NEXT: v_sub_co_u32_e32 v0, vcc, s2, v3 ; GFX9-NEXT: v_subb_co_u32_e32 v1, vcc, v2, v1, vcc -; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GFX9-NEXT: s_mov_b32 s3, 0xf000 +; GFX9-NEXT: s_mov_b32 s2, -1 +; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; ; GFX1064-LABEL: sub_i64_uniform: @@ -3135,9 +3147,9 @@ ; GFX1064-NEXT: s_mul_i32 s7, s3, s6 ; GFX1064-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1064-NEXT: s_mul_i32 s6, s2, s6 -; GFX1064-NEXT: s_add_i32 s8, s8, s7 +; GFX1064-NEXT: s_add_i32 s7, s8, s7 ; GFX1064-NEXT: v_mov_b32_e32 v0, s6 -; GFX1064-NEXT: v_mov_b32_e32 v1, s8 +; GFX1064-NEXT: v_mov_b32_e32 v1, s7 ; GFX1064-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1064-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1064-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] @@ -3148,14 +3160,14 @@ ; GFX1064-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_mad_u64_u32 v[3:4], s[4:5], s2, v2, 0 -; GFX1064-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1064-NEXT: v_mad_u64_u32 v[4:5], s[2:3], s3, v2, v[4:5] ; GFX1064-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1064-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1064-NEXT: v_sub_co_u32 v0, vcc, s2, v3 ; GFX1064-NEXT: v_mov_b32_e32 v1, v4 ; GFX1064-NEXT: s_mov_b32 s2, -1 -; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1064-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1064-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1064-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1064-NEXT: s_endpgm ; @@ -3174,9 +3186,9 @@ ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_mul_i32 s6, s3, s5 ; GFX1032-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1032-NEXT: s_mul_i32 s5, s2, s5 ; GFX1032-NEXT: s_add_i32 s7, s7, s6 -; GFX1032-NEXT: v_mov_b32_e32 v0, s5 +; GFX1032-NEXT: s_mul_i32 s6, s2, s5 +; GFX1032-NEXT: v_mov_b32_e32 v0, s6 ; GFX1032-NEXT: v_mov_b32_e32 v1, s7 ; GFX1032-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1032-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3188,14 +3200,14 @@ ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s4 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_mad_u64_u32 v[3:4], s2, s2, v2, 0 -; GFX1032-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1032-NEXT: v_mad_u64_u32 v[4:5], s2, s3, v2, v[4:5] ; GFX1032-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 +; GFX1032-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1032-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 ; GFX1032-NEXT: v_mov_b32_e32 v1, v4 ; GFX1032-NEXT: s_mov_b32 s2, -1 -; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo +; GFX1032-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1032-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1032-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX1032-NEXT: s_endpgm ; @@ -3217,9 +3229,9 @@ ; GFX1164-NEXT: s_mul_i32 s7, s3, s6 ; GFX1164-NEXT: s_mul_hi_u32 s8, s2, s6 ; GFX1164-NEXT: s_mul_i32 s6, s2, s6 -; GFX1164-NEXT: s_add_i32 s8, s8, s7 +; GFX1164-NEXT: s_add_i32 s7, s8, s7 ; GFX1164-NEXT: v_mov_b32_e32 v0, s6 -; GFX1164-NEXT: v_mov_b32_e32 v1, s8 +; GFX1164-NEXT: v_mov_b32_e32 v1, s7 ; GFX1164-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1164-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1164-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] @@ -3230,15 +3242,15 @@ ; GFX1164-NEXT: s_waitcnt lgkmcnt(0) ; GFX1164-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1164-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1164-NEXT: v_readfirstlane_b32 s4, v1 ; GFX1164-NEXT: s_waitcnt_depctr 0xfff ; GFX1164-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1164-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1164-NEXT: v_sub_co_u32 v0, vcc, s2, v3 -; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: s_mov_b32 s2, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1164-NEXT: v_mov_b32_e32 v1, v5 -; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s4, v1, vcc +; GFX1164-NEXT: v_sub_co_ci_u32_e32 v1, vcc, s3, v1, vcc +; GFX1164-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1164-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1164-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1164-NEXT: s_endpgm @@ -3255,14 +3267,14 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB12_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s5, s5 -; GFX1132-NEXT: v_mov_b32_e32 v3, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s6, s3, s5 ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 -; GFX1132-NEXT: s_mul_i32 s5, s2, s5 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 +; GFX1132-NEXT: s_mul_i32 s6, s2, s5 +; GFX1132-NEXT: v_dual_mov_b32 v3, 0 :: v_dual_mov_b32 v0, s6 +; GFX1132-NEXT: v_mov_b32_e32 v1, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] @@ -3273,15 +3285,15 @@ ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: v_mad_u64_u32 v[3:4], null, s2, v2, 0 ; GFX1132-NEXT: v_readfirstlane_b32 s2, v0 -; GFX1132-NEXT: v_readfirstlane_b32 s4, v1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1132-NEXT: v_mad_u64_u32 v[5:6], null, s3, v2, v[4:5] +; GFX1132-NEXT: v_readfirstlane_b32 s3, v1 ; GFX1132-NEXT: v_sub_co_u32 v0, vcc_lo, s2, v3 -; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: s_mov_b32 s2, -1 -; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1132-NEXT: v_mov_b32_e32 v1, v5 -; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s4, v1, vcc_lo +; GFX1132-NEXT: v_sub_co_ci_u32_e32 v1, vcc_lo, s3, v1, vcc_lo +; GFX1132-NEXT: s_mov_b32 s3, 0x31016000 ; GFX1132-NEXT: buffer_store_b64 v[0:1], off, s[0:3], 0 ; GFX1132-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX1132-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/ctpop16.ll b/llvm/test/CodeGen/AMDGPU/ctpop16.ll --- a/llvm/test/CodeGen/AMDGPU/ctpop16.ll +++ b/llvm/test/CodeGen/AMDGPU/ctpop16.ll @@ -1539,7 +1539,7 @@ ; VI-NEXT: s_mov_b64 s[2:3], 0 ; VI-NEXT: s_cbranch_execnz .LBB14_3 ; VI-NEXT: .LBB14_2: ; %if -; VI-NEXT: s_and_b32 s2, s4, 0xffff +; VI-NEXT: s_and_b32 s2, 0xffff, s4 ; VI-NEXT: s_bcnt1_i32_b32 s2, s2 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -235,10 +235,6 @@ ; GCN-O1-NEXT: Natural Loop Information ; GCN-O1-NEXT: Legacy Divergence Analysis ; GCN-O1-NEXT: AMDGPU IR late optimizations -; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-NEXT: Function Alias Analysis Results -; GCN-O1-NEXT: Code sinking -; GCN-O1-NEXT: Legacy Divergence Analysis ; GCN-O1-NEXT: Unify divergent function exit nodes ; GCN-O1-NEXT: Lazy Value Information Analysis ; GCN-O1-NEXT: Lower SwitchInst's to branches @@ -517,10 +513,6 @@ ; GCN-O1-OPTS-NEXT: Natural Loop Information ; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis ; GCN-O1-OPTS-NEXT: AMDGPU IR late optimizations -; GCN-O1-OPTS-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O1-OPTS-NEXT: Function Alias Analysis Results -; GCN-O1-OPTS-NEXT: Code sinking -; GCN-O1-OPTS-NEXT: Legacy Divergence Analysis ; GCN-O1-OPTS-NEXT: Unify divergent function exit nodes ; GCN-O1-OPTS-NEXT: Lazy Value Information Analysis ; GCN-O1-OPTS-NEXT: Lower SwitchInst's to branches @@ -807,10 +799,6 @@ ; GCN-O2-NEXT: Natural Loop Information ; GCN-O2-NEXT: Legacy Divergence Analysis ; GCN-O2-NEXT: AMDGPU IR late optimizations -; GCN-O2-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O2-NEXT: Function Alias Analysis Results -; GCN-O2-NEXT: Code sinking -; GCN-O2-NEXT: Legacy Divergence Analysis ; GCN-O2-NEXT: Unify divergent function exit nodes ; GCN-O2-NEXT: Lazy Value Information Analysis ; GCN-O2-NEXT: Lower SwitchInst's to branches @@ -1112,10 +1100,6 @@ ; GCN-O3-NEXT: Natural Loop Information ; GCN-O3-NEXT: Legacy Divergence Analysis ; GCN-O3-NEXT: AMDGPU IR late optimizations -; GCN-O3-NEXT: Basic Alias Analysis (stateless AA impl) -; GCN-O3-NEXT: Function Alias Analysis Results -; GCN-O3-NEXT: Code sinking -; GCN-O3-NEXT: Legacy Divergence Analysis ; GCN-O3-NEXT: Unify divergent function exit nodes ; GCN-O3-NEXT: Lazy Value Information Analysis ; GCN-O3-NEXT: Lower SwitchInst's to branches diff --git a/llvm/test/CodeGen/AMDGPU/mul_int24.ll b/llvm/test/CodeGen/AMDGPU/mul_int24.ll --- a/llvm/test/CodeGen/AMDGPU/mul_int24.ll +++ b/llvm/test/CodeGen/AMDGPU/mul_int24.ll @@ -725,13 +725,13 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_bfe_i32 s4, s4, 0x180000 -; VI-NEXT: s_bfe_i32 s5, s6, 0x180000 -; VI-NEXT: s_mul_i32 s4, s4, s5 -; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: s_bfe_i32 s2, s4, 0x180000 +; VI-NEXT: s_bfe_i32 s4, s6, 0x180000 +; VI-NEXT: s_mul_i32 s2, s2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll --- a/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll +++ b/llvm/test/CodeGen/AMDGPU/non-entry-alloca.ll @@ -24,12 +24,12 @@ ; MUBUF-NEXT: s_cmp_lg_u32 s8, 0 ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_cmp_lg_u32 s9, 0 +; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_cbranch_scc1 .LBB0_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: s_lshl_b32 s7, s10, 2 -; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0 ; MUBUF-NEXT: v_mov_b32_e32 v2, s6 ; MUBUF-NEXT: v_mov_b32_e32 v3, 1 @@ -60,14 +60,14 @@ ; FLATSCR-NEXT: s_cmp_lg_u32 s4, 0 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: s_cmp_lg_u32 s5, 0 +; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_cbranch_scc1 .LBB0_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 ; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s6, 2 -; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[1:2], s2 ; FLATSCR-NEXT: s_add_i32 s2, s2, s3 ; FLATSCR-NEXT: scratch_load_dword v2, off, s2 @@ -161,8 +161,8 @@ ; FLATSCR-NEXT: s_cbranch_scc1 .LBB1_2 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 ; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 -; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: s_and_b32 s2, s2, 0xfffff000 +; FLATSCR-NEXT: v_mov_b32_e32 v1, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 1 ; FLATSCR-NEXT: s_lshl_b32 s3, s3, 2 ; FLATSCR-NEXT: s_mov_b32 s32, s2 @@ -219,11 +219,12 @@ ; MUBUF-NEXT: s_and_saveexec_b64 s[4:5], vcc ; MUBUF-NEXT: s_cbranch_execz .LBB2_3 ; MUBUF-NEXT: ; %bb.1: ; %bb.0 +; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_and_b64 exec, exec, vcc ; MUBUF-NEXT: s_cbranch_execz .LBB2_3 ; MUBUF-NEXT: ; %bb.2: ; %bb.1 -; MUBUF-NEXT: s_add_i32 s6, s32, 0x1000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0 ; MUBUF-NEXT: v_mov_b32_e32 v3, s6 ; MUBUF-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -232,7 +233,6 @@ ; MUBUF-NEXT: v_lshl_add_u32 v2, v4, 2, s6 ; MUBUF-NEXT: buffer_load_dword v2, v2, s[0:3], 0 offen ; MUBUF-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; MUBUF-NEXT: s_mov_b32 s32, s6 ; MUBUF-NEXT: s_waitcnt vmcnt(0) ; MUBUF-NEXT: v_add_u32_e32 v2, v2, v3 ; MUBUF-NEXT: global_store_dword v[0:1], v2, off @@ -255,18 +255,18 @@ ; FLATSCR-NEXT: s_and_saveexec_b64 s[0:1], vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 ; FLATSCR-NEXT: ; %bb.1: ; %bb.0 +; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3 +; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_and_b64 exec, exec, vcc ; FLATSCR-NEXT: s_cbranch_execz .LBB2_3 ; FLATSCR-NEXT: ; %bb.2: ; %bb.1 -; FLATSCR-NEXT: s_add_i32 s2, s32, 0x1000 ; FLATSCR-NEXT: v_mov_b32_e32 v2, 0 ; FLATSCR-NEXT: v_mov_b32_e32 v3, 1 ; FLATSCR-NEXT: scratch_store_dwordx2 off, v[2:3], s2 ; FLATSCR-NEXT: v_lshl_add_u32 v2, v4, 2, s2 ; FLATSCR-NEXT: scratch_load_dword v2, v2, off ; FLATSCR-NEXT: v_and_b32_e32 v3, 0x3ff, v31 -; FLATSCR-NEXT: s_mov_b32 s32, s2 ; FLATSCR-NEXT: s_waitcnt vmcnt(0) ; FLATSCR-NEXT: v_add_u32_e32 v2, v2, v3 ; FLATSCR-NEXT: global_store_dword v[0:1], v2, off @@ -398,3 +398,6 @@ attributes #0 = { nounwind readnone speculatable } attributes #1 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; ASSUME1024: {{.*}} +; DEFAULTSIZE: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -224,15 +224,15 @@ ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] -; GCN-IR-NEXT: .LBB0_6: ; %udiv-end +; GCN-IR-NEXT: .LBB0_6: ; %Flow7 ; GCN-IR-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] ; GCN-IR-NEXT: v_xor_b32_e32 v0, s0, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s1, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %result = sdiv i64 %x, %y @@ -372,98 +372,95 @@ ; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v0 ; GCN-IR-NEXT: v_ashrrev_i32_e32 v5, 31, v3 ; GCN-IR-NEXT: v_xor_b32_e32 v1, v4, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v11, vcc, v0, v4 -; GCN-IR-NEXT: v_subb_u32_e32 v12, vcc, v1, v4, vcc -; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v2 -; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v3 -; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v1, v5 -; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v0, v5, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[11:12] -; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v2 -; GCN-IR-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v3 -; GCN-IR-NEXT: v_min_u32_e32 v0, v0, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v7, v11 -; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 32, v7 -; GCN-IR-NEXT: v_ffbh_u32_e32 v8, v12 -; GCN-IR-NEXT: v_min_u32_e32 v13, v7, v8 -; GCN-IR-NEXT: v_sub_i32_e32 v7, vcc, v0, v13 -; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], 0, 0, vcc -; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[7:8] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[4:5], 63, v[7:8] -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], vcc -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[6:7], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v6, v4 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v5 -; GCN-IR-NEXT: v_cndmask_b32_e64 v10, v12, 0, s[6:7] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[8:9], s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v9, v11, 0, s[6:7] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, v0, v4 +; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, v1, v4, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v3 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v5 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v5, vcc +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 +; GCN-IR-NEXT: v_min_u32_e32 v8, v2, v3 +; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6 +; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7 +; GCN-IR-NEXT: v_min_u32_e32 v9, v2, v3 +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[6:7], v8, v9 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] +; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] +; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 +; GCN-IR-NEXT: v_xor_b32_e32 v12, v5, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5] +; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc ; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, 1, v7 -; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, 0, v8, vcc -; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], 63, v7 -; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[11:12], v7 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v2 +; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v3, vcc +; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 63, v2 +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[6:7], v2 +; GCN-IR-NEXT: v_mov_b32_e32 v4, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-IR-NEXT: s_xor_b64 s[8:9], exec, s[4:5] ; GCN-IR-NEXT: s_cbranch_execz .LBB1_5 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: v_add_i32_e32 v18, vcc, -1, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v19, vcc, -1, v3, vcc -; GCN-IR-NEXT: v_not_b32_e32 v0, v0 -; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[11:12], v14 -; GCN-IR-NEXT: v_not_b32_e32 v9, 0 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v0, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_add_i32_e32 v13, vcc, -1, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v14, vcc, -1, v1, vcc +; GCN-IR-NEXT: v_not_b32_e32 v5, v8 +; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[6:7], v10 +; GCN-IR-NEXT: v_not_b32_e32 v4, 0 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, v5, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v4, vcc ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: .LBB1_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-IR-NEXT: v_lshl_b64 v[14:15], v[14:15], 1 -; GCN-IR-NEXT: v_lshrrev_b32_e32 v0, 31, v8 -; GCN-IR-NEXT: v_or_b32_e32 v0, v14, v0 -; GCN-IR-NEXT: v_sub_i32_e32 v9, vcc, v18, v0 -; GCN-IR-NEXT: v_lshl_b64 v[7:8], v[7:8], 1 -; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, v19, v15, vcc -; GCN-IR-NEXT: v_ashrrev_i32_e32 v13, 31, v9 -; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, 1, v11 -; GCN-IR-NEXT: v_or_b32_e32 v7, v16, v7 -; GCN-IR-NEXT: v_and_b32_e32 v9, 1, v13 -; GCN-IR-NEXT: v_and_b32_e32 v16, v13, v3 -; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2 -; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, 0, v12, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[11:12] -; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v0, v13 -; GCN-IR-NEXT: v_or_b32_e32 v8, v17, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v16, s[4:5] -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 +; GCN-IR-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 +; GCN-IR-NEXT: v_lshrrev_b32_e32 v4, 31, v3 +; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v4 +; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v13, v10 +; GCN-IR-NEXT: v_subb_u32_e32 v4, vcc, v14, v11, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v8, v2 +; GCN-IR-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v6 +; GCN-IR-NEXT: v_or_b32_e32 v3, v9, v3 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v8 +; GCN-IR-NEXT: v_and_b32_e32 v9, v8, v1 +; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v0 +; GCN-IR-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v9, s[4:5] +; GCN-IR-NEXT: v_mov_b32_e32 v9, v5 ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz .LBB1_3 ; GCN-IR-NEXT: ; %bb.4: ; %Flow ; GCN-IR-NEXT: s_or_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: .LBB1_5: ; %Flow3 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-IR-NEXT: v_lshl_b64 v[2:3], v[7:8], 1 -; GCN-IR-NEXT: v_or_b32_e32 v10, v10, v3 -; GCN-IR-NEXT: v_or_b32_e32 v9, v9, v2 +; GCN-IR-NEXT: v_lshl_b64 v[0:1], v[2:3], 1 +; GCN-IR-NEXT: v_or_b32_e32 v5, v5, v1 +; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v0 ; GCN-IR-NEXT: .LBB1_6: ; %Flow4 ; GCN-IR-NEXT: s_or_b64 exec, exec, s[6:7] -; GCN-IR-NEXT: v_xor_b32_e32 v0, v5, v4 -; GCN-IR-NEXT: v_xor_b32_e32 v1, v1, v6 -; GCN-IR-NEXT: v_xor_b32_e32 v3, v9, v0 -; GCN-IR-NEXT: v_xor_b32_e32 v2, v10, v1 -; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v3, v0 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v0, v4, v12 +; GCN-IR-NEXT: v_xor_b32_e32 v1, v5, v12 +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; GCN-IR-NEXT: s_setpc_b64 s[30:31] %result = sdiv i64 %x, %y ret i64 %result @@ -1068,7 +1065,7 @@ ; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v0, 0, s[16:17] ; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[16:17] -; GCN-IR-NEXT: .LBB9_6: ; %udiv-end +; GCN-IR-NEXT: .LBB9_6: ; %Flow4 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-IR-NEXT: v_xor_b32_e32 v0, s2, v0 ; GCN-IR-NEXT: v_xor_b32_e32 v1, s3, v1 @@ -1278,9 +1275,9 @@ ; GCN-IR-NEXT: v_xor_b32_e32 v1, s5, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s5 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s4, v0 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = sdiv i64 24, %x diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-control-flow.ll @@ -13,11 +13,12 @@ ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0xb ; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 -; SI-NEXT: s_load_dword s0, s[0:1], 0xf ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_lg_u32 s8, 0 ; SI-NEXT: s_cbranch_scc0 .LBB0_4 ; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_load_dword s0, s[0:1], 0xf +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_i32 s2, s11, s0 ; SI-NEXT: s_cbranch_execnz .LBB0_3 ; SI-NEXT: .LBB0_2: ; %if @@ -155,7 +156,6 @@ ; SI-LABEL: sgpr_if_else_valu_cmp_phi_br: ; SI: ; %bb.0: ; %entry ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s2, 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: ; implicit-def: $sgpr8_sgpr9 @@ -163,6 +163,7 @@ ; SI-NEXT: s_xor_b64 s[10:11], exec, s[10:11] ; SI-NEXT: s_cbranch_execz .LBB3_2 ; SI-NEXT: ; %bb.1: ; %else +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -173,13 +174,13 @@ ; SI-NEXT: s_and_b64 s[8:9], vcc, exec ; SI-NEXT: ; implicit-def: $vgpr0 ; SI-NEXT: .LBB3_2: ; %Flow -; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_or_saveexec_b64 s[0:1], s[10:11] ; SI-NEXT: s_xor_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execz .LBB3_4 ; SI-NEXT: ; %bb.3: ; %if ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[12:13], s[6:7] ; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; SI-NEXT: v_mov_b32_e32 v1, 0 @@ -191,9 +192,10 @@ ; SI-NEXT: s_or_b64 s[8:9], s[2:3], s[6:7] ; SI-NEXT: .LBB3_4: ; %endif ; SI-NEXT: s_or_b64 exec, exec, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm entry: diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -67,50 +67,48 @@ define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) { ; SI-LABEL: phi_cond_outside_loop: ; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s2, s[0:1], 0x9 ; SI-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 +; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_cmp_eq_u32 s2, 0 +; SI-NEXT: s_cselect_b64 s[6:7], -1, 0 ; SI-NEXT: s_mov_b64 s[2:3], 0 -; SI-NEXT: s_mov_b64 s[4:5], 0 -; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc -; SI-NEXT: s_cbranch_execz .LBB1_2 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SI-NEXT: ; %bb.1: ; %else -; SI-NEXT: s_load_dword s0, s[0:1], 0x9 -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_cmp_eq_u32 s0, 0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], s[0:1], exec -; SI-NEXT: .LBB1_2: ; %endif -; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_and_b64 s[2:3], s[6:7], exec +; SI-NEXT: ; %bb.2: ; %endif +; SI-NEXT: s_or_b64 exec, exec, s[4:5] ; SI-NEXT: .LBB1_3: ; %loop ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: s_and_b64 s[0:1], exec, s[4:5] -; SI-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; SI-NEXT: s_andn2_b64 exec, exec, s[2:3] +; SI-NEXT: s_and_b64 s[4:5], exec, s[2:3] +; SI-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; SI-NEXT: s_andn2_b64 exec, exec, s[0:1] ; SI-NEXT: s_cbranch_execnz .LBB1_3 ; SI-NEXT: ; %bb.4: ; %exit ; SI-NEXT: s_endpgm ; ; FLAT-LABEL: phi_cond_outside_loop: ; FLAT: ; %bb.0: ; %entry +; FLAT-NEXT: s_load_dword s2, s[0:1], 0x24 ; FLAT-NEXT: v_mbcnt_lo_u32_b32 v0, -1, 0 +; FLAT-NEXT: s_mov_b64 s[0:1], 0 ; FLAT-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; FLAT-NEXT: s_waitcnt lgkmcnt(0) +; FLAT-NEXT: s_cmp_eq_u32 s2, 0 +; FLAT-NEXT: s_cselect_b64 s[6:7], -1, 0 ; FLAT-NEXT: s_mov_b64 s[2:3], 0 -; FLAT-NEXT: s_mov_b64 s[4:5], 0 -; FLAT-NEXT: s_and_saveexec_b64 s[6:7], vcc -; FLAT-NEXT: s_cbranch_execz .LBB1_2 +; FLAT-NEXT: s_and_saveexec_b64 s[4:5], vcc ; FLAT-NEXT: ; %bb.1: ; %else -; FLAT-NEXT: s_load_dword s0, s[0:1], 0x24 -; FLAT-NEXT: s_waitcnt lgkmcnt(0) -; FLAT-NEXT: s_cmp_eq_u32 s0, 0 -; FLAT-NEXT: s_cselect_b64 s[0:1], -1, 0 -; FLAT-NEXT: s_and_b64 s[4:5], s[0:1], exec -; FLAT-NEXT: .LBB1_2: ; %endif -; FLAT-NEXT: s_or_b64 exec, exec, s[6:7] +; FLAT-NEXT: s_and_b64 s[2:3], s[6:7], exec +; FLAT-NEXT: ; %bb.2: ; %endif +; FLAT-NEXT: s_or_b64 exec, exec, s[4:5] ; FLAT-NEXT: .LBB1_3: ; %loop ; FLAT-NEXT: ; =>This Inner Loop Header: Depth=1 -; FLAT-NEXT: s_and_b64 s[0:1], exec, s[4:5] -; FLAT-NEXT: s_or_b64 s[2:3], s[0:1], s[2:3] -; FLAT-NEXT: s_andn2_b64 exec, exec, s[2:3] +; FLAT-NEXT: s_and_b64 s[4:5], exec, s[2:3] +; FLAT-NEXT: s_or_b64 s[0:1], s[4:5], s[0:1] +; FLAT-NEXT: s_andn2_b64 exec, exec, s[0:1] ; FLAT-NEXT: s_cbranch_execnz .LBB1_3 ; FLAT-NEXT: ; %bb.4: ; %exit ; FLAT-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll --- a/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll +++ b/llvm/test/CodeGen/AMDGPU/skip-if-dead.ll @@ -1401,17 +1401,18 @@ ; SI-NEXT: s_wqm_b64 exec, exec ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 -; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc -; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: v_cmp_gt_f32_e64 s[4:5], 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SI-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; SI-NEXT: s_cbranch_execz .LBB13_3 ; SI-NEXT: ; %bb.1: ; %bb3 -; SI-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; SI-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; SI-NEXT: s_cbranch_scc0 .LBB13_6 ; SI-NEXT: ; %bb.2: ; %bb3 -; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: s_and_b64 exec, exec, s[4:5] ; SI-NEXT: .LBB13_3: ; %bb4 -; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_or_b64 exec, exec, s[6:7] ; SI-NEXT: s_mov_b32 s1, s0 ; SI-NEXT: s_mov_b32 s2, s0 ; SI-NEXT: s_mov_b32 s3, s0 @@ -1442,18 +1443,19 @@ ; GFX10-WAVE64-NEXT: s_mov_b64 s[2:3], exec ; GFX10-WAVE64-NEXT: s_wqm_b64 exec, exec ; GFX10-WAVE64-NEXT: v_cmp_nle_f32_e32 vcc, 0, v1 +; GFX10-WAVE64-NEXT: v_cmp_gt_f32_e64 s[4:5], 0, v0 ; GFX10-WAVE64-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[4:5], vcc -; GFX10-WAVE64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX10-WAVE64-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX10-WAVE64-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX10-WAVE64-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE64-NEXT: ; %bb.1: ; %bb3 -; GFX10-WAVE64-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], vcc +; GFX10-WAVE64-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GFX10-WAVE64-NEXT: s_andn2_b64 s[2:3], s[2:3], s[8:9] ; GFX10-WAVE64-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE64-NEXT: ; %bb.2: ; %bb3 -; GFX10-WAVE64-NEXT: s_andn2_b64 exec, exec, vcc +; GFX10-WAVE64-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX10-WAVE64-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX10-WAVE64-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX10-WAVE64-NEXT: s_mov_b32 s1, s0 ; GFX10-WAVE64-NEXT: s_mov_b32 s2, s0 ; GFX10-WAVE64-NEXT: s_mov_b32 s3, s0 @@ -1482,18 +1484,19 @@ ; GFX10-WAVE32-NEXT: s_mov_b32 s1, exec_lo ; GFX10-WAVE32-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX10-WAVE32-NEXT: v_cmp_nle_f32_e32 vcc_lo, 0, v1 +; GFX10-WAVE32-NEXT: v_cmp_gt_f32_e64 s2, 0, v0 ; GFX10-WAVE32-NEXT: s_mov_b32 s0, 0 -; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s2, vcc_lo -; GFX10-WAVE32-NEXT: s_xor_b32 s2, exec_lo, s2 +; GFX10-WAVE32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-WAVE32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10-WAVE32-NEXT: s_cbranch_execz .LBB13_3 ; GFX10-WAVE32-NEXT: ; %bb.1: ; %bb3 -; GFX10-WAVE32-NEXT: v_cmp_ngt_f32_e32 vcc_lo, 0, v0 -; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, vcc_lo +; GFX10-WAVE32-NEXT: s_xor_b32 s4, s2, exec_lo +; GFX10-WAVE32-NEXT: s_andn2_b32 s1, s1, s4 ; GFX10-WAVE32-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX10-WAVE32-NEXT: ; %bb.2: ; %bb3 -; GFX10-WAVE32-NEXT: s_andn2_b32 exec_lo, exec_lo, vcc_lo +; GFX10-WAVE32-NEXT: s_and_b32 exec_lo, exec_lo, s2 ; GFX10-WAVE32-NEXT: .LBB13_3: ; %bb4 -; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX10-WAVE32-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX10-WAVE32-NEXT: s_mov_b32 s1, s0 ; GFX10-WAVE32-NEXT: s_mov_b32 s2, s0 ; GFX10-WAVE32-NEXT: s_mov_b32 s3, s0 @@ -1521,20 +1524,22 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[2:3], exec ; GFX11-NEXT: s_wqm_b64 exec, exec +; GFX11-NEXT: v_cmp_gt_f32_e64 s[4:5], 0, v0 ; GFX11-NEXT: s_mov_b32 s0, 0 -; GFX11-NEXT: s_mov_b64 s[4:5], exec +; GFX11-NEXT: s_mov_b64 s[6:7], exec ; GFX11-NEXT: v_cmpx_nle_f32_e32 0, v1 -; GFX11-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; GFX11-NEXT: s_xor_b64 s[6:7], exec, s[6:7] ; GFX11-NEXT: s_cbranch_execz .LBB13_3 ; GFX11-NEXT: ; %bb.1: ; %bb3 -; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc, 0, v0 -; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], vcc +; GFX11-NEXT: s_xor_b64 s[8:9], s[4:5], exec +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[8:9] ; GFX11-NEXT: s_cbranch_scc0 .LBB13_6 ; GFX11-NEXT: ; %bb.2: ; %bb3 -; GFX11-NEXT: s_and_not1_b64 exec, exec, vcc +; GFX11-NEXT: s_and_b64 exec, exec, s[4:5] ; GFX11-NEXT: .LBB13_3: ; %bb4 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX11-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -201,16 +201,14 @@ ; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 -; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s8, s0 -; GCN-IR-NEXT: s_mov_b32 s9, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out @@ -997,22 +995,22 @@ ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[6:7], 31 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[0:1], 31 ; GCN-IR-NEXT: s_ashr_i32 s0, s7, 31 -; GCN-IR-NEXT: s_ashr_i32 s6, s1, 31 +; GCN-IR-NEXT: s_ashr_i32 s10, s1, 31 ; GCN-IR-NEXT: s_mov_b32 s1, s0 -; GCN-IR-NEXT: s_mov_b32 s7, s6 +; GCN-IR-NEXT: s_mov_b32 s11, s10 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[0:1] -; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[6:7] +; GCN-IR-NEXT: s_xor_b64 s[6:7], s[8:9], s[10:11] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s0 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s0 -; GCN-IR-NEXT: s_sub_u32 s8, s8, s6 -; GCN-IR-NEXT: s_subb_u32 s9, s9, s6 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 +; GCN-IR-NEXT: s_sub_u32 s6, s6, s10 +; GCN-IR-NEXT: s_subb_u32 s7, s7, s10 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[12:13] -; GCN-IR-NEXT: s_flbit_i32_b32 s10, s8 +; GCN-IR-NEXT: s_flbit_i32_b32 s10, s6 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s11, s9 +; GCN-IR-NEXT: s_flbit_i32_b32 s11, s7 ; GCN-IR-NEXT: s_min_u32 s12, s10, s11 ; GCN-IR-NEXT: s_flbit_i32_b32 s10, s2 ; GCN-IR-NEXT: s_add_i32 s10, s10, 32 @@ -1038,39 +1036,39 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s18, s8, -1 -; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 -; GCN-IR-NEXT: s_not_b64 s[6:7], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s6, s16 -; GCN-IR-NEXT: s_addc_u32 s13, s7, 0 +; GCN-IR-NEXT: s_add_u32 s18, s6, -1 +; GCN-IR-NEXT: s_addc_u32 s19, s7, -1 +; GCN-IR-NEXT: s_not_b64 s[8:9], s[12:13] +; GCN-IR-NEXT: s_add_u32 s12, s8, s16 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s6, s11, 31 +; GCN-IR-NEXT: s_lshr_b32 s8, s11, 31 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[8:9] ; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s6, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s6, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s6, 31 +; GCN-IR-NEXT: s_sub_u32 s8, s18, s14 +; GCN-IR-NEXT: s_subb_u32 s8, s19, s15 +; GCN-IR-NEXT: s_ashr_i32 s16, s8, 31 ; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s6, s16, 1 -; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9] +; GCN-IR-NEXT: s_and_b32 s8, s16, 1 +; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[6:7] +; GCN-IR-NEXT: s_mov_b64 s[16:17], s[8:9] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[6:7], s[10:11] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_branch .LBB8_6 ; GCN-IR-NEXT: .LBB8_5: ; GCN-IR-NEXT: v_mov_b32_e32 v0, s3 @@ -1078,15 +1076,13 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[14:15] ; GCN-IR-NEXT: .LBB8_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_hi_u32 v2, s8, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v1, s8, v1 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: v_readfirstlane_b32 s10, v2 -; GCN-IR-NEXT: v_mul_lo_u32 v2, s9, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s8, v0 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s10, v1 -; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s6, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s6, v1 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s7, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s6, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s7, v2 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s7, v1 +; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v3 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc @@ -1095,6 +1091,8 @@ ; GCN-IR-NEXT: v_mov_b32_e32 v2, s1 ; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s0, v0 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v2, vcc +; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 31 @@ -1376,20 +1374,20 @@ ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_ashr_i32 s6, s3, 31 -; GCN-IR-NEXT: s_mov_b32 s7, s6 -; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s4, s2, s6 -; GCN-IR-NEXT: s_subb_u32 s5, s3, s6 -; GCN-IR-NEXT: s_flbit_i32_b32 s2, s4 -; GCN-IR-NEXT: s_add_i32 s2, s2, 32 -; GCN-IR-NEXT: s_flbit_i32_b32 s3, s5 -; GCN-IR-NEXT: s_min_u32 s8, s2, s3 +; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 +; GCN-IR-NEXT: s_mov_b32 s5, s4 +; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GCN-IR-NEXT: s_sub_u32 s2, s2, s4 +; GCN-IR-NEXT: s_subb_u32 s3, s3, s4 +; GCN-IR-NEXT: s_flbit_i32_b32 s4, s2 +; GCN-IR-NEXT: s_add_i32 s4, s4, 32 +; GCN-IR-NEXT: s_flbit_i32_b32 s5, s3 +; GCN-IR-NEXT: s_min_u32 s8, s4, s5 ; GCN-IR-NEXT: s_add_u32 s6, s8, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[4:5], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 ; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[6:7], 63 -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[12:13] ; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[12:13], s[6:7], 63 ; GCN-IR-NEXT: s_xor_b64 s[14:15], s[10:11], -1 @@ -1406,53 +1404,53 @@ ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 -; GCN-IR-NEXT: s_add_u32 s14, s4, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s5, -1 +; GCN-IR-NEXT: s_add_u32 s14, s2, -1 +; GCN-IR-NEXT: s_addc_u32 s15, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s8, 58, s8 ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 ; GCN-IR-NEXT: s_mov_b64 s[12:13], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s5, 0 ; GCN-IR-NEXT: .LBB10_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s7, 31 +; GCN-IR-NEXT: s_lshr_b32 s4, s7, 31 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[2:3] +; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[4:5] ; GCN-IR-NEXT: s_or_b64 s[6:7], s[12:13], s[6:7] -; GCN-IR-NEXT: s_sub_u32 s2, s14, s10 -; GCN-IR-NEXT: s_subb_u32 s2, s15, s11 -; GCN-IR-NEXT: s_ashr_i32 s12, s2, 31 +; GCN-IR-NEXT: s_sub_u32 s4, s14, s10 +; GCN-IR-NEXT: s_subb_u32 s4, s15, s11 +; GCN-IR-NEXT: s_ashr_i32 s12, s4, 31 ; GCN-IR-NEXT: s_mov_b32 s13, s12 -; GCN-IR-NEXT: s_and_b32 s2, s12, 1 -; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_and_b32 s4, s12, 1 +; GCN-IR-NEXT: s_and_b64 s[12:13], s[12:13], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s10, s10, s12 ; GCN-IR-NEXT: s_subb_u32 s11, s11, s13 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[8:9], 0 -; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[12:13], s[4:5] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_3 ; GCN-IR-NEXT: .LBB10_4: ; %Flow5 ; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 ; GCN-IR-NEXT: s_branch .LBB10_6 ; GCN-IR-NEXT: .LBB10_5: ; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 ; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, s[10:11] ; GCN-IR-NEXT: .LBB10_6: ; %udiv-end -; GCN-IR-NEXT: v_mul_lo_u32 v1, s4, v1 -; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 -; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v1, s2, v1 +; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 +; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc +; GCN-IR-NEXT: s_mov_b32 s2, -1 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = srem i64 24, %x diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -201,16 +201,14 @@ ; GCN-IR-NEXT: v_mul_hi_u32 v2, s4, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s5, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s4, v0 -; GCN-IR-NEXT: s_mov_b32 s11, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s10, -1 -; GCN-IR-NEXT: s_mov_b32 s8, s0 -; GCN-IR-NEXT: s_mov_b32 s9, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, %y store i64 %result, i64 addrspace(1)* %out @@ -883,15 +881,13 @@ ; GCN-IR-NEXT: v_mul_hi_u32 v2, s2, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v3, s3, v0 ; GCN-IR-NEXT: v_mul_lo_u32 v0, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v3, v1 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 24, v0 -; GCN-IR-NEXT: s_mov_b32 s6, -1 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, 0, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 24, %x store i64 %result, i64 addrspace(1)* %out @@ -1069,15 +1065,13 @@ ; GCN-IR-NEXT: v_mul_lo_u32 v1, v1, 24 ; GCN-IR-NEXT: v_mul_hi_u32 v2, v0, 24 ; GCN-IR-NEXT: v_mul_lo_u32 v0, v0, 24 -; GCN-IR-NEXT: s_mov_b32 s7, 0xf000 -; GCN-IR-NEXT: s_mov_b32 s6, -1 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, v1, v2 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 -; GCN-IR-NEXT: s_mov_b32 s4, s0 -; GCN-IR-NEXT: s_mov_b32 s5, s1 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v1, vcc -; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 +; GCN-IR-NEXT: s_mov_b32 s2, -1 +; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = urem i64 %x, 24 store i64 %result, i64 addrspace(1)* %out diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll @@ -475,9 +475,9 @@ ; SI-NEXT: successors: %bb.7(0x80000000) ; SI-NEXT: {{ $}} ; SI-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[COPY]](p4), 36, 0 :: (dereferenceable invariant load (s64) from %ir.src1.kernarg.offset.cast, align 4, addrspace 4) - ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %50, 0, implicit $exec - ; SI-NEXT: %43:vgpr_32, dead %45:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec - ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %43, %subreg.sub1 + ; SI-NEXT: [[V_ADD_CO_U32_e64_:%[0-9]+]]:vgpr_32, [[V_ADD_CO_U32_e64_1:%[0-9]+]]:sreg_32_xm0_xexec = V_ADD_CO_U32_e64 [[S_LOAD_DWORDX2_IMM]].sub0, killed %51, 0, implicit $exec + ; SI-NEXT: %44:vgpr_32, dead %46:sreg_32_xm0_xexec = V_ADDC_U32_e64 0, killed [[S_LOAD_DWORDX2_IMM]].sub1, killed [[V_ADD_CO_U32_e64_1]], 0, implicit $exec + ; SI-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_ADD_CO_U32_e64_]], %subreg.sub0, killed %44, %subreg.sub1 ; SI-NEXT: [[GLOBAL_LOAD_UBYTE:%[0-9]+]]:vgpr_32 = GLOBAL_LOAD_UBYTE killed [[REG_SEQUENCE]], 0, 0, implicit $exec :: (load (s8) from %ir.i10, addrspace 1) ; SI-NEXT: [[V_MOV_B:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B]], killed [[GLOBAL_LOAD_UBYTE]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) @@ -503,14 +503,14 @@ ; SI-NEXT: bb.5.Flow: ; SI-NEXT: successors: %bb.1(0x40000000), %bb.7(0x40000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %51:vgpr_32, %bb.6 + ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI [[COPY1]](s32), %bb.0, undef %52:vgpr_32, %bb.6 ; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.7, implicit-def dead $exec, implicit-def dead $scc, implicit $exec ; SI-NEXT: S_BRANCH %bb.1 ; SI-NEXT: {{ $}} ; SI-NEXT: bb.6.sw.bb18: ; SI-NEXT: successors: %bb.5(0x80000000) ; SI-NEXT: {{ $}} - ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %37:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 + ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI undef %38:vgpr_32, %bb.3, [[GLOBAL_LOAD_UBYTE1]], %bb.4 ; SI-NEXT: [[V_MOV_B2:%[0-9]+]]:vreg_64 = V_MOV_B64_PSEUDO 0, implicit $exec ; SI-NEXT: GLOBAL_STORE_BYTE killed [[V_MOV_B2]], killed [[PHI1]], 0, 0, implicit $exec :: (store (s8) into `i8 addrspace(1)* null`, addrspace 1) ; SI-NEXT: S_BRANCH %bb.5