diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -690,14 +690,12 @@ // CHECK-LABEL: @test_mbcnt_lo( // CHECK: call i32 @llvm.amdgcn.mbcnt.lo(i32 %src0, i32 %src1) -// CHECK: declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #[[$MBCNT_ATTRS:[0-9]+]] kernel void test_mbcnt_lo(global uint* out, uint src0, uint src1) { *out = __builtin_amdgcn_mbcnt_lo(src0, src1); } // CHECK-LABEL: @test_mbcnt_hi( // CHECK: call i32 @llvm.amdgcn.mbcnt.hi(i32 %src0, i32 %src1) -// CHECK: declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #[[$MBCNT_ATTRS]] kernel void test_mbcnt_hi(global uint* out, uint src0, uint src1) { *out = __builtin_amdgcn_mbcnt_hi(src0, src1); } @@ -834,7 +832,6 @@ // CHECK-DAG: [[$WS_RANGE]] = !{i16 1, i16 1025} // CHECK-DAG: attributes #[[$NOUNWIND_READONLY]] = { mustprogress nocallback nofree nosync nounwind willreturn memory(read) } // CHECK-DAG: attributes #[[$READ_EXEC_ATTRS]] = { convergent } -// CHECK-DAG: attributes #[[$MBCNT_ATTRS]] = {{.* convergent .*}} // CHECK-DAG: ![[$EXEC]] = !{!"exec"} // CHECK-DAG: ![[$EXEC_LO]] = !{!"exec_lo"} // CHECK-DAG: ![[$EXEC_HI]] = !{!"exec_hi"} diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -1833,12 +1833,12 @@ def int_amdgcn_mbcnt_lo : ClangBuiltin<"__builtin_amdgcn_mbcnt_lo">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent]>; + [IntrNoMem]>; def int_amdgcn_mbcnt_hi : ClangBuiltin<"__builtin_amdgcn_mbcnt_hi">, DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], - [IntrNoMem, IntrConvergent]>; + [IntrNoMem]>; // llvm.amdgcn.ds.swizzle src offset def int_amdgcn_ds_swizzle : diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_buffer.ll @@ -449,10 +449,8 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -469,7 +467,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -493,10 +493,8 @@ ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -513,7 +511,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -536,10 +536,8 @@ ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -555,7 +553,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -579,7 +579,6 @@ ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -594,7 +593,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -618,11 +618,8 @@ ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -640,9 +637,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: @@ -665,7 +666,6 @@ ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -681,9 +681,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: @@ -726,10 +728,8 @@ ; ; GFX8-LABEL: struct_add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -746,7 +746,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -772,10 +774,8 @@ ; ; GFX9-LABEL: struct_add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -792,7 +792,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -817,10 +819,8 @@ ; ; GFX10W64-LABEL: struct_add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -836,7 +836,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -863,7 +865,6 @@ ; ; GFX10W32-LABEL: struct_add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -878,7 +879,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -905,11 +907,8 @@ ; ; GFX11W64-LABEL: struct_add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB3_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -927,9 +926,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W64-NEXT: ; %bb.3: @@ -955,7 +958,6 @@ ; ; GFX11W32-LABEL: struct_add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -971,9 +973,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB3_4 ; GFX11W32-NEXT: ; %bb.3: @@ -1518,10 +1522,8 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1538,7 +1540,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1562,10 +1566,8 @@ ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1582,7 +1584,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1605,10 +1609,8 @@ ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1624,7 +1626,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1648,7 +1652,6 @@ ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -1663,7 +1666,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -1687,11 +1691,8 @@ ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1709,9 +1710,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: @@ -1734,7 +1739,6 @@ ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -1750,9 +1754,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_global_pointer.ll @@ -516,10 +516,8 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -537,7 +535,9 @@ ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -565,10 +565,8 @@ ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -586,7 +584,9 @@ ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -614,10 +614,8 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -634,7 +632,9 @@ ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -665,7 +665,6 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -681,7 +680,8 @@ ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 @@ -712,11 +712,8 @@ ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -735,9 +732,13 @@ ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: @@ -767,7 +768,6 @@ ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -784,9 +784,11 @@ ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: @@ -2016,10 +2018,8 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s6, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB8_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2037,7 +2037,9 @@ ; GFX8-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2065,10 +2067,8 @@ ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s6, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB8_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2086,7 +2086,9 @@ ; GFX9-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2114,10 +2116,8 @@ ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s6, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2134,7 +2134,9 @@ ; GFX1064-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1064-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX1064-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -2165,7 +2167,6 @@ ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s2, exec_lo ; GFX1032-NEXT: s_mov_b32 s4, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -2181,7 +2182,8 @@ ; GFX1032-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1032-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1032-NEXT: s_xor_b32 s5, exec_lo, s5 @@ -2212,11 +2214,8 @@ ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s6, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB8_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2235,9 +2234,13 @@ ; GFX1164-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1164-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1164-NEXT: s_mov_b64 s[4:5], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX1164-NEXT: s_cbranch_execz .LBB8_4 ; GFX1164-NEXT: ; %bb.3: @@ -2267,7 +2270,6 @@ ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s2, exec_lo ; GFX1132-NEXT: s_mov_b32 s4, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -2284,9 +2286,11 @@ ; GFX1132-NEXT: s_cbranch_scc1 .LBB8_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd ; GFX1132-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 -; GFX1132-NEXT: s_mov_b32 s5, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s5, vcc_lo ; GFX1132-NEXT: s_xor_b32 s5, exec_lo, s5 ; GFX1132-NEXT: s_cbranch_execz .LBB8_4 ; GFX1132-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -491,10 +491,8 @@ ; ; GFX8-LABEL: add_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -511,7 +509,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -536,10 +536,8 @@ ; ; GFX9-LABEL: add_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -556,7 +554,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -580,10 +580,8 @@ ; ; GFX1064-LABEL: add_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -599,7 +597,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -626,7 +626,6 @@ ; ; GFX1032-LABEL: add_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -641,7 +640,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -668,11 +668,8 @@ ; ; GFX1164-LABEL: add_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB2_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -690,9 +687,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB2_4 ; GFX1164-NEXT: ; %bb.3: @@ -718,7 +719,6 @@ ; ; GFX1132-LABEL: add_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -734,9 +734,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB2_4 ; GFX1132-NEXT: ; %bb.3: @@ -778,10 +780,8 @@ ; ; GFX8-LABEL: add_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX8-NEXT: .LBB3_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b32 s3, s1 @@ -795,7 +795,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB3_4 @@ -811,10 +813,8 @@ ; ; GFX9-LABEL: add_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-NEXT: .LBB3_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b32 s3, s1 @@ -828,7 +828,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB3_4 @@ -843,10 +845,8 @@ ; ; GFX1064-LABEL: add_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1064-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 @@ -860,7 +860,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB3_4 @@ -877,7 +879,6 @@ ; ; GFX1032-LABEL: add_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB3_1: ; %ComputeLoop @@ -890,7 +891,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB3_4 @@ -907,11 +909,8 @@ ; ; GFX1164-LABEL: add_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164-NEXT: .LBB3_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 @@ -927,8 +926,11 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB3_4 ; GFX1164-NEXT: ; %bb.3: @@ -944,7 +946,6 @@ ; ; GFX1132-LABEL: add_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB3_1: ; %ComputeLoop @@ -959,8 +960,10 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB3_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB3_4 ; GFX1132-NEXT: ; %bb.3: @@ -2071,10 +2074,8 @@ ; ; GFX8-LABEL: sub_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB9_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2091,7 +2092,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -2116,10 +2119,8 @@ ; ; GFX9-LABEL: sub_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB9_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2136,7 +2137,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -2160,10 +2163,8 @@ ; ; GFX1064-LABEL: sub_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2179,7 +2180,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -2206,7 +2209,6 @@ ; ; GFX1032-LABEL: sub_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -2221,7 +2223,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -2248,11 +2251,8 @@ ; ; GFX1164-LABEL: sub_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB9_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -2270,9 +2270,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB9_4 ; GFX1164-NEXT: ; %bb.3: @@ -2298,7 +2302,6 @@ ; ; GFX1132-LABEL: sub_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -2314,9 +2317,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB9_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB9_4 ; GFX1132-NEXT: ; %bb.3: @@ -2358,10 +2363,8 @@ ; ; GFX8-LABEL: sub_i32_varying_nouse: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[0:1], exec ; GFX8-NEXT: s_mov_b32 s2, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX8-NEXT: .LBB10_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX8-NEXT: s_ff1_i32_b32 s3, s1 @@ -2375,7 +2378,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX8-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX8-NEXT: s_cbranch_execz .LBB10_4 @@ -2391,10 +2396,8 @@ ; ; GFX9-LABEL: sub_i32_varying_nouse: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[0:1], exec ; GFX9-NEXT: s_mov_b32 s2, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX9-NEXT: .LBB10_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: s_ff1_i32_b32 s3, s1 @@ -2408,7 +2411,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX9-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX9-NEXT: s_cbranch_execz .LBB10_4 @@ -2423,10 +2428,8 @@ ; ; GFX1064-LABEL: sub_i32_varying_nouse: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[0:1], exec ; GFX1064-NEXT: s_mov_b32 s2, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1064-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1064-NEXT: s_ff1_i32_b32 s3, s1 @@ -2440,7 +2443,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB10_4 @@ -2457,7 +2462,6 @@ ; ; GFX1032-LABEL: sub_i32_varying_nouse: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s1, exec_lo ; GFX1032-NEXT: s_mov_b32 s0, 0 ; GFX1032-NEXT: .LBB10_1: ; %ComputeLoop @@ -2470,7 +2474,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_and_saveexec_b32 s1, vcc_lo ; GFX1032-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1032-NEXT: s_cbranch_execz .LBB10_4 @@ -2487,11 +2492,8 @@ ; ; GFX1164-LABEL: sub_i32_varying_nouse: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec ; GFX1164-NEXT: s_mov_b32 s2, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v1, exec_hi, v1 ; GFX1164-NEXT: .LBB10_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1164-NEXT: s_ctz_i32_b32 s3, s1 @@ -2507,8 +2509,11 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[0:1], exec -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1164-NEXT: s_xor_b64 s[0:1], exec, s[0:1] ; GFX1164-NEXT: s_cbranch_execz .LBB10_4 ; GFX1164-NEXT: ; %bb.3: @@ -2524,7 +2529,6 @@ ; ; GFX1132-LABEL: sub_i32_varying_nouse: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo ; GFX1132-NEXT: s_mov_b32 s0, 0 ; GFX1132-NEXT: .LBB10_1: ; %ComputeLoop @@ -2539,8 +2543,10 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s1, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB10_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s1, exec_lo -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX1132-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX1132-NEXT: s_cbranch_execz .LBB10_4 ; GFX1132-NEXT: ; %bb.3: @@ -3206,9 +3212,7 @@ ; ; GFX8-LABEL: and_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB14_1: ; %ComputeLoop @@ -3226,7 +3230,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3251,9 +3257,7 @@ ; ; GFX9-LABEL: and_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB14_1: ; %ComputeLoop @@ -3271,7 +3275,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3295,10 +3301,8 @@ ; ; GFX1064-LABEL: and_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, -1 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3314,7 +3318,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3341,7 +3347,6 @@ ; ; GFX1032-LABEL: and_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -3356,7 +3361,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -3383,11 +3389,8 @@ ; ; GFX1164-LABEL: and_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB14_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3405,9 +3408,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB14_4 ; GFX1164-NEXT: ; %bb.3: @@ -3433,7 +3440,6 @@ ; ; GFX1132-LABEL: and_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -3449,9 +3455,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB14_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB14_4 ; GFX1132-NEXT: ; %bb.3: @@ -3499,10 +3507,8 @@ ; ; GFX8-LABEL: or_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB15_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3519,7 +3525,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3544,10 +3552,8 @@ ; ; GFX9-LABEL: or_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB15_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3564,7 +3570,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3588,10 +3596,8 @@ ; ; GFX1064-LABEL: or_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3607,7 +3613,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3634,7 +3642,6 @@ ; ; GFX1032-LABEL: or_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -3649,7 +3656,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -3676,11 +3684,8 @@ ; ; GFX1164-LABEL: or_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB15_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3698,9 +3703,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB15_4 ; GFX1164-NEXT: ; %bb.3: @@ -3726,7 +3735,6 @@ ; ; GFX1132-LABEL: or_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -3742,9 +3750,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB15_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB15_4 ; GFX1132-NEXT: ; %bb.3: @@ -3792,10 +3802,8 @@ ; ; GFX8-LABEL: xor_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB16_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3812,7 +3820,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3837,10 +3847,8 @@ ; ; GFX9-LABEL: xor_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB16_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3857,7 +3865,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3881,10 +3891,8 @@ ; ; GFX1064-LABEL: xor_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3900,7 +3908,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -3927,7 +3937,6 @@ ; ; GFX1032-LABEL: xor_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -3942,7 +3951,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -3969,11 +3979,8 @@ ; ; GFX1164-LABEL: xor_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB16_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -3991,9 +3998,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB16_4 ; GFX1164-NEXT: ; %bb.3: @@ -4019,7 +4030,6 @@ ; ; GFX1132-LABEL: xor_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -4035,9 +4045,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB16_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB16_4 ; GFX1132-NEXT: ; %bb.3: @@ -4085,9 +4097,7 @@ ; ; GFX8-LABEL: max_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: s_brev_b32 s4, 1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB17_1: ; %ComputeLoop @@ -4105,7 +4115,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -4130,9 +4142,7 @@ ; ; GFX9-LABEL: max_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: s_brev_b32 s4, 1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB17_1: ; %ComputeLoop @@ -4150,7 +4160,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -4174,10 +4186,8 @@ ; ; GFX1064-LABEL: max_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_brev_b32 s4, 1 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4193,7 +4203,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -4220,7 +4232,6 @@ ; ; GFX1032-LABEL: max_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_brev_b32 s2, 1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -4235,7 +4246,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -4262,11 +4274,8 @@ ; ; GFX1164-LABEL: max_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_brev_b32 s4, 1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB17_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4284,9 +4293,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB17_4 ; GFX1164-NEXT: ; %bb.3: @@ -4312,7 +4325,6 @@ ; ; GFX1132-LABEL: max_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_brev_b32 s2, 1 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -4328,9 +4340,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB17_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB17_4 ; GFX1132-NEXT: ; %bb.3: @@ -4629,9 +4643,7 @@ ; ; GFX8-LABEL: min_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: s_brev_b32 s4, -2 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB19_1: ; %ComputeLoop @@ -4649,7 +4661,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -4674,9 +4688,7 @@ ; ; GFX9-LABEL: min_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: s_brev_b32 s4, -2 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB19_1: ; %ComputeLoop @@ -4694,7 +4706,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -4718,10 +4732,8 @@ ; ; GFX1064-LABEL: min_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_brev_b32 s4, -2 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4737,7 +4749,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -4764,7 +4778,6 @@ ; ; GFX1032-LABEL: min_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_brev_b32 s2, -2 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -4779,7 +4792,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -4806,11 +4820,8 @@ ; ; GFX1164-LABEL: min_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_brev_b32 s4, -2 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB19_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -4828,9 +4839,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB19_4 ; GFX1164-NEXT: ; %bb.3: @@ -4856,7 +4871,6 @@ ; ; GFX1132-LABEL: min_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_brev_b32 s2, -2 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -4872,9 +4886,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB19_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB19_4 ; GFX1132-NEXT: ; %bb.3: @@ -5173,10 +5189,8 @@ ; ; GFX8-LABEL: umax_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB21_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5193,7 +5207,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -5218,10 +5234,8 @@ ; ; GFX9-LABEL: umax_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB21_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5238,7 +5252,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -5262,10 +5278,8 @@ ; ; GFX1064-LABEL: umax_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, 0 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5281,7 +5295,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -5308,7 +5324,6 @@ ; ; GFX1032-LABEL: umax_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, 0 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -5323,7 +5338,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -5350,11 +5366,8 @@ ; ; GFX1164-LABEL: umax_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, 0 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB21_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5372,9 +5385,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB21_4 ; GFX1164-NEXT: ; %bb.3: @@ -5400,7 +5417,6 @@ ; ; GFX1132-LABEL: umax_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, 0 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -5416,9 +5432,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB21_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB21_4 ; GFX1132-NEXT: ; %bb.3: @@ -5712,9 +5730,7 @@ ; ; GFX8-LABEL: umin_i32_varying: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: s_mov_b32 s4, -1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB23_1: ; %ComputeLoop @@ -5732,7 +5748,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -5757,9 +5775,7 @@ ; ; GFX9-LABEL: umin_i32_varying: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: s_mov_b32 s4, -1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB23_1: ; %ComputeLoop @@ -5777,7 +5793,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -5801,10 +5819,8 @@ ; ; GFX1064-LABEL: umin_i32_varying: ; GFX1064: ; %bb.0: ; %entry -; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1064-NEXT: s_mov_b64 s[2:3], exec ; GFX1064-NEXT: s_mov_b32 s4, -1 -; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1064-NEXT: ; implicit-def: $vgpr1 ; GFX1064-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5820,7 +5836,9 @@ ; GFX1064-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1064-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1064-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: ; implicit-def: $vgpr0 ; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX1064-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -5847,7 +5865,6 @@ ; ; GFX1032-LABEL: umin_i32_varying: ; GFX1032: ; %bb.0: ; %entry -; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1032-NEXT: s_mov_b32 s3, exec_lo ; GFX1032-NEXT: s_mov_b32 s2, -1 ; GFX1032-NEXT: ; implicit-def: $vgpr1 @@ -5862,7 +5879,8 @@ ; GFX1032-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1032-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1032-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: ; implicit-def: $vgpr0 ; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1032-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -5889,11 +5907,8 @@ ; ; GFX1164-LABEL: umin_i32_varying: ; GFX1164: ; %bb.0: ; %entry -; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX1164-NEXT: s_mov_b64 s[2:3], exec ; GFX1164-NEXT: s_mov_b32 s4, -1 -; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX1164-NEXT: ; implicit-def: $vgpr1 ; GFX1164-NEXT: .LBB23_1: ; %ComputeLoop ; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -5911,9 +5926,13 @@ ; GFX1164-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX1164-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1164-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1164-NEXT: s_mov_b64 s[2:3], exec +; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: ; implicit-def: $vgpr0 -; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1164-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX1164-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX1164-NEXT: s_cbranch_execz .LBB23_4 ; GFX1164-NEXT: ; %bb.3: @@ -5939,7 +5958,6 @@ ; ; GFX1132-LABEL: umin_i32_varying: ; GFX1132: ; %bb.0: ; %entry -; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX1132-NEXT: s_mov_b32 s3, exec_lo ; GFX1132-NEXT: s_mov_b32 s2, -1 ; GFX1132-NEXT: ; implicit-def: $vgpr1 @@ -5955,9 +5973,11 @@ ; GFX1132-NEXT: s_cmp_lg_u32 s3, 0 ; GFX1132-NEXT: s_cbranch_scc1 .LBB23_1 ; GFX1132-NEXT: ; %bb.2: ; %ComputeEnd -; GFX1132-NEXT: s_mov_b32 s3, exec_lo +; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: ; implicit-def: $vgpr0 -; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX1132-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX1132-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX1132-NEXT: s_cbranch_execz .LBB23_4 ; GFX1132-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_raw_buffer.ll @@ -448,10 +448,8 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -468,7 +466,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -492,10 +492,8 @@ ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -512,7 +510,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -535,10 +535,8 @@ ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -554,7 +552,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -578,7 +578,6 @@ ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -593,7 +592,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -617,11 +617,8 @@ ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -639,9 +636,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: @@ -664,7 +665,6 @@ ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -680,9 +680,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: @@ -1224,10 +1226,8 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB6_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1244,7 +1244,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1268,10 +1270,8 @@ ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB6_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1288,7 +1288,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1311,10 +1313,8 @@ ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1330,7 +1330,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1354,7 +1356,6 @@ ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -1369,7 +1370,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -1393,11 +1395,8 @@ ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB6_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1415,9 +1414,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W64-NEXT: ; %bb.3: @@ -1440,7 +1443,6 @@ ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -1456,9 +1458,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB6_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB6_4 ; GFX11W32-NEXT: ; %bb.3: diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_struct_buffer.ll @@ -463,10 +463,8 @@ ; ; GFX8-LABEL: add_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB2_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -483,7 +481,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -508,10 +508,8 @@ ; ; GFX9-LABEL: add_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB2_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -528,7 +526,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -552,10 +552,8 @@ ; ; GFX10W64-LABEL: add_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -571,7 +569,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -596,7 +596,6 @@ ; ; GFX10W32-LABEL: add_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -611,7 +610,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -636,11 +636,8 @@ ; ; GFX11W64-LABEL: add_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB2_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -658,9 +655,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W64-NEXT: ; %bb.3: @@ -684,7 +685,6 @@ ; ; GFX11W32-LABEL: add_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -700,9 +700,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB2_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB2_4 ; GFX11W32-NEXT: ; %bb.3: @@ -1362,10 +1364,8 @@ ; ; GFX8-LABEL: sub_i32_varying_vdata: ; GFX8: ; %bb.0: ; %entry -; GFX8-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX8-NEXT: s_mov_b64 s[2:3], exec ; GFX8-NEXT: s_mov_b32 s4, 0 -; GFX8-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX8-NEXT: ; implicit-def: $vgpr1 ; GFX8-NEXT: .LBB7_1: ; %ComputeLoop ; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1382,7 +1382,9 @@ ; GFX8-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX8-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX8-NEXT: ; %bb.2: ; %ComputeEnd -; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX8-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX8-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: ; implicit-def: $vgpr0 ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1407,10 +1409,8 @@ ; ; GFX9-LABEL: sub_i32_varying_vdata: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX9-NEXT: s_mov_b64 s[2:3], exec ; GFX9-NEXT: s_mov_b32 s4, 0 -; GFX9-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX9-NEXT: ; implicit-def: $vgpr1 ; GFX9-NEXT: .LBB7_1: ; %ComputeLoop ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1427,7 +1427,9 @@ ; GFX9-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX9-NEXT: ; %bb.2: ; %ComputeEnd -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: ; implicit-def: $vgpr0 ; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1451,10 +1453,8 @@ ; ; GFX10W64-LABEL: sub_i32_varying_vdata: ; GFX10W64: ; %bb.0: ; %entry -; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX10W64-NEXT: s_mov_b64 s[2:3], exec ; GFX10W64-NEXT: s_mov_b32 s4, 0 -; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX10W64-NEXT: ; implicit-def: $vgpr1 ; GFX10W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX10W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1470,7 +1470,9 @@ ; GFX10W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX10W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; GFX10W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX10W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX10W64-NEXT: ; implicit-def: $vgpr0 ; GFX10W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX10W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] @@ -1495,7 +1497,6 @@ ; ; GFX10W32-LABEL: sub_i32_varying_vdata: ; GFX10W32: ; %bb.0: ; %entry -; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX10W32-NEXT: s_mov_b32 s3, exec_lo ; GFX10W32-NEXT: s_mov_b32 s2, 0 ; GFX10W32-NEXT: ; implicit-def: $vgpr1 @@ -1510,7 +1511,8 @@ ; GFX10W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX10W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX10W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX10W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX10W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX10W32-NEXT: ; implicit-def: $vgpr0 ; GFX10W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX10W32-NEXT: s_xor_b32 s3, exec_lo, s3 @@ -1535,11 +1537,8 @@ ; ; GFX11W64-LABEL: sub_i32_varying_vdata: ; GFX11W64: ; %bb.0: ; %entry -; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v1, exec_lo, 0 ; GFX11W64-NEXT: s_mov_b64 s[2:3], exec ; GFX11W64-NEXT: s_mov_b32 s4, 0 -; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v1 ; GFX11W64-NEXT: ; implicit-def: $vgpr1 ; GFX11W64-NEXT: .LBB7_1: ; %ComputeLoop ; GFX11W64-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -1557,9 +1556,13 @@ ; GFX11W64-NEXT: s_cmp_lg_u64 s[2:3], 0 ; GFX11W64-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W64-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W64-NEXT: s_mov_b64 s[2:3], exec +; GFX11W64-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11W64-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0 +; GFX11W64-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX11W64-NEXT: ; implicit-def: $vgpr0 -; GFX11W64-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX11W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11W64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX11W64-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W64-NEXT: ; %bb.3: @@ -1583,7 +1586,6 @@ ; ; GFX11W32-LABEL: sub_i32_varying_vdata: ; GFX11W32: ; %bb.0: ; %entry -; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0 ; GFX11W32-NEXT: s_mov_b32 s3, exec_lo ; GFX11W32-NEXT: s_mov_b32 s2, 0 ; GFX11W32-NEXT: ; implicit-def: $vgpr1 @@ -1599,9 +1601,11 @@ ; GFX11W32-NEXT: s_cmp_lg_u32 s3, 0 ; GFX11W32-NEXT: s_cbranch_scc1 .LBB7_1 ; GFX11W32-NEXT: ; %bb.2: ; %ComputeEnd -; GFX11W32-NEXT: s_mov_b32 s3, exec_lo +; GFX11W32-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0 +; GFX11W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX11W32-NEXT: ; implicit-def: $vgpr0 -; GFX11W32-NEXT: v_cmpx_eq_u32_e32 0, v2 +; GFX11W32-NEXT: s_and_saveexec_b32 s3, vcc_lo ; GFX11W32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX11W32-NEXT: s_cbranch_execz .LBB7_4 ; GFX11W32-NEXT: ; %bb.3: