diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2295,17 +2295,40 @@
   (REG_SEQUENCE SReg_64, $src, sub0, (i32 (IMPLICIT_DEF)), sub1)
 >;
 
-class ZExt_i64_i1_Pat <SDNode ext> : GCNPat <
-  (i64 (ext i1:$src)),
-    (REG_SEQUENCE VReg_64,
-      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
-                         /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
-      sub0, (S_MOV_B32 (i32 0)), sub1)
->;
+// Extend an i1 to i64. `ext` is the extension node being matched.
+// The first pattern selects 0/1 into the low half with V_CNDMASK_B32 and
+// writes 0 to the high half; the other two match the same extension wrapped
+// in UniformUnaryFrag and use a single S_BFE_U64 instead.
+multiclass ZExt_i64_i1_Pat <SDNode ext> {
+  def: GCNPat <
+    (i64 (ext i1:$src)),
+    (REG_SEQUENCE VReg_64,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                         /*src1mod*/(i32 0), /*src1*/(i32 1), $src),
+      sub0, (S_MOV_B32 (i32 0)), sub1)
+  >;
+
+  // 0x10000 is the S_BFE operand with width = 1 (bits 22:16) and offset = 0,
+  // i.e. extract bit 0 of $src. In wave32 $src is a 32-bit SGPR, so it is
+  // first widened to a 64-bit pair; the undefined high half is never read.
+  // NOTE(review): only bit 0 of the mask is read - confirm this is valid when
+  // lane 0 is inactive.
+  let WaveSizePredicate = isWave32 in
+  def : GCNPat <
+    (i64 (UniformUnaryFrag<ext> SReg_1:$src)),
+    (S_BFE_U64 (REG_SEQUENCE SReg_64, SReg_32:$src, sub0, (i32 (IMPLICIT_DEF)), sub1), (i32 0x10000))
+  >;
+
+  let WaveSizePredicate = isWave64 in
+  def : GCNPat <
+    (i64 (UniformUnaryFrag<ext> SReg_1:$src)),
+    (S_BFE_U64 SReg_64:$src, (i32 0x10000))
+  >;
+}
 
-def : ZExt_i64_i1_Pat<zext>;
-def : ZExt_i64_i1_Pat<anyext>;
+defm : ZExt_i64_i1_Pat<zext>;
+defm : ZExt_i64_i1_Pat<anyext>;
 
 // FIXME: We need to use COPY_TO_REGCLASS to work-around the fact that
 // REG_SEQUENCE patterns don't support instructions with multiple outputs.
diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -29,10 +29,11 @@ ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 ; SI-NEXT: s_xor_b64 s[4:5], s[6:7], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: s_bfe_u64 s[4:5], s[4:5], 0x10000 +; SI-NEXT: s_add_u32 s4, s10, s4 +; SI-NEXT: s_addc_u32 s5, s11, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -45,15 +46,16 @@ ; VI-NEXT: s_add_u32 s2, s6, s0 ; VI-NEXT: v_mov_b32_e32 v2, s7 ; VI-NEXT: s_addc_u32 s3, s7, s1 -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[0:1], 0 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[1:2] -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_add_u32_e32 v2, vcc, s2, v2 +; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc +; VI-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x10000 +; VI-NEXT: s_add_u32 s0, s2, s0 +; VI-NEXT: s_addc_u32 s1, s3, s1 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -67,13 +69,14 @@ ; GFX9-NEXT: s_add_u32 s0, s6, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s7, s3 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[2:3], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0 +; GFX9-NEXT: s_xor_b64 s[2:3], s[2:3], vcc +; GFX9-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x10000 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 
; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm ; @@ -87,11 +90,13 @@ ; GFX10-NEXT: s_add_u32 s0, s6, s2 ; GFX10-NEXT: s_addc_u32 s1, s7, s3 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[6:7] -; GFX10-NEXT: s_xor_b32 s2, s2, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX10-NEXT: v_add_co_u32 v0, s0, s0, v0 -; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s1, 0, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[0:1], s[6:7] +; GFX10-NEXT: s_xor_b32 s2, s2, s6 +; GFX10-NEXT: s_bfe_u64 s[2:3], s[2:3], 0x10000 +; GFX10-NEXT: s_add_u32 s0, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s1, s3 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX10-NEXT: s_endpgm ; @@ -100,18 +105,19 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s2, s6, s0 ; GFX11-NEXT: s_addc_u32 s3, s7, s1 ; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], s[6:7] +; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[2:3], s[6:7] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_add_co_u32 v0, s0, s2, v0 -; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 +; GFX11-NEXT: s_xor_b32 s0, s0, s6 +; GFX11-NEXT: s_bfe_u64 s[0:1], s[0:1], 0x10000 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_add_u32 s0, s2, s0 +; GFX11-NEXT: 
s_addc_u32 s1, s3, s1 +; GFX11-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[4:5] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -251,7 +257,7 @@ ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v1 +; SI-NEXT: v_add_i32_e32 v2, vcc, v1, v0 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 ; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 ; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] @@ -275,7 +281,7 @@ ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5 +; VI-NEXT: v_add_u32_e32 v6, vcc, v5, v4 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 ; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 ; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] @@ -606,8 +612,8 @@ ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3 -; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 +; SI-NEXT: v_add_i32_e32 v5, vcc, v3, v1 +; SI-NEXT: v_add_i32_e32 v4, vcc, v2, v0 ; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 ; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 ; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 @@ -635,8 +641,8 @@ ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3 -; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 +; VI-NEXT: v_add_u32_e32 v9, vcc, v3, v1 +; VI-NEXT: v_add_u32_e32 v8, vcc, v2, v0 ; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 ; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 ; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -7,21 +7,22 @@ ; SI-LABEL: s_uaddo_i64_zext: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 
0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_add_u32 s0, s6, s0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_addc_u32 s1, s7, s1 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: s_bfe_u64 s[6:7], vcc, 0x10000 +; SI-NEXT: s_add_u32 s6, s0, s6 +; SI-NEXT: s_addc_u32 s7, s1, s7 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_add_u32 s4, s6, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_addc_u32 s5, s7, s9 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -30,17 +31,18 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_add_u32 s0, s6, s0 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_addc_u32 s1, s7, s1 -; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -52,13 +54,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_add_u32 s0, s6, s2 -; GFX9-NEXT: 
v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_addc_u32 s1, s7, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b) @@ -475,7 +478,7 @@ ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_add_i32_e32 v0, vcc, v0, v1 +; SI-NEXT: v_add_i32_e32 v0, vcc, v1, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -8,21 +8,22 @@ ; SI-LABEL: s_usubo_i64_zext: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd ; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_sub_u32 s0, s6, s0 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 +; SI-NEXT: s_subb_u32 s1, s7, s1 +; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: s_bfe_u64 s[6:7], vcc, 0x10000 +; SI-NEXT: s_add_u32 s6, s0, s6 +; SI-NEXT: s_addc_u32 s7, s1, s7 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 -; SI-NEXT: s_sub_u32 s4, s6, s8 ; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: v_mov_b32_e32 v1, s7 -; SI-NEXT: s_subb_u32 s5, s7, s9 -; SI-NEXT: v_cmp_gt_u64_e32 vcc, s[4:5], v[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; 
SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_add_i32_e32 v0, vcc, s4, v0 -; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -31,17 +32,18 @@ ; VI-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s6 +; VI-NEXT: v_mov_b32_e32 v1, s6 ; VI-NEXT: s_sub_u32 s0, s6, s0 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: s_subb_u32 s1, s7, s1 -; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[2:3] -; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc -; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 +; VI-NEXT: v_mov_b32_e32 v2, s7 +; VI-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000 +; VI-NEXT: s_add_u32 s0, s0, s2 +; VI-NEXT: s_addc_u32 s1, s1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm ; @@ -53,13 +55,14 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_sub_u32 s0, s6, s2 -; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_subb_u32 s1, s7, s3 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_bfe_u64 s[2:3], vcc, 0x10000 +; GFX9-NEXT: s_add_u32 s0, s0, s2 +; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v0 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[4:5] ; GFX9-NEXT: s_endpgm %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0 @@ -475,7 +478,7 @@ ; SI-NEXT: s_mov_b32 s4, s2 ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_sub_i32_e32 
v0, vcc, v0, v1 +; SI-NEXT: v_subrev_i32_e32 v0, vcc, v1, v0 ; SI-NEXT: v_and_b32_e32 v1, 0xffff, v0 ; SI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, v1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/zero_extend.ll b/llvm/test/CodeGen/AMDGPU/zero_extend.ll --- a/llvm/test/CodeGen/AMDGPU/zero_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/zero_extend.ll @@ -38,7 +38,7 @@ ; GCN-LABEL: {{^}}s_cmp_zext_i1_to_i64: ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, 0 ; GCN-DAG: s_cmp_eq_u32 -; GCN: v_cndmask_b32 +; GCN: s_bfe_u64 define amdgpu_kernel void @s_cmp_zext_i1_to_i64(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { %cmp = icmp eq i32 %a, %b %ext = zext i1 %cmp to i64