diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2107,6 +2107,25 @@
   (EXTRACT_SUBREG $a, sub0)
 >;
 
+// trunc to i1 matched through UniformUnaryFrag: mask bit 0 with S_AND_B32
+// and test it with S_CMP_EQ_U32 rather than with V_CMP_EQ_U32_e64.
+def : GCNPat <
+  (i1 (UniformUnaryFrag<trunc> i32:$a)),
+  (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+def : GCNPat <
+  (i1 (UniformUnaryFrag<trunc> i16:$a)),
+  (S_CMP_EQ_U32 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
+// For an i64 source only the sub0 subregister is masked and compared.
+def : GCNPat <
+  (i1 (UniformUnaryFrag<trunc> i64:$a)),
+  (S_CMP_EQ_U32 (S_AND_B32 (i32 1),
+    (i32 (EXTRACT_SUBREG $a, sub0))), (i32 1))
+>;
+
 def : GCNPat <
   (i1 (trunc i32:$a)),
   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fold-binop-select.ll
@@ -397,7 +397,8 @@
 ; GCN-NEXT: v_mov_b32_e32 v1, 0x80
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s0, 1, s0
-; GCN-NEXT: v_cmp_eq_u32_e64 vcc, s0, 1
+; GCN-NEXT: s_cmp_eq_u32 s0, 1
+; GCN-NEXT: s_cselect_b64 vcc, -1, 0
 ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
 ; GCN-NEXT: flat_store_short v[0:1], v0
 ; GCN-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
--- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -175,7 +175,8 @@
 ; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s4, 1, s4
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT: s_cmp_eq_u32 s4, 1
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_cbranch_vccnz BB4_2
@@ -220,7 +221,8 @@
 ; GCN-NEXT: s_addc_u32 s1, s1, 0
 ; GCN-NEXT: s_waitcnt lgkmcnt(0)
 ; GCN-NEXT: s_and_b32 s4, 1, s4
-; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT: s_cmp_eq_u32 s4, 1
+; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0
 ; GCN-NEXT: s_and_b64 vcc, exec, s[4:5]
 ; GCN-NEXT: s_mov_b32 s32, 0
 ; GCN-NEXT: s_cbranch_vccnz BB5_2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -17,7 +17,7 @@
 ; VI-DAG: s_load_dword [[SC:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x94
 
 ; GCN-DAG: s_and_b32 [[AND_I1:s[0-9]+]], 1, s{{[0-9]+}}
-; GCN: v_cmp_eq_u32_e64 vcc, [[AND_I1]], 1
+; GCN: s_cmp_eq_u32 [[AND_I1]], 1
 
 ; GCN-DAG: v_mov_b32_e32 [[VC:v[0-9]+]], [[SC]]
 ; GCN-DAG: v_mov_b32_e32 [[VB:v[0-9]+]], [[SB]]
diff --git a/llvm/test/CodeGen/AMDGPU/select-i1.ll b/llvm/test/CodeGen/AMDGPU/select-i1.ll
--- a/llvm/test/CodeGen/AMDGPU/select-i1.ll
+++ b/llvm/test/CodeGen/AMDGPU/select-i1.ll
@@ -15,12 +15,14 @@
 
 ; GCN-LABEL: {{^}}s_minmax_i1:
 ; GCN: s_load_dword [[LOAD:s[0-9]+]],
+; GCN: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
+; GCN: s_cmp_eq_u32 [[COND]], 1
+; GCN: s_cselect_b64 vcc, -1, 0
 ; GCN-DAG: s_lshr_b32 [[A:s[0-9]+]], [[LOAD]], 8
 ; GCN-DAG: s_lshr_b32 [[B:s[0-9]+]], [[LOAD]], 16
-; GCN-DAG: s_and_b32 [[COND:s[0-9]+]], 1, [[LOAD]]
+
 ; GCN: v_mov_b32_e32 [[V_B:v[0-9]+]], [[B]]
 ; GCN: v_mov_b32_e32 [[V_A:v[0-9]+]], [[A]]
-; GCN: v_cmp_eq_u32_e64 vcc, [[COND]], 1
 ; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], [[V_B]], [[V_A]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 1, [[SEL]]
 define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, [8 x i32], i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
diff --git a/llvm/test/CodeGen/AMDGPU/trunc.ll b/llvm/test/CodeGen/AMDGPU/trunc.ll
--- a/llvm/test/CodeGen/AMDGPU/trunc.ll
+++ b/llvm/test/CodeGen/AMDGPU/trunc.ll
@@ -97,9 +97,9 @@
 ; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x13
 ; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x4c
 ; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
-; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
+; GCN: s_cmp_eq_u32 [[MASKED]], 1{{$}}
+; SI: s_cselect_b64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], -1, 0
 ; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-; VI: s_cmp_lg_u64 s{{\[}}[[VLO]]:[[VHI]]], 0
 ; VI: s_cselect_b32 {{s[0-9]+}}, 63, -12
 define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, [8 x i32], i64 %x) {
   %trunc = trunc i64 %x to i1
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -418,8 +418,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f32:
-; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
-; GFX1064: v_cmp_eq_u32_e64 vcc,
+; GFX1032: s_cmp_eq_u32 s0, 1
+; GFX1032: s_cselect_b32 vcc_lo, -1, 0
+; GFX1064: s_cmp_eq_u32 s0, 1
+; GFX1064: s_cselect_b64 vcc, -1, 0
 ; GCN: v_div_fmas_f32 v{{[0-9]+}}, {{[vs][0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
@@ -428,8 +430,10 @@
 }
 
 ; GCN-LABEL: {{^}}test_div_fmas_f64:
-; GFX1032: v_cmp_eq_u32_e64 vcc_lo,
-; GFX1064: v_cmp_eq_u32_e64 vcc,
+; GFX1032: s_cmp_eq_u32 s0, 1
+; GFX1032: s_cselect_b32 vcc_lo, -1, 0
+; GFX1064: s_cmp_eq_u32 s0, 1
+; GFX1064: s_cselect_b64 vcc, -1, 0
 ; GCN-DAG: v_div_fmas_f64 v[{{[0-9:]+}}], {{[vs]}}[{{[0-9:]+}}], v[{{[0-9:]+}}], v[{{[0-9:]+}}]
 define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone