diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2116,6 +2116,19 @@
 }
 } // end isWave32
 
+def : GCNPat <
+  (i32 (DivergentBinFrag<xor> i32:$src0, (i32 -1))),
+  (V_NOT_B32_e32 $src0)
+>;
+
+def : GCNPat <
+  (i64 (DivergentBinFrag<xor> i64:$src0, (i64 -1))),
+  (REG_SEQUENCE VReg_64,
+    (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub0))), sub0,
+    (V_NOT_B32_e32 (i32 (EXTRACT_SUBREG i64:$src0, sub1))), sub1
+  )
+>;
+
 def : GCNPat <
   (f16 (sint_to_fp i1:$src)),
   (V_CVT_F16_F32_e32 (
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -157,6 +157,42 @@
   let has_sdst = 0;
 }
 
+class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return !N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
+class UniformBinFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1),
+  (Op $src0, $src1),
+  [{ return !N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
+class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0, node:$src1),
+  (Op $src0, $src1),
+  [{ return N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
 let isMoveImm = 1 in {
   let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
@@ -172,11 +208,11 @@
 
 let Defs = [SCC] in {
 def S_NOT_B32 : SOP1_32 <"s_not_b32",
-  [(set i32:$sdst, (not i32:$src0))]
+  [(set i32:$sdst, (UniformUnaryFrag<not> i32:$src0))]
 >;
 
 def S_NOT_B64 : SOP1_64 <"s_not_b64",
-  [(set i64:$sdst, (not i64:$src0))]
+  [(set i64:$sdst, (UniformUnaryFrag<not> i64:$src0))]
 >;
 def S_WQM_B32 : SOP1_32 <"s_wqm_b32">;
 def S_WQM_B64 : SOP1_64 <"s_wqm_b64">;
@@ -426,41 +462,6 @@
   "$sdst, $src0, $src1", pattern
 >;
 
-class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
-  (ops node:$src0),
-  (Op $src0),
-  [{ return !N->isDivergent(); }]> {
-  // This check is unnecessary as it's captured by the result register
-  // bank constraint.
-  //
-  // FIXME: Should add a way for the emitter to recognize this is a
-  // trivially true predicate to eliminate the check.
-  let GISelPredicateCode = [{return true;}];
-}
-
-class UniformBinFrag<SDPatternOperator Op> : PatFrag <
-  (ops node:$src0, node:$src1),
-  (Op $src0, $src1),
-  [{ return !N->isDivergent(); }]> {
-  // This check is unnecessary as it's captured by the result register
-  // bank constraint.
-  //
-  // FIXME: Should add a way for the emitter to recognize this is a
-  // trivially true predicate to eliminate the check.
-  let GISelPredicateCode = [{return true;}];
-}
-
-class DivergentBinFrag<SDPatternOperator Op> : PatFrag <
-  (ops node:$src0, node:$src1),
-  (Op $src0, $src1),
-  [{ return N->isDivergent(); }]> {
-  // This check is unnecessary as it's captured by the result register
-  // bank constraint.
-  //
-  // FIXME: Should add a way for the emitter to recognize this is a
-  // trivially true predicate to eliminate the check.
-  let GISelPredicateCode = [{return true;}];
-}
 
 let Defs = [SCC] in { // Carry out goes to SCC
 let isCommutable = 1 in {
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-not-isel.ll
@@ -0,0 +1,34 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name: scalar_not_i32
+; GCN: S_NOT_B32
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %val) {
+  %not.val = xor i32 %val, -1
+  store i32 %not.val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: scalar_not_i64
+; GCN: S_NOT_B64
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %val) {
+  %not.val = xor i64 %val, -1
+  store i64 %not.val, i64 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: vector_not_i32
+; GCN: V_NOT_B32_e32
+define i32 @vector_not_i32(i32 %val) {
+  %not.val = xor i32 %val, -1
+  ret i32 %not.val
+}
+
+; GCN-LABEL: name: vector_not_i64
+; GCN: V_NOT_B32_e32
+; GCN: V_NOT_B32_e32
+define i64 @vector_not_i64(i64 %val) {
+  %not.val = xor i64 %val, -1
+  ret i64 %not.val
+}
+
+
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -40,8 +40,8 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_or3_b32 v1, v3, v1, v5
 ; GCN-NEXT:    v_or3_b32 v0, v2, v0, v4
-; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
+; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
@@ -103,8 +103,8 @@
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
 ; GCN-NEXT:    v_and_b32_e32 v1, v1, v5
 ; GCN-NEXT:    v_and_b32_e32 v0, v0, v4
-; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    v_not_b32_e32 v1, v1
+; GCN-NEXT:    v_not_b32_e32 v0, v0
 ; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
--- a/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -89,8 +89,8 @@
 ; SI-NEXT:    v_add_i32_e32 v6, vcc, s4, v4
 ; SI-NEXT:    v_lshr_b64 v[4:5], s[2:3], v6
 ; SI-NEXT:    v_and_b32_e32 v7, 0x80000000, v3
-; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_not_b32_e32 v5, v5
+; SI-NEXT:    v_not_b32_e32 v4, v4
 ; SI-NEXT:    v_and_b32_e32 v5, v3, v5
 ; SI-NEXT:    v_and_b32_e32 v4, v2, v4
 ; SI-NEXT:    v_cmp_gt_i32_e32 vcc, 0, v6
diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll
--- a/llvm/test/CodeGen/AMDGPU/urem64.ll
+++ b/llvm/test/CodeGen/AMDGPU/urem64.ll
@@ -375,11 +375,11 @@
 ; GCN-IR-NEXT:  ; %bb.2: ; %udiv-preheader
 ; GCN-IR-NEXT:    v_add_i32_e32 v14, vcc, -1, v2
 ; GCN-IR-NEXT:    v_addc_u32_e32 v15, vcc, -1, v3, vcc
-; GCN-IR-NEXT:    v_not_b32_e32 v6, v8
-; GCN-IR-NEXT:    v_not_b32_e32 v7, v9
-; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v6, v10
+; GCN-IR-NEXT:    v_not_b32_e32 v7, v8
+; GCN-IR-NEXT:    v_not_b32_e32 v6, v9
+; GCN-IR-NEXT:    v_add_i32_e32 v8, vcc, v7, v10
 ; GCN-IR-NEXT:    v_lshr_b64 v[12:13], v[0:1], v12
-; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v7, v11, vcc
+; GCN-IR-NEXT:    v_addc_u32_e32 v9, vcc, v6, v11, vcc
 ; GCN-IR-NEXT:    v_mov_b32_e32 v10, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v11, 0
 ; GCN-IR-NEXT:    v_mov_b32_e32 v7, 0