diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -551,11 +551,11 @@
 >;
 
 def S_XNOR_B32 : SOP2_32 <"s_xnor_b32",
-  [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))]
+  [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))]
 >;
 
 def S_XNOR_B64 : SOP2_64 <"s_xnor_b64",
-  [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))]
+  [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))]
 >;
 
 def S_NAND_B32 : SOP2_32 <"s_nand_b32",
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -688,6 +688,21 @@
 let isReMaterializable = 1 in
 defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>;
 
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
+  (i32 (V_XNOR_B32_e64 $src0, $src1))
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
+  (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64
+                           (i32 (EXTRACT_SUBREG $src0, sub0)),
+                           (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+                         (i32 (V_XNOR_B32_e64
+                           (i32 (EXTRACT_SUBREG $src0, sub1)),
+                           (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
 let Constraints = "$vdst = $src2",
     DisableEncoding = "$src2",
     isConvertibleToThreeAddress = 1,
@@ -696,6 +711,21 @@
 
 } // End SubtargetPredicate = HasDLInsts
 
+def : GCNPat<
+  (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))),
+  (i32 (V_XOR_B32_e64 (i32 (V_NOT_B32_e64 $src0)), $src1))
+>;
+
+def : GCNPat<
+  (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))),
+  (REG_SEQUENCE VReg_64, (i32 (V_XOR_B32_e64
+                           (i32 (V_NOT_B32_e64 (i32 (EXTRACT_SUBREG $src0, sub0)))),
+                           (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0,
+                         (i32 (V_XOR_B32_e64
+                           (i32 (V_NOT_B32_e64 (i32 (EXTRACT_SUBREG $src0, sub1)))),
+                           (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1)
+>;
+
 let SubtargetPredicate = HasFmaLegacy32 in {
 
 let Constraints = "$vdst = $src2",
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -794,6 +794,18 @@
   list<dag> ret = [!con(Outs, (set Ins))];
 }
 
+class DivergentUnaryFrag<SDPatternOperator Op> : PatFrag <
+  (ops node:$src0),
+  (Op $src0),
+  [{ return N->isDivergent(); }]> {
+  // This check is unnecessary as it's captured by the result register
+  // bank constraint.
+  //
+  // FIXME: Should add a way for the emitter to recognize this is a
+  // trivially true predicate to eliminate the check.
+  let GISelPredicateCode = [{return true;}];
+}
+
 class VOPPatOrNull<VOPProfile P> {
   list<dag> ret = !if(!ne(P.NeedPatGen,PatGenMode.NoPattern), VOPPatGen<P>.ret, []);
 }
diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll
@@ -0,0 +1,40 @@
+; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: name: uniform_xnor_i64
+; GCN: S_XNOR_B64
+define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+  %xor = xor i64 %a, %b
+  %res = xor i64 %xor, -1
+  store i64 %res, i64 addrspace(1)* %out
+  ret void
+}
+; GCN-LABEL: name: divergent_xnor_i64
+; GCN: V_NOT_B32_e64
+; GCN: V_XOR_B32_e64
+; GCN: V_NOT_B32_e64
+; GCN: V_XOR_B32_e64
+define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+  %xor = xor i64 %a, %b
+  %res = xor i64 %xor, -1
+  ret i64 %res
+}
+
+; GCN-LABEL: name: uniform_xnor_i32
+; GCN: S_XNOR_B32
+define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %xor = xor i32 %a, %b
+  %res = xor i32 %xor, -1
+  store i32 %res, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: name: divergent_xnor_i32
+; GCN: V_NOT_B32_e64
+; GCN: V_XOR_B32_e64
+define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+  %xor = xor i32 %a, %b
+  %res = xor i32 %xor, -1
+  ret i32 %res
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
--- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
+++ b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll
@@ -163,8 +163,8 @@
 ; GCN-NEXT:    v_xor_b32_e32 v1, v3, v1
 ; GCN-NEXT:    v_xor_b32_e32 v0, v2, v0
 ; GCN-NEXT:    s_waitcnt vmcnt(0)
-; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
 ; GCN-NEXT:    v_xnor_b32_e32 v1, v1, v5
+; GCN-NEXT:    v_xnor_b32_e32 v0, v0, v4
 ; GCN-NEXT:    global_store_dwordx2 v6, v[0:1], s[0:1]
 ; GCN-NEXT:    s_endpgm
 bb:
diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll
--- a/llvm/test/CodeGen/AMDGPU/xnor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xnor.ll
@@ -88,7 +88,7 @@
 
 ; GCN-LABEL: {{^}}xnor_s_v_i32_one_use
 ; GCN-NOT: s_xnor_b32
-; GCN: s_not_b32
+; GCN: v_not_b32
 ; GCN: v_xor_b32
 define amdgpu_kernel void @xnor_s_v_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
   %v = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -100,7 +100,7 @@
 
 ; GCN-LABEL: {{^}}xnor_v_s_i32_one_use
 ; GCN-NOT: s_xnor_b32
-; GCN: s_not_b32
+; GCN: v_not_b32
 ; GCN: v_xor_b32
 define amdgpu_kernel void @xnor_v_s_i32_one_use(i32 addrspace(1)* %out, i32 %s) {
   %v = call i32 @llvm.amdgcn.workitem.id.x() #1
@@ -112,9 +112,10 @@
 
 ; GCN-LABEL: {{^}}xnor_i64_s_v_one_use
 ; GCN-NOT: s_xnor_b64
-; GCN: s_not_b64
-; GCN: v_xor_b32
-; GCN: v_xor_b32
+; GCN-DAG: v_not_b32
+; GCN-DAG: v_not_b32
+; GCN-DAG: v_xor_b32
+; GCN-DAG: v_xor_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define amdgpu_kernel void @xnor_i64_s_v_one_use(
@@ -131,9 +132,10 @@
 
 ; GCN-LABEL: {{^}}xnor_i64_v_s_one_use
 ; GCN-NOT: s_xnor_b64
-; GCN: s_not_b64
-; GCN: v_xor_b32
-; GCN: v_xor_b32
+; GCN-DAG: v_not_b32
+; GCN-DAG: v_not_b32
+; GCN-DAG: v_xor_b32
+; GCN-DAG: v_xor_b32
 ; GCN-DL: v_xnor_b32
 ; GCN-DL: v_xnor_b32
 define amdgpu_kernel void @xnor_i64_v_s_one_use(
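
Note (context for reviewers, not part of the patch): UniformUnaryFrag<Op>, used in the SOP patterns above, is the pre-existing uniform-side counterpart of the DivergentUnaryFrag<Op> class this patch adds. For readers without the tree at hand, it is defined in llvm/lib/Target/AMDGPU/AMDGPUInstructions.td roughly as follows (a from-memory sketch, not verbatim):

class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
  (ops node:$src0),
  (Op $src0),
  // Match only when the node is uniform, i.e. the result can live in an SGPR.
  [{ return !N->isDivergent(); }]>;

Net effect: a one-use (not (xor x, y)) now selects S_XNOR_B32/S_XNOR_B64 only when the node is uniform. Divergent nodes select V_XNOR_B32_e64 on subtargets with HasDLInsts, and otherwise fall back to the V_NOT_B32_e64 plus V_XOR_B32_e64 expansion, which is what the updated xnor.ll checks and the new divergence-driven-xnor.ll test verify.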