diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -3290,6 +3290,14 @@ return false; } + /// Return true if the DAG combiner should reassociate + /// (op (op x, c1), y) -> (op (op x, y), c1); N0 is (op x, c1), N1 is y. + /// The default only requires that N0 has a single use. + virtual bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + return N0.hasOneUse(); + } + virtual bool isSDNodeAlwaysUniform(const SDNode * N) const { return false; } diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -1070,7 +1070,7 @@ return DAG.getNode(Opc, DL, VT, N00, OpNode); return SDValue(); } - if (N0.hasOneUse()) { + if (TLI.isReassocProfitable(DAG, N0, N1)) { // Reassociate: (op (op x, c1), y) -> (op (op x, y), c1) // iff (op x, c1) has one use if (SDValue OpNode = DAG.getNode(Opc, SDLoc(N0), VT, N00, N1)) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h --- a/llvm/lib/Target/AMDGPU/SIISelLowering.h +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h @@ -449,6 +449,11 @@ bool isSDNodeSourceOfDivergence(const SDNode *N, FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override; + bool hasMemSDNodeUser(SDNode *N) const; + + bool isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const override; + bool isCanonicalized(SelectionDAG &DAG, SDValue Op, unsigned MaxDepth = 5) const; bool isCanonicalized(Register Reg, MachineFunction &MF, diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9550,6 +9550,9 @@ SDValue SITargetLowering::performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const { + if (SDValue RV = reassociateScalarOps(N, DCI.DAG)) + return RV; + EVT VT = N->getValueType(0); if (VT != MVT::i64) return SDValue(); 
@@ -10462,6 +10465,9 @@ if (VT != MVT::i32 && VT != MVT::i64) return SDValue(); + if (DAG.isBaseWithConstantOffset(SDValue(N, 0))) + return SDValue(); + unsigned Opc = N->getOpcode(); SDValue Op0 = N->getOperand(0); SDValue Op1 = N->getOperand(1); @@ -10483,12 +10489,6 @@ if (Op1->isDivergent()) std::swap(Op1, Op2); - // If either operand is constant this will conflict with - // DAGCombiner::ReassociateOps(). - if (DAG.isConstantIntBuildVectorOrConstantInt(Op0) || - DAG.isConstantIntBuildVectorOrConstantInt(Op1)) - return SDValue(); - SDLoc SL(N); SDValue Add1 = DAG.getNode(Opc, SL, VT, Op0, Op1); return DAG.getNode(Opc, SL, VT, Add1, Op2); @@ -12477,3 +12477,36 @@ Cost.first += (Size + 255) / 256; return Cost; } + +/// Return true if at least one user of \p N is a MemSDNode. +bool llvm::SITargetLowering::hasMemSDNodeUser(SDNode *N) const { + + SDNode::use_iterator I = N->use_begin(), E = N->use_end(); + for (; I != E; ++I) { + if (isa<MemSDNode>(*I)) + return true; + } + + return false; +} + +/// Reassociation is profitable only when N0 has one use and one of: N0 is +/// divergent or N1 is uniform; N0 is base+constant-offset and its user has a +/// MemSDNode user; or N1 is a commutative binop with a constant operand 1. +bool llvm::SITargetLowering::isReassocProfitable(SelectionDAG &DAG, SDValue N0, + SDValue N1) const { + if (N0.hasOneUse()) { + // Take care of the opportunity to keep N0 uniform + if (!(!N0->isDivergent() && N1->isDivergent())) + return true; + // Check if we have a good chance to form the memory access pattern with the + // base and offset + if (DAG.isBaseWithConstantOffset(N0) && hasMemSDNodeUser(*N0->use_begin())) + return true; + // Check if we have a good chance for the further constant folding. 
+ if (isCommutativeBinOp(N1->getOpcode()) && + DAG.isConstantIntBuildVectorOrConstantInt(N1->getOperand(1))) + return true; + } + return false; +} diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -551,11 +551,11 @@ >; def S_XNOR_B32 : SOP2_32 <"s_xnor_b32", - [(set i32:$sdst, (not (xor_oneuse i32:$src0, i32:$src1)))] + [(set i32:$sdst, (UniformUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1)))] >; def S_XNOR_B64 : SOP2_64 <"s_xnor_b64", - [(set i64:$sdst, (not (xor_oneuse i64:$src0, i64:$src1)))] + [(set i64:$sdst, (UniformUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1)))] >; def S_NAND_B32 : SOP2_32 <"s_nand_b32", diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -637,9 +637,9 @@ ) >; -def : divergent_i64_BinOp <and, V_AND_B32_e32>; -def : divergent_i64_BinOp <or, V_OR_B32_e32>; -def : divergent_i64_BinOp <xor, V_XOR_B32_e32>; +def : divergent_i64_BinOp <and, V_AND_B32_e64>; +def : divergent_i64_BinOp <or, V_OR_B32_e64>; +def : divergent_i64_BinOp <xor, V_XOR_B32_e64>; let SubtargetPredicate = Has16BitInsts in { @@ -688,6 +688,36 @@ let isReMaterializable = 1 in defm V_XNOR_B32 : VOP2Inst <"v_xnor_b32", VOP_I32_I32_I32, xnor>; +def : GCNPat< + (i32 (DivergentUnaryFrag<not> (xor_oneuse i32:$src0, i32:$src1))), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i32 (DivergentBinFrag<xor_oneuse> (not i32:$src0), i32:$src1)), + (i32 (V_XNOR_B32_e64 $src0, $src1)) +>; + +def : GCNPat< + (i64 (DivergentUnaryFrag<not> (xor_oneuse i64:$src0, i64:$src1))), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + +def : GCNPat< + (i64 (DivergentBinFrag<xor_oneuse> (not i64:$src0), i64:$src1)), + (REG_SEQUENCE VReg_64, (i32 (V_XNOR_B32_e64 + 
(i32 (EXTRACT_SUBREG $src0, sub0)), + (i32 (EXTRACT_SUBREG $src1, sub0)))), sub0, + (i32 (V_XNOR_B32_e64 + (i32 (EXTRACT_SUBREG $src0, sub1)), + (i32 (EXTRACT_SUBREG $src1, sub1)))), sub1) +>; + let Constraints = "$vdst = $src2", DisableEncoding = "$src2", isConvertibleToThreeAddress = 1, diff --git a/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/divergence-driven-xnor.ll @@ -0,0 +1,44 @@ +; RUN: llc -march=amdgcn -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -march=amdgcn -mcpu=gfx906 -stop-after=amdgpu-isel < %s | FileCheck -check-prefix=GCN_DL %s + +; GCN-LABEL: name: uniform_xnor_i64 +; GCN: S_XNOR_B64 +define amdgpu_kernel void @uniform_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %xor = xor i64 %a, %b + %res = xor i64 %xor, -1 + store i64 %res, i64 addrspace(1)* %out + ret void +} +; GCN-LABEL: name: divergent_xnor_i64 +; GCN: V_XOR_B32_e64 +; GCN: V_XOR_B32_e64 +; GCN: V_NOT_B32_e32 +; GCN: V_NOT_B32_e32 +; GCN_DL: V_XNOR_B32_e64 +; GCN_DL: V_XNOR_B32_e64 +define i64 @divergent_xnor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) { + %xor = xor i64 %a, %b + %res = xor i64 %xor, -1 + ret i64 %res +} + +; GCN-LABEL: name: uniform_xnor_i32 +; GCN: S_XNOR_B32 +define amdgpu_kernel void @uniform_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %xor = xor i32 %a, %b + %res = xor i32 %xor, -1 + store i32 %res, i32 addrspace(1)* %out + ret void +} + +; GCN-LABEL: name: divergent_xnor_i32 +; GCN: V_XOR_B32_e64 +; GCN: V_NOT_B32_e32 +; GCN_DL: V_XNOR_B32_e64 +define i32 @divergent_xnor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) { + %xor = xor i32 %a, %b + %res = xor i32 %xor, -1 + ret i32 %res +} + +declare i32 @llvm.amdgcn.workitem.id.x() #0 diff --git a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll --- a/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fused-bitlogic.ll @@ -163,8 +163,8 @@ ; GCN-NEXT: v_xor_b32_e32 v1, v3, v1 ; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: v_xnor_b32_e32 v1, v1, v5 +; GCN-NEXT: v_xnor_b32_e32 v0, v0, v4 ; GCN-NEXT: global_store_dwordx2 v6, v[0:1], s[0:1] ; GCN-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -472,10 +472,10 @@ ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O0-NEXT: v_or_b32_e32 v0, v0, v3 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v0, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 -; GFX9-O0-NEXT: v_or_b32_e32 v6, v1, v2 +; GFX9-O0-NEXT: v_or_b32_e64 v6, v1, v2 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/xnor.ll b/llvm/test/CodeGen/AMDGPU/xnor.ll --- a/llvm/test/CodeGen/AMDGPU/xnor.ll +++ b/llvm/test/CodeGen/AMDGPU/xnor.ll @@ -61,8 +61,8 @@ ; GCN-LABEL: {{^}}vector_xnor_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xnor_i32_one_use(i32 %a, i32 %b) { entry: @@ -73,10 +73,10 @@ ; GCN-LABEL: {{^}}vector_xnor_i64_one_use ; GCN-NOT: s_xnor_b64 -; GCN: v_not_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 ; GCN-DL: v_xnor_b32 define i64 @vector_xnor_i64_one_use(i64 %a, i64 %b) { @@ -150,8 +150,8 @@ ; GCN-LABEL: {{^}}vector_xor_na_b_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 
@vector_xor_na_b_i32_one_use(i32 %a, i32 %b) { entry: @@ -162,8 +162,8 @@ ; GCN-LABEL: {{^}}vector_xor_a_nb_i32_one_use ; GCN-NOT: s_xnor_b32 -; GCN: v_not_b32 ; GCN: v_xor_b32 +; GCN: v_not_b32 ; GCN-DL: v_xnor_b32 define i32 @vector_xor_a_nb_i32_one_use(i32 %a, i32 %b) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/xor3.ll b/llvm/test/CodeGen/AMDGPU/xor3.ll --- a/llvm/test/CodeGen/AMDGPU/xor3.ll +++ b/llvm/test/CodeGen/AMDGPU/xor3.ll @@ -26,13 +26,13 @@ define amdgpu_ps float @xor3_vgpr_b(i32 inreg %a, i32 %b, i32 inreg %c) { ; GFX9-LABEL: xor3_vgpr_b: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 -; GFX9-NEXT: v_xor_b32_e32 v0, s3, v0 +; GFX9-NEXT: s_xor_b32 s0, s3, s2 +; GFX9-NEXT: v_xor_b32_e32 v0, s0, v0 ; GFX9-NEXT: ; return to shader part epilog ; ; GFX10-LABEL: xor3_vgpr_b: ; GFX10: ; %bb.0: -; GFX10-NEXT: v_xor3_b32 v0, s2, v0, s3 +; GFX10-NEXT: v_xor3_b32 v0, s3, s2, v0 ; GFX10-NEXT: ; return to shader part epilog %x = xor i32 %a, %b %result = xor i32 %x, %c