Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -646,6 +646,9 @@
     SDValue expandVSXLoadForLE(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue expandVSXStoreForLE(SDNode *N, DAGCombinerInfo &DCI) const;
 
+    SDValue extendSubTreeForBitPermutation(SDNode *N,
+                                           DAGCombinerInfo &DCI) const;
+
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
 
     SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -10932,11 +10932,84 @@
   return SDValue(N, 0);
 }
 
+// This method tries to increase the opportunity to generate rotate-and-mask
+// instructions (e.g. rlwinm) in tryBitPermutation by reordering ZEXT and ANDI.
+// Since tryBitPermutation stops analyzing nodes if it hits a ZEXT node while
+// traversing SDNodes, we want to avoid ZEXT between two nodes that can be
+// folded into a rotate-and-mask instruction.
+//
+// For example, we modify these nodes
+//   t9: i32 = add t7, Constant:i32<1>
+//   t11: i32 = and t9, Constant:i32<255>
+//   t12: i64 = zero_extend t11
+//   t14: i64 = shl t12, Constant:i64<2>
+// into
+//   t9: i32 = add t7, Constant:i32<1>
+//   t25: i64 = any_extend t9
+//   t27: i64 = and t25, Constant:i64<255>
+//   t14: i64 = shl t27, Constant:i64<2>
+// to fold t27 and t14 into a rotate-and-mask instruction.
+// Such cases often occur in array accesses with a logical AND operation in
+// the index, e.g. array[i & 0xFF].
+//
+// We modify nodes only if the parent of the ZEXT node (t14 in the example) has
+// a logical opcode supported in tryBitPermutation and the first operand of the
+// AND node (t9 in the example) is not a supported logical opcode.
+SDValue
+PPCTargetLowering::extendSubTreeForBitPermutation(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Opcodes this combine treats as supported by tryBitPermutation.
+  auto IsSupportedInBitPermutation = [](unsigned Opcode) {
+    return (Opcode == ISD::AND || Opcode == ISD::OR || Opcode == ISD::ROTL ||
+            Opcode == ISD::SHL || Opcode == ISD::SRL);
+  };
+
+  // We currently support only the case with zero extension from i32 to i64.
+  // Also we do not optimize if ZEXT or ANDI node has multiple uses.
+  if (!Subtarget.isPPC64() || N->getOpcode() != ISD::ZERO_EXTEND ||
+      N->getValueType(0) != MVT::i64 || !N->hasOneUse())
+    return SDValue();
+
+  // The ZEXT input must be a single-use i32 AND with a constant mask whose
+  // other operand is not itself an opcode supported in tryBitPermutation.
+  SDValue OperandVal = N->getOperand(0);
+  if (!OperandVal.hasOneUse() || OperandVal.getValueType() != MVT::i32 ||
+      OperandVal.getOpcode() != ISD::AND ||
+      !isa<ConstantSDNode>(OperandVal.getOperand(1)) ||
+      IsSupportedInBitPermutation(OperandVal.getOperand(0).getOpcode()))
+    return SDValue();
+
+  // If the parent of zext node is not supported in tryBitPermutation,
+  // we do not do combining.
+  if (!IsSupportedInBitPermutation((*N->use_begin())->getOpcode()))
+    return SDValue();
+
+  // Only masks confined to the low 32 bits are handled, so that the i64 AND
+  // below zeroes every bit above bit 31 of the any-extended value.
+  uint64_t Mask = OperandVal.getConstantOperandVal(1);
+  if ((Mask & 0xFFFFFFFF00000000uLL) != 0)
+    return SDValue();
+
+  // Replace zext(and(x, Mask)) with and(any_extend(x), Mask) computed in i64.
+  SDValue NewExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i64,
+                                  OperandVal.getOperand(0));
+  return DAG.getNode(ISD::AND, DL, MVT::i64, NewExtVal,
+                     DAG.getConstant(Mask, DL, MVT::i64));
+}
+
 SDValue PPCTargetLowering::DAGCombineExtBoolTrunc(SDNode *N,
                                                   DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
   SDLoc dl(N);
 
+  // We try to reorder ZEXT and ANDI to increase the opportunity to generate
+  // rotate-and-mask instructions (e.g. rlwinm) in tryBitPermutation.
+  if (SDValue RV = extendSubTreeForBitPermutation(N, DCI))
+    return RV;
+
   // If we're tracking CR bits, we need to be careful that we don't have:
   //   zext(binary-ops(trunc(x), trunc(y)))
   // or
Index: test/CodeGen/PowerPC/zext-bitperm.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/zext-bitperm.ll
@@ -0,0 +1,23 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
+
+; Test case for PPCTargetLowering::extendSubTreeForBitPermutation.
+; We expect the mask and the shift to be folded into a single rlwinm instruction.
+
+
+define zeroext i32 @func(i32* %p, i32 zeroext %i) {
+; CHECK-LABEL: @func
+; CHECK: addi [[REG1:[0-9]+]], 4, 1
+; CHECK: rlwinm [[REG2:[0-9]+]], [[REG1]], 2, 22, 29
+; CHECK-NOT: sldi
+; CHECK: lwzx 3, 3, [[REG2]]
+; CHECK: blr
+entry:
+  %add = add i32 %i, 1
+  %and = and i32 %add, 255
+  %idxprom = zext i32 %and to i64
+  %arrayidx = getelementptr inbounds i32, i32* %p, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+