Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -3818,6 +3818,7 @@
 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                      SDValue &ARMcc, SelectionDAG &DAG,
                                      const SDLoc &dl) const {
+  bool SwapOperands = false;
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     unsigned C = RHSC->getZExtValue();
     if (!isLegalICmpImmediate(C)) {
@@ -3854,9 +3855,20 @@
         break;
       }
     }
+  } else if ((LHS->getOpcode() == ISD::SHL || LHS->getOpcode() == ISD::SRL ||
+              LHS->getOpcode() == ISD::SRA || LHS->getOpcode() == ISD::ROTR) &&
+             (RHS->getOpcode() != ISD::SHL && RHS->getOpcode() != ISD::SRL &&
+              RHS->getOpcode() != ISD::SRA && RHS->getOpcode() != ISD::ROTR)) {
+    // Compare instructions can shift their second operand.
+    SwapOperands = true;
   }
 
   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  if (SwapOperands) {
+    CondCode = ARMCC::getOppositeCondition(CondCode);
+    std::swap(LHS, RHS);
+  }
+
   ARMISD::NodeType CompareType;
   switch (CondCode) {
   default:
@@ -9960,6 +9972,90 @@
   return SDValue();
 }
 
+static SDValue PerformUnfoldSHL(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *ST) {
+  if (ST->isThumb() && ST->isThumb1Only())
+    return SDValue();
+
+  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR)
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::SHL)
+    return SDValue();
+
+  SDValue SHL = N->getOperand(0);
+
+  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!C1ShlC2 || !C2)
+    return SDValue();
+
+  DEBUG(dbgs() << "Try to unfold: "; N->dump());
+  // DAG combiner will fold:
+  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+
+  // Many instructions can perform the shift for free, but they require both
+  // operands to be registers. If c1 << c2 is too large, a mov immediate
+  // instruction will be needed. So, unfold back to the original pattern if:
+  // - c1 and c2 are small enough that they don't require mov imms.
+  // - the user(s) of the node can perform a shl.
+
+  APInt C2Int = C2->getAPIntValue();
+  APInt C1Int = C1ShlC2->getAPIntValue();
+
+  // Check that C1 could have been shl'd by C2.
+  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
+                                     C2Int.getBitWidth() - C2->getZExtValue());
+  if ((C1Int & Mask) != C1Int)
+    return SDValue();
+
+  // Undo c1 << c2
+  C1Int.lshrInPlace(C2Int);
+
+  // The immediates are encoded as an 8-bit value that can be rotated.
+  unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
+  if (C1Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
+  if (C2Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  // Check that all the users could perform the shl themselves.
+  SDValue BinOp = SDValue(N, 0);
+  for (auto U : N->uses()) {
+    switch(U->getOpcode()) {
+    default:
+      return SDValue();
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::SETCC:
+    case ARMISD::CMP:
+      break;
+    }
+  }
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue X = SHL.getOperand(0);
+  BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
+                      DAG.getConstant(C1Int, dl, MVT::i32));
+  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+
+  DEBUG(dbgs() << "Unfolding: "; N->dump());
+  DEBUG(dbgs() << "Into shift operation:\n";
+        BinOp.dump();
+        Res.dump());
+
+  DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
+  return SDValue(N, 0);
+}
+
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
@@ -9968,6 +10064,10 @@
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Only works one way, because it needs an immediate operand.
+  if (SDValue Result = PerformUnfoldSHL(N, DCI, Subtarget))
+    return Result;
+
   // First try with the default operand order.
   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
     return Result;
@@ -10162,10 +10262,11 @@
   return SDValue();
 }
 
-// Try combining OR nodes to SMULWB, SMULWT.
 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
+
+  // Try combining OR nodes to SMULWB, SMULWT.
   if (!Subtarget->hasV6Ops() ||
       (Subtarget->isThumb() &&
        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
@@ -10223,95 +10324,18 @@
   return SDValue(OR, 0);
 }
 
-/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
-static SDValue PerformORCombine(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI,
-                                const ARMSubtarget *Subtarget) {
-  // Attempt to use immediate-form VORR
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-  SelectionDAG &DAG = DCI.DAG;
-
-  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN && Subtarget->hasNEON() &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize <= 64) {
-      EVT VorrVT;
-      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
-                                      SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, dl, VorrVT, VT.is128BitVector(),
-                                      OtherModImm);
-      if (Val.getNode()) {
-        SDValue Input =
-          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
-        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
-      }
-    }
-  }
-
-  if (!Subtarget->isThumb1Only()) {
-    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
-    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
-      return Result;
-    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
-      return Result;
-  }
-
-  // The code below optimizes (or (and X, Y), Z).
-  // The AND operand needs to have a single user to make these optimizations
-  // profitable.
-  SDValue N0 = N->getOperand(0);
-  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
-    return SDValue();
-  SDValue N1 = N->getOperand(1);
-
-  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
-  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
-      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
-    APInt SplatUndef;
-    unsigned SplatBitSize;
-    bool HasAnyUndefs;
-
-    APInt SplatBits0, SplatBits1;
-    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
-    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
-    // Ensure that the second operand of both ands are constants
-    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
-                                      HasAnyUndefs) && !HasAnyUndefs) {
-      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
-                                        HasAnyUndefs) && !HasAnyUndefs) {
-        // Ensure that the bit width of the constants are the same and that
-        // the splat arguments are logical inverses as per the pattern we
-        // are trying to simplify.
-        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
-            SplatBits0 == ~SplatBits1) {
-          // Canonicalize the vector type to make instruction selection
-          // simpler.
-          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
-          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
-                                       N0->getOperand(1),
-                                       N0->getOperand(0),
-                                       N1->getOperand(0));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
-        }
-      }
-    }
-  }
-
-  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
-  // reasonable.
+static SDValue PerformBFICombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
   // BFI is only available on V6T2+
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
   // 1) or (and A, mask), val => ARMbfi A, val, mask
   //    iff (val & mask) == val
@@ -10355,7 +10379,7 @@
 
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
-      return SDValue();
+      return SDValue(N, 0);
     }
   } else if (N1.getOpcode() == ISD::AND) {
     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
@@ -10381,7 +10405,7 @@
                         DAG.getConstant(Mask, DL, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
-      return SDValue();
+      return SDValue(N, 0);
     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
                (~Mask == Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
@@ -10397,7 +10421,7 @@
                         DAG.getConstant(Mask2, DL, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
-      return SDValue();
+      return SDValue(N, 0);
     }
   }
 
@@ -10417,8 +10441,107 @@
     // Do not add new nodes to DAG combiner worklist.
     DCI.CombineTo(N, Res, false);
+    return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
+/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
+static SDValue PerformORCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *Subtarget) {
+
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, dl, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
+      return Result;
+    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
+      return Result;
   }
 
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+
+    // The code below optimizes (or (and X, Y), Z).
+    // The AND operand needs to have a single user to make these optimizations
+    // profitable.
+    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+      return SDValue();
+
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    APInt SplatBits0, SplatBits1;
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+    // Ensure that the second operand of both ands are constants
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                        HasAnyUndefs) && !HasAnyUndefs) {
+        // Ensure that the bit width of the constants are the same and that
+        // the splat arguments are logical inverses as per the pattern we
+        // are trying to simplify.
+        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+            SplatBits0 == ~SplatBits1) {
+          // Canonicalize the vector type to make instruction selection
+          // simpler.
+          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                       N0->getOperand(1),
+                                       N0->getOperand(0),
+                                       N1->getOperand(0));
+          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+        }
+      }
+    }
+  }
+
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+    if (SDValue Result = PerformBFICombine(N, DCI, Subtarget))
+      return Result;
+  }
+
+  if (SDValue Result = PerformUnfoldSHL(N, DCI, Subtarget))
+    return Result;
+
   return SDValue();
 }
Index: test/CodeGen/ARM/unfold-shifts.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/unfold-shifts.ll
@@ -0,0 +1,143 @@
+; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2
+
+; CHECK-LABEL: unfold1
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 1
+  %shl = or i32 %or, 510
+  %add = add nsw i32 %shl, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold2
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #4080
+; CHECK: sub r0, r1, r0, lsl #2
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #4080
+; CHECK-T2: sub.w r0, r1, r0, lsl #2
+define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 2
+  %shl = or i32 %or, 16320
+  %sub = sub nsw i32 %b, %shl
+  ret i32 %sub
+}
+
+; CHECK-LABEL: unfold3
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #65280
+; CHECK: and r0, r1, r0, lsl #4
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #65280
+; CHECK-T2: and.w r0, r1, r0, lsl #4
+define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 4
+  %shl = or i32 %or, 1044480
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold4
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #1044480
+; CHECK: eor r0, r1, r0, lsl #5
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #1044480
+; CHECK-T2: eor.w r0, r1, r0, lsl #5
+define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 5
+  %shl = or i32 %or, 33423360
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold5
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #496
+; CHECK: orr r0, r1, r0, lsl #6
+; CHECK-T2: add.w r0, r0, #496
+; CHECK-T2: orr.w r0, r1, r0, lsl #6
+define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 6
+  %shl = add i32 %add, 31744
+  %or = or i32 %shl, %b
+  ret i32 %or
+}
+
+; CHECK-LABEL: unfold6
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #7936
+; CHECK: and r0, r1, r0, lsl #8
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #7936
+; CHECK-T2: and.w r0, r1, r0, lsl #8
+define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 8
+  %shl = add i32 %add, 2031616
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold7
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #126976
+; CHECK: eor r0, r1, r0, lsl #9
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #126976
+; CHECK-T2: eor.w r0, r1, r0, lsl #9
+define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 9
+  %shl = add i32 %add, 65011712
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold8
+; CHECK-NOT: mov r2
+; CHECK: orr r2, r0, #4080
+; CHECK: cmp r1, r2, lsl #10
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: orr r2, r0, #4080
+; CHECK-T2: cmp.w r1, r2, lsl #10
+define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 10
+  %shl = or i32 %or, 4177920
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: unfold9
+; CHECK-NOT: mov r2
+; CHECK: add r2, r0, #7936
+; CHECK: cmp r1, r2, lsl #11
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: add.w r2, r0, #7936
+; CHECK-T2: cmp.w r1, r2, lsl #11
+define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 11
+  %shl = add i32 %add, 16252928
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
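
For reference only, and not part of the patch: the profitability test in PerformUnfoldSHL above boils down to rejecting any constant whose set bits span more than 8 bits (countLeadingZeros() plus countTrailingZeros() compared against the bit width), which is a rough stand-in for the rotated 8-bit immediate encodings of ARM/Thumb2 data-processing instructions. The standalone C++ sketch below mirrors that check; the helper names (significantSpan, cheapImmediate) are hypothetical and chosen for illustration. It exercises the constants used by unfold1 and unfold8 in the new test.

// Standalone sketch of the "small enough that it doesn't require a mov imm"
// test used before unfolding (or (shl x, c2), c1 << c2) back into
// (shl (or x, c1), c2). Not LLVM code; plain C++17.
#include <cstdint>
#include <iostream>

// Width of the window between the highest and lowest set bits, i.e.
// bitwidth - (leading zeros + trailing zeros) for a non-zero value.
static unsigned significantSpan(uint32_t C) {
  if (C == 0)
    return 0;
  unsigned Hi = 31, Lo = 0;
  while (!(C & (1u << Hi))) --Hi; // find the highest set bit
  while (!(C & (1u << Lo))) ++Lo; // find the lowest set bit
  return Hi - Lo + 1;
}

// Mirrors the patch's heuristic: the constant is considered cheap when its
// set bits fit in an 8-bit window.
static bool cheapImmediate(uint32_t C) {
  return significantSpan(C) <= 8;
}

int main() {
  // unfold1: 510 == 255 << 1; both c1 = 255 and c2 = 1 pass, so the combine
  // may rewrite (or (shl x, 1), 510) as (shl (or x, 255), 1) and let the
  // user fold the shl into its shifted operand.
  std::cout << cheapImmediate(255) << ' ' << cheapImmediate(1) << '\n';   // 1 1
  // unfold8: 4177920 == 4080 << 10; 4080 (0xFF0) spans exactly 8 bits.
  std::cout << cheapImmediate(4080) << '\n';                              // 1
  // A constant whose set bits span 17 bits would block the unfold.
  std::cout << cheapImmediate(0x00010001u) << '\n';                       // 0
  return 0;
}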