Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -1177,6 +1177,8 @@
   if (Subtarget->hasV6Ops())
     setTargetDAGCombine(ISD::SRL);
+  if (Subtarget->isThumb1Only())
+    setTargetDAGCombine(ISD::SHL);
 
   setStackPointerRegisterToSaveRestore(ARM::SP);
 
@@ -10419,8 +10421,25 @@
   if (Level == BeforeLegalizeTypes)
     return true;
 
-  if (Subtarget->isThumb() && Subtarget->isThumb1Only())
+  if (Subtarget->isThumb() && Subtarget->isThumb1Only()) {
+    // Avoid making expensive immediates by commuting shifts. (This logic
+    // only applies to Thumb1 because ARM and Thumb2 immediates can be shifted
+    // for free.)
+    if (N->getOpcode() != ISD::SHL)
+      return true;
+    SDValue N1 = N->getOperand(0);
+    if (N1->getOpcode() != ISD::ADD && N1->getOpcode() != ISD::AND &&
+        N1->getOpcode() != ISD::OR && N1->getOpcode() != ISD::XOR)
+      return true;
+    if (auto *Const = dyn_cast<ConstantSDNode>(N1->getOperand(1))) {
+      if (Const->getAPIntValue().ult(256))
+        return false;
+      if (N1->getOpcode() == ISD::ADD && Const->getAPIntValue().slt(0) &&
+          Const->getAPIntValue().sgt(-256))
+        return false;
+    }
     return true;
+  }
 
   if (N->getOpcode() != ISD::SHL)
     return true;
@@ -12420,8 +12439,10 @@
 /// combining instead of DAG legalizing because the build_vectors for 64-bit
 /// vector element shift counts are generally not legal, and it is hard to see
 /// their values after they get legalized to loads from a constant pool.
-static SDValue PerformShiftCombine(SDNode *N, SelectionDAG &DAG,
+static SDValue PerformShiftCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
                                    const ARMSubtarget *ST) {
+  SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
   if (N->getOpcode() == ISD::SRL && VT == MVT::i32 && ST->hasV6Ops()) {
     // Canonicalize (srl (bswap x), 16) to (rotr (bswap x), 16) if the high
@@ -12436,6 +12457,40 @@
     }
   }
 
+  if (ST->isThumb1Only() &&
+      N->getOpcode() == ISD::SHL && VT == MVT::i32 &&
+      N->getOperand(0)->getOpcode() == ISD::AND &&
+      N->getOperand(0)->hasOneUse()) {
+    if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+      return SDValue();
+    // Look for the pattern (shl (and x, AndMask), ShiftAmt). This doesn't
+    // usually show up because instcombine prefers to canonicalize it to
+    // (and (shl x, ShiftAmt) (shl AndMask, ShiftAmt)), but the shift can come
+    // out of GEP lowering in some cases.
+    SDValue N0 = N->getOperand(0);
+    ConstantSDNode *ShiftAmtNode = dyn_cast<ConstantSDNode>(N->getOperand(1));
+    if (!ShiftAmtNode)
+      return SDValue();
+    uint32_t ShiftAmt = (uint32_t)ShiftAmtNode->getZExtValue();
+    ConstantSDNode *AndMaskNode = dyn_cast<ConstantSDNode>(N0->getOperand(1));
+    if (!AndMaskNode)
+      return SDValue();
+    uint32_t AndMask = (uint32_t)AndMaskNode->getZExtValue();
+    // Don't transform uxtb/uxth.
+    if (AndMask == 255 || AndMask == 65535)
+      return SDValue();
+    if (isMask_32(AndMask)) {
+      uint32_t MaskedBits = countLeadingZeros(AndMask);
+      if (MaskedBits > ShiftAmt) {
+        SDLoc DL(N);
+        SDValue SHL = DAG.getNode(ISD::SHL, DL, MVT::i32, N0->getOperand(0),
+                                  DAG.getConstant(MaskedBits, DL, MVT::i32));
+        return DAG.getNode(ISD::SRL, DL, MVT::i32, SHL,
+                           DAG.getConstant(MaskedBits - ShiftAmt, DL,
+                                           MVT::i32));
+      }
+    }
+  }
+
   // Nothing to be done for scalar shifts.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (!VT.isVector() || !TLI.isTypeLegal(VT))
@@ -12854,7 +12909,7 @@
   case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
   case ISD::SHL:
   case ISD::SRA:
-  case ISD::SRL: return PerformShiftCombine(N, DCI.DAG, Subtarget);
+  case ISD::SRL: return PerformShiftCombine(N, DCI, Subtarget);
   case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND:
   case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
Index: test/CodeGen/Thumb/shift-and.ll
===================================================================
--- test/CodeGen/Thumb/shift-and.ll
+++ test/CodeGen/Thumb/shift-and.ll
@@ -188,3 +188,73 @@
   %shl = shl i32 %shr, 3
   ret i32 %shl
 }
+
+define i32 @test16(i32 %x) {
+; CHECK-LABEL: test16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsls r0, r0, #28
+; CHECK-NEXT:    lsrs r0, r0, #26
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and i32 %x, 15
+  %shl = shl i32 %0, 2
+  ret i32 %shl
+}
+
+define i32* @test17(i32* %p, i32 %x) {
+; CHECK-LABEL: test17:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = and i32 %x, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
+
+define i32* @test18(i32* %p, i32 %x) {
+; CHECK-LABEL: test18:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    adds r1, r1, #1
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = add i32 %x, 1
+  %0 = and i32 %xx, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
+
+define i32* @test19(i32* %p, i32 %x) {
+; CHECK-LABEL: test19:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    subs r1, r1, #1
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = sub i32 %x, 1
+  %0 = and i32 %xx, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
+
+define i32* @test20(i32* %p, i32 %x) {
+; CHECK-LABEL: test20:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    subs r1, r1, #1
+; CHECK-NEXT:    lsls r1, r1, #28
+; CHECK-NEXT:    lsrs r1, r1, #26
+; CHECK-NEXT:    adds r0, r0, r1
+; CHECK-NEXT:    bx lr
+entry:
+  %xx = add i32 %x, 15
+  %0 = and i32 %xx, 15
+  %shl = getelementptr i32, i32* %p, i32 %0
+  ret i32* %shl
+}
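
Note for reviewers (not part of the patch): a standalone C++ sanity check of the bit
arithmetic behind the new (shl (and x, AndMask), ShiftAmt) combine. It confirms that
when AndMask is a mask of the low 32 - MaskedBits bits (the isMask_32 case, so
countLeadingZeros(AndMask) == MaskedBits) and MaskedBits > ShiftAmt, the emitted
(srl (shl x, MaskedBits), MaskedBits - ShiftAmt) pair computes the same value as the
original and+shl, which is exactly the lsls/lsrs sequence the new tests expect.

    // Sanity check for the (shl (and x, AndMask), ShiftAmt) combine:
    //   (x & AndMask) << ShiftAmt == (x << MaskedBits) >> (MaskedBits - ShiftAmt)
    // for unsigned 32-bit x, mirroring the combine's guards above.
    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t TestValues[] = {0u, 1u, 15u, 255u, 0xDEADBEEFu, 0xFFFFFFFFu};
      for (uint32_t MaskedBits = 1; MaskedBits < 32; ++MaskedBits) {
        // Mask of the low 32 - MaskedBits bits; countLeadingZeros(AndMask)
        // would return MaskedBits for this value.
        uint32_t AndMask = 0xFFFFFFFFu >> MaskedBits;
        for (uint32_t ShiftAmt = 0; ShiftAmt < MaskedBits; ++ShiftAmt) {
          for (uint32_t X : TestValues) {
            uint32_t Before = (X & AndMask) << ShiftAmt;                   // and + shl
            uint32_t After = (X << MaskedBits) >> (MaskedBits - ShiftAmt); // lsls + lsrs
            assert(Before == After);
          }
        }
      }
      return 0;
    }

For instance, test16 above is the MaskedBits = 28 (AndMask = 15), ShiftAmt = 2 case,
yielding lsls #28 followed by lsrs #26.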