Index: lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- lib/Target/ARM/ARMISelLowering.cpp
+++ lib/Target/ARM/ARMISelLowering.cpp
@@ -3818,6 +3818,7 @@
 SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                                      SDValue &ARMcc, SelectionDAG &DAG,
                                      const SDLoc &dl) const {
+  bool SwapOperands = false;
   if (ConstantSDNode *RHSC = dyn_cast<ConstantSDNode>(RHS.getNode())) {
     unsigned C = RHSC->getZExtValue();
     if (!isLegalICmpImmediate(C)) {
@@ -3854,9 +3855,20 @@
         break;
       }
     }
+  } else if ((LHS->getOpcode() == ISD::SHL || LHS->getOpcode() == ISD::SRL ||
+              LHS->getOpcode() == ISD::SRA || LHS->getOpcode() == ISD::ROTR) &&
+             (RHS->getOpcode() != ISD::SHL && RHS->getOpcode() != ISD::SRL &&
+              RHS->getOpcode() != ISD::SRA && RHS->getOpcode() != ISD::ROTR)) {
+    // Compare instructions can shift their second operand.
+    SwapOperands = true;
   }
 
   ARMCC::CondCodes CondCode = IntCCToARMCC(CC);
+  if (SwapOperands) {
+    CondCode = ARMCC::getOppositeCondition(CondCode);
+    std::swap(LHS, RHS);
+  }
+
   ARMISD::NodeType CompareType;
   switch (CondCode) {
   default:
@@ -9960,6 +9972,90 @@
   return SDValue();
 }
 
+static SDValue PerformUnfoldSHL(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *ST) {
+  if (ST->isThumb() && ST->isThumb1Only())
+    return SDValue();
+
+  if (N->getOpcode() != ISD::ADD && N->getOpcode() != ISD::OR)
+    return SDValue();
+
+  if (N->getOperand(0).getOpcode() != ISD::SHL)
+    return SDValue();
+
+  SDValue SHL = N->getOperand(0);
+
+  auto *C1ShlC2 = dyn_cast<ConstantSDNode>(N->getOperand(1));
+  auto *C2 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
+  if (!C1ShlC2 || !C2)
+    return SDValue();
+
+  DEBUG(dbgs() << "Try to unfold: "; N->dump());
+  // DAG combiner will fold:
+  // (shl (add x, c1), c2) -> (add (shl x, c2), c1 << c2)
+  // (shl (or x, c1), c2) -> (or (shl x, c2), c1 << c2)
+
+  // Many instructions can perform the shift for free, but they require both
+  // operands to be registers. If c1 << c2 is too large, a mov immediate
+  // instruction will be needed. So, unfold back to the original pattern if:
+  // - c1 and c2 are small enough that they don't require mov imms.
+  // - the user(s) of the node can perform a shl.
+
+  APInt C2Int = C2->getAPIntValue();
+  APInt C1Int = C1ShlC2->getAPIntValue();
+
+  // Check that C1 could have been shl'd by C2.
+  APInt Mask = APInt::getHighBitsSet(C2Int.getBitWidth(),
+                                     C2Int.getBitWidth() - C2->getZExtValue());
+  if ((C1Int & Mask) != C1Int)
+    return SDValue();
+
+  // Undo c1 << c2
+  C1Int.lshrInPlace(C2Int);
+
+  // The immediates are encoded as an 8-bit value that can be rotated.
+  unsigned Zeros = C1Int.countLeadingZeros() + C1Int.countTrailingZeros();
+  if (C1Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  Zeros = C2Int.countLeadingZeros() + C2Int.countTrailingZeros();
+  if (C2Int.getBitWidth() - Zeros > 8)
+    return SDValue();
+
+  // Check that all the users could perform the shl themselves.
+  SDValue BinOp = SDValue(N, 0);
+  for (auto U : N->uses()) {
+    switch(U->getOpcode()) {
+    default:
+      return SDValue();
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+    case ISD::SETCC:
+    case ARMISD::CMP:
+      break;
+    }
+  }
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(N);
+  SDValue X = SHL.getOperand(0);
+  BinOp = DAG.getNode(N->getOpcode(), dl, MVT::i32, X,
+                      DAG.getConstant(C1Int, dl, MVT::i32));
+  SDValue Res = DAG.getNode(ISD::SHL, dl, MVT::i32, BinOp, SHL.getOperand(1));
+
+  DEBUG(dbgs() << "Unfolding: "; N->dump());
+  DEBUG(dbgs() << "Into shift operation:\n";
+        BinOp.dump();
+        Res.dump());
+
+  DAG.ReplaceAllUsesWith(SDValue(N, 0), Res);
+  return SDValue(N, 0);
+}
+
 /// PerformADDCombine - Target-specific dag combine xforms for ISD::ADD.
 ///
 static SDValue PerformADDCombine(SDNode *N,
@@ -9968,6 +10064,10 @@
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
 
+  // Only works one way, because it needs an immediate operand.
+  if (SDValue Result = PerformUnfoldSHL(N, DCI, Subtarget))
+    return Result;
+
   // First try with the default operand order.
   if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
     return Result;
@@ -10162,10 +10262,11 @@
   return SDValue();
 }
 
-// Try combining OR nodes to SMULWB, SMULWT.
 static SDValue PerformORCombineToSMULWBT(SDNode *OR,
                                          TargetLowering::DAGCombinerInfo &DCI,
                                          const ARMSubtarget *Subtarget) {
+
+  // Try combining OR nodes to SMULWB, SMULWT.
   if (!Subtarget->hasV6Ops() ||
       (Subtarget->isThumb() &&
        (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
@@ -10223,95 +10324,18 @@
   return SDValue(OR, 0);
 }
 
-/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
-static SDValue PerformORCombine(SDNode *N,
-                                TargetLowering::DAGCombinerInfo &DCI,
-                                const ARMSubtarget *Subtarget) {
-  // Attempt to use immediate-form VORR
-  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
-  SDLoc dl(N);
-  EVT VT = N->getValueType(0);
-  SelectionDAG &DAG = DCI.DAG;
-
-  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
-    return SDValue();
-
-  APInt SplatBits, SplatUndef;
-  unsigned SplatBitSize;
-  bool HasAnyUndefs;
-  if (BVN && Subtarget->hasNEON() &&
-      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
-    if (SplatBitSize <= 64) {
-      EVT VorrVT;
-      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
-                                      SplatUndef.getZExtValue(), SplatBitSize,
-                                      DAG, dl, VorrVT, VT.is128BitVector(),
-                                      OtherModImm);
-      if (Val.getNode()) {
-        SDValue Input =
-          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
-        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
-        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
-      }
-    }
-  }
-
-  if (!Subtarget->isThumb1Only()) {
-    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
-    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
-      return Result;
-    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
-      return Result;
-  }
-
-  // The code below optimizes (or (and X, Y), Z).
-  // The AND operand needs to have a single user to make these optimizations
-  // profitable.
-  SDValue N0 = N->getOperand(0);
-  if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
-    return SDValue();
-  SDValue N1 = N->getOperand(1);
-
-  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
-  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
-      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
-    APInt SplatUndef;
-    unsigned SplatBitSize;
-    bool HasAnyUndefs;
-
-    APInt SplatBits0, SplatBits1;
-    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
-    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
-    // Ensure that the second operand of both ands are constants
-    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
-                                      HasAnyUndefs) && !HasAnyUndefs) {
-      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
-                                        HasAnyUndefs) && !HasAnyUndefs) {
-        // Ensure that the bit width of the constants are the same and that
-        // the splat arguments are logical inverses as per the pattern we
-        // are trying to simplify.
-        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
-            SplatBits0 == ~SplatBits1) {
-          // Canonicalize the vector type to make instruction selection
-          // simpler.
-          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
-          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
-                                       N0->getOperand(1),
-                                       N0->getOperand(0),
-                                       N1->getOperand(0));
-          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
-        }
-      }
-    }
-  }
-
-  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
-  // reasonable.
+static SDValue PerformBFICombine(SDNode *N,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const ARMSubtarget *Subtarget) {
   // BFI is only available on V6T2+
   if (Subtarget->isThumb1Only() || !Subtarget->hasV6T2Ops())
     return SDValue();
 
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SelectionDAG &DAG = DCI.DAG;
   SDLoc DL(N);
   // 1) or (and A, mask), val => ARMbfi A, val, mask
   //    iff (val & mask) == val
@@ -10355,7 +10379,7 @@
 
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
-      return SDValue();
+      return SDValue(N, 0);
     }
   } else if (N1.getOpcode() == ISD::AND) {
     // case (2) or (and A, mask), (and B, mask2) => ARMbfi A, (lsr B, amt), mask
@@ -10381,7 +10405,7 @@
                         DAG.getConstant(Mask, DL, MVT::i32));
       // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
-      return SDValue();
+      return SDValue(N, 0);
     } else if (ARM::isBitFieldInvertedMask(~Mask) &&
                (~Mask == Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
@@ -10397,7 +10421,7 @@
                         DAG.getConstant(Mask2, DL, MVT::i32));
      // Do not add new nodes to DAG combiner worklist.
       DCI.CombineTo(N, Res, false);
-      return SDValue();
+      return SDValue(N, 0);
     }
   }
 
@@ -10417,8 +10441,107 @@
     // Do not add new nodes to DAG combiner worklist.
     DCI.CombineTo(N, Res, false);
+    return SDValue(N, 0);
+  }
+
+  return SDValue();
+}
+
+/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
+static SDValue PerformORCombine(SDNode *N,
+                                TargetLowering::DAGCombinerInfo &DCI,
+                                const ARMSubtarget *Subtarget) {
+
+  // Attempt to use immediate-form VORR
+  BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(N->getOperand(1));
+  SDLoc dl(N);
+  EVT VT = N->getValueType(0);
+  SelectionDAG &DAG = DCI.DAG;
+
+  if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+    return SDValue();
+
+  APInt SplatBits, SplatUndef;
+  unsigned SplatBitSize;
+  bool HasAnyUndefs;
+  if (BVN && Subtarget->hasNEON() &&
+      BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
+    if (SplatBitSize <= 64) {
+      EVT VorrVT;
+      SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(),
+                                      SplatUndef.getZExtValue(), SplatBitSize,
+                                      DAG, dl, VorrVT, VT.is128BitVector(),
+                                      OtherModImm);
+      if (Val.getNode()) {
+        SDValue Input =
+          DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
+        SDValue Vorr = DAG.getNode(ARMISD::VORRIMM, dl, VorrVT, Input, Val);
+        return DAG.getNode(ISD::BITCAST, dl, VT, Vorr);
+      }
+    }
+  }
+
+  if (!Subtarget->isThumb1Only()) {
+    // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
+    if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
+      return Result;
+    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
+      return Result;
   }
 
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+
+  // (or (and B, A), (and C, ~A)) => (VBSL A, B, C) when A is a constant.
+  if (Subtarget->hasNEON() && N1.getOpcode() == ISD::AND && VT.isVector() &&
+      DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+
+    // The code below optimizes (or (and X, Y), Z).
+    // The AND operand needs to have a single user to make these optimizations
+    // profitable.
+    if (N0.getOpcode() != ISD::AND || !N0.hasOneUse())
+      return SDValue();
+
+    APInt SplatUndef;
+    unsigned SplatBitSize;
+    bool HasAnyUndefs;
+
+    APInt SplatBits0, SplatBits1;
+    BuildVectorSDNode *BVN0 = dyn_cast<BuildVectorSDNode>(N0->getOperand(1));
+    BuildVectorSDNode *BVN1 = dyn_cast<BuildVectorSDNode>(N1->getOperand(1));
+    // Ensure that the second operand of both ands are constants
+    if (BVN0 && BVN0->isConstantSplat(SplatBits0, SplatUndef, SplatBitSize,
+                                      HasAnyUndefs) && !HasAnyUndefs) {
+      if (BVN1 && BVN1->isConstantSplat(SplatBits1, SplatUndef, SplatBitSize,
+                                        HasAnyUndefs) && !HasAnyUndefs) {
+        // Ensure that the bit width of the constants are the same and that
+        // the splat arguments are logical inverses as per the pattern we
+        // are trying to simplify.
+        if (SplatBits0.getBitWidth() == SplatBits1.getBitWidth() &&
+            SplatBits0 == ~SplatBits1) {
+          // Canonicalize the vector type to make instruction selection
+          // simpler.
+          EVT CanonicalVT = VT.is128BitVector() ? MVT::v4i32 : MVT::v2i32;
+          SDValue Result = DAG.getNode(ARMISD::VBSL, dl, CanonicalVT,
+                                       N0->getOperand(1),
+                                       N0->getOperand(0),
+                                       N1->getOperand(0));
+          return DAG.getNode(ISD::BITCAST, dl, VT, Result);
+        }
+      }
+    }
+  }
+
+  // Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
+  // reasonable.
+  if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
+    if (SDValue Result = PerformBFICombine(N, DCI, Subtarget))
+      return Result;
+  }
+
+  if (SDValue Result = PerformUnfoldSHL(N, DCI, Subtarget))
+    return Result;
+
   return SDValue();
 }
Index: test/CodeGen/ARM/unfold-shifts.ll
===================================================================
--- /dev/null
+++ test/CodeGen/ARM/unfold-shifts.ll
@@ -0,0 +1,143 @@
+; RUN: llc -mtriple armv6t2 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv6t2 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple armv7 %s -o - | FileCheck %s
+; RUN: llc -mtriple thumbv7 %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv7m %s -o - | FileCheck %s --check-prefix=CHECK-T2
+; RUN: llc -mtriple thumbv8m.main %s -o - | FileCheck %s --check-prefix=CHECK-T2
+
+; CHECK-LABEL: unfold1
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #255
+; CHECK: add r0, r1, r0, lsl #1
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #255
+; CHECK-T2: add.w r0, r1, r0, lsl #1
+define arm_aapcscc i32 @unfold1(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 1
+  %shl = or i32 %or, 510
+  %add = add nsw i32 %shl, %b
+  ret i32 %add
+}
+
+; CHECK-LABEL: unfold2
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #4080
+; CHECK: sub r0, r1, r0, lsl #2
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #4080
+; CHECK-T2: sub.w r0, r1, r0, lsl #2
+define arm_aapcscc i32 @unfold2(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 2
+  %shl = or i32 %or, 16320
+  %sub = sub nsw i32 %b, %shl
+  ret i32 %sub
+}
+
+; CHECK-LABEL: unfold3
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #65280
+; CHECK: and r0, r1, r0, lsl #4
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #65280
+; CHECK-T2: and.w r0, r1, r0, lsl #4
+define arm_aapcscc i32 @unfold3(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 4
+  %shl = or i32 %or, 1044480
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold4
+; CHECK-NOT: mov
+; CHECK: orr r0, r0, #1044480
+; CHECK: eor r0, r1, r0, lsl #5
+; CHECK-T2-NOT: mov
+; CHECK-T2: orr r0, r0, #1044480
+; CHECK-T2: eor.w r0, r1, r0, lsl #5
+define arm_aapcscc i32 @unfold4(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 5
+  %shl = or i32 %or, 33423360
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold5
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #496
+; CHECK: orr r0, r1, r0, lsl #6
+; CHECK-T2: add.w r0, r0, #496
+; CHECK-T2: orr.w r0, r1, r0, lsl #6
+define arm_aapcscc i32 @unfold5(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 6
+  %shl = add i32 %add, 31744
+  %or = or i32 %shl, %b
+  ret i32 %or
+}
+
+; CHECK-LABEL: unfold6
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #7936
+; CHECK: and r0, r1, r0, lsl #8
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #7936
+; CHECK-T2: and.w r0, r1, r0, lsl #8
+define arm_aapcscc i32 @unfold6(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 8
+  %shl = add i32 %add, 2031616
+  %and = and i32 %shl, %b
+  ret i32 %and
+}
+
+; CHECK-LABEL: unfold7
+; CHECK-NOT: mov
+; CHECK: add r0, r0, #126976
+; CHECK: eor r0, r1, r0, lsl #9
+; CHECK-T2-NOT: mov
+; CHECK-T2: add.w r0, r0, #126976
+; CHECK-T2: eor.w r0, r1, r0, lsl #9
+define arm_aapcscc i32 @unfold7(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 9
+  %shl = add i32 %add, 65011712
+  %xor = xor i32 %shl, %b
+  ret i32 %xor
+}
+
+; CHECK-LABEL: unfold8
+; CHECK-NOT: mov r2
+; CHECK: orr r2, r0, #4080
+; CHECK: cmp r1, r2, lsl #10
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: orr r2, r0, #4080
+; CHECK-T2: cmp.w r1, r2, lsl #10
+define arm_aapcscc i32 @unfold8(i32 %a, i32 %b) {
+entry:
+  %or = shl i32 %a, 10
+  %shl = or i32 %or, 4177920
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
+
+; CHECK-LABEL: unfold9
+; CHECK-NOT: mov r2
+; CHECK: add r2, r0, #7936
+; CHECK: cmp r1, r2, lsl #11
+; CHECK-T2-NOT: mov.w r2
+; CHECK-T2: add.w r2, r0, #7936
+; CHECK-T2: cmp.w r1, r2, lsl #11
+define arm_aapcscc i32 @unfold9(i32 %a, i32 %b) {
+entry:
+  %add = shl i32 %a, 11
+  %shl = add i32 %add, 16252928
+  %cmp = icmp sgt i32 %shl, %b
+  %conv = zext i1 %cmp to i32
+  ret i32 %conv
+}
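
For reference only, and not part of the patch: the profitability test in PerformUnfoldSHL above boils down to rejecting any constant whose set bits span more than 8 bits (countLeadingZeros() plus countTrailingZeros() compared against the bit width), which is a rough stand-in for the rotated 8-bit immediate encodings of ARM/Thumb2 data-processing instructions. The standalone C++ sketch below mirrors that check; the helper names (significantSpan, cheapImmediate) are hypothetical and chosen for illustration. It exercises the constants used by unfold1 and unfold8 in the new test.

// Standalone sketch of the "small enough that it doesn't require a mov imm"
// test used before unfolding (or (shl x, c2), c1 << c2) back into
// (shl (or x, c1), c2). Not LLVM code; plain C++17.
#include <cstdint>
#include <iostream>

// Width of the window between the highest and lowest set bits, i.e.
// bitwidth - (leading zeros + trailing zeros) for a non-zero value.
static unsigned significantSpan(uint32_t C) {
  if (C == 0)
    return 0;
  unsigned Hi = 31, Lo = 0;
  while (!(C & (1u << Hi))) --Hi; // find the highest set bit
  while (!(C & (1u << Lo))) ++Lo; // find the lowest set bit
  return Hi - Lo + 1;
}

// Mirrors the patch's heuristic: the constant is considered cheap when its
// set bits fit in an 8-bit window.
static bool cheapImmediate(uint32_t C) {
  return significantSpan(C) <= 8;
}

int main() {
  // unfold1: 510 == 255 << 1; both c1 = 255 and c2 = 1 pass, so the combine
  // may rewrite (or (shl x, 1), 510) as (shl (or x, 255), 1) and let the
  // user fold the shl into its shifted operand.
  std::cout << cheapImmediate(255) << ' ' << cheapImmediate(1) << '\n';   // 1 1
  // unfold8: 4177920 == 4080 << 10; 4080 (0xFF0) spans exactly 8 bits.
  std::cout << cheapImmediate(4080) << '\n';                              // 1
  // A constant whose set bits span 17 bits would block the unfold.
  std::cout << cheapImmediate(0x00010001u) << '\n';                       // 0
  return 0;
}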