Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7534,19 +7534,45 @@
   // future CPUs have a cheaper MADD instruction, this may need to be
   // gated on a subtarget feature. For Cyclone, 32-bit MADD is 4 cycles and
   // 64-bit is 5 cycles, so this is always a win.
+  // More aggressively, some multiplications can be lowered to shift+add+shift
+  // if the constant is (2^N + 1) * 2^M.
+  // TODO: consider constants in the form of (2^N - 1) * 2^M,
+  // (2^N + 1) * 2^M + 1, or (2^N + 1) * (2^M + 1).
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
     const APInt &Value = C->getAPIntValue();
     EVT VT = N->getValueType(0);
     SDLoc DL(N);
     if (Value.isNonNegative()) {
+      // Lg2 is used to test if the mul can be lowered to shift+add+shift.
+      unsigned Lg2 = Value.countTrailingZeros();
+      // Conservatively do not lower to shift+add+shift if the mul might be
+      // folded into smull or umull.
+      if (Lg2 && (isSignExtended(N->getOperand(0).getNode(), DAG) ||
+                  isZeroExtended(N->getOperand(0).getNode(), DAG)))
+        Lg2 = 0;
+      // Conservatively do not lower to shift+add+shift if the mul might be
+      // folded into madd or msub.
+      if (Lg2)
+        for (SDNode *Use : N->uses())
+          if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
+            Lg2 = 0;
+            break;
+          }
+      APInt Shift = Value.ashr(Lg2);
       // (mul x, 2^N + 1) => (add (shl x, N), x)
-      APInt VM1 = Value - 1;
+      // (mul x, (2^N + 1) * 2^M) => (shl (add (shl x, N), x), M)
+      APInt VM1 = Shift - 1;
       if (VM1.isPowerOf2()) {
-        SDValue ShiftedVal =
+        SDValue ShiftedValue =
             DAG.getNode(ISD::SHL, DL, VT, N->getOperand(0),
                         DAG.getConstant(VM1.logBase2(), DL, MVT::i64));
-        return DAG.getNode(ISD::ADD, DL, VT, ShiftedVal,
-                           N->getOperand(0));
+        SDValue Add =
+            DAG.getNode(ISD::ADD, DL, VT, ShiftedValue, N->getOperand(0));
+        if (Lg2)
+          return DAG.getNode(ISD::SHL, DL, VT, Add,
+                             DAG.getConstant(Lg2, DL, MVT::i64));
+        else
+          return Add;
       }
       // (mul x, 2^N - 1) => (sub (shl x, N), x)
       APInt VP1 = Value + 1;
Index: test/CodeGen/AArch64/arm64-mul.ll
===================================================================
--- test/CodeGen/AArch64/arm64-mul.ll
+++ test/CodeGen/AArch64/arm64-mul.ll
@@ -104,7 +104,7 @@
 ; CHECK-LABEL: t10:
 ; CHECK: mul {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
   %tmp1 = sext i32 %a to i64
-  %tmp2 = mul i64 %tmp1, 2147483650 ; = 2^31 + 2
+  %tmp2 = mul i64 %tmp1, 2147483650 ; = 2^31 + 2
   ret i64 %tmp2
 }
Index: test/CodeGen/AArch64/mul_pow2.ll
===================================================================
--- test/CodeGen/AArch64/mul_pow2.ll
+++ test/CodeGen/AArch64/mul_pow2.ll
@@ -2,6 +2,7 @@
 
 ; Convert mul x, pow2 to shift.
 ; Convert mul x, pow2 +/- 1 to shift + add/sub.
+; Convert mul x, (pow2 + 1) * pow2 to shift + add + shift.
 
 define i32 @test2(i32 %x) {
 ; CHECK-LABEL: test2
 ; CHECK: lsl w0, w0, #1
 
   %mul = mul nsw i32 %x, 2
   ret i32 %mul
 }
@@ -36,6 +37,15 @@
   ret i32 %mul
 }
 
+define i32 @test6(i32 %x) {
+; CHECK-LABEL: test6
+; CHECK: add w8, w0, w0, lsl #1
+; CHECK: lsl w0, w8, #1
+
+  %mul = mul nsw i32 %x, 6
+  ret i32 %mul
+}
+
 define i32 @test7(i32 %x) {
 ; CHECK-LABEL: test7
 ; CHECK: lsl {{w[0-9]+}}, w0, #3
@@ -57,10 +67,18 @@
 ; CHECK-LABEL: test9
 ; CHECK: add w0, w0, w0, lsl #3
 
-  %mul = mul nsw i32 %x, 9
+  %mul = mul nsw i32 %x, 9
   ret i32 %mul
 }
 
+define i32 @test10(i32 %x) {
+; CHECK-LABEL: test10
+; CHECK: add w8, w0, w0, lsl #2
+; CHECK: lsl w0, w8, #1
+
+  %mul = mul nsw i32 %x, 10
+  ret i32 %mul
+}
+
 ; Convert mul x, -pow2 to shift.
 ; Convert mul x, -(pow2 +/- 1) to shift + add/sub.
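
Note on the decomposition, for reviewers following along: the combine factors the constant C as (2^N + 1) * 2^M, where M is the countTrailingZeros result (Lg2 above) and the odd part is accepted when (C >> M) - 1 is a power of two; it then emits (shl (add (shl x, N), x), M). Below is a minimal standalone C++ sketch, not part of the patch: matchShiftAddShift is a hypothetical name, and __builtin_ctzll stands in for APInt::countTrailingZeros/logBase2. It mirrors the matching logic and checks the identity behind the new test6/test10 CHECK lines.

// Sketch only: verify the (2^N + 1) * 2^M decomposition outside of LLVM.
#include <cassert>
#include <cstdint>

// Hypothetical helper, not in the patch: returns true and sets N and M when
// C == (2^N + 1) * 2^M. Mirrors the patch: M is the trailing-zero count
// (Lg2), the odd part plays the role of Value.ashr(Lg2), and odd - 1 must
// be a power of two.
static bool matchShiftAddShift(uint64_t C, unsigned &N, unsigned &M) {
  if (C == 0)
    return false;
  M = static_cast<unsigned>(__builtin_ctzll(C));   // like Value.countTrailingZeros()
  uint64_t Odd = C >> M;                           // like Value.ashr(Lg2)
  uint64_t VM1 = Odd - 1;                          // like "Shift - 1"
  if (VM1 == 0 || (VM1 & (VM1 - 1)) != 0)          // like VM1.isPowerOf2()
    return false;
  N = static_cast<unsigned>(__builtin_ctzll(VM1)); // like VM1.logBase2()
  return true;
}

int main() {
  unsigned N = 0, M = 0;
  assert(matchShiftAddShift(6, N, M) && N == 1 && M == 1);  // 6  = (2^1+1)*2^1
  assert(matchShiftAddShift(10, N, M) && N == 2 && M == 1); // 10 = (2^2+1)*2^1
  assert(matchShiftAddShift(9, N, M) && N == 3 && M == 0);  // 9  = 2^3+1, no outer shift
  assert(!matchShiftAddShift(7, N, M));                     // 7  = 2^3-1, not this path

  // The emitted form for mul x, 10: (shl (add (shl x, 2), x), 1),
  // i.e. "add w8, w0, w0, lsl #2" followed by "lsl w0, w8, #1" in test10.
  uint64_t x = 12345;
  assert(x * 10 == (((x << 2) + x) << 1));
  return 0;
}

Constants such as 7 = 2^3 - 1 are deliberately rejected by this path and remain covered by the existing (sub (shl x, N), x) pattern further down, while the Lg2 guards keep multiplies that might fold into smull/umull or madd/msub out of the shift+add+shift form.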