Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12871,11 +12871,55 @@
   if (DCI.isBeforeLegalizeOps())
     return SDValue();
 
+  // Canonicalize X*(Y+1) -> X*Y+X and (X+1)*Y -> X*Y+Y so that the
+  // MachineCombiner pass can later combine the resulting add+mul into madd.
+  // Similarly, X*(1-Y) -> X - X*Y and (1-Y)*X -> X - Y*X.
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  SDValue Op0;
+  SDValue Op1;
+
+  if (!isa<ConstantSDNode>(N0) && !isa<ConstantSDNode>(N1)) {
+    if (N0->getOpcode() == ISD::ADD || N0->getOpcode() == ISD::SUB) {
+      Op0 = N0;
+      Op1 = N1;
+    } else if (N1->getOpcode() == ISD::ADD || N1->getOpcode() == ISD::SUB) {
+      Op0 = N1;
+      Op1 = N0;
+    }
+
+    if (!Op0 || !Op0->hasOneUse())
+      return SDValue();
+
+    if (Op0->getOpcode() == ISD::ADD &&
+        isa<ConstantSDNode>(Op0->getOperand(1))) {
+      ConstantSDNode *C = cast<ConstantSDNode>(Op0->getOperand(1));
+      if (C && C->isOne()) {
+        SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, Op1, Op0->getOperand(0));
+        SDValue Res = DAG.getNode(ISD::ADD, DL, VT, MulVal, Op1);
+        return Res;
+      }
+    }
+
+    if (Op0->getOpcode() == ISD::SUB &&
+        isa<ConstantSDNode>(Op0->getOperand(0))) {
+      ConstantSDNode *C = cast<ConstantSDNode>(Op0->getOperand(0));
+      if (C && C->isOne()) {
+        SDValue MulVal = DAG.getNode(ISD::MUL, DL, VT, Op1, Op0->getOperand(1));
+        SDValue Res = DAG.getNode(ISD::SUB, DL, VT, Op1, MulVal);
+        return Res;
+      }
+    }
+
+    return SDValue();
+  }
+
   // The below optimizations require a constant RHS.
   if (!isa<ConstantSDNode>(N->getOperand(1)))
     return SDValue();
 
-  SDValue N0 = N->getOperand(0);
   ConstantSDNode *C = cast<ConstantSDNode>(N->getOperand(1));
   const APInt &ConstValue = C->getAPIntValue();
 
@@ -12954,8 +12998,6 @@
     return SDValue();
   }
 
-  SDLoc DL(N);
-  EVT VT = N->getValueType(0);
   SDValue ShiftedVal = DAG.getNode(ISD::SHL, DL, VT, N0,
                                    DAG.getConstant(ShiftAmt, DL, MVT::i64));
 
Index: llvm/test/CodeGen/AArch64/madd-combiner.ll
===================================================================
--- llvm/test/CodeGen/AArch64/madd-combiner.ll
+++ llvm/test/CodeGen/AArch64/madd-combiner.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-apple-darwin -fast-isel -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-apple-darwin -verify-machineinstrs < %s | FileCheck %s -check-prefixes=CHECK-MADD-MSUB
 
 ; Test that we use the correct register class.
 define i32 @mul_add_imm(i32 %a, i32 %b) {
@@ -35,3 +36,50 @@
   ret void
 }
 
+define i32 @add1_mul_val1(i32 %a, i32 %b) {
+; CHECK-LABEL: add1_mul_val1
+; CHECK-MADD-MSUB: madd w0, w1, w0, w1
+  %1 = add i32 %a, 1
+  %2 = mul i32 %1, %b
+  ret i32 %2
+}
+
+define i32 @add1_mul_val2(i32 %a, i32 %b) {
+; CHECK-LABEL: add1_mul_val2
+; CHECK-MADD-MSUB: madd w0, w0, w1, w0
+  %1 = add i32 %b, 1
+  %2 = mul i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @add1_mul_val3(i64 %a, i64 %b) {
+; CHECK-LABEL: add1_mul_val3
+; CHECK-MADD-MSUB: madd x0, x0, x1, x0
+  %1 = add i64 %b, 1
+  %2 = mul i64 %a, %1
+  ret i64 %2
+}
+
+define i32 @sub1_mul_val1(i32 %a, i32 %b) {
+; CHECK-LABEL: sub1_mul_val1
+; CHECK-MADD-MSUB: msub w0, w1, w0, w1
+  %1 = sub i32 1, %a
+  %2 = mul i32 %1, %b
+  ret i32 %2
+}
+
+define i32 @sub1_mul_val2(i32 %a, i32 %b) {
+; CHECK-LABEL: sub1_mul_val2
+; CHECK-MADD-MSUB: msub w0, w0, w1, w0
+  %1 = sub i32 1, %b
+  %2 = mul i32 %a, %1
+  ret i32 %2
+}
+
+define i64 @sub1_mul_val3(i64 %a, i64 %b) {
+; CHECK-LABEL: sub1_mul_val3
+; CHECK-MADD-MSUB: msub x0, x0, x1, x0
+  %1 = sub i64 1, %b
+  %2 = mul i64 %a, %1
+  ret i64 %2
+}
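
For context (not part of the patch): a minimal, hypothetical reproducer that could be fed to llc to observe the new canonicalization, assuming the diff above is applied. The function name and the exact madd operand order are illustrative assumptions only, since register allocation may differ.

; Hypothetical standalone example, assuming the patch above is applied.
; Run with: llc -mtriple=aarch64 -verify-machineinstrs example.ll -o -
; Expected (illustrative): a single "madd w0, w1, w0, w1" instead of an
; "add w8, w0, #1" followed by "mul w0, w8, w1".
define i32 @mul_by_val_plus_one(i32 %x, i32 %y) {
  %add = add i32 %x, 1      ; X + 1
  %mul = mul i32 %add, %y   ; (X + 1) * Y is canonicalized to X*Y + Y
  ret i32 %mul
}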