Index: llvm/lib/CodeGen/MachineCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/MachineCombiner.cpp
+++ llvm/lib/CodeGen/MachineCombiner.cpp
@@ -71,6 +71,7 @@
   improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                           MachineTraceMetrics::Trace BlockTrace,
                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                          SmallVectorImpl<MachineInstr *> &DelInstrs,
                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
                           MachineCombinerPattern Pattern);
   bool preservesResourceLen(MachineBasicBlock *MBB,
@@ -242,6 +243,7 @@
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
     MachineCombinerPattern Pattern) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
@@ -269,8 +271,13 @@
   // A more flexible cost calculation for the critical path includes the slack
   // of the original code sequence. This may allow the transform to proceed
   // even if the instruction depths (data dependency cycles) become worse.
+
   unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
-  unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
+  unsigned RootLatency = 0;
+
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
 
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
@@ -421,7 +428,7 @@
     // resource pressure.
     if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
         (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                 InstrIdxForVirtReg, P) &&
+                                 DelInstrs, InstrIdxForVirtReg, P) &&
          preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
       for (auto *InstrPtr : InsInstrs)
         MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
Index: llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
+++ llvm/test/CodeGen/AArch64/arm64-fma-combines.ll
@@ -2,7 +2,7 @@
 define void @foo_2d(double* %src) {
 ; CHECK-LABEL: %entry
 ; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
-; CHECK: fmul {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
+; CHECK: fmadd {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}
 entry:
   %arrayidx1 = getelementptr inbounds double, double* %src, i64 5
   %arrayidx2 = getelementptr inbounds double, double* %src, i64 11
Index: llvm/test/CodeGen/AArch64/machine-combiner_madd_during_address_computation.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/machine-combiner_madd_during_address_computation.ll
@@ -0,0 +1,66 @@
+; Converted from a machine-reduced C++ test case because hand-written test cases passed even without the compiler improvement.
+
+; test all AArch64 subarches known as of Dec. 7, 2016 to have scheduling models
+; ----------------------------------------------------------------------------
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a72 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cortex-a73 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=cyclone -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=exynos-m2 -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=kryo -o - %s | FileCheck %s
+; RUN: llc -O=3 -mtriple=aarch64-linux-gnu -mcpu=vulcan -o - %s | FileCheck %s
+
+; CHECK-NOT: mul
+; CHECK: madd
+; CHECK-NOT: mul
+
+target triple = "aarch64-sarc-linux-gnu"
+
+%class.BtlConfig = type { %class.C }
+%class.C = type { %class.B }
+%class.B = type { %class.D* }
+%class.D = type { %class.basic_string.base, [4 x i8] }
+%class.basic_string.base = type <{ i64, i64, i32 }>
+%class.basic_string = type <{ i64, i64, i32, [4 x i8] }>
+@a = global %class.BtlConfig zeroinitializer, align 8
+@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+declare i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C*) local_unnamed_addr
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+define internal void @_GLOBAL__sub_I_main_adv.ii() section ".text.startup" {
+entry:
+  %tmp.i.i = alloca %class.D, align 8
+  %agg.tmp.i.i = alloca %class.D, align 8
+  %0 = bitcast %class.D* %tmp.i.i to i8*
+  %1 = bitcast %class.D* %agg.tmp.i.i to i8*
+  %call8.i.i = tail call i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0))
+  %cmp9.i.i = icmp sgt i64 %call8.i.i, 0
+  br i1 %cmp9.i.i, label %for.body.lr.ph.i.i, label %__cxx_global_var_init.exit
+for.body.lr.ph.i.i:
+  %2 = bitcast %class.D* %agg.tmp.i.i to %class.basic_string*
+  br label %for.body.i.i
+for.body.i.i:
+  %conv11.i.i = phi i64 [ 0, %for.body.lr.ph.i.i ], [ %conv.i.i, %for.body.i.i ]
+  %i.010.i.i = phi i32 [ undef, %for.body.lr.ph.i.i ], [ %inc.i.i, %for.body.i.i ]
+  %3 = load %class.D*, %class.D** getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0, i32 0, i32 0), align 8, !tbaa !1, !noalias !6
+  %arrayidx.i.i.i = getelementptr inbounds %class.D, %class.D* %3, i64 %conv11.i.i
+  %4 = bitcast %class.D* %arrayidx.i.i.i to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull %0, i8* %4, i64 24, i32 8, i1 false)
+  %inc.i.i = add i32 %i.010.i.i, 1
+  %conv.i.i = zext i32 %inc.i.i to i64
+  %call.i.i = call i64 @_ZN1CI1D1AIS0_EE5m_fn1Ev(%class.C* getelementptr inbounds (%class.BtlConfig, %class.BtlConfig* @a, i64 0, i32 0))
+  %cmp.i.i = icmp slt i64 %conv.i.i, %call.i.i
+  br i1 %cmp.i.i, label %for.body.i.i, label %__cxx_global_var_init.exit.loopexit
+__cxx_global_var_init.exit.loopexit:
+  br label %__cxx_global_var_init.exit
+__cxx_global_var_init.exit:
+  ret void
+}
+!1 = !{!2, !3, i64 0}
+!2 = !{!"foo", !3, i64 0}
+!3 = !{!"bar", !4, i64 0}
+!4 = !{!"baz", !5, i64 0}
+!5 = !{!"boo"}
+!6 = !{!7}
+!7 = distinct !{!7, !8, !"_ZN1CI1D1AIS0_EEixEl: %agg.result"}
+!8 = distinct !{!8, !"_ZN1CI1D1AIS0_EEixEl"}
Index: llvm/test/CodeGen/AArch64/mul-lohi.ll
===================================================================
--- llvm/test/CodeGen/AArch64/mul-lohi.ll
+++ llvm/test/CodeGen/AArch64/mul-lohi.ll
@@ -3,16 +3,18 @@
 
 define i128 @test_128bitmul(i128 %lhs, i128 %rhs) {
 ; CHECK-LABEL: test_128bitmul:
-; CHECK-DAG: mul [[PART1:x[0-9]+]], x0, x3
-; CHECK-DAG: umulh [[CARRY:x[0-9]+]], x0, x2
-; CHECK: mul [[PART2:x[0-9]+]], x1, x2
-; CHECK: mul x0, x0, x2
+; CHECK: umulh [[HI:x[0-9]+]], x0, x2
+; CHECK: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
+; CHECK-NEXT: ret
 
 ; CHECK-BE-LABEL: test_128bitmul:
-; CHECK-BE-DAG: mul [[PART1:x[0-9]+]], x1, x2
-; CHECK-BE-DAG: umulh [[CARRY:x[0-9]+]], x1, x3
-; CHECK-BE: mul [[PART2:x[0-9]+]], x0, x3
-; CHECK-BE: mul x1, x1, x3
+; CHECK-BE: umulh [[HI:x[0-9]+]], x1, x3
+; CHECK-BE: madd [[TEMP1:x[0-9]+]], x1, x2, [[HI]]
+; CHECK-BE-DAG: madd x0, x0, x3, [[TEMP1]]
+; CHECK-BE-DAG: mul x1, x1, x3
+; CHECK-BE-NEXT: ret
 
   %prod = mul i128 %lhs, %rhs
   ret i128 %prod
@@ -25,8 +27,8 @@
 ; CHECK-LABEL: test_128bitmul_optsize:
 ; CHECK: umulh [[HI:x[0-9]+]], x0, x2
 ; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
 ; CHECK-NEXT: ret
 
   %prod = mul i128 %lhs, %rhs
@@ -37,8 +39,8 @@
 ; CHECK-LABEL: test_128bitmul_minsize:
 ; CHECK: umulh [[HI:x[0-9]+]], x0, x2
 ; CHECK-NEXT: madd [[TEMP1:x[0-9]+]], x0, x3, [[HI]]
-; CHECK-NEXT: madd x1, x1, x2, [[TEMP1]]
-; CHECK-NEXT: mul x0, x0, x2
+; CHECK-DAG: madd x1, x1, x2, [[TEMP1]]
+; CHECK-DAG: mul x0, x0, x2
 ; CHECK-NEXT: ret
 
   %prod = mul i128 %lhs, %rhs
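
Note (not part of the diff): the loop added in improvesCriticalPathLen changes
RootLatency from the latency of the root instruction alone to the summed
latency of every instruction the combiner will delete. A condensed sketch of
the resulting profitability test follows; it assumes the locals computed
earlier in that function (NewRootDepth, RootDepth), and it paraphrases the
surrounding LLVM code rather than quoting this patch:

  // Sketch only: how the accumulated RootLatency feeds the final comparison.
  unsigned RootLatency = 0;
  for (MachineInstr *DelMI : DelInstrs)   // every instruction this combine removes
    RootLatency += TSchedModel.computeInstrLatency(DelMI);

  // The combine is considered profitable for the critical path when the new
  // sequence finishes no later than the old one, where the old path is now
  // credited with the latency of *all* deleted instructions plus root slack.
  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
  bool Improved = NewCycleCount <= OldCycleCount;

This is why the tests above now expect madd: charging only the root's latency
undercounted the cost of the replaced mul/add pair, so profitable fusions were
previously rejected.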