Index: include/llvm/CodeGen/MachineCombinerPattern.h
===================================================================
--- include/llvm/CodeGen/MachineCombinerPattern.h
+++ include/llvm/CodeGen/MachineCombinerPattern.h
@@ -21,6 +21,12 @@
 ///
 ///
 namespace MachineCombinerPattern {
+
+// Encode optional information into the enum value of each pattern.
+enum MC_COST_CALCULATION : int {
+  USE_SLACK = 1 << 31
+};
+
 // Forward declaration
 enum MC_PATTERN : int {
   // These are commutative variants for reassociating a computation chain. See
@@ -29,10 +35,11 @@
   MC_REASSOC_AX_YB = 1,
   MC_REASSOC_XA_BY = 2,
   MC_REASSOC_XA_YB = 3,
+  LAST_REASSOC_PATTERN = MC_REASSOC_XA_YB,
 
-  /// Enumeration of instruction pattern supported by AArch64 machine combiner
-  MC_NONE,
-  MC_MULADDW_OP1,
+  // These AArch64 patterns allow slack in the critical path calculation to
+  // enable more optimization possibilities.
+  MC_MULADDW_OP1  = (LAST_REASSOC_PATTERN + 1) | USE_SLACK,
   MC_MULADDW_OP2,
   MC_MULSUBW_OP1,
   MC_MULSUBW_OP2,
Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -72,7 +72,7 @@
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
                            DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                           bool NewCodeHasLessInsts);
+                           bool MustReduceCriticalPath, bool MustReduceDepth);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -210,43 +210,50 @@
   return NewRootLatency;
 }
 
-/// True when the new instruction sequence does not lengthen the critical path
-/// and the new sequence has less instructions or the new sequence improves the
-/// critical path.
 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
 /// sequence to replace the old sequence is that it cannot lengthen the critical
-/// path. This is decided by the formula:
-/// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)).
-/// If the new sequence has an equal length critical path but does not reduce
-/// the number of instructions (NewCodeHasLessInsts is false), then it is not
-/// considered an improvement. The slack is the number of cycles Root can be
-/// delayed before the critical patch becomes longer.
+/// path. The definition of "improve" may be restricted by specifying that the
+/// new path is strictly shorter than the old path (MustReduceCriticalPath) and
+/// that the new path improves the data dependency chain (MustReduceDepth).
 bool MachineCombiner::improvesCriticalPathLen(
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-    bool NewCodeHasLessInsts) {
+    bool MustReduceCriticalPath, bool MustReduceDepth) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // NewRoot is the last instruction in the \p InsInstrs vector.
-  // Get depth and latency of NewRoot.
   unsigned NewRootIdx = InsInstrs.size() - 1;
   MachineInstr *NewRoot = InsInstrs[NewRootIdx];
-  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
-  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
 
-  // Get depth, latency and slack of Root.
+  // Get depth and latency of NewRoot and Root.
+  unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(Root).Depth;
+
+  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
+        dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
+        dbgs() << " RootDepth: " << RootDepth << "\n");
+
+  // For a transform such as reassociation, the cost equation is
+  // conservatively calculated so that we must improve the depth (data
+  // dependency cycles) in the critical path to proceed with the transform.
+  // Being conservative also protects against inaccuracies in the underlying
+  // machine trace metrics and CPU models.
+  if (MustReduceDepth)
+    return NewRootDepth < RootDepth;
+
+  // A more flexible cost calculation for the critical path includes the slack
+  // of the original code sequence. This may allow the transform to proceed
+  // even if the instruction depths (data dependency cycles) become worse.
+  unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
   unsigned RootLatency = TSchedModel.computeInstrLatency(Root);
   unsigned RootSlack = BlockTrace.getInstrSlack(Root);
 
-  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
-        dbgs() << " NewRootDepth: " << NewRootDepth
-               << " NewRootLatency: " << NewRootLatency << "\n";
-        dbgs() << " RootDepth: " << RootDepth << " RootLatency: " << RootLatency
-               << " RootSlack: " << RootSlack << "\n";
+  DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
+        dbgs() << " RootLatency: " << RootLatency << "\n";
+        dbgs() << " RootSlack: " << RootSlack << "\n";
         dbgs() << " NewRootDepth + NewRootLatency = "
                << NewRootDepth + NewRootLatency << "\n";
         dbgs() << " RootDepth + RootLatency + RootSlack = "
@@ -255,10 +262,10 @@
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
   unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
   
-  if (NewCodeHasLessInsts)
-    return NewCycleCount <= OldCycleCount;
-  else
+  if (MustReduceCriticalPath)
     return NewCycleCount < OldCycleCount;
+  else
+    return NewCycleCount <= OldCycleCount;
 }
 
 /// helper routine to convert instructions into SC
@@ -380,14 +387,18 @@
         // in a single instruction.
         if (!NewInstCount)
           continue;
+
         // Substitute when we optimize for codesize and the new sequence has
         // fewer instructions OR
         // the new sequence neither lengthens the critical path nor increases
         // resource pressure.
+        bool MustReduceCriticalPath = NewInstCount >= OldInstCount;
+        bool MustReduceDepth = !(P & MachineCombinerPattern::USE_SLACK);
         if (doSubstitute(NewInstCount, OldInstCount) ||
             (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                      InstrIdxForVirtReg,
-                                      NewInstCount < OldInstCount) &&
+                                     InstrIdxForVirtReg,
+                                     MustReduceCriticalPath,
+                                     MustReduceDepth) &&
              preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
           for (auto *InstrPtr : InsInstrs)
             MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -632,10 +632,10 @@
 ; AVX-NEXT:  callq   bar
 ; AVX-NEXT:  vmovsd  %xmm0, (%rsp)
 ; AVX-NEXT:  callq   bar
-; AVX-NEXT:  vmovsd  (%rsp), %xmm1
-; AVX:       vaddsd  8(%rsp), %xmm1, %xmm1
+; AVX-NEXT:  vmovsd  8(%rsp), %xmm1
+; AVX:       vaddsd  16(%rsp), %xmm1, %xmm1
+; AVX-NEXT:  vaddsd  (%rsp), %xmm0, %xmm0
 ; AVX-NEXT:  vaddsd  %xmm0, %xmm1, %xmm0
-; AVX-NEXT:  vaddsd  16(%rsp), %xmm0, %xmm0
 
   %x0 = call double @bar()
   %x1 = call double @bar()
@@ -656,9 +656,10 @@
 ; AVX-NEXT:  callq   bar
 ; AVX-NEXT:  vmovsd  %xmm0, (%rsp)
 ; AVX-NEXT:  callq   bar
+; AVX-NEXT:  vmovsd  8(%rsp), %xmm1
+; AVX:       vaddsd  16(%rsp), %xmm1, %xmm1
 ; AVX-NEXT:  vaddsd  (%rsp), %xmm0, %xmm0
-; AVX-NEXT:  vaddsd  8(%rsp), %xmm0, %xmm0
-; AVX-NEXT:  vaddsd  16(%rsp), %xmm0, %xmm0
+; AVX-NEXT:  vaddsd  %xmm0, %xmm1, %xmm0
 
   %x0 = call double @bar()
   %x1 = call double @bar()