Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -22,6 +22,7 @@
 #include "llvm/CodeGen/MachineTraceMetrics.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
@@ -34,7 +35,13 @@
 
 STATISTIC(NumInstCombined, "Number of machineinst combined");
 
+static cl::opt<unsigned>
+inc_threshold("machine-combiner-inc-threshold", cl::Hidden,
+    cl::desc("Incremental depth computation will be used for basic blocks "
+             "with more instructions than this threshold."), cl::init(500));
+
 namespace {
+
 class MachineCombiner : public MachineFunctionPass {
   const TargetInstrInfo *TII;
   const TargetRegisterInfo *TRI;
@@ -73,7 +80,7 @@
                                SmallVectorImpl<MachineInstr *> &InsInstrs,
                                SmallVectorImpl<MachineInstr *> &DelInstrs,
                                DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-                               MachineCombinerPattern Pattern);
+                               MachineCombinerPattern Pattern, bool SlackIsAccurate);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -247,7 +254,8 @@
     SmallVectorImpl<MachineInstr *> &InsInstrs,
     SmallVectorImpl<MachineInstr *> &DelInstrs,
     DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
-    MachineCombinerPattern Pattern) {
+    MachineCombinerPattern Pattern,
+    bool SlackIsAccurate) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
   // NewRoot is the last instruction in the \p InsInstrs vector.
@@ -258,7 +266,7 @@
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
 
-  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << Root << "\n";
+  DEBUG(dbgs() << "DEPENDENCE DATA FOR " << *Root << "\n";
         dbgs() << " NewRootDepth: " << NewRootDepth << "\n";
         dbgs() << " RootDepth: " << RootDepth << "\n");
 
@@ -270,28 +278,30 @@
   if (getCombinerObjective(Pattern) == CombinerObjective::MustReduceDepth)
     return NewRootDepth < RootDepth;
 
-  // A more flexible cost calculation for the critical path includes the slack
-  // of the original code sequence. This may allow the transform to proceed
-  // even if the instruction depths (data dependency cycles) become worse.
-
   unsigned NewRootLatency = getLatency(Root, NewRoot, BlockTrace);
   unsigned RootLatency = 0;
 
   for (auto I : DelInstrs)
     RootLatency += TSchedModel.computeInstrLatency(I);
 
-  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+  unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount = RootDepth + RootLatency +
+                           (SlackIsAccurate ? RootSlack : 0);
 
   DEBUG(dbgs() << " NewRootLatency: " << NewRootLatency << "\n";
         dbgs() << " RootLatency: " << RootLatency << "\n";
-        dbgs() << " RootSlack: " << RootSlack << "\n";
+        if (SlackIsAccurate)
+          dbgs() << " RootSlack: " << RootSlack << "\n";
         dbgs() << " NewRootDepth + NewRootLatency = "
-               << NewRootDepth + NewRootLatency << "\n";
-        dbgs() << " RootDepth + RootLatency + RootSlack = "
-               << RootDepth + RootLatency + RootSlack << "\n";);
-
-  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
-  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+               << NewCycleCount << "\n";
+        if (SlackIsAccurate)
+          dbgs() << " RootDepth + RootLatency + RootSlack = "
+                 << OldCycleCount << "\n";
+        else
+          dbgs() << " RootDepth + RootLatency = "
+                 << OldCycleCount << "\n";
+        );
 
   return NewCycleCount <= OldCycleCount;
 }
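
As a worked example of the comparison in the hunk above, here is a minimal
standalone sketch of the cost test. The cycle counts are hypothetical (in the
pass they come from MachineTraceMetrics); the point is how dropping the stale
slack term flips a borderline decision when SlackIsAccurate is false.

    // Sketch of the cost test in improvesCriticalPathLen(), made-up numbers.
    #include <cassert>

    static bool combineIsProfitable(unsigned NewRootDepth,
                                    unsigned NewRootLatency,
                                    unsigned RootDepth, unsigned RootLatency,
                                    unsigned RootSlack, bool SlackIsAccurate) {
      unsigned NewCycleCount = NewRootDepth + NewRootLatency;
      // Slack is only trusted when the trace is fully up to date; under
      // incremental updates it may be stale, so it is left out of the budget.
      unsigned OldCycleCount =
          RootDepth + RootLatency + (SlackIsAccurate ? RootSlack : 0);
      return NewCycleCount <= OldCycleCount;
    }

    int main() {
      // Accurate trace: 2 cycles of slack admit the new sequence (10 <= 10).
      assert(combineIsProfitable(7, 3, 6, 2, 2, /*SlackIsAccurate=*/true));
      // Incremental mode: slack dropped, same numbers are rejected (10 > 8).
      assert(!combineIsProfitable(7, 3, 6, 2, 2, /*SlackIsAccurate=*/false));
    }
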
@@ -357,14 +367,40 @@
 static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
                                      SmallVector<MachineInstr *, 16> InsInstrs,
                                      SmallVector<MachineInstr *, 16> DelInstrs,
-                                     MachineTraceMetrics *Traces) {
+                                     MachineTraceMetrics::Ensemble *MinInstr,
+                                     MachineBasicBlock::iterator &LastUpdate,
+                                     SparseSet<LiveRegUnit> &RegUnits,
+                                     bool IncrementalUpdate) {
+  if (IncrementalUpdate) {
+    // Update depths and RegUnits since the last update. We do this before
+    // removing instructions, which would invalidate the iterator up to MI.
+    auto End = ++MI.getIterator();
+    for (; LastUpdate != End; LastUpdate++) {
+      MinInstr->updateDepth(MBB, *LastUpdate, RegUnits);
+    }
+  }
+
   for (auto *InstrPtr : InsInstrs)
-    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
-  for (auto *InstrPtr : DelInstrs)
+    MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
+
+  for (auto *InstrPtr : DelInstrs) {
+    // Keep LastUpdate valid: step it past any instruction about to be erased.
+    if (&*LastUpdate == InstrPtr)
+      LastUpdate++;
     InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
-  ++NumInstCombined;
-  Traces->invalidate(MBB);
-  Traces->verifyAnalysis();
+    // Erase all LiveRegs defined by the removed instruction.
+    for (auto I = RegUnits.begin(); I != RegUnits.end();) {
+      if (I->MI == InstrPtr)
+        I = RegUnits.erase(I);
+      else
+        I++;
+    }
+  }
+
+  if (IncrementalUpdate)
+    for (auto *InstrPtr : InsInstrs)
+      MinInstr->updateDepth(MBB, *InstrPtr, RegUnits);
+
+  NumInstCombined++;
 }
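
A note on the RegUnits bookkeeping in this hunk: each LiveRegUnit caches the
last instruction seen defining a register unit, so an entry naming an erased
instruction would dangle. The sketch below illustrates that purge loop with a
std::vector in place of llvm::SparseSet; the Instr and LiveRegUnit types here
are simplified stand-ins, not the real MachineTraceMetrics types.

    #include <cassert>
    #include <vector>

    struct Instr { int Id; };

    // Simplified stand-in: remembers the last def of a register unit.
    struct LiveRegUnit {
      unsigned RegUnit;
      const Instr *MI;
    };

    // Drop every entry whose cached "last def" was just erased, mirroring
    // the erase-while-iterating loop in insertDeleteInstructions().
    static void purgeDeadDefs(std::vector<LiveRegUnit> &RegUnits,
                              const Instr *Dead) {
      for (auto I = RegUnits.begin(); I != RegUnits.end();) {
        if (I->MI == Dead)
          I = RegUnits.erase(I); // erase() returns the next valid iterator.
        else
          ++I;
      }
    }

    int main() {
      Instr A{0}, B{1};
      std::vector<LiveRegUnit> RegUnits = {{0, &A}, {1, &B}, {2, &A}};
      purgeDeadDefs(RegUnits, &A);
      assert(RegUnits.size() == 1 && RegUnits[0].MI == &B);
    }
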
 
 /// Substitute a slow code sequence with a faster one by
@@ -376,11 +412,16 @@
 /// sequence is shorter.
 bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
   bool Changed = false;
-  DEBUG(dbgs() << "Combining MBB " << MBB->getName() << "\n");
-
+  bool IncrementalUpdate = false;
   auto BlockIter = MBB->begin();
+  auto LastUpdate = MBB->begin();
   // Check if the block is in a loop.
   const MachineLoop *ML = MLI->getLoopFor(MBB);
+  if (!MinInstr)
+    MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
+
+  SparseSet<LiveRegUnit> RegUnits;
+  RegUnits.setUniverse(TRI->getNumRegUnits());
 
   while (BlockIter != MBB->end()) {
     auto &MI = *BlockIter++;
@@ -411,7 +452,6 @@
     // This is only an artificial restriction though. In practice there is
     // mostly one pattern, and getMachineCombinerPatterns() can order patterns
    // based on an internal cost heuristic.
-
     if (!TII->getMachineCombinerPatterns(MI, Patterns))
       continue;
 
@@ -419,9 +459,6 @@
       SmallVector<MachineInstr *, 16> InsInstrs;
       SmallVector<MachineInstr *, 16> DelInstrs;
       DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
-      if (!MinInstr)
-        MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
-      Traces->verifyAnalysis();
       TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                       InstrIdxForVirtReg);
       unsigned NewInstCount = InsInstrs.size();
@@ -441,18 +478,36 @@
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
       if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
-        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                 LastUpdate, RegUnits, IncrementalUpdate);
         // Eagerly stop after the first pattern fires.
         Changed = true;
         break;
       } else {
-        // Calculating the trace metrics may be expensive,
-        // so only do this when necessary.
+        // For big basic blocks, we only compute the full trace the first time
+        // we get here. We do not invalidate the trace, but instead update the
+        // instruction depths incrementally.
+        // NOTE: Only the instruction depths up to MI are relatively accurate.
+        // All other trace information is not updated.
+        if (IncrementalUpdate) {
+          // Update depths since the last incremental update.
+          for (; LastUpdate != BlockIter; LastUpdate++)
+            MinInstr->updateDepth(MBB, *LastUpdate, RegUnits);
+        }
         MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+        Traces->verifyAnalysis();
         if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
-                                    InstrIdxForVirtReg, P) &&
+                                    InstrIdxForVirtReg, P,
+                                    !IncrementalUpdate) &&
             preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
-          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, MinInstr,
+                                   LastUpdate, RegUnits, IncrementalUpdate);
+          if (MBB->size() > inc_threshold)
+            // Use incremental depth updates for basic blocks above the
+            // threshold.
+            IncrementalUpdate = true;
+          else
+            MinInstr->invalidate(MBB);
+
           // Eagerly stop after the first pattern fires.
           Changed = true;
           break;
@@ -467,6 +522,8 @@
     }
   }
 
+  if (Changed)
+    Traces->invalidate(MBB);
   return Changed;
 }
Index: test/CodeGen/AArch64/machine-combiner.ll
===================================================================
--- test/CodeGen/AArch64/machine-combiner.ll
+++ test/CodeGen/AArch64/machine-combiner.ll
@@ -1,5 +1,10 @@
 ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -disable-post-ra < %s | FileCheck %s
 
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
+; RUN:   -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
+
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
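
These RUN lines (and the ones in the X86 tests below) force the incremental
path by setting the threshold to 0. In a build with assertions, one way to
watch the cost model switch is via the pass debug output; this invocation
assumes the pass's DEBUG_TYPE is "machine-combiner":

    llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
        -disable-post-ra -machine-combiner-inc-threshold=0 \
        -debug-only=machine-combiner test/CodeGen/AArch64/machine-combiner.ll \
        -o /dev/null

After the first combine fires in a block, the debug lines should change from
"RootDepth + RootLatency + RootSlack" to "RootDepth + RootLatency", i.e. the
SlackIsAccurate=false path shown earlier.
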
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -1,6 +1,11 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
 
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-inc-threshold=0 < %s | FileCheck %s --check-prefix=AVX
+
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
Index: test/CodeGen/X86/mul-constant-result.ll
===================================================================
--- test/CodeGen/X86/mul-constant-result.ll
+++ test/CodeGen/X86/mul-constant-result.ll
@@ -1,6 +1,9 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell| FileCheck %s --check-prefix=X64-HSW
+
+; Incremental updates of the instruction depths should be enough for this test
+; case.
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=haswell -machine-combiner-inc-threshold=0 | FileCheck %s --check-prefix=X64-HSW
 
 ; Function Attrs: norecurse nounwind readnone uwtable
 define i32 @mult(i32, i32) local_unnamed_addr #0 {