Index: llvm/trunk/lib/CodeGen/MachineCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineCombiner.cpp
+++ llvm/trunk/lib/CodeGen/MachineCombiner.cpp
@@ -39,6 +39,20 @@
               cl::desc("Incremental depth computation will be used for basic "
                        "blocks with more instructions."),
               cl::init(500));
 
+#ifdef EXPENSIVE_CHECKS
+static cl::opt<bool> VerifyPatternOrder(
+    "machine-combiner-verify-pattern-order", cl::Hidden,
+    cl::desc(
+        "Verify that the generated patterns are ordered by increasing latency"),
+    cl::init(true));
+#else
+static cl::opt<bool> VerifyPatternOrder(
+    "machine-combiner-verify-pattern-order", cl::Hidden,
+    cl::desc(
+        "Verify that the generated patterns are ordered by increasing latency"),
+    cl::init(false));
+#endif
+
 namespace {
 class MachineCombiner : public MachineFunctionPass {
   const TargetInstrInfo *TII;
@@ -85,6 +99,14 @@
                       SmallVectorImpl<MachineInstr *> &DelInstrs);
   void instr2instrSC(SmallVectorImpl<MachineInstr *> &Instrs,
                      SmallVectorImpl<const MCSchedClassDesc *> &InstrsSC);
+  std::pair<unsigned, unsigned>
+  getLatenciesForInstrSequences(MachineInstr &MI,
+                                SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                SmallVectorImpl<MachineInstr *> &DelInstrs,
+                                MachineTraceMetrics::Trace BlockTrace);
+
+  void verifyPatternOrder(MachineBasicBlock *MBB, MachineInstr &Root,
+                          SmallVector<MachineCombinerPattern, 16> &Patterns);
 };
 }
@@ -242,6 +264,29 @@
   }
 }
 
+/// Estimate the latency of the new and original instruction sequences by
+/// summing up the latencies of the inserted and deleted instructions. This
+/// assumes that the inserted and deleted instructions form dependent
+/// instruction chains, which might not hold in all cases.
+std::pair<unsigned, unsigned> MachineCombiner::getLatenciesForInstrSequences(
+    MachineInstr &MI, SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    MachineTraceMetrics::Trace BlockTrace) {
+  assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
+  unsigned NewRootLatency = 0;
+  // NewRoot is the last instruction in the \p InsInstrs vector.
+  MachineInstr *NewRoot = InsInstrs.back();
+  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
+    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
+  NewRootLatency += getLatency(&MI, NewRoot, BlockTrace);
+
+  unsigned RootLatency = 0;
+  for (auto I : DelInstrs)
+    RootLatency += TSchedModel.computeInstrLatency(I);
+
+  return {NewRootLatency, RootLatency};
+}
+
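Note (illustration, not part of the patch): the helper's bookkeeping can be sketched outside of LLVM. In the standalone C++ toy below, an instruction is reduced to a bare latency value; the names Latency and latenciesForSequences are invented for this sketch, while the real pass queries TargetSchedModel::computeInstrLatency for all but the last inserted instruction and the trace-based getLatency for the last one.

    #include <cassert>
    #include <numeric>
    #include <utility>
    #include <vector>

    // Toy stand-in: an instruction is modeled by its latency in cycles.
    using Latency = unsigned;

    // Mirrors the shape of getLatenciesForInstrSequences: sum the latencies
    // of the inserted chain and of the deleted chain. Like the pass, this
    // assumes each sequence is one dependent chain, which can over-estimate
    // sequences containing independent instructions.
    std::pair<Latency, Latency>
    latenciesForSequences(const std::vector<Latency> &InsInstrs,
                          const std::vector<Latency> &DelInstrs) {
      assert(!InsInstrs.empty() && "Only support sequences that insert instrs.");
      Latency NewRootLatency =
          std::accumulate(InsInstrs.begin(), InsInstrs.end(), Latency(0));
      Latency RootLatency =
          std::accumulate(DelInstrs.begin(), DelInstrs.end(), Latency(0));
      return {NewRootLatency, RootLatency};
    }

For example, latenciesForSequences({2, 3, 4}, {6, 4}) yields {9, 10}: under the chain assumption, the new sequence would save one cycle over the old one.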
 /// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
 /// The new code sequence ends in MI NewRoot. A necessary condition for the new
 /// sequence to replace the old sequence is that it cannot lengthen the critical
@@ -257,10 +302,6 @@
                                           bool SlackIsAccurate) {
   assert(TSchedModel.hasInstrSchedModelOrItineraries() &&
          "Missing machine model\n");
-  // NewRoot is the last instruction in the \p InsInstrs vector.
-  unsigned NewRootIdx = InsInstrs.size() - 1;
-  MachineInstr *NewRoot = InsInstrs[NewRootIdx];
-
   // Get depth and latency of NewRoot and Root.
   unsigned NewRootDepth = getDepth(InsInstrs, InstrIdxForVirtReg, BlockTrace);
   unsigned RootDepth = BlockTrace.getInstrCycles(*Root).Depth;
@@ -282,17 +323,9 @@
   // even if the instruction depths (data dependency cycles) become worse.
 
   // Account for the latency of the inserted and deleted instructions by
-  // adding up their latencies. This assumes that the inserted and deleted
-  // instructions are dependent instruction chains, which might not hold
-  // in all cases.
-  unsigned NewRootLatency = 0;
-  for (unsigned i = 0; i < InsInstrs.size() - 1; i++)
-    NewRootLatency += TSchedModel.computeInstrLatency(InsInstrs[i]);
-  NewRootLatency += getLatency(Root, NewRoot, BlockTrace);
-
-  unsigned RootLatency = 0;
-  for (auto I : DelInstrs)
-    RootLatency += TSchedModel.computeInstrLatency(I);
+  unsigned NewRootLatency, RootLatency;
+  std::tie(NewRootLatency, RootLatency) =
+      getLatenciesForInstrSequences(*Root, InsInstrs, DelInstrs, BlockTrace);
 
   unsigned RootSlack = BlockTrace.getInstrSlack(*Root);
   unsigned NewCycleCount = NewRootDepth + NewRootLatency;
@@ -409,6 +442,34 @@
   NumInstCombined++;
 }
 
+// Check that the difference between the original and new latency does not
+// increase for later patterns. This helps to discover sub-optimal pattern
+// orderings.
+void MachineCombiner::verifyPatternOrder(
+    MachineBasicBlock *MBB, MachineInstr &Root,
+    SmallVector<MachineCombinerPattern, 16> &Patterns) {
+  long PrevLatencyDiff = std::numeric_limits<long>::max();
+  for (auto P : Patterns) {
+    SmallVector<MachineInstr *, 16> InsInstrs;
+    SmallVector<MachineInstr *, 16> DelInstrs;
+    DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
+    TII->genAlternativeCodeSequence(Root, P, InsInstrs, DelInstrs,
+                                    InstrIdxForVirtReg);
+    // Found a pattern, but did not generate an alternative sequence. This can
+    // happen e.g. when an immediate could not be materialized in a single
+    // instruction.
+    if (InsInstrs.empty() || !TSchedModel.hasInstrSchedModelOrItineraries())
+      continue;
+
+    unsigned NewRootLatency, RootLatency;
+    std::tie(NewRootLatency, RootLatency) = getLatenciesForInstrSequences(
+        Root, InsInstrs, DelInstrs, MinInstr->getTrace(MBB));
+    long CurrentLatencyDiff = ((long)RootLatency) - ((long)NewRootLatency);
+    assert(CurrentLatencyDiff <= PrevLatencyDiff &&
+           "Current pattern is better than previous pattern.");
+    PrevLatencyDiff = CurrentLatencyDiff;
+  }
+}
+
 /// Substitute a slow code sequence with a faster one by
 /// evaluating an instruction combining pattern.
 /// The prototype of such a pattern is MUL + ADD -> MADD. Performs instruction
@@ -459,11 +520,16 @@
       // The algorithm does not try to evaluate all patterns and pick the best.
       // This is only an artificial restriction though. In practice there is
       // mostly one pattern, and getMachineCombinerPatterns() can order patterns
-      // based on an internal cost heuristic.
+      // based on an internal cost heuristic. If
+      // machine-combiner-verify-pattern-order is enabled, all patterns are
+      // checked to ensure later patterns do not provide better latency savings.
       if (!TII->getMachineCombinerPatterns(MI, Patterns))
         continue;
 
+      if (VerifyPatternOrder)
+        verifyPatternOrder(MBB, MI, Patterns);
+
       for (auto P : Patterns) {
         SmallVector<MachineInstr *, 16> InsInstrs;
         SmallVector<MachineInstr *, 16> DelInstrs;
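Note (again illustration only): the invariant verifyPatternOrder enforces can be phrased over the toy model above. ToyPattern and verifyToyPatternOrder are invented names; in the pass, the inserted/deleted sequences come from TII->genAlternativeCodeSequence and the latencies from the scheduling model and trace.

    #include <cassert>
    #include <limits>
    #include <tuple>
    #include <vector>

    // A candidate pattern expands to an inserted chain and a deleted chain
    // (reusing Latency and latenciesForSequences from the sketch above).
    struct ToyPattern {
      std::vector<Latency> InsInstrs;
      std::vector<Latency> DelInstrs;
    };

    // Mirrors verifyPatternOrder: the latency saving of each pattern,
    // RootLatency - NewRootLatency, must not grow as we walk the list, i.e.
    // targets should emit their most profitable patterns first.
    void verifyToyPatternOrder(const std::vector<ToyPattern> &Patterns) {
      long PrevLatencyDiff = std::numeric_limits<long>::max();
      for (const ToyPattern &P : Patterns) {
        // Skip patterns that matched but produced no alternative sequence.
        if (P.InsInstrs.empty())
          continue;
        Latency NewRootLatency, RootLatency;
        std::tie(NewRootLatency, RootLatency) =
            latenciesForSequences(P.InsInstrs, P.DelInstrs);
        long CurrentLatencyDiff = (long)RootLatency - (long)NewRootLatency;
        assert(CurrentLatencyDiff <= PrevLatencyDiff &&
               "Current pattern is better than previous pattern.");
        PrevLatencyDiff = CurrentLatencyDiff;
      }
    }

Concretely, a pattern list whose first entry saves 3 cycles (e.g. Ins {1, 1}, Del {5}) and whose second saves 5 (Ins {1}, Del {6}) trips the assertion, flagging that the target returned the patterns in a sub-optimal order.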
Index: llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-combine-fmul-fsub.mir
@@ -1,7 +1,7 @@
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
-# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=cortex-a57 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=UNPROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=falkor -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=exynos-m1 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
+# RUN: llc -run-pass=machine-combiner -o - -mtriple=aarch64-unknown-linux -mcpu=thunderx2t99 -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true %s | FileCheck --check-prefixes=PROFITABLE,ALL %s
 #
 name: f1_2s
 registers:
@@ -26,8 +26,8 @@
 # UNPROFITABLE-NEXT: FSUBv2f32 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2s
-# PROFITABLE: %5:fpr64 = FNEGv2f32 %2
-# PROFITABLE-NEXT: FMLAv2f32 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr64 = FNEGv2f32 %2
+# PROFITABLE-NEXT: FMLAv2f32 killed [[R1]], %0, %1
 ---
 name: f1_4s
 registers:
@@ -52,8 +52,8 @@
 # UNPROFITABLE-NEXT: FSUBv4f32 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_4s
-# PROFITABLE: %5:fpr128 = FNEGv4f32 %2
-# PROFITABLE-NEXT: FMLAv4f32 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv4f32 %2
+# PROFITABLE-NEXT: FMLAv4f32 killed [[R1]], %0, %1
 ---
 name: f1_2d
 registers:
@@ -78,8 +78,8 @@
 # UNPROFITABLE-NEXT: FSUBv2f64 killed %3, %2
 #
 # PROFITABLE-LABEL: name: f1_2d
-# PROFITABLE: %5:fpr128 = FNEGv2f64 %2
-# PROFITABLE-NEXT: FMLAv2f64 killed %5, %0, %1
+# PROFITABLE: [[R1:%[0-9]+]]:fpr128 = FNEGv2f64 %2
+# PROFITABLE-NEXT: FMLAv2f64 killed [[R1]], %0, %1
 ---
 name: f1_both_fmul_2s
 registers:
Index: llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
+++ llvm/trunk/test/CodeGen/AArch64/machine-combiner.ll
@@ -3,7 +3,7 @@
 ; Incremental updates of the instruction depths should be enough for this test
 ; case.
 ; RUN: llc -mtriple=aarch64-gnu-linux -mcpu=cortex-a57 -enable-unsafe-fp-math \
-; RUN:     -disable-post-ra -machine-combiner-inc-threshold=0 < %s | FileCheck %s
+; RUN:     -disable-post-ra -machine-combiner-inc-threshold=0 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s
 
 ; Verify that the first two adds are independent regardless of how the inputs are
 ; commuted. The destination registers are used as source registers for the third add.
Index: llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir
+++ llvm/trunk/test/CodeGen/AArch64/machine-combiner.mir
@@ -1,6 +1,6 @@
 # RUN: llc -mtriple=aarch64-none-linux-gnu -mcpu=cortex-a57 -enable-unsafe-fp-math \
 # RUN:     -run-pass machine-combiner -machine-combiner-inc-threshold=0 \
-# RUN:     -verify-machineinstrs -o - %s | FileCheck %s
+# RUN:     -machine-combiner-verify-pattern-order=true -verify-machineinstrs -o - %s | FileCheck %s
 ---
 # Test incremental depth updates succeed when triggered after the removal of
 # the first instruction in a basic block.
Index: llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll
+++ llvm/trunk/test/CodeGen/X86/machine-combiner-int-vec.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse2 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=AVX
 
 ; Verify that 128-bit vector logical ops are reassociated.
Index: llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll
+++ llvm/trunk/test/CodeGen/X86/machine-combiner-int.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -o - | FileCheck %s --check-prefix=DEAD
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -machine-combiner-verify-pattern-order=true | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -stop-after machine-combiner -machine-combiner-verify-pattern-order=true -o - | FileCheck %s --check-prefix=DEAD
 
 ; Verify that integer multiplies are reassociated. The first multiply in
 ; each test should be independent of the result of the preceding add (lea).
Index: llvm/trunk/test/CodeGen/X86/machine-combiner.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/machine-combiner.ll
+++ llvm/trunk/test/CodeGen/X86/machine-combiner.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math -machine-combiner-verify-pattern-order=true < %s | FileCheck %s --check-prefix=AVX
 
 ; Incremental updates of the instruction depths should be enough for this test
 ; case.
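A closing note on the new option's default: the two cl::opt declarations at the top of the patch differ only in their cl::init value, keyed off EXPENSIVE_CHECKS so that the verification is on by default in expensive-checks builds and opt-in otherwise. A single-declaration formulation with the same behavior — a sketch of an alternative, not what the patch does — could look like:

    #ifdef EXPENSIVE_CHECKS
    constexpr bool VerifyPatternOrderDefault = true;  // on under expensive checks
    #else
    constexpr bool VerifyPatternOrderDefault = false; // opt-in otherwise
    #endif

    static cl::opt<bool> VerifyPatternOrder(
        "machine-combiner-verify-pattern-order", cl::Hidden,
        cl::desc(
            "Verify that the generated patterns are ordered by increasing latency"),
        cl::init(VerifyPatternOrderDefault));

Either way, the updated tests pass -machine-combiner-verify-pattern-order=true explicitly, so they exercise the check regardless of the build configuration.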