Index: lib/CodeGen/MachineCombiner.cpp
===================================================================
--- lib/CodeGen/MachineCombiner.cpp
+++ lib/CodeGen/MachineCombiner.cpp
@@ -67,10 +67,11 @@
   unsigned getLatency(MachineInstr *Root, MachineInstr *NewRoot,
                       MachineTraceMetrics::Trace BlockTrace);
   bool
-  preservesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
+  improvesCriticalPathLen(MachineBasicBlock *MBB, MachineInstr *Root,
                            MachineTraceMetrics::Trace BlockTrace,
                            SmallVectorImpl<MachineInstr *> &InsInstrs,
-                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg);
+                           DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+                           bool NewCodeHasLessInsts);
   bool preservesResourceLen(MachineBasicBlock *MBB,
                             MachineTraceMetrics::Trace BlockTrace,
                             SmallVectorImpl<MachineInstr *> &InsInstrs,
@@ -208,19 +209,24 @@
   return NewRootLatency;
 }
 
-/// True when the new instruction sequence does not
-/// lengthen the critical path. The DAGCombine code sequence ends in MI
-/// (Machine Instruction) Root. The new code sequence ends in MI NewRoot. A
-/// necessary condition for the new sequence to replace the old sequence is that
-/// it cannot lengthen the critical path. This is decided by the formula
+/// True when the new instruction sequence does not lengthen the critical path
+/// and the new sequence has less instructions or the new sequence improves the
+/// critical path.
+/// The DAGCombine code sequence ends in MI (Machine Instruction) Root.
+/// The new code sequence ends in MI NewRoot. A necessary condition for the new
+/// sequence to replace the old sequence is that it cannot lengthen the critical
+/// path. This is decided by the formula:
 /// (NewRootDepth + NewRootLatency) <= (RootDepth + RootLatency + RootSlack)).
-/// The slack is the number of cycles Root can be delayed before the critical
-/// patch becomes longer.
-bool MachineCombiner::preservesCriticalPathLen(
+/// If the new sequence has an equal length critical path but does not reduce
+/// the number of instructions (NewCodeHasLessInsts is false), then it is not
+/// considered an improvement. The slack is the number of cycles Root can be
+/// delayed before the critical patch becomes longer.
+bool MachineCombiner::improvesCriticalPathLen(
     MachineBasicBlock *MBB, MachineInstr *Root,
     MachineTraceMetrics::Trace BlockTrace,
     SmallVectorImpl<MachineInstr *> &InsInstrs,
-    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) {
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg,
+    bool NewCodeHasLessInsts) {
 
   assert(TSchedModel.hasInstrSchedModel() && "Missing machine model\n");
   // NewRoot is the last instruction in the \p InsInstrs vector.
@@ -245,9 +251,13 @@
         dbgs() << " RootDepth + RootLatency + RootSlack "
                << RootDepth + RootLatency + RootSlack << "\n";);
 
-  /// True when the new sequence does not lengthen the critical path.
-  return ((NewRootDepth + NewRootLatency) <=
-          (RootDepth + RootLatency + RootSlack));
+  unsigned NewCycleCount = NewRootDepth + NewRootLatency;
+  unsigned OldCycleCount = RootDepth + RootLatency + RootSlack;
+  
+  if (NewCodeHasLessInsts)
+    return NewCycleCount <= OldCycleCount;
+  else
+    return NewCycleCount < OldCycleCount;
 }
 
 /// helper routine to convert instructions into SC
@@ -359,18 +369,21 @@
         Traces->verifyAnalysis();
         TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                         InstrIdxForVirtReg);
+        unsigned NewInstCount = InsInstrs.size();
+        unsigned OldInstCount = DelInstrs.size();
         // Found pattern, but did not generate alternative sequence.
         // This can happen e.g. when an immediate could not be materialized
         // in a single instruction.
-        if (!InsInstrs.size())
+        if (!NewInstCount)
           continue;
         // Substitute when we optimize for codesize and the new sequence has
         // fewer instructions OR
         // the new sequence neither lengthens the critical path nor increases
         // resource pressure.
-        if (doSubstitute(InsInstrs.size(), DelInstrs.size()) ||
-            (preservesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                      InstrIdxForVirtReg) &&
+        if (doSubstitute(NewInstCount, OldInstCount) ||
+            (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
+                                      InstrIdxForVirtReg,
+                                      NewInstCount < OldInstCount) &&
              preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
           for (auto *InstrPtr : InsInstrs)
             MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -6334,22 +6334,11 @@
   return isHighLatencyDef(DefMI->getOpcode());
 }
 
-/// If the input instruction is part of a chain of dependent ops that are
-/// suitable for reassociation, return the earlier instruction in the sequence
-/// that defines its first operand, otherwise return a nullptr.
-/// If the instruction's operands must be commuted to be considered a
-/// reassociation candidate, Commuted will be set to true.
-static MachineInstr *isReassocCandidate(const MachineInstr &Inst,
-                                        unsigned AssocOpcode,
-                                        bool checkPrevOneUse,
-                                        bool &Commuted) {
-  if (Inst.getOpcode() != AssocOpcode)
-    return nullptr;
-  
-  MachineOperand Op1 = Inst.getOperand(1);
-  MachineOperand Op2 = Inst.getOperand(2);
-  
-  const MachineBasicBlock *MBB = Inst.getParent();
+static bool hasVirtualRegDefsInBasicBlock(const MachineInstr &Inst,
+                                          const MachineBasicBlock *MBB) {
+  assert(Inst.getNumOperands() == 3 && "Reassociation needs binary operators");
+  const MachineOperand &Op1 = Inst.getOperand(1);
+  const MachineOperand &Op2 = Inst.getOperand(2);
   const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
 
   // We need virtual register definitions.
@@ -6359,53 +6348,56 @@
     MI1 = MRI.getUniqueVRegDef(Op1.getReg());
   if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
     MI2 = MRI.getUniqueVRegDef(Op2.getReg());
-  
+
   // And they need to be in the trace (otherwise, they won't have a depth).
-  if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
-    return nullptr;
-  
-  Commuted = false;
-  if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
+  if (MI1 && MI2 && MI1->getParent() == MBB && MI2->getParent() == MBB)
+    return true;
+
+  return false;
+}
+
+static bool hasReassocSibling(const MachineInstr &Inst, bool &Commuted) {
+  const MachineBasicBlock *MBB = Inst.getParent();
+  const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineInstr *MI1 = MRI.getUniqueVRegDef(Inst.getOperand(1).getReg());
+  MachineInstr *MI2 = MRI.getUniqueVRegDef(Inst.getOperand(2).getReg());
+  unsigned AssocOpcode = Inst.getOpcode();
+
+  // If only one operand has the same opcode and it's the second source operand,
+  // the operands must be commuted.
+  Commuted = MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode;
+  if (Commuted)
     std::swap(MI1, MI2);
-    Commuted = true;
-  }
 
-  // Avoid reassociating operands when it won't provide any benefit. If both
-  // operands are produced by instructions of this type, we may already
-  // have the optimal sequence.
-  if (MI2->getOpcode() == AssocOpcode)
-    return nullptr;
-  
-  // The instruction must only be used by the other instruction that we
-  // reassociate with.
-  if (checkPrevOneUse && !MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
-    return nullptr;
-  
-  // We must match a simple chain of dependent ops.
-  // TODO: This check is not necessary for the earliest instruction in the
-  // sequence. Instead of a sequence of 3 dependent instructions with the same
-  // opcode, we only need to find a sequence of 2 dependent instructions with
-  // the same opcode plus 1 other instruction that adds to the height of the
-  // trace.
-  if (MI1->getOpcode() != AssocOpcode)
-    return nullptr;
+  // 1. The previous instruction must be the same type as Inst.
+  // 2. The previous instruction must have virtual register definitions for its
+  //    operands in the same basic block as Inst.
+  // 3. The previous instruction's result must only be used by Inst.
+  if (MI1->getOpcode() == AssocOpcode &&
+      hasVirtualRegDefsInBasicBlock(*MI1, MBB) &&
+      MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+    return true;
   
-  return MI1;
+  return false;
 }
 
-/// Select a pattern based on how the operands of each associative operation
-/// need to be commuted.
-static MachineCombinerPattern::MC_PATTERN getPattern(bool CommutePrev,
-                                                     bool CommuteRoot) {
-  if (CommutePrev) {
-    if (CommuteRoot)
-      return MachineCombinerPattern::MC_REASSOC_XA_YB;
-    return MachineCombinerPattern::MC_REASSOC_XA_BY;
-  } else {
-    if (CommuteRoot)
-      return MachineCombinerPattern::MC_REASSOC_AX_YB;
-    return MachineCombinerPattern::MC_REASSOC_AX_BY;
-  }
+/// Return true if the input instruction is part of a chain of dependent ops
+/// that are suitable for reassociation, otherwise return false.
+/// If the instruction's operands must be commuted to have a previous
+/// instruction of the same type define the first source operand, Commuted will
+/// be set to true.
+static bool isReassocCandidate(const MachineInstr &Inst, unsigned AssocOpcode,
+                               bool &Commuted) {
+  // 1. The instruction must have the correct type.
+  // 2. The instruction must have virtual register definitions for its
+  //    operands in the same basic block.
+  // 3. The instruction must have a reassociatable sibling.
+  if (Inst.getOpcode() == AssocOpcode &&
+      hasVirtualRegDefsInBasicBlock(Inst, Inst.getParent()) &&
+      hasReassocSibling(Inst, Commuted))
+    return true;
+
+  return false;
 }
 
 bool X86InstrInfo::getMachineCombinerPatterns(MachineInstr &Root,
@@ -6413,26 +6405,35 @@
   if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
     return false;
 
+  // TODO: There is nothing x86-specific here except the instruction type.
+  // This logic could be hoisted into the machine combiner pass itself.
+
+  // Look for this reassociation pattern:
+  //   B = A op X (Prev)
+  //   C = B op Y (Root)
+
   // TODO: There are many more associative instruction types to match:
   //       1. Other forms of scalar FP add (non-AVX)
   //       2. Other data types (double, integer, vectors)
   //       3. Other math / logic operations (mul, and, or)
   unsigned AssocOpcode = X86::VADDSSrr;
 
-  // TODO: There is nothing x86-specific here except the instruction type.
-  // This logic could be hoisted into the machine combiner pass itself.
-  bool CommuteRoot;
-  if (MachineInstr *Prev = isReassocCandidate(Root, AssocOpcode, true,
-                                              CommuteRoot)) {
-    bool CommutePrev;
-    if (isReassocCandidate(*Prev, AssocOpcode, false, CommutePrev)) {
-      // We found a sequence of instructions that may be suitable for a
-      // reassociation of operands to increase ILP.
-      Patterns.push_back(getPattern(CommutePrev, CommuteRoot));
-      return true;
+  bool Commute = false;
+  if (isReassocCandidate(Root, AssocOpcode, Commute)) {
+    // We found a sequence of instructions that may be suitable for a
+    // reassociation of operands to increase ILP. Specify each commutation
+    // possibility for the Prev instruction in the sequence and let the
+    // machine combiner decide if changing the operands is worthwhile.
+    if (Commute) {
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_YB);
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_YB);
+    } else {
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_AX_BY);
+      Patterns.push_back(MachineCombinerPattern::MC_REASSOC_XA_BY);
     }
+    return true;
   }
-  
+
   return false;
 }
 
@@ -6525,14 +6526,16 @@
 
   // Select the previous instruction in the sequence based on the input pattern.
   MachineInstr *Prev = nullptr;
-  if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_BY ||
-      Pattern == MachineCombinerPattern::MC_REASSOC_XA_BY)
-    Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
-  else if (Pattern == MachineCombinerPattern::MC_REASSOC_AX_YB ||
-           Pattern == MachineCombinerPattern::MC_REASSOC_XA_YB)
-    Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
-  else
-    llvm_unreachable("Unknown pattern for machine combiner");
+  switch (Pattern) {
+    case MachineCombinerPattern::MC_REASSOC_AX_BY:
+    case MachineCombinerPattern::MC_REASSOC_XA_BY:
+      Prev = MRI.getUniqueVRegDef(Root.getOperand(1).getReg());
+      break;
+    case MachineCombinerPattern::MC_REASSOC_AX_YB:
+    case MachineCombinerPattern::MC_REASSOC_XA_YB:
+      Prev = MRI.getUniqueVRegDef(Root.getOperand(2).getReg());
+  }
+  assert(Prev && "Unknown pattern for machine combiner");
   
   reassociateOps(Root, *Prev, Pattern, InsInstrs, DelInstrs, InstIdxForVirtReg);
   return;
Index: test/CodeGen/X86/fp-fast.ll
===================================================================
--- test/CodeGen/X86/fp-fast.ll
+++ test/CodeGen/X86/fp-fast.ll
@@ -114,81 +114,3 @@
   ret float %t2
 }
 
-; Verify that the first two adds are independent regardless of how the inputs are 
-; commuted. The destination registers are used as source registers for the third add.
-
-define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %t1, %x3
-  ret float %t2
-}
-
-define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %t1, %x3
-  ret float %t2
-}
-
-define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %x3, %t1
-  ret float %t2
-}
-
-define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
-; CHECK-LABEL: reassociate_adds4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %x2, %t0
-  %t2 = fadd float %x3, %t1
-  ret float %t2
-}
-
-; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
-; produced because that would cost more compile time.
-
-define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
-; CHECK-LABEL: reassociate_adds5:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    vaddss %xmm7, %xmm6, %xmm1
-; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
-; CHECK-NEXT:    retq
-  %t0 = fadd float %x0, %x1
-  %t1 = fadd float %t0, %x2
-  %t2 = fadd float %t1, %x3
-  %t3 = fadd float %t2, %x4
-  %t4 = fadd float %t3, %x5
-  %t5 = fadd float %t4, %x6
-  %t6 = fadd float %t5, %x7
-  ret float %t6
-}
Index: test/CodeGen/X86/machine-combiner.ll
===================================================================
--- test/CodeGen/X86/machine-combiner.ll
+++ test/CodeGen/X86/machine-combiner.ll
@@ -0,0 +1,99 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx -enable-unsafe-fp-math < %s | FileCheck %s
+
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds5(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
+; CHECK-NEXT:    vaddss %xmm6, %xmm1, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm7, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  %t3 = fadd float %t2, %x4
+  %t4 = fadd float %t3, %x5
+  %t5 = fadd float %t4, %x6
+  %t6 = fadd float %t5, %x7
+  ret float %t6
+}
+
+; Verify that we only need two associative operations to reassociate the operands.
+; Also, we should reassociate such that the result of the high latency division
+; is used by the final 'add' rather than reassociating the %x3 operand with the
+; division. The latter reassociation would not improve anything.
+ 
+define float @reassociate_adds6(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fdiv float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+