Index: lib/Target/X86/X86InstrInfo.h
===================================================================
--- lib/Target/X86/X86InstrInfo.h
+++ lib/Target/X86/X86InstrInfo.h
@@ -26,6 +26,16 @@
 class X86RegisterInfo;
 class X86Subtarget;
 
+namespace MachineCombinerPattern {
+enum MC_PATTERN : int {
+  MC_NONE = 0,
+  MC_REASSOC_1 = 1,
+  MC_REASSOC_2 = 2,
+  MC_REASSOC_3 = 3,
+  MC_REASSOC_4 = 4,
+};
+} // end namespace MachineCombinerPattern
+
 namespace X86 {
   // X86 specific condition code. These correspond to X86_*_COND in
   // X86InstrInfo.td. They must be kept in synch.
@@ -426,6 +436,26 @@
                               const MachineInstr *UseMI,
                               unsigned UseIdx) const override;
 
+  bool useMachineCombiner() const override {
+    return true;
+  }
+
+  /// Return true when there is potentially a faster code sequence
+  /// for an instruction chain ending in <Root>. All potential patterns are
+  /// output in the <Pattern> array.
+  bool hasPattern(
+      MachineInstr &Root,
+      SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &P) const override;
+
+  /// When hasPattern() finds a pattern, this function generates the
+  /// instructions that could replace the original code sequence.
+  void genAlternativeCodeSequence(
+      MachineInstr &Root, MachineCombinerPattern::MC_PATTERN P,
+      SmallVectorImpl<MachineInstr *> &InsInstrs,
+      SmallVectorImpl<MachineInstr *> &DelInstrs,
+      DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
   /// analyzeCompare - For a comparison instruction, return the source registers
   /// in SrcReg and SrcReg2 if having two register operands, and the value it
   /// compares against in CmpValue. Return true if the comparison instruction
Index: lib/Target/X86/X86InstrInfo.cpp
===================================================================
--- lib/Target/X86/X86InstrInfo.cpp
+++ lib/Target/X86/X86InstrInfo.cpp
@@ -6223,6 +6223,257 @@
   return isHighLatencyDef(DefMI->getOpcode());
 }
 
+// TODO: There is nothing x86-specific here except the instruction type.
+// This logic could be hoisted into the machine combiner pass itself.
+bool X86InstrInfo::hasPattern(MachineInstr &Root,
+        SmallVectorImpl<MachineCombinerPattern::MC_PATTERN> &Pattern) const {
+  unsigned Opc = Root.getOpcode();
+  MachineBasicBlock *MBB = Root.getParent();
+  MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+  MachineFunction &MF = *(MBB->getParent());
+
+  // TODO: Ideally, we would use instruction-level fast-math-flags instead of a
+  // global parameter.
+  if (!MF.getTarget().Options.UnsafeFPMath)
+    return false;
+
+  // TODO: There are many more associative instruction types to match:
+  //   1. Other forms of scalar FP add (non-AVX)
+  //   2. Other data types (double, integer, vectors)
+  //   3. Other math / logic operations (mul, and, or)
+  unsigned AssocOpcode = X86::VADDSSrr;
+
+  if (Opc != AssocOpcode)
+    return false;
+
+  MachineOperand Op1 = Root.getOperand(1);
+  MachineOperand Op2 = Root.getOperand(2);
+
+  // We need virtual register definitions.
+  MachineInstr *MI1 = nullptr;
+  MachineInstr *MI2 = nullptr;
+  if (Op1.isReg() && TargetRegisterInfo::isVirtualRegister(Op1.getReg()))
+    MI1 = MRI.getUniqueVRegDef(Op1.getReg());
+  if (Op2.isReg() && TargetRegisterInfo::isVirtualRegister(Op2.getReg()))
+    MI2 = MRI.getUniqueVRegDef(Op2.getReg());
+
+  // And they need to be in the trace (otherwise, they won't have a depth).
+  if (!MI1 || !MI2 || MI1->getParent() != MBB || MI2->getParent() != MBB)
+    return false;
+
+  bool CommuteRoot = false;
+  if (MI1->getOpcode() != AssocOpcode && MI2->getOpcode() == AssocOpcode) {
+    std::swap(MI1, MI2);
+    CommuteRoot = true;
+  }
+
+  // The instruction must only be used by the other instruction that we
+  // reassociate with.
+  if (!MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg()))
+    return false;
+
+  // We must match a simple chain of dependent ops.
+  if (MI1->getOpcode() != AssocOpcode)
+    return false;
+
+  // Avoid reassociating operands when it won't provide any benefit. If both
+  // operands are produced by instructions of this type, we may already
+  // have the optimal sequence.
+  if (MI2->getOpcode() == AssocOpcode)
+    return false;
+
+  // Look for a third dependent instruction in the sequence.
+  MachineOperand Op11 = MI1->getOperand(1);
+  MachineOperand Op12 = MI1->getOperand(2);
+
+  // We need virtual register definitions.
+  MachineInstr *MI11 = nullptr;
+  MachineInstr *MI12 = nullptr;
+  if (Op11.isReg() && TargetRegisterInfo::isVirtualRegister(Op11.getReg()))
+    MI11 = MRI.getUniqueVRegDef(Op11.getReg());
+  if (Op12.isReg() && TargetRegisterInfo::isVirtualRegister(Op12.getReg()))
+    MI12 = MRI.getUniqueVRegDef(Op12.getReg());
+
+  // And they need to be in the trace (otherwise, they won't have a depth).
+  if (!MI11 || !MI12 || MI11->getParent() != MBB || MI12->getParent() != MBB)
+    return false;
+
+  bool CommutePrev = false;
+  if (MI11->getOpcode() != AssocOpcode && MI12->getOpcode() == AssocOpcode) {
+    std::swap(MI11, MI12);
+    CommutePrev = true;
+  }
+
+  // TODO: This restriction can be relaxed. The third instruction in the
+  // sequence could be any other instruction that adds to the height of this
+  // trace.
+  if (MI11->getOpcode() != AssocOpcode)
+    return false;
+
+  // Avoid reassociating operands when it won't provide any benefit. If the
+  // earlier instruction's operands are both produced by this type of
+  // instruction, then we might have just optimized it.
+  if (MI12->getOpcode() == AssocOpcode)
+    return false;
+
+  // We found a sequence of instructions that may be suitable for a
+  // reassociation of operands to increase ILP. Select the appropriate codegen
+  // pattern based on the commutation of operands found here.
+  if (CommutePrev) {
+    if (CommuteRoot)
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_4);
+    else
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_3);
+  } else {
+    if (CommuteRoot)
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_2);
+    else
+      Pattern.push_back(MachineCombinerPattern::MC_REASSOC_1);
+  }
+
+  return true;
+}
+
+/// Attempt the following reassociation to reduce critical path length:
+///   A = (Anything)
+///   B = A op X (Prev)
+///   C = B op Y (Root)
+/// ===>
+///   A = (Anything)
+///   B = X op Y
+///   C = A op B
+static void reassociateOps(MachineInstr &Root,
+                           MachineOperand &OpA, MachineOperand &OpB,
+                           MachineOperand &OpC, MachineOperand &OpX,
+                           MachineOperand &OpY,
+                           SmallVectorImpl<MachineInstr *> &InsInstrs,
+                           SmallVectorImpl<MachineInstr *> &DelInstrs) {
+  unsigned RegA = OpA.getReg();
+  unsigned RegB = OpB.getReg();
+  unsigned RegC = OpC.getReg();
+  unsigned RegX = OpX.getReg();
+  unsigned RegY = OpY.getReg();
+
+  MachineFunction *MF = Root.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF->getRegInfo();
+  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
+  const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+  const TargetRegisterClass *RC = Root.getRegClassConstraint(0, TII, TRI);
+
+  if (TargetRegisterInfo::isVirtualRegister(RegA))
+    MRI.constrainRegClass(RegA, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegB))
+    MRI.constrainRegClass(RegB, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegC))
+    MRI.constrainRegClass(RegC, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegX))
+    MRI.constrainRegClass(RegX, RC);
+  if (TargetRegisterInfo::isVirtualRegister(RegY))
+    MRI.constrainRegClass(RegY, RC);
+
+  bool KillA = OpA.isKill();
+  bool KillB = OpB.isKill();
+  bool KillX = OpX.isKill();
+  bool KillY = OpY.isKill();
+
+  unsigned Opcode = Root.getOpcode();
+  MachineInstr &Prev = *(MRI.getUniqueVRegDef(OpB.getReg()));
+
+  // Create new instructions for insertion.
+  MachineInstrBuilder MIB1 =
+      BuildMI(*MF, Prev.getDebugLoc(), TII->get(Opcode), RegB)
+          .addReg(RegX, getKillRegState(KillX))
+          .addReg(RegY, getKillRegState(KillY));
+  InsInstrs.push_back(MIB1);
+
+  MachineInstrBuilder MIB2 =
+      BuildMI(*MF, Root.getDebugLoc(), TII->get(Opcode), RegC)
+          .addReg(RegA, getKillRegState(KillA))
+          .addReg(RegB, getKillRegState(KillB));
+  InsInstrs.push_back(MIB2);
+
+  // Record old instructions for deletion.
+  DelInstrs.push_back(&Prev);
+  DelInstrs.push_back(&Root);
+}
+
+void X86InstrInfo::genAlternativeCodeSequence(
+    MachineInstr &Root,
+    MachineCombinerPattern::MC_PATTERN Pattern,
+    SmallVectorImpl<MachineInstr *> &InsInstrs,
+    SmallVectorImpl<MachineInstr *> &DelInstrs,
+    DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+
+  MachineBasicBlock &MBB = *Root.getParent();
+  MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+
+  // This is an unfortunate amount of code to match commuted variants
+  // of the pattern we want to reassociate.
+  switch (Pattern) {
+  default:
+    assert(false && "Unknown pattern for machine combiner");
+    break;
+  case MachineCombinerPattern::MC_REASSOC_1: {
+    // A = (Anything)
+    // B = A op X (Prev)
+    // C = B op Y (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(1).getReg()));
+    MachineOperand &OpA = Prev.getOperand(1);
+    MachineOperand &OpB = Root.getOperand(1);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(2);
+    MachineOperand &OpY = Root.getOperand(2);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  case MachineCombinerPattern::MC_REASSOC_2: {
+    // A = (Anything)
+    // B = A op X (Prev)
+    // C = Y op B (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(2).getReg()));
+    MachineOperand &OpA = Prev.getOperand(1);
+    MachineOperand &OpB = Root.getOperand(2);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(2);
+    MachineOperand &OpY = Root.getOperand(1);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  case MachineCombinerPattern::MC_REASSOC_3: {
+    // A = (Anything)
+    // B = X op A (Prev)
+    // C = B op Y (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(1).getReg()));
+    MachineOperand &OpA = Prev.getOperand(2);
+    MachineOperand &OpB = Root.getOperand(1);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(1);
+    MachineOperand &OpY = Root.getOperand(2);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  case MachineCombinerPattern::MC_REASSOC_4: {
+    // A = (Anything)
+    // B = X op A (Prev)
+    // C = Y op B (Root)
+    MachineInstr &Prev = *(MRI.getUniqueVRegDef(Root.getOperand(2).getReg()));
+    MachineOperand &OpA = Prev.getOperand(2);
+    MachineOperand &OpB = Root.getOperand(2);
+    MachineOperand &OpC = Root.getOperand(0);
+    MachineOperand &OpX = Prev.getOperand(1);
+    MachineOperand &OpY = Root.getOperand(1);
+
+    reassociateOps(Root, OpA, OpB, OpC, OpX, OpY, InsInstrs, DelInstrs);
+    break;
+  }
+  } // end switch (Pattern)
+
+  return;
+}
+
 namespace {
   /// Create Global Base Reg pass. This initializes the PIC
   /// global base register for x86-32.
Index: lib/Target/X86/X86TargetMachine.cpp
===================================================================
--- lib/Target/X86/X86TargetMachine.cpp
+++ lib/Target/X86/X86TargetMachine.cpp
@@ -24,6 +24,10 @@
 #include "llvm/Target/TargetOptions.h"
 using namespace llvm;
 
+static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
+               cl::desc("Enable the machine combiner pass"),
+               cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
   RegisterTargetMachine<X86TargetMachine> X(TheX86_32Target);
@@ -224,6 +228,8 @@
 bool X86PassConfig::addILPOpts() {
   addPass(&EarlyIfConverterID);
+  if (EnableMachineCombinerPass)
+    addPass(&MachineCombinerID);
   return true;
 }
 
Index: test/CodeGen/X86/fp-fast.ll
===================================================================
--- test/CodeGen/X86/fp-fast.ll
+++ test/CodeGen/X86/fp-fast.ll
@@ -114,3 +114,68 @@
   ret float %t2
 }
 
+; Verify that the first two adds are independent regardless of how the inputs are
+; commuted. The destination registers are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+; Verify that we reassociate some of these ops. The optimal balanced tree of adds is not
+; produced because that would cost more compile time.
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm2, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm5, %xmm4, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm7, %xmm6, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  %t3 = fadd float %t2, %x4
+  %t4 = fadd float %t3, %x5
+  %t5 = fadd float %t4, %x6
+  %t6 = fadd float %t5, %x7
+  ret float %t6
+}
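For reference, the rewrite performed by hasPattern()/reassociateOps() corresponds to the source-level reassociation sketched below. This is an illustrative sketch in plain C++, not part of the patch; the function names are made up, and it assumes fast-math semantics so that floating-point addition may be treated as associative.

    // Before: the adds form a serial dependence chain. The second add cannot
    // start until the first finishes, and the first cannot start until 'a'
    // (produced by some earlier computation) is available.
    float before(float a, float x, float y) {
      float b = a + x;   // B = A op X   (Prev)
      float c = b + y;   // C = B op Y   (Root)
      return c;
    }

    // After: 'x + y' does not depend on 'a', so it can execute in parallel
    // with whatever produces 'a'; only one add remains on the critical path
    // once 'a' is ready. This is the shape the reassociate_adds* tests above
    // check for in the generated vaddss sequences.
    float after(float a, float x, float y) {
      float b = x + y;   // B = X op Y
      float c = a + b;   // C = A op B
      return c;
    }

With the patch applied, the transformation runs only under -enable-unsafe-fp-math and can be toggled via the new hidden -x86-machine-combiner llc option (on by default).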