Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -527,6 +527,31 @@
     return false;
   }
 
+  /// This pair is an opcode and value type.
+  typedef std::pair<unsigned, MVT> AssocType;
+
+  /// This map type may be used by a target to count reassociations.
+  typedef std::map<AssocType, unsigned> AssocMap;
+
+  /// Return true if a reassociation optimization that exposes more
+  /// instruction-level-parallelism should be attempted for the specified
+  /// opcode and type. Targets may override this function based on the
+  /// amount of software-visible parallelism that is possible balanced against
+  /// the number of registers that are needed to support that number of
+  /// independent instructions.
+  /// \param Type The opcode and MVT of the instruction to reassociate.
+  /// \param Map A map for keeping track of reassociated operations.
+  virtual bool shouldReassociate(AssocType Type, AssocMap &Map) const {
+    return false;
+  }
+
+  /// This hook is called if a reassociation of the specified type was
+  /// completed. Targets may override this function to keep track of
+  /// the number of times a reassociation of this type has occurred.
+  /// \param Type The opcode and MVT of the instruction that was reassociated.
+  /// \param Map A map for keeping track of reassociated operations.
+  virtual void didReassociate(AssocType Type, AssocMap &Map) const {}
+
   /// Return how this operation should be treated: either it is legal, needs to
   /// be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -406,6 +406,11 @@
     /// requirements are checked by the function (e.g. that trunc is
     /// single-use) and if missed an empty SDValue is returned.
     SDValue distributeTruncateThroughAnd(SDNode *N);
+
+    /// Given a dependent sequence of associative math/logic operations,
+    /// reassociate the operands to increase the instruction-level-parallelism.
+    SDValue reassociateForILP(SDNode *);
+    TargetLoweringBase::AssocMap ReassociateMap;
 
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
@@ -7786,6 +7791,58 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::reassociateForILP(SDNode *N) {
+  assert(N->getNumOperands() == 2 && "Invalid node for binop reassociation");
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  unsigned Op = N->getOpcode();
+
+  // Only try this after type legalization. Reassociating earlier could have
+  // unknown consequences on register usage. Also, a new combiner is created
+  // post-type-legalization, so the map that is tracking this transform does
+  // not live long enough to support earlier passes.
+  if (!LegalTypes)
+    return SDValue();
+
+  TargetLoweringBase::AssocType Type = std::make_pair(Op, VT.getSimpleVT());
+
+  // Tell the target how many times we have already completed a reassociation
+  // of this type and ask the target if it's ok to try another.
+  if (!TLI.shouldReassociate(Type, ReassociateMap))
+    return SDValue();
+
+  // Swap chains of this operation to LHS to avoid duplicating predicate logic
+  // for each commutation of the pattern.
+  if (SelectionDAG::isCommutativeBinOp(Op) && N1.getOpcode() == Op &&
+      N1.hasOneUse() && N0.getOpcode() != Op)
+    std::swap(N1, N0);
+
+  // Convert a chain of 3 dependent operations into 2 independent operations
+  // and 1 dependent operation.
+  if (N0.getOpcode() == Op && N0.hasOneUse() && N1.getOpcode() != Op) {
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+
+    // Swap second set of operands if needed.
+    if (SelectionDAG::isCommutativeBinOp(Op) && N01.getOpcode() == Op &&
+        N01.hasOneUse() && N00.getOpcode() != Op)
+      std::swap(N01, N00);
+
+    // (op N0: (op N00: (op z, w), N01: y), N1: x) ->
+    // (op N00: (op z, w), (op N1: x, N01: y))
+    // TODO: The innermost op - (op z, w) - does not need to match the others.
+    if (N00.getOpcode() == Op) {
+      // Tell the target that a reassociation of this type was completed.
+      TLI.didReassociate(Type, ReassociateMap);
+      SDValue NewOp = DAG.getNode(Op, DL, VT, N1, N01);
+      return DAG.getNode(Op, DL, VT, N00, NewOp);
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -7920,6 +7977,10 @@
                            N0.getOperand(0), DAG.getConstantFP(4.0, DL, VT));
       }
     }
+
+    if (SDValue Reassociated = reassociateForILP(N))
+      return Reassociated;
+
   } // enable-unsafe-fp-math
 
   // FADD -> FMA combines:
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1084,6 +1084,10 @@
     /// Reassociate floating point divisions into multiply by reciprocal.
     bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+
+    /// Increase instruction-level-parallelism for associative operations.
+    bool shouldReassociate(AssocType Type, AssocMap &Map) const override;
+    void didReassociate(AssocType Type, AssocMap &Map) const override;
   };
 
 namespace X86 {
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -12929,6 +12929,42 @@
   return NumUsers > 1;
 }
 
+bool X86TargetLowering::shouldReassociate(AssocType Type, AssocMap &Map) const {
+  // These are the opcodes that we allow to be reassociated and the number
+  // of times each opcode may be reassociated. The limit is a conservative
+  // heuristic based on the number of architected general-purpose or FP/vector
+  // registers. We must avoid spilling for the optimization to have a benefit.
+  static const std::map<unsigned, unsigned> ReassociationLimits = {
+    { ISD::FADD, 1 }
+  };
+
+  // TODO: Add the rest of the associative opcodes.
+  // TODO: Programmatically base the limit on number of registers.
+  // TODO: Limits should be based on MVT or register class. For example,
+  // a *vector* integer add does not use the same registers as a *scalar*
+  // integer add.
+
+  // If the input opcode type is not a reassociation candidate, we're done.
+  const auto &LimitIter = ReassociationLimits.find(Type.first);
+  if (LimitIter == ReassociationLimits.end())
+    return false;
+
+  // If we hit the reassociation limit for this type of instruction, we're done.
+  unsigned ReassociationMax = LimitIter->second;
+  if (Map[Type] >= ReassociationMax)
+    return false;
+
+  return true;
+}
+
+void X86TargetLowering::didReassociate(AssocType Type, AssocMap &Map) const {
+  // TODO: The map should be updated based on register type rather than
+  // instruction type. For example, any floating point reassociation
+  // should affect all other floating point operations because they all
+  // require registers from the same register class.
+  Map[Type]++;
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();
Index: test/CodeGen/X86/fp-fast.ll
===================================================================
--- test/CodeGen/X86/fp-fast.ll
+++ test/CodeGen/X86/fp-fast.ll
@@ -114,3 +114,70 @@
   ret float %t2
 }
 
+; Verify that the first two adds are independent; the destination registers
+; are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+; FIXME: We should be able to do more reassociations for this test case, but the limit is set
+; conservatively low to avoid spilling disasters.
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm5, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm6, %xmm7, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  %t3 = fadd float %t2, %x4
+  %t4 = fadd float %t3, %x5
+  %t5 = fadd float %t4, %x6
+  %t6 = fadd float %t5, %x7
+  ret float %t6
+}
+