Index: include/llvm/Transforms/Scalar/Reassociate.h
===================================================================
--- include/llvm/Transforms/Scalar/Reassociate.h
+++ include/llvm/Transforms/Scalar/Reassociate.h
@@ -72,6 +72,14 @@
   DenseMap<BasicBlock *, unsigned> RankMap;
   DenseMap<AssertingVH<Value>, unsigned> ValueRankMap;
   SetVector<AssertingVH<Instruction>> RedoInsts;
+
+  unsigned ReassociateStep;
+  // Arbitrary, but prevents quadratic behavior.
+  static const unsigned GlobalReassociateLimit = 10;
+  static const unsigned NumBinaryOps =
+      Instruction::BinaryOpsEnd - Instruction::BinaryOpsBegin;
+  DenseMap<std::pair<Value *, Value *>, unsigned> PairMap[NumBinaryOps];
+
   bool MadeChange;
 
 public:
@@ -105,6 +113,7 @@
                                  SetVector<AssertingVH<Instruction>> &Insts);
   void OptimizeInst(Instruction *I);
   Instruction *canonicalizeNegConstExpr(Instruction *I);
+  void BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT);
 };
 
 } // end namespace llvm
Index: lib/Transforms/Scalar/Reassociate.cpp
===================================================================
--- lib/Transforms/Scalar/Reassociate.cpp
+++ lib/Transforms/Scalar/Reassociate.cpp
@@ -27,6 +27,7 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/GlobalsModRef.h"
@@ -2187,11 +2188,99 @@
     return;
   }
 
+  if (ReassociateStep == 1 && Ops.size() > 2 &&
+      Ops.size() <= GlobalReassociateLimit) {
+    unsigned Max = 1;
+    unsigned BestRank = 0;
+    std::pair<unsigned, unsigned> BestPair;
+    unsigned Idx = I->getOpcode() - Instruction::BinaryOpsBegin;
+    for (unsigned i = 0; i < Ops.size() - 1; ++i)
+      for (unsigned j = i + 1; j < Ops.size(); ++j) {
+        unsigned Score = 0;
+        Value *Op0 = Ops[i].Op;
+        Value *Op1 = Ops[j].Op;
+        if (std::less<Value *>()(Op1, Op0))
+          std::swap(Op0, Op1);
+        auto it = PairMap[Idx].find({Op0, Op1});
+        if (it != PairMap[Idx].end())
+          Score += it->second;
+
+        unsigned MaxRank = std::max(Ops[i].Rank, Ops[j].Rank);
+        if (Score > Max || (Score == Max && MaxRank < BestRank)) {
+          BestPair = {i, j};
+          Max = Score;
+          BestRank = MaxRank;
+        }
+      }
+    if (Max > 1) {
+      auto Op0 = Ops[BestPair.first];
+      auto Op1 = Ops[BestPair.second];
+      Ops.erase(&Ops[BestPair.second]);
+      Ops.erase(&Ops[BestPair.first]);
+      Ops.push_back(Op0);
+      Ops.push_back(Op1);
+    }
+  }
   // Now that we ordered and optimized the expressions, splat them back into
   // the expression tree, removing any unneeded nodes.
   RewriteExprTree(I, Ops);
 }
 
+void
+ReassociatePass::BuildPairMap(ReversePostOrderTraversal<Function *> &RPOT) {
+  // Make a "pairmap" of how often each operand pair occurs.
+  for (BasicBlock *BI : RPOT) {
+    for (Instruction &I : *BI) {
+      if (!I.isAssociative())
+        continue;
+
+      // Ignore nodes that aren't at the root of trees.
+      if (I.hasOneUse() && I.user_back()->getOpcode() == I.getOpcode())
+        continue;
+
+      // Collect all operands in a single reassociable expression.
+      // Since Reassociate has already been run once, we can assume things
+      // are already canonical according to Reassociation's regime.
+      SmallVector<Value *, 8> Worklist = { I.getOperand(0), I.getOperand(1) };
+      SmallVector<Value *, 8> Ops;
+      while (!Worklist.empty() && Ops.size() <= GlobalReassociateLimit) {
+        Value *Op = Worklist.pop_back_val();
+        Instruction *OpI = dyn_cast<Instruction>(Op);
+        if (!OpI || OpI->getOpcode() != I.getOpcode() || !OpI->hasOneUse()) {
+          Ops.push_back(Op);
+          continue;
+        }
+        // be paranoid about self-referencing expressions in unreachable code
+        if (OpI->getOperand(0) != OpI)
+          Worklist.push_back(OpI->getOperand(0));
+        if (OpI->getOperand(1) != OpI)
+          Worklist.push_back(OpI->getOperand(1));
+      }
+      // Skip extremely long expressions.
+      if (Ops.size() > GlobalReassociateLimit)
+        continue;
+
+      // Add all pairwise combinations of operands to the pair map.
+      unsigned BinaryIdx = I.getOpcode() - Instruction::BinaryOpsBegin;
+      SmallSet<std::pair<Value *, Value*>, 32> Visited;
+      for (unsigned i = 0; i < Ops.size() - 1; ++i) {
+        for (unsigned j = i + 1; j < Ops.size(); ++j) {
+          // Canonicalize operand orderings.
+          Value *Op0 = Ops[i];
+          Value *Op1 = Ops[j];
+          if (std::less<Value *>()(Op1, Op0))
+            std::swap(Op0, Op1);
+          if (!Visited.insert({Op0, Op1}).second)
+            continue;
+          auto res = PairMap[BinaryIdx].insert({{Op0, Op1}, 1});
+          if (!res.second)
+            ++res.first->second;
+        }
+      }
+    }
+  }
+}
+
 PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
   // Get the functions basic blocks in Reverse Post Order. This order is used by
   // BuildRankMap to pre calculate ranks correctly. It also excludes dead basic
@@ -2204,43 +2293,49 @@
 
   MadeChange = false;
   // Traverse the same blocks that was analysed by BuildRankMap.
-  for (BasicBlock *BI : RPOT) {
-    assert(RankMap.count(&*BI) && "BB should be ranked.");
-    // Optimize every instruction in the basic block.
-    for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
-      if (isInstructionTriviallyDead(&*II)) {
-        EraseInst(&*II++);
-      } else {
-        OptimizeInst(&*II);
-        assert(II->getParent() == &*BI && "Moved to a different block!");
-        ++II;
+  for (ReassociateStep = 0; ReassociateStep < 2; ++ReassociateStep) {
+    for (BasicBlock *BI : RPOT) {
+      assert(RankMap.count(&*BI) && "BB should be ranked.");
+      // Optimize every instruction in the basic block.
+      for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;)
+        if (isInstructionTriviallyDead(&*II)) {
+          EraseInst(&*II++);
+        } else {
+          OptimizeInst(&*II);
+          assert(II->getParent() == &*BI && "Moved to a different block!");
+          ++II;
+        }
+
+      // Make a copy of all the instructions to be redone so we can remove dead
+      // instructions.
+      SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
+      // Iterate over all instructions to be reevaluated and remove trivially dead
+      // instructions. If any operand of the trivially dead instruction becomes
+      // dead mark it for deletion as well. Continue this process until all
+      // trivially dead instructions have been removed.
+      while (!ToRedo.empty()) {
+        Instruction *I = ToRedo.pop_back_val();
+        if (isInstructionTriviallyDead(I)) {
+          RecursivelyEraseDeadInsts(I, ToRedo);
+          MadeChange = true;
+        }
       }
 
-    // Make a copy of all the instructions to be redone so we can remove dead
-    // instructions.
-    SetVector<AssertingVH<Instruction>> ToRedo(RedoInsts);
-    // Iterate over all instructions to be reevaluated and remove trivially dead
-    // instructions. If any operand of the trivially dead instruction becomes
-    // dead mark it for deletion as well. Continue this process until all
-    // trivially dead instructions have been removed.
-    while (!ToRedo.empty()) {
-      Instruction *I = ToRedo.pop_back_val();
-      if (isInstructionTriviallyDead(I)) {
-        RecursivelyEraseDeadInsts(I, ToRedo);
-        MadeChange = true;
+      // Now that we have removed dead instructions, we can reoptimize the
+      // remaining instructions.
+      while (!RedoInsts.empty()) {
+        Instruction *I = RedoInsts.pop_back_val();
+        if (isInstructionTriviallyDead(I))
+          EraseInst(I);
+        else
+          OptimizeInst(I);
       }
     }
-
-    // Now that we have removed dead instructions, we can reoptimize the
-    // remaining instructions.
-    while (!RedoInsts.empty()) {
-      Instruction *I = RedoInsts.pop_back_val();
-      if (isInstructionTriviallyDead(I))
-        EraseInst(I);
-      else
-        OptimizeInst(I);
-    }
+    if (ReassociateStep == 0)
+      BuildPairMap(RPOT);
   }
+  for (auto &Entry : PairMap)
+    Entry.clear();
 
   // We are done with the rank map.
   RankMap.clear();
Index: test/Transforms/Reassociate/basictest.ll
===================================================================
--- test/Transforms/Reassociate/basictest.ll
+++ test/Transforms/Reassociate/basictest.ll
@@ -242,3 +242,18 @@
 if.end:                                           ; preds = %entry
   ret i64 0
 }
+
+; CHECK-LABEL: @test17
+; CHECK: %[[A:.*]] = mul i32 %X4, %X3
+; CHECK-NEXT:  %[[C:.*]] = mul i32 %[[A]], %X1
+; CHECK-NEXT: %[[D:.*]] = mul i32 %[[A]], %X2
+; CHECK-NEXT: %[[E:.*]] = xor i32 %[[C]], %[[D]]
+; CHECK-NEXT: ret i32 %[[E]]
+define i32 @test17(i32 %X1, i32 %X2, i32 %X3, i32 %X4) {
+  %A = mul i32 %X3, %X1
+  %B = mul i32 %X3, %X2
+  %C = mul i32 %A, %X4
+  %D = mul i32 %B, %X4
+  %E = xor i32 %C, %D
+  ret i32 %E
+}
Index: test/Transforms/Reassociate/canonicalize-neg-const.ll
===================================================================
--- test/Transforms/Reassociate/canonicalize-neg-const.ll
+++ test/Transforms/Reassociate/canonicalize-neg-const.ll
@@ -169,10 +169,11 @@
 
 define double @pr34078(double %A) {
 ; CHECK-LABEL: @pr34078(
-; CHECK-NEXT:    [[SUB:%.*]] = fsub fast double 1.000000e+00, %A
+; CHECK-NEXT:    [[A_NEG:%.*]] = fsub fast double -0.000000e+00, %A
+; CHECK-NEXT:    [[SUB:%.*]] = fadd fast double [[A_NEG]], 1.000000e+00
 ; CHECK-NEXT:    [[POW2:%.*]] = fmul double %A, %A
 ; CHECK-NEXT:    [[MUL5_NEG:%.*]] = fmul fast double [[POW2]], -5.000000e-01
-; CHECK-NEXT:    [[SUB1:%.*]] = fadd fast double [[MUL5_NEG]], [[SUB]]
+; CHECK-NEXT:    [[SUB1:%.*]] = fadd fast double [[SUB]], [[MUL5_NEG]]
 ; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast double [[SUB1]], 2.000000e+00
 ; CHECK-NEXT:    ret double [[FACTOR]]
 ;
Index: test/Transforms/Reassociate/fast-ReassociateVector.ll
===================================================================
--- test/Transforms/Reassociate/fast-ReassociateVector.ll
+++ test/Transforms/Reassociate/fast-ReassociateVector.ll
@@ -221,9 +221,8 @@
 
 define <2 x double> @test9(<2 x double> %b, <2 x double> %a) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x double> zeroinitializer, [[A:%.*]]
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x double> [[B:%.*]], <double 1.234000e+03, double 1.234000e+03>
-; CHECK-NEXT:    ret <2 x double> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd fast <2 x double> [[B:%.*]], <double 1.234000e+03, double 1.234000e+03>
+; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %1 = fadd fast <2 x double> %a, <double 1.234000e+03, double 1.234000e+03>
   %2 = fadd fast <2 x double> %b, %1
@@ -253,7 +252,6 @@
 
 define <2 x float> @test10(<2 x float> %a, <2 x float> %b, <2 x float> %z) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[TMP1:%.*]] = fsub fast <2 x float> zeroinitializer, zeroinitializer
 ; CHECK-NEXT:    [[E:%.*]] = fmul fast <2 x float> [[A:%.*]], <float 4.000000e+01, float 4.000000e+01>
 ; CHECK-NEXT:    [[F:%.*]] = fmul fast <2 x float> [[E]], [[Z:%.*]]
 ; CHECK-NEXT:    ret <2 x float> [[F]]
Index: test/Transforms/Reassociate/fast-basictest.ll
===================================================================
--- test/Transforms/Reassociate/fast-basictest.ll
+++ test/Transforms/Reassociate/fast-basictest.ll
@@ -176,7 +176,7 @@
 
 define float @test8(float %X, float %Y, float %Z) {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    [[A:%.*]] = fmul fast float %Y, %X
+; CHECK-NEXT:    [[A:%.*]] = fmul fast float %X, %Y
 ; CHECK-NEXT:    [[C:%.*]] = fsub fast float %Z, [[A]]
 ; CHECK-NEXT:    ret float [[C]]
 ;
@@ -267,8 +267,8 @@
 
 define float @test12(float %X) {
 ; CHECK-LABEL: @test12(
-; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float %X, -3.000000e+00
-; CHECK-NEXT:    [[Z:%.*]] = fadd fast float [[FACTOR]], 6.000000e+00
+; CHECK-NEXT:    [[FACTOR:%.*]] = fmul fast float %X, 3.000000e+00
+; CHECK-NEXT:    [[Z:%.*]] = fsub fast float 6.000000e+00, [[FACTOR]]
 ; CHECK-NEXT:    ret float [[Z]]
 ;
   %A = fsub fast float 1.000000e+00, %X
Index: test/Transforms/Reassociate/mulfactor.ll
===================================================================
--- test/Transforms/Reassociate/mulfactor.ll
+++ test/Transforms/Reassociate/mulfactor.ll
@@ -86,8 +86,8 @@
 ; (x^5) * (y^3) * z
 define i32 @test6(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %x, %x
-; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], %y
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %y, %x
+; CHECK-NEXT:    [[TMP2:%.*]] = mul i32 [[TMP1]], %x
 ; CHECK-NEXT:    [[F:%.*]] = mul i32 %y, %x
 ; CHECK-NEXT:    [[G:%.*]] = mul i32 [[F]], [[TMP2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = mul i32 [[G]], [[TMP2]]
Index: test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll
===================================================================
--- test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll
+++ test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll
@@ -7,7 +7,6 @@
 ; CHECK-NEXT:    [[T2_NEG:%.*]] = fmul fast half %a, 0xH4500
 ; CHECK-NEXT:    [[REASS_MUL:%.*]] = fmul fast half %b, 0xH4500
 ; CHECK-NEXT:    [[T51:%.*]] = fsub fast half [[REASS_MUL]], [[T2_NEG]]
-; CHECK-NEXT:    [[T5:%.*]] = fadd fast half [[REASS_MUL]], [[T2_NEG]]
 ; CHECK-NEXT:    ret half [[T51]]
 ;
   %t1 = fmul fast half %b, 0xH4200 ; 3*b