Index: lib/Transforms/Scalar/Reassociate.cpp
===================================================================
--- lib/Transforms/Scalar/Reassociate.cpp
+++ lib/Transforms/Scalar/Reassociate.cpp
@@ -893,7 +893,10 @@
 /// that computes the negative version of the value specified. The negative
 /// version of the value is returned, and BI is left pointing at the instruction
 /// that should be processed next by the reassociation pass.
-static Value *NegateValue(Value *V, Instruction *BI) {
+/// Also adds the intermediate instructions it creates to the ToRedo list so
+/// they can be revisited at the end for further reassociation opportunities.
+static Value *NegateValue(Value *V, Instruction *BI,
+                          SetVector<AssertingVH<Instruction>> &ToRedo) {
   if (Constant *C = dyn_cast<Constant>(V)) {
     if (C->getType()->isFPOrFPVectorTy()) {
       return ConstantExpr::getFNeg(C);
@@ -914,8 +917,8 @@
   if (BinaryOperator *I =
           isReassociableOp(V, Instruction::Add, Instruction::FAdd)) {
     // Push the negates through the add.
-    I->setOperand(0, NegateValue(I->getOperand(0), BI));
-    I->setOperand(1, NegateValue(I->getOperand(1), BI));
+    I->setOperand(0, NegateValue(I->getOperand(0), BI, ToRedo));
+    I->setOperand(1, NegateValue(I->getOperand(1), BI, ToRedo));
     if (I->getOpcode() == Instruction::Add) {
       I->setHasNoUnsignedWrap(false);
       I->setHasNoSignedWrap(false);
@@ -928,6 +931,10 @@
     //
     I->moveBefore(BI);
     I->setName(I->getName()+".neg");
+
+    // Add the intermediate negates to the redo list as processing them later
+    // could open up more reassociation opportunities.
+    ToRedo.insert(I);
     return I;
   }
 
@@ -968,12 +975,15 @@
     } else {
       TheNeg->andIRFlags(BI);
     }
+    ToRedo.insert(TheNeg);
     return TheNeg;
   }
 
   // Insert a 'neg' instruction that subtracts the value from zero to get the
   // negation.
-  return CreateNeg(V, V->getName() + ".neg", BI, BI);
+  BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+  ToRedo.insert(NewNeg);
+  return NewNeg;
 }
 
 /// Return true if we should break up this subtract of X-Y into (X + -Y).
@@ -1007,14 +1017,15 @@
 
 /// If we have (X-Y), and if either X is an add, or if this is only used by an
 /// add, transform this into (X+(0-Y)) to promote better reassociation.
-static BinaryOperator *BreakUpSubtract(Instruction *Sub) {
+static BinaryOperator *
+BreakUpSubtract(Instruction *Sub, SetVector<AssertingVH<Instruction>> &ToRedo) {
   // Convert a subtract into an add and a neg instruction. This allows sub
   // instructions to be commuted with other add instructions.
   //
   // Calculate the negative value of Operand 1 of the sub instruction,
   // and set it as the RHS of the add instruction we just made.
   //
-  Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
+  Value *NegVal = NegateValue(Sub->getOperand(1), Sub, ToRedo);
   BinaryOperator *New = CreateAdd(Sub->getOperand(0), NegVal, "", Sub, Sub);
   Sub->setOperand(0, Constant::getNullValue(Sub->getType())); // Drop use of op.
   Sub->setOperand(1, Constant::getNullValue(Sub->getType())); // Drop use of op.
@@ -2081,7 +2092,7 @@
   // see if we can convert it to X+-Y.
   if (I->getOpcode() == Instruction::Sub) {
     if (ShouldBreakUpSubtract(I)) {
-      Instruction *NI = BreakUpSubtract(I);
+      Instruction *NI = BreakUpSubtract(I, RedoInsts);
       RedoInsts.insert(I);
       MadeChange = true;
       I = NI;
@@ -2092,6 +2103,13 @@
           (!I->hasOneUse() ||
            !isReassociableOp(I->user_back(), Instruction::Mul))) {
         Instruction *NI = LowerNegateToMultiply(I);
+        // If the negate got simplified, add its users to the redo list so we
+        // can look for further reassociation opportunities (especially when
+        // this instruction is optimized again while redoing).
+        for (User *U : NI->users()) {
+          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+            RedoInsts.insert(Tmp);
+        }
         RedoInsts.insert(I);
         MadeChange = true;
         I = NI;
@@ -2099,7 +2117,7 @@
     }
   } else if (I->getOpcode() == Instruction::FSub) {
     if (ShouldBreakUpSubtract(I)) {
-      Instruction *NI = BreakUpSubtract(I);
+      Instruction *NI = BreakUpSubtract(I, RedoInsts);
       RedoInsts.insert(I);
       MadeChange = true;
       I = NI;
@@ -2109,7 +2127,14 @@
       if (isReassociableOp(I->getOperand(1), Instruction::FMul) &&
           (!I->hasOneUse() ||
            !isReassociableOp(I->user_back(), Instruction::FMul))) {
         Instruction *NI = LowerNegateToMultiply(I);
+        // If the negate got simplified, add its users to the redo list so we
+        // can look for further reassociation opportunities (especially when
+        // this instruction is optimized again while redoing).
+        for (User *U : NI->users()) {
+          if (BinaryOperator *Tmp = dyn_cast<BinaryOperator>(U))
+            RedoInsts.insert(Tmp);
+        }
         RedoInsts.insert(I);
         MadeChange = true;
         I = NI;
@@ -2124,8 +2149,14 @@
   // If this is an interior node of a reassociable tree, ignore it until we
   // get to the root of the tree, to avoid N^2 analysis.
   unsigned Opcode = BO->getOpcode();
-  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode)
+  if (BO->hasOneUse() && BO->user_back()->getOpcode() == Opcode) {
+    // During the initial run we will get to the root of the tree, but if we
+    // get here while redoing instructions there is no guarantee that the
+    // root will be visited, so redo its user later as well.
+    if (BO->user_back() != BO)
+      RedoInsts.insert(BO->user_back());
     return;
+  }
 
   // If this is an add tree that is used by a sub instruction, ignore it
   // until we process the subtract.
Index: test/Transforms/Reassociate/fast-ReassociateVector.ll
===================================================================
--- test/Transforms/Reassociate/fast-ReassociateVector.ll
+++ test/Transforms/Reassociate/fast-ReassociateVector.ll
@@ -16,9 +16,9 @@
 ; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
 define <2 x float> @test2(<2 x float> %a, <2 x float> %b, <2 x float> %c) {
 ; CHECK-LABEL: @test2
-; CHECK-NEXT: fadd fast <2 x float> %c, %b
-; CHECK-NEXT: fmul fast <2 x float> %a, %tmp2
-; CHECK-NEXT: fmul fast <2 x float> %tmp3, %a
+; CHECK-NEXT: [[TMP1:%tmp.*]] = fadd fast <2 x float> %c, %b
+; CHECK-NEXT: [[TMP2:%tmp.*]] = fmul fast <2 x float> %a, %a
+; CHECK-NEXT: fmul fast <2 x float> [[TMP2]], [[TMP1]]
 ; CHECK-NEXT: ret <2 x float>
 
   %t0 = fmul fast <2 x float> %a, %b
@@ -133,8 +133,8 @@
 ; Check x*y+y*x -> x*y*2.
 define <2 x double> @test11(<2 x double> %x, <2 x double> %y) {
 ; CHECK-LABEL: @test11
-; CHECK-NEXT: %factor = fmul fast <2 x double> %y, <double 2.000000e+00, double 2.000000e+00>
-; CHECK-NEXT: %tmp1 = fmul fast <2 x double> %factor, %x
+; CHECK-NEXT: %factor = fmul fast <2 x double> %x, <double 2.000000e+00, double 2.000000e+00>
+; CHECK-NEXT: %tmp1 = fmul fast <2 x double> %factor, %y
 ; CHECK-NEXT: ret <2 x double> %tmp1
 
   %1 = fmul fast <2 x double> %x, %y
Index: test/Transforms/Reassociate/fast-basictest.ll
===================================================================
--- test/Transforms/Reassociate/fast-basictest.ll
+++ test/Transforms/Reassociate/fast-basictest.ll
@@ -108,7 +108,7 @@
 ; CHECK-LABEL: @test7
 ; CHECK-NEXT: fadd fast float %C, %B
 ; CHECK-NEXT: fmul fast float %A, %A
-; CHECK-NEXT: fmul fast float %1, %tmp2
+; CHECK-NEXT: fmul fast float %tmp3, %tmp2
 ; CHECK-NEXT: ret float
 
   %aa = fmul fast float %A, %A
Index: test/Transforms/Reassociate/fast-fp-commute.ll
===================================================================
--- test/Transforms/Reassociate/fast-fp-commute.ll
+++ test/Transforms/Reassociate/fast-fp-commute.ll
@@ -33,8 +33,8 @@
 
 define float @test3(float %x, float %y) {
 ; CHECK-LABEL: test3
-; CHECK-NEXT: %factor = fmul fast float %y, 2.000000e+00
-; CHECK-NEXT: %tmp1 = fmul fast float %factor, %x
+; CHECK-NEXT: %factor = fmul fast float %x, 2.000000e+00
+; CHECK-NEXT: %tmp1 = fmul fast float %factor, %y
 ; CHECK-NEXT: ret float %tmp1
 
   %1 = fmul fast float %x, %y
Index: test/Transforms/Reassociate/fast-multistep.ll
===================================================================
--- test/Transforms/Reassociate/fast-multistep.ll
+++ test/Transforms/Reassociate/fast-multistep.ll
@@ -3,9 +3,9 @@
 define float @fmultistep1(float %a, float %b, float %c) {
 ; Check that a*a*b+a*a*c is turned into a*(a*(b+c)).
 ; CHECK-LABEL: @fmultistep1
-; CHECK-NEXT: fadd fast float %c, %b
-; CHECK-NEXT: fmul fast float %a, %tmp2
-; CHECK-NEXT: fmul fast float %tmp3, %a
+; CHECK-NEXT: [[TMP1:%tmp.*]] = fadd fast float %c, %b
+; CHECK-NEXT: [[TMP2:%tmp.*]] = fmul fast float %a, %a
+; CHECK-NEXT: fmul fast float [[TMP2]], [[TMP1]]
 ; CHECK-NEXT: ret float
 
   %t0 = fmul fast float %a, %b
Index: test/Transforms/Reassociate/multistep.ll
===================================================================
--- test/Transforms/Reassociate/multistep.ll
+++ test/Transforms/Reassociate/multistep.ll
@@ -8,9 +8,9 @@
   %t2 = mul i64 %a, %c
   %t3 = mul i64 %a, %t2 ; a*(a*c)
   %t4 = add i64 %t1, %t3
-; CHECK-NEXT: add i64 %c, %b
-; CHECK-NEXT: mul i64 %a, %tmp{{.*}}
-; CHECK-NEXT: mul i64 %tmp{{.*}}, %a
+; CHECK-NEXT: [[TMP1:%tmp.*]] = add i64 %c, %b
+; CHECK-NEXT: [[TMP2:%tmp.*]] = mul i64 %a, %a
+; CHECK-NEXT: mul i64 [[TMP2]], [[TMP1]]
 ; CHECK-NEXT: ret
   ret i64 %t4
 }
Index: test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll
===================================================================
--- /dev/null
+++ test/Transforms/Reassociate/reassoc-intermediate-fnegs.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -reassociate -S | FileCheck %s
+; CHECK: faddsubAssoc1
+; CHECK: [[TMP1:%tmp.*]] = fmul fast half %a, 0xH4500
+; CHECK: [[TMP2:%tmp.*]] = fmul fast half %b, 0xH4500
+; CHECK: fsub fast half [[TMP2]], [[TMP1]]
+; CHECK: ret
+; Input is A op (B op C)
+define half @faddsubAssoc1(half %a, half %b) {
+  %tmp1 = fmul fast half %b, 0xH4200 ; 3 * b
+  %tmp2 = fmul fast half %a, 0xH4500 ; 5 * a
+  %tmp3 = fmul fast half %b, 0xH4000 ; 2 * b
+  %tmp4 = fsub fast half %tmp2, %tmp1 ; 5 * a - 3 * b
+  %tmp5 = fsub fast half %tmp3, %tmp4 ; 2 * b - (5 * a - 3 * b)
+  ret half %tmp5 ; = 5 * (b - a)
+}
+
+; CHECK: faddsubAssoc2
+; CHECK: [[TMP1:%tmp.*]] = fmul fast half %a, 0xH4500
+; CHECK: [[TMP2:%tmp.*]] = fmul fast half %b, 0xH3C00
+; CHECK: fadd fast half [[TMP2]], [[TMP1]]
+; CHECK: ret
+; Input is (A op B) op C
+define half @faddsubAssoc2(half %a, half %b) {
+  %tmp1 = fmul fast half %b, 0xH4200 ; 3 * b
+  %tmp2 = fmul fast half %a, 0xH4500 ; 5 * a
+  %tmp3 = fmul fast half %b, 0xH4000 ; 2 * b
+  %tmp4 = fadd fast half %tmp2, %tmp1 ; 5 * a + 3 * b
+  %tmp5 = fsub fast half %tmp4, %tmp3 ; (5 * a + 3 * b) - (2 * b)
+  ret half %tmp5 ; = 5 * a + b
+}
Index: test/Transforms/Reassociate/secondary.ll
===================================================================
--- test/Transforms/Reassociate/secondary.ll
+++ test/Transforms/Reassociate/secondary.ll
@@ -6,7 +6,7 @@
 
 ; CHECK: define
 ; CHECK-NOT: undef
-; CHECK: %factor = mul i32 %tmp3, -2
+; CHECK: %factor = mul i32 %tmp3.neg, 2
 ; CHECK-NOT: undef
 ; CHECK: }
Index: test/Transforms/Reassociate/xor_reassoc.ll
===================================================================
--- test/Transforms/Reassociate/xor_reassoc.ll
+++ test/Transforms/Reassociate/xor_reassoc.ll
@@ -88,8 +88,8 @@
   %xor1 = xor i32 %xor, %and
   ret i32 %xor1
 ; CHECK-LABEL: @xor_special2(
-; CHECK: %xor = xor i32 %y, 123
-; CHECK: %xor1 = xor i32 %xor, %x
+; CHECK: %xor = xor i32 %x, 123
+; CHECK: %xor1 = xor i32 %xor, %y
 ; CHECK: ret i32 %xor1
 }
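
For reviewers, a minimal before/after sketch of the case the redo-list changes target. It simply mirrors @faddsubAssoc1 from the new test above and assumes the pass is invoked as in that test's RUN line; the function name and the value names in the expected output are illustrative, not actual pass output.

; Breaking up the subtracts creates intermediate negate instructions; putting
; them on the redo list lets the pass revisit and fold the whole expression.
define half @example(half %a, half %b) {
  %t1 = fmul fast half %b, 0xH4200   ; 3 * b
  %t2 = fmul fast half %a, 0xH4500   ; 5 * a
  %t3 = fmul fast half %b, 0xH4000   ; 2 * b
  %t4 = fsub fast half %t2, %t1      ; 5 * a - 3 * b
  %t5 = fsub fast half %t3, %t4      ; 2 * b - (5 * a - 3 * b)
  ret half %t5                       ; = 5 * (b - a)
}

; Expected shape after opt -reassociate, per the CHECK lines above:
;   %m1 = fmul fast half %a, 0xH4500 ; 5 * a
;   %m2 = fmul fast half %b, 0xH4500 ; 5 * b
;   %r  = fsub fast half %m2, %m1    ; 5 * (b - a)
;   ret half %r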