Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1256,6 +1256,10 @@
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind) const;

+  /// \returns The cost savings of contracting a floating-point multiply and
+  /// add/subtract into a single FMA, or zero if contraction is not allowed.
+  InstructionCost getFMACostSavings(Type *Ty, FastMathFlags FMF) const;
+
   /// \returns The cost of Call instructions.
   InstructionCost getCallInstrCost(
       Function *F, Type *RetTy, ArrayRef<Type *> Tys,
Index: llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
+++ llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h
@@ -36,6 +36,7 @@
 class InsertElementInst;
 class InsertValueInst;
 class Instruction;
+class InstructionCost;
 class LoopInfo;
 class OptimizationRemarkEmitter;
 class PHINode;
@@ -88,6 +89,9 @@
   /// every time we run into a memory barrier.
   void collectSeedInstructions(BasicBlock *BB);

+  /// Increase \p Cost if vectorizing the scalars in \p VL would lose
+  /// FMA contraction opportunities.
+  void adjustForFMAs(InstructionCost &Cost, ArrayRef<Value *> VL);
+
   /// Try to vectorize a chain that starts at two arithmetic instrs.
   bool tryToVectorizePair(Value *A, Value *B, slpvectorizer::BoUpSLP &R);
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -909,6 +909,33 @@
   return Cost;
 }

+InstructionCost
+TargetTransformInfo::getFMACostSavings(Type *Ty, FastMathFlags FMF) const {
+  // An FMA can only be formed when contraction is allowed.
+  if (!FMF.allowContract())
+    return 0;
+
+  // Savings = Cost(FMul) + Cost(FAdd) - Cost(FMA).
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost MulCost =
+      getArithmeticInstrCost(Instruction::FMul, Ty, CostKind);
+  InstructionCost AddCost =
+      getArithmeticInstrCost(Instruction::FAdd, Ty, CostKind);
+
+  // Model the contracted form as llvm.fmuladd, which takes three operands
+  // of type Ty.
+  SmallVector<Type *, 3> ArgTys = {Ty, Ty, Ty};
+  IntrinsicCostAttributes CostAttrs(Intrinsic::fmuladd, Ty, ArgTys);
+  InstructionCost FMACost = getIntrinsicInstrCost(CostAttrs, CostKind);
+
+  InstructionCost Savings = MulCost + AddCost - FMACost;
+  if (Savings < 0)
+    Savings = 0;
+  return Savings;
+}
+
 InstructionCost
 TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
                                       ArrayRef<Type *> Tys,
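Note: a minimal sketch of how a caller might query the new hook, assuming a
TargetTransformInfo reference and an LLVMContext are already in scope (the
helper name queryFMASavings is made up for this note and is not part of the
patch):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Operator.h" // FastMathFlags
  #include "llvm/IR/Type.h"

  using namespace llvm;

  // Illustrative only: reports the per-element savings the hook computes for
  // a scalar double. Without the contract flag the hook returns 0.
  static InstructionCost queryFMASavings(const TargetTransformInfo &TTI,
                                         LLVMContext &Ctx) {
    FastMathFlags FMF;
    FMF.setAllowContract(true);
    // Expected: Cost(fmul) + Cost(fadd) - Cost(llvm.fmuladd), clamped at 0.
    return TTI.getFMACostSavings(Type::getDoubleTy(Ctx), FMF);
  }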
Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -10076,6 +10076,38 @@
   return tryToVectorizeList(VL, R);
 }

+void SLPVectorizerPass::adjustForFMAs(InstructionCost &Cost,
+                                      ArrayRef<Value *> VL) {
+  FastMathFlags FMF;
+  FMF.set();
+
+  // Only increase the cost if every scalar in VL is a contractible
+  // floating-point multiply that feeds exactly one floating-point
+  // add/subtract; those pairs would otherwise become FMAs.
+  for (Value *U : VL) {
+    auto *FPMO = dyn_cast<FPMathOperator>(U);
+    if (!(FPMO && FPMO->getOpcode() == Instruction::FMul &&
+          FPMO->hasAllowContract() && FPMO->hasOneUse()))
+      return;
+    auto *I = dyn_cast<Instruction>(FPMO->user_back());
+    if (!(I &&
+          (I->getOpcode() == Instruction::FAdd ||
+           I->getOpcode() == Instruction::FSub) &&
+          I->hasAllowContract()))
+      return;
+    FMF &= FPMO->getFastMathFlags();
+  }
+
+  InstructionCost FMASavings = TTI->getFMACostSavings(VL[0]->getType(), FMF);
+  if (FMASavings > 0) {
+    LLVM_DEBUG(dbgs() << "SLP: Adding cost " << FMASavings * VL.size()
+                      << " for vectorization that breaks " << VL.size()
+                      << " FMAs\n");
+    Cost += FMASavings * VL.size();
+  }
+}
+
 bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
                                            bool LimitForRegisterSize) {
   if (VL.size() < 2)
@@ -10173,6 +10205,7 @@
       R.computeMinimumValueSizes();
       InstructionCost Cost = R.getTreeCost();
+      adjustForFMAs(Cost, Ops);
       CandidateFound = true;
       MinCost = std::min(MinCost, Cost);
@@ -11221,6 +11254,21 @@
       VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, FMF,
                                                    CostKind);
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy, CostKind);
+
+      // A horizontal FAdd reduction over contractible multiplies replaces
+      // ReduxWidth scalar FMAs; charge the vector cost for that loss.
+      if (RdxKind == RecurKind::FAdd) {
+        auto *I = dyn_cast<Instruction>(FirstReducedVal);
+        if (I && I->getOpcode() == Instruction::FMul &&
+            I->hasAllowContract()) {
+          InstructionCost FMASavings = TTI->getFMACostSavings(ScalarTy, FMF);
+          if (FMASavings > 0) {
+            LLVM_DEBUG(dbgs() << "SLP: Adding cost " << FMASavings * ReduxWidth
+                              << " for horizontal reduction that breaks "
+                              << ReduxWidth << " FMAs\n");
+            VectorCost += FMASavings * ReduxWidth;
+          }
+        }
+      }
       break;
     }
     case RecurKind::FMax:
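For context, the scalar shape adjustForFMAs tries to preserve can be built
with IRBuilder as below. This is an illustrative sketch only
(emitContractedMulAdd is a hypothetical helper, not part of the patch); it
shows the contract-flagged fmul/fadd pair that the backend can later fuse
into one FMA, and that the tests below expect to survive vectorization:

  #include "llvm/IR/IRBuilder.h"

  using namespace llvm;

  // Illustrative only: emits 'fmul contract' feeding a single 'fadd
  // contract' - exactly the pattern the cost adjustment refuses to break up.
  static Value *emitContractedMulAdd(IRBuilder<> &B, Value *A, Value *X,
                                     Value *Acc) {
    FastMathFlags FMF;
    FMF.setAllowContract(true);
    B.setFastMathFlags(FMF); // applies to the FP ops created below
    Value *Mul = B.CreateFMul(A, X); // single use ...
    return B.CreateFAdd(Mul, Acc);   // ... consumed by one fadd -> FMA
  }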
-
 define void @hr() {
 ; CHECK-LABEL: @hr(
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD3]] = fadd fast double [[MUL3]], [[ADD2]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
@@ -45,18 +47,18 @@
 ; may look profitable; but both are not because this eliminates generation
 ; of floating-point FMAs that would be more profitable.

-; FIXME: We generate a horizontal reduction today, and if that's disabled, we
-; still vectorize some of the multiplies.
-
 define double @hr_or_mul() {
 ; CHECK-LABEL: @hr_or_mul(
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 3 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> poison, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> <double 7.000000e+00, double -4.300000e+01, double 2.200000e-02, double 9.500000e+00>, [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX:%.*]] = fadd fast double [[TMP3]], [[CVT0]]
-; CHECK-NEXT:    ret double [[OP_RDX]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 7.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[CVT0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double -4.300000e+01, [[CVT0]]
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 2.200000e-02, [[CVT0]]
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 9.500000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast double [[MUL3]], [[ADD2]]
+; CHECK-NEXT:    ret double [[ADD3]]
 ;
   %cvt0 = uitofp i16 3 to double
   %mul0 = fmul fast double 7.000000e+00, %cvt0