diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6441,7 +6441,46 @@
       if (NeedToShuffleReuses) {
         CommonCost -= (EntryVF - VL.size()) * ScalarEltCost;
       }
-      InstructionCost ScalarCost = VecTy->getNumElements() * ScalarEltCost;
+
+      // Try to estimate the number of operations in VL which could be combined
+      // with their user to a single fmuladd/fmulsub instruction, if those are
+      // cheap on the target.
+      unsigned NumScalarFusable = 0;
+      for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+        const Instruction *I = cast<Instruction>(VL[i]);
+        if (!I->hasOneUse() || ShuffleOrOp != Instruction::FMul ||
+            !I->hasAllowReassoc())
+          continue;
+
+        const Instruction *UserI = cast<Instruction>(*I->user_begin());
+        bool IsCandOpcode = UserI->getOpcode() == Instruction::FSub ||
+                            UserI->getOpcode() == Instruction::FAdd;
+        if (IsCandOpcode && UserI->getOperand(0) == I &&
+            UserI->hasAllowReassoc()) {
+          // TODO: How can we estimate the cost of fmulsub, for which we don't
+          // have an intrinsic?
+          FastMathFlags FMF = I->getFastMathFlags();
+          FMF &= UserI->getFastMathFlags();
+          IntrinsicCostAttributes ICA(
+              Intrinsic::fmuladd, VL[i]->getType(),
+              {VL[i]->getType(), VL[i]->getType(), VL[i]->getType()}, FMF);
+          NumScalarFusable +=
+              TTI->getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput) <=
+              ScalarEltCost;
+        }
+      }
+
+      // All operations in VL can be fused to fmuladd/fmulsub with their single
+      // users, so the whole bundle can be considered free both when scalarized
+      // and when vectorized. If the common cost is low, slightly biasing
+      // towards using the vector version should be profitable in most cases.
+      if (NumScalarFusable == VecTy->getNumElements())
+        return CommonCost - 1;
+
+      // If some of the scalar operations can be fused with their users to
+      // fmuladd/fmulsub, consider them as free.
+      InstructionCost ScalarCost =
+          (VecTy->getNumElements() - NumScalarFusable) * ScalarEltCost;
       for (unsigned I = 0, Num = VL0->getNumOperands(); I < Num; ++I) {
         if (all_of(VL, [I](Value *V) {
               return isConstant(cast<Instruction>(V)->getOperand(I));
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/slp-fma-loss.ll
@@ -7,21 +7,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_fast_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul fast float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul fast <2 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub fast <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd fast <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP11]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul fast float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul fast float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul fast float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
@@ -47,21 +46,20 @@
 ; CHECK-LABEL: @slp_not_profitable_with_reassoc_fmf(
 ; CHECK-NEXT:    [[GEP_B_1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 1
 ; CHECK-NEXT:    [[A_0:%.*]] = load float, ptr [[A:%.*]], align 4
+; CHECK-NEXT:    [[B_1:%.*]] = load float, ptr [[GEP_B_1]], align 4
+; CHECK-NEXT:    [[MUL_0:%.*]] = fmul reassoc float [[B_1]], [[A_0]]
 ; CHECK-NEXT:    [[B_0:%.*]] = load float, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[GEP_B_1]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> poison, float [[B_0]], i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[B_0]], i32 1
-; CHECK-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[TMP3]], [[TMP1]]
-; CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <2 x float> poison, float [[A_0]], i32 0
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[A_0]], i32 1
-; CHECK-NEXT:    [[TMP7:%.*]] = fmul reassoc <2 x float> [[TMP1]], [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = fsub reassoc <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP9:%.*]] = fadd reassoc <2 x float> [[TMP7]], [[SHUFFLE]]
-; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <2 x float> [[TMP8]], <2 x float> [[TMP9]], <2 x i32> <i32 0, i32 3>
-; CHECK-NEXT:    store <2 x float> [[TMP10]], ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
-; CHECK-NEXT:    store float [[TMP11]], ptr [[B]], align 4
+; CHECK-NEXT:    [[GEP_B_2:%.*]] = getelementptr inbounds float, ptr [[B]], i64 2
+; CHECK-NEXT:    [[B_2:%.*]] = load float, ptr [[GEP_B_2]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = fmul float [[B_2]], [[B_0]]
+; CHECK-NEXT:    [[SUB:%.*]] = fsub reassoc float [[MUL_0]], [[MUL_1]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = fmul float [[B_0]], [[B_1]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = fmul reassoc float [[B_2]], [[A_0]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd reassoc float [[MUL_3]], [[MUL_2]]
+; CHECK-NEXT:    store float [[SUB]], ptr [[A]], align 4
+; CHECK-NEXT:    [[GEP_A_1:%.*]] = getelementptr inbounds float, ptr [[A]], i64 1
+; CHECK-NEXT:    store float [[ADD]], ptr [[GEP_A_1]], align 4
+; CHECK-NEXT:    store float [[B_2]], ptr [[B]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %gep.B.1 = getelementptr inbounds float, ptr %B, i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll
@@ -5,18 +5,20 @@
 ; adds may look profitable, but is not because it eliminates generation of
 ; floating-point FMAs that would be more profitable.
 
-; FIXME: We generate a horizontal reduction today.
-
 define void @hr() {
 ; CHECK-LABEL: @hr(
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[OP_RDX:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[PHI0:%.*]] = phi double [ 0.000000e+00, [[TMP0:%.*]] ], [ [[ADD3:%.*]], [[LOOP]] ]
 ; CHECK-NEXT:    [[CVT0:%.*]] = uitofp i16 0 to double
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x double> <double poison, double 0.000000e+00, double 0.000000e+00, double 0.000000e+00>, double [[CVT0]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = fmul fast <4 x double> zeroinitializer, [[TMP1]]
-; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fadd.v4f64(double -0.000000e+00, <4 x double> [[TMP2]])
-; CHECK-NEXT:    [[OP_RDX]] = fadd fast double [[TMP3]], [[PHI0]]
+; CHECK-NEXT:    [[MUL0:%.*]] = fmul fast double 0.000000e+00, [[CVT0]]
+; CHECK-NEXT:    [[ADD0:%.*]] = fadd fast double [[MUL0]], [[PHI0]]
+; CHECK-NEXT:    [[MUL1:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast double [[MUL1]], [[ADD0]]
+; CHECK-NEXT:    [[MUL2:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast double [[MUL2]], [[ADD1]]
+; CHECK-NEXT:    [[MUL3:%.*]] = fmul fast double 0.000000e+00, 0.000000e+00
+; CHECK-NEXT:    [[ADD3]] = fadd fast double [[MUL3]], [[ADD2]]
 ; CHECK-NEXT:    br i1 true, label [[EXIT:%.*]], label [[LOOP]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
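
Note on the heuristic (commentary, not part of the patch): the scalar shape the new cost check looks for is a single-use fmul with the reassoc flag whose only user is an fadd or fsub that consumes it as its first operand. On targets where llvm.fmuladd is no more expensive than a plain fmul, such a pair is expected to become one fused multiply-add, so the scalar lane is treated as free. A minimal IR sketch of the matched shape (illustrative names, assuming a target with cheap fmuladd):

  define float @fusable_pair(float %a, float %b, float %c) {
    ; Single-use fmul feeding an fadd as operand 0; this lane would be
    ; counted in NumScalarFusable by the new cost-model check.
    %mul = fmul reassoc float %a, %b
    %add = fadd reassoc float %mul, %c
    ret float %add
  }

When every lane of a bundle is fusable this way, the change returns CommonCost - 1 to bias slightly towards the vector form; otherwise only the fusable lanes are deducted from ScalarCost, which is what keeps the tests above scalar and preserves their FMA opportunities.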