Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1193,6 +1193,13 @@
       VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
+  /// Calculate the cost of a call to the llvm.fmuladd intrinsic. This is
+  /// modeled as the cost of a normal fmul instruction plus the cost of an fadd
+  /// reduction.
+  InstructionCost getFMulAddReductionCost(
+      VectorType *Ty, Optional<FastMathFlags> FMF,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   /// Calculate the cost of an extended reduction pattern, similar to
   /// getArithmeticReductionCost of an Add reduction with an extension and
   /// optional multiply. This is the cost of as:
@@ -1662,6 +1669,9 @@
   virtual InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost
+  getFMulAddReductionCost(VectorType *Ty, Optional<FastMathFlags> FMF,
+                          TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost getExtendedAddReductionCost(
       bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) = 0;
@@ -2177,6 +2187,11 @@
                                          TTI::TargetCostKind CostKind) override {
     return Impl.getMinMaxReductionCost(Ty, CondTy, IsUnsigned, CostKind);
   }
+  InstructionCost
+  getFMulAddReductionCost(VectorType *Ty, Optional<FastMathFlags> FMF,
+                          TTI::TargetCostKind CostKind) override {
+    return Impl.getFMulAddReductionCost(Ty, FMF, CostKind);
+  }
   InstructionCost getExtendedAddReductionCost(
       bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -637,6 +637,11 @@
     return 1;
   }
 
+  InstructionCost getFMulAddReductionCost(VectorType *, Optional<FastMathFlags>,
+                                          TTI::TargetCostKind) const {
+    return 1;
+  }
+
   InstructionCost
   getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned, Type *ResTy,
                               VectorType *Ty,
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2174,6 +2174,16 @@
            thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  InstructionCost getFMulAddReductionCost(VectorType *Ty,
+                                          Optional<FastMathFlags> FMF,
+                                          TTI::TargetCostKind CostKind) {
+    InstructionCost FAddReductionCost = thisT()->getArithmeticReductionCost(
+        Instruction::FAdd, Ty, FMF, CostKind);
+    InstructionCost FMulCost =
+        thisT()->getArithmeticInstrCost(Instruction::FMul, Ty, CostKind);
+    return FMulCost + FAddReductionCost;
+  }
+
   InstructionCost getExtendedAddReductionCost(bool IsMLA, bool IsUnsigned,
                                               Type *ResTy, VectorType *Ty,
                                               TTI::TargetCostKind CostKind) {
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -917,6 +917,14 @@
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getFMulAddReductionCost(
+    VectorType *Ty, Optional<FastMathFlags> FMF,
+    TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost = TTIImpl->getFMulAddReductionCost(Ty, FMF, CostKind);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getExtendedAddReductionCost(
     bool IsMLA, bool IsUnsigned, Type *ResTy, VectorType *Ty,
     TTI::TargetCostKind CostKind) const {
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1983,6 +1983,7 @@
   case RecurKind::FMax:
   case RecurKind::SelectICmp:
   case RecurKind::SelectFCmp:
+  case RecurKind::FMulAdd:
     return true;
   default:
     return false;
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7255,8 +7255,14 @@
   const RecurrenceDescriptor &RdxDesc =
       Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
 
-  InstructionCost BaseCost = TTI.getArithmeticReductionCost(
-      RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
+  InstructionCost BaseCost;
+  if (RdxDesc.getRecurrenceKind() == RecurKind::FMulAdd)
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    BaseCost = TTI.getFMulAddReductionCost(VectorTy, RdxDesc.getFastMathFlags(),
+                                           CostKind);
+  else
+    BaseCost = TTI.getArithmeticReductionCost(
+        RdxDesc.getOpcode(), VectorTy, RdxDesc.getFastMathFlags(), CostKind);
 
   // If we're using ordered reductions then we can just return the base cost
   // here, since getArithmeticReductionCost calculates the full ordered
@@ -7929,6 +7935,12 @@
     return TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I);
   }
   case Instruction::Call: {
+    // Recognize a call to the llvm.fmuladd intrinsic.
+    if (RecurrenceDescriptor::isFMulAddIntrinsic(I)) {
+      // Detect reduction patterns.
+      if (auto RedCost = getReductionPatternCost(I, VF, VectorTy, CostKind))
+        return *RedCost;
+    }
     bool NeedToScalarize;
     CallInst *CI = cast<CallInst>(I);
     InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize);
Index: llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll
@@ -395,50 +395,50 @@
 ; CHECK-ORDERED-LABEL: @fmuladd_strict
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
-; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[FMUL:%.*]] = fmul <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])
-; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])
-; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])
-; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[RDX1:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[RDX2:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX3:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]
 
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict
 ; CHECK-UNORDERED: vector.body
-; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[FMULADD]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: call float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <4 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <4 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <4 x float> [[FMULADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
 ; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
@@ -475,50 +475,50 @@
 ; CHECK-ORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-ORDERED: vector.body:
 ; CHECK-ORDERED: [[VEC_PHI:%.*]] = phi float [ 0.000000e+00, %vector.ph ], [ [[RDX2:%.*]], %vector.body ]
-; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
-; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[VEC_PHI]], <4 x float> [[FMUL]])
-; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
-; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX]], <4 x float> [[FMUL1]])
-; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
-; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX1]], <4 x float> [[FMUL2]])
-; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <4 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
-; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float [[RDX2]], <4 x float> [[FMUL3]])
+; CHECK-ORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-ORDERED: [[FMUL:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD]], [[WIDE_LOAD4]]
+; CHECK-ORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[VEC_PHI]], <vscale x 8 x float> [[FMUL]])
+; CHECK-ORDERED: [[FMUL1:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD1]], [[WIDE_LOAD5]]
+; CHECK-ORDERED: [[RDX1:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX]], <vscale x 8 x float> [[FMUL1]])
+; CHECK-ORDERED: [[FMUL2:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD2]], [[WIDE_LOAD6]]
+; CHECK-ORDERED: [[RDX2:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX1]], <vscale x 8 x float> [[FMUL2]])
+; CHECK-ORDERED: [[FMUL3:%.*]] = fmul nnan <vscale x 8 x float> [[WIDE_LOAD3]], [[WIDE_LOAD7]]
+; CHECK-ORDERED: [[RDX3:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float [[RDX2]], <vscale x 8 x float> [[FMUL3]])
 ; CHECK-ORDERED: for.end
 ; CHECK-ORDERED: [[RES:%.*]] = phi float [ [[SCALAR:%.*]], %for.body ], [ [[RDX3]], %middle.block ]
 ; CHECK-ORDERED: ret float [[RES]]
 
 ; CHECK-UNORDERED-LABEL: @fmuladd_strict_fmf
 ; CHECK-UNORDERED: vector.body
-; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <4 x float> [ <float 0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <4 x float> [ <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
-; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <4 x float>, <4 x float>*
-; CHECK-UNORDERED: [[FMULADD]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD]], <4 x float> [[WIDE_LOAD4]], <4 x float> [[VEC_PHI]])
-; CHECK-UNORDERED: [[FMULADD1]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD1]], <4 x float> [[WIDE_LOAD5]], <4 x float> [[VEC_PHI1]])
-; CHECK-UNORDERED: [[FMULADD2]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD2]], <4 x float> [[WIDE_LOAD6]], <4 x float> [[VEC_PHI2]])
-; CHECK-UNORDERED: [[FMULADD3]] = call nnan <4 x float> @llvm.fmuladd.v4f32(<4 x float> [[WIDE_LOAD3]], <4 x float> [[WIDE_LOAD7]], <4 x float> [[VEC_PHI3]])
+; CHECK-UNORDERED: [[VEC_PHI:%.*]] = phi <vscale x 8 x float> [ insertelement (<vscale x 8 x float> shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), float 0.000000e+00, i32 0), %vector.ph ], [ [[FMULADD:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI1:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD1:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI2:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD2:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[VEC_PHI3:%.*]] = phi <vscale x 8 x float> [ shufflevector (<vscale x 8 x float> insertelement (<vscale x 8 x float> poison, float -0.000000e+00, i32 0), <vscale x 8 x float> poison, <vscale x 8 x i32> zeroinitializer), %vector.ph ], [ [[FMULADD3:%.*]], %vector.body ]
+; CHECK-UNORDERED: [[WIDE_LOAD:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD1:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD2:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD3:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD4:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD5:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD6:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[WIDE_LOAD7:%.*]] = load <vscale x 8 x float>, <vscale x 8 x float>*
+; CHECK-UNORDERED: [[FMULADD]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD]], <vscale x 8 x float> [[WIDE_LOAD4]], <vscale x 8 x float> [[VEC_PHI]])
+; CHECK-UNORDERED: [[FMULADD1]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD1]], <vscale x 8 x float> [[WIDE_LOAD5]], <vscale x 8 x float> [[VEC_PHI1]])
+; CHECK-UNORDERED: [[FMULADD2]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD2]], <vscale x 8 x float> [[WIDE_LOAD6]], <vscale x 8 x float> [[VEC_PHI2]])
+; CHECK-UNORDERED: [[FMULADD3]] = call nnan <vscale x 8 x float> @llvm.fmuladd.nxv8f32(<vscale x 8 x float> [[WIDE_LOAD3]], <vscale x 8 x float> [[WIDE_LOAD7]], <vscale x 8 x float> [[VEC_PHI3]])
 ; CHECK-UNORDERED-NOT: call nnan float @llvm.vector.reduce.fadd
 ; CHECK-UNORDERED: middle.block
-; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <4 x float> [[FMULADD1]], [[FMULADD]]
-; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <4 x float> [[FMULADD2]], [[BIN_RDX]]
-; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <4 x float> [[FMULADD3]], [[BIN_RDX1]]
-; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> [[BIN_RDX2]]
+; CHECK-UNORDERED: [[BIN_RDX:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD1]], [[FMULADD]]
+; CHECK-UNORDERED: [[BIN_RDX1:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD2]], [[BIN_RDX]]
+; CHECK-UNORDERED: [[BIN_RDX2:%.*]] = fadd nnan <vscale x 8 x float> [[FMULADD3]], [[BIN_RDX1]]
+; CHECK-UNORDERED: [[RDX:%.*]] = call nnan float @llvm.vector.reduce.fadd.nxv8f32(float -0.000000e+00, <vscale x 8 x float> [[BIN_RDX2]]
 ; CHECK-UNORDERED: for.body
 ; CHECK-UNORDERED: [[SUM_07:%.*]] = phi float [ [[SCALAR:%.*]], %scalar.ph ], [ [[MULADD:%.*]], %for.body ]
 ; CHECK-UNORDERED: [[LOAD:%.*]] = load float, float*
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-cost.ll
@@ -48,3 +48,53 @@
 for.end:
   ret double %add
 }
+
+; CHECK-VF4: Found an estimated cost of 23 for VF 4 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+; CHECK-VF8: Found an estimated cost of 46 for VF 8 For instruction: %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+
+define float @fmuladd_strict32(float* %a, float* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds float, float* %b, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %muladd = tail call float @llvm.fmuladd.f32(float %0, float %1, float %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %muladd
+}
+
+declare float @llvm.fmuladd.f32(float, float, float)
+
+; CHECK-VF4: Found an estimated cost of 22 for VF 4 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+; CHECK-VF8: Found an estimated cost of 44 for VF 8 For instruction: %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+
+define double @fmuladd_strict64(double* %a, double* %b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %muladd, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds double, double* %b, i64 %iv
+  %1 = load double, double* %arrayidx2, align 4
+  %muladd = tail call double @llvm.fmuladd.f64(double %0, double %1, double %sum.07)
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %muladd
+}
+
+declare double @llvm.fmuladd.f64(double, double, double)
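
For reference, the sketch below shows how the hook added in this patch could be queried from C++. It is illustrative only and not part of the patch: the helper name costFMulAddReduction, the <vscale x 8 x float> type, and the nnan flag are assumptions chosen to mirror the scalable test above. With the default BasicTTIImpl implementation the result is getArithmeticInstrCost(FMul) plus getArithmeticReductionCost(FAdd).

// Sketch only (not part of the patch): query the new TTI hook for a
// <vscale x 8 x float> fmuladd reduction, mirroring scalable-strict-fadd.ll.
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Operator.h"

using namespace llvm;

// Hypothetical helper; the name and parameters are illustrative assumptions.
static InstructionCost costFMulAddReduction(const TargetTransformInfo &TTI,
                                            LLVMContext &Ctx) {
  // <vscale x 8 x float>, the type the vectorizer picks in the test above.
  auto *VecTy = VectorType::get(Type::getFloatTy(Ctx),
                                ElementCount::getScalable(8));
  FastMathFlags FMF;
  FMF.setNoNaNs(); // Mirrors the nnan variant of the test.
  // Default lowering: cost(fmul on VecTy) + cost(fadd reduction of VecTy).
  return TTI.getFMulAddReductionCost(VecTy, FMF,
                                     TargetTransformInfo::TCK_RecipThroughput);
}

A target that can do better than this decomposition (for example, one with a fused multiply-accumulate reduction) could override getFMulAddReductionCost rather than relying on the default in BasicTTIImpl.h.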