Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1194,6 +1194,18 @@
       unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
+  /// Calculate the cost of performing a strict (in-order) vector reduction.
+  ///
+  /// This is the cost of reducing the vector value of type \p Ty to a scalar
+  /// value using the operation denoted by \p Opcode. The reduction is
+  /// performed in lane order using an initial scalar value and a vector whose
+  /// elements have the same type as the scalar, i.e.
+  ///
+  ///   result = ((((init + v0) + v1) + v2) + v3)
+  InstructionCost getArithmeticStrictReductionCost(
+      unsigned Opcode, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   InstructionCost getMinMaxReductionCost(
       VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
@@ -1658,6 +1670,9 @@
                              bool IsPairwiseForm,
                              TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
+  getArithmeticStrictReductionCost(unsigned Opcode, VectorType *Ty,
+                                   TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                          bool IsPairwiseForm, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) = 0;
@@ -2163,6 +2178,11 @@
                                      CostKind);
   }
   InstructionCost
+  getArithmeticStrictReductionCost(unsigned Opcode, VectorType *Ty,
+                                   TTI::TargetCostKind CostKind) override {
+    return Impl.getArithmeticStrictReductionCost(Opcode, Ty, CostKind);
+  }
+  InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                          bool IsPairwiseForm, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -626,6 +626,11 @@
     return 1;
   }
 
+  InstructionCost getArithmeticStrictReductionCost(unsigned, VectorType *,
+                                                   TTI::TargetCostKind) const {
+    return 1;
+  }
+
   InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool,
                                          bool, TTI::TargetCostKind) const {
     return 1;
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2065,6 +2065,31 @@
         thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  /// Try to calculate the cost of performing strict (in-order) reductions,
+  /// which involves doing a sequence of floating point additions in lane
+  /// order, starting with an initial value. For example, consider a scalar
+  /// initial value 'InitVal' of type float and a vector of type <4 x float>:
+  ///
+  ///   Vector = <float %v0, float %v1, float %v2, float %v3>
+  ///
+  ///   %add1 = %InitVal + %v0
+  ///   %add2 = %add1 + %v1
+  ///   %add3 = %add2 + %v2
+  ///   %add4 = %add3 + %v3
+  ///
+  /// As a simple estimate we can say the cost of such a reduction is 4 times
+  /// the cost of a scalar FP addition. We can only estimate the costs for
+  /// fixed-width vectors here because for scalable vectors we do not know the
+  /// runtime number of operations.
+  InstructionCost
+  getArithmeticStrictReductionCost(unsigned Opcode, VectorType *Ty,
+                                   TTI::TargetCostKind CostKind) {
+    auto *VTy = cast<FixedVectorType>(Ty);
+    InstructionCost Cost =
+        getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind);
+    return Cost * VTy->getNumElements();
+  }
+
   /// Try to calculate op costs for min/max reduction operations.
   /// \param CondTy Conditional type for the Select instruction.
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -902,6 +902,14 @@
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getArithmeticStrictReductionCost(
+    unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost =
+      TTIImpl->getArithmeticStrictReductionCost(Opcode, Ty, CostKind);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getMinMaxReductionCost(
     VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
     TTI::TargetCostKind CostKind) const {
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -131,6 +131,18 @@
     return BaseT::getMaxVScale();
   }
 
+  /// Try to return an estimated cost factor that can be used as a multiplier
+  /// when scalarizing an operation for a vector with ElementCount \p VF.
+  /// For scalable vectors this currently takes the most pessimistic view based
+  /// upon the maximum possible value for vscale.
+  unsigned getScalarizationCostFactor(ElementCount VF) const {
+    if (!VF.isScalable())
+      return VF.getKnownMinValue();
+    Optional<unsigned> MaxNumVScale = getMaxVScale();
+    assert(MaxNumVScale && "Expected valid max vscale value");
+    return *MaxNumVScale * VF.getKnownMinValue();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
@@ -305,6 +317,10 @@
       unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
+  InstructionCost getArithmeticStrictReductionCost(
+      unsigned Opcode, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask, int Index,
                                  VectorType *SubTp);
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1404,14 +1404,9 @@
     return InstructionCost::getInvalid();
 
   ElementCount LegalVF = LT.second.getVectorElementCount();
-  Optional<unsigned> MaxNumVScale = getMaxVScale();
-  assert(MaxNumVScale && "Expected valid max vscale value");
-
   InstructionCost MemOpCost =
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
-  unsigned MaxNumElementsPerGather =
-      MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
-  return LT.first * MaxNumElementsPerGather * MemOpCost;
+  return LT.first * MemOpCost * getScalarizationCostFactor(LegalVF);
 }
 
 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
@@ -1819,6 +1814,18 @@
   }
 }
 
+InstructionCost AArch64TTIImpl::getArithmeticStrictReductionCost(
+    unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) {
+  if (!isa<ScalableVectorType>(Ty))
+    return BaseT::getArithmeticStrictReductionCost(Opcode, Ty, CostKind);
+
+  auto *VTy = cast<ScalableVectorType>(Ty);
+  InstructionCost Cost =
+      getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
+  Cost *= getScalarizationCostFactor(VTy->getElementCount());
+  return Cost;
+}
+
 InstructionCost
 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                            bool IsPairwiseForm,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7168,6 +7168,11 @@
     const RecurrenceDescriptor &RdxDesc =
         Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+
+    if (useOrderedReductions(RdxDesc))
+      return TTI.getArithmeticStrictReductionCost(RdxDesc.getOpcode(), VectorTy,
+                                                  CostKind);
+
     InstructionCost BaseCost = TTI.getArithmeticReductionCost(
         RdxDesc.getOpcode(), VectorTy, false, CostKind);
Index: llvm/test/Analysis/CostModel/AArch64/strict-fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/strict-fadd.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 8 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 16 for VF 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 8 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 16 for VF 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %add
+}
Index: llvm/test/Analysis/CostModel/AArch64/sve-strict-fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/sve-strict-fadd.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=4 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=8 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret double %add
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
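
A note on the expected costs in the tests above: they follow directly from the model. The fixed-width numbers are the scalar fadd cost multiplied by the lane count, and the scalable numbers additionally multiply by the maximum possible vscale. The sketch below is a minimal standalone illustration of that arithmetic, not part of the patch itself: the helper name strictReductionCost is hypothetical, and it assumes a scalar fadd cost of 2 (BasicTTIImpl prices floating-point ops at twice the integer cost) and a maximum vscale of 16 (the 2048-bit SVE architectural limit divided by the 128-bit granule), which is what the expected values above imply for AArch64.

// Standalone illustration (hypothetical helper, not part of the patch).
#include <cassert>

// Mirrors the strict-reduction cost arithmetic: fixed-width vectors pay one
// in-order FP add per lane; scalable vectors pessimistically assume the
// largest possible vscale.
unsigned strictReductionCost(unsigned ScalarFAddCost, unsigned MinLanes,
                             bool Scalable, unsigned MaxVScale) {
  unsigned NumOps = Scalable ? MaxVScale * MinLanes : MinLanes;
  return ScalarFAddCost * NumOps;
}

int main() {
  assert(strictReductionCost(2, 4, /*Scalable=*/false, 16) == 8);   // VF 4
  assert(strictReductionCost(2, 8, /*Scalable=*/false, 16) == 16);  // VF 8
  assert(strictReductionCost(2, 4, /*Scalable=*/true, 16) == 128);  // VF vscale x 4
  assert(strictReductionCost(2, 8, /*Scalable=*/true, 16) == 256);  // VF vscale x 8
  return 0;
}

Pricing scalable reductions at the maximum possible vscale is deliberately pessimistic: until the cost model can reason about the runtime value of vscale, an in-order SVE reduction is costed as if every architecturally possible lane had to be added in sequence.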