Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6545,9 +6545,21 @@
 
   // If we have a scalar reduction (vector reductions are already dealt with
   // by this point), we can increase the critical path length if the loop
-  // we're interleaving is inside another loop. Limit, by default to 2, so the
-  // critical path only gets increased by one reduction operation.
+  // we're interleaving is inside another loop. For tree-wise reductions
+  // set the limit to 2, and for ordered reductions it's best to disable
+  // interleaving entirely.
   if (HasReductions && TheLoop->getLoopDepth() > 1) {
+    bool HasOrderedReductions =
+        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+          const RecurrenceDescriptor &RdxDesc = Reduction.second;
+          return RdxDesc.isOrdered();
+        });
+    if (HasOrderedReductions) {
+      LLVM_DEBUG(
+          dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
+      return 1;
+    }
+
     unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
     SmallIC = std::min(SmallIC, F);
     StoresIC = std::min(StoresIC, F);
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
@@ -0,0 +1,42 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -enable-strict-reductions=true -force-vector-width=1 -S < %s -debug 2>log | FileCheck %s
+; RUN: cat log | FileCheck %s --check-prefix=CHECK-DEBUG
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-DEBUG: LV: Not interleaving scalar ordered reductions.
+
+define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: vector.body
+
+entry:
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %entry, %for.cond3
+  %i.023.us = phi i64 [ %inc8.us, %for.cond3 ], [ 0, %entry ]
+  %arrayidx.us = getelementptr inbounds float, float* %dst, i64 %i.023.us
+  %mul.us = mul nsw i64 %i.023.us, %N
+  br label %for.body3.us
+
+for.body3.us:                                     ; preds = %for.body.us, %for.body3.us
+  %0 = phi float [ 0.000000e+00, %for.body.us ], [ %add6.us, %for.body3.us ]
+  %j.021.us = phi i64 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ]
+  %add.us = add nsw i64 %j.021.us, %mul.us
+  %arrayidx4.us = getelementptr inbounds float, float* %src, i64 %add.us
+  %1 = load float, float* %arrayidx4.us, align 4
+  %add6.us = fadd float %1, %0
+  %inc.us = add nuw nsw i64 %j.021.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %N
+  br i1 %exitcond.not, label %for.cond3, label %for.body3.us
+
+for.cond3:                                        ; preds = %for.body3.us
+  %add6.us.lcssa = phi float [ %add6.us, %for.body3.us ]
+  store float %add6.us.lcssa, float* %arrayidx.us, align 4
+  %inc8.us = add nuw nsw i64 %i.023.us, 1
+  %exitcond26.not = icmp eq i64 %inc8.us, %M
+  br i1 %exitcond26.not, label %exit, label %for.body.us
+
+exit:                                             ; preds = %for.cond3
+  ret void
+}
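
Note (not part of the patch): for reference, a rough C equivalent of the loop nest in the new test, reconstructed from the IR above. The function name and parameters mirror the test; the body is illustrative only. The inner loop is a scalar, in-order ("ordered") fadd reduction nested inside an outer loop, which is the case the cost-model change now refuses to interleave.

/* Illustrative sketch reconstructed from strict-fadd-vf1.ll, not taken from the patch. */
void foo(float *restrict dst, const float *restrict src, long M, long N) {
  for (long i = 0; i < M; ++i) {
    float sum = 0.0f;                /* ordered FP reduction accumulator */
    for (long j = 0; j < N; ++j)
      sum += src[i * N + j];         /* strict fadd: evaluation order must be preserved */
    dst[i] = sum;
  }
}

With strict (ordered) reductions enabled and VF=1, interleaving this inner loop would add extra serialized fadd chains without exposing parallelism, so the cost model now returns an interleave count of 1, as checked by the CHECK-DEBUG line in the test.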