Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6545,9 +6545,21 @@
 
   // If we have a scalar reduction (vector reductions are already dealt with
   // by this point), we can increase the critical path length if the loop
-  // we're interleaving is inside another loop. Limit, by default to 2, so the
-  // critical path only gets increased by one reduction operation.
+  // we're interleaving is inside another loop. For tree-wise reductions
+  // set the limit to 2, and for ordered reductions it's best to disable
+  // interleaving entirely.
   if (HasReductions && TheLoop->getLoopDepth() > 1) {
+    bool HasOrderedReductions =
+        any_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool {
+          const RecurrenceDescriptor &RdxDesc = Reduction.second;
+          return RdxDesc.isOrdered();
+        });
+    if (HasOrderedReductions) {
+      LLVM_DEBUG(
+          dbgs() << "LV: Not interleaving scalar ordered reductions.\n");
+      return 1;
+    }
+
     unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
     SmallIC = std::min(SmallIC, F);
     StoresIC = std::min(StoresIC, F);
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd-vf1.ll
@@ -0,0 +1,42 @@
+; REQUIRES: asserts
+; RUN: opt -loop-vectorize -enable-strict-reductions=true -force-vector-width=1 -S < %s -debug 2>log | FileCheck %s
+; RUN: cat log | FileCheck %s --check-prefix=CHECK-DEBUG
+
+target triple = "aarch64-unknown-linux-gnu"
+
+; CHECK-DEBUG: LV: Not interleaving scalar ordered reductions.
+
+define void @foo(float* noalias nocapture %dst, float* noalias nocapture readonly %src, i64 %M, i64 %N) {
+; CHECK-LABEL: @foo(
+; CHECK-NOT: vector.body
+
+entry:
+  br label %for.body.us
+
+for.body.us:                                      ; preds = %entry, %for.cond3
+  %i.023.us = phi i64 [ %inc8.us, %for.cond3 ], [ 0, %entry ]
+  %arrayidx.us = getelementptr inbounds float, float* %dst, i64 %i.023.us
+  %mul.us = mul nsw i64 %i.023.us, %N
+  br label %for.body3.us
+
+for.body3.us:                                     ; preds = %for.body.us, %for.body3.us
+  %0 = phi float [ 0.000000e+00, %for.body.us ], [ %add6.us, %for.body3.us ]
+  %j.021.us = phi i64 [ 0, %for.body.us ], [ %inc.us, %for.body3.us ]
+  %add.us = add nsw i64 %j.021.us, %mul.us
+  %arrayidx4.us = getelementptr inbounds float, float* %src, i64 %add.us
+  %1 = load float, float* %arrayidx4.us, align 4
+  %add6.us = fadd float %1, %0
+  %inc.us = add nuw nsw i64 %j.021.us, 1
+  %exitcond.not = icmp eq i64 %inc.us, %N
+  br i1 %exitcond.not, label %for.cond3, label %for.body3.us
+
+for.cond3:                                        ; preds = %for.body3.us
+  %add6.us.lcssa = phi float [ %add6.us, %for.body3.us ]
+  store float %add6.us.lcssa, float* %arrayidx.us, align 4
+  %inc8.us = add nuw nsw i64 %i.023.us, 1
+  %exitcond26.not = icmp eq i64 %inc8.us, %M
+  br i1 %exitcond26.not, label %exit, label %for.body.us
+
+exit:                                             ; preds = %for.cond3
+  ret void
+}
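
Note (not part of the patch): for reference, a rough C equivalent of the loop nest in the new test, reconstructed from the IR above. The function name and parameters mirror the test; the body is illustrative only. The inner loop is a scalar, in-order ("ordered") fadd reduction nested inside an outer loop, which is the case the cost-model change now refuses to interleave.

/* Illustrative sketch reconstructed from strict-fadd-vf1.ll, not taken from the patch. */
void foo(float *restrict dst, const float *restrict src, long M, long N) {
  for (long i = 0; i < M; ++i) {
    float sum = 0.0f;                /* ordered FP reduction accumulator */
    for (long j = 0; j < N; ++j)
      sum += src[i * N + j];         /* strict fadd: evaluation order must be preserved */
    dst[i] = sum;
  }
}

With strict (ordered) reductions enabled and VF=1, interleaving this inner loop would add extra serialized fadd chains without exposing parallelism, so the cost model now returns an interleave count of 1, as checked by the CHECK-DEBUG line in the test.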