diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -332,10 +332,6 @@ cl::desc("Prefer in-loop vector reductions, " "overriding the targets preference.")); -// FIXME: When loop hints are passed which allow reordering of FP operations, -// we still choose to use strict reductions with this flag. We should instead -// use the default behaviour of vectorizing with unordered reductions if -// reordering is allowed. cl::opt<bool> EnableStrictReductions( "enable-strict-reductions", cl::init(false), cl::Hidden, cl::desc("Enable the vectorisation of loops with in-order (strict) " @@ -558,6 +554,10 @@ /// Fix the non-induction PHIs in the OrigPHIsToFix vector. void fixNonInductionPHIs(VPTransformState &State); + /// Returns true if the reordering of FP operations is not allowed, but we are + /// able to vectorize with strict in-order reductions for the given RdxDesc. + bool useOrderedReductions(RecurrenceDescriptor &RdxDesc); + /// Create a broadcast instruction. This method generates a broadcast /// instruction (shuffle) for loop invariant values and for the induction /// value. If this is the induction variable then we extend it to N, N+1, ... @@ -1306,6 +1306,15 @@ /// outside. In loop reductions are collected into InLoopReductionChains. void collectInLoopReductions(); + /// Returns true if we should use strict in-order reductions for the given + /// RdxDesc. This is true if the -enable-strict-reductions flag is passed, + /// the IsOrdered flag of RdxDesc is set and we do not allow reordering + /// of FP operations. + bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) { + return EnableStrictReductions && !Hints->allowReordering() && + RdxDesc.isOrdered(); + } + /// \returns The smallest bitwidth each instruction can be represented with. 
/// The vector equivalents of these instructions should be truncated to this /// type. @@ -4316,10 +4325,6 @@ LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock); } -static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) { - return EnableStrictReductions && RdxDesc.isOrdered(); -} - void InnerLoopVectorizer::fixReduction(VPWidenPHIRecipe *PhiR, VPTransformState &State) { PHINode *OrigPhi = cast<PHINode>(PhiR->getUnderlyingValue()); @@ -4348,7 +4353,7 @@ BasicBlock *VectorLoopLatch = LI->getLoopFor(LoopVectorBody)->getLoopLatch(); bool IsOrdered = State.VF.isVector() && IsInLoopReductionPhi && - useOrderedReductions(RdxDesc); + Cost->useOrderedReductions(RdxDesc); for (unsigned Part = 0; Part < UF; ++Part) { if (IsOrdered && Part > 0) @@ -4654,6 +4659,10 @@ } } +bool InnerLoopVectorizer::useOrderedReductions(RecurrenceDescriptor &RdxDesc) { + return Cost->useOrderedReductions(RdxDesc); +} + void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPValue *VPDef, VPUser &Operands, unsigned UF, ElementCount VF, bool IsPtrLoopInvariant, @@ -4793,7 +4802,7 @@ bool IsOrdered = State.VF.isVector() && Cost->isInLoopReduction(cast<PHINode>(PN)) && - useOrderedReductions(*RdxDesc); + Cost->useOrderedReductions(*RdxDesc); for (unsigned Part = 0; Part < State.UF; ++Part) { // This is phase one of vectorizing PHIs. 
@@ -9486,7 +9495,7 @@ Value *PrevInChain = State.get(getChainOp(), 0); for (unsigned Part = 0; Part < State.UF; ++Part) { RecurKind Kind = RdxDesc->getRecurrenceKind(); - bool IsOrdered = useOrderedReductions(*RdxDesc); + bool IsOrdered = State.ILV->useOrderedReductions(*RdxDesc); Value *NewVecOp = State.get(getVecOp(), Part); if (VPValue *Cond = getCondOp()) { Value *NewCond = State.get(Cond, Part); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-strict-fadd.ll @@ -1,6 +1,7 @@ ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED +; RUN: opt < %s -loop-vectorize -scalable-vectorization=on -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { ; CHECK-ORDERED-LABEL: @fadd_strict diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll @@ -1,6 +1,7 @@ ; RUN: opt 
< %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-NOT-VECTORIZED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=false -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED ; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=false -S 2>%t | FileCheck %s --check-prefix=CHECK-ORDERED +; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -enable-strict-reductions=true -hints-allow-reordering=true -S 2>%t | FileCheck %s --check-prefix=CHECK-UNORDERED define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) { ; CHECK-ORDERED-LABEL: @fadd_strict @@ -551,10 +552,10 @@ ; return sum; ;} ; -; Note: These tests do not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even -; with the -hints-allow-reordering flag set to true. ; Strict reduction could be performed in-loop, but ordered FP induction variables are not supported +; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even +; with the -hints-allow-reordering flag set to true. 
define float @induction_and_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, i64 %N) { ; CHECK-ORDERED-LABEL: @induction_and_reduction ; CHECK-ORDERED-NOT: vector.body @@ -594,25 +595,41 @@ ; CHECK-ORDERED: vector.body ; CHECK-ORDERED: %[[RDX_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[FADD2:.*]], %vector.body ] ; CHECK-ORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] -; CHECK-ORDERED: %[[STEP_ADD:.*]] = fadd fast <4 x float> %[[IND_PHI]], ; CHECK-ORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* -; CHECK-ORDERED: %[[LOAD2:.*]] = load <4 x float>, <4 x float>* ; CHECK-ORDERED: %[[FADD1:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[RDX_PHI]], <4 x float> %[[LOAD1]]) -; CHECK-ORDERED: %[[FADD2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[FADD1]], <4 x float> %[[LOAD2]]) -; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[STEP_ADD]], +; CHECK-ORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], ; CHECK-ORDERED: for.body -; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD3:.*]], %for.body ] +; CHECK-ORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD2:.*]], %for.body ] ; CHECK-ORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] ; CHECK-ORDERED: store float %[[IND_SUM_PHI]], float* ; CHECK-ORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 -; CHECK-ORDERED: %[[LOAD3:.*]] = load float, float* -; CHECK-ORDERED: %[[FADD3]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD3]] +; CHECK-ORDERED: %[[LOAD2:.*]] = load float, float* +; CHECK-ORDERED: %[[FADD2]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]] ; CHECK-ORDERED: for.end -; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD3]], %for.body ], [ %[[FADD2]], %middle.block ] +; CHECK-ORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD2]], %for.body ], [ 
%[[FADD1]], %middle.block ] ; CHECK-ORDERED: ret float %[[RES_PHI]] ; CHECK-UNORDERED-LABEL: @fast_induction_and_reduction -; CHECK-UNORDERED-NOT: vector.body +; CHECK-UNORDERED: vector.ph +; CHECK-UNORDERED: %[[INDUCTION:.*]] = fadd fast <4 x float> {{.*}}, +; CHECK-UNORDERED: vector.body +; CHECK-UNORDERED: %[[RDX_PHI:.*]] = phi <4 x float> [ , %vector.ph ], [ %[[VEC_FADD:.*]], %vector.body ] +; CHECK-UNORDERED: %[[IND_PHI:.*]] = phi <4 x float> [ %[[INDUCTION]], %vector.ph ], [ %[[VEC_IND_NEXT:.*]], %vector.body ] +; CHECK-UNORDERED: %[[LOAD1:.*]] = load <4 x float>, <4 x float>* +; CHECK-UNORDERED: %[[VEC_FADD]] = fadd <4 x float> %[[RDX_PHI]], %[[LOAD1]] +; CHECK-UNORDERED: %[[VEC_IND_NEXT]] = fadd fast <4 x float> %[[IND_PHI]], +; CHECK-UNORDERED: middle.block: +; CHECK-UNORDERED: %[[VEC_RDX:.*]] = call float @llvm.vector.reduce.fadd.v4f32(float -0.000000e+00, <4 x float> %[[VEC_FADD]]) +; CHECK-UNORDERED: for.body: +; CHECK-UNORDERED: %[[RDX_SUM_PHI:.*]] = phi float [ {{.*}}, %scalar.ph ], [ %[[FADD:.*]], %for.body ] +; CHECK-UNORDERED: %[[IND_SUM_PHI:.*]] = phi fast float [ {{.*}}, %scalar.ph ], [ %[[ADD_IND:.*]], %for.body ] +; CHECK-UNORDERED: store float %[[IND_SUM_PHI]], float* +; CHECK-UNORDERED: %[[ADD_IND]] = fadd fast float %[[IND_SUM_PHI]], 2.000000e+00 +; CHECK-UNORDERED: %[[LOAD2:.*]] = load float, float* +; CHECK-UNORDERED: %[[FADD]] = fadd float %[[RDX_SUM_PHI]], %[[LOAD2]] +; CHECK-UNORDERED: for.end +; CHECK-UNORDERED: %[[RES_PHI:.*]] = phi float [ %[[FADD]], %for.body ], [ %[[VEC_RDX]], %middle.block ] +; CHECK-UNORDERED: ret float %[[RES_PHI]] ; CHECK-NOT-VECTORIZED-LABEL: @fast_induction_and_reduction ; CHECK-NOT-VECTORIZED-NOT: vector.body @@ -632,13 +649,15 @@ %add3 = fadd float %sum.015, %0 %iv.next = add nuw nsw i64 %iv, 1 %exitcond.not = icmp eq i64 %iv.next, %N - br i1 %exitcond.not, label %for.end, label %for.body + br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2 for.end: ret float %add3 } ; The FP induction is 
fast, but here we can't vectorize as only one of the reductions is an FAdd that can be performed in-loop +; Note: This test does not use metadata hints, and as such we should not expect the CHECK-UNORDERED case to vectorize, even +; with the -hints-allow-reordering flag set to true. define float @fast_induction_unordered_reduction(float* nocapture readonly %values, float %init, float* noalias nocapture %A, float* noalias nocapture %B, i64 %N) { ; CHECK-ORDERED-LABEL: @fast_induction_unordered_reduction