diff --git a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h --- a/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ b/llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -144,6 +144,13 @@ return (ForceKind)Force.Value; } + /// \return true if the cost-model for scalable vectorization should + /// favor vectorization with scalable vectors over fixed-width vectors when + /// the cost-model is inconclusive. + bool isScalableVectorizationPreferred() const { + return Scalable.Value == SK_PreferScalable; + } + /// \return true if scalable vectorization has been explicitly enabled. bool isScalableVectorizationExplicitlyEnabled() const { return Scalable.Value == SK_PreferFixedWidth || diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -70,6 +70,7 @@ #include "llvm/ADT/Optional.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/StringRef.h" @@ -1204,6 +1205,16 @@ CM_ScalarEpilogueNotAllowedUsePredicate }; +/// ElementCountComparator creates a total ordering for ElementCount +/// for the purposes of using it in a set structure. +struct ElementCountComparator { + bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { + return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < + std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); + } +}; +using ElementCountSet = SmallSet; + /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. /// In many cases vectorization is not profitable. This can happen because of @@ -1236,10 +1247,12 @@ bool runtimeChecksRequired(); /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every power of two up to MaxVF. If UserVF is not ZERO + /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectVectorizationFactor(const ElementCountSet &CandidateVFs); + VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, const LoopVectorizationPlanner &LVP); @@ -6013,6 +6026,14 @@ return RTCostA < RTCostB; } + // When set to preferred, for now assume vscale may be larger than 1, so + // that scalable vectorization is slightly favorable over fixed-width + // vectorization. + if (Hints->isScalableVectorizationPreferred()) + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getKnownMinValue()) <= + (CostB * A.Width.getKnownMinValue()); + // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) // <=> (CostA * B.Width) < (CostB * A.Width) @@ -6020,30 +6041,30 @@ (CostB * A.Width.getKnownMinValue()); } -VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); - +VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( + const ElementCountSet &VFCandidates) { InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); + assert(VFCandidates.count(ElementCount::getFixed(1)) && + "Expected Scalar VF to be a candidate"); const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF.isVector()) { + if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. ChosenFactor.Cost = std::numeric_limits::max(); } - for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); - i *= 2) { + for (const auto &i : VFCandidates) { + // The cost for scalar VF=1 is already calculated, so ignore it. + if (i.isScalar()) + continue; + // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. @@ -6053,7 +6074,9 @@ VectorizationFactor Candidate(i, C.first); LLVM_DEBUG( dbgs() << "LV: Vector loop of width " << i << " costs: " - << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) + << (*Candidate.Cost.getValue() / + Candidate.Width.getKnownMinValue()) + << (i.isScalable() ? " (assuming a minimum vscale of 1)" : "") << ".\n"); if (!C.second && !ForceVectorization) { @@ -7967,17 +7990,21 @@ // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes({UserVF}, {UserVF}); + buildVPlansWithVPRecipes(UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } - ElementCount MaxVF = MaxFactors.FixedVF; - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); + // Populate the set of Vectorization Factor Candidates. + ElementCountSet VFCandidates; + for (auto VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert(VF); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert(VF); - for (ElementCount VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + for (const auto VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); @@ -7988,14 +8015,15 @@ } CM.collectInLoopReductions(); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); LLVM_DEBUG(printPlans(dbgs())); if (!MaxFactors.hasVector()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - auto SelectedVF = CM.selectVectorizationFactor(MaxVF); + auto SelectedVF = CM.selectVectorizationFactor(VFCandidates); // Check if it is profitable to vectorize with runtime checks. unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -41,7 +41,7 @@ for.inc: ; preds = %for.body, %if.then %inc = add nuw nsw i64 %i.07, 1 %exitcond.not = icmp eq i64 %inc, %n - br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5 } @@ -90,3 +90,5 @@ !2 = !{!"llvm.loop.vectorize.width", i32 4} !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} !4 = !{!"llvm.loop.vectorize.enable", i1 true} +!5 = distinct !{!5, !6} +!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 false} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -10,9 +10,13 @@ define void @test0(i32* %a, i8* %b, i32* %c) { ; CHECK: LV: Checking a loop in "test0" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 16 +; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: vscale x 16 entry: br label %loop @@ -39,9 +43,13 @@ define void @test1(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test1" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16 entry: br label %loop @@ -69,9 +77,13 @@ define void @test2(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test2" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16 entry: br label %loop @@ -99,9 +111,13 @@ define void @test3(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test3" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 16 entry: br label %loop @@ -129,9 +145,13 @@ define void @test4(i32* %a, i32* %b) { ; CHECK: LV: Checking a loop in "test4" ; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_ON: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_PREFERRED_MAXBW-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_PREFERRED_MAXBW: LV: Selecting VF: 4 entry: br label %loop