diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -344,10 +344,14 @@ VFRange &Range, SmallPtrSetImpl &DeadInstructions, const DenseMap &SinkAfter); - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + /// Build VPlans for power-of-2 VF's for the ranges specified by Range1 and + /// Range2. according to the information gathered by Legal when it checked if + /// it is legal to vectorize the loop. This method creates VPlans using + /// VPRecipes. When the Maximum VF is set, the corresponding Minimum VF must + /// also be set. + void buildVPlansWithVPRecipes(VFRange Range1, + VFRange Range2 = {ElementCount::getFixed(1), + ElementCount::getFixed(1)}); /// Adjust the recipes for any inloop reductions. The chain of instructions /// leading from the loop exit instr to the phi need to be converted to diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -432,6 +432,17 @@ return None; } +/// Generate power-of-2 VFs starting from the minimum VF up to and including +/// maximum VF. For scalable VFs, it is only the minimum known value that is a +/// power of 2, the eventual VF may not be if the runtime value for vscale is +/// not also a power of two. +static void genVFs(SmallVectorImpl &List, const ElementCount &Min, + const ElementCount &Max) { + assert((Min.isScalable() == Max.isScalable()) && "Scalable flags must match"); + for (auto VF = Min; ElementCount::isKnownLE(VF, Max); VF *= 2) + List.push_back(VF); +} + // Forward declare GeneratedRTChecks. class GeneratedRTChecks; @@ -1241,7 +1252,9 @@ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectVectorizationFactor(const MaxVFCandidates &MaxFactors); + VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, const LoopVectorizationPlanner &LVP); @@ -1505,7 +1518,7 @@ /// Returns true if the target machine supports all of the reduction /// variables found for the given VF. - bool canVectorizeReductions(ElementCount VF) { + bool canVectorizeReductions(ElementCount VF) const { return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { RecurrenceDescriptor RdxDesc = Reduction.second; return TTI.isLegalToVectorizeReduction(RdxDesc, VF); @@ -5973,6 +5986,14 @@ InstructionCost::CostType CostA = *A.Cost.getValue(); InstructionCost::CostType CostB = *B.Cost.getValue(); + // When set to preferred, for now assume vscale may be larger than 1, so + // that scalable vectorization is slightly favorable over fixed-width + // vectorization. + if (Hints->isScalableVectorizationPreferred()) + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getKnownMinValue()) <= + (CostB * A.Width.getKnownMinValue()); + // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) // <=> (CostA * B.Width) < (CostB * A.Width) @@ -5980,13 +6001,8 @@ (CostB * A.Width.getKnownMinValue()); } -VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); - +VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( + const MaxVFCandidates &MaxFactors) { InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); @@ -5995,15 +6011,20 @@ VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF.isVector()) { + if (ForceVectorization && MaxFactors.hasVectorCandidates()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. ChosenFactor.Cost = std::numeric_limits::max(); } - for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); - i *= 2) { + // Iterate through all VFs, prioritising scalable vectors so they have a + // slight edge over fixed-width vectors when the cost is the same. + SmallVector VFCandidates; + genVFs(VFCandidates, ElementCount::getScalable(1), MaxFactors.ScalableVF); + genVFs(VFCandidates, ElementCount::getFixed(2), MaxFactors.FixedVF); + + for (const auto &i : VFCandidates) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. @@ -6011,10 +6032,10 @@ assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); VectorizationFactor Candidate(i, C.first); - LLVM_DEBUG( - dbgs() << "LV: Vector loop of width " << i << " costs: " - << (*Candidate.Cost.getValue() / Candidate.Width.getFixedValue()) - << ".\n"); + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " + << (*Candidate.Cost.getValue() / + Candidate.Width.getKnownMinValue()) + << ".\n"); if (!C.second && !ForceVectorization) { LLVM_DEBUG( @@ -7927,17 +7948,15 @@ // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes({UserVF}, {UserVF}); + buildVPlansWithVPRecipes({UserVF, UserVF.getWithIncrement(1)}); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } - ElementCount MaxVF = MaxFactors.FixedVF; - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); - - for (ElementCount VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + SmallVector VFCandidates; + genVFs(VFCandidates, ElementCount::getFixed(1), MaxFactors.FixedVF); + genVFs(VFCandidates, ElementCount::getScalable(1), MaxFactors.ScalableVF); + for (auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); @@ -7949,13 +7968,18 @@ CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); + VFRange FixedVFRange = {ElementCount::getFixed(1), + MaxFactors.FixedVF.getWithIncrement(1)}; + VFRange ScalableVFRange = {ElementCount::getScalable(1), + MaxFactors.ScalableVF.getWithIncrement(1)}; + buildVPlansWithVPRecipes(FixedVFRange, ScalableVFRange); + LLVM_DEBUG(printPlans(dbgs())); if (!MaxFactors.hasVectorCandidates()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - auto SelectedVF = CM.selectVectorizationFactor(MaxVF); + auto SelectedVF = CM.selectVectorizationFactor(MaxFactors); // Check if it is profitable to vectorize with runtime checks. unsigned NumRuntimePointerChecks = Requirements.getNumRuntimePointerChecks(); @@ -8901,8 +8925,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(VFRange Range1, + VFRange Range2) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Collect instructions from the original loop that will become trivially dead @@ -8926,13 +8950,17 @@ for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); - VF = SubRange.End; - } + auto CollectVPlans = [&](VFRange R) { + for (ElementCount VF = R.Start; ElementCount::isKnownLT(VF, R.End);) { + VFRange SubRange = {VF, R.End}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VF = SubRange.End; + } + }; + + CollectVPlans(Range1); + CollectVPlans(Range2); } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-op-cost.ll @@ -41,7 +41,7 @@ for.inc: ; preds = %for.body, %if.then %inc = add nuw nsw i64 %i.07, 1 %exitcond.not = icmp eq i64 %inc, %n - br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body + br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body, !llvm.loop !5 } @@ -90,3 +90,5 @@ !2 = !{!"llvm.loop.vectorize.width", i32 4} !3 = !{!"llvm.loop.vectorize.scalable.enable", i1 true} !4 = !{!"llvm.loop.vectorize.enable", i1 true} +!5 = distinct !{!5, !6} +!6 = !{!"llvm.loop.vectorize.scalable.enable", i1 false} diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -10,9 +10,13 @@ define void @test0(i32* %a, i8* %b, i32* %c) { ; CHECK: LV: Checking a loop in "test0" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4. ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4. ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 +; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 16. entry: br label %loop @@ -39,9 +43,13 @@ define void @test1(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test1" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 4. ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: vscale x 4. ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4 +; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16. entry: br label %loop @@ -69,9 +77,13 @@ define void @test2(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test2" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_ON: LV: Selecting VF: 4. ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4. ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2 +; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16. entry: br label %loop @@ -99,9 +111,13 @@ define void @test3(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test3" ; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_ON: LV: Selecting VF: 4. ; CHECK_SCALABLE_PREFERRED: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4. ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1 +; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16. entry: br label %loop @@ -124,15 +140,16 @@ ret void } -; Test the fallback mechanism when scalable vectors are not feasible due -; to e.g. dependence distance. For the '-scalable-vectorization=exclusive' -; it shouldn't try to vectorize with fixed-width vectors. define void @test4(i32* %a, i32* %b) { ; CHECK: LV: Checking a loop in "test4" ; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_ON: LV: Selecting VF: 4. ; CHECK_SCALABLE_PREFERRED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_PREFERRED: LV: Selecting VF: 4. ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4 ; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF +; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 4. entry: br label %loop