diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -330,10 +330,14 @@ VFRange &Range, SmallPtrSetImpl &DeadInstructions, const DenseMap &SinkAfter); - /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, - /// according to the information gathered by Legal when it checked if it is - /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + /// Build VPlans for power-of-2 VF's for the ranges specified by Range1 and + /// Range2. according to the information gathered by Legal when it checked if + /// it is legal to vectorize the loop. This method creates VPlans using + /// VPRecipes. When the Maximum VF is set, the corresponding Minimum VF must + /// also be set. + void buildVPlansWithVPRecipes(VFRange Range1, + VFRange Range2 = {ElementCount::getFixed(1), + ElementCount::getFixed(0)}); /// Adjust the recipes for any inloop reductions. The chain of instructions /// leading from the loop exit instr to the phi need to be converted to diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -425,6 +425,17 @@ return None; } +/// Generate power-of-2 VFs starting from the minimum VF up to and including +/// maximum VF. For scalable VFs, it is only the minimum known value that is a +/// power of 2, the eventual VF may not be if the runtime value for vscale is +/// not also a power of two. +static void genVFs(SmallVectorImpl &List, const ElementCount &Min, + const ElementCount &Max) { + assert((Min.isScalable() == Max.isScalable()) && "Scalable flags must match"); + for (auto VF = Min; ElementCount::isKnownLE(VF, Max); VF *= 2) + List.push_back(VF); +} + // Forward declare GeneratedRTChecks. class GeneratedRTChecks; @@ -1235,7 +1246,9 @@ /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is /// possible. - VectorizationFactor selectVectorizationFactor(ElementCount MaxVF); + VectorizationFactor + selectVectorizationFactor(const MaxVFCandidates &MaxFactors); + VectorizationFactor selectEpilogueVectorizationFactor(const ElementCount MaxVF, const LoopVectorizationPlanner &LVP); @@ -1499,7 +1512,7 @@ /// Returns true if the target machine supports all of the reduction /// variables found for the given VF. - bool canVectorizeReductions(ElementCount VF) { + bool canVectorizeReductions(ElementCount VF) const { return (all_of(Legal->getReductionVars(), [&](auto &Reduction) -> bool { RecurrenceDescriptor RdxDesc = Reduction.second; return TTI.isLegalToVectorizeReduction(RdxDesc, VF); @@ -5905,13 +5918,8 @@ return MaxVF; } -VectorizationFactor -LoopVectorizationCostModel::selectVectorizationFactor(ElementCount MaxVF) { - // FIXME: This can be fixed for scalable vectors later, because at this stage - // the LoopVectorizer will only consider vectorizing a loop with scalable - // vectors when the loop has a hint to enable vectorization for a given VF. - assert(!MaxVF.isScalable() && "scalable vectors not yet supported"); - +VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( + const MaxVFCandidates &MaxFactors) { InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); @@ -5921,21 +5929,25 @@ float Cost = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && MaxVF.isVector()) { + if (ForceVectorization && MaxFactors.hasVectorCandidates()) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. Cost = std::numeric_limits::max(); } - for (auto i = ElementCount::getFixed(2); ElementCount::isKnownLE(i, MaxVF); - i *= 2) { + // Iterate through all VFs, prioritising scalable vectors so they have a + // slight edge over fixed-width vectors when the cost is the same. + SmallVector VFCandidates; + genVFs(VFCandidates, ElementCount::getScalable(1), MaxFactors.ScalableVF); + genVFs(VFCandidates, ElementCount::getFixed(2), MaxFactors.FixedVF); + for (const auto &i : VFCandidates) { // Notice that the vector loop needs to be executed less times, so // we need to divide the cost of the vector loops by the width of // the vector elements. VectorizationCostTy C = expectedCost(i); assert(C.first.isValid() && "Unexpected invalid cost for vector loop"); - float VectorCost = *C.first.getValue() / (float)i.getFixedValue(); + float VectorCost = *C.first.getValue() / (float)i.getKnownMinValue(); LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " << (int)VectorCost << ".\n"); if (!C.second && !ForceVectorization) { @@ -7587,12 +7599,7 @@ } } - unsigned N; - if (isScalarAfterVectorization(I, VF)) { - assert(!VF.isScalable() && "VF is assumed to be non scalable"); - N = VF.getKnownMinValue(); - } else - N = 1; + unsigned N = isScalarAfterVectorization(I, VF) ? VF.getKnownMinValue() : 1; return N * TTI.getCastInstrCost(Opcode, VectorTy, SrcVecTy, CCH, CostKind, I); } @@ -7802,17 +7809,15 @@ // profitable to scalarize. CM.selectUserVectorizationFactor(UserVF); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes({UserVF}, {UserVF}); + buildVPlansWithVPRecipes({UserVF, UserVF.getWithIncrement(1)}); LLVM_DEBUG(printPlans(dbgs())); return {{UserVF, 0}}; } - ElementCount MaxVF = MaxFactors.FixedVF; - assert(!MaxVF.isScalable() && - "Scalable vectors not yet supported beyond this point"); - - for (ElementCount VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + SmallVector VFCandidates; + genVFs(VFCandidates, ElementCount::getFixed(1), MaxFactors.FixedVF); + genVFs(VFCandidates, ElementCount::getScalable(1), MaxFactors.ScalableVF); + for (auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. CM.collectUniformsAndScalars(VF); @@ -7824,13 +7829,18 @@ CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxVF); + VFRange FixedVFRange = {ElementCount::getFixed(1), + MaxFactors.FixedVF.getWithIncrement(1)}; + VFRange ScalableVFRange = {ElementCount::getScalable(1), + MaxFactors.ScalableVF.getWithIncrement(1)}; + buildVPlansWithVPRecipes(FixedVFRange, ScalableVFRange); + LLVM_DEBUG(printPlans(dbgs())); if (!MaxFactors.hasVectorCandidates()) return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - return CM.selectVectorizationFactor(MaxVF); + return CM.selectVectorizationFactor(MaxFactors); } void LoopVectorizationPlanner::setBestPlan(ElementCount VF, unsigned UF) { @@ -8729,8 +8739,8 @@ return toVPRecipeResult(tryToWiden(Instr, *Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(VFRange Range1, + VFRange Range2) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Collect instructions from the original loop that will become trivially dead @@ -8754,13 +8764,17 @@ for (Instruction *I : DeadInstructions) SinkAfter.erase(I); - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); - for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { - VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); - VF = SubRange.End; - } + auto CollectVPlans = [&](VFRange R) { + for (ElementCount VF = R.Start; ElementCount::isKnownLT(VF, R.End);) { + VFRange SubRange = {VF, R.End}; + VPlans.push_back( + buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VF = SubRange.End; + } + }; + + CollectVPlans(Range1); + CollectVPlans(Range2); } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vf-analysis.ll @@ -10,9 +10,13 @@ define void @test0(i32* %a, i8* %b, i32* %c) { ; CHECK: LV: Checking a loop in "test0" ; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ON: LV: Selecting VF: vscale x 4. ; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Selecting VF: vscale x 4. ; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_DISABLED: LV: Selecting VF: 1 ; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 16 +; CHECK_MAXBW: LV: Selecting VF: vscale x 16. entry: br label %loop @@ -38,9 +42,13 @@ define void @test1(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test1" ; CHECK_ON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ON: LV: Selecting VF: vscale x 4. ; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 4 +; CHECK_ALWAYSON: LV: Selecting VF: vscale x 4. ; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_DISABLED: LV: Selecting VF: 1 ; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 4 +; CHECK_MAXBW: LV: Selecting VF: 16. entry: br label %loop @@ -68,9 +76,13 @@ define void @test2(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test2" ; CHECK_ON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_ON: LV: Selecting VF: vscale x 2. ; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 2 +; CHECK_ALWAYSON: LV: Selecting VF: vscale x 2. ; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_DISABLED: LV: Selecting VF: 1 ; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 2 +; CHECK_MAXBW: LV: Selecting VF: 16. entry: br label %loop @@ -98,9 +110,13 @@ define void @test3(i32* %a, i8* %b) { ; CHECK: LV: Checking a loop in "test3" ; CHECK_ON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_ON: LV: Selecting VF: 1. ; CHECK_ALWAYSON: LV: Found feasible scalable VF = vscale x 1 +; CHECK_ALWAYSON: LV: Selecting VF: 1. ; CHECK_DISABLED-NOT: LV: Found feasible scalable VF +; CHECK_DISABLED: LV: Selecting VF: 1 ; CHECK_MAXBW: LV: Found feasible scalable VF = vscale x 1 +; CHECK_MAXBW: LV: Selecting VF: 16. entry: br label %loop