diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -267,7 +267,7 @@ LoopVectorizationLegality *Legal; /// The profitability analysis. - LoopVectorizationCostModel &CM; + SmallVector CMs; /// The interleaved access analysis. InterleavedAccessInfo &IAI; @@ -292,16 +292,18 @@ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, + ArrayRef CMs, InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), ORE(ORE) {} + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CMs(CMs), + IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. + void plan(ElementCount UserVF, unsigned UserIC, + LoopVectorizationCostModel &CM); std::optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best @@ -331,7 +333,12 @@ VPlans, [&](const std::pair>> - &Plan) { return Plan.first->hasVF(VF); }); + &Plan) { + return any_of(Plan.second, + [VF](const std::pair &P) { + return P.first.Width == VF; + }); + }); } /// Test a \p Predicate on a \p Range of VF's. Return the value of applying @@ -357,13 +364,14 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF, + LoopVectorizationCostModel &CM); private: /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(VFRange &Range, LoopVectorizationCostModel &CM); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. @@ -371,13 +379,16 @@ /// returned VPlan is valid for. If no VPlan can be built for the input range, /// set the largest included VF to the maximum VF for which no plan could be /// built. - std::optional tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions); + std::optional + tryToBuildVPlanWithVPRecipes(VFRange &Range, + SmallPtrSetImpl &DeadInstructions, + LoopVectorizationCostModel &CM); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF, + LoopVectorizationCostModel &CM); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be @@ -386,7 +397,8 @@ // between the phi and live-out recipes when folding the tail. void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF); + ElementCount MinVF, + LoopVectorizationCostModel &CM); std::optional getVScaleForTuning() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5440,8 +5440,8 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && - CM.foldTailByMasking() && MaxTripCount) { + // TODO: Missing check for CM.foldTailByMasking here. + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If we are folding the tail and the trip count is a known (possibly small) // constant, the trip count will be rounded up to an integer number of // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), @@ -5489,7 +5489,8 @@ return TTI->getVScaleForTuning(); } VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { - assert(!VPlans.empty()); + if (VPlans.empty()) + return VectorizationFactor::Disabled(); InstructionCost ExpectedCost = VPlans[0].second[0].first.ScalarCost; const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, @@ -5508,10 +5509,6 @@ // Filter out unprofitable VFs. for (auto &[Plan, VFsAndCosts] : VPlans) { - erase_if(VFsAndCosts, [this, &ChosenFactor](const auto &Candidate) { - return !isMoreProfitable(Candidate.first, ChosenFactor); - }); - for (const auto &[Candidate, HasVec] : VFsAndCosts) { if (Candidate.Width.isScalar()) continue; @@ -5602,11 +5599,12 @@ return Result; } - if (!CM.isScalarEpilogueAllowed()) { - LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " - "epilogue is allowed.\n"); - return Result; - } + // TODO: check whether scalar epilogue is allowed. + /* if (!CM.isScalarEpilogueAllowed()) {*/ + /*LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no "*/ + /*"epilogue is allowed.\n");*/ + /*return Result;*/ + /*}*/ // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. @@ -7511,7 +7509,7 @@ VF = ElementCount::getFixed(determineVPlanVF( TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), - CM)); + *CMs[0])); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7526,7 +7524,7 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); + buildVPlans(VF, VF, *CMs[0]); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) @@ -7543,13 +7541,39 @@ std::optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + for (LoopVectorizationCostModel *CM : CMs) + plan(UserVF, UserIC, *CM); + + if (VPlans.empty()) + return std::nullopt; + + if (UserVF && hasPlanWithVF(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: Selecting User VF: " << UserVF << ".\n"); + return {{UserVF, 0, 0}}; + } + // Select the optimal vectorization factor. + VectorizationFactor VF = selectVectorizationFactor(); + assert((VF.Width.isScalar() || VF.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); + + if (!hasPlanWithVF(VF.Width)) { + LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width + << ".\n"); + return std::nullopt; + } + + return VF; +} + +void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + LoopVectorizationCostModel &CM) { assert(OrigLoop->isInnermost() && "Inner loop expected."); CM.collectValuesToIgnore(); CM.collectElementTypesForWidening(); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7576,15 +7600,13 @@ if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF, UserVF, CM); if (!hasPlanWithVF(UserVF)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF << ".\n"); - return std::nullopt; - } - - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + } else + LLVM_DEBUG(printPlans(dbgs())); + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7605,27 +7627,17 @@ // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF.isVector()) + if (VF.isVector()) { CM.collectInstsToScalarize(VF); + } } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF, CM); + buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF, + CM); LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) - return VectorizationFactor::Disabled(); - - // Select the optimal vectorization factor. - VectorizationFactor VF = selectVectorizationFactor(); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - if (!hasPlanWithVF(VF.Width)) { - LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width - << ".\n"); - return std::nullopt; - } - return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -8086,11 +8098,12 @@ /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { + ElementCount MaxVF, + LoopVectorizationCostModel &CM) { auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - VPlans.emplace_back(buildVPlan(SubRange), + VPlans.emplace_back(buildVPlan(SubRange, CM), SmallVector>()); VF = SubRange.End; } @@ -8696,8 +8709,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes( + ElementCount MinVF, ElementCount MaxVF, LoopVectorizationCostModel &CM) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Add assume instructions we need to drop to DeadInstructions, to prevent @@ -8714,7 +8727,7 @@ auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions); + auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions, CM); if (!Plan) { VF = SubRange.End; continue; @@ -8723,6 +8736,8 @@ for (ElementCount CostVF = VF; ElementCount::isKnownLT(CostVF, SubRange.End); CostVF *= 2) { auto [VecCost, IsVec] = CM.expectedCost(CostVF, &InvalidCosts); + if (!VecCost.isValid()) + continue; Costs.emplace_back(VectorizationFactor(CostVF, VecCost, ScalarCost), IsVec); #ifndef NDEBUG @@ -8880,7 +8895,8 @@ } std::optional LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + VFRange &Range, SmallPtrSetImpl &DeadInstructions, + LoopVectorizationCostModel &CM) { SmallPtrSet *, 1> InterleaveGroups; @@ -8914,7 +8930,7 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { + auto applyIG = [IG, &CM](ElementCount VF) -> bool { return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); @@ -9061,7 +9077,7 @@ // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + RecipeBuilder, Range.Start, CM); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. @@ -9121,7 +9137,8 @@ return std::make_optional(std::move(Plan)); } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range, + LoopVectorizationCostModel &CM) { // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -9163,7 +9180,7 @@ // and live-out recipes when folding the tail. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + ElementCount MinVF, LoopVectorizationCostModel &CM) { for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = @@ -9922,7 +9939,8 @@ // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, ArrayRef(&CM), IAI, PSE, + Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10262,7 +10280,8 @@ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, {&CM}, IAI, PSE, Hints, + ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth();