diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -267,7 +267,7 @@ LoopVectorizationLegality *Legal; /// The profitability analysis. - LoopVectorizationCostModel &CM; + SmallVector CMs; /// The interleaved access analysis. InterleavedAccessInfo &IAI; @@ -292,16 +292,18 @@ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, + ArrayRef CMs, InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), ORE(ORE) {} + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CMs(CMs), + IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. + void plan(ElementCount UserVF, unsigned UserIC, + LoopVectorizationCostModel &CM); std::optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best @@ -331,7 +333,12 @@ VPlans, [&](const std::pair>> - &Plan) { return Plan.first->hasVF(VF); }); + &Plan) { + return any_of(Plan.second, + [VF](const std::pair &P) { + return P.first.Width == VF; + }); + }); } /// Test a \p Predicate on a \p Range of VF's. Return the value of applying @@ -357,24 +364,27 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF, + LoopVectorizationCostModel &CM); private: /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(VFRange &Range, LoopVectorizationCostModel &CM); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. VPlanPtr buildVPlanWithVPRecipes(VFRange &Range, - SmallPtrSetImpl &DeadInstructions); + SmallPtrSetImpl &DeadInstructions, + LoopVectorizationCostModel &CM); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF, + LoopVectorizationCostModel &CM); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be @@ -383,7 +393,8 @@ // between the phi and live-out recipes when folding the tail. void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF); + ElementCount MinVF, + LoopVectorizationCostModel &CM); std::optional getVScaleForTuning() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5394,8 +5394,8 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && - CM.foldTailByMasking() && MaxTripCount) { + // TODO: Missing check for CM.foldTailByMasking here. + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If we are folding the tail and the trip count is a known (possibly small) // constant, the trip count will be rounded up to an integer number of // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), @@ -5443,7 +5443,8 @@ return TTI->getVScaleForTuning(); } VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { - assert(!VPlans.empty()); + if (VPlans.empty()) + return VectorizationFactor::Disabled(); InstructionCost ExpectedCost = VPlans[0].second[0].first.ScalarCost; const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, @@ -5462,10 +5463,6 @@ // Filter out unprofitable VFs. for (auto &[Plan, VFsAndCosts] : VPlans) { - erase_if(VFsAndCosts, [this, &ChosenFactor](const auto &Candidate) { - return !isMoreProfitable(Candidate.first, ChosenFactor); - }); - for (const auto &[Candidate, HasVec] : VFsAndCosts) { if (Candidate.Width.isScalar()) continue; @@ -5554,12 +5551,13 @@ return Result; } - if (!CM.isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); - return Result; - } + // TODO: check whether scalar epilogue is allowed. + /* if (!CM.isScalarEpilogueAllowed()) {*/ + /*LLVM_DEBUG(*/ + /*dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "*/ + /*"allowed.\n";);*/ + /*return Result;*/ + /*}*/ // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. @@ -7466,7 +7464,7 @@ VF = ElementCount::getFixed(determineVPlanVF( TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), - CM)); + *CMs[0])); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7481,7 +7479,7 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); + buildVPlans(VF, VF, *CMs[0]); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) @@ -7498,13 +7496,32 @@ std::optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + for (LoopVectorizationCostModel *CM : CMs) + plan(UserVF, UserIC, *CM); + + if (VPlans.empty()) + return std::nullopt; + + if (UserVF && hasPlanWithVF(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: Selecting User VF: " << UserVF << ".\n"); + return {{UserVF, 0, 0}}; + } + // Select the optimal vectorization factor. + VectorizationFactor VF = selectVectorizationFactor(); + assert((VF.Width.isScalar() || VF.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); + return VF; +} + +void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + LoopVectorizationCostModel &CM) { assert(OrigLoop->isInnermost() && "Inner loop expected."); CM.collectValuesToIgnore(); CM.collectElementTypesForWidening(); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7531,9 +7548,9 @@ if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF, UserVF, CM); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7554,22 +7571,17 @@ // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF.isVector()) + if (VF.isVector()) { CM.collectInstsToScalarize(VF); + } } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF, CM); + buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF, + CM); LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) - return VectorizationFactor::Disabled(); - - // Select the optimal vectorization factor. - VectorizationFactor VF = selectVectorizationFactor(); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -8030,11 +8042,12 @@ /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { + ElementCount MaxVF, + LoopVectorizationCostModel &CM) { auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.emplace_back(buildVPlan(SubRange), + VPlans.emplace_back(buildVPlan(SubRange, CM), SmallVector>()); VF = SubRange.End; } @@ -8645,8 +8658,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes( + ElementCount MinVF, ElementCount MaxVF, LoopVectorizationCostModel &CM) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Add assume instructions we need to drop to DeadInstructions, to prevent @@ -8663,11 +8676,13 @@ auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - auto Plan = buildVPlanWithVPRecipes(SubRange, DeadInstructions); + auto Plan = buildVPlanWithVPRecipes(SubRange, DeadInstructions, CM); SmallVector> Costs; for (ElementCount CostVF = VF; ElementCount::isKnownLT(CostVF, SubRange.End); CostVF *= 2) { auto [VecCost, IsVec] = CM.expectedCost(CostVF, &InvalidCosts); + if (!VecCost.isValid()) + continue; Costs.emplace_back(VectorizationFactor(CostVF, VecCost, ScalarCost), IsVec); #ifndef NDEBUG @@ -8832,7 +8847,8 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + VFRange &Range, SmallPtrSetImpl &DeadInstructions, + LoopVectorizationCostModel &CM) { SmallPtrSet *, 1> InterleaveGroups; @@ -8866,7 +8882,7 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { + auto applyIG = [IG, &CM](ElementCount VF) -> bool { return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); @@ -8991,7 +9007,7 @@ // After here, VPBB should not be used. VPBB = nullptr; - auto RequiresScalarEpilogue = [this](ElementCount VF) { + auto RequiresScalarEpilogue = [CM](ElementCount VF) { return CM.requiresScalarEpilogue(VF); }; assert( @@ -9017,7 +9033,7 @@ // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + RecipeBuilder, Range.Start, CM); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. @@ -9076,7 +9092,8 @@ return Plan; } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range, + LoopVectorizationCostModel &CM) { // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -9119,7 +9136,7 @@ // and live-out recipes when folding the tail. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + ElementCount MinVF, LoopVectorizationCostModel &CM) { for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = @@ -9880,7 +9897,8 @@ // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, ArrayRef(&CM), IAI, PSE, + Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10220,7 +10238,8 @@ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, {&CM}, IAI, PSE, Hints, + ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth();