diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -267,7 +267,7 @@ LoopVectorizationLegality *Legal; /// The profitability analysis. - LoopVectorizationCostModel &CM; + SmallVector CMs; /// The interleaved access analysis. InterleavedAccessInfo &IAI; @@ -292,16 +292,18 @@ LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, + ArrayRef CMs, InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), - PSE(PSE), Hints(Hints), ORE(ORE) {} + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CMs(CMs), + IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. + void plan(ElementCount UserVF, unsigned UserIC, + LoopVectorizationCostModel &CM); std::optional plan(ElementCount UserVF, unsigned UserIC); /// Use the VPlan-native path to plan how to best vectorize, return the best @@ -331,7 +333,12 @@ VPlans, [&](const std::pair>> - &Plan) { return Plan.first->hasVF(VF); }); + &Plan) { + return any_of(Plan.second, + [VF](const std::pair &P) { + return P.first.Width == VF; + }); + }); } /// Test a \p Predicate on a \p Range of VF's. Return the value of applying @@ -357,24 +364,27 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(ElementCount MinVF, ElementCount MaxVF, + LoopVectorizationCostModel &CM); private: /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(VFRange &Range, LoopVectorizationCostModel &CM); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. VPlanPtr buildVPlanWithVPRecipes(VFRange &Range, - SmallPtrSetImpl &DeadInstructions); + SmallPtrSetImpl &DeadInstructions, + LoopVectorizationCostModel &CM); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF, + LoopVectorizationCostModel &CM); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be @@ -383,7 +393,8 @@ // between the phi and live-out recipes when folding the tail. void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF); + ElementCount MinVF, + LoopVectorizationCostModel &CM); std::optional getVScaleForTuning() const; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5398,8 +5398,8 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && - CM.foldTailByMasking() && MaxTripCount) { + // TODO: Missing check for CM.foldTailByMasking here. + if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If we are folding the tail and the trip count is a known (possibly small) // constant, the trip count will be rounded up to an integer number of // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), @@ -5447,7 +5447,8 @@ return TTI->getVScaleForTuning(); } VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { - assert(!VPlans.empty()); + if (VPlans.empty()) + return VectorizationFactor::Disabled(); InstructionCost ExpectedCost = VPlans[0].second[0].first.ScalarCost; const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, @@ -5466,10 +5467,6 @@ // Filter out unprofitable VFs. for (auto &[Plan, VFsAndCosts] : VPlans) { - erase_if(VFsAndCosts, [this, &ChosenFactor](const auto &Candidate) { - return !isMoreProfitable(Candidate.first, ChosenFactor); - }); - for (const auto &[Candidate, HasVec] : VFsAndCosts) { if (Candidate.Width.isScalar()) continue; @@ -5558,12 +5555,13 @@ return Result; } - if (!CM.isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); - return Result; - } + // TODO: check whether scalar epilogue is allowed. + /* if (!CM.isScalarEpilogueAllowed()) {*/ + /*LLVM_DEBUG(*/ + /*dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is "*/ + /*"allowed.\n";);*/ + /*return Result;*/ + /*}*/ // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. @@ -7470,7 +7468,7 @@ VF = ElementCount::getFixed(determineVPlanVF( TTI->getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector) .getFixedValue(), - CM)); + *CMs[0])); LLVM_DEBUG(dbgs() << "LV: VPlan computed VF " << VF << ".\n"); // Make sure we have a VF > 1 for stress testing. @@ -7485,7 +7483,7 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? "user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); + buildVPlans(VF, VF, *CMs[0]); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) @@ -7502,13 +7500,32 @@ std::optional LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { + for (LoopVectorizationCostModel *CM : CMs) + plan(UserVF, UserIC, *CM); + + if (VPlans.empty()) + return std::nullopt; + + if (UserVF && hasPlanWithVF(UserVF)) { + LLVM_DEBUG(dbgs() << "LV: Selecting User VF: " << UserVF << ".\n"); + return {{UserVF, 0, 0}}; + } + // Select the optimal vectorization factor. + VectorizationFactor VF = selectVectorizationFactor(); + assert((VF.Width.isScalar() || VF.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); + return VF; +} + +void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC, + LoopVectorizationCostModel &CM) { assert(OrigLoop->isInnermost() && "Inner loop expected."); CM.collectValuesToIgnore(); CM.collectElementTypesForWidening(); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7535,9 +7552,9 @@ if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(UserVF, UserVF, CM); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7558,81 +7575,17 @@ // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF.isVector()) + if (VF.isVector()) { CM.collectInstsToScalarize(VF); + } } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF, CM); + buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF, + CM); LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) - return VectorizationFactor::Disabled(); - - // Emit a report of VFs with invalid costs in the loop. - if (!InvalidCosts.empty()) { - // Group the remarks per instruction, keeping the instruction order from - // InvalidCosts. - std::map Numbering; - unsigned I = 0; - for (auto &Pair : InvalidCosts) - if (!Numbering.count(Pair.first)) - Numbering[Pair.first] = I++; - - // Sort the list, first on instruction(number) then on VF. - llvm::sort(InvalidCosts, - [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { - if (Numbering[A.first] != Numbering[B.first]) - return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); - }); - - // For a list of ordered instruction-vf pairs: - // [(load, vf1), (load, vf2), (store, vf1)] - // Group the instructions together to emit separate remarks for: - // load (vf1, vf2) - // store (vf1) - auto Tail = ArrayRef(InvalidCosts); - auto Subset = ArrayRef(); - do { - if (Subset.empty()) - Subset = Tail.take_front(1); - - Instruction *I = Subset.front().first; - - // If the next instruction is different, or if there are no other pairs, - // emit a remark for the collated subset. e.g. - // [(load, vf1), (load, vf2))] - // to emit: - // remark: invalid costs for 'load' at VF=(vf, vf2) - if (Subset == Tail || Tail[Subset.size()].first != I) { - std::string OutString; - raw_string_ostream OS(OutString); - assert(!Subset.empty() && "Unexpected empty range"); - OS << "Instruction with invalid costs prevented vectorization at VF=("; - for (const auto &Pair : Subset) - OS << (Pair.second == Subset.front().second ? "" : ", ") - << Pair.second; - OS << "):"; - if (auto *CI = dyn_cast(I)) - OS << " call to " << CI->getCalledFunction()->getName(); - else - OS << " " << I->getOpcodeName(); - OS.flush(); - reportVectorizationInfo(OutString, "InvalidCost", ORE, OrigLoop, I); - Tail = Tail.drop_front(Subset.size()); - Subset = {}; - } else - // Grow the subset by one element - Subset = Tail.take_front(Subset.size() + 1); - } while (!Tail.empty()); - } - // Select the optimal vectorization factor. - VectorizationFactor VF = selectVectorizationFactor(); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { @@ -8094,11 +8047,12 @@ /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, - ElementCount MaxVF) { + ElementCount MaxVF, + LoopVectorizationCostModel &CM) { auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.emplace_back(buildVPlan(SubRange), + VPlans.emplace_back(buildVPlan(SubRange, CM), SmallVector>()); VF = SubRange.End; } @@ -8709,8 +8663,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes( + ElementCount MinVF, ElementCount MaxVF, LoopVectorizationCostModel &CM) { assert(OrigLoop->isInnermost() && "Inner loop expected."); // Add assume instructions we need to drop to DeadInstructions, to prevent @@ -8727,11 +8681,13 @@ auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - auto Plan = buildVPlanWithVPRecipes(SubRange, DeadInstructions); + auto Plan = buildVPlanWithVPRecipes(SubRange, DeadInstructions, CM); SmallVector> Costs; for (ElementCount CostVF = VF; ElementCount::isKnownLT(CostVF, SubRange.End); CostVF *= 2) { auto [VecCost, IsVec] = CM.expectedCost(CostVF, &InvalidCosts); + if (!VecCost.isValid()) + continue; Costs.emplace_back(VectorizationFactor(CostVF, VecCost, ScalarCost), IsVec); #ifndef NDEBUG @@ -8889,7 +8845,8 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + VFRange &Range, SmallPtrSetImpl &DeadInstructions, + LoopVectorizationCostModel &CM) { SmallPtrSet *, 1> InterleaveGroups; @@ -8923,7 +8880,7 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { + auto applyIG = [IG, &CM](ElementCount VF) -> bool { return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); @@ -9066,7 +9023,7 @@ // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + RecipeBuilder, Range.Start, CM); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. @@ -9124,7 +9081,8 @@ return Plan; } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range, + LoopVectorizationCostModel &CM) { // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -9167,7 +9125,7 @@ // and live-out recipes when folding the tail. void LoopVectorizationPlanner::adjustRecipesForReductions( VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + ElementCount MinVF, LoopVectorizationCostModel &CM) { for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = @@ -9927,7 +9885,8 @@ // Use the planner for outer loop vectorization. // TODO: CM is not used at this point inside the planner. Turn CM into an // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, ArrayRef(&CM), IAI, PSE, + Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -10267,7 +10226,8 @@ LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, {&CM}, IAI, PSE, Hints, + ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth();