diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -25,6 +25,7 @@ #define LLVM_TRANSFORMS_VECTORIZE_LOOPVECTORIZATIONPLANNER_H #include "VPlan.h" +#include "llvm/ADT/SmallSet.h" #include "llvm/Support/InstructionCost.h" namespace llvm { @@ -217,6 +218,16 @@ } }; +/// ElementCountComparator creates a total ordering for ElementCount +/// for the purposes of using it in a set structure. +struct ElementCountComparator { + bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { + return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < + std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); + } +}; +using ElementCountSet = SmallSet; + /// A class that represents two vectorization factors (initialized with 0 by /// default). One for fixed-width vectorization and one for scalable /// vectorization. This can be used by the vectorizer to choose from a range of @@ -280,6 +291,9 @@ SmallVector VPlans; + /// Profitable vector factors. + SmallVector ProfitableVFs; + /// A builder used to construct the current plan. VPBuilder Builder; @@ -336,6 +350,12 @@ /// Check if the number of runtime checks exceeds the threshold. bool requiresTooManyRuntimeChecks() const; + /// \return The most profitable vectorization factor and the cost of that VF + /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if + /// epilogue vectorization is not supported for the loop. + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF); + protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -370,6 +390,20 @@ void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in \p CandidateVFs. + VectorizationFactor + selectVectorizationFactor(const ElementCountSet &CandidateVFs); + + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + + /// Determines if we have the infrastructure to vectorize the loop and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const ElementCount VF) const; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1160,16 +1160,6 @@ CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); - } -}; -using ElementCountSet = SmallSet; - using InstructionVFPair = std::pair; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -1203,18 +1193,6 @@ /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - /// \return The most profitable vectorization factor and the cost of that VF - /// for vectorizing the epilogue. Returns VectorizationFactor::Disabled if - /// epilogue vectorization is not supported for the loop. - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1632,11 +1610,6 @@ Function **Variant, bool *NeedsMask = nullptr) const; - /// Returns true if the per-lane cost of VectorizationFactor A is lower than - /// that of B. - bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B) const; - /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { WideningDecisions.clear(); @@ -1644,10 +1617,29 @@ Scalars.clear(); } - /// Convenience function that returns the value of vscale_range iff - /// vscale_range.min == vscale_range.max or otherwise returns the value - /// returned by the corresponding TLI method. - std::optional getVScaleForTuning() const; + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + + bool hasPredStores() const { return NumPredStores > 0; } + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; private: unsigned NumPredStores = 0; @@ -1674,23 +1666,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1853,15 +1828,6 @@ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize the loop and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1907,9 +1873,6 @@ /// All element types found in the loop. SmallPtrSet ElementTypesInLoop; - - /// Profitable vector factors. - SmallVector ProfitableVFs; }; } // end namespace llvm @@ -5333,9 +5296,13 @@ return MaxVF; } -std::optional LoopVectorizationCostModel::getVScaleForTuning() const { - if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { - auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); +/// Convenience function that returns the value of vscale_range iff +/// vscale_range.min == vscale_range.max or otherwise returns the value +/// returned by the corresponding TLI method. +static std::optional +getVScaleForTuning(const TargetTransformInfo &TTI, const Function *Fn) { + if (Fn->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = Fn->getFnAttribute(Attribute::VScaleRange); auto Min = Attr.getVScaleRangeMin(); auto Max = Attr.getVScaleRangeMax(); if (Max && Min == Max) @@ -5345,12 +5312,12 @@ return TTI.getVScaleForTuning(); } -bool LoopVectorizationCostModel::isMoreProfitable( +bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If the trip count is a known (possibly small) constant, the trip count @@ -5364,9 +5331,9 @@ auto GetCostForTC = [MaxTripCount, this](unsigned VF, InstructionCost VectorCost, InstructionCost ScalarCost) { - return foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) - : VectorCost * (MaxTripCount / VF) + - ScalarCost * (MaxTripCount % VF); + return CM.foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); }; auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); @@ -5377,7 +5344,8 @@ // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional VScale = getVScaleForTuning()) { + if (std::optional VScale = + getVScaleForTuning(*TTI, OrigLoop->getHeader()->getParent())) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) @@ -5460,9 +5428,10 @@ } while (!Tail.empty()); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + CM.expectedCost(ElementCount::getFixed(1)).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); assert(VFCandidates.count(ElementCount::getFixed(1)) && @@ -5472,7 +5441,7 @@ ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; - bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (ForceVectorization && VFCandidates.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost @@ -5486,12 +5455,14 @@ if (i.isScalar()) continue; - VectorizationCostTy C = expectedCost(i, &InvalidCosts); + LoopVectorizationCostModel::VectorizationCostTy C = + CM.expectedCost(i, &InvalidCosts); VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; - if (std::optional VScale = getVScaleForTuning()) + if (std::optional VScale = + getVScaleForTuning(*TTI, OrigLoop->getHeader()->getParent())) AssumedMinimumVscale = *VScale; unsigned Width = Candidate.Width.isScalable() @@ -5520,12 +5491,13 @@ ChosenFactor = Candidate; } - emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop); + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", + if (!EnableCondStoresVectorization && CM.hasPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); + "ConditionalStore", ORE, OrigLoop); ChosenFactor = ScalarCost; } @@ -5537,11 +5509,11 @@ return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. - if (any_of(TheLoop->getHeader()->phis(), + if (any_of(OrigLoop->getHeader()->phis(), [&](PHINode &Phi) { return Legal->isFixedOrderRecurrence(&Phi); })) return false; @@ -5550,20 +5522,20 @@ for (const auto &Entry : Legal->getInductionVars()) { // Look for uses of the value of the induction at the last iteration. Value *PostInc = - Entry.first->getIncomingValueForBlock(TheLoop->getLoopLatch()); + Entry.first->getIncomingValueForBlock(OrigLoop->getLoopLatch()); for (User *U : PostInc->users()) - if (!TheLoop->contains(cast(U))) + if (!OrigLoop->contains(cast(U))) return false; // Look for uses of penultimate value of the induction. for (User *U : Entry.first->users()) - if (!TheLoop->contains(cast(U))) + if (!OrigLoop->contains(cast(U))) return false; } // Epilogue vectorization code has not been auditted to ensure it handles // non-latch exits properly. It may be fine, but it needs auditted and // tested. - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) + if (OrigLoop->getExitingBlock() != OrigLoop->getLoopLatch()) return false; return true; @@ -5587,22 +5559,21 @@ unsigned Multiplier = 1; if (VF.isScalable()) - Multiplier = getVScaleForTuning().value_or(1); + Multiplier = getVScaleForTuning(TTI, TheFunction).value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { + if (!CM.isScalarEpilogueAllowed()) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " "epilogue is allowed.\n"); return Result; @@ -5619,7 +5590,7 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) + if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " @@ -5628,14 +5599,14 @@ } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!CM.isEpilogueVectorizationProfitable(MainLoopVF)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5647,7 +5618,8 @@ ElementCount EstimatedRuntimeVF = MainLoopVF; if (MainLoopVF.isScalable()) { EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional VScale = getVScaleForTuning()) + if (std::optional VScale = + getVScaleForTuning(*TTI, OrigLoop->getHeader()->getParent())) EstimatedRuntimeVF *= *VScale; } @@ -5656,7 +5628,7 @@ ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + hasPlanWithVF(NextVF.Width)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -7617,7 +7589,7 @@ return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = selectVectorizationFactor(VFCandidates); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); if (!hasPlanWithVF(VF.Width)) { LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width @@ -10254,8 +10226,9 @@ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, - *PSE.getSE())) { + !areRuntimeChecksProfitable( + Checks, VF, getVScaleForTuning(*TTI, L->getHeader()->getParent()), + L, *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( DEBUG_TYPE, "CantReorderMemOps", L->getStartLoc(), @@ -10376,7 +10349,7 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + LVP.selectEpilogueVectorizationFactor(VF.Width); if (EpilogueVF.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue