diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -278,11 +278,14 @@ OptimizationRemarkEmitter *ORE; - SmallVector VPlans; + SmallVector>, 4> VPlans; /// A builder used to construct the current plan. VPBuilder Builder; + using InstructionVFPair = std::pair; + SmallVector InvalidCosts; + public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, @@ -322,8 +325,11 @@ /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. bool hasPlanWithVF(ElementCount VF) const { - return any_of(VPlans, - [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); + return any_of( + VPlans, + [&](const std::pair> &Plan) { + return Plan.first->hasVF(VF); + }); } /// Test a \p Predicate on a \p Range of VF's. Return the value of applying @@ -336,6 +342,15 @@ /// Check if the number of runtime checks exceeds the threshold. bool requiresTooManyRuntimeChecks() const; + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO + /// then this vectorization factor will be selected if vectorization is + /// possible. + VectorizationFactor selectVectorizationFactor(); + + VectorizationFactor + selectEpilogueVectorizationFactor(const ElementCount MaxVF); + protected: /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is @@ -367,6 +382,20 @@ void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + std::optional getVScaleForTuning() const; + + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; }; } // namespace llvm diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -1186,17 +1186,6 @@ /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1612,6 +1601,19 @@ /// returned by the corresponding TLI method. std::optional getVScaleForTuning() const; + using VectorizationCostTy = std::pair; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + using InstructionVFPair = std::pair; + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + private: unsigned NumPredStores = 0; @@ -1642,17 +1644,6 @@ /// operate on vector values after type legalization in the backend. If this /// latter value is false, then all operations will be scalarized (i.e. no /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. @@ -1817,16 +1808,6 @@ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -5402,35 +5383,87 @@ } while (!Tail.empty()); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( - const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; - LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); - assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(VFCandidates.count(ElementCount::getFixed(1)) && - "Expected Scalar VF to be a candidate"); +bool LoopVectorizationPlanner::isMoreProfitable( + const VectorizationFactor &A, const VectorizationFactor &B) const { + InstructionCost CostA = A.Cost; + InstructionCost CostB = B.Cost; + + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); + + if (!A.Width.isScalable() && !B.Width.isScalable() && + CM.foldTailByMasking() && MaxTripCount) { + // If we are folding the tail and the trip count is a known (possibly small) + // constant, the trip count will be rounded up to an integer number of + // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), + // which we compare directly. When not folding the tail, the total cost will + // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is + // approximated with the per-lane cost below instead of using the tripcount + // as here. + auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); + auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + return RTCostA < RTCostB; + } + + // Improve estimate for the vector width if it is scalable. + unsigned EstimatedWidthA = A.Width.getKnownMinValue(); + unsigned EstimatedWidthB = B.Width.getKnownMinValue(); + if (std::optional VScale = getVScaleForTuning()) { + if (A.Width.isScalable()) + EstimatedWidthA *= *VScale; + if (B.Width.isScalable()) + EstimatedWidthB *= *VScale; + } + + // Assume vscale may be larger than 1 (or the value being tuned for), + // so that scalable vectorization is slightly favorable over fixed-width + // vectorization. + if (A.Width.isScalable() && !B.Width.isScalable()) + return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + + // To avoid the need for FP division: + // (CostA / A.Width) < (CostB / B.Width) + // <=> (CostA * B.Width) < (CostB * A.Width) + return (CostA * EstimatedWidthB) < (CostB * EstimatedWidthA); +} +std::optional LoopVectorizationPlanner::getVScaleForTuning() const { + Function *TheFunction = OrigLoop->getHeader()->getParent(); + if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) { + auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange); + auto Min = Attr.getVScaleRangeMin(); + auto Max = Attr.getVScaleRangeMax(); + if (Max && Min == Max) + return Max; + } + + return TTI->getVScaleForTuning(); +} +VectorizationFactor LoopVectorizationPlanner::selectVectorizationFactor() { + assert(!VPlans.empty()); + + InstructionCost ExpectedCost = VPlans[0].second[0].ScalarCost; const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; + LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); + assert(hasPlanWithVF(ElementCount::getFixed(1)) && + "Expected Scalar VF to be a candidate"); - bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; - if (ForceVectorization && VFCandidates.size() > 1) { + bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; + if (ForceVectorization && VPlans.size() > 1) { // Ignore scalar width, because the user explicitly wants vectorization. // Initialize cost to max so that VF = 2 is, at least, chosen during cost // evaluation. ChosenFactor.Cost = InstructionCost::getMax(); } - SmallVector InvalidCosts; - for (const auto &i : VFCandidates) { - // The cost for scalar VF=1 is already calculated, so ignore it. - if (i.isScalar()) - continue; - - VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); + // Filter out unprofitable VFs. + for (auto &[_, VFsAndCosts] : VPlans) { + erase_if(VFsAndCosts, [this, &ScalarCost](const auto &Candidate) { + return !isMoreProfitable(Candidate, ScalarCost); + }); + for (const auto &Candidate : VFsAndCosts) { #ifndef NDEBUG unsigned AssumedMinimumVscale = 1; if (std::optional VScale = getVScaleForTuning()) @@ -5439,38 +5472,35 @@ Candidate.Width.isScalable() ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i + LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << Candidate.Width << " costs: " << (Candidate.Cost / Width)); - if (i.isScalable()) + if (Candidate.Width.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); LLVM_DEBUG(dbgs() << ".\n"); #endif - if (!C.second && !ForceVectorization) { - LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i - << " because it will not generate any vector instructions.\n"); - continue; - } - - // If profitable add it to ProfitableVF list. - if (isMoreProfitable(Candidate, ScalarCost)) - ProfitableVFs.push_back(Candidate); + /* if (!C.second && !ForceVectorization) {*/ + /*LLVM_DEBUG(*/ + /*dbgs() << "LV: Not considering vector loop of width " << i*/ + /*<< " because it will not generate any vector instructions.\n");*/ + /*continue;*/ + /*}*/ if (isMoreProfitable(Candidate, ChosenFactor)) ChosenFactor = Candidate; + } } - emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop); + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure( - "There are conditional stores.", - "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); - ChosenFactor = ScalarCost; - } + /* if (!EnableCondStoresVectorization && NumPredStores) {*/ + /*reportVectorizationFailure(*/ + /*"There are conditional stores.",*/ + /*"store that is conditionally executed prevents vectorization",*/ + /*"ConditionalStore", ORE, TheLoop);*/ + /*ChosenFactor = ScalarCost;*/ + /*}*/ LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() @@ -5480,7 +5510,7 @@ return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. @@ -5511,7 +5541,7 @@ return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationPlanner::isEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5519,12 +5549,12 @@ // with vectorization factors larger than a certain value. // Allow the target to opt out entirely. - if (!TTI.preferEpilogueVectorization()) + if (!TTI->preferEpilogueVectorization()) return false; // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) + if (TTI->getMaxInterleaveFactor(VF.getKnownMinValue()) <= 1) return false; // FIXME: We should consider changing the threshold for scalable // vectors to take VScaleForTuning into account. @@ -5533,16 +5563,15 @@ return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const ElementCount MainLoopVF) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); return Result; } - if (!isScalarEpilogueAllowed()) { + if (!CM.isScalarEpilogueAllowed()) { LLVM_DEBUG( dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " "allowed.\n";); @@ -5551,7 +5580,7 @@ // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + if (!isCandidateForEpilogueVectorization(*OrigLoop, MainLoopVF)) { LLVM_DEBUG( dbgs() << "LEV: Unable to vectorize epilogue because the loop is " "not a supported candidate.\n";); @@ -5561,7 +5590,7 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) + if (hasPlanWithVF(ForcedEC)) return {ForcedEC, 0, 0}; else { LLVM_DEBUG( @@ -5571,8 +5600,8 @@ } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n";); @@ -5595,13 +5624,15 @@ EstimatedRuntimeVF *= *VScale; } - for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) - Result = NextVF; + for (auto &[_, VFsAndCosts] : VPlans) { + for (const auto &NextVF : VFsAndCosts) { + if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && + (Result.Width.isScalar() || isMoreProfitable(NextVF, Result))) + Result = NextVF; + } + } if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " @@ -7582,18 +7613,19 @@ return VectorizationFactor::Disabled(); // Select the optimal vectorization factor. - VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); + VectorizationFactor VF = selectVectorizationFactor(); assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); return VF; } VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { - assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && - "Best VF has not a single VPlan."); + assert( + count_if(VPlans, + [VF](const std::pair> + &Plan) { return Plan.first->hasVF(VF); }) == 1 && + "Best VF has not a single VPlan."); - for (const VPlanPtr &Plan : VPlans) { + for (const auto &[Plan, _] : VPlans) { if (Plan->hasVF(VF)) return *Plan.get(); } @@ -7737,7 +7769,7 @@ #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void LoopVectorizationPlanner::printPlans(raw_ostream &O) { - for (const auto &Plan : VPlans) + for (const auto &[Plan, _] : VPlans) if (PrintVPlansInDotFormat) Plan->printDOT(O); else @@ -8048,7 +8080,8 @@ auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back(buildVPlan(SubRange)); + VPlans.emplace_back(buildVPlan(SubRange), + SmallVector()); VF = SubRange.End; } } @@ -8697,10 +8730,18 @@ auto &ConditionalAssumes = Legal->getConditionalAssumes(); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); + InstructionCost ScalarCost = CM.expectedCost(ElementCount::getFixed(1)).first; auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions)); + auto Plan = buildVPlanWithVPRecipes(SubRange, DeadInstructions); + SmallVector Costs; + for (ElementCount CostVF = VF; + ElementCount::isKnownLT(CostVF, SubRange.End); CostVF *= 2) { + InstructionCost VecCost = CM.expectedCost(CostVF, &InvalidCosts).first; + Costs.emplace_back(CostVF, VecCost, ScalarCost); + } + VPlans.emplace_back(std::move(Plan), Costs); VF = SubRange.End; } } @@ -10353,7 +10394,7 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + LVP.selectEpilogueVectorizationFactor(VF.Width); if (EpilogueVF.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue