Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -375,8 +375,9 @@ /// Returns true if vector representation of the instruction \p I /// requires mask. - bool isMaskRequired(const Instruction *I) const { - return MaskedOp.contains(I); + bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const { + return MaskedOp.contains(I) || + (FoldTailByMasking && FoldTailMaskedOp.contains(I)); } unsigned getNumStores() const { return LAI->getNumStores(); } @@ -384,8 +385,9 @@ /// Returns all assume calls in predicated blocks. They need to be dropped /// when flattening the CFG. - const SmallPtrSetImpl &getConditionalAssumes() const { - return ConditionalAssumes; + const SmallPtrSetImpl & + getConditionalAssumes(bool FoldTailByMasking) const { + return FoldTailByMasking ? FoldTailConditionalAssumes : ConditionalAssumes; } PredicatedScalarEvolution *getPredicatedScalarEvolution() const { @@ -545,6 +547,11 @@ /// flattened. SmallPtrSet ConditionalAssumes; + /// Same as MaskedOp above when folding tail by masking. + SmallPtrSet FoldTailMaskedOp; + /// Same as ConditionalAssumes above when folding tail by masking. + SmallPtrSet FoldTailConditionalAssumes; + /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1430,14 +1430,11 @@ // The list of pointers that we can safely read and write to remains empty. SmallPtrSet SafePointers; - SmallPtrSet TmpMaskedOp; - SmallPtrSet TmpConditionalAssumes; - // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, FoldTailMaskedOp, + FoldTailConditionalAssumes)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } @@ -1445,10 +1442,6 @@ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - return true; } Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -189,7 +189,10 @@ /// Vector width with best cost. ElementCount Width; - /// Cost of the loop with that width. + /// Whether the entire loop is predicated. + bool FoldTailByMasking; + + /// Cost of the loop with that width and vectorization style. InstructionCost Cost; /// Cost of the scalar loop. @@ -199,17 +202,19 @@ /// to runtime checks. 
ElementCount MinProfitableTripCount; - VectorizationFactor(ElementCount Width, InstructionCost Cost, - InstructionCost ScalarCost) - : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} + VectorizationFactor(ElementCount Width, bool FoldTailByMasking, + InstructionCost Cost, InstructionCost ScalarCost) + : Width(Width), FoldTailByMasking(FoldTailByMasking), Cost(Cost), + ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0, 0}; + return {ElementCount::getFixed(1), false, 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { - return Width == rhs.Width && Cost == rhs.Cost; + return Width == rhs.Width && FoldTailByMasking == rhs.FoldTailByMasking && + Cost == rhs.Cost; } bool operator!=(const VectorizationFactor &rhs) const { @@ -266,9 +271,6 @@ /// The legality analysis. LoopVectorizationLegality *Legal; - /// The profitability analysis. - LoopVectorizationCostModel &CM; - /// The interleaved access analysis. InterleavedAccessInfo &IAI; @@ -283,28 +285,40 @@ /// A builder used to construct the current plan. VPBuilder Builder; + /// Profitable vector factors. + SmallVector ProfitableVFs; + public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} - /// Plan how to best vectorize, return the best VF and its cost, or - /// std::nullopt if vectorization and interleaving should be avoided up front. - std::optional plan(ElementCount UserVF, unsigned UserIC); + /// Plan how to best vectorize with a given cost model. + void plan(LoopVectorizationCostModel &CM, ElementCount UserVF, + unsigned UserIC); + + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in the plans in \p VPlans. If UserVF is not + /// ZERO then this vectorization factor will be selected if vectorization is + /// possible. + std::optional selectVectorizationFactor(); + + VectorizationFactor + selectEpilogueVectorizationFactor(const VectorizationFactor &MainVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + VectorizationFactor planInVPlanNativePath(LoopVectorizationCostModel &CM, + ElementCount UserVF); /// Return the best VPlan for \p VF. - VPlan &getBestPlanFor(ElementCount VF) const; + VPlan &getBestPlanFor(ElementCount VF, bool FoldTailByMasking) const; /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. @@ -321,9 +335,10 @@ /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. - bool hasPlanWithVF(ElementCount VF) const { - return any_of(VPlans, - [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); + bool hasPlanWithVF(ElementCount VF, bool FoldTailByMasking) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return Plan->hasVF(VF) && Plan->foldTailByMasking() == FoldTailByMasking; + }); } /// Test a \p Predicate on a \p Range of VF's. 
Return the value of applying @@ -340,13 +355,14 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(LoopVectorizationCostModel &CM, ElementCount MinVF, + ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(LoopVectorizationCostModel &CM, VFRange &Range); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. @@ -355,21 +371,39 @@ /// set the largest included VF to the maximum VF for which no plan could be /// built. std::optional tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions); + LoopVectorizationCostModel &CM, VFRange &Range, + SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(LoopVectorizationCostModel &CM, + ElementCount MinVF, ElementCount MaxVF); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be // converted to reductions, with one operand being vector and the other being // the scalar reduction chain. For other reductions, a select is introduced // between the phi and live-out recipes when folding the tail. - void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, + void adjustRecipesForReductions(LoopVectorizationCostModel &CM, + VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; }; } // namespace llvm Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -234,7 +234,6 @@ "force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values( - clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN( TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), @@ -1170,18 +1169,18 @@ /// different operations. 
 class LoopVectorizationCostModel {
 public:
-  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
-                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
-                             LoopVectorizationLegality *Legal,
+  LoopVectorizationCostModel(bool FoldTailByMasking, ScalarEpilogueLowering SEL,
+                             Loop *L, PredicatedScalarEvolution &PSE,
+                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
                              InterleavedAccessInfo &IAI)
-      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
-        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+      : ScalarEpilogueStatus(SEL), FoldTailByMasking(FoldTailByMasking),
+        TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}

   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
   /// avoided up front.
@@ -1192,17 +1191,6 @@
   /// otherwise.
   bool runtimeChecksRequired();

-  /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
-  /// then this vectorization factor will be selected if vectorization is
-  /// possible.
-  VectorizationFactor
-  selectVectorizationFactor(const ElementCountSet &CandidateVFs);
-
-  VectorizationFactor
-  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
-                                    const LoopVectorizationPlanner &LVP);
-
   /// Setup cost-based decisions for user vectorization factor.
   /// \return true if the UserVF is a feasible VF to be chosen.
   bool selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1560,16 +1548,20 @@
     return IsRequired;
   }

-  /// Returns true if a scalar epilogue is not allowed due to optsize or a
-  /// loop hint annotation.
+  /// Returns false if a scalar epilogue is not allowed due to, for example,
+  /// optsize or tail folding. It is used either as a check for when
+  /// interleaving/epilogue vectorization can occur, or for checking cases
+  /// where an epilogue would be required for correctness.
   bool isScalarEpilogueAllowed() const {
-    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+           (!FoldTailByMasking &&
+            ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate);
   }

   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
-    if (!CanFoldTailByMasking)
+    if (!FoldTailByMasking)
       return TailFoldingStyle::None;

     if (ForceTailFoldingStyle.getNumOccurrences())
@@ -1579,9 +1571,7 @@
   }

   /// Returns true if all loop blocks should be masked to fold tail loop.
-  bool foldTailByMasking() const {
-    return getTailFoldingStyle() != TailFoldingStyle::None;
-  }
+  bool foldTailByMasking() const { return FoldTailByMasking; }

   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
@@ -1620,11 +1610,6 @@
                                Function **Variant,
                                bool *NeedsMask = nullptr) const;

-  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
-  /// that of B.
-  bool isMoreProfitable(const VectorizationFactor &A,
-                        const VectorizationFactor &B) const;
-
   /// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() { WideningDecisions.clear(); @@ -1632,6 +1617,26 @@ Scalars.clear(); } + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + + /// Return the NumPredStores, to be checked by the Planner. + unsigned getNumPredStores() { return NumPredStores; } + private: unsigned NumPredStores = 0; @@ -1657,23 +1662,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1745,7 +1733,7 @@ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool CanFoldTailByMasking = false; + bool FoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -1836,16 +1824,6 @@ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1891,9 +1869,6 @@ /// All element types found in the loop. SmallPtrSet ElementTypesInLoop; - - /// Profitable vector factors. 
- SmallVector ProfitableVFs; }; } // end namespace llvm @@ -3455,7 +3430,7 @@ Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; - bool MaskRequired = Legal->isMaskRequired(CI); + bool MaskRequired = Legal->isMaskRequired(foldTailByMasking(), CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); @@ -3941,7 +3916,7 @@ // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { + if (State.Plan->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; @@ -4471,7 +4446,7 @@ return false; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) + if (!Legal->isMaskRequired(foldTailByMasking(), I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we @@ -4498,13 +4473,13 @@ // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); case Instruction::Call: - return Legal->isMaskRequired(I); + return Legal->isMaskRequired(foldTailByMasking(), I); } } std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, - ElementCount VF) const { + ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || @@ -4610,7 +4585,7 @@ // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + Legal->isMaskRequired(foldTailByMasking(), I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -5110,12 +5085,22 @@ case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: - [[fallthrough]]; + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating " + "predicated vector loop.\n"); + // We cannot add a scalar tail, but fall through to the code below both with + // and without FoldTailByMasking. FoldTailByMasking=false will only be + // allowed if the trip count is known to be a multiple of the VF. Otherwise + // FoldTailByMasking=true plans will be used. + break; case CM_ScalarEpilogueNotNeededUsePredicate: - LLVM_DEBUG( - dbgs() << "LV: vector predicate hint/switch found.\n" - << "LV: Not allowing scalar epilogue, creating predicated " - << "vector loop.\n"); + // If this cost model is for predicated plans then fall through to the + // prepareToFoldTailByMasking checks below, else return the unpredicated max + // size. + if (!FoldTailByMasking) + return computeFeasibleMaxVF(TC, UserVF, false); + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Trying predicated vector loop.\n"); break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize @@ -5139,17 +5124,8 @@ // a bottom-test and a single exiting block. We'd have to handle the fact // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. 
(TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF, false); - } + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return FixedScalableVFPair::getNone(); - } // Now try the tail folding @@ -5194,25 +5170,22 @@ if (Rem->isZero()) { // Accept MaxFixedVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxFactors; + return FoldTailByMasking ? FixedScalableVFPair::getNone() : MaxFactors; } } + // If this cost model is not for tail folding then return at this point and + // leave it for the other model. + if (!FoldTailByMasking && + ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate) + return FixedScalableVFPair::getNone(); + // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - CanFoldTailByMasking = true; - return MaxFactors; - } - - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + assert(FoldTailByMasking); return MaxFactors; } @@ -5358,12 +5331,12 @@ return TTI.getVScaleForTuning(); } -bool LoopVectorizationCostModel::isMoreProfitable( +bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If the trip count is a known (possibly small) constant, the trip count @@ -5374,15 +5347,20 @@ // some extra overheads, but for the purpose of comparing the costs of // different VFs we can use this to compare the total loop-body cost // expected after vectorization. - auto GetCostForTC = [MaxTripCount, this](unsigned VF, - InstructionCost VectorCost, - InstructionCost ScalarCost) { - return foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) - : VectorCost * (MaxTripCount / VF) + - ScalarCost * (MaxTripCount % VF); + auto GetCostForTC = [MaxTripCount](bool FoldTailByMasking, unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + return FoldTailByMasking ? 
VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); }; - auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); - auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); + auto RTCostA = GetCostForTC(A.FoldTailByMasking, A.Width.getFixedValue(), + CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(B.FoldTailByMasking, B.Width.getFixedValue(), + CostB, B.ScalarCost); + + if (A.FoldTailByMasking && !B.FoldTailByMasking) + return RTCostA <= RTCostB; return RTCostA < RTCostB; } @@ -5390,13 +5368,19 @@ // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional VScale = getVScaleForTuning(TheFunction, TTI)) { + if (std::optional VScale = + getVScaleForTuning(OrigLoop->getHeader()->getParent(), *TTI)) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) EstimatedWidthB *= *VScale; } + // If one plan is predicated and the other is not, opt for the predicated + // scheme on a tie. + if (A.FoldTailByMasking && !B.FoldTailByMasking) + return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. @@ -5473,20 +5457,45 @@ } while (!Tail.empty()); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( - const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; +std::optional +LoopVectorizationPlanner::selectVectorizationFactor() { + LLVM_DEBUG(printPlans(dbgs())); + + // If we had no plans as they were all invalid, return the invalid cost + if (VPlans.size() == 0) + return std::nullopt; + + // If we only have one plan due to the UserVF, return it. We try with both + // predicated and unpredicated loops. 
+  ElementCount UserVF = Hints.getWidth();
+  bool UserPredicated = Hints.getPredicate();
+  if (UserVF && hasPlanWithVF(UserVF, UserPredicated)) {
+    VPlan &Plan = getBestPlanFor(UserVF, UserPredicated);
+    auto Cost = Plan.getCostModel()->expectedCost(UserVF);
+    if (Cost.first.isValid())
+      return VectorizationFactor(UserVF, UserPredicated, Cost.first, 0);
+  } else if (UserVF && hasPlanWithVF(UserVF, !UserPredicated)) {
+    VPlan &Plan = getBestPlanFor(UserVF, !UserPredicated);
+    auto Cost = Plan.getCostModel()->expectedCost(UserVF);
+    if (Cost.first.isValid())
+      return VectorizationFactor(UserVF, !UserPredicated, Cost.first, 0);
+  }
+
+  assert(VPlans[0]->hasScalarVFOnly() &&
+         "Expected Scalar VPlan to be the first candidate");
+
+  InstructionCost ExpectedCost =
+      VPlans[0]->getCostModel()->expectedCost(ElementCount::getFixed(1)).first;
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
-  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
-         "Expected Scalar VF to be a candidate");

-  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
-                                       ExpectedCost);
+  const VectorizationFactor ScalarCost(ElementCount::getFixed(1),
+                                       VPlans[0]->foldTailByMasking(),
+                                       ExpectedCost, ExpectedCost);
   VectorizationFactor ChosenFactor = ScalarCost;

-  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
-  if (ForceVectorization && VFCandidates.size() > 1) {
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization && VPlans.size() > 1) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
@@ -5494,63 +5503,80 @@
   }

   SmallVector<InstructionVFPair> InvalidCosts;
-  for (const auto &i : VFCandidates) {
-    // The cost for scalar VF=1 is already calculated, so ignore it.
-    if (i.isScalar())
-      continue;
+  for (const VPlanPtr &VPlan : drop_begin(VPlans)) {
+    for (const ElementCount &i : VPlan->getVFs()) {
+      // The cost for scalar VF=1 is already calculated, so ignore it.
+      if (i.isScalar())
+        continue;

-    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
-    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
+      LoopVectorizationCostModel::VectorizationCostTy C =
+          VPlan->getCostModel()->expectedCost(i, &InvalidCosts);
+      VectorizationFactor Candidate(i, VPlan->foldTailByMasking(), C.first,
+                                    ScalarCost.ScalarCost);

 #ifndef NDEBUG
-    unsigned AssumedMinimumVscale = 1;
-    if (std::optional<unsigned> VScale = getVScaleForTuning(TheFunction, TTI))
-      AssumedMinimumVscale = *VScale;
-    unsigned Width =
-        Candidate.Width.isScalable()
-            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
-            : Candidate.Width.getFixedValue();
-    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: "
-                      << Candidate.Cost << " => " << (Candidate.Cost / Width));
-    if (i.isScalable())
-      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
-                        << AssumedMinimumVscale << ")");
-    LLVM_DEBUG(dbgs() << ".\n");
-#endif
-
-    if (!C.second && !ForceVectorization) {
+      unsigned AssumedMinimumVscale = 1;
+      if (std::optional<unsigned> VScale =
+              getVScaleForTuning(OrigLoop->getHeader()->getParent(), *TTI))
+        AssumedMinimumVscale = *VScale;
+      unsigned Width =
+          Candidate.Width.isScalable()
+              ?
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Width.getFixedValue(); LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i - << " because it will not generate any vector instructions.\n"); - continue; - } + dbgs() << "LV: " << (VPlan->foldTailByMasking() ? "Tail folded " : "") + << "Vector loop of width " << i << " costs: " << Candidate.Cost + << " => " << (Candidate.Cost / Width)); + if (i.isScalable()) + LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " + << AssumedMinimumVscale << ")"); + LLVM_DEBUG(dbgs() << ".\n"); +#endif - // If profitable add it to ProfitableVF list. - if (isMoreProfitable(Candidate, ScalarCost)) - ProfitableVFs.push_back(Candidate); + if (!C.second && !ForceVectorization) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << i + << " because it will not generate any vector instructions.\n"); + continue; + } - if (isMoreProfitable(Candidate, ChosenFactor)) - ChosenFactor = Candidate; - } + // FIXME: Possibly remove EnableCondStoresVectorization now. + if (!EnableCondStoresVectorization && + VPlan->getCostModel()->getNumPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, OrigLoop); + continue; + } - emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop); + // If profitable add it to ProfitableVF list. + if (isMoreProfitable(Candidate, ScalarCost)) + ProfitableVFs.push_back(Candidate); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", - "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); - ChosenFactor = ScalarCost; + if (isMoreProfitable(Candidate, ChosenFactor)) + ChosenFactor = Candidate; + } } - LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); + + LLVM_DEBUG({ + if (ForceVectorization && !ChosenFactor.Width.isScalar() && + !isMoreProfitable(ChosenFactor, ScalarCost)) + dbgs() << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"; + }); + LLVM_DEBUG(dbgs() << "LV: Selecting " + << (ChosenFactor.FoldTailByMasking ? "Tail folded " : "") + << "VF: " << ChosenFactor.Width << ".\n"); + assert((ChosenFactor.Width.isScalar() || ChosenFactor.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. @@ -5581,7 +5607,7 @@ return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationPlanner::isEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5589,40 +5615,40 @@ // with vectorization factors larger than a certain value. // Allow the target to opt out entirely. 
- if (!TTI.preferEpilogueVectorization()) + if (!TTI->preferEpilogueVectorization()) return false; // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF) <= 1) + if (TTI->getMaxInterleaveFactor(VF) <= 1) return false; unsigned Multiplier = 1; if (VF.isScalable()) - Multiplier = getVScaleForTuning(TheFunction, TTI).value_or(1); + Multiplier = getVScaleForTuning(OrigLoop->getHeader()->getParent(), *TTI) + .value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const VectorizationFactor &MainLoopVF) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " - "epilogue is allowed.\n"); + if (MainLoopVF.FoldTailByMasking) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue not required as the vector loop is " + "predicated.\n";); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + if (!isCandidateForEpilogueVectorization(*OrigLoop, MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " "is not a supported candidate.\n"); return Result; @@ -5631,8 +5657,8 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0, 0}; + if (hasPlanWithVF(ForcedEC, false)) + return {ForcedEC, false, 0, 0}; else { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " "viable.\n"); @@ -5640,14 +5666,14 @@ } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + Function *TheFunction = OrigLoop->getHeader()->getParent(); + if (TheFunction->hasOptSize() || TheFunction->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!isEpilogueVectorizationProfitable(MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5656,19 +5682,20 @@ // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. 
- ElementCount EstimatedRuntimeVF = MainLoopVF; - if (MainLoopVF.isScalable()) { - EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional VScale = getVScaleForTuning(TheFunction, TTI)) + ElementCount EstimatedRuntimeVF = MainLoopVF.Width; + if (MainLoopVF.Width.isScalable()) { + EstimatedRuntimeVF = + ElementCount::getFixed(MainLoopVF.Width.getKnownMinValue()); + if (std::optional VScale = getVScaleForTuning(TheFunction, *TTI)) EstimatedRuntimeVF *= *VScale; } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + if (((!NextVF.Width.isScalable() && MainLoopVF.Width.isScalable() && ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && + ElementCount::isKnownLT(NextVF.Width, MainLoopVF.Width)) && (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + hasPlanWithVF(NextVF.Width, NextVF.FoldTailByMasking)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -6529,7 +6556,7 @@ "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (Legal->isMaskRequired(foldTailByMasking(), I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6583,7 +6610,8 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(foldTailByMasking(), I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } @@ -6618,11 +6646,12 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(foldTailByMasking(), I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(foldTailByMasking(), I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, @@ -7338,8 +7367,9 @@ return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return Legal->isMaskRequired(foldTailByMasking(), I) + ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -7508,7 +7538,8 @@ } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(LoopVectorizationCostModel &CM, + ElementCount UserVF) { assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level @@ -7537,13 +7568,13 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? 
"user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); + buildVPlans(CM, VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + return {VF, false /*TailFold*/, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7552,12 +7583,15 @@ return VectorizationFactor::Disabled(); } -std::optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +void LoopVectorizationPlanner::plan(LoopVectorizationCostModel &CM, + ElementCount UserVF, unsigned UserIC) { + CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); + assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7584,66 +7618,29 @@ if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); - if (!hasPlanWithVF(UserVF)) { - LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF - << ".\n"); - return std::nullopt; - } - - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + buildVPlansWithVPRecipes(CM, UserVF, UserVF); + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); } - // Populate the set of Vectorization Factor Candidates. - ElementCountSet VFCandidates; - for (auto VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); - for (auto VF = ElementCount::getScalable(1); - ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); - - for (const auto &VF : VFCandidates) { - // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - if (VF.isVector()) - CM.collectInstsToScalarize(VF); - } - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); - - LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) - return VectorizationFactor::Disabled(); - - // Select the optimal vectorization factor. 
- VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - if (!hasPlanWithVF(VF.Width)) { - LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width - << ".\n"); - return std::nullopt; - } - return VF; + buildVPlansWithVPRecipes(CM, ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlansWithVPRecipes(CM, ElementCount::getScalable(1), MaxFactors.ScalableVF); } -VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { +VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF, + bool FoldTailByMasking) const { assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && + [VF, FoldTailByMasking](const VPlanPtr &Plan) { + return Plan->hasVF(VF) && + Plan->foldTailByMasking() == FoldTailByMasking; + }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { - if (Plan->hasVF(VF)) + if (Plan->hasVF(VF) && Plan->foldTailByMasking() == FoldTailByMasking) return *Plan.get(); } llvm_unreachable("No plan found!"); @@ -7693,8 +7690,9 @@ assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + LLVM_DEBUG(dbgs() << "Executing best plan with TailFold=" + << (BestVPlan.foldTailByMasking() ? "true" : "false") + << ", VF=" << BestVF << ", UF=" << BestUF << '\n'); // Workaround! Compute the trip count of the original loop and cache it // before we start modifying the CFG. This code has a systemic problem @@ -8093,12 +8091,13 @@ /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, +void LoopVectorizationPlanner::buildVPlans(LoopVectorizationCostModel &CM, + ElementCount MinVF, ElementCount MaxVF) { auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - VPlans.push_back(buildVPlan(SubRange)); + VPlans.push_back(buildVPlan(CM, SubRange)); VF = SubRange.End; } } @@ -8164,7 +8163,7 @@ if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - assert(CM.foldTailByMasking() && "must fold the tail"); + assert(Plan.foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. @@ -8236,7 +8235,7 @@ return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), I)) Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or @@ -8452,7 +8451,7 @@ // vector variant at this VF requires a mask, so we synthesize an // all-true mask. 
VPValue *Mask = nullptr; - if (Legal->isMaskRequired(CI)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), CI)) Mask = createBlockInMask(CI->getParent(), *Plan); else Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( @@ -8703,22 +8702,33 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes( + LoopVectorizationCostModel &CM, ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + for (ElementCount VF = MinVF; ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + // Collect Uniform and Scalar instructions after vectorization with VF. + CM.collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + if (VF.isVector()) + CM.collectInstsToScalarize(VF); + } + // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); + auto &ConditionalAssumes = + Legal->getConditionalAssumes(CM.foldTailByMasking()); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) + if (auto Plan = tryToBuildVPlanWithVPRecipes(CM, SubRange, DeadInstructions)) VPlans.push_back(std::move(*Plan)); VF = SubRange.End; } @@ -8851,7 +8861,8 @@ } std::optional LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + LoopVectorizationCostModel &CM, VFRange &Range, + SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8885,7 +8896,7 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { + auto applyIG = [IG, &CM](ElementCount VF) -> bool { return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); @@ -8907,7 +8918,7 @@ // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + auto Plan = std::make_unique(CM.foldTailByMasking(), &CM, Preheader); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); @@ -9031,8 +9042,8 @@ VPlanTransforms::removeRedundantInductionCasts(*Plan); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + adjustRecipesForReductions(CM, cast(TopRegion->getExiting()), + Plan, RecipeBuilder, Range.Start); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
@@ -9092,7 +9103,8 @@ return std::make_optional(std::move(Plan)); } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::buildVPlan(LoopVectorizationCostModel &CM, + VFRange &Range) { // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -9101,7 +9113,7 @@ assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique(); + auto Plan = std::make_unique(CM.foldTailByMasking(), &CM); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -9123,7 +9135,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(false)); return Plan; } @@ -9133,8 +9145,8 @@ // reduction chain. For other reductions, a select is introduced between the phi // and live-out recipes when folding the tail. void LoopVectorizationPlanner::adjustRecipesForReductions( - VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + LoopVectorizationCostModel &CM, VPBasicBlock *LatchVPBB, VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = @@ -9221,7 +9233,7 @@ // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { + if (Plan->foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9888,12 +9900,10 @@ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); - LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(false, SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, + ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. - // TODO: CM is not used at this point inside the planner. Turn CM into an - // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -9901,7 +9911,7 @@ CM.collectElementTypesForWidening(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(CM, UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. 
@@ -9909,7 +9919,7 @@ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, @@ -10141,7 +10151,7 @@ assert(L->isInnermost() && "Inner loop expected."); - InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); + InterleavedAccessInfo BaseIAI(PSE, L, DT, LI, LVL.getLAI()); bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); // If an override option has been passed in for interleaved accesses, use it. @@ -10150,12 +10160,12 @@ // Analyze interleaved memory accesses. if (UseInterleaved) - IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + BaseIAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &BaseIAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -10229,21 +10239,31 @@ return false; } - // Use the cost model. - LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, - F, &Hints, IAI); - CM.collectValuesToIgnore(); - CM.collectElementTypesForWidening(); - // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, BaseIAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + // We plan with two different cost models with FoldTailByMasking = false and + // true, adding the useful vplans from each and picking the best below in + // selectVectorizationFactor. + LoopVectorizationCostModel BaseCM(false, SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, BaseIAI); + LVP.plan(BaseCM, UserVF, UserIC); + + InterleavedAccessInfo PredIAI(PSE, L, DT, LI, LVL.getLAI()); + if (UseInterleaved && SEL != CM_ScalarEpilogueAllowed) + PredIAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + LoopVectorizationCostModel PredCM(true, SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, PredIAI); + if (SEL != CM_ScalarEpilogueAllowed) + LVP.plan(PredCM, UserVF, UserIC); + // Plan how to best vectorize, return the best VF and its cost. - std::optional MaybeVF = LVP.plan(UserVF, UserIC); + // Use the cost model. + std::optional MaybeVF = LVP.selectVectorizationFactor(); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10253,7 +10273,8 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = VF.FoldTailByMasking ? PredCM.selectInterleaveCount(VF.Width, VF.Cost) + : BaseCM.selectInterleaveCount(VF.Width, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if @@ -10370,10 +10391,9 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. 
+      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking);
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
-                                 &CM, BFI, PSI, Checks);
-
-      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+                                 BestPlan.getCostModel(), BFI, PSI, Checks);
       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

       ORE->emit([&]() {
@@ -10387,17 +10407,18 @@

       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
-          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+          LVP.selectEpilogueVectorizationFactor(VF);
       if (EpilogueVF.Width.isVector()) {
-
         // The first pass vectorizes the main loop and creates a scalar epilogue
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
+        // TODO: Predicated remainders
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
-        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
-                                           EPI, &LVL, &CM, BFI, PSI, Checks);
-        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF, false);
+        EpilogueVectorizerMainLoop MainILV(
+            L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL,
+            BestMainPlan.getCostModel(), BFI, PSI, Checks);
         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                         DT, true);
         ++LoopsVectorized;
@@ -10406,11 +10427,11 @@
         // edges from the first pass.
         EPI.MainLoopVF = EPI.EpilogueVF;
         EPI.MainLoopUF = EPI.EpilogueUF;
-        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
-                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
-                                                 Checks);
+        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF, false);
+        EpilogueVectorizerEpilogueLoop EpilogILV(
+            L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL,
+            BestEpiPlan.getCostModel(), BFI, PSI, Checks);

-        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
         Header->setName("vec.epilog.vector.body");
@@ -10457,11 +10478,10 @@
       if (!MainILV.areSafetyChecksAdded())
         DisableRuntimeUnroll = true;
     } else {
+      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking);
       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
-                             VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
-                             PSI, Checks);
-
-      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+                             VF.MinProfitableTripCount, IC, &LVL,
+                             BestPlan.getCostModel(), BFI, PSI, Checks);
       LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
       ++LoopsVectorized;
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -49,6 +49,7 @@
 class InnerLoopVectorizer;
 class IRBuilderBase;
 class LoopInfo;
+class LoopVectorizationCostModel;
 class PredicatedScalarEvolution;
 class raw_ostream;
 class RecurrenceDescriptor;
@@ -2253,8 +2254,18 @@
   /// Values used outside the plan.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;

+  /// Whether this plan should fold the tail by masking.
+  bool FoldTailByMasking = false;
+
+  /// The cost model used to construct and cost this VPlan.
+  /// TODO: Remove this and the dependencies on the cost model.
+ LoopVectorizationCostModel *Cost; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(bool FoldTailByMasking = false, + LoopVectorizationCostModel *Cost = nullptr, + VPBlockBase *Entry = nullptr) + : Entry(Entry), FoldTailByMasking(FoldTailByMasking), Cost(Cost) { if (Entry) Entry->setPlan(this); } @@ -2299,6 +2310,8 @@ /// longer, because it may be stale. void disableValue2VPValue() { Value2VPValueEnabled = false; } + const SmallSetVector &getVFs() const { return VFs; } + void addVF(ElementCount VF) { VFs.insert(VF); } void setVF(ElementCount VF) { @@ -2416,6 +2429,10 @@ return LiveOuts; } + bool foldTailByMasking() const { return FoldTailByMasking; } + + LoopVectorizationCostModel *getCostModel() const { return Cost; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -794,6 +794,8 @@ std::string Out; raw_string_ostream RSO(Out); RSO << Name << " for "; + if (FoldTailByMasking) + RSO << "Tail Folded "; if (!VFs.empty()) { RSO << "VF={" << VFs[0]; for (ElementCount VF : drop_begin(VFs)) Index: llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll @@ -14,7 +14,7 @@ ; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load ; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load ; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load -; COST: LV: Selecting VF: 1. +; COST: LV: Selecting Tail folded VF: 1. 
define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) { ; CHECK-LABEL: @test( Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS: VPlan 'Initial VPlan for Tail Folded VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { ; VPLANS-NEXT: Live-in vp<[[TC:%[0-9]+]]> = original trip-count ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -778,28 +778,40 @@ ; CHECK-LABEL: @simple_memset_trip1024( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] +; CHECK-NEXT: 
[[TMP11:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: store [[BROADCAST_SPLAT3]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL @@ -10,48 +9,6 @@ ; Test the different tail folding styles. 
define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features" = "+sve" { -; NONE-LABEL: @simple_memset_tailfold( -; NONE-NEXT: entry: -; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) -; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]] -; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NONE: vector.ph: -; NONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]] -; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] -; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 -; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NONE-NEXT: br label [[VECTOR_BODY:%.*]] -; NONE: vector.body: -; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] -; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; NONE: middle.block: -; NONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] -; NONE-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; NONE: scalar.ph: -; NONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; NONE-NEXT: br label [[WHILE_BODY:%.*]] -; NONE: while.body: -; NONE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; NONE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; NONE-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; NONE-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; NONE-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; NONE-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; NONE: while.end.loopexit: -; NONE-NEXT: ret void -; ; DATA-LABEL: @simple_memset_tailfold( ; DATA-NEXT: entry: ; DATA-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) Index: llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll @@ -6,7 +6,7 @@ ; Trip count of 5 - shouldn't be vectorized. 
; CHECK-LABEL: tripcount5 -; CHECK: LV: Selecting VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 define void @tripcount5(ptr nocapture readonly %in, ptr nocapture %out, ptr nocapture readonly %consts, i32 %n) #0 { entry: %arrayidx20 = getelementptr inbounds i32, ptr %out, i32 1 @@ -388,8 +388,8 @@ ; Larger example with predication that should also not be vectorized ; CHECK-LABEL: predicated -; CHECK: LV: Selecting VF: 1 -; CHECK: LV: Selecting VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 define dso_local i32 @predicated(i32 noundef %0, ptr %glob) #0 { %2 = alloca [101 x i32], align 4 %3 = alloca [21 x i32], align 4 Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -133,7 +133,7 @@ br label %while.body while.body: - %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2049, %while.body.preheader ] + %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2051, %while.body.preheader ] %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ] %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ] %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ] Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,4 +1,5 @@ ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -6,6 +7,7 @@ ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. +; When both are allowed, the VF=8 with tail folding should win out. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -8,9 +8,9 @@ define i32 @foo() { ; CHECK-LABEL: foo -; CHECK-PWR8: Executing best plan with VF=16, UF=4 +; CHECK-PWR8: Executing best plan with TailFold=false, VF=16, UF=4 -; CHECK-PWR9: Executing best plan with VF=8, UF=8 +; CHECK-PWR9: Executing best plan with TailFold=false, VF=8, UF=8 entry: @@ -46,7 +46,7 @@ ; CHECK-LABEL: goo -; CHECK: Executing best plan with VF=16, UF=4 +; CHECK: Executing best plan with TailFold=false, VF=16, UF=4 entry: br label %for.body @@ -79,7 +79,7 @@ define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar -; CHECK: Executing best plan with VF=2, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=2, UF=12 entry: br label %for.body @@ -107,7 +107,7 @@ define void @hoo(i32 %n) { ; CHECK-LABEL: hoo -; CHECK: Executing best plan with VF=1, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=1, UF=12 entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -112,7 +112,7 @@ ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: @@ -244,7 +244,7 @@ ; CHECK-NEXT: LV: Interleaving is not beneficial. 
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -97,7 +97,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -165,7 +165,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -238,7 +238,7 @@ ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -335,7 +335,7 @@ ; Test case that requires sinking a recipe in a replicate region after another replicate region. 
define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -408,7 +408,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ ; Check for crash exposed by D76992. ; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -73,7 +73,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -151,7 +151,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -231,7 +231,7 @@ ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -300,7 +300,7 @@ ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -394,7 +394,7 @@ ; loaded value. 
define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -497,7 +497,7 @@ ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -600,7 +600,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -699,7 +699,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -760,7 +760,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -970,7 +970,7 @@ ; need to be removed before merging. define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY:
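For readers following the new control flow in LoopVectorize.cpp above: the planner is now run once per vectorization style (FoldTailByMasking = false and true), and the winner is picked afterwards by comparing the candidates gathered from both runs. The following standalone C++ sketch is only an illustration of that "plan twice, select once" pattern; the types, cost numbers, and per-lane tie-breaking policy below are hypothetical and are not the LLVM API, which compares InstructionCosts produced by the two cost models inside selectVectorizationFactor().

// Standalone sketch (hypothetical types, not the LLVM classes): candidates are
// collected for both FoldTailByMasking settings and the cheapest one per lane
// wins, mirroring how tail-folded and non-tail-folded VPlans are kept side by
// side and compared at selection time.
#include <iostream>
#include <optional>
#include <vector>

struct VFCandidate {
  unsigned Width;          // vectorization factor (number of lanes)
  bool FoldTailByMasking;  // style this candidate was planned with
  double Cost;             // estimated cost of one vector iteration
};

// Pick the candidate with the lowest cost per lane; on a tie the earlier
// (non-tail-folded) candidate is kept.
std::optional<VFCandidate>
selectBestCandidate(const std::vector<VFCandidate> &Candidates) {
  std::optional<VFCandidate> Best;
  for (const VFCandidate &C : Candidates) {
    double PerLane = C.Cost / C.Width;
    if (!Best || PerLane < Best->Cost / Best->Width)
      Best = C;
  }
  return Best;
}

int main() {
  // Candidates as they might come out of the two planning runs: first the
  // non-tail-folded ones, then the tail-folded ones (costs are made up).
  std::vector<VFCandidate> Candidates = {
      {4, false, 12.0}, {8, false, 26.0}, {4, true, 11.0}, {8, true, 20.0}};

  if (auto Best = selectBestCandidate(Candidates))
    std::cout << "best VF=" << Best->Width
              << " TailFold=" << (Best->FoldTailByMasking ? "true" : "false")
              << "\n";
  return 0;
}

With the toy numbers above the tail-folded VF=8 candidate wins, which is the same shape of outcome the updated ARM/tail-folding-reduces-vf.ll test expects when both styles are allowed to compete.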