Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -375,8 +375,9 @@ /// Returns true if vector representation of the instruction \p I /// requires mask. - bool isMaskRequired(const Instruction *I) const { - return MaskedOp.contains(I); + bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const { + return MaskedOp.contains(I) || + (FoldTailByMasking && FoldTailMaskedOp.contains(I)); } unsigned getNumStores() const { return LAI->getNumStores(); } @@ -384,8 +385,9 @@ /// Returns all assume calls in predicated blocks. They need to be dropped /// when flattening the CFG. - const SmallPtrSetImpl &getConditionalAssumes() const { - return ConditionalAssumes; + const SmallPtrSetImpl & + getConditionalAssumes(bool FoldTailByMasking) const { + return FoldTailByMasking ? FoldTailConditionalAssumes : ConditionalAssumes; } private: @@ -545,6 +547,11 @@ /// flattened. SmallPtrSet ConditionalAssumes; + /// Same as MaskedOp above when folding tail by masking. + SmallPtrSet FoldTailMaskedOp; + /// Same as ConditionalAssumes above when folding tail by masking. + SmallPtrSet FoldTailConditionalAssumes; + /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1424,14 +1424,11 @@ // The list of pointers that we can safely read and write to remains empty. SmallPtrSet SafePointers; - SmallPtrSet TmpMaskedOp; - SmallPtrSet TmpConditionalAssumes; - // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, FoldTailMaskedOp, + FoldTailConditionalAssumes)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } @@ -1439,10 +1436,6 @@ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - return true; } Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -189,6 +189,9 @@ /// Vector width with best cost. ElementCount Width; + /// Whether the entire loop is predicated. + bool FoldTailByMasking; + /// Cost of the loop with that width. InstructionCost Cost; @@ -199,17 +202,19 @@ /// to runtime checks. 
ElementCount MinProfitableTripCount; - VectorizationFactor(ElementCount Width, InstructionCost Cost, - InstructionCost ScalarCost) - : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} + VectorizationFactor(ElementCount Width, bool FoldTailByMasking, + InstructionCost Cost, InstructionCost ScalarCost) + : Width(Width), FoldTailByMasking(FoldTailByMasking), Cost(Cost), + ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0, 0}; + return {ElementCount::getFixed(1), false, 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { - return Width == rhs.Width && Cost == rhs.Cost; + return Width == rhs.Width && FoldTailByMasking == rhs.FoldTailByMasking && + Cost == rhs.Cost; } bool operator!=(const VectorizationFactor &rhs) const { @@ -266,9 +271,6 @@ /// The legality analysis. LoopVectorizationLegality *Legal; - /// The profitability analysis. - LoopVectorizationCostModel &CM; - /// The interleaved access analysis. InterleavedAccessInfo &IAI; @@ -283,28 +285,41 @@ /// A builder used to construct the current plan. VPBuilder Builder; + /// Profitable vector factors. + SmallVector ProfitableVFs; + public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} /// Plan how to best vectorize, return the best VF and its cost, or /// std::nullopt if vectorization and interleaving should be avoided up front. - std::optional plan(ElementCount UserVF, unsigned UserIC); + void plan(LoopVectorizationCostModel &CM, ElementCount UserVF, + unsigned UserIC); + + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in the plans in \p VPlans. If UserVF is not + /// ZERO then this vectorization factor will be selected if vectorization is + /// possible. + std::optional selectVectorizationFactor(); + + VectorizationFactor + selectEpilogueVectorizationFactor(const VectorizationFactor &MainVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + VectorizationFactor planInVPlanNativePath(LoopVectorizationCostModel &CM, + ElementCount UserVF); /// Return the best VPlan for \p VF. - VPlan &getBestPlanFor(ElementCount VF) const; + VPlan &getBestPlanFor(ElementCount VF, bool FoldTailByMasking) const; /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. @@ -321,9 +336,10 @@ /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. - bool hasPlanWithVF(ElementCount VF) const { - return any_of(VPlans, - [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); + bool hasPlanWithVF(ElementCount VF, bool FoldTailByMasking) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return Plan->hasVF(VF) && Plan->foldTailByMasking() == FoldTailByMasking; + }); } /// Test a \p Predicate on a \p Range of VF's. 
Return the value of applying @@ -340,33 +356,51 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(LoopVectorizationCostModel &CM, ElementCount MinVF, + ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(LoopVectorizationCostModel &CM, VFRange &Range); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. VPlanPtr - buildVPlanWithVPRecipes(VFRange &Range, + buildVPlanWithVPRecipes(LoopVectorizationCostModel &CM, VFRange &Range, SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(LoopVectorizationCostModel &CM, + ElementCount MinVF, ElementCount MaxVF); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be // converted to reductions, with one operand being vector and the other being // the scalar reduction chain. For other reductions, a select is introduced // between the phi and live-out recipes when folding the tail. - void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, + void adjustRecipesForReductions(LoopVectorizationCostModel &CM, + VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; }; } // namespace llvm Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -236,7 +236,6 @@ "force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values( - clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN( TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), @@ -1184,18 +1183,18 @@ /// different operations. 
class LoopVectorizationCostModel { public: - LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L, - PredicatedScalarEvolution &PSE, LoopInfo *LI, - LoopVectorizationLegality *Legal, + LoopVectorizationCostModel(bool FoldTailByMasking, ScalarEpilogueLowering SEL, + Loop *L, PredicatedScalarEvolution &PSE, + LoopInfo *LI, LoopVectorizationLegality *Legal, const TargetTransformInfo &TTI, const TargetLibraryInfo *TLI, DemandedBits *DB, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, const Function *F, const LoopVectorizeHints *Hints, InterleavedAccessInfo &IAI) - : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), - TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F), - Hints(Hints), InterleaveInfo(IAI) {} + : ScalarEpilogueStatus(SEL), FoldTailByMasking(FoldTailByMasking), + TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB), + AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {} /// \return An upper bound for the vectorization factors (both fixed and /// scalable). If the factors are 0, vectorization and interleaving should be @@ -1206,17 +1205,6 @@ /// otherwise. bool runtimeChecksRequired(); - /// \return The most profitable vectorization factor and the cost of that VF. - /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO - /// then this vectorization factor will be selected if vectorization is - /// possible. - VectorizationFactor - selectVectorizationFactor(const ElementCountSet &CandidateVFs); - - VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, - const LoopVectorizationPlanner &LVP); - /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. bool selectUserVectorizationFactor(ElementCount UserVF) { @@ -1234,7 +1222,7 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); + unsigned selectInterleaveCount(VectorizationFactor VF); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1563,13 +1551,15 @@ /// Returns true if a scalar epilogue is not allowed due to optsize or a /// loop hint annotation. bool isScalarEpilogueAllowed() const { - return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed || + (!FoldTailByMasking && + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate); } /// Returns the TailFoldingStyle that is best for the current loop. TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { - if (!CanFoldTailByMasking) + if (!FoldTailByMasking) return TailFoldingStyle::None; if (ForceTailFoldingStyle.getNumOccurrences()) @@ -1579,9 +1569,7 @@ } /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { - return getTailFoldingStyle() != TailFoldingStyle::None; - } + bool foldTailByMasking() const { return FoldTailByMasking; } /// Returns true if the instructions in this block requires predication /// for any reason, e.g. 
because tail folding now requires a predicate @@ -1620,11 +1608,6 @@ Function **Variant, bool *NeedsMask = nullptr) const; - /// Returns true if the per-lane cost of VectorizationFactor A is lower than - /// that of B. - bool isMoreProfitable(const VectorizationFactor &A, - const VectorizationFactor &B) const; - /// Invalidates decisions already taken by the cost model. void invalidateCostModelingDecisions() { WideningDecisions.clear(); @@ -1632,10 +1615,25 @@ Scalars.clear(); } - /// Convenience function that returns the value of vscale_range iff - /// vscale_range.min == vscale_range.max or otherwise returns the value - /// returned by the corresponding TLI method. - std::optional getVScaleForTuning() const; + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + + /// Return the NumPredStores, to be checked by the Planner. + unsigned getNumPredStores() { return NumPredStores; } private: unsigned NumPredStores = 0; @@ -1662,23 +1660,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1750,7 +1731,7 @@ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool CanFoldTailByMasking = false; + bool FoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. 
The /// presence of a cost for an instruction in the mapping indicates that the @@ -1841,16 +1822,6 @@ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1896,9 +1867,6 @@ /// All element types found in the loop. SmallPtrSet ElementTypesInLoop; - - /// Profitable vector factors. - SmallVector ProfitableVFs; }; } // end namespace llvm @@ -3511,7 +3479,8 @@ // We don't support masked function calls yet, but we can scalarize a // masked call with branches (unless VF is scalable). - if (!TLI || CI->isNoBuiltin() || !VecFunc || Legal->isMaskRequired(CI)) + if (!TLI || CI->isNoBuiltin() || !VecFunc || + Legal->isMaskRequired(foldTailByMasking(), CI)) return VF.isScalable() ? InstructionCost::getInvalid() : Cost; // If the corresponding vector cost is cheaper, return its cost. @@ -3935,7 +3904,7 @@ // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { + if (State.Plan->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; @@ -4462,7 +4431,7 @@ return false; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) + if (!Legal->isMaskRequired(foldTailByMasking(), I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we @@ -4489,13 +4458,13 @@ // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); case Instruction::Call: - return Legal->isMaskRequired(I); + return Legal->isMaskRequired(foldTailByMasking(), I); } } std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, - ElementCount VF) const { + ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || @@ -4601,7 +4570,7 @@ // load, or any gaps in a store-access). 
bool PredicatedAccessRequiresMasking = blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + Legal->isMaskRequired(foldTailByMasking(), I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -5101,12 +5070,18 @@ case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: - [[fallthrough]]; + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating " + "predicated vector loop.\n"); + break; case CM_ScalarEpilogueNotNeededUsePredicate: - LLVM_DEBUG( - dbgs() << "LV: vector predicate hint/switch found.\n" - << "LV: Not allowing scalar epilogue, creating predicated " - << "vector loop.\n"); + // If this cost model is for predicated plans then fall through to the + // prepareToFoldTailByMasking checks below, else return the unpredicated max + // size. + if (!FoldTailByMasking) + return computeFeasibleMaxVF(TC, UserVF, false); + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Trying predicated vector loop.\n"); break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize @@ -5130,17 +5105,8 @@ // a bottom-test and a single exiting block. We'd have to handle the fact // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. (TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF, false); - } + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return FixedScalableVFPair::getNone(); - } // Now try the tail folding @@ -5175,25 +5141,22 @@ if (Rem->isZero()) { // Accept MaxFixedVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxFactors; + return FoldTailByMasking ? FixedScalableVFPair::getNone() : MaxFactors; } } + // If this cost model is not for tail folding then return at this point and + // leave it for the other model. + if (!FoldTailByMasking && + ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate) + return FixedScalableVFPair::getNone(); + // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - CanFoldTailByMasking = true; - return MaxFactors; - } - - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. 
-  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
-    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a "
-                         "scalar epilogue instead.\n");
-    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+    assert(FoldTailByMasking);
     return MaxFactors;
   }
 
@@ -5323,7 +5286,12 @@
   return MaxVF;
 }
 
-std::optional LoopVectorizationCostModel::getVScaleForTuning() const {
+/// Convenience function that returns the value of vscale_range iff
+/// vscale_range.min == vscale_range.max or otherwise returns the value
+/// returned by the corresponding TLI method.
+static std::optional<unsigned>
+getVScaleForTuning(const Loop *L, const TargetTransformInfo &TTI) {
+  Function *TheFunction = L->getHeader()->getParent();
   if (TheFunction->hasFnAttribute(Attribute::VScaleRange)) {
     auto Attr = TheFunction->getFnAttribute(Attribute::VScaleRange);
     auto Min = Attr.getVScaleRangeMin();
@@ -5335,37 +5303,51 @@
   return TTI.getVScaleForTuning();
 }
 
-bool LoopVectorizationCostModel::isMoreProfitable(
+bool LoopVectorizationPlanner::isMoreProfitable(
     const VectorizationFactor &A, const VectorizationFactor &B) const {
   InstructionCost CostA = A.Cost;
   InstructionCost CostB = B.Cost;
 
-  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop);
-
-  if (!A.Width.isScalable() && !B.Width.isScalable() && foldTailByMasking() &&
-      MaxTripCount) {
-    // If we are folding the tail and the trip count is a known (possibly small)
-    // constant, the trip count will be rounded up to an integer number of
-    // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF),
-    // which we compare directly. When not folding the tail, the total cost will
-    // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is
-    // approximated with the per-lane cost below instead of using the tripcount
-    // as here.
-    auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue());
-    auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue());
+  unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop);
+
+  if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) {
+    // If the trip count is a known (possibly small) constant, the trip count
+    // will be rounded up to an integer number of iterations under
+    // FoldTailByMasking. The total cost in that case will be
+    // PerIterationCost*ceil(TripCount/VF). When not folding the tail, the total
+    // cost will be PerIterationCost*floor(TC/VF) + Scalar remainder cost, where
+    // the scalar cost is approximated using the correct fraction of the vector
+    // cost.
+    auto RTCostA =
+        A.FoldTailByMasking
+            ? (CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()))
+            : (CostA * MaxTripCount) / A.Width.getFixedValue();
+    auto RTCostB =
+        B.FoldTailByMasking
+            ? (CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()))
+            : (CostB * MaxTripCount) / B.Width.getFixedValue();
+
+    if (A.FoldTailByMasking && !B.FoldTailByMasking)
+      return RTCostA <= RTCostB;
+
     return RTCostA < RTCostB;
   }
 
   // Improve estimate for the vector width if it is scalable.
   unsigned EstimatedWidthA = A.Width.getKnownMinValue();
   unsigned EstimatedWidthB = B.Width.getKnownMinValue();
-  if (std::optional<unsigned> VScale = getVScaleForTuning()) {
+  if (std::optional<unsigned> VScale = getVScaleForTuning(OrigLoop, *TTI)) {
     if (A.Width.isScalable())
      EstimatedWidthA *= *VScale;
    if (B.Width.isScalable())
      EstimatedWidthB *= *VScale;
  }
+
+  // If one plan is predicated and the other is not, opt for the predicated
+  // scheme on a tie.
+  if (A.FoldTailByMasking && !B.FoldTailByMasking)
+    return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA);
+
   // Assume vscale may be larger than 1 (or the value being tuned for),
   // so that scalable vectorization is slightly favorable over fixed-width
   // vectorization.
@@ -5442,20 +5424,45 @@
   } while (!Tail.empty());
 }
 
-VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor(
-    const ElementCountSet &VFCandidates) {
-  InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first;
+std::optional<VectorizationFactor>
+LoopVectorizationPlanner::selectVectorizationFactor() {
+  LLVM_DEBUG(printPlans(dbgs()));
+
+  // If we had no plans as they were all invalid, return the invalid cost.
+  if (VPlans.size() == 0)
+    return std::nullopt;
+
+  // If we only have one plan due to the UserVF, return it. We try with both
+  // predicated and unpredicated loops.
+  ElementCount UserVF = Hints.getWidth();
+  bool UserPredicated = Hints.getPredicate();
+  if (UserVF && hasPlanWithVF(UserVF, UserPredicated)) {
+    VPlan &Plan = getBestPlanFor(UserVF, UserPredicated);
+    auto Cost = Plan.getCostModel()->expectedCost(UserVF);
+    if (Cost.first.isValid())
+      return VectorizationFactor(UserVF, UserPredicated, Cost.first, 0);
+  } else if (UserVF && hasPlanWithVF(UserVF, !UserPredicated)) {
+    VPlan &Plan = getBestPlanFor(UserVF, !UserPredicated);
+    auto Cost = Plan.getCostModel()->expectedCost(UserVF);
+    if (Cost.first.isValid())
+      return VectorizationFactor(UserVF, !UserPredicated, Cost.first, 0);
+  }
+
+  assert(VPlans[0]->hasScalarVFOnly() &&
+         "Expected Scalar VPlan to be the first candidate");
+
+  InstructionCost ExpectedCost =
+      VPlans[0]->getCostModel()->expectedCost(ElementCount::getFixed(1)).first;
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
-  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
-         "Expected Scalar VF to be a candidate");
-  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
-                                       ExpectedCost);
+  const VectorizationFactor ScalarCost(ElementCount::getFixed(1),
+                                       VPlans[0]->foldTailByMasking(),
+                                       ExpectedCost, ExpectedCost);
   VectorizationFactor ChosenFactor = ScalarCost;
 
-  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
-  if (ForceVectorization && VFCandidates.size() > 1) {
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization && VPlans.size() > 1) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
@@ -5463,63 +5470,79 @@
   }
 
   SmallVector InvalidCosts;
-  for (const auto &i : VFCandidates) {
-    // The cost for scalar VF=1 is already calculated, so ignore it.
-    if (i.isScalar())
-      continue;
+  for (const VPlanPtr &VPlan : drop_begin(VPlans)) {
+    for (const ElementCount &i : VPlan->getVFs()) {
+      // The cost for scalar VF=1 is already calculated, so ignore it.
+ if (i.isScalar()) + continue; - VectorizationCostTy C = expectedCost(i, &InvalidCosts); - VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost); + LoopVectorizationCostModel::VectorizationCostTy C = + VPlan->getCostModel()->expectedCost(i, &InvalidCosts); + VectorizationFactor Candidate(i, VPlan->foldTailByMasking(), C.first, + ScalarCost.ScalarCost); #ifndef NDEBUG - unsigned AssumedMinimumVscale = 1; - if (std::optional VScale = getVScaleForTuning()) - AssumedMinimumVscale = *VScale; - unsigned Width = - Candidate.Width.isScalable() - ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " - << Candidate.Cost << " => " << (Candidate.Cost / Width)); - if (i.isScalable()) - LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " - << AssumedMinimumVscale << ")"); - LLVM_DEBUG(dbgs() << ".\n"); -#endif - - if (!C.second && !ForceVectorization) { + unsigned AssumedMinimumVscale = 1; + if (std::optional VScale = getVScaleForTuning(OrigLoop, *TTI)) + AssumedMinimumVscale = *VScale; + unsigned Width = + Candidate.Width.isScalable() + ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Width.getFixedValue(); LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i - << " because it will not generate any vector instructions.\n"); - continue; - } + dbgs() << "LV: " << (VPlan->foldTailByMasking() ? "Tail folded " : "") + << "Vector loop of width " << i << " costs: " << Candidate.Cost + << " => " << (Candidate.Cost / Width)); + if (i.isScalable()) + LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " + << AssumedMinimumVscale << ")"); + LLVM_DEBUG(dbgs() << ".\n"); +#endif - // If profitable add it to ProfitableVF list. - if (isMoreProfitable(Candidate, ScalarCost)) - ProfitableVFs.push_back(Candidate); + if (!C.second && !ForceVectorization) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << i + << " because it will not generate any vector instructions.\n"); + continue; + } - if (isMoreProfitable(Candidate, ChosenFactor)) - ChosenFactor = Candidate; - } + // FIXME: Possibly remove EnableCondStoresVectorization now. + if (!EnableCondStoresVectorization && + VPlan->getCostModel()->getNumPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, OrigLoop); + continue; + } - emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop); + // If profitable add it to ProfitableVF list. 
+ if (isMoreProfitable(Candidate, ScalarCost)) + ProfitableVFs.push_back(Candidate); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", - "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); - ChosenFactor = ScalarCost; + if (isMoreProfitable(Candidate, ChosenFactor)) + ChosenFactor = Candidate; + } } - LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); + + LLVM_DEBUG({ + if (ForceVectorization && !ChosenFactor.Width.isScalar() && + !isMoreProfitable(ChosenFactor, ScalarCost)) + dbgs() << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"; + }); + LLVM_DEBUG(dbgs() << "LV: Selecting " + << (ChosenFactor.FoldTailByMasking ? "Tail folded " : "") + << "VF: " << ChosenFactor.Width << ".\n"); + assert((ChosenFactor.Width.isScalar() || ChosenFactor.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. @@ -5550,7 +5573,7 @@ return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationPlanner::isEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5558,12 +5581,12 @@ // with vectorization factors larger than a certain value. // Allow the target to opt out entirely. - if (!TTI.preferEpilogueVectorization()) + if (!TTI->preferEpilogueVectorization()) return false; // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF) <= 1) + if (TTI->getMaxInterleaveFactor(VF) <= 1) return false; // FIXME: We should consider changing the threshold for scalable // vectors to take VScaleForTuning into account. @@ -5572,25 +5595,23 @@ return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const VectorizationFactor &MainLoopVF) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); + if (MainLoopVF.FoldTailByMasking) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue not required as the vector loop is " + "predicated.\n";); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. 
- if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + if (!isCandidateForEpilogueVectorization(*OrigLoop, MainLoopVF.Width)) { LLVM_DEBUG( dbgs() << "LEV: Unable to vectorize epilogue because the loop is " "not a supported candidate.\n";); @@ -5600,8 +5621,8 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0, 0}; + if (hasPlanWithVF(ForcedEC, false)) + return {ForcedEC, false, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5610,15 +5631,15 @@ } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + if (OrigLoop->getHeader()->getParent()->hasOptSize() || + OrigLoop->getHeader()->getParent()->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n";); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!isEpilogueVectorizationProfitable(MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5627,19 +5648,20 @@ // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. - ElementCount EstimatedRuntimeVF = MainLoopVF; - if (MainLoopVF.isScalable()) { - EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional VScale = getVScaleForTuning()) + ElementCount EstimatedRuntimeVF = MainLoopVF.Width; + if (MainLoopVF.Width.isScalable()) { + EstimatedRuntimeVF = + ElementCount::getFixed(MainLoopVF.Width.getKnownMinValue()); + if (std::optional VScale = getVScaleForTuning(OrigLoop, *TTI)) EstimatedRuntimeVF *= *VScale; } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + if (((!NextVF.Width.isScalable() && MainLoopVF.Width.isScalable() && ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && + ElementCount::isKnownLT(NextVF.Width, MainLoopVF.Width)) && (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + hasPlanWithVF(NextVF.Width, NextVF.FoldTailByMasking)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -5724,8 +5746,7 @@ } unsigned -LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, - InstructionCost LoopCost) { +LoopVectorizationCostModel::selectInterleaveCount(VectorizationFactor VF) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. // There are many micro-architectural considerations that we can't predict @@ -5739,8 +5760,8 @@ // overhead. // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - - if (!isScalarEpilogueAllowed()) + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate || + VF.FoldTailByMasking || TheLoop->getHeader()->getParent()->hasOptSize()) return 1; // We used the distance for the interleave count. @@ -5755,21 +5776,22 @@ // because with the above conditions interleaving can expose ILP and break // cross iteration dependences for reductions. 
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) + !(InterleaveSmallLoopScalarReduction && HasReductions && + VF.Width.isScalar())) return 1; // If we did not calculate the cost for VF (because the user selected the VF) // then we calculate the cost of VF here. - if (LoopCost == 0) { - LoopCost = expectedCost(VF).first; - assert(LoopCost.isValid() && "Expected to have chosen a VF with valid cost"); + if (VF.Cost == 0) { + VF.Cost = expectedCost(VF.Width).first; + assert(VF.Cost.isValid() && "Expected to have chosen a VF with valid cost"); // Loop body is free and there is no need for interleaving. - if (LoopCost == 0) + if (VF.Cost == 0) return 1; } - RegisterUsage R = calculateRegisterUsage({VF})[0]; + RegisterUsage R = calculateRegisterUsage({VF.Width})[0]; // We divide by these constants so assume that we have at least one // instruction that uses at least one register. for (auto& pair : R.MaxLocalUsers) { @@ -5794,7 +5816,7 @@ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -5818,10 +5840,10 @@ } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Width); // Check if the user has overridden the max. - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -5840,8 +5862,8 @@ // the InterleaveCount as if vscale is '1', although if some information about // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Width.getKnownMinValue(), + MaxInterleaveCount); // Make sure MaxInterleaveCount is greater than 0. MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } @@ -5861,7 +5883,7 @@ // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF.isVector() && HasReductions) { + if (VF.Width.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5871,26 +5893,28 @@ // vectorized the loop we will have done the runtime check and so interleaving // won't require further checks. bool ScalarInterleavingRequiresPredication = - (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + (VF.Width.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); })); bool ScalarInterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF.Width.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. - LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' + LLVM_DEBUG(dbgs() << "LV: Loop cost is " << VF.Cost << '\n' << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF << '\n'); + << "LV: VF is " << VF.Width << '\n' + << "LV: Fold Tail is " + << (VF.FoldTailByMasking ? 
"true" : "false") << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); if (!ScalarInterleavingRequiresRuntimePointerCheck && - !ScalarInterleavingRequiresPredication && LoopCost < SmallLoopCost) { + !ScalarInterleavingRequiresPredication && VF.Cost < SmallLoopCost) { // We assume that the cost overhead is 1 and we use the cost model // to estimate the cost of the loop and interleave until the cost of the // loop overhead is about 5% of the cost of the loop. unsigned SmallIC = std::min(IC, (unsigned)llvm::bit_floor( - SmallLoopCost / *LoopCost.getValue())); + SmallLoopCost / *VF.Cost.getValue())); // Interleave until store/load ports (estimated by max interleave count) are // saturated. @@ -5947,7 +5971,7 @@ // If there are scalar reductions and TTI has enabled aggressive // interleaving for reductions, we will interleave to expose ILP. - if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + if (InterleaveSmallLoopScalarReduction && VF.Width.isScalar() && AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC @@ -6500,7 +6524,7 @@ "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (Legal->isMaskRequired(foldTailByMasking(), I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6554,7 +6578,8 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(foldTailByMasking(), I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } @@ -6589,11 +6614,12 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(foldTailByMasking(), I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(foldTailByMasking(), I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, @@ -7309,8 +7335,9 @@ return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return Legal->isMaskRequired(foldTailByMasking(), I) + ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -7478,7 +7505,8 @@ } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(LoopVectorizationCostModel &CM, + ElementCount UserVF) { assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level @@ -7507,13 +7535,13 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? 
"user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); + buildVPlans(CM, VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + return {VF, false, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7522,12 +7550,15 @@ return VectorizationFactor::Disabled(); } -std::optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +void LoopVectorizationPlanner::plan(LoopVectorizationCostModel &CM, + ElementCount UserVF, unsigned UserIC) { + CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); + assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7554,55 +7585,30 @@ if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + buildVPlansWithVPRecipes(CM, UserVF, UserVF); + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); } - // Populate the set of Vectorization Factor Candidates. - ElementCountSet VFCandidates; - for (auto VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); - for (auto VF = ElementCount::getScalable(1); - ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); - - for (const auto &VF : VFCandidates) { - // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - if (VF.isVector()) - CM.collectInstsToScalarize(VF); - } - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); - - LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) - return VectorizationFactor::Disabled(); - - // Select the optimal vectorization factor. 
- VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - return VF; + buildVPlansWithVPRecipes(CM, ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlansWithVPRecipes(CM, ElementCount::getScalable(1), + MaxFactors.ScalableVF); } -VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { +VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF, + bool FoldTailByMasking) const { assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && + [VF, FoldTailByMasking](const VPlanPtr &Plan) { + return Plan->hasVF(VF) && + Plan->foldTailByMasking() == FoldTailByMasking; + }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { - if (Plan->hasVF(VF)) + if (Plan->hasVF(VF) && Plan->foldTailByMasking() == FoldTailByMasking) return *Plan.get(); } llvm_unreachable("No plan found!"); @@ -7652,8 +7658,9 @@ assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + LLVM_DEBUG(dbgs() << "Executing best plan with TailFold=" + << (BestVPlan.foldTailByMasking() ? "true" : "false") + << ", VF=" << BestVF << ", UF=" << BestUF << '\n'); // Workaround! Compute the trip count of the original loop and cache it // before we start modifying the CFG. This code has a systemic problem @@ -8051,12 +8058,13 @@ /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, +void LoopVectorizationPlanner::buildVPlans(LoopVectorizationCostModel &CM, + ElementCount MinVF, ElementCount MaxVF) { auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back(buildVPlan(SubRange)); + VPlans.push_back(buildVPlan(CM, SubRange)); VF = SubRange.End; } } @@ -8122,7 +8130,7 @@ if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - assert(CM.foldTailByMasking() && "must fold the tail"); + assert(Plan.foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. @@ -8194,7 +8202,7 @@ return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), I)) Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or @@ -8666,22 +8674,34 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes( + LoopVectorizationCostModel &CM, ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + auto MaxVFPlusOne = MaxVF.getWithIncrement(1); + for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne); + VF *= 2) { + // Collect Uniform and Scalar instructions after vectorization with VF. 
+ CM.collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + if (VF.isVector()) + CM.collectInstsToScalarize(VF); + } + // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); + auto &ConditionalAssumes = + Legal->getConditionalAssumes(CM.foldTailByMasking()); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); - auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions)); + VPlans.push_back(buildVPlanWithVPRecipes(CM, SubRange, DeadInstructions)); VF = SubRange.End; } } @@ -8813,7 +8833,8 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + LoopVectorizationCostModel &CM, VFRange &Range, + SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8847,7 +8868,7 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { + auto applyIG = [IG, &CM](ElementCount VF) -> bool { return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); @@ -8869,7 +8890,7 @@ // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + auto Plan = std::make_unique(CM.foldTailByMasking(), &CM, Preheader); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); @@ -8989,8 +9010,8 @@ VPlanTransforms::removeRedundantInductionCasts(*Plan); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + adjustRecipesForReductions(CM, cast(TopRegion->getExiting()), + Plan, RecipeBuilder, Range.Start); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. @@ -9048,7 +9069,8 @@ return Plan; } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::buildVPlan(LoopVectorizationCostModel &CM, + VFRange &Range) { // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. 
// Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -9057,7 +9079,7 @@ assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique(); + auto Plan = std::make_unique(CM.foldTailByMasking(), &CM); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -9080,7 +9102,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(false)); return Plan; } @@ -9090,8 +9112,8 @@ // reduction chain. For other reductions, a select is introduced between the phi // and live-out recipes when folding the tail. void LoopVectorizationPlanner::adjustRecipesForReductions( - VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + LoopVectorizationCostModel &CM, VPBasicBlock *LatchVPBB, VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = @@ -9178,7 +9200,7 @@ // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { + if (Plan->foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9846,12 +9868,10 @@ ScalarEpilogueLowering SEL = getScalarEpilogueLowering( F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, *LVL, &IAI); - LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(false, SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, + ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. - // TODO: CM is not used at this point inside the planner. Turn CM into an - // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -9859,7 +9879,7 @@ CM.collectElementTypesForWidening(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(CM, UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. @@ -9867,7 +9887,7 @@ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, @@ -10099,7 +10119,7 @@ assert(L->isInnermost() && "Inner loop expected."); - InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); + InterleavedAccessInfo BaseIAI(PSE, L, DT, LI, LVL.getLAI()); bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); // If an override option has been passed in for interleaved accesses, use it. @@ -10108,12 +10128,12 @@ // Analyze interleaved memory accesses. 
if (UseInterleaved) - IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + BaseIAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = getScalarEpilogueLowering( - F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &IAI); + F, L, Hints, PSI, BFI, TTI, TLI, AC, LI, PSE.getSE(), DT, LVL, &BaseIAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -10187,21 +10207,36 @@ return false; } - // Use the cost model. - LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, - F, &Hints, IAI); - CM.collectValuesToIgnore(); - CM.collectElementTypesForWidening(); - // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, BaseIAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + // Don't use a predicated vector body if the user has forced a vectorization + // factor of 1. + if (UserVF.isScalar()) + SEL = CM_ScalarEpilogueAllowed; + + // We plan with two different cost models with FoldTailByMasking = false and + // true, adding the useful vplans from each and picking the best below in + // selectVectorizationFactor. + LoopVectorizationCostModel BaseCM(false, SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, BaseIAI); + LVP.plan(BaseCM, UserVF, UserIC); + + InterleavedAccessInfo PredIAI(PSE, L, DT, LI, LVL.getLAI()); + if (UseInterleaved && SEL != CM_ScalarEpilogueAllowed) + PredIAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + LoopVectorizationCostModel PredCM(true, SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, PredIAI); + if (SEL != CM_ScalarEpilogueAllowed) + LVP.plan(PredCM, UserVF, UserIC); + // Plan how to best vectorize, return the best VF and its cost. - std::optional MaybeVF = LVP.plan(UserVF, UserIC); + // Use the cost model. + std::optional MaybeVF = LVP.selectVectorizationFactor(); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10211,7 +10246,8 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = VF.FoldTailByMasking ? PredCM.selectInterleaveCount(VF) + : BaseCM.selectInterleaveCount(VF); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if @@ -10223,7 +10259,7 @@ bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled; if (!ForceVectorization && - !areRuntimeChecksProfitable(Checks, VF, CM.getVScaleForTuning(), L, + !areRuntimeChecksProfitable(Checks, VF, getVScaleForTuning(L, *TTI), L, *PSE.getSE())) { ORE->emit([&]() { return OptimizationRemarkAnalysisAliasing( @@ -10328,10 +10364,9 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. 
+ VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking); InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, - &CM, BFI, PSI, Checks); - - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + BestPlan.getCostModel(), BFI, PSI, Checks); LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { @@ -10345,17 +10380,18 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); + LVP.selectEpilogueVectorizationFactor(VF); if (EpilogueVF.Width.isVector()) { - // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. + // TODOD: Predicated remainders EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); - EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, - EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF, false); + EpilogueVectorizerMainLoop MainILV( + L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, + BestMainPlan.getCostModel(), BFI, PSI, Checks); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; @@ -10364,11 +10400,11 @@ // edges from the first pass. EPI.MainLoopVF = EPI.EpilogueVF; EPI.MainLoopUF = EPI.EpilogueUF; - EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC, - ORE, EPI, &LVL, &CM, BFI, PSI, - Checks); + VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF, false); + EpilogueVectorizerEpilogueLoop EpilogILV( + L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, + BestEpiPlan.getCostModel(), BFI, PSI, Checks); - VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -10415,11 +10451,10 @@ if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking); InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); - - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VF.MinProfitableTripCount, IC, &LVL, + BestPlan.getCostModel(), BFI, PSI, Checks); LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -51,6 +51,7 @@ class InnerLoopVectorizer; class IRBuilderBase; class LoopInfo; +class LoopVectorizationCostModel; class PredicateScalarEvolution; class raw_ostream; class RecurrenceDescriptor; @@ -2221,8 +2222,18 @@ /// Values used outside the plan. MapVector LiveOuts; + /// Whether this plan should foldTailByMasking + bool FoldTailByMasking = false; + + /// The cost model used to construct and cost this VPlan. + /// TODO: Remove this and the dependencies on the costmodel. 
+ LoopVectorizationCostModel *Cost; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(bool FoldTailByMasking = false, + LoopVectorizationCostModel *Cost = nullptr, + VPBlockBase *Entry = nullptr) + : Entry(Entry), FoldTailByMasking(FoldTailByMasking), Cost(Cost) { if (Entry) Entry->setPlan(this); } @@ -2267,6 +2278,8 @@ /// longer, because it may be stale. void disableValue2VPValue() { Value2VPValueEnabled = false; } + const SmallSetVector &getVFs() const { return VFs; } + void addVF(ElementCount VF) { VFs.insert(VF); } void setVF(ElementCount VF) { @@ -2405,6 +2418,10 @@ return LiveOuts; } + bool foldTailByMasking() const { return FoldTailByMasking; } + + LoopVectorizationCostModel *getCostModel() const { return Cost; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -795,6 +795,8 @@ std::string Out; raw_string_ostream RSO(Out); RSO << Name << " for "; + if (FoldTailByMasking) + RSO << "Tail Folded "; if (!VFs.empty()) { RSO << "VF={" << VFs[0]; for (ElementCount VF : drop_begin(VFs)) Index: llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -106,49 +106,39 @@ ; ; TFFALLBACK-LABEL: @test_widen( ; TFFALLBACK-NEXT: entry: -; TFFALLBACK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TFFALLBACK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; TFFALLBACK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFFALLBACK: vector.ph: +; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]] ; TFFALLBACK: vector.body: -; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_CALL_CONTINUE2:%.*]] ] -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ , [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_CALL_CONTINUE2]] ] -; TFFALLBACK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[TMP0]], i32 4, <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i64> poison) -; TFFALLBACK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; TFFALLBACK-NEXT: br i1 [[TMP1]], label [[PRED_CALL_IF:%.*]], label [[PRED_CALL_CONTINUE:%.*]] -; TFFALLBACK: pred.call.if: -; TFFALLBACK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_MASKED_LOAD]], i32 0 -; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR4:[0-9]+]] -; TFFALLBACK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 -; TFFALLBACK-NEXT: br label [[PRED_CALL_CONTINUE]] -; TFFALLBACK: pred.call.continue: -; TFFALLBACK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_CALL_IF]] ] 
-; TFFALLBACK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; TFFALLBACK-NEXT: br i1 [[TMP6]], label [[PRED_CALL_IF1:%.*]], label [[PRED_CALL_CONTINUE2]] -; TFFALLBACK: pred.call.if1: -; TFFALLBACK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[WIDE_MASKED_LOAD]], i32 1 -; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @foo(i64 [[TMP7]]) #[[ATTR4]] -; TFFALLBACK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1 -; TFFALLBACK-NEXT: br label [[PRED_CALL_CONTINUE2]] -; TFFALLBACK: pred.call.continue2: -; TFFALLBACK-NEXT: [[TMP10:%.*]] = phi <2 x i64> [ [[TMP5]], [[PRED_CALL_CONTINUE]] ], [ [[TMP9]], [[PRED_CALL_IF1]] ] -; TFFALLBACK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP10]], ptr [[TMP11]], i32 4, <2 x i1> [[ACTIVE_LANE_MASK]]) -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX_NEXT]], i64 1024) -; TFFALLBACK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], -; TFFALLBACK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 -; TFFALLBACK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TFFALLBACK: middle.block: -; TFFALLBACK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; TFFALLBACK: scalar.ph: -; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]] ; TFFALLBACK: for.body: ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4]] +; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR2:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -231,7 +221,7 @@ ; 
TFFALLBACK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[TMP0]], 50 ; TFFALLBACK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END]] ; TFFALLBACK: if.then: -; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @foo(i64 [[TMP0]]) #[[ATTR4]] +; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @foo(i64 [[TMP0]]) #[[ATTR2]] ; TFFALLBACK-NEXT: br label [[IF_END]] ; TFFALLBACK: if.end: ; TFFALLBACK-NEXT: [[TMP2:%.*]] = phi i64 [ [[TMP1]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] @@ -334,10 +324,10 @@ ; TFFALLBACK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[TMP0]], 50 ; TFFALLBACK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]] ; TFFALLBACK: if.then: -; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @foo(i64 [[TMP0]]) #[[ATTR5:[0-9]+]] +; TFFALLBACK-NEXT: [[TMP1:%.*]] = call i64 @foo(i64 [[TMP0]]) #[[ATTR3:[0-9]+]] ; TFFALLBACK-NEXT: br label [[IF_END]] ; TFFALLBACK: if.else: -; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @foo(i64 0) #[[ATTR5]] +; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @foo(i64 0) #[[ATTR3]] ; TFFALLBACK-NEXT: br label [[IF_END]] ; TFFALLBACK: if.end: ; TFFALLBACK-NEXT: [[TMP3:%.*]] = phi i64 [ [[TMP1]], [[IF_THEN]] ], [ [[TMP2]], [[IF_ELSE]] ] @@ -476,7 +466,7 @@ ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR6:[0-9]+]] +; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR4:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 @@ -605,49 +595,39 @@ ; ; TFFALLBACK-LABEL: @test_widen_optmask( ; TFFALLBACK-NEXT: entry: -; TFFALLBACK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; TFFALLBACK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; TFFALLBACK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; TFFALLBACK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; TFFALLBACK: vector.ph: +; TFFALLBACK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 +; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]] ; TFFALLBACK: vector.body: -; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_CALL_CONTINUE2:%.*]] ] -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <2 x i1> [ , [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_CALL_CONTINUE2]] ] -; TFFALLBACK-NEXT: [[TMP0:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <2 x i64> @llvm.masked.load.v2i64.p0(ptr [[TMP0]], i32 4, <2 x i1> [[ACTIVE_LANE_MASK]], <2 x i64> poison) -; TFFALLBACK-NEXT: [[TMP1:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 0 -; TFFALLBACK-NEXT: br i1 [[TMP1]], label [[PRED_CALL_IF:%.*]], label [[PRED_CALL_CONTINUE:%.*]] -; TFFALLBACK: pred.call.if: -; TFFALLBACK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[WIDE_MASKED_LOAD]], i32 0 -; TFFALLBACK-NEXT: [[TMP3:%.*]] = call i64 @foo(i64 [[TMP2]]) #[[ATTR7:[0-9]+]] -; TFFALLBACK-NEXT: 
[[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 -; TFFALLBACK-NEXT: br label [[PRED_CALL_CONTINUE]] -; TFFALLBACK: pred.call.continue: -; TFFALLBACK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ poison, [[VECTOR_BODY]] ], [ [[TMP4]], [[PRED_CALL_IF]] ] -; TFFALLBACK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[ACTIVE_LANE_MASK]], i32 1 -; TFFALLBACK-NEXT: br i1 [[TMP6]], label [[PRED_CALL_IF1:%.*]], label [[PRED_CALL_CONTINUE2]] -; TFFALLBACK: pred.call.if1: -; TFFALLBACK-NEXT: [[TMP7:%.*]] = extractelement <2 x i64> [[WIDE_MASKED_LOAD]], i32 1 -; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @foo(i64 [[TMP7]]) #[[ATTR7]] -; TFFALLBACK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[TMP8]], i32 1 -; TFFALLBACK-NEXT: br label [[PRED_CALL_CONTINUE2]] -; TFFALLBACK: pred.call.continue2: -; TFFALLBACK-NEXT: [[TMP10:%.*]] = phi <2 x i64> [ [[TMP5]], [[PRED_CALL_CONTINUE]] ], [ [[TMP9]], [[PRED_CALL_IF1]] ] -; TFFALLBACK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.v2i64.p0(<2 x i64> [[TMP10]], ptr [[TMP11]], i32 4, <2 x i1> [[ACTIVE_LANE_MASK]]) -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <2 x i1> @llvm.get.active.lane.mask.v2i1.i64(i64 [[INDEX_NEXT]], i64 1024) -; TFFALLBACK-NEXT: [[TMP12:%.*]] = xor <2 x i1> [[ACTIVE_LANE_MASK_NEXT]], -; TFFALLBACK-NEXT: [[TMP13:%.*]] = extractelement <2 x i1> [[TMP12]], i32 0 -; TFFALLBACK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 4 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 4 +; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TFFALLBACK: middle.block: -; TFFALLBACK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; TFFALLBACK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; TFFALLBACK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; TFFALLBACK: scalar.ph: -; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; TFFALLBACK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; TFFALLBACK-NEXT: br label [[FOR_BODY:%.*]] ; TFFALLBACK: for.body: ; TFFALLBACK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; TFFALLBACK-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B]], i64 [[INDVARS_IV]] ; TFFALLBACK-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 4 -; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR7]] +; TFFALLBACK-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] ; TFFALLBACK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 
[[INDVARS_IV]] ; TFFALLBACK-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 4 ; TFFALLBACK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 Index: llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll @@ -14,7 +14,7 @@ ; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load ; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load ; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load -; COST: LV: Selecting VF: 1. +; COST: LV: Selecting Tail folded VF: 1. define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) { ; CHECK-LABEL: @test( Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS: VPlan 'Initial VPlan for Tail Folded VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { ; VPLANS-NEXT: Live-in vp<[[TC:%[0-9]+]]> = original trip-count ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL @@ -10,48 +9,6 @@ ; Test the different tail folding styles. 
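As background for the tail-folding styles exercised below: conceptually, a tail-folded loop executes every vector iteration under a lane predicate, so the final partial iteration is handled by the mask rather than by a peeled scalar remainder loop. A minimal scalar sketch of that idea, with illustrative names only (this is not the vectorizer's output):

```c++
#include <cstddef>
#include <cstdint>

// Scalar model of a tail-folded ("masked") vector body for a memset-style
// loop: each conceptual vector iteration runs VF lanes under a predicate,
// so no scalar remainder loop is needed.
void simple_memset_tailfold_model(int32_t Val, int32_t *Ptr, size_t N,
                                  size_t VF) {
  for (size_t Index = 0; Index < N; Index += VF) {   // vector.body
    for (size_t Lane = 0; Lane < VF; ++Lane) {       // lanes of one iteration
      bool ActiveLane = Index + Lane < N;            // active.lane.mask
      if (ActiveLane)                                // masked store
        Ptr[Index + Lane] = Val;
    }
  }
}
```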
define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features" = "+sve" { -; NONE-LABEL: @simple_memset_tailfold( -; NONE-NEXT: entry: -; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) -; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]] -; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NONE: vector.ph: -; NONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]] -; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] -; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 -; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NONE-NEXT: br label [[VECTOR_BODY:%.*]] -; NONE: vector.body: -; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] -; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; NONE: middle.block: -; NONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] -; NONE-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; NONE: scalar.ph: -; NONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; NONE-NEXT: br label [[WHILE_BODY:%.*]] -; NONE: while.body: -; NONE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; NONE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; NONE-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; NONE-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; NONE-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; NONE-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; NONE: while.end.loopexit: -; NONE-NEXT: ret void -; ; DATA-LABEL: @simple_memset_tailfold( ; DATA-NEXT: entry: ; DATA-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) Index: llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll @@ -6,7 +6,7 @@ ; Trip count of 5 - shouldn't be vectorized. 
; CHECK-LABEL: tripcount5 -; CHECK: LV: Selecting VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 define void @tripcount5(ptr nocapture readonly %in, ptr nocapture %out, ptr nocapture readonly %consts, i32 %n) #0 { entry: %arrayidx20 = getelementptr inbounds i32, ptr %out, i32 1 @@ -388,8 +388,8 @@ ; Larger example with predication that should also not be vectorized ; CHECK-LABEL: predicated -; CHECK: LV: Selecting VF: 1 -; CHECK: LV: Selecting VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 define dso_local i32 @predicated(i32 noundef %0, ptr %glob) #0 { %2 = alloca [101 x i32], align 4 %3 = alloca [21 x i32], align 4 Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,11 +1,13 @@ ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT -; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=DEFAULT target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. +; When both are allowed, the VF=16 without tail folding should win out. 
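The comment above summarises the point of planning with both cost models: each (VF, tail-folding) candidate is costed and the cheapest plan wins, so an unpredicated VF=16 body can beat a predicated VF=8 one. A minimal sketch of such a selection, using hypothetical types and cost values rather than the actual selectVectorizationFactor logic:

```c++
#include <limits>
#include <vector>

// Hypothetical candidate description; not the LoopVectorize implementation.
struct CandidateVF {
  unsigned Width;          // vectorization factor
  bool FoldTailByMasking;  // predicated body vs. scalar epilogue
  double Cost;             // estimated cost per scalar iteration
};

// Pick the cheapest candidate, regardless of whether it folds its tail.
CandidateVF pickBest(const std::vector<CandidateVF> &Candidates) {
  CandidateVF Best{1, false, std::numeric_limits<double>::max()};
  for (const CandidateVF &C : Candidates)
    if (C.Cost < Best.Cost)
      Best = C;
  return Best;
}
```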
;
; DEFAULT: load <16 x i8>, <16 x i8>*
; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16>
Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
+++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll
@@ -8,9 +8,9 @@
define i32 @foo() {
; CHECK-LABEL: foo
-; CHECK-PWR8: Executing best plan with VF=16, UF=4
+; CHECK-PWR8: Executing best plan with TailFold=false, VF=16, UF=4
-; CHECK-PWR9: Executing best plan with VF=8, UF=8
+; CHECK-PWR9: Executing best plan with TailFold=false, VF=8, UF=8
entry:
@@ -46,7 +46,7 @@
; CHECK-LABEL: goo
-; CHECK: Executing best plan with VF=16, UF=4
+; CHECK: Executing best plan with TailFold=false, VF=16, UF=4
entry:
br label %for.body
@@ -79,7 +79,7 @@
define i64 @bar(ptr nocapture %a) {
; CHECK-LABEL: bar
-; CHECK: Executing best plan with VF=2, UF=12
+; CHECK: Executing best plan with TailFold=false, VF=2, UF=12
entry:
br label %for.body
@@ -107,7 +107,7 @@
define void @hoo(i32 %n) {
; CHECK-LABEL: hoo
-; CHECK: Executing best plan with VF=1, UF=12
+; CHECK: Executing best plan with TailFold=false, VF=1, UF=12
entry:
br label %for.body
Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
+++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll
@@ -108,11 +108,12 @@
; CHECK-NEXT: LV: Loop cost is 23
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
+; CHECK-NEXT: LV: Fold Tail is false
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
-; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
+; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1
; CHECK-NEXT: LV: Interleaving disabled by the pass manager
;
entry:
@@ -240,11 +241,12 @@
; CHECK-NEXT: LV: Loop cost is 23
; CHECK-NEXT: LV: IC is 1
; CHECK-NEXT: LV: VF is vscale x 4
+; CHECK-NEXT: LV: Fold Tail is false
; CHECK-NEXT: LV: Not Interleaving.
; CHECK-NEXT: LV: Interleaving is not beneficial.
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in
; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop
-; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1
+; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1
; CHECK-NEXT: LV: Interleaving disabled by the pass manager
;
entry:
Index: llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
+++ llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll
@@ -14,7 +14,9 @@
;
;
-; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata.
+; This loop will be vectorized, although the trip count is below the threshold, but
+; vectorization is explicitly forced in metadata. A VF of 4 is chosen as it evenly
+; divides the trip count of 20, producing a lower total cost.
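For the trip count of 20 mentioned above, the arithmetic is simple: VF=8 covers only 16 iterations in the vector body and leaves 4 scalar iterations, while VF=4 covers all 20 with no remainder. A small sketch of that comparison, with made-up per-iteration cost weights used purely for illustration:

```c++
#include <cstdio>

// Hypothetical cost comparison for a loop with trip count 20; only the
// vector/remainder split mirrors the test, the weights are assumptions.
int main() {
  const unsigned TripCount = 20;
  const double VectorIterCost = 1.0; // assumed cost of one vector iteration
  const double ScalarIterCost = 1.0; // assumed cost of one scalar iteration
  for (unsigned VF : {4u, 8u}) {
    unsigned VectorIters = TripCount / VF; // full vector iterations
    unsigned Remainder = TripCount % VF;   // scalar epilogue iterations
    double Total = VectorIters * VectorIterCost + Remainder * ScalarIterCost;
    std::printf("VF=%u: %u vector iters + %u scalar iters, total cost %.1f\n",
                VF, VectorIters, Remainder, Total);
  }
  return 0;
}
```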
; define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( @@ -27,20 +29,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 20 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -52,7 +54,7 @@ ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -97,7 +97,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, 
ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -165,7 +165,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -238,7 +238,7 @@ ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -335,7 +335,7 @@ ; Test case that requires sinking a recipe in a replicate region after another replicate region. define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -408,7 +408,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence.ll @@ -74,7 +74,7 @@ ; UNROLL-NO-IC-NEXT: store i32 [[ADD35]], ptr [[ARRAYIDX34]], align 4 ; UNROLL-NO-IC-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; UNROLL-NO-IC-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; UNROLL-NO-IC-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL-NO-IC: for.exit: ; UNROLL-NO-IC-NEXT: ret void ; @@ -130,7 +130,7 @@ ; UNROLL-NO-VF-NEXT: store i32 [[ADD35]], ptr [[ARRAYIDX34]], align 4 ; UNROLL-NO-VF-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; UNROLL-NO-VF: for.exit: ; UNROLL-NO-VF-NEXT: ret void ; @@ -185,7 +185,7 @@ ; 
SINK-AFTER-NEXT: store i32 [[ADD35]], ptr [[ARRAYIDX34]], align 4 ; SINK-AFTER-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; SINK-AFTER-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; SINK-AFTER-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; SINK-AFTER: for.exit: ; SINK-AFTER-NEXT: ret void ; @@ -342,7 +342,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP18]] = select i1 [[TMP16]], i32 [[VEC_PHI1]], i32 [[TMP14]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt i32 [[TMP17]], [[TMP18]] ; UNROLL-NO-VF-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select i1 [[RDX_MINMAX_CMP]], i32 [[TMP17]], i32 [[TMP18]] @@ -373,7 +373,7 @@ ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; ; SINK-AFTER-LABEL: @recurrence_2( ; SINK-AFTER-NEXT: entry: @@ -619,7 +619,7 @@ ; UNROLL-NO-VF-NEXT: store double [[TMP17]], ptr [[TMP19]], align 8 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP3]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] @@ -641,7 +641,7 @@ ; UNROLL-NO-VF-NEXT: [[ADVARS_IV_NEXT]] = add nuw nsw i64 [[ADVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[LFTR_WIDEIV:%.*]] = trunc i64 [[ADVARS_IV_NEXT]] to i32 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[SCALAR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; UNROLL-NO-VF: for.end.loopexit: ; UNROLL-NO-VF-NEXT: br label [[FOR_END]] ; UNROLL-NO-VF: for.end: @@ -980,18 +980,18 @@ ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: ; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP1:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ [[E_015]], [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[I_016]], [[INDEX]] -; UNROLL-NO-VF-NEXT: 
[[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP2]] = add i32 [[OFFSET_IDX]], -1 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP0]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP3]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[TMP3]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ [[E_015]], [[FOR_COND1_PREHEADER]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[I_016]], [[FOR_COND1_PREHEADER]] ] ; UNROLL-NO-VF-NEXT: br label [[FOR_COND1:%.*]] ; UNROLL-NO-VF: for.cond.cleanup: @@ -1002,9 +1002,9 @@ ; UNROLL-NO-VF-NEXT: [[K_0]] = phi i32 [ [[DEC:%.*]], [[FOR_COND1]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; UNROLL-NO-VF-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[K_0]], 1 ; UNROLL-NO-VF-NEXT: [[DEC]] = add nsw i32 [[K_0]], -1 -; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP8:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP2]], label [[FOR_COND1]], label [[FOR_COND_CLEANUP3]], !llvm.loop [[LOOP9:![0-9]+]] ; UNROLL-NO-VF: for.cond.cleanup3: -; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[E_1_LCSSA]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_COND1]] ], [ [[TMP1]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[INC]] = add nuw nsw i32 [[I_016]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 49 ; UNROLL-NO-VF-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 @@ -1199,7 +1199,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP10]] = load i32, ptr [[TMP8]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1214,7 +1214,7 @@ ; UNROLL-NO-VF-NEXT: [[VAR1:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_NEXT]] ; UNROLL-NO-VF-NEXT: [[VAR2]] = load i32, ptr [[VAR1]], align 4 ; UNROLL-NO-VF-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -1338,7 +1338,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP1]] = add i64 0, 1 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = 
add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], undef -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 undef, undef ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1352,7 +1352,7 @@ ; UNROLL-NO-VF-NEXT: [[VAR3]] = add i64 0, 1 ; UNROLL-NO-VF-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 ; UNROLL-NO-VF-NEXT: [[COND:%.*]] = icmp eq i64 [[I_NEXT]], undef -; UNROLL-NO-VF-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[COND]], label [[FOR_END]], label [[SCALAR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -1464,7 +1464,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP3]] = add i32 [[TMP1]], [[X]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 96 -; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 96, 96 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1479,7 +1479,7 @@ ; UNROLL-NO-VF-NEXT: [[BC:%.*]] = zext i32 [[INC_PHI]] to i64 ; UNROLL-NO-VF-NEXT: [[ADDX]] = add i32 [[INC_PHI]], [[X]] ; UNROLL-NO-VF-NEXT: [[CMP:%.*]] = icmp eq i32 [[INC_PHI]], 95 -; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: [[VAL_PHI_LCSSA:%.*]] = phi i32 [ [[SCALAR_RECUR]], [[FOR_BODY]] ], [ [[TMP2]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: ret i32 [[VAL_PHI_LCSSA]] @@ -1695,7 +1695,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP15]] = add i32 [[VEC_PHI3]], [[TMP13]] ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 10240 -; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP15]], [[TMP14]] ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 10240, 10240 @@ -1723,7 +1723,7 @@ ; UNROLL-NO-VF-NEXT: [[INC1]] = add nuw nsw i32 [[I_011]], 1 ; UNROLL-NO-VF-NEXT: [[ADD_PTR]] = getelementptr inbounds double, ptr [[B_ADDR_012]], i64 25 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC1]], 10240 -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; ; SINK-AFTER-LABEL: @PR33613( ; SINK-AFTER-NEXT: entry: @@ -1930,7 +1930,7 @@ ; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], ptr [[TMP15]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp 
eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -1950,7 +1950,7 @@ ; UNROLL-NO-VF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; UNROLL-NO-VF-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX5]], align 4 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -2175,7 +2175,7 @@ ; UNROLL-NO-VF-NEXT: store i32 [[TMP13]], ptr [[TMP15]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -2197,7 +2197,7 @@ ; UNROLL-NO-VF-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX5]], align 4 ; UNROLL-NO-VF-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -2405,7 +2405,7 @@ ; UNROLL-NO-VF-NEXT: store i32 [[TMP15]], ptr [[TMP17]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -2426,7 +2426,7 @@ ; UNROLL-NO-VF-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] ; UNROLL-NO-VF-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX5]], align 4 ; UNROLL-NO-VF-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[N]] -; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -2683,7 +2683,7 @@ ; UNROLL-NO-VF-NEXT: [[TMP7]] = add i16 [[TMP3]], 5 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 42 
-; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 43, 42 ; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -2701,7 +2701,7 @@ ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add i16 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[REC_2_PREV]] = zext i16 [[IV_NEXT]] to i32 ; UNROLL-NO-VF-NEXT: [[REC_1_PREV]] = add i16 [[IV_NEXT]], 5 -; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP24:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP]], label [[FOR_END]], label [[FOR_COND]], !llvm.loop [[LOOP25:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; @@ -2905,50 +2905,34 @@ ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-VF-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 +; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: -; UNROLL-NO-VF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 -; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 -; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 +; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_UDIV_CONTINUE4:%.*]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_UDIV_CONTINUE4]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[PRED_UDIV_CONTINUE4]] ] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[VEC_IV2:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV2]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP2]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] -; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = udiv i32 219220132, [[TMP4]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] -; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: 
[[TMP6:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP3]], label [[PRED_UDIV_IF3:%.*]], label [[PRED_UDIV_CONTINUE4]] -; UNROLL-NO-VF: pred.udiv.if3: -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = add i32 [[OFFSET_IDX]], -1 -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 219220132, [[TMP7]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE4]] -; UNROLL-NO-VF: pred.udiv.continue4: -; UNROLL-NO-VF-NEXT: [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF3]] ] -; UNROLL-NO-VF-NEXT: [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] -; UNROLL-NO-VF-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP6]] -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = select i1 [[TMP2]], i32 [[TMP10]], i32 [[VEC_PHI]] -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = select i1 [[TMP3]], i32 [[TMP11]], i32 [[VEC_PHI1]] -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25:![0-9]+]], !llvm.loop [[LOOP26:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = udiv i32 219220132, [[TMP2]] +; UNROLL-NO-VF-NEXT: [[TMP5]] = udiv i32 219220132, [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP6]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] +; UNROLL-NO-VF-NEXT: [[TMP7]] = add i32 [[VEC_PHI1]], [[TMP4]] +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26:![0-9]+]], !llvm.loop [[LOOP27:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP13]], [[TMP12]] -; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] +; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP7]], [[TMP6]] +; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[BB1:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: -; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] ; UNROLL-NO-VF-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[BB]] ], [ [[BIN_RDX]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: br label [[BB2:%.*]] @@ -2963,7 +2947,7 @@ ; UNROLL-NO-VF-NEXT: [[VAR7]] = udiv i32 219220132, [[VAR3]] ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27:![0-9]+]], !llvm.loop [[LOOP28:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28:![0-9]+]], !llvm.loop [[LOOP29:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region( ; SINK-AFTER-NEXT: bb: @@ -3280,62 +3264,38 @@ ; UNROLL-NO-VF-NEXT: [[TMP0:%.*]] = add i32 [[Y:%.*]], 1 ; UNROLL-NO-VF-NEXT: [[SMIN:%.*]] = call i32 @llvm.smin.i32(i32 [[Y]], i32 1) ; UNROLL-NO-VF-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[SMIN]] -; UNROLL-NO-VF-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; UNROLL-NO-VF-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP1]], 2 +; UNROLL-NO-VF-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; UNROLL-NO-VF: vector.ph: -; UNROLL-NO-VF-NEXT: [[N_RND_UP:%.*]] = add i32 [[TMP1]], 1 -; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 2 -; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]] +; UNROLL-NO-VF-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[TMP1]], 2 +; UNROLL-NO-VF-NEXT: [[N_VEC:%.*]] = sub i32 [[TMP1]], [[N_MOD_VF]] ; UNROLL-NO-VF-NEXT: [[IND_END:%.*]] = sub i32 [[Y]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[TMP1]], 1 ; UNROLL-NO-VF-NEXT: br label [[VECTOR_BODY:%.*]] ; UNROLL-NO-VF: vector.body: -; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE7:%.*]] ] -; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[PRED_STORE_CONTINUE7]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[PRED_STORE_CONTINUE7]] ] -; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[PRED_STORE_CONTINUE7]] ] +; UNROLL-NO-VF-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VECTOR_RECUR:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 1 ; UNROLL-NO-VF-NEXT: [[OFFSET_IDX:%.*]] = sub i32 [[Y]], [[INDEX]] -; UNROLL-NO-VF-NEXT: [[TMP2:%.*]] = add i32 [[OFFSET_IDX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP3:%.*]] = add i32 [[OFFSET_IDX]], -1 -; UNROLL-NO-VF-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[VEC_IV3:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = icmp ule i32 [[VEC_IV]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = icmp ule i32 [[VEC_IV3]], [[TRIP_COUNT_MINUS_1]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] -; UNROLL-NO-VF: pred.udiv.if: -; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = udiv i32 219220132, [[TMP2]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE]] -; UNROLL-NO-VF: pred.udiv.continue: -; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = phi i32 [ poison, [[VECTOR_BODY]] ], [ [[TMP6]], [[PRED_UDIV_IF]] ] -; UNROLL-NO-VF-NEXT: br i1 [[TMP5]], label [[PRED_UDIV_IF4:%.*]], label [[PRED_UDIV_CONTINUE5:%.*]] -; UNROLL-NO-VF: pred.udiv.if4: -; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 219220132, [[TMP3]] -; UNROLL-NO-VF-NEXT: br label [[PRED_UDIV_CONTINUE5]] -; UNROLL-NO-VF: pred.udiv.continue5: -; UNROLL-NO-VF-NEXT: [[TMP9]] = phi i32 [ poison, [[PRED_UDIV_CONTINUE]] ], [ [[TMP8]], [[PRED_UDIV_IF4]] ] +; UNROLL-NO-VF-NEXT: [[TMP4:%.*]] = add i32 [[OFFSET_IDX]], 0 +; UNROLL-NO-VF-NEXT: [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], -1 +; UNROLL-NO-VF-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP2]] +; UNROLL-NO-VF-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP3]] +; UNROLL-NO-VF-NEXT: [[TMP8:%.*]] = udiv i32 219220132, [[TMP4]] +; UNROLL-NO-VF-NEXT: [[TMP9]] = udiv i32 219220132, [[TMP5]] ; UNROLL-NO-VF-NEXT: [[TMP10]] = add i32 [[VEC_PHI]], [[VECTOR_RECUR]] -; UNROLL-NO-VF-NEXT: [[TMP11]] = 
add i32 [[VEC_PHI2]], [[TMP7]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; UNROLL-NO-VF: pred.store.if: -; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = add i32 [[INDEX]], 0 -; UNROLL-NO-VF-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[X:%.*]], i32 [[TMP12]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP2]], ptr [[TMP13]], align 4 -; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE]] -; UNROLL-NO-VF: pred.store.continue: -; UNROLL-NO-VF-NEXT: br i1 [[TMP5]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7]] -; UNROLL-NO-VF: pred.store.if6: -; UNROLL-NO-VF-NEXT: [[TMP14:%.*]] = add i32 [[INDEX]], 1 -; UNROLL-NO-VF-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 [[TMP14]] -; UNROLL-NO-VF-NEXT: store i32 [[TMP3]], ptr [[TMP15]], align 4 -; UNROLL-NO-VF-NEXT: br label [[PRED_STORE_CONTINUE7]] -; UNROLL-NO-VF: pred.store.continue7: -; UNROLL-NO-VF-NEXT: [[TMP16:%.*]] = select i1 [[TMP4]], i32 [[TMP10]], i32 [[VEC_PHI]] -; UNROLL-NO-VF-NEXT: [[TMP17:%.*]] = select i1 [[TMP5]], i32 [[TMP11]], i32 [[VEC_PHI2]] -; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 2 -; UNROLL-NO-VF-NEXT: [[TMP18:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; UNROLL-NO-VF-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF25]], !llvm.loop [[LOOP29:![0-9]+]] +; UNROLL-NO-VF-NEXT: [[TMP11]] = add i32 [[VEC_PHI2]], [[TMP8]] +; UNROLL-NO-VF-NEXT: store i32 [[TMP4]], ptr [[TMP6]], align 4 +; UNROLL-NO-VF-NEXT: store i32 [[TMP5]], ptr [[TMP7]], align 4 +; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 +; UNROLL-NO-VF-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF26]], !llvm.loop [[LOOP30:![0-9]+]] ; UNROLL-NO-VF: middle.block: -; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP17]], [[TMP16]] -; UNROLL-NO-VF-NEXT: br i1 true, label [[BB1:%.*]], label [[SCALAR_PH]] +; UNROLL-NO-VF-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP10]] +; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP1]], [[N_VEC]] +; UNROLL-NO-VF-NEXT: br i1 [[CMP_N]], label [[BB1:%.*]], label [[SCALAR_PH]] ; UNROLL-NO-VF: scalar.ph: ; UNROLL-NO-VF-NEXT: [[SCALAR_RECUR_INIT:%.*]] = phi i32 [ 0, [[BB:%.*]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; UNROLL-NO-VF-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[Y]], [[BB]] ] @@ -3357,7 +3317,7 @@ ; UNROLL-NO-VF-NEXT: [[VAR8]] = add nsw i32 [[VAR3]], -1 ; UNROLL-NO-VF-NEXT: [[IV_NEXT]] = add nsw i32 [[IV]], 1 ; UNROLL-NO-VF-NEXT: [[VAR9:%.*]] = icmp slt i32 [[VAR3]], 2 -; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF27]], !llvm.loop [[LOOP30:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VAR9]], label [[BB1]], label [[BB2]], !prof [[PROF28]], !llvm.loop [[LOOP31:![0-9]+]] ; ; SINK-AFTER-LABEL: @sink_into_replication_region_multiple( ; SINK-AFTER-NEXT: bb: @@ -3589,7 +3549,7 @@ ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[TMP9]], align 4 ; UNROLL-NO-VF-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 2 ; UNROLL-NO-VF-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 -; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP31:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] ; UNROLL-NO-VF: middle.block: ; UNROLL-NO-VF-NEXT: [[CMP_N:%.*]] = icmp eq i32 16, 16 ; UNROLL-NO-VF-NEXT: br i1 
[[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -3610,7 +3570,7 @@ ; UNROLL-NO-VF-NEXT: [[EXT:%.*]] = zext i1 [[B3]] to i32 ; UNROLL-NO-VF-NEXT: [[A_GEP:%.*]] = getelementptr i32, ptr [[A_PTR]], i16 [[IV]] ; UNROLL-NO-VF-NEXT: store i32 0, ptr [[A_GEP]], align 4 -; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP32:![0-9]+]] +; UNROLL-NO-VF-NEXT: br i1 [[VEC_DEAD]], label [[FOR_END]], label [[LOOP]], !llvm.loop [[LOOP33:![0-9]+]] ; UNROLL-NO-VF: for.end: ; UNROLL-NO-VF-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ ; Check for crash exposed by D76992. ; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll +++ llvm/test/Transforms/LoopVectorize/pr45679-fold-tail-by-masking.ll @@ -67,7 +67,7 @@ ; CHECK-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14 -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -129,7 +129,7 @@ ; VF2UF2-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; VF2UF2-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; VF2UF2-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14 -; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; VF2UF2-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF2UF2: exit: ; VF2UF2-NEXT: ret void ; @@ -139,50 +139,27 @@ ; VF1UF4: vector.ph: ; VF1UF4-NEXT: br label [[VECTOR_BODY:%.*]] ; VF1UF4: vector.body: -; VF1UF4-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] -; VF1UF4-NEXT: [[VEC_IV:%.*]] = add i32 [[INDEX]], 0 -; VF1UF4-NEXT: [[VEC_IV4:%.*]] = add i32 [[INDEX]], 1 -; VF1UF4-NEXT: [[VEC_IV5:%.*]] = add i32 [[INDEX]], 2 -; VF1UF4-NEXT: [[VEC_IV6:%.*]] = add i32 [[INDEX]], 3 -; VF1UF4-NEXT: [[TMP0:%.*]] = icmp ule i32 [[VEC_IV]], 13 -; VF1UF4-NEXT: [[TMP1:%.*]] = icmp ule i32 [[VEC_IV4]], 13 -; VF1UF4-NEXT: [[TMP2:%.*]] = icmp ule i32 [[VEC_IV5]], 13 -; VF1UF4-NEXT: [[TMP3:%.*]] = icmp ule i32 [[VEC_IV6]], 13 -; VF1UF4-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; VF1UF4: pred.store.if: -; VF1UF4-NEXT: [[INDUCTION:%.*]] = add i32 [[INDEX]], 0 -; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i32 [[INDUCTION]] +; VF1UF4-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VF1UF4-NEXT: [[TMP0:%.*]] = add i32 [[INDEX]], 0 +; VF1UF4-NEXT: [[TMP1:%.*]] = add i32 [[INDEX]], 1 +; VF1UF4-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 2 +; VF1UF4-NEXT: [[TMP3:%.*]] = add i32 [[INDEX]], 3 +; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr 
[[A:%.*]], i32 [[TMP0]] +; VF1UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP1]] +; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP2]] +; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[TMP3]] ; VF1UF4-NEXT: store i32 13, ptr [[TMP4]], align 1 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]] -; VF1UF4: pred.store.continue: -; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; VF1UF4: pred.store.if4: -; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i32 [[INDEX]], 1 -; VF1UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDUCTION1]] ; VF1UF4-NEXT: store i32 13, ptr [[TMP5]], align 1 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE8]] -; VF1UF4: pred.store.continue5: -; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; VF1UF4: pred.store.if6: -; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i32 [[INDEX]], 2 -; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDUCTION2]] ; VF1UF4-NEXT: store i32 13, ptr [[TMP6]], align 1 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE10]] -; VF1UF4: pred.store.continue7: -; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] -; VF1UF4: pred.store.if8: -; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i32 [[INDEX]], 3 -; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 [[INDUCTION3]] ; VF1UF4-NEXT: store i32 13, ptr [[TMP7]], align 1 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE12]] -; VF1UF4: pred.store.continue9: -; VF1UF4-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 -; VF1UF4-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 16 +; VF1UF4-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 +; VF1UF4-NEXT: [[TMP8:%.*]] = icmp eq i32 [[INDEX_NEXT]], 12 ; VF1UF4-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VF1UF4: middle.block: -; VF1UF4-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VF1UF4-NEXT: [[CMP_N:%.*]] = icmp eq i32 14, 12 +; VF1UF4-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; VF1UF4: scalar.ph: -; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ 12, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; VF1UF4-NEXT: br label [[LOOP:%.*]] ; VF1UF4: loop: ; VF1UF4-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] @@ -190,7 +167,7 @@ ; VF1UF4-NEXT: store i32 13, ptr [[ARRAYIDX]], align 1 ; VF1UF4-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; VF1UF4-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], 14 -; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP2:![0-9]+]] +; VF1UF4-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP3:![0-9]+]] ; VF1UF4: exit: ; VF1UF4-NEXT: ret void ; @@ -356,58 +333,31 @@ ; VF1UF4: vector.ph: ; VF1UF4-NEXT: br label [[VECTOR_BODY:%.*]] ; VF1UF4: vector.body: -; VF1UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] -; VF1UF4-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; VF1UF4-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 -; VF1UF4-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 -; VF1UF4-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3 -; VF1UF4-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 13 -; VF1UF4-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 13 -; VF1UF4-NEXT: 
[[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 13 -; VF1UF4-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 13 -; VF1UF4-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; VF1UF4: pred.store.if: -; VF1UF4-NEXT: [[INDUCTION:%.*]] = add i64 [[INDEX]], 0 -; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDUCTION]] -; VF1UF4-NEXT: [[TMP5:%.*]] = load i64, ptr [[TMP4]], align 8 -; VF1UF4-NEXT: store i64 [[TMP5]], ptr [[B:%.*]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE]] -; VF1UF4: pred.store.continue: -; VF1UF4-NEXT: [[TMP6:%.*]] = phi i64 [ poison, [[VECTOR_BODY]] ], [ [[TMP5]], [[PRED_STORE_IF]] ] -; VF1UF4-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; VF1UF4: pred.store.if4: -; VF1UF4-NEXT: [[INDUCTION1:%.*]] = add i64 [[INDEX]], 1 -; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDUCTION1]] -; VF1UF4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP7]], align 8 -; VF1UF4-NEXT: store i64 [[TMP8]], ptr [[B]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE8]] -; VF1UF4: pred.store.continue5: -; VF1UF4-NEXT: [[TMP9:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE]] ], [ [[TMP8]], [[PRED_STORE_IF7]] ] -; VF1UF4-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; VF1UF4: pred.store.if6: -; VF1UF4-NEXT: [[INDUCTION2:%.*]] = add i64 [[INDEX]], 2 -; VF1UF4-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDUCTION2]] -; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP10]], align 8 +; VF1UF4-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VF1UF4-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; VF1UF4-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; VF1UF4-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; VF1UF4-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; VF1UF4-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[TMP0]] +; VF1UF4-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP1]] +; VF1UF4-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP2]] +; VF1UF4-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP3]] +; VF1UF4-NEXT: [[TMP8:%.*]] = load i64, ptr [[TMP4]], align 8 +; VF1UF4-NEXT: [[TMP9:%.*]] = load i64, ptr [[TMP5]], align 8 +; VF1UF4-NEXT: [[TMP10:%.*]] = load i64, ptr [[TMP6]], align 8 +; VF1UF4-NEXT: [[TMP11:%.*]] = load i64, ptr [[TMP7]], align 8 +; VF1UF4-NEXT: store i64 [[TMP8]], ptr [[B:%.*]], align 8 +; VF1UF4-NEXT: store i64 [[TMP9]], ptr [[B]], align 8 +; VF1UF4-NEXT: store i64 [[TMP10]], ptr [[B]], align 8 ; VF1UF4-NEXT: store i64 [[TMP11]], ptr [[B]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE10]] -; VF1UF4: pred.store.continue7: -; VF1UF4-NEXT: [[TMP12:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE8]] ], [ [[TMP11]], [[PRED_STORE_IF9]] ] -; VF1UF4-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] -; VF1UF4: pred.store.if8: -; VF1UF4-NEXT: [[INDUCTION3:%.*]] = add i64 [[INDEX]], 3 -; VF1UF4-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[INDUCTION3]] -; VF1UF4-NEXT: [[TMP14:%.*]] = load i64, ptr [[TMP13]], align 8 -; VF1UF4-NEXT: store i64 [[TMP14]], ptr [[B]], align 8 -; VF1UF4-NEXT: br label [[PRED_STORE_CONTINUE12]] -; VF1UF4: pred.store.continue9: -; VF1UF4-NEXT: [[TMP15:%.*]] = phi i64 [ poison, [[PRED_STORE_CONTINUE10]] ], [ [[TMP14]], [[PRED_STORE_IF11]] ] -; VF1UF4-NEXT: [[INDEX_NEXT]] = add i64 
[[INDEX]], 4 -; VF1UF4-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; VF1UF4-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; VF1UF4-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VF1UF4-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; VF1UF4-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; VF1UF4: middle.block: -; VF1UF4-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; VF1UF4-NEXT: [[CMP_N:%.*]] = icmp eq i64 14, 12 +; VF1UF4-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VF1UF4: scalar.ph: -; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; VF1UF4-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 12, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; VF1UF4-NEXT: br label [[FOR_BODY:%.*]] ; VF1UF4: for.body: ; VF1UF4-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -416,7 +366,7 @@ ; VF1UF4-NEXT: store i64 [[V]], ptr [[B]], align 8 ; VF1UF4-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; VF1UF4-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 14 -; VF1UF4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; VF1UF4-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; VF1UF4: for.end: ; VF1UF4-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -73,7 +73,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -151,7 +151,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -231,7 +231,7 @@ ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -300,7 +300,7 @@ ; Loop with predicated load. 
define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -394,7 +394,7 @@ ; loaded value. define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -497,7 +497,7 @@ ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -600,7 +600,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -699,7 +699,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -760,7 +760,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -970,7 +970,7 @@ ; need to be removed before merging. define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: