Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -375,8 +375,9 @@
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
-  bool isMaskRequired(const Instruction *I) const {
-    return MaskedOp.contains(I);
+  bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const {
+    return MaskedOp.contains(I) ||
+           (FoldTailByMasking && FoldTailMaskedOp.contains(I));
   }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
@@ -384,8 +385,9 @@
   /// Returns all assume calls in predicated blocks. They need to be dropped
   /// when flattening the CFG.
-  const SmallPtrSetImpl &getConditionalAssumes() const {
-    return ConditionalAssumes;
+  const SmallPtrSetImpl &
+  getConditionalAssumes(bool FoldTailByMasking) const {
+    return FoldTailByMasking ? FoldTailConditionalAssumes : ConditionalAssumes;
   }
 
 private:
@@ -545,6 +547,11 @@
   /// flattened.
   SmallPtrSet ConditionalAssumes;
 
+  /// Same as MaskedOp above when folding tail by masking.
+  SmallPtrSet FoldTailMaskedOp;
+  /// Same as ConditionalAssumes above when folding tail by masking.
+  SmallPtrSet FoldTailConditionalAssumes;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1424,14 +1424,11 @@
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet SafePointers;
 
-  SmallPtrSet TmpMaskedOp;
-  SmallPtrSet TmpConditionalAssumes;
-
   // Check and mark all blocks for predication, including those that ordinarily
   // do not need predication such as the header block.
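Note on the legality change above: isMaskRequired() now distinguishes operations that always need a mask (MaskedOp, populated for blocks that were already predicated in the original loop) from operations that only need one when the tail is folded by masking (FoldTailMaskedOp, populated by prepareToFoldTailByMasking). A minimal stand-in sketch of that two-set query, using std containers and an opaque Instruction type rather than the LLVM classes:

    #include <unordered_set>

    struct Instruction; // opaque stand-in for llvm::Instruction

    struct LegalityMaskSketch {
      // Ops predicated in the original loop: always need a mask.
      std::unordered_set<const Instruction *> MaskedOp;
      // Ops that only need a mask because every block becomes predicated
      // when the tail is folded by masking.
      std::unordered_set<const Instruction *> FoldTailMaskedOp;

      bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const {
        return MaskedOp.count(I) != 0 ||
               (FoldTailByMasking && FoldTailMaskedOp.count(I) != 0);
      }
    };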
for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, FoldTailMaskedOp, + FoldTailConditionalAssumes)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } @@ -1439,10 +1436,6 @@ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - return true; } Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -179,6 +179,43 @@ }; }; +struct VectorizationScheme { + bool FoldTailByMasking; + ElementCount Width; + + VectorizationScheme(bool F, ElementCount W) + : FoldTailByMasking(F), Width(W) {} + + bool operator==(const VectorizationScheme &rhs) const { + return Width == rhs.Width && FoldTailByMasking == rhs.FoldTailByMasking; + } + bool operator!=(const VectorizationScheme &rhs) const { + return !(*this == rhs); + } +}; +template <> struct DenseMapInfo { + static inline VectorizationScheme getEmptyKey() { + return {false, ElementCount::getScalable(~0U)}; + } + static inline VectorizationScheme getTombstoneKey() { + return {false, ElementCount::getFixed(~0U - 1)}; + } + static unsigned getHashValue(const VectorizationScheme &EltCnt) { + unsigned HashVal = EltCnt.Width.getKnownMinValue() * 37U; + if (EltCnt.FoldTailByMasking) + HashVal -= 2; + if (EltCnt.Width.isScalable()) + return (HashVal - 1U); + + return HashVal; + } + + static bool isEqual(const VectorizationScheme &LHS, + const VectorizationScheme &RHS) { + return LHS == RHS; + } +}; + /// TODO: The following VectorizationFactor was pulled out of /// LoopVectorizationCostModel class. LV also deals with /// VectorizerParams::VectorizationFactor and VectorizationCostTy. @@ -186,8 +223,8 @@ /// Information about vectorization costs. struct VectorizationFactor { - /// Vector width with best cost. - ElementCount Width; + /// FoldTailByMasking and Vector width. + VectorizationScheme Scheme; /// Cost of the loop with that width. InstructionCost Cost; @@ -199,17 +236,20 @@ /// to runtime checks. ElementCount MinProfitableTripCount; - VectorizationFactor(ElementCount Width, InstructionCost Cost, + VectorizationFactor(bool FoldTailByMasking, ElementCount Width, + InstructionCost Cost, InstructionCost ScalarCost) + : Scheme(FoldTailByMasking, Width), Cost(Cost), ScalarCost(ScalarCost) {} + VectorizationFactor(VectorizationScheme Sc, InstructionCost Cost, InstructionCost ScalarCost) - : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} + : Scheme(Sc), Cost(Cost), ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0, 0}; + return {false, ElementCount::getFixed(1), 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { - return Width == rhs.Width && Cost == rhs.Cost; + return Scheme == rhs.Scheme && Cost == rhs.Cost; } bool operator!=(const VectorizationFactor &rhs) const { @@ -304,7 +344,7 @@ VectorizationFactor planInVPlanNativePath(ElementCount UserVF); /// Return the best VPlan for \p VF. 
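The DenseMapInfo specialization for VectorizationScheme added above exists so that a (FoldTailByMasking, Width) pair can key the planner's and cost model's DenseMaps directly. A small usage sketch, assuming the VectorizationScheme struct and its DenseMapInfo specialization from this patch are in scope; the CostEntry map name is illustrative only:

    #include "llvm/ADT/DenseMap.h"
    #include "llvm/Support/TypeSize.h"
    #include <cassert>

    using namespace llvm;

    void demoSchemeKeys() {
      DenseMap<VectorizationScheme, unsigned> CostEntry; // illustrative map
      ElementCount VF = ElementCount::getFixed(4);

      // The folded and non-folded variants of the same width are distinct
      // keys, so each caches its own analysis results.
      CostEntry[{/*FoldTailByMasking=*/false, VF}] = 10;
      CostEntry[{/*FoldTailByMasking=*/true, VF}] = 12;

      assert(CostEntry.size() == 2);
    }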
- VPlan &getBestPlanFor(ElementCount VF) const; + VPlan &getBestPlanFor(VectorizationScheme VF) const; /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. @@ -321,9 +361,10 @@ /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. - bool hasPlanWithVF(ElementCount VF) const { - return any_of(VPlans, - [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); + bool hasPlanWithVF(bool FoldTailByMasking, ElementCount VF) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return Plan->foldTailByMasking() == FoldTailByMasking && Plan->hasVF(VF); + }); } /// Test a \p Predicate on a \p Range of VF's. Return the value of applying @@ -351,13 +392,14 @@ /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. VPlanPtr - buildVPlanWithVPRecipes(VFRange &Range, + buildVPlanWithVPRecipes(bool FoldTailByMasking, VFRange &Range, SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(bool FoldTailByMasking, ElementCount MinVF, + ElementCount MaxVF); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -500,7 +500,8 @@ /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more /// complex control flow around the loops. - virtual std::pair createVectorizedLoopSkeleton(); + virtual std::pair + createVectorizedLoopSkeleton(VPlan &Plan); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -563,7 +564,7 @@ /// bypass block and the end value on the edge from bypass to this loop. PHINode *createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &ID, - ArrayRef BypassBlocks, + ArrayRef BypassBlocks, VPlan &Plan, std::pair AdditionalBypass = {nullptr, nullptr}); protected: @@ -611,7 +612,7 @@ Value *getOrCreateTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); + Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock, VPlan &Plan); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector <-> vector types. @@ -620,7 +621,7 @@ /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. - void emitIterationCountCheck(BasicBlock *Bypass); + void emitIterationCountCheck(BasicBlock *Bypass, VPlan &Plan); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. 
Returns the block containing the checks or @@ -643,12 +644,13 @@ /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( + VPlan &Plan, std::pair AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and /// running the verifier. Return the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(); + BasicBlock *completeLoopSkeleton(VPlan &Plan); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -832,15 +834,16 @@ // Override this function to handle the more complex control flow around the // three loops. - std::pair createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair + createVectorizedLoopSkeleton(VPlan &Plan) final { + return createEpilogueVectorizedLoopSkeleton(Plan); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -868,7 +871,8 @@ EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -898,7 +902,8 @@ } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1163,15 +1168,19 @@ CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); +/// VectorizationSchemeComparator creates a total ordering for +/// VectorizationScheme for the purposes of using it in a set structure. +struct VectorizationSchemeComparator { + bool operator()(const VectorizationScheme &LHS, + const VectorizationScheme &RHS) const { + return std::make_tuple(LHS.FoldTailByMasking, LHS.Width.isScalable(), + LHS.Width.getKnownMinValue()) < + std::make_tuple(RHS.FoldTailByMasking, RHS.Width.isScalable(), + RHS.Width.getKnownMinValue()); } }; -using ElementCountSet = SmallSet; +using ElementCountSet = + SmallSet; using InstructionVFPair = std::pair; @@ -1219,10 +1228,11 @@ /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. 
- bool selectUserVectorizationFactor(ElementCount UserVF) { - collectUniformsAndScalars(UserVF); - collectInstsToScalarize(UserVF); - return expectedCost(UserVF).first.isValid(); + bool selectUserVectorizationFactor(bool FoldTailByMasking, + ElementCount UserVF) { + collectUniformsAndScalars(FoldTailByMasking, UserVF); + collectInstsToScalarize(FoldTailByMasking, UserVF); + return expectedCost({FoldTailByMasking, UserVF}).first.isValid(); } /// \return The size (in bits) of the smallest and widest types in the code @@ -1234,7 +1244,8 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. - unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); + unsigned selectInterleaveCount(VectorizationScheme VF, + InstructionCost LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1243,7 +1254,7 @@ /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(ElementCount VF); + void setCostBasedWideningDecision(bool FoldTailByMasking, ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1259,7 +1270,7 @@ /// \return Returns information about the register usages of the loop for the /// given vectorization factors. SmallVector - calculateRegisterUsage(ArrayRef VFs); + calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1288,7 +1299,8 @@ /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + bool isProfitableToScalarize(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { assert(VF.isVector() && "Profitable to scalarize relevant only for VF > 1."); @@ -1297,14 +1309,15 @@ if (EnableVPlanNativePath) return false; - auto Scalars = InstsToScalarize.find(VF); + auto Scalars = InstsToScalarize.find({FoldTailByMasking, VF}); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return Scalars->second.contains(I); } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + bool isUniformAfterVectorization(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { // Pseudo probe needs to be duplicated for each unrolled iteration and // vector lane so that profiled loop trip count can be accurately // accumulated instead of being under counted. @@ -1319,14 +1332,15 @@ if (EnableVPlanNativePath) return false; - auto UniformsPerVF = Uniforms.find(VF); + auto UniformsPerVF = Uniforms.find({FoldTailByMasking, VF}); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); return UniformsPerVF->second.count(I); } /// Returns true if \p I is known to be scalar after vectorization. 
- bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + bool isScalarAfterVectorization(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { if (VF.isScalar()) return true; @@ -1335,7 +1349,7 @@ if (EnableVPlanNativePath) return false; - auto ScalarsPerVF = Scalars.find(VF); + auto ScalarsPerVF = Scalars.find({FoldTailByMasking, VF}); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); return ScalarsPerVF->second.count(I); @@ -1343,10 +1357,11 @@ /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + bool canTruncateToMinimalBitwidth(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { return VF.isVector() && MinBWs.contains(I) && - !isProfitableToScalarize(I, VF) && - !isScalarAfterVectorization(I, VF); + !isProfitableToScalarize(I, FoldTailByMasking, VF) && + !isScalarAfterVectorization(I, FoldTailByMasking, VF); } /// Decision that was taken during cost calculation for memory instruction. @@ -1361,26 +1376,33 @@ /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + void setWideningDecision(Instruction *I, bool FoldTailByMasking, + ElementCount VF, InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. void setWideningDecision(const InterleaveGroup *Grp, - ElementCount VF, InstWidening W, - InstructionCost Cost) { + bool FoldTailByMasking, ElementCount VF, + InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { if (auto *I = Grp->getMember(i)) { if (Grp->getInsertPos() == I) - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, Cost); else - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, 0); } } } @@ -1388,14 +1410,16 @@ /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { + InstWidening getWideningDecision(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. 
if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = + std::make_pair(I, VectorizationScheme(FoldTailByMasking, VF)); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1404,9 +1428,11 @@ /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + InstructionCost getWideningCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = + std::make_pair(I, VectorizationScheme(FoldTailByMasking, VF)); assert(WideningDecisions.contains(InstOnVF) && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1440,18 +1466,18 @@ /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(ElementCount VF); + void collectInstsToScalarize(bool FoldTailByMasking, ElementCount VF); - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. - void collectUniformsAndScalars(ElementCount VF) { + /// Collect Uniform and Scalar values for the given \p FoldTailByMasking and + /// \p VF. The sets depend on CM decision for Load/Store instructions that may + /// be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(bool FoldTailByMasking, ElementCount VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.contains(VF)) + if (VF.isScalar() || Uniforms.contains({FoldTailByMasking, VF})) return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); + setCostBasedWideningDecision(FoldTailByMasking, VF); + collectLoopUniforms(FoldTailByMasking, VF); + collectLoopScalars(FoldTailByMasking, VF); } /// Returns true if the target machine supports masked store operation @@ -1513,29 +1539,32 @@ /// for which our chosen predication strategy is scalarization (i.e. we /// don't have an alternate strategy such as masking available). /// \p VF is the vectorization factor that will be used to vectorize \p I. - bool isScalarWithPredication(Instruction *I, ElementCount VF) const; + bool isScalarWithPredication(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const; /// Returns true if \p I is an instruction that needs to be predicated /// at runtime. The result is independent of the predication mechanism. /// Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) const; + bool isPredicatedInst(bool FoldTailByMasking, Instruction *I) const; /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable /// vectors); second is for the safe-divisor strategy. std::pair - getDivRemSpeculationCost(Instruction *I, + getDivRemSpeculationCost(Instruction *I, bool FoldTailByMasking, ElementCount VF) const; /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. 
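With the change above, a widening decision is cached per (instruction, scheme) instead of per (instruction, VF), so the same memory access can receive a different decision and cost for the folded and non-folded plans of one width. A stand-in sketch of that keying with std containers and illustrative names, not the LLVM types:

    #include <map>
    #include <utility>

    struct Instruction; // opaque stand-in for llvm::Instruction

    enum InstWidening { CM_Unknown, CM_Widen, CM_Scalarize, CM_GatherScatter };

    // (FoldTailByMasking, known-minimum VF) stands in for VectorizationScheme.
    using SchemeKey = std::pair<bool, unsigned>;
    // Decision plus its cost, cached per (instruction, scheme).
    using DecisionList = std::map<std::pair<const Instruction *, SchemeKey>,
                                  std::pair<InstWidening, int>>;

    InstWidening getWideningDecisionSketch(const DecisionList &Decisions,
                                           const Instruction *I, bool FoldTail,
                                           unsigned VF) {
      auto It = Decisions.find({I, {FoldTail, VF}});
      return It == Decisions.end() ? CM_Unknown : It->second.first;
    }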
- bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); + bool memoryInstructionCanBeWidened(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); + bool interleavedAccessCanBeWidened(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1567,9 +1596,9 @@ } /// Returns the TailFoldingStyle that is best for the current loop. - TailFoldingStyle - getTailFoldingStyle(bool IVUpdateMayOverflow = true) const { - if (!CanFoldTailByMasking) + TailFoldingStyle getTailFoldingStyle(bool FoldTailByMasking, + bool IVUpdateMayOverflow = true) const { + if (!FoldTailByMasking) return TailFoldingStyle::None; if (ForceTailFoldingStyle.getNumOccurrences()) @@ -1578,17 +1607,14 @@ return TTI.getPreferredTailFoldingStyle(IVUpdateMayOverflow); } - /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { - return getTailFoldingStyle() != TailFoldingStyle::None; + bool shouldGenerateNonFoldedVFs() const { + return !MayFoldTailByMasking || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } - /// Returns true if the instructions in this block requires predication - /// for any reason, e.g. because tail folding now requires a predicate - /// or because the block in the original loop was predicated. - bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { - return foldTailByMasking() || Legal->blockNeedsPredication(BB); - } + /// Returns true if all loop blocks should be masked to fold tail loop. + bool mayFoldTailByMasking() const { return MayFoldTailByMasking; } /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a @@ -1616,7 +1642,7 @@ /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + InstructionCost getVectorCallCost(CallInst *CI, bool FoldTailByMasking, ElementCount VF, Function **Variant, bool *NeedsMask = nullptr) const; @@ -1676,17 +1702,18 @@ /// will add a pair(Instruction*, ElementCount) to \p Invalid for /// each instruction that has an Invalid cost for the given VF. VectorizationCostTy - expectedCost(ElementCount VF, + expectedCost(VectorizationScheme VF, SmallVectorImpl *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + VectorizationCostTy getInstructionCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - InstructionCost getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy); + InstructionCost getInstructionCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF, Type *&VectorTy); /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. 
@@ -1695,20 +1722,28 @@ TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. - InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); + InstructionCost getMemoryInstructionCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost computation for scalarized memory instruction. - InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost computation for interleaving group of memory instructions. - InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); + InstructionCost getInterleaveGroupCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost computation for Gather/Scatter instruction. - InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); + InstructionCost getGatherScatterCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. - InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. @@ -1718,12 +1753,15 @@ /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. - InstructionCost getScalarizationOverhead(Instruction *I, ElementCount VF, + InstructionCost getScalarizationOverhead(Instruction *I, + bool FoldTailByMasking, + ElementCount VF, TTI::TargetCostKind CostKind) const; /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. - bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); + bool useEmulatedMaskMemRefHack(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated @@ -1750,25 +1788,25 @@ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool CanFoldTailByMasking = false; + bool MayFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1788,6 +1826,7 @@ /// Currently, only single-use chains are considered for scalarization. 
InstructionCost computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, + bool FoldTailByMasking, ElementCount VF); /// Collect the instructions that are uniform after vectorization. An @@ -1799,7 +1838,7 @@ /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(ElementCount VF); + void collectLoopUniforms(bool FoldTailByMasking, ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized @@ -1808,18 +1847,18 @@ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by /// VF values in the vectorized loop, each corresponding to an iteration of /// the original scalar loop. - void collectLoopScalars(ElementCount VF); + void collectLoopScalars(bool FoldTailByMasking, ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. - using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, ElementCount VF) const { + bool needsExtract(Value *V, bool FoldTailByMasking, ElementCount VF) const { Instruction *I = dyn_cast(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) @@ -1831,14 +1870,18 @@ // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. - return !Scalars.contains(VF) || !isScalarAfterVectorization(I, VF); + return !Scalars.contains({FoldTailByMasking, VF}) || + !isScalarAfterVectorization(I, FoldTailByMasking, VF); }; /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, + bool FoldTailByMasking, ElementCount VF) const { - return SmallVector(make_filter_range( - Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); + return SmallVector( + make_filter_range(Ops, [this, FoldTailByMasking, VF](Value *V) { + return this->needsExtract(V, FoldTailByMasking, VF); + })); } /// Determines if we have the infrastructure to vectorize loop \p L and its @@ -2898,8 +2941,8 @@ return TripCount; } -Value * -InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock, + VPlan &Plan) { if (VectorTripCount) return VectorTripCount; @@ -2918,7 +2961,7 @@ // exit, with the last early-exit vector comparison also producing all-true. // For scalable vectors the VF is not guaranteed to be a power of 2, but this // is accounted for in emitIterationCountCheck that adds an overflow check. 
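For context on the tail-folding hunk that follows: with no scalar remainder loop, the vector trip count is the original trip count rounded up to a multiple of VF * UF, and the final partial iteration runs with the extra lanes masked off. A small arithmetic sketch of that rounding; the function name is illustrative:

    #include <cassert>
    #include <cstdint>

    // Round TC up to a multiple of Step = VF * UF; every iteration of the
    // rounded count runs in the vector loop, with trailing lanes masked off.
    uint64_t roundedVectorTripCount(uint64_t TC, uint64_t VFTimesUF) {
      assert(VFTimesUF != 0 && "step must be non-zero");
      uint64_t Remainder = TC % VFTimesUF;
      return Remainder == 0 ? TC : TC + (VFTimesUF - Remainder);
    }

    // Example: TC = 10, VF * UF = 4 gives a vector trip count of 12; the last
    // two lanes of the final iteration are disabled by the mask.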
- if (Cost->foldTailByMasking()) { + if (Plan.foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); @@ -2980,7 +3023,8 @@ return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } -void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { +void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass, + VPlan &Plan) { Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. @@ -3011,7 +3055,7 @@ Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; - TailFoldingStyle Style = Cost->getTailFoldingStyle(); + TailFoldingStyle Style = Cost->getTailFoldingStyle(Plan.foldTailByMasking()); if (Style == TailFoldingStyle::None) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); @@ -3161,9 +3205,10 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &II, - ArrayRef BypassBlocks, + ArrayRef BypassBlocks, VPlan &Plan, std::pair AdditionalBypass) { - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + Value *VectorTripCount = + getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); assert(VectorTripCount && "Expected valid arguments"); Instruction *OldInduction = Legal->getPrimaryInduction(); @@ -3219,7 +3264,7 @@ } void InnerLoopVectorizer::createInductionResumeValues( - std::pair AdditionalBypass) { + VPlan &Plan, std::pair AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); @@ -3234,15 +3279,16 @@ PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, LoopBypassBlocks, AdditionalBypass); + OrigPhi, II, LoopBypassBlocks, Plan, AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(VPlan &Plan) { // The trip counts should be cached by now. Value *Count = getOrCreateTripCount(LoopVectorPreHeader); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + Value *VectorTripCount = + getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3254,7 +3300,7 @@ // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { + if (!Cost->requiresScalarEpilogue(VF) && !Plan.foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3275,7 +3321,7 @@ } std::pair -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton(VPlan &Plan) { /* In this function we generate a new loop. 
The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3317,7 +3363,7 @@ // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitIterationCountCheck(LoopScalarPreHeader); + emitIterationCountCheck(LoopScalarPreHeader, Plan); // Generate the code to check any assumptions that we've made for SCEV // expressions. @@ -3329,9 +3375,9 @@ emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(); + createInductionResumeValues(Plan); - return {completeLoopSkeleton(), nullptr}; + return {completeLoopSkeleton(Plan), nullptr}; } // Fix up external users of the induction variable. At this point, we are @@ -3456,7 +3502,8 @@ } InstructionCost LoopVectorizationCostModel::getVectorCallCost( - CallInst *CI, ElementCount VF, Function **Variant, bool *NeedsMask) const { + CallInst *CI, bool FoldTailByMasking, ElementCount VF, Function **Variant, + bool *NeedsMask) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3481,7 +3528,7 @@ // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. InstructionCost ScalarizationCost = - getScalarizationOverhead(CI, VF, CostKind); + getScalarizationOverhead(CI, FoldTailByMasking, VF, CostKind); InstructionCost Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; @@ -3511,7 +3558,7 @@ // We don't support masked function calls yet, but we can scalarize a // masked call with branches (unless VF is scalable). - if (!TLI || CI->isNoBuiltin() || !VecFunc || Legal->isMaskRequired(CI)) + if (!TLI || CI->isNoBuiltin() || !VecFunc || Legal->isMaskRequired(FoldTailByMasking, CI)) return VF.isScalable() ? InstructionCost::getInvalid() : Cost; // If the corresponding vector cost is cheaper, return its cost. @@ -3732,10 +3779,11 @@ // Fix-up external users of the induction variables. for (const auto &Entry : Legal->getInductionVars()) - fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), - IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader(), Plan); + fixupIVUsers( + Entry.first, Entry.second, + getOrCreateVectorTripCount(VectorLoop->getLoopPreheader(), Plan), + IVEndValues[Entry.first], LoopMiddleBlock, VectorLoop->getHeader(), + Plan); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated @@ -3935,7 +3983,7 @@ // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { + if (State.Plan->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; @@ -4228,18 +4276,21 @@ return Cost->useOrderedReductions(RdxDesc); } -void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopScalars(bool FoldTailByMasking, + ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. 
- assert(VF.isVector() && !Scalars.contains(VF) && + assert(VF.isVector() && !Scalars.contains({FoldTailByMasking, VF}) && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning // since that would result in generation of scalarized code during execution, // which is not supported for scalable vectors. if (VF.isScalable()) { - Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); + Scalars[{FoldTailByMasking, VF}].insert( + Uniforms[{FoldTailByMasking, VF}].begin(), + Uniforms[{FoldTailByMasking, VF}].end()); return; } @@ -4256,7 +4307,8 @@ // memory access is not a gather or scatter operation. The value operand of a // store will remain scalar if the store is scalarized. auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { - InstWidening WideningDecision = getWideningDecision(MemAccess, VF); + InstWidening WideningDecision = + getWideningDecision(MemAccess, FoldTailByMasking, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); if (auto *Store = dyn_cast(MemAccess)) @@ -4309,7 +4361,8 @@ // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. - Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); + Worklist.insert(Uniforms[{FoldTailByMasking, VF}].begin(), + Uniforms[{FoldTailByMasking, VF}].end()); // (2) Add to the worklist all bitcast and getelementptr instructions used by // memory accesses requiring a scalar use. The pointer operands of loads and @@ -4334,7 +4387,7 @@ // Insert the forced scalars. // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. - auto ForcedScalar = ForcedScalars.find(VF); + auto ForcedScalar = ForcedScalars.find({FoldTailByMasking, VF}); if (ForcedScalar != ForcedScalars.end()) for (auto *I : ForcedScalar->second) { LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); @@ -4370,7 +4423,7 @@ // If tail-folding is applied, the primary induction variable will be used // to feed a vector compare. - if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) + if (Ind == Legal->getPrimaryInduction() && FoldTailByMasking) continue; // Returns true if \p Indvar is a pointer induction that is used directly by @@ -4412,12 +4465,12 @@ << "\n"); } - Scalars[VF].insert(Worklist.begin(), Worklist.end()); + Scalars[{FoldTailByMasking, VF}].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::isScalarWithPredication( - Instruction *I, ElementCount VF) const { - if (!isPredicatedInst(I)) + Instruction *I, bool FoldTailByMasking, ElementCount VF) const { + if (!isPredicatedInst(FoldTailByMasking, I)) return false; // Do we have a non-scalar lowering for this predicated @@ -4445,14 +4498,23 @@ // We have the option to use the safe-divisor idiom to avoid predication. // The cost based decision here will always select safe-divisor for // scalable vectors as scalarization isn't legal. 
- const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + const auto [ScalarCost, SafeDivisorCost] = + getDivRemSpeculationCost(I, FoldTailByMasking, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); } } } -bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { - if (!blockNeedsPredicationForAnyReason(I->getParent())) +static bool blockNeedsPredicationForAnyReason(bool FoldTailByMasking, + LoopVectorizationLegality *Legal, + BasicBlock *BB) { + return FoldTailByMasking || Legal->blockNeedsPredication(BB); +} + +bool LoopVectorizationCostModel::isPredicatedInst(bool FoldTailByMasking, + Instruction *I) const { + if (!blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, + I->getParent())) return false; // Can we prove this instruction is safe to unconditionally execute? @@ -4462,7 +4524,7 @@ return false; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) + if (!Legal->isMaskRequired(FoldTailByMasking, I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we @@ -4489,13 +4551,14 @@ // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); case Instruction::Call: - return Legal->isMaskRequired(I); + return Legal->isMaskRequired(FoldTailByMasking, I); } } std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, - ElementCount VF) const { + bool FoldTailByMasking, + ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || @@ -4525,7 +4588,8 @@ // The cost of insertelement and extractelement instructions needed for // scalarization. - ScalarizationCost += getScalarizationOverhead(I, VF, CostKind); + ScalarizationCost += + getScalarizationOverhead(I, FoldTailByMasking, VF, CostKind); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally @@ -4559,9 +4623,9 @@ } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, bool FoldTailByMasking, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); - assert(getWideningDecision(I, VF) == CM_Unknown && + assert(getWideningDecision(I, FoldTailByMasking, VF) == CM_Unknown && "Decision should not be set yet."); auto *Group = getInterleavedAccessGroup(I); assert(Group && "Must have a group."); @@ -4600,8 +4664,9 @@ // (either a gap at the end of a load-access that may result in a speculative // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = - blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, + I->getParent()) && + Legal->isMaskRequired(FoldTailByMasking, I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -4628,7 +4693,7 @@ } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // Get and ensure we have a valid memory instruction. assert((isa(I)) && "Invalid memory instruction"); @@ -4641,7 +4706,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. 
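The static blockNeedsPredicationForAnyReason helper introduced above captures when an instruction's block is predicated: always under tail folding, otherwise only when the legality analysis marked the block. A minimal stand-in sketch with an opaque BasicBlock type and a plain set instead of the legality object:

    #include <unordered_set>

    struct BasicBlock; // opaque stand-in for llvm::BasicBlock

    struct PredicationSketch {
      // Blocks the legality analysis already marked as needing predication.
      std::unordered_set<const BasicBlock *> BlocksNeedingPredication;

      bool blockNeedsPredicationForAnyReason(bool FoldTailByMasking,
                                             const BasicBlock *BB) const {
        // Under tail folding every block is predicated.
        return FoldTailByMasking || BlocksNeedingPredication.count(BB) != 0;
      }
    };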
- if (isScalarWithPredication(I, VF)) + if (isScalarWithPredication(I, FoldTailByMasking, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -4653,18 +4718,19 @@ return true; } -void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopUniforms(bool FoldTailByMasking, + ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && !Uniforms.contains(VF) && + assert(VF.isVector() && !Uniforms.contains({FoldTailByMasking, VF}) && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll // not analyze again. Uniforms.count(VF) will return 1. - Uniforms[VF].clear(); + Uniforms[{FoldTailByMasking, VF}].clear(); // We now know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after @@ -4692,7 +4758,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I, VF)) { + if (isScalarWithPredication(I, FoldTailByMasking, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -4722,7 +4788,8 @@ }; auto isUniformDecision = [&](Instruction *I, ElementCount VF) { - InstWidening WideningDecision = getWideningDecision(I, VF); + InstWidening WideningDecision = + getWideningDecision(I, FoldTailByMasking, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4874,7 +4941,7 @@ addToWorklistIfAllowed(IndUpdate); } - Uniforms[VF].insert(Worklist.begin(), Worklist.end()); + Uniforms[{FoldTailByMasking, VF}].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::runtimeChecksRequired() { @@ -5101,12 +5168,13 @@ case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: - [[fallthrough]]; + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating " + "predicated vector loop.\n"); + break; case CM_ScalarEpilogueNotNeededUsePredicate: - LLVM_DEBUG( - dbgs() << "LV: vector predicate hint/switch found.\n" - << "LV: Not allowing scalar epilogue, creating predicated " - << "vector loop.\n"); + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Trying predicated vector loop.\n"); break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize @@ -5184,7 +5252,7 @@ // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - CanFoldTailByMasking = true; + MayFoldTailByMasking = true; return MaxFactors; } @@ -5284,10 +5352,10 @@ // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorElementCount). - SmallVector VFs; + SmallVector VFs; for (ElementCount VS = MaxVectorElementCount * 2; ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) - VFs.push_back(VS); + VFs.push_back({FoldTailByMasking, VS}); // For each VF calculate its register usage. 
auto RUs = calculateRegisterUsage(VFs); @@ -5302,7 +5370,7 @@ Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].Width; break; } } @@ -5342,35 +5410,51 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && foldTailByMasking() && + if (!A.Scheme.Width.isScalable() && !B.Scheme.Width.isScalable() && MaxTripCount) { - // If we are folding the tail and the trip count is a known (possibly small) - // constant, the trip count will be rounded up to an integer number of - // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), - // which we compare directly. When not folding the tail, the total cost will - // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is - // approximated with the per-lane cost below instead of using the tripcount - // as here. - auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); - auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // PerIterationCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be PerIterationCost*floor(TC/VF) + Scalar remainder cost, where + // the scalar cost is approximated using the correct fraction of the vector + // cost. + auto RTCostA = + A.Scheme.Width.getFixedValue() + ? (CostA * divideCeil(MaxTripCount, A.Scheme.Width.getFixedValue())) + : (CostA * MaxTripCount) / A.Scheme.Width.getFixedValue(); + auto RTCostB = + B.Scheme.Width.getFixedValue() + ? (CostB * divideCeil(MaxTripCount, B.Scheme.Width.getFixedValue())) + : (CostB * MaxTripCount) / B.Scheme.Width.getFixedValue(); + + if (A.Scheme.FoldTailByMasking && !B.Scheme.FoldTailByMasking) + return RTCostA <= RTCostB; + return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. - unsigned EstimatedWidthA = A.Width.getKnownMinValue(); - unsigned EstimatedWidthB = B.Width.getKnownMinValue(); + unsigned EstimatedWidthA = A.Scheme.Width.getKnownMinValue(); + unsigned EstimatedWidthB = B.Scheme.Width.getKnownMinValue(); if (std::optional VScale = getVScaleForTuning()) { - if (A.Width.isScalable()) + if (A.Scheme.Width.isScalable()) EstimatedWidthA *= *VScale; - if (B.Width.isScalable()) + if (B.Scheme.Width.isScalable()) EstimatedWidthB *= *VScale; } + // If one plan is predicated and the other is not, opt for the predicated + // scheme on a tie. + if (A.Scheme.FoldTailByMasking && !B.Scheme.FoldTailByMasking) + return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. 
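To make the isMoreProfitable comparison above concrete: when the maximum trip count is a known constant, each candidate is charged its per-iteration cost times the rounded-up number of vector iterations, and a tail-folded candidate wins ties because it needs no scalar remainder. A worked sketch with illustrative names and fixed widths only:

    #include <cstdint>

    struct CandidateSketch {
      bool FoldTailByMasking;
      uint64_t Width; // fixed VF
      uint64_t Cost;  // cost of one vector iteration
    };

    bool isMoreProfitableSketch(const CandidateSketch &A,
                                const CandidateSketch &B,
                                uint64_t MaxTripCount) {
      uint64_t RTCostA = A.Cost * ((MaxTripCount + A.Width - 1) / A.Width);
      uint64_t RTCostB = B.Cost * ((MaxTripCount + B.Width - 1) / B.Width);
      if (A.FoldTailByMasking && !B.FoldTailByMasking)
        return RTCostA <= RTCostB; // prefer the predicated plan on a tie
      return RTCostA < RTCostB;
    }

    // Example: MaxTripCount = 10, both widths 4, per-iteration costs 6 and 6:
    // both round up to 3 iterations (total 18), so the tail-folded plan wins.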
- if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + if (A.Scheme.Width.isScalable() && !B.Scheme.Width.isScalable()) + return (CostA * B.Scheme.Width.getFixedValue()) <= + (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -5398,8 +5482,8 @@ sort(InvalidCosts, [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { if (Numbering[A.first] != Numbering[B.first]) return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); + VectorizationSchemeComparator ECC; + return ECC({false, A.second}, {false, B.second}); }); // For a list of ordered instruction-vf pairs: @@ -5444,14 +5528,15 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + expectedCost({false, ElementCount::getFixed(1)}).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(VFCandidates.count(ElementCount::getFixed(1)) && + assert(VFCandidates.count({false, ElementCount::getFixed(1)}) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, - ExpectedCost); + const VectorizationFactor ScalarCost(false, ElementCount::getFixed(1), + ExpectedCost, ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5465,7 +5550,7 @@ SmallVector InvalidCosts; for (const auto &i : VFCandidates) { // The cost for scalar VF=1 is already calculated, so ignore it. - if (i.isScalar()) + if (i.Width.isScalar()) continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); @@ -5476,12 +5561,13 @@ if (std::optional VScale = getVScaleForTuning()) AssumedMinimumVscale = *VScale; unsigned Width = - Candidate.Width.isScalable() - ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: " + Candidate.Scheme.Width.isScalable() + ? Candidate.Scheme.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Scheme.Width.getFixedValue(); + LLVM_DEBUG(dbgs() << "LV: " << (i.FoldTailByMasking ? 
"Tail folded " : "") + << "Vector loop of width " << i.Width << " costs: " << Candidate.Cost << " => " << (Candidate.Cost / Width)); - if (i.isScalable()) + if (i.Width.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); LLVM_DEBUG(dbgs() << ".\n"); @@ -5489,7 +5575,7 @@ if (!C.second && !ForceVectorization) { LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i + dbgs() << "LV: Not considering vector loop of width " << i.Width << " because it will not generate any vector instructions.\n"); continue; } @@ -5511,11 +5597,16 @@ ChosenFactor = ScalarCost; } - LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + LLVM_DEBUG({ + if (ForceVectorization && !ChosenFactor.Scheme.Width.isScalar() && + !isMoreProfitable(ChosenFactor, ScalarCost)) + dbgs() << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"; + }); + LLVM_DEBUG( + dbgs() << "LV: Selecting " + << (ChosenFactor.Scheme.FoldTailByMasking ? "Tail folded " : "") + << "VF: " << ChosenFactor.Scheme.Width << ".\n"); return ChosenFactor; } @@ -5600,8 +5691,8 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0, 0}; + if (LVP.hasPlanWithVF(false, ForcedEC)) + return {false, ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5635,16 +5726,16 @@ } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Scheme.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF)) && + (Result.Scheme.Width.isScalar() || isMoreProfitable(NextVF, Result)) && + LVP.hasPlanWithVF(false, NextVF.Scheme.Width)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width << "\n";); + << Result.Scheme.Width << "\n";); return Result; } @@ -5724,7 +5815,7 @@ } unsigned -LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +LoopVectorizationCostModel::selectInterleaveCount(VectorizationScheme VF, InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5740,7 +5831,8 @@ // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - if (!isScalarEpilogueAllowed()) + if (!isScalarEpilogueAllowed() || + TheLoop->getHeader()->getParent()->hasOptSize()) return 1; // We used the distance for the interleave count. @@ -5755,7 +5847,8 @@ // because with the above conditions interleaving can expose ILP and break // cross iteration dependences for reductions. 
if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) + !(InterleaveSmallLoopScalarReduction && HasReductions && + VF.Width.isScalar())) return 1; // If we did not calculate the cost for VF (because the user selected the VF) @@ -5794,7 +5887,7 @@ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -5818,10 +5911,10 @@ } // Clamp the interleave ranges to reasonable counts. - unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF); + unsigned MaxInterleaveCount = TTI.getMaxInterleaveFactor(VF.Width); // Check if the user has overridden the max. - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -5840,8 +5933,8 @@ // the InterleaveCount as if vscale is '1', although if some information about // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Width.getKnownMinValue(), + MaxInterleaveCount); // Make sure MaxInterleaveCount is greater than 0. MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } @@ -5861,7 +5954,7 @@ // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF.isVector() && HasReductions) { + if (VF.Width.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5871,17 +5964,19 @@ // vectorized the loop we will have done the runtime check and so interleaving // won't require further checks. bool ScalarInterleavingRequiresPredication = - (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + (VF.Width.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); })); bool ScalarInterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF.Width.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF << '\n'); + << "LV: VF is " << VF.Width << '\n' + << "LV: Fold Tail is " + << (VF.FoldTailByMasking ? "true" : "false") << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); if (!ScalarInterleavingRequiresRuntimePointerCheck && @@ -5947,7 +6042,7 @@ // If there are scalar reductions and TTI has enabled aggressive // interleaving for reductions, we will interleave to expose ILP. 
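The clamping of the interleave count a few lines above can be pictured with a small helper (names are illustrative, and a best-known trip count is assumed to be available): with a trip count of 20 and at least 4 lanes per vector iteration, the count is capped at 5 and never drops below 1.

#include <algorithm>

static unsigned clampInterleaveCount(unsigned MaxIC, unsigned BestKnownTC,
                                     unsigned MinLanesPerIteration) {
  // Do not run more interleaved copies than the trip count can fill.
  MaxIC = std::min(BestKnownTC / MinLanesPerIteration, MaxIC);
  // Keep the count sane for very small trip counts.
  return std::max(1u, MaxIC);
}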
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + if (InterleaveSmallLoopScalarReduction && VF.Width.isScalar() && AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC @@ -5971,7 +6066,8 @@ } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage( + ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -6084,7 +6180,7 @@ // there is no previous entry for ClassID. SmallMapVector RegUsage; - if (VFs[j].isScalar()) { + if (VFs[j].Width.isScalar()) { for (auto *Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); @@ -6093,12 +6189,13 @@ RegUsage[ClassID] += 1; } } else { - collectUniformsAndScalars(VFs[j]); + collectUniformsAndScalars(VFs[j].FoldTailByMasking, VFs[j].Width); for (auto *Inst : OpenIntervals) { // Skip ignored values for VF > 1. if (VecValuesToIgnore.count(Inst)) continue; - if (isScalarAfterVectorization(Inst, VFs[j])) { + if (isScalarAfterVectorization(Inst, VFs[j].FoldTailByMasking, + VFs[j].Width)) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); // FIXME: The target might use more than one register for the type @@ -6107,7 +6204,7 @@ } else { unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j].Width); } } } @@ -6137,17 +6234,18 @@ bool IsScalar = all_of(Inst->users(), [&](User *U) { auto *I = cast(U); return TheLoop != LI->getLoopFor(I->getParent()) || - isScalarAfterVectorization(I, VFs[i]); + isScalarAfterVectorization(I, VFs[i].FoldTailByMasking, + VFs[i].Width); }); - ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i]; + ElementCount VF = IsScalar ? ElementCount::getFixed(1) : VFs[i].Width; unsigned ClassID = TTI.getRegisterClassForType(VF.isVector(), Inst->getType()); Invariant[ClassID] += GetRegUsage(Inst->getType(), VF); } LLVM_DEBUG({ - dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): VF = " << VFs[i].Width << '\n'; dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() << " item\n"; for (const auto &pair : MaxUsages[i]) { @@ -6172,8 +6270,8 @@ return RUs; } -bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, - ElementCount VF) { +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such @@ -6182,25 +6280,26 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. 
- assert((isPredicatedInst(I)) && + assert(isPredicatedInst(FoldTailByMasking, I) && "Expecting a scalar emulated instruction"); return isa(I) || (isa(I) && NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(bool FoldTailByMasking, + ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. - if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains(VF)) + if (VF.isScalar() || VF.isZero() || InstsToScalarize.contains({FoldTailByMasking, VF})) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's // not profitable to scalarize any instructions, the presence of VF in the // map will indicate that we've analyzed it already. - ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + ScalarCostsTy &ScalarCostsVF = InstsToScalarize[{FoldTailByMasking, VF}]; PredicatedBBsAfterVectorization[VF].clear(); @@ -6208,17 +6307,19 @@ // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockNeedsPredicationForAnyReason(BB)) + if (!blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, BB)) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I, VF)) { + if (isScalarWithPredication(&I, FoldTailByMasking, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && - computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + if (!VF.isScalable() && + !useEmulatedMaskMemRefHack(&I, FoldTailByMasking, VF) && + computePredInstDiscount(&I, ScalarCosts, FoldTailByMasking, VF) >= + 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization[VF].insert(BB); @@ -6227,8 +6328,9 @@ } InstructionCost LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { - assert(!isUniformAfterVectorization(PredInst, VF) && + Instruction *PredInst, ScalarCostsTy &ScalarCosts, bool FoldTailByMasking, + ElementCount VF) { + assert(!isUniformAfterVectorization(PredInst, FoldTailByMasking, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6248,12 +6350,12 @@ // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - isScalarAfterVectorization(I, VF)) + isScalarAfterVectorization(I, FoldTailByMasking, VF)) return false; // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (isScalarWithPredication(I, VF)) + if (isScalarWithPredication(I, FoldTailByMasking, VF)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6268,7 +6370,7 @@ // the lane zero values for uniforms rather than asserting. 
for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (isUniformAfterVectorization(J, VF)) + if (isUniformAfterVectorization(J, FoldTailByMasking, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6288,7 +6390,8 @@ // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - InstructionCost VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = + getInstructionCost(I, FoldTailByMasking, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the @@ -6296,12 +6399,14 @@ // computing the scalarization overhead. InstructionCost ScalarCost = VF.getFixedValue() * - getInstructionCost(I, ElementCount::getFixed(1)).first; + getInstructionCost(I, FoldTailByMasking, ElementCount::getFixed(1)) + .first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, FoldTailByMasking, VF) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ true, @@ -6320,7 +6425,7 @@ "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) { + else if (needsExtract(J, FoldTailByMasking, VF)) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), /*Insert*/ false, @@ -6342,7 +6447,7 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost( - ElementCount VF, SmallVectorImpl *Invalid) { + VectorizationScheme VF, SmallVectorImpl *Invalid) { VectorizationCostTy Cost; // For each block. @@ -6353,10 +6458,11 @@ for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. if (ValuesToIgnore.count(&I) || - (VF.isVector() && VecValuesToIgnore.count(&I))) + (VF.Width.isVector() && VecValuesToIgnore.count(&I))) continue; - VectorizationCostTy C = getInstructionCost(&I, VF); + VectorizationCostTy C = + getInstructionCost(&I, VF.FoldTailByMasking, VF.Width); // Check if we should override the cost. if (C.first.isValid() && @@ -6365,12 +6471,12 @@ // Keep a list of instructions with invalid costs. if (Invalid && !C.first.isValid()) - Invalid->emplace_back(&I, VF); + Invalid->emplace_back(&I, VF.Width); BlockCost.first += C.first; BlockCost.second |= C.second; LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first - << " for VF " << VF << " For instruction: " << I + << " for VF " << VF.Width << " For instruction: " << I << '\n'); } @@ -6381,7 +6487,7 @@ // the predicated block, if it is an if-else block. Thus, scale the block's // cost by the probability of executing it. blockNeedsPredication from // Legal is used so as to not include all blocks in tail folded loops. 
- if (VF.isScalar() && Legal->blockNeedsPredication(BB)) + if (VF.Width.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -6426,9 +6532,8 @@ Legal->hasStride(I->getOperand(1)); } -InstructionCost -LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getMemInstScalarizationCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { assert(VF.isVector() && "Scalarization cost of instruction implies vectorization."); if (VF.isScalable()) @@ -6461,12 +6566,12 @@ // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF, CostKind); + Cost += getScalarizationOverhead(I, FoldTailByMasking, VF, CostKind); // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (isPredicatedInst(FoldTailByMasking, I)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6477,7 +6582,7 @@ /*Insert=*/false, /*Extract=*/true, CostKind); Cost += TTI.getCFInstrCost(Instruction::Br, CostKind); - if (useEmulatedMaskMemRefHack(I, VF)) + if (useEmulatedMaskMemRefHack(I, FoldTailByMasking, VF)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. Cost = 3000000; @@ -6486,9 +6591,8 @@ return Cost; } -InstructionCost -LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6500,7 +6604,7 @@ "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (Legal->isMaskRequired(FoldTailByMasking, I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6544,9 +6648,8 @@ CostKind, VF.getKnownMinValue() - 1)); } -InstructionCost -LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getGatherScatterCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6554,13 +6657,13 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(FoldTailByMasking, I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } -InstructionCost -LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // TODO: Once we have support for interleaving with scalable vectors // we can calculate the cost properly here. 
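The scaling of a predicated, scalarized access above follows a simple shape, sketched below under the usual assumption that a predicated block executes roughly every other iteration (a reciprocal probability of 2); the helper name and parameters are illustrative only.

#include <cstdint>

static uint64_t predicatedScalarizationCost(uint64_t ScalarizedBodyCost,
                                            uint64_t ExtractAndBranchCost,
                                            uint64_t ReciprocalBlockProb = 2) {
  // The scalarized body only runs when the lane's predicate is true ...
  uint64_t Cost = ScalarizedBodyCost / ReciprocalBlockProb;
  // ... but the i1 extract and the conditional branch are always paid for.
  return Cost + ExtractAndBranchCost;
}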
if (VF.isScalable()) @@ -6589,11 +6692,12 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(FoldTailByMasking, I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(FoldTailByMasking, I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, @@ -6775,9 +6879,8 @@ return I == RetI ? std::optional(BaseCost) : std::nullopt; } -InstructionCost -LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getMemoryInstructionCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. if (VF.isScalar()) { @@ -6790,33 +6893,36 @@ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, TTI::TCK_RecipThroughput, OpInfo, I); } - return getWideningCost(I, VF); + return getWideningCost(I, FoldTailByMasking, VF); } LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, + bool FoldTailByMasking, ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. - if (isUniformAfterVectorization(I, VF)) + if (isUniformAfterVectorization(I, FoldTailByMasking, VF)) VF = ElementCount::getFixed(1); - if (VF.isVector() && isProfitableToScalarize(I, VF)) - return VectorizationCostTy(InstsToScalarize[VF][I], false); + if (VF.isVector() && isProfitableToScalarize(I, FoldTailByMasking, VF)) + return VectorizationCostTy(InstsToScalarize[{FoldTailByMasking, VF}][I], + false); // Forced scalars do not have any scalarization overhead. - auto ForcedScalar = ForcedScalars.find(VF); + auto ForcedScalar = ForcedScalars.find({FoldTailByMasking, VF}); if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * + (getInstructionCost(I, FoldTailByMasking, ElementCount::getFixed(1)) + .first * VF.getKnownMinValue()), false); } Type *VectorTy; - InstructionCost C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, FoldTailByMasking, VF, VectorTy); bool TypeNotScalarized = false; if (VF.isVector() && VectorTy->isVectorTy()) { @@ -6837,7 +6943,8 @@ } InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( - Instruction *I, ElementCount VF, TTI::TargetCostKind CostKind) const { + Instruction *I, bool FoldTailByMasking, ElementCount VF, + TTI::TargetCostKind CostKind) const { // There is no mechanism yet to create a scalable scalarization loop, // so this is currently Invalid. @@ -6871,13 +6978,15 @@ // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
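Conceptually, the per-VF caches queried above are now keyed on the pair of fold-tail flag and width rather than on the width alone; a plain std::map with a pair key, used here purely for illustration, shows the idea.

#include <cstdint>
#include <map>
#include <utility>

using SchemeKey = std::pair<bool, unsigned>; // {FoldTailByMasking, lanes}

static std::map<SchemeKey, uint64_t> CachedCosts;

static void rememberCost(bool FoldTail, unsigned Lanes, uint64_t Cost) {
  // The same width analysed with and without tail folding gets two entries.
  CachedCosts[{FoldTail, Lanes}] = Cost;
}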
SmallVector Tys; - for (auto *V : filterExtractingOperands(Ops, VF)) + for (auto *V : filterExtractingOperands(Ops, FoldTailByMasking, VF)) Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), Tys, CostKind); + filterExtractingOperands(Ops, FoldTailByMasking, VF), Tys, + CostKind); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { +void LoopVectorizationCostModel::setCostBasedWideningDecision( + bool FoldTailByMasking, ElementCount VF) { if (VF.isScalar()) return; NumPredStores = 0; @@ -6892,7 +7001,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa(&I) && isScalarWithPredication(&I, VF)) + if (isa(&I) && + isScalarWithPredication(&I, FoldTailByMasking, VF)) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -6905,7 +7015,7 @@ // stores. Note that even with tail folding we know that at least // one lane is active (i.e. generalized predication is not possible // here), and the logic below depends on this fact. - if (!foldTailByMasking()) + if (!FoldTailByMasking) return true; // For scalable vectors, a uniform memop load is always @@ -6920,8 +7030,9 @@ }; const InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I, VF) ? - getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); + isLegalGatherOrScatter(&I, VF) + ? getGatherScatterCost(&I, FoldTailByMasking, VF) + : InstructionCost::getInvalid(); // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract @@ -6934,22 +7045,25 @@ // costs compare as maximumal large. If both are invalid, we get // scalable invalid which signals a failure and a vectorization abort. if (GatherScatterCost < ScalarizationCost) - setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); + setWideningDecision(&I, FoldTailByMasking, VF, CM_GatherScatter, + GatherScatterCost); else - setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); + setWideningDecision(&I, FoldTailByMasking, VF, CM_Scalarize, + ScalarizationCost); continue; } // We assume that widening is the best solution when possible. - if (memoryInstructionCanBeWidened(&I, VF)) { - InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); + if (memoryInstructionCanBeWidened(&I, FoldTailByMasking, VF)) { + InstructionCost Cost = + getConsecutiveMemOpCost(&I, FoldTailByMasking, VF); int ConsecutiveStride = Legal->isConsecutivePtr( getLoadStoreType(&I), getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; - setWideningDecision(&I, VF, Decision, Cost); + setWideningDecision(&I, FoldTailByMasking, VF, Decision, Cost); continue; } @@ -6961,21 +7075,21 @@ assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. - if (getWideningDecision(&I, VF) != CM_Unknown) + if (getWideningDecision(&I, FoldTailByMasking, VF) != CM_Unknown) continue; NumAccesses = Group->getNumMembers(); - if (interleavedAccessCanBeWidened(&I, VF)) - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, FoldTailByMasking, VF)) + InterleaveCost = getInterleaveGroupCost(&I, FoldTailByMasking, VF); } InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I, VF) - ? getGatherScatterCost(&I, VF) * NumAccesses + ? 
getGatherScatterCost(&I, FoldTailByMasking, VF) * NumAccesses : InstructionCost::getInvalid(); InstructionCost ScalarizationCost = - getMemInstScalarizationCost(&I, VF) * NumAccesses; + getMemInstScalarizationCost(&I, FoldTailByMasking, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. @@ -6996,9 +7110,9 @@ // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. if (auto Group = getInterleavedAccessGroup(&I)) - setWideningDecision(Group, VF, Decision, Cost); + setWideningDecision(Group, FoldTailByMasking, VF, Decision, Cost); else - setWideningDecision(&I, VF, Decision, Cost); + setWideningDecision(&I, FoldTailByMasking, VF, Decision, Cost); } } @@ -7017,7 +7131,7 @@ Instruction *PtrDef = dyn_cast_or_null(getLoadStorePointerOperand(&I)); if (PtrDef && TheLoop->contains(PtrDef) && - getWideningDecision(&I, VF) != CM_GatherScatter) + getWideningDecision(&I, FoldTailByMasking, VF) != CM_GatherScatter) AddrDefs.insert(PtrDef); } @@ -7039,45 +7153,46 @@ // by cost functions, but since this involves the task of finding out // if the loaded register is involved in an address computation, it is // instead changed here when we know this is the case. - InstWidening Decision = getWideningDecision(I, VF); + InstWidening Decision = getWideningDecision(I, FoldTailByMasking, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. setWideningDecision( - I, VF, CM_Scalarize, + I, FoldTailByMasking, VF, CM_Scalarize, (VF.getKnownMinValue() * - getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + getMemoryInstructionCost(I, FoldTailByMasking, + ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) setWideningDecision( - Member, VF, CM_Scalarize, + Member, FoldTailByMasking, VF, CM_Scalarize, (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + getMemoryInstructionCost(Member, FoldTailByMasking, + ElementCount::getFixed(1)))); } } } else // Make sure I gets scalarized and a cost estimate without // scalarization overhead. 
- ForcedScalars[VF].insert(I); + ForcedScalars[{FoldTailByMasking, VF}].insert(I); } } -InstructionCost -LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy) { +InstructionCost LoopVectorizationCostModel::getInstructionCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); - if (canTruncateToMinimalBitwidth(I, VF)) + if (canTruncateToMinimalBitwidth(I, FoldTailByMasking, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto hasSingleCopyAfterVectorization = [this](Instruction *I, - ElementCount VF) -> bool { + auto hasSingleCopyAfterVectorization = + [this](Instruction *I, bool FoldTailByMasking, ElementCount VF) -> bool { if (VF.isScalar()) return true; - auto Scalarized = InstsToScalarize.find(VF); + auto Scalarized = InstsToScalarize.find({FoldTailByMasking, VF}); assert(Scalarized != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return !Scalarized->second.count(I) && @@ -7088,7 +7203,7 @@ }; (void) hasSingleCopyAfterVectorization; - if (isScalarAfterVectorization(I, VF)) { + if (isScalarAfterVectorization(I, FoldTailByMasking, VF)) { // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is // because the VF is either 1, or any instructions that need scalarizing @@ -7098,7 +7213,7 @@ I->getOpcode() == Instruction::PHI || (I->getOpcode() == Instruction::BitCast && I->getType()->isPointerTy()) || - hasSingleCopyAfterVectorization(I, VF)); + hasSingleCopyAfterVectorization(I, FoldTailByMasking, VF)); VectorTy = RetTy; } else VectorTy = ToVectorTy(RetTy, VF); @@ -7172,8 +7287,9 @@ case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - if (VF.isVector() && isPredicatedInst(I)) { - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + if (VF.isVector() && isPredicatedInst(FoldTailByMasking, I)) { + const auto [ScalarCost, SafeDivisorCost] = + getDivRemSpeculationCost(I, FoldTailByMasking, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 
ScalarCost : SafeDivisorCost; } @@ -7257,7 +7373,7 @@ case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); - if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) + if (canTruncateToMinimalBitwidth(Op0AsInstruction, FoldTailByMasking, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, @@ -7268,16 +7384,17 @@ case Instruction::Load: { ElementCount Width = VF; if (Width.isVector()) { - InstWidening Decision = getWideningDecision(I, Width); + InstWidening Decision = getWideningDecision(I, FoldTailByMasking, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); - if (getWideningCost(I, VF) == InstructionCost::getInvalid()) + if (getWideningCost(I, FoldTailByMasking, VF) == + InstructionCost::getInvalid()) return InstructionCost::getInvalid(); if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getLoadStoreType(I), Width); - return getMemoryInstructionCost(I, VF); + return getMemoryInstructionCost(I, FoldTailByMasking, VF); } case Instruction::BitCast: if (I->getType()->isPointerTy()) @@ -7302,15 +7419,16 @@ if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; - switch (getWideningDecision(I, VF)) { + switch (getWideningDecision(I, FoldTailByMasking, VF)) { case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return Legal->isMaskRequired(FoldTailByMasking, I) + ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -7351,7 +7469,7 @@ Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; - if (canTruncateToMinimalBitwidth(I, VF)) { + if (canTruncateToMinimalBitwidth(I, FoldTailByMasking, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". @@ -7377,7 +7495,7 @@ return *RedCost; Function *Variant; CallInst *CI = cast(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, &Variant); + InstructionCost CallCost = getVectorCallCost(CI, FoldTailByMasking, VF, &Variant); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); @@ -7513,7 +7631,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + return {false, VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7530,7 +7648,8 @@ return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. 
- if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && + if (blockNeedsPredicationForAnyReason(CM.mayFoldTailByMasking(), Legal, + OrigLoop->getHeader()) && !useMaskedInterleavedAccesses(*TTI)) { LLVM_DEBUG( dbgs() @@ -7551,12 +7670,13 @@ "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (CM.selectUserVectorizationFactor(UserVF)) { + // TODOD: Should this compare tail folded vs non-tail-folded? + if (CM.selectUserVectorizationFactor(CM.mayFoldTailByMasking(), UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(CM.mayFoldTailByMasking(), UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + return {{CM.mayFoldTailByMasking(), UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7564,26 +7684,48 @@ // Populate the set of Vectorization Factor Candidates. ElementCountSet VFCandidates; - for (auto VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); - for (auto VF = ElementCount::getScalable(1); - ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); + VFCandidates.insert({false, ElementCount::getFixed(1)}); + if (CM.shouldGenerateNonFoldedVFs()) { + for (auto VF = ElementCount::getFixed(2); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert({false, VF}); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert({false, VF}); + } + if (CM.mayFoldTailByMasking()) { + for (auto VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert({true, VF}); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert({true, VF}); + } for (const auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); + CM.collectUniformsAndScalars(VF.FoldTailByMasking, VF.Width); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF.isVector()) - CM.collectInstsToScalarize(VF); + if (VF.Width.isVector()) + CM.collectInstsToScalarize(VF.FoldTailByMasking, VF.Width); } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(false, ElementCount::getFixed(1), + CM.shouldGenerateNonFoldedVFs() + ? MaxFactors.FixedVF + : ElementCount::getFixed(1)); + if (CM.shouldGenerateNonFoldedVFs()) + buildVPlansWithVPRecipes(false, ElementCount::getScalable(1), + MaxFactors.ScalableVF); + if (CM.mayFoldTailByMasking()) { + buildVPlansWithVPRecipes(true, ElementCount::getFixed(1), + MaxFactors.FixedVF); + buildVPlansWithVPRecipes(true, ElementCount::getScalable(1), + MaxFactors.ScalableVF); + } LLVM_DEBUG(printPlans(dbgs())); if (!MaxFactors.hasVector()) @@ -7591,18 +7733,22 @@ // Select the optimal vectorization factor. 
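Reduced to fixed widths only, the candidate enumeration above behaves roughly like the sketch below (simplified types; scalable widths follow the same power-of-two pattern): the scalar loop is always a candidate, plain vector schemes are added when non-folded VFs are wanted, and tail-folded schemes are added when masking the tail is possible.

#include <set>
#include <utility>

using Scheme = std::pair<bool, unsigned>; // {FoldTailByMasking, fixed VF}

static std::set<Scheme> enumerateCandidates(unsigned MaxFixedVF,
                                            bool MayFoldTail,
                                            bool WantNonFoldedVFs) {
  std::set<Scheme> Candidates;
  Candidates.insert({false, 1}); // the scalar loop is always considered
  if (WantNonFoldedVFs)
    for (unsigned VF = 2; VF <= MaxFixedVF; VF *= 2)
      Candidates.insert({false, VF});
  if (MayFoldTail)
    for (unsigned VF = 1; VF <= MaxFixedVF; VF *= 2)
      Candidates.insert({true, VF});
  return Candidates;
}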
VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + assert((VF.Scheme.Width.isScalar() || VF.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); return VF; } -VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { +VPlan &LoopVectorizationPlanner::getBestPlanFor(VectorizationScheme VF) const { assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && + [VF](const VPlanPtr &Plan) { + return Plan->hasVF(VF.Width) && + Plan->foldTailByMasking() == VF.FoldTailByMasking; + }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { - if (Plan->hasVF(VF)) + if (Plan->hasVF(VF.Width) && + Plan->foldTailByMasking() == VF.FoldTailByMasking) return *Plan.get(); } llvm_unreachable("No plan found!"); @@ -7652,8 +7798,9 @@ assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + LLVM_DEBUG(dbgs() << "Executing best plan with TailFold=" + << (BestVPlan.foldTailByMasking() ? "true" : "false") + << ", VF=" << BestVF << ", UF=" << BestUF << '\n'); // Workaround! Compute the trip count of the original loop and cache it // before we start modifying the CFG. This code has a systemic problem @@ -7674,7 +7821,7 @@ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(BestVPlan); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -7706,7 +7853,7 @@ // 2. Copy and widen instructions from the old loop into the new loop. BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr, BestVPlan), CanonicalIVStartValue, State, IsEpilogueVectorization); @@ -7762,7 +7909,7 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(VPlan &Plan) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7790,14 +7937,14 @@ emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. - EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); // Skip induction resume value creation here because they will be created in // the second pass for the scalar loop. The induction resume values for the // inductions in the epilogue loop are created before executing the plan for // the epilogue loop. - return {completeLoopSkeleton(), nullptr}; + return {completeLoopSkeleton(Plan), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7880,7 +8027,8 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
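Since a given width can now exist in both a folded and a non-folded plan, looking up the best plan has to match on both properties; the stub types below are a simplified sketch of that lookup, not the VPlan classes themselves.

#include <vector>

struct PlanStub {
  bool FoldTailByMasking;
  std::vector<unsigned> Widths; // the VFs this plan was built for
  bool hasVF(unsigned VF) const {
    for (unsigned W : Widths)
      if (W == VF)
        return true;
    return false;
  }
};

static const PlanStub *findPlan(const std::vector<PlanStub> &Plans,
                                bool FoldTail, unsigned VF) {
  for (const PlanStub &P : Plans)
    if (P.FoldTailByMasking == FoldTail && P.hasVF(VF))
      return &P; // the planner expects exactly one such match
  return nullptr;
}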
std::pair -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + VPlan &Plan) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7978,10 +8126,11 @@ // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues({VecEpilogueIterationCountCheck, + createInductionResumeValues(Plan, + {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(), EPResumeVal}; + return {completeLoopSkeleton(Plan), EPResumeVal}; } BasicBlock * @@ -8119,14 +8268,15 @@ VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) + if (!blockNeedsPredicationForAnyReason(Plan.foldTailByMasking(), Legal, + BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - assert(CM.foldTailByMasking() && "must fold the tail"); + assert(Plan.foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. - TailFoldingStyle TFStyle = CM.getTailFoldingStyle(); + TailFoldingStyle TFStyle = CM.getTailFoldingStyle(Plan.foldTailByMasking()); if (useActiveLaneMaskForControlFlow(TFStyle)) return BlockMaskCache[BB] = Plan.getActiveLaneMaskPhi(); @@ -8179,13 +8329,13 @@ auto willWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, VF); + CM.getWideningDecision(I, Plan->foldTailByMasking(), VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); if (Decision == LoopVectorizationCostModel::CM_Interleave) return true; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) + if (CM.isScalarAfterVectorization(I, Plan->foldTailByMasking(), VF) || + CM.isProfitableToScalarize(I, Plan->foldTailByMasking(), VF)) return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -8194,13 +8344,13 @@ return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), I)) Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, Range.Start); + CM.getWideningDecision(I, Plan->foldTailByMasking(), Range.Start); bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; @@ -8222,9 +8372,10 @@ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { // Returns true if an instruction \p I should be scalarized instead of // vectorized for the chosen vectorization factor. 
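The header mask built when the tail is folded has a simple scalar model, sketched below: lane L of a vector iteration starting at index IV is active exactly when IV + L is still within the scalar trip count, which corresponds to the per-lane compare against trip-count minus one visible in the updated test output further down.

#include <cstdint>

// Scalar model of the tail-folding header mask; assumes TripCount >= 1.
static bool laneIsActive(uint64_t IV, uint64_t Lane, uint64_t TripCount) {
  return IV + Lane <= TripCount - 1;
}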
- auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF); + auto ShouldScalarizeInstruction = [&CM, &Plan](Instruction *I, + ElementCount VF) { + return CM.isScalarAfterVectorization(I, Plan.foldTailByMasking(), VF) || + CM.isProfitableToScalarize(I, Plan.foldTailByMasking(), VF); }; bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8266,7 +8417,8 @@ Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { - return CM.isScalarAfterVectorization(Phi, VF); + return CM.isScalarAfterVectorization( + Phi, Plan.foldTailByMasking(), VF); }, Range)); } @@ -8351,8 +8503,8 @@ VFRange &Range, VPlanPtr &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); + [this, CI, &Plan](ElementCount VF) { + return CM.isScalarWithPredication(CI, Plan->foldTailByMasking(), VF); }, Range); @@ -8376,7 +8528,7 @@ // Is it beneficial to perform intrinsic call compared to lib // call? InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, &Variant); + CM.getVectorCallCost(CI, Plan->foldTailByMasking(), VF, &Variant); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; @@ -8406,7 +8558,7 @@ // finds a valid variant. if (Variant) return false; - CM.getVectorCallCost(CI, VF, &Variant, &NeedsMask); + CM.getVectorCallCost(CI, Plan->foldTailByMasking(), VF, &Variant, &NeedsMask); // If we found a valid vector variant at this VF, then store the VF // in case we need to generate a mask. if (Variant) @@ -8441,15 +8593,16 @@ return nullptr; } -bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { +bool VPRecipeBuilder::shouldWiden(Instruction *I, bool FoldTailByMasking, + VFRange &Range) const { assert(!isa(I) && !isa(I) && !isa(I) && !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. - auto WillScalarize = [this, I](ElementCount VF) -> bool { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || - CM.isScalarWithPredication(I, VF); + auto WillScalarize = [this, I, FoldTailByMasking](ElementCount VF) -> bool { + return CM.isScalarAfterVectorization(I, FoldTailByMasking, VF) || + CM.isProfitableToScalarize(I, FoldTailByMasking, VF) || + CM.isScalarWithPredication(I, FoldTailByMasking, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8467,7 +8620,7 @@ case Instruction::URem: { // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. 
- if (CM.isPredicatedInst(I)) { + if (CM.isPredicatedInst(Plan->foldTailByMasking(), I)) { SmallVector Ops(Operands.begin(), Operands.end()); VPValue *Mask = createBlockInMask(I->getParent(), *Plan); VPValue *One = @@ -8530,10 +8683,12 @@ VFRange &Range, VPlan &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { + return CM.isUniformAfterVectorization(I, Plan.foldTailByMasking(), VF); + }, Range); - bool IsPredicated = CM.isPredicatedInst(I); + bool IsPredicated = CM.isPredicatedInst(Plan.foldTailByMasking(), I); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -8651,7 +8806,7 @@ if (isa(Instr) || isa(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); - if (!shouldWiden(Instr, Range)) + if (!shouldWiden(Instr, Plan->foldTailByMasking(), Range)) return nullptr; if (auto GEP = dyn_cast(Instr)) @@ -8666,7 +8821,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(bool FoldTailByMasking, + ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -8675,13 +8831,14 @@ // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); + auto &ConditionalAssumes = Legal->getConditionalAssumes(FoldTailByMasking); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back(buildVPlanWithVPRecipes(SubRange, DeadInstructions)); + VPlans.push_back( + buildVPlanWithVPRecipes(FoldTailByMasking, SubRange, DeadInstructions)); VF = SubRange.End; } } @@ -8813,7 +8970,8 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + bool FoldTailByMasking, VFRange &Range, + SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8847,10 +9005,11 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 - CM.getWideningDecision(IG->getInsertPos(), VF) == - LoopVectorizationCostModel::CM_Interleave); + auto applyIG = [IG, this, &FoldTailByMasking](ElementCount VF) -> bool { + return ( + VF.isVector() && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), FoldTailByMasking, VF) == + LoopVectorizationCostModel::CM_Interleave); }; if (!getDecisionAndClampRange(applyIG, Range)) continue; @@ -8869,7 +9028,7 @@ // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. 
VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + auto Plan = std::make_unique(Preheader, FoldTailByMasking); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); @@ -8891,7 +9050,7 @@ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - CM.getTailFoldingStyle(IVUpdateMayOverflow)); + CM.getTailFoldingStyle(FoldTailByMasking, IVUpdateMayOverflow)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9080,7 +9239,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(false)); return Plan; } @@ -9133,7 +9292,8 @@ VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { + if (blockNeedsPredicationForAnyReason(Plan->foldTailByMasking(), Legal, + R->getParent())) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); @@ -9178,7 +9338,7 @@ // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { + if (Plan->foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9867,16 +10027,16 @@ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Scheme.Width, + VF.Scheme.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.executePlan(VF.Scheme.Width, 1, BestPlan, LB, DT, false); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -9940,7 +10100,7 @@ // When interleaving only scalar and vector cost will be equal, which in turn // would lead to a divide by 0. Fall back to hard threshold. - if (VF.Width.isScalar()) { + if (VF.Scheme.Width.isScalar()) { if (CheckCost > VectorizeMemoryCheckThreshold) { LLVM_DEBUG( dbgs() @@ -9983,8 +10143,8 @@ // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = VF.Width.getKnownMinValue(); - if (VF.Width.isScalable()) { + unsigned IntVF = VF.Scheme.Width.getKnownMinValue(); + if (VF.Scheme.Width.isScalable()) { unsigned AssumedMinimumVscale = 1; if (VScale) AssumedMinimumVscale = *VScale; @@ -10211,13 +10371,14 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. 
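For the profitability arithmetic above, a scalable width is flattened to a plain lane count by folding in an assumed vscale; the helper below is an illustrative sketch of that conversion, not code from the patch.

#include <optional>

static unsigned estimatedLanes(unsigned KnownMinLanes, bool IsScalable,
                               std::optional<unsigned> VScaleForTuning) {
  if (!IsScalable)
    return KnownMinLanes;
  // "vscale x 4" with a tuning value of 2 is treated as 8 lanes; without a
  // tuning value, vscale is conservatively assumed to be 1.
  return KnownMinLanes * VScaleForTuning.value_or(1);
}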
- IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Scheme, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + if (VF.Scheme.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Scheme.Width, + SelectedIC); // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = @@ -10241,7 +10402,7 @@ // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (VF.Width.isScalar()) { + if (VF.Scheme.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -10307,7 +10468,7 @@ << VecDiagMsg.second; }); } else if (VectorizeLoop && !InterleaveLoop) { - LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Scheme.Width << ") in " << DebugLocStr << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, @@ -10315,7 +10476,7 @@ << IntDiagMsg.second; }); } else if (VectorizeLoop && InterleaveLoop) { - LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Scheme.Width << ") in " << DebugLocStr << '\n'); LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } @@ -10331,8 +10492,8 @@ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); + LVP.executePlan(VF.Scheme.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10345,17 +10506,19 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { + CM.selectEpilogueVectorizationFactor(VF.Scheme.Width, LVP); + if (EpilogueVF.Scheme.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); + // TODOD: Predicated remainders + EpilogueLoopVectorizationInfo EPI(VF.Scheme.Width, IC, + EpilogueVF.Scheme.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + VPlan &BestMainPlan = LVP.getBestPlanFor({false, EPI.MainLoopVF}); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; @@ -10368,7 +10531,7 @@ ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + VPlan &BestEpiPlan = LVP.getBestPlanFor({false, EPI.EpilogueVF}); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -10377,7 +10540,8 @@ // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated // before vectorizing the epilogue loop. for (VPRecipeBase &R : Header->phis()) { - if (isa(&R)) + if (isa(&R) || + isa(&R)) continue; Value *ResumeV = nullptr; @@ -10401,7 +10565,7 @@ } ResumeV = MainILV.createInductionResumeValue( - IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); + IndPhi, *ID, {EPI.MainLoopIterationCountCheck}, BestEpiPlan); } assert(ResumeV && "Must have a resume value"); VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); @@ -10415,12 +10579,12 @@ if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, + VF.Scheme.Width, VF.MinProfitableTripCount, IC, + &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); + LVP.executePlan(VF.Scheme.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10434,7 +10598,7 @@ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), L->getHeader()) << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) + << NV("VectorizationFactor", VF.Scheme.Width) << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; }); } Index: llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -64,7 +64,8 @@ /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. - bool shouldWiden(Instruction *I, VFRange &Range) const; + bool shouldWiden(Instruction *I, bool FoldTailByMasking, + VFRange &Range) const; /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -2221,8 +2221,12 @@ /// Values used outside the plan. 
MapVector LiveOuts; + /// Whether this plan should foldTailByMasking + bool FoldTailByMasking = false; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(VPBlockBase *Entry = nullptr, bool FoldTailByMasking = false) + : Entry(Entry), FoldTailByMasking(FoldTailByMasking) { if (Entry) Entry->setPlan(this); } @@ -2405,6 +2409,8 @@ return LiveOuts; } + bool foldTailByMasking() const { return FoldTailByMasking; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -795,6 +795,8 @@ std::string Out; raw_string_ostream RSO(Out); RSO << Name << " for "; + if (FoldTailByMasking) + RSO << "Tail Folded "; if (!VFs.empty()) { RSO << "VF={" << VFs[0]; for (ElementCount VF : drop_begin(VFs)) Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS: VPlan 'Initial VPlan for Tail Folded VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { ; VPLANS-NEXT: Live-in vp<[[TC:%[0-9]+]]> = original trip-count ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -20,25 +20,37 @@ ; NONE: vector.ph: ; NONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() ; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]] -; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] -; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 -; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; NONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; NONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; NONE-NEXT: [[TMP6:%.*]] = sub i64 [[TMP5]], 1 +; NONE-NEXT: [[N_RND_UP:%.*]] = add i64 [[UMAX]], [[TMP6]] +; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP3]] +; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; NONE-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[UMAX]], 1 +; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 +; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; NONE-NEXT: [[BROADCAST_SPLATINSERT4:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 +; NONE-NEXT: [[BROADCAST_SPLAT5:%.*]] = shufflevector [[BROADCAST_SPLATINSERT4]], poison, zeroinitializer ; NONE-NEXT: br label [[VECTOR_BODY:%.*]] ; NONE: vector.body: -; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] 
-; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] -; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VECTOR_BODY]] ] +; NONE-NEXT: [[TMP7:%.*]] = add i64 [[INDEX1]], 0 +; NONE-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[INDEX1]], i64 0 +; NONE-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; NONE-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; NONE-NEXT: [[TMP9:%.*]] = add zeroinitializer, [[TMP8]] +; NONE-NEXT: [[VEC_IV:%.*]] = add [[BROADCAST_SPLAT3]], [[TMP9]] +; NONE-NEXT: [[TMP10:%.*]] = icmp ule [[VEC_IV]], [[BROADCAST_SPLAT]] +; NONE-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP7]] +; NONE-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP11]], i32 0 +; NONE-NEXT: call void @llvm.masked.store.nxv4i32.p0( [[BROADCAST_SPLAT5]], ptr [[TMP12]], i32 4, [[TMP10]]) +; NONE-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; NONE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; NONE-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX1]], [[TMP14]] +; NONE-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC]] +; NONE-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; NONE: middle.block: -; NONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] -; NONE-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] +; NONE-NEXT: br i1 true, label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; NONE: scalar.ph: ; NONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; NONE-NEXT: br label [[WHILE_BODY:%.*]] Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,11 +1,13 @@ ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT -; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=DEFAULT target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. +; When both are allowed, the VF=16 without tail folding should win out. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -8,9 +8,9 @@ define i32 @foo() { ; CHECK-LABEL: foo -; CHECK-PWR8: Executing best plan with VF=16, UF=4 +; CHECK-PWR8: Executing best plan with TailFold=false, VF=16, UF=4 -; CHECK-PWR9: Executing best plan with VF=8, UF=8 +; CHECK-PWR9: Executing best plan with TailFold=false, VF=8, UF=8 entry: @@ -46,7 +46,7 @@ ; CHECK-LABEL: goo -; CHECK: Executing best plan with VF=16, UF=4 +; CHECK: Executing best plan with TailFold=false, VF=16, UF=4 entry: br label %for.body @@ -79,7 +79,7 @@ define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar -; CHECK: Executing best plan with VF=2, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=2, UF=12 entry: br label %for.body @@ -107,7 +107,7 @@ define void @hoo(i32 %n) { ; CHECK-LABEL: hoo -; CHECK: Executing best plan with VF=1, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=1, UF=12 entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -108,11 +108,12 @@ ; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 +; CHECK-NEXT: LV: Fold Tail is false ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: @@ -240,11 +241,12 @@ ; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 1 ; CHECK-NEXT: LV: VF is vscale x 4 +; CHECK-NEXT: LV: Fold Tail is false ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -14,7 +14,9 @@ ; ; -; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata. +; This loop will be vectorized even though the trip count is below the threshold, because +; vectorization is explicitly forced in metadata. A VF of 4 is chosen as it evenly +; divides the trip count of 20, producing a lower total cost.
; define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( @@ -27,20 +29,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 20 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -52,7 +54,7 @@ ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -97,7 +97,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, 
ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -165,7 +165,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -238,7 +238,7 @@ ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -335,7 +335,7 @@ ; Test case that requires sinking a recipe in a replicate region after another replicate region. define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -408,7 +408,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ ; Check for crash exposed by D76992. 
; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -16,50 +16,27 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[VEC_IV1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[VEC_IV2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IV3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV1]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV2]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV3]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: store i32 0, ptr [[TMP4]], align 4 ; CHECK-NEXT: store i32 0, ptr [[TMP5]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; CHECK: pred.store.if4: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: store i32 0, ptr [[TMP6]], align 4 ; CHECK-NEXT: store i32 0, ptr [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] -; CHECK: pred.store.continue5: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] -; CHECK: pred.store.if6: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]] -; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] -; CHECK: pred.store.continue7: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.if8: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP10]] -; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.continue9: -; 
CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 15, 12 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 12, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -69,7 +46,7 @@ ; CHECK-NEXT: store i32 0, ptr [[DST_PTR]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: br label %for.body @@ -92,55 +69,32 @@ ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds double, ptr [[PTR1:%.*]], i64 15 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 128 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 96 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 -; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP1]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP9]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP2]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] -; CHECK: pred.store.continue10: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP11]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP3]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 15, 12 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -151,7 +105,7 @@ ; CHECK-NEXT: store double 0.000000e+00, ptr [[ADDR]], align 8 ; CHECK-NEXT: [[PTR]] = getelementptr inbounds double, ptr [[ADDR]], i64 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[PTR]], [[PTR2]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; entry: %ptr2 = getelementptr inbounds double, ptr %ptr1, i64 15 Index: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -73,7 +73,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: 
VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -151,7 +151,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -231,7 +231,7 @@ ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -300,7 +300,7 @@ ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -394,7 +394,7 @@ ; loaded value. define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -497,7 +497,7 @@ ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -600,7 +600,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -699,7 +699,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -760,7 +760,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -970,7 +970,7 @@ ; need to be removed before merging. 
define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: