Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -378,8 +378,9 @@
 
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
-  bool isMaskRequired(const Instruction *I) const {
-    return MaskedOp.contains(I);
+  bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const {
+    return MaskedOp.contains(I) ||
+           (FoldTailByMasking && FoldTailMaskedOp.contains(I));
   }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
@@ -387,8 +388,9 @@
 
   /// Returns all assume calls in predicated blocks. They need to be dropped
   /// when flattening the CFG.
-  const SmallPtrSetImpl<Instruction *> &getConditionalAssumes() const {
-    return ConditionalAssumes;
+  const SmallPtrSetImpl<Instruction *> &
+  getConditionalAssumes(bool FoldTailByMasking) const {
+    return FoldTailByMasking ? FoldTailConditionalAssumes : ConditionalAssumes;
   }
 
 private:
@@ -548,6 +550,11 @@
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;
 
+  /// Same as MaskedOp above when folding tail by masking.
+  SmallPtrSet<const Instruction *, 8> FoldTailMaskedOp;
+  /// Same as ConditionalAssumes above when folding tail by masking.
+  SmallPtrSet<Instruction *, 8> FoldTailConditionalAssumes;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1410,14 +1410,11 @@
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet<Value *, 8> SafePointers;
 
-  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
-  SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
-
   // Check and mark all blocks for predication, including those that ordinarily
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
-                              TmpConditionalAssumes)) {
+    if (!blockCanBePredicated(BB, SafePointers, FoldTailMaskedOp,
+                              FoldTailConditionalAssumes)) {
       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
       return false;
     }
@@ -1425,10 +1422,6 @@
 
   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
 
-  MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
-  ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
-                            TmpConditionalAssumes.end());
-
   return true;
 }
 
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,10 +184,47 @@
 /// VectorizerParams::VectorizationFactor and VectorizationCostTy.
 /// We need to streamline them.
+struct VectorizationScheme {
+  bool FoldTailByMasking;
+  ElementCount Width;
+
+  VectorizationScheme(bool F, ElementCount W)
+      : FoldTailByMasking(F), Width(W) {}
+
+  bool operator==(const VectorizationScheme &rhs) const {
+    return Width == rhs.Width && FoldTailByMasking == rhs.FoldTailByMasking;
+  }
+  bool operator!=(const VectorizationScheme &rhs) const {
+    return !(*this == rhs);
+  }
+};
+template <> struct DenseMapInfo<VectorizationScheme> {
+  static inline VectorizationScheme getEmptyKey() {
+    return {false, ElementCount::getScalable(~0U)};
+  }
+  static inline VectorizationScheme getTombstoneKey() {
+    return {false, ElementCount::getFixed(~0U - 1)};
+  }
+  static unsigned getHashValue(const VectorizationScheme &EltCnt) {
+    unsigned HashVal = EltCnt.Width.getKnownMinValue() * 37U;
+    if (EltCnt.FoldTailByMasking)
+      HashVal -= 2;
+    if (EltCnt.Width.isScalable())
+      return (HashVal - 1U);
+
+    return HashVal;
+  }
+
+  static bool isEqual(const VectorizationScheme &LHS,
+                      const VectorizationScheme &RHS) {
+    return LHS == RHS;
+  }
+};
+
 /// Information about vectorization costs.
 struct VectorizationFactor {
-  /// Vector width with best cost.
-  ElementCount Width;
+  /// FoldTailByMasking and Vector width.
+  VectorizationScheme Scheme;
 
   /// Cost of the loop with that width.
   InstructionCost Cost;
@@ -199,17 +236,20 @@
   /// to runtime checks.
   ElementCount MinProfitableTripCount;
 
-  VectorizationFactor(ElementCount Width, InstructionCost Cost,
+  VectorizationFactor(bool FoldTailByMasking, ElementCount Width,
+                      InstructionCost Cost, InstructionCost ScalarCost)
+      : Scheme(FoldTailByMasking, Width), Cost(Cost), ScalarCost(ScalarCost) {}
+  VectorizationFactor(VectorizationScheme Sc, InstructionCost Cost,
                       InstructionCost ScalarCost)
-      : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
+      : Scheme(Sc), Cost(Cost), ScalarCost(ScalarCost) {}
 
   /// Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() {
-    return {ElementCount::getFixed(1), 0, 0};
+    return {false, ElementCount::getFixed(1), 0, 0};
   }
 
   bool operator==(const VectorizationFactor &rhs) const {
-    return Width == rhs.Width && Cost == rhs.Cost;
+    return Scheme == rhs.Scheme && Cost == rhs.Cost;
   }
 
   bool operator!=(const VectorizationFactor &rhs) const {
@@ -304,7 +344,7 @@
   VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
 
   /// Return the best VPlan for \p VF.
-  VPlan &getBestPlanFor(ElementCount VF) const;
+  VPlan &getBestPlanFor(VectorizationScheme VF) const;
 
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected \p VF, \p UF and VPlan \p BestPlan.
@@ -321,9 +361,10 @@
 
   /// Look through the existing plans and return true if we have one with all
   /// the vectorization factors in question.
-  bool hasPlanWithVF(ElementCount VF) const {
-    return any_of(VPlans,
-                  [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
+  bool hasPlanWithVF(bool FoldTailByMasking, ElementCount VF) const {
+    return any_of(VPlans, [&](const VPlanPtr &Plan) {
+      return Plan->foldTailByMasking() == FoldTailByMasking && Plan->hasVF(VF);
+    });
   }
 
   /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
@@ -351,13 +392,15 @@
 
   /// Build a VPlan using VPRecipes according to the information gather by
   /// Legal. This method is only used for the legacy inner loop vectorizer.
VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, + bool FoldTailByMasking, VFRange &Range, + SmallPtrSetImpl &DeadInstructions, const MapVector &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(bool FoldTailByMasking, ElementCount MinVF, + ElementCount MaxVF); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -478,7 +478,8 @@ /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more /// complex control flow around the loops. - virtual std::pair createVectorizedLoopSkeleton(); + virtual std::pair + createVectorizedLoopSkeleton(VPlan &Plan); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -541,7 +542,7 @@ /// bypass block and the end value on the edge from bypass to this loop. PHINode *createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &ID, - ArrayRef BypassBlocks, + ArrayRef BypassBlocks, VPlan &Plan, std::pair AdditionalBypass = {nullptr, nullptr}); protected: @@ -589,7 +590,7 @@ Value *getOrCreateTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); + Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock, VPlan &Plan); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector <-> vector types. @@ -598,7 +599,7 @@ /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. - void emitIterationCountCheck(BasicBlock *Bypass); + void emitIterationCountCheck(BasicBlock *Bypass, VPlan &Plan); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. Returns the block containing the checks or @@ -621,12 +622,13 @@ /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( + VPlan &Plan, std::pair AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and /// running the verifier. Return the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(); + BasicBlock *completeLoopSkeleton(VPlan &Plan); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -810,15 +812,16 @@ // Override this function to handle the more complex control flow around the // three loops. 
- std::pair createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair + createVectorizedLoopSkeleton(VPlan &Plan) final { + return createEpilogueVectorizedLoopSkeleton(Plan); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -846,7 +849,8 @@ EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -876,7 +880,8 @@ } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1144,15 +1149,19 @@ CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); +/// VectorizationSchemeComparator creates a total ordering for +/// VectorizationScheme for the purposes of using it in a set structure. +struct VectorizationSchemeComparator { + bool operator()(const VectorizationScheme &LHS, + const VectorizationScheme &RHS) const { + return std::make_tuple(LHS.FoldTailByMasking, LHS.Width.isScalable(), + LHS.Width.getKnownMinValue()) < + std::make_tuple(RHS.FoldTailByMasking, RHS.Width.isScalable(), + RHS.Width.getKnownMinValue()); } }; -using ElementCountSet = SmallSet; +using ElementCountSet = + SmallSet; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1198,10 +1207,11 @@ /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. - bool selectUserVectorizationFactor(ElementCount UserVF) { - collectUniformsAndScalars(UserVF); - collectInstsToScalarize(UserVF); - return expectedCost(UserVF).first.isValid(); + bool selectUserVectorizationFactor(bool FoldTailByMasking, + ElementCount UserVF) { + collectUniformsAndScalars(FoldTailByMasking, UserVF); + collectInstsToScalarize(FoldTailByMasking, UserVF); + return expectedCost({FoldTailByMasking, UserVF}).first.isValid(); } /// \return The size (in bits) of the smallest and widest types in the code @@ -1213,7 +1223,8 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. 
- unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); + unsigned selectInterleaveCount(VectorizationScheme VF, + InstructionCost LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1222,7 +1233,7 @@ /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(ElementCount VF); + void setCostBasedWideningDecision(bool FoldTailByMasking, ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1238,7 +1249,7 @@ /// \return Returns information about the register usages of the loop for the /// given vectorization factors. SmallVector - calculateRegisterUsage(ArrayRef VFs); + calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1267,7 +1278,8 @@ /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + bool isProfitableToScalarize(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { assert(VF.isVector() && "Profitable to scalarize relevant only for VF > 1."); @@ -1276,14 +1288,15 @@ if (EnableVPlanNativePath) return false; - auto Scalars = InstsToScalarize.find(VF); + auto Scalars = InstsToScalarize.find({FoldTailByMasking, VF}); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return Scalars->second.find(I) != Scalars->second.end(); } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + bool isUniformAfterVectorization(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { if (VF.isScalar()) return true; @@ -1292,14 +1305,15 @@ if (EnableVPlanNativePath) return false; - auto UniformsPerVF = Uniforms.find(VF); + auto UniformsPerVF = Uniforms.find({FoldTailByMasking, VF}); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); return UniformsPerVF->second.count(I); } /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + bool isScalarAfterVectorization(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { if (VF.isScalar()) return true; @@ -1308,7 +1322,7 @@ if (EnableVPlanNativePath) return false; - auto ScalarsPerVF = Scalars.find(VF); + auto ScalarsPerVF = Scalars.find({FoldTailByMasking, VF}); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); return ScalarsPerVF->second.count(I); @@ -1316,10 +1330,11 @@ /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + bool canTruncateToMinimalBitwidth(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { return VF.isVector() && MinBWs.find(I) != MinBWs.end() && - !isProfitableToScalarize(I, VF) && - !isScalarAfterVectorization(I, VF); + !isProfitableToScalarize(I, FoldTailByMasking, VF) && + !isScalarAfterVectorization(I, FoldTailByMasking, VF); } /// Decision that was taken during cost calculation for memory instruction. 
@@ -1334,26 +1349,33 @@ /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + void setWideningDecision(Instruction *I, bool FoldTailByMasking, + ElementCount VF, InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. void setWideningDecision(const InterleaveGroup *Grp, - ElementCount VF, InstWidening W, - InstructionCost Cost) { + bool FoldTailByMasking, ElementCount VF, + InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { if (auto *I = Grp->getMember(i)) { if (Grp->getInsertPos() == I) - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, Cost); else - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, 0); } } } @@ -1361,14 +1383,16 @@ /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { + InstWidening getWideningDecision(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = + std::make_pair(I, VectorizationScheme(FoldTailByMasking, VF)); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1377,9 +1401,11 @@ /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + InstructionCost getWideningCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = + std::make_pair(I, VectorizationScheme(FoldTailByMasking, VF)); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1413,18 +1439,19 @@ /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(ElementCount VF); + void collectInstsToScalarize(bool FoldTailByMasking, ElementCount VF); - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. 
- void collectUniformsAndScalars(ElementCount VF) { + /// Collect Uniform and Scalar values for the given \p FoldTailByMasking and + /// \p VF. The sets depend on CM decision for Load/Store instructions that may + /// be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(bool FoldTailByMasking, ElementCount VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || + Uniforms.find({FoldTailByMasking, VF}) != Uniforms.end()) return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); + setCostBasedWideningDecision(FoldTailByMasking, VF); + collectLoopUniforms(FoldTailByMasking, VF); + collectLoopScalars(FoldTailByMasking, VF); } /// Returns true if the target machine supports masked store operation @@ -1486,29 +1513,32 @@ /// for which our chosen predication strategy is scalarization (i.e. we /// don't have an alternate strategy such as masking available). /// \p VF is the vectorization factor that will be used to vectorize \p I. - bool isScalarWithPredication(Instruction *I, ElementCount VF) const; + bool isScalarWithPredication(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const; /// Returns true if \p I is an instruction that needs to be predicated /// at runtime. The result is independent of the predication mechanism. /// Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) const; + bool isPredicatedInst(bool FoldTailByMasking, Instruction *I) const; /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable /// vectors); second is for the safe-divisor strategy. std::pair - getDivRemSpeculationCost(Instruction *I, + getDivRemSpeculationCost(Instruction *I, bool FoldTailByMasking, ElementCount VF) const; /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); + bool memoryInstructionCanBeWidened(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); + bool interleavedAccessCanBeWidened(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1539,23 +1569,22 @@ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + bool shouldGenerateNonFoldedVFs() const { + return !MayFoldTailByMasking || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } + /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + bool mayFoldTailByMasking() const { return MayFoldTailByMasking; } /// Returns true if were tail-folding and want to use the active lane mask /// for vector loop control flow. 
- bool useActiveLaneMaskForControlFlow() const { + bool useActiveLaneMaskForControlFlow(bool FoldTailByMasking) const { return FoldTailByMasking && TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; } - /// Returns true if the instructions in this block requires predication - /// for any reason, e.g. because tail folding now requires a predicate - /// or because the block in the original loop was predicated. - bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { - return foldTailByMasking() || Legal->blockNeedsPredication(BB); - } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. @@ -1582,7 +1611,8 @@ /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + InstructionCost getVectorCallCost(CallInst *CI, bool FoldTailByMasking, + ElementCount VF, bool &NeedToScalarize) const; /// Returns true if the per-lane cost of VectorizationFactor A is lower than @@ -1642,17 +1672,18 @@ /// each instruction that has an Invalid cost for the given VF. using InstructionVFPair = std::pair; VectorizationCostTy - expectedCost(ElementCount VF, + expectedCost(VectorizationScheme VF, SmallVectorImpl *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + VectorizationCostTy getInstructionCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - InstructionCost getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy); + InstructionCost getInstructionCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF, Type *&VectorTy); /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. @@ -1661,20 +1692,28 @@ TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. - InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); + InstructionCost getMemoryInstructionCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost computation for scalarized memory instruction. - InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost computation for interleaving group of memory instructions. - InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); + InstructionCost getInterleaveGroupCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost computation for Gather/Scatter instruction. - InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); + InstructionCost getGatherScatterCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. 
- InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. @@ -1685,11 +1724,13 @@ /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. InstructionCost getScalarizationOverhead(Instruction *I, + bool FoldTailByMasking, ElementCount VF) const; /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. - bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); + bool useEmulatedMaskMemRefHack(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated @@ -1715,26 +1756,27 @@ /// iterations to execute in the scalar loop. ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + /// The loop may be masked so that all blocks of loop are to be masked to fold + /// tail of scalar iterations. + bool MayFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1754,6 +1796,7 @@ /// Currently, only single-use chains are considered for scalarization. InstructionCost computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, + bool FoldTailByMasking, ElementCount VF); /// Collect the instructions that are uniform after vectorization. An @@ -1765,7 +1808,7 @@ /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(ElementCount VF); + void collectLoopUniforms(bool FoldTailByMasking, ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized @@ -1774,18 +1817,18 @@ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by /// VF values in the vectorized loop, each corresponding to an iteration of /// the original scalar loop. - void collectLoopScalars(ElementCount VF); + void collectLoopScalars(bool FoldTailByMasking, ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. 
- using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, ElementCount VF) const { + bool needsExtract(Value *V, bool FoldTailByMasking, ElementCount VF) const { Instruction *I = dyn_cast(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) @@ -1797,15 +1840,18 @@ // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. - return Scalars.find(VF) == Scalars.end() || - !isScalarAfterVectorization(I, VF); + return Scalars.find({FoldTailByMasking, VF}) == Scalars.end() || + !isScalarAfterVectorization(I, FoldTailByMasking, VF); }; /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, + bool FoldTailByMasking, ElementCount VF) const { - return SmallVector(make_filter_range( - Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); + return SmallVector( + make_filter_range(Ops, [this, FoldTailByMasking, VF](Value *V) { + return this->needsExtract(V, FoldTailByMasking, VF); + })); } /// Determines if we have the infrastructure to vectorize loop \p L and its @@ -2877,8 +2923,8 @@ return TripCount; } -Value * -InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock, + VPlan &Plan) { if (VectorTripCount) return VectorTripCount; @@ -2897,7 +2943,7 @@ // exit, with the last early-exit vector comparison also producing all-true. // For scalable vectors the VF is not guaranteed to be a power of 2, but this // is accounted for in emitIterationCountCheck that adds an overflow check. - if (Cost->foldTailByMasking()) { + if (Plan.foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); @@ -2959,7 +3005,8 @@ return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } -void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { +void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass, + VPlan &Plan) { Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. 
@@ -2990,7 +3037,7 @@ Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; - if (!Cost->foldTailByMasking()) + if (!Plan.foldTailByMasking()) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable()) { @@ -3137,9 +3184,10 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &II, - ArrayRef BypassBlocks, + ArrayRef BypassBlocks, VPlan &Plan, std::pair AdditionalBypass) { - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + Value *VectorTripCount = + getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); assert(VectorTripCount && "Expected valid arguments"); Instruction *OldInduction = Legal->getPrimaryInduction(); @@ -3195,7 +3243,7 @@ } void InnerLoopVectorizer::createInductionResumeValues( - std::pair AdditionalBypass) { + VPlan &Plan, std::pair AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); @@ -3210,15 +3258,16 @@ PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, LoopBypassBlocks, AdditionalBypass); + OrigPhi, II, LoopBypassBlocks, Plan, AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(VPlan &Plan) { // The trip counts should be cached by now. Value *Count = getOrCreateTripCount(LoopVectorPreHeader); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + Value *VectorTripCount = + getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3230,7 +3279,7 @@ // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { + if (!Cost->requiresScalarEpilogue(VF) && !Plan.foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3251,7 +3300,7 @@ } std::pair -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton(VPlan &Plan) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3293,7 +3342,7 @@ // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitIterationCountCheck(LoopScalarPreHeader); + emitIterationCountCheck(LoopScalarPreHeader, Plan); // Generate the code to check any assumptions that we've made for SCEV // expressions. @@ -3305,9 +3354,9 @@ emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(); + createInductionResumeValues(Plan); - return {completeLoopSkeleton(), nullptr}; + return {completeLoopSkeleton(Plan), nullptr}; } // Fix up external users of the induction variable. 
At this point, we are @@ -3431,9 +3480,9 @@ } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const { +InstructionCost LoopVectorizationCostModel::getVectorCallCost( + CallInst *CI, bool FoldTailByMasking, ElementCount VF, + bool &NeedToScalarize) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3456,7 +3505,8 @@ // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = + getScalarizationOverhead(CI, FoldTailByMasking, VF); InstructionCost Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; @@ -3688,10 +3738,11 @@ // Fix-up external users of the induction variables. for (const auto &Entry : Legal->getInductionVars()) - fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), - IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader(), Plan); + fixupIVUsers( + Entry.first, Entry.second, + getOrCreateVectorTripCount(VectorLoop->getLoopPreheader(), Plan), + IVEndValues[Entry.first], LoopMiddleBlock, VectorLoop->getHeader(), + Plan); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated @@ -3891,7 +3942,7 @@ // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { + if (State.Plan->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; @@ -4185,18 +4236,22 @@ return Cost->useOrderedReductions(RdxDesc); } -void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopScalars(bool FoldTailByMasking, + ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && + Scalars.find({FoldTailByMasking, VF}) == Scalars.end() && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning // since that would result in generation of scalarized code during execution, // which is not supported for scalable vectors. if (VF.isScalable()) { - Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); + Scalars[{FoldTailByMasking, VF}].insert( + Uniforms[{FoldTailByMasking, VF}].begin(), + Uniforms[{FoldTailByMasking, VF}].end()); return; } @@ -4213,7 +4268,8 @@ // memory access is not a gather or scatter operation. The value operand of a // store will remain scalar if the store is scalarized. 
auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { - InstWidening WideningDecision = getWideningDecision(MemAccess, VF); + InstWidening WideningDecision = + getWideningDecision(MemAccess, FoldTailByMasking, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); if (auto *Store = dyn_cast(MemAccess)) @@ -4266,7 +4322,8 @@ // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. - Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); + Worklist.insert(Uniforms[{FoldTailByMasking, VF}].begin(), + Uniforms[{FoldTailByMasking, VF}].end()); // (2) Add to the worklist all bitcast and getelementptr instructions used by // memory accesses requiring a scalar use. The pointer operands of loads and @@ -4291,7 +4348,7 @@ // Insert the forced scalars. // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. - auto ForcedScalar = ForcedScalars.find(VF); + auto ForcedScalar = ForcedScalars.find({FoldTailByMasking, VF}); if (ForcedScalar != ForcedScalars.end()) for (auto *I : ForcedScalar->second) { LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); @@ -4327,7 +4384,7 @@ // If tail-folding is applied, the primary induction variable will be used // to feed a vector compare. - if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) + if (Ind == Legal->getPrimaryInduction() && FoldTailByMasking) continue; // Returns true if \p Indvar is a pointer induction that is used directly by @@ -4369,12 +4426,12 @@ << "\n"); } - Scalars[VF].insert(Worklist.begin(), Worklist.end()); + Scalars[{FoldTailByMasking, VF}].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::isScalarWithPredication( - Instruction *I, ElementCount VF) const { - if (!isPredicatedInst(I)) + Instruction *I, bool FoldTailByMasking, ElementCount VF) const { + if (!isPredicatedInst(FoldTailByMasking, I)) return false; // Do we have a non-scalar lowering for this predicated @@ -4402,14 +4459,23 @@ // We have the option to use the safe-divisor idiom to avoid predication. // The cost based decision here will always select safe-divisor for // scalable vectors as scalarization isn't legal. - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + const auto [ScalarCost, SafeDivisorCost] = + getDivRemSpeculationCost(I, FoldTailByMasking, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); } } } -bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { - if (!blockNeedsPredicationForAnyReason(I->getParent())) +static bool blockNeedsPredicationForAnyReason(bool FoldTailByMasking, + LoopVectorizationLegality *Legal, + BasicBlock *BB) { + return FoldTailByMasking || Legal->blockNeedsPredication(BB); +} + +bool LoopVectorizationCostModel::isPredicatedInst(bool FoldTailByMasking, + Instruction *I) const { + if (!blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, + I->getParent())) return false; // Can we prove this instruction is safe to unconditionally execute? 
@@ -4419,7 +4485,7 @@ return false; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) + if (!Legal->isMaskRequired(FoldTailByMasking, I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we @@ -4450,7 +4516,8 @@ std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, - ElementCount VF) const { + bool FoldTailByMasking, + ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || @@ -4480,7 +4547,7 @@ // The cost of insertelement and extractelement instructions needed for // scalarization. - ScalarizationCost += getScalarizationOverhead(I, VF); + ScalarizationCost += getScalarizationOverhead(I, FoldTailByMasking, VF); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally @@ -4514,9 +4581,9 @@ } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, bool FoldTailByMasking, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); - assert(getWideningDecision(I, VF) == CM_Unknown && + assert(getWideningDecision(I, FoldTailByMasking, VF) == CM_Unknown && "Decision should not be set yet."); auto *Group = getInterleavedAccessGroup(I); assert(Group && "Must have a group."); @@ -4555,8 +4622,9 @@ // (either a gap at the end of a load-access that may result in a speculative // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = - blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, + I->getParent()) && + Legal->isMaskRequired(FoldTailByMasking, I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -4583,7 +4651,7 @@ } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // Get and ensure we have a valid memory instruction. assert((isa(I)) && "Invalid memory instruction"); @@ -4596,7 +4664,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I, VF)) + if (isScalarWithPredication(I, FoldTailByMasking, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -4608,18 +4676,20 @@ return true; } -void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopUniforms(bool FoldTailByMasking, + ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && + Uniforms.find({FoldTailByMasking, VF}) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll // not analyze again. Uniforms.count(VF) will return 1. - Uniforms[VF].clear(); + Uniforms[{FoldTailByMasking, VF}].clear(); // We now know that the loop is vectorizable! 
// Collect instructions inside the loop that will remain uniform after @@ -4647,7 +4717,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I, VF)) { + if (isScalarWithPredication(I, FoldTailByMasking, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -4677,7 +4747,8 @@ }; auto isUniformDecision = [&](Instruction *I, ElementCount VF) { - InstWidening WideningDecision = getWideningDecision(I, VF); + InstWidening WideningDecision = + getWideningDecision(I, FoldTailByMasking, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4689,7 +4760,6 @@ WideningDecision == CM_Interleave); }; - // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { @@ -4823,7 +4893,7 @@ addToWorklistIfAllowed(IndUpdate); } - Uniforms[VF].insert(Worklist.begin(), Worklist.end()); + Uniforms[{FoldTailByMasking, VF}].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::runtimeChecksRequired() { @@ -5134,7 +5204,7 @@ // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + MayFoldTailByMasking = true; return MaxFactors; } @@ -5234,10 +5304,10 @@ // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorElementCount). - SmallVector VFs; + SmallVector VFs; for (ElementCount VS = MaxVectorElementCount * 2; ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) - VFs.push_back(VS); + VFs.push_back({FoldTailByMasking, VS}); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5252,7 +5322,7 @@ Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].Width; break; } } @@ -5292,35 +5362,51 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && + if (!A.Scheme.Width.isScalable() && !B.Scheme.Width.isScalable() && MaxTripCount) { - // If we are folding the tail and the trip count is a known (possibly small) - // constant, the trip count will be rounded up to an integer number of - // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), - // which we compare directly. When not folding the tail, the total cost will - // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is - // approximated with the per-lane cost below instead of using the tripcount - // as here. - auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); - auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // PerIterationCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be PerIterationCost*floor(TC/VF) + Scalar remainder cost, where + // the scalar cost is approximated using the correct fraction of the vector + // cost. + auto RTCostA = + A.Scheme.Width.getFixedValue() + ? (CostA * divideCeil(MaxTripCount, A.Scheme.Width.getFixedValue())) + : (CostA * MaxTripCount) / A.Scheme.Width.getFixedValue(); + auto RTCostB = + B.Scheme.Width.getFixedValue() + ? 
(CostB * divideCeil(MaxTripCount, B.Scheme.Width.getFixedValue())) + : (CostB * MaxTripCount) / B.Scheme.Width.getFixedValue(); + + if (A.Scheme.FoldTailByMasking && !B.Scheme.FoldTailByMasking) + return RTCostA <= RTCostB; + return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. - unsigned EstimatedWidthA = A.Width.getKnownMinValue(); - unsigned EstimatedWidthB = B.Width.getKnownMinValue(); + unsigned EstimatedWidthA = A.Scheme.Width.getKnownMinValue(); + unsigned EstimatedWidthB = B.Scheme.Width.getKnownMinValue(); if (std::optional VScale = getVScaleForTuning()) { - if (A.Width.isScalable()) + if (A.Scheme.Width.isScalable()) EstimatedWidthA *= *VScale; - if (B.Width.isScalable()) + if (B.Scheme.Width.isScalable()) EstimatedWidthB *= *VScale; } + // If one plan is predicated and the other is not, opt for the predicated + // scheme on a tie. + if (A.Scheme.FoldTailByMasking && !B.Scheme.FoldTailByMasking) + return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + if (A.Scheme.Width.isScalable() && !B.Scheme.Width.isScalable()) + return (CostA * B.Scheme.Width.getFixedValue()) <= + (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -5330,14 +5416,15 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + expectedCost({false, ElementCount::getFixed(1)}).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(VFCandidates.count(ElementCount::getFixed(1)) && + assert(VFCandidates.count({false, ElementCount::getFixed(1)}) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, - ExpectedCost); + const VectorizationFactor ScalarCost(false, ElementCount::getFixed(1), + ExpectedCost, ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5351,7 +5438,7 @@ SmallVector InvalidCosts; for (const auto &i : VFCandidates) { // The cost for scalar VF=1 is already calculated, so ignore it. - if (i.isScalar()) + if (i.Width.isScalar()) continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); @@ -5362,12 +5449,13 @@ if (std::optional VScale = getVScaleForTuning()) AssumedMinimumVscale = *VScale; unsigned Width = - Candidate.Width.isScalable() - ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i + Candidate.Scheme.Width.isScalable() + ? Candidate.Scheme.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Scheme.Width.getFixedValue(); + LLVM_DEBUG(dbgs() << "LV: " << (i.FoldTailByMasking ? 
"Tail folded " : "") + << "Vector loop of width " << i.Width << " costs: " << (Candidate.Cost / Width)); - if (i.isScalable()) + if (i.Width.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); LLVM_DEBUG(dbgs() << ".\n"); @@ -5375,7 +5463,7 @@ if (!C.second && !ForceVectorization) { LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i + dbgs() << "LV: Not considering vector loop of width " << i.Width << " because it will not generate any vector instructions.\n"); continue; } @@ -5403,8 +5491,8 @@ [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { if (Numbering[A.first] != Numbering[B.first]) return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); + VectorizationSchemeComparator ECC; + return ECC({false, A.second}, {false, B.second}); }); // For a list of ordered instruction-vf pairs: @@ -5455,11 +5543,16 @@ ChosenFactor = ScalarCost; } - LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + LLVM_DEBUG({ + if (ForceVectorization && !ChosenFactor.Scheme.Width.isScalar() && + !isMoreProfitable(ChosenFactor, ScalarCost)) + dbgs() << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"; + }); + LLVM_DEBUG( + dbgs() << "LV: Selecting " + << (ChosenFactor.Scheme.FoldTailByMasking ? "Tail folded " : "") + << "VF: " << ChosenFactor.Scheme.Width << ".\n"); return ChosenFactor; } @@ -5544,8 +5637,8 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0, 0}; + if (LVP.hasPlanWithVF(false, ForcedEC)) + return {false, ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5579,16 +5672,16 @@ } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Scheme.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF)) && + (Result.Scheme.Width.isScalar() || isMoreProfitable(NextVF, Result)) && + LVP.hasPlanWithVF(false, NextVF.Scheme.Width)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width << "\n";); + << Result.Scheme.Width << "\n";); return Result; } @@ -5668,7 +5761,7 @@ } unsigned -LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +LoopVectorizationCostModel::selectInterleaveCount(VectorizationScheme VF, InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5684,7 +5777,8 @@ // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. 
- if (!isScalarEpilogueAllowed()) + if (!isScalarEpilogueAllowed() || + TheLoop->getHeader()->getParent()->hasOptSize()) return 1; // We used the distance for the interleave count. @@ -5699,7 +5793,8 @@ // because with the above conditions interleaving can expose ILP and break // cross iteration dependences for reductions. if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) + !(InterleaveSmallLoopScalarReduction && HasReductions && + VF.Width.isScalar())) return 1; // If we did not calculate the cost for VF (because the user selected the VF) @@ -5738,7 +5833,7 @@ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -5763,10 +5858,10 @@ // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + TTI.getMaxInterleaveFactor(VF.Width.getKnownMinValue()); // Check if the user has overridden the max. - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -5785,8 +5880,8 @@ // the InterleaveCount as if vscale is '1', although if some information about // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Width.getKnownMinValue(), + MaxInterleaveCount); // Make sure MaxInterleaveCount is greater than 0. MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } @@ -5806,7 +5901,7 @@ // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF.isVector() && HasReductions) { + if (VF.Width.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5816,17 +5911,19 @@ // vectorized the loop we will have done the runtime check and so interleaving // won't require further checks. bool ScalarInterleavingRequiresPredication = - (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + (VF.Width.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); })); bool ScalarInterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF.Width.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF << '\n'); + << "LV: VF is " << VF.Width << '\n' + << "LV: Fold Tail is " + << (VF.FoldTailByMasking ? "true" : "false") << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); if (!ScalarInterleavingRequiresRuntimePointerCheck && @@ -5892,7 +5989,7 @@ // If there are scalar reductions and TTI has enabled aggressive // interleaving for reductions, we will interleave to expose ILP. 
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + if (InterleaveSmallLoopScalarReduction && VF.Width.isScalar() && AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC @@ -5916,7 +6013,8 @@ } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage( + ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -6029,7 +6127,7 @@ // there is no previous entry for ClassID. SmallMapVector RegUsage; - if (VFs[j].isScalar()) { + if (VFs[j].Width.isScalar()) { for (auto *Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); @@ -6038,12 +6136,13 @@ RegUsage[ClassID] += 1; } } else { - collectUniformsAndScalars(VFs[j]); + collectUniformsAndScalars(VFs[j].FoldTailByMasking, VFs[j].Width); for (auto *Inst : OpenIntervals) { // Skip ignored values for VF > 1. if (VecValuesToIgnore.count(Inst)) continue; - if (isScalarAfterVectorization(Inst, VFs[j])) { + if (isScalarAfterVectorization(Inst, VFs[j].FoldTailByMasking, + VFs[j].Width)) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); // FIXME: The target might use more than one register for the type @@ -6052,7 +6151,7 @@ } else { unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j].Width); } } } @@ -6079,15 +6178,16 @@ for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned Usage = VFs[i].Width.isScalar() + ? 1 + : GetRegUsage(Inst->getType(), VFs[i].Width); unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); + TTI.getRegisterClassForType(VFs[i].Width.isVector(), Inst->getType()); Invariant[ClassID] += Usage; } LLVM_DEBUG({ - dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): VF = " << VFs[i].Width << '\n'; dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() << " item\n"; for (const auto &pair : MaxUsages[i]) { @@ -6112,8 +6212,8 @@ return RUs; } -bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, - ElementCount VF) { +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such @@ -6122,26 +6222,27 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. 
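calculateRegisterUsage is only re-keyed by the patch, but the estimate it produces is easiest to see on a toy example: the highest number of values that are simultaneously live. The sketch below is illustrative only and ignores register classes, uniforms, and per-scheme scalarization:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Each value is represented by a (definition, last use) position in program
// order; the estimate is the maximum number of ranges open at any definition.
static unsigned
maxLiveValues(const std::vector<std::pair<unsigned, unsigned>> &LiveRanges) {
  unsigned Max = 0;
  for (const auto &R : LiveRanges) {
    unsigned OpenAtDef = 0;
    for (const auto &Other : LiveRanges)
      if (Other.first <= R.first && R.first <= Other.second)
        ++OpenAtDef;
    Max = std::max(Max, OpenAtDef);
  }
  return Max;
}

int main() {
  // Values defined at 0, 1, 2 and last used at 3, 2, 5: at most 3 live at once.
  std::printf("%u\n", maxLiveValues({{0, 3}, {1, 2}, {2, 5}}));
}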
- assert((isPredicatedInst(I)) && + assert(isPredicatedInst(FoldTailByMasking, I) && "Expecting a scalar emulated instruction"); return isa(I) || (isa(I) && NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(bool FoldTailByMasking, + ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + InstsToScalarize.find({FoldTailByMasking, VF}) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's // not profitable to scalarize any instructions, the presence of VF in the // map will indicate that we've analyzed it already. - ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + ScalarCostsTy &ScalarCostsVF = InstsToScalarize[{FoldTailByMasking, VF}]; PredicatedBBsAfterVectorization[VF].clear(); @@ -6149,17 +6250,19 @@ // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockNeedsPredicationForAnyReason(BB)) + if (!blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, BB)) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I, VF)) { + if (isScalarWithPredication(&I, FoldTailByMasking, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && - computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + if (!VF.isScalable() && + !useEmulatedMaskMemRefHack(&I, FoldTailByMasking, VF) && + computePredInstDiscount(&I, ScalarCosts, FoldTailByMasking, VF) >= + 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization[VF].insert(BB); @@ -6168,8 +6271,9 @@ } InstructionCost LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { - assert(!isUniformAfterVectorization(PredInst, VF) && + Instruction *PredInst, ScalarCostsTy &ScalarCosts, bool FoldTailByMasking, + ElementCount VF) { + assert(!isUniformAfterVectorization(PredInst, FoldTailByMasking, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6189,12 +6293,12 @@ // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - isScalarAfterVectorization(I, VF)) + isScalarAfterVectorization(I, FoldTailByMasking, VF)) return false; // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (isScalarWithPredication(I, VF)) + if (isScalarWithPredication(I, FoldTailByMasking, VF)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6209,7 +6313,7 @@ // the lane zero values for uniforms rather than asserting. 
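Because the same VF can now be costed both with and without tail folding, per-VF caches such as InstsToScalarize are keyed by the (FoldTailByMasking, Width) pair. A minimal standalone sketch of that caching pattern, with invented names and std::map standing in for the LLVM containers:

#include <cstdio>
#include <map>
#include <string>
#include <utility>

using SchemeKey = std::pair<bool, unsigned>;       // (FoldTailByMasking, VF)
using ScalarCosts = std::map<std::string, int>;    // instruction -> discount

static std::map<SchemeKey, ScalarCosts> InstsToScalarize;

static void collectInstsToScalarize(bool FoldTailByMasking, unsigned VF) {
  SchemeKey Key{FoldTailByMasking, VF};
  if (VF <= 1 || InstsToScalarize.count(Key))
    return; // Already analyzed for this scheme; the entry's presence says so.
  ScalarCosts &Costs = InstsToScalarize[Key];
  Costs["udiv"] = 3; // stand-in for an instruction worth scalarizing
}

int main() {
  collectInstsToScalarize(false, 4);
  collectInstsToScalarize(true, 4); // analyzed separately from the call above
  std::printf("cached schemes: %zu\n", InstsToScalarize.size());
}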
for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (isUniformAfterVectorization(J, VF)) + if (isUniformAfterVectorization(J, FoldTailByMasking, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6229,7 +6333,8 @@ // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - InstructionCost VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = + getInstructionCost(I, FoldTailByMasking, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the @@ -6237,11 +6342,13 @@ // computing the scalarization overhead. InstructionCost ScalarCost = VF.getFixedValue() * - getInstructionCost(I, ElementCount::getFixed(1)).first; + getInstructionCost(I, FoldTailByMasking, ElementCount::getFixed(1)) + .first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, FoldTailByMasking, VF) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -6260,7 +6367,7 @@ "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) { + else if (needsExtract(J, FoldTailByMasking, VF)) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), false, true); @@ -6281,7 +6388,7 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost( - ElementCount VF, SmallVectorImpl *Invalid) { + VectorizationScheme VF, SmallVectorImpl *Invalid) { VectorizationCostTy Cost; // For each block. @@ -6292,10 +6399,11 @@ for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. if (ValuesToIgnore.count(&I) || - (VF.isVector() && VecValuesToIgnore.count(&I))) + (VF.Width.isVector() && VecValuesToIgnore.count(&I))) continue; - VectorizationCostTy C = getInstructionCost(&I, VF); + VectorizationCostTy C = + getInstructionCost(&I, VF.FoldTailByMasking, VF.Width); // Check if we should override the cost. if (C.first.isValid() && @@ -6304,12 +6412,12 @@ // Keep a list of instructions with invalid costs. if (Invalid && !C.first.isValid()) - Invalid->emplace_back(&I, VF); + Invalid->emplace_back(&I, VF.Width); BlockCost.first += C.first; BlockCost.second |= C.second; LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first - << " for VF " << VF << " For instruction: " << I + << " for VF " << VF.Width << " For instruction: " << I << '\n'); } @@ -6320,7 +6428,7 @@ // the predicated block, if it is an if-else block. Thus, scale the block's // cost by the probability of executing it. blockNeedsPredication from // Legal is used so as to not include all blocks in tail folded loops. 
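The discount computed here boils down to comparing the always-executed vector form against a scalarized form whose cost is scaled by the probability of the predicated block actually running. A worked example with made-up costs:

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const int ScalarInstrCost = 4;      // cost of one scalar copy
  const int VectorCost = 10;          // cost of the widened, predicated form
  const int ReciprocalBlockProb = 2;  // predicated block runs ~1 in 2 times

  int ScalarCost = VF * ScalarInstrCost;   // 16: VF scalar copies
  ScalarCost /= ReciprocalBlockProb;       // 8: scaled by block probability
  int Discount = VectorCost - ScalarCost;  // 2: positive, so scalarizing wins
  std::printf("discount = %d\n", Discount);
}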
- if (VF.isScalar() && Legal->blockNeedsPredication(BB)) + if (VF.Width.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -6365,9 +6473,8 @@ Legal->hasStride(I->getOperand(1)); } -InstructionCost -LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getMemInstScalarizationCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { assert(VF.isVector() && "Scalarization cost of instruction implies vectorization."); if (VF.isScalable()) @@ -6399,12 +6506,12 @@ // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF); + Cost += getScalarizationOverhead(I, FoldTailByMasking, VF); // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (isPredicatedInst(FoldTailByMasking, I)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6415,7 +6522,7 @@ /*Insert=*/false, /*Extract=*/true); Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); - if (useEmulatedMaskMemRefHack(I, VF)) + if (useEmulatedMaskMemRefHack(I, FoldTailByMasking, VF)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. Cost = 3000000; @@ -6424,9 +6531,8 @@ return Cost; } -InstructionCost -LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6438,7 +6544,7 @@ "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (Legal->isMaskRequired(FoldTailByMasking, I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6482,9 +6588,8 @@ VF.getKnownMinValue() - 1)); } -InstructionCost -LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getGatherScatterCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6492,13 +6597,13 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(FoldTailByMasking, I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } -InstructionCost -LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // TODO: Once we have support for interleaving with scalable vectors // we can calculate the cost properly here. 
if (VF.isScalable()) @@ -6527,11 +6632,12 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(FoldTailByMasking, I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(FoldTailByMasking, I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, @@ -6713,9 +6819,8 @@ return I == RetI ? std::optional(BaseCost) : std::nullopt; } -InstructionCost -LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getMemoryInstructionCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. if (VF.isScalar()) { @@ -6728,33 +6833,36 @@ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, TTI::TCK_RecipThroughput, OpInfo, I); } - return getWideningCost(I, VF); + return getWideningCost(I, FoldTailByMasking, VF); } LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, + bool FoldTailByMasking, ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. - if (isUniformAfterVectorization(I, VF)) + if (isUniformAfterVectorization(I, FoldTailByMasking, VF)) VF = ElementCount::getFixed(1); - if (VF.isVector() && isProfitableToScalarize(I, VF)) - return VectorizationCostTy(InstsToScalarize[VF][I], false); + if (VF.isVector() && isProfitableToScalarize(I, FoldTailByMasking, VF)) + return VectorizationCostTy(InstsToScalarize[{FoldTailByMasking, VF}][I], + false); // Forced scalars do not have any scalarization overhead. - auto ForcedScalar = ForcedScalars.find(VF); + auto ForcedScalar = ForcedScalars.find({FoldTailByMasking, VF}); if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * + (getInstructionCost(I, FoldTailByMasking, ElementCount::getFixed(1)) + .first * VF.getKnownMinValue()), false); } Type *VectorTy; - InstructionCost C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, FoldTailByMasking, VF, VectorTy); bool TypeNotScalarized = false; if (VF.isVector() && VectorTy->isVectorTy()) { @@ -6774,9 +6882,8 @@ return VectorizationCostTy(C, TypeNotScalarized); } -InstructionCost -LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) const { +InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( + Instruction *I, bool FoldTailByMasking, ElementCount VF) const { // There is no mechanism yet to create a scalable scalarization loop, // so this is currently Invalid. @@ -6809,13 +6916,14 @@ // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
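Memory costing now asks isMaskRequired with the fold-tail flag, so an access that needs no mask in a normal vector body can still be costed as masked when the folded scheme is evaluated. A hedged sketch of that dispatch with invented costs:

#include <cstdio>

// Mirrors the shape of the isMaskRequired change: always-masked accesses stay
// masked, and accesses that only need a mask under tail folding count as
// masked only when the folded scheme is being costed.
static bool maskRequired(bool FoldTailByMasking, bool MaskedAnyway,
                         bool MaskedOnlyWhenFolded) {
  return MaskedAnyway || (FoldTailByMasking && MaskedOnlyWhenFolded);
}

static int consecutiveMemOpCost(bool FoldTailByMasking, bool MaskedAnyway,
                                bool MaskedOnlyWhenFolded) {
  const int PlainCost = 1, MaskedCost = 2; // made-up target costs
  return maskRequired(FoldTailByMasking, MaskedAnyway, MaskedOnlyWhenFolded)
             ? MaskedCost
             : PlainCost;
}

int main() {
  // The same access costed under the plain and the tail-folded scheme.
  std::printf("unfolded: %d, folded: %d\n",
              consecutiveMemOpCost(false, false, true),
              consecutiveMemOpCost(true, false, true));
}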
SmallVector Tys; - for (auto *V : filterExtractingOperands(Ops, VF)) + for (auto *V : filterExtractingOperands(Ops, FoldTailByMasking, VF)) Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), Tys); + filterExtractingOperands(Ops, FoldTailByMasking, VF), Tys); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { +void LoopVectorizationCostModel::setCostBasedWideningDecision( + bool FoldTailByMasking, ElementCount VF) { if (VF.isScalar()) return; NumPredStores = 0; @@ -6830,7 +6938,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa(&I) && isScalarWithPredication(&I, VF)) + if (isa(&I) && + isScalarWithPredication(&I, FoldTailByMasking, VF)) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -6843,7 +6952,7 @@ // stores. Note that even with tail folding we know that at least // one lane is active (i.e. generalized predication is not possible // here), and the logic below depends on this fact. - if (!foldTailByMasking()) + if (!FoldTailByMasking) return true; // For scalable vectors, a uniform memop load is always @@ -6858,8 +6967,9 @@ }; const InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I, VF) ? - getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); + isLegalGatherOrScatter(&I, VF) + ? getGatherScatterCost(&I, FoldTailByMasking, VF) + : InstructionCost::getInvalid(); // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract @@ -6872,22 +6982,25 @@ // costs compare as maximumal large. If both are invalid, we get // scalable invalid which signals a failure and a vectorization abort. if (GatherScatterCost < ScalarizationCost) - setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); + setWideningDecision(&I, FoldTailByMasking, VF, CM_GatherScatter, + GatherScatterCost); else - setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); + setWideningDecision(&I, FoldTailByMasking, VF, CM_Scalarize, + ScalarizationCost); continue; } // We assume that widening is the best solution when possible. - if (memoryInstructionCanBeWidened(&I, VF)) { - InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); + if (memoryInstructionCanBeWidened(&I, FoldTailByMasking, VF)) { + InstructionCost Cost = + getConsecutiveMemOpCost(&I, FoldTailByMasking, VF); int ConsecutiveStride = Legal->isConsecutivePtr( getLoadStoreType(&I), getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; - setWideningDecision(&I, VF, Decision, Cost); + setWideningDecision(&I, FoldTailByMasking, VF, Decision, Cost); continue; } @@ -6899,21 +7012,21 @@ assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. - if (getWideningDecision(&I, VF) != CM_Unknown) + if (getWideningDecision(&I, FoldTailByMasking, VF) != CM_Unknown) continue; NumAccesses = Group->getNumMembers(); - if (interleavedAccessCanBeWidened(&I, VF)) - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, FoldTailByMasking, VF)) + InterleaveCost = getInterleaveGroupCost(&I, FoldTailByMasking, VF); } InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I, VF) - ? getGatherScatterCost(&I, VF) * NumAccesses + ? 
getGatherScatterCost(&I, FoldTailByMasking, VF) * NumAccesses : InstructionCost::getInvalid(); InstructionCost ScalarizationCost = - getMemInstScalarizationCost(&I, VF) * NumAccesses; + getMemInstScalarizationCost(&I, FoldTailByMasking, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. @@ -6934,9 +7047,9 @@ // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. if (auto Group = getInterleavedAccessGroup(&I)) - setWideningDecision(Group, VF, Decision, Cost); + setWideningDecision(Group, FoldTailByMasking, VF, Decision, Cost); else - setWideningDecision(&I, VF, Decision, Cost); + setWideningDecision(&I, FoldTailByMasking, VF, Decision, Cost); } } @@ -6955,7 +7068,7 @@ Instruction *PtrDef = dyn_cast_or_null(getLoadStorePointerOperand(&I)); if (PtrDef && TheLoop->contains(PtrDef) && - getWideningDecision(&I, VF) != CM_GatherScatter) + getWideningDecision(&I, FoldTailByMasking, VF) != CM_GatherScatter) AddrDefs.insert(PtrDef); } @@ -6977,45 +7090,46 @@ // by cost functions, but since this involves the task of finding out // if the loaded register is involved in an address computation, it is // instead changed here when we know this is the case. - InstWidening Decision = getWideningDecision(I, VF); + InstWidening Decision = getWideningDecision(I, FoldTailByMasking, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. setWideningDecision( - I, VF, CM_Scalarize, + I, FoldTailByMasking, VF, CM_Scalarize, (VF.getKnownMinValue() * - getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + getMemoryInstructionCost(I, FoldTailByMasking, + ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) setWideningDecision( - Member, VF, CM_Scalarize, + Member, FoldTailByMasking, VF, CM_Scalarize, (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + getMemoryInstructionCost(Member, FoldTailByMasking, + ElementCount::getFixed(1)))); } } } else // Make sure I gets scalarized and a cost estimate without // scalarization overhead. 
- ForcedScalars[VF].insert(I); + ForcedScalars[{FoldTailByMasking, VF}].insert(I); } } -InstructionCost -LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy) { +InstructionCost LoopVectorizationCostModel::getInstructionCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); - if (canTruncateToMinimalBitwidth(I, VF)) + if (canTruncateToMinimalBitwidth(I, FoldTailByMasking, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto hasSingleCopyAfterVectorization = [this](Instruction *I, - ElementCount VF) -> bool { + auto hasSingleCopyAfterVectorization = + [this](Instruction *I, bool FoldTailByMasking, ElementCount VF) -> bool { if (VF.isScalar()) return true; - auto Scalarized = InstsToScalarize.find(VF); + auto Scalarized = InstsToScalarize.find({FoldTailByMasking, VF}); assert(Scalarized != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return !Scalarized->second.count(I) && @@ -7026,7 +7140,7 @@ }; (void) hasSingleCopyAfterVectorization; - if (isScalarAfterVectorization(I, VF)) { + if (isScalarAfterVectorization(I, FoldTailByMasking, VF)) { // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is // because the VF is either 1, or any instructions that need scalarizing @@ -7036,7 +7150,7 @@ I->getOpcode() == Instruction::PHI || (I->getOpcode() == Instruction::BitCast && I->getType()->isPointerTy()) || - hasSingleCopyAfterVectorization(I, VF)); + hasSingleCopyAfterVectorization(I, FoldTailByMasking, VF)); VectorTy = RetTy; } else VectorTy = ToVectorTy(RetTy, VF); @@ -7109,8 +7223,9 @@ case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - if (VF.isVector() && isPredicatedInst(I)) { - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + if (VF.isVector() && isPredicatedInst(FoldTailByMasking, I)) { + const auto [ScalarCost, SafeDivisorCost] = + getDivRemSpeculationCost(I, FoldTailByMasking, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 
ScalarCost : SafeDivisorCost; } @@ -7194,7 +7309,7 @@ case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); - if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) + if (canTruncateToMinimalBitwidth(Op0AsInstruction, FoldTailByMasking, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, @@ -7205,16 +7320,17 @@ case Instruction::Load: { ElementCount Width = VF; if (Width.isVector()) { - InstWidening Decision = getWideningDecision(I, Width); + InstWidening Decision = getWideningDecision(I, FoldTailByMasking, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); - if (getWideningCost(I, VF) == InstructionCost::getInvalid()) + if (getWideningCost(I, FoldTailByMasking, VF) == + InstructionCost::getInvalid()) return InstructionCost::getInvalid(); if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getLoadStoreType(I), Width); - return getMemoryInstructionCost(I, VF); + return getMemoryInstructionCost(I, FoldTailByMasking, VF); } case Instruction::BitCast: if (I->getType()->isPointerTy()) @@ -7239,15 +7355,16 @@ if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; - switch (getWideningDecision(I, VF)) { + switch (getWideningDecision(I, FoldTailByMasking, VF)) { case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return Legal->isMaskRequired(FoldTailByMasking, I) + ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -7288,7 +7405,7 @@ Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; - if (canTruncateToMinimalBitwidth(I, VF)) { + if (canTruncateToMinimalBitwidth(I, FoldTailByMasking, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". @@ -7314,7 +7431,8 @@ return *RedCost; bool NeedToScalarize; CallInst *CI = cast(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = + getVectorCallCost(CI, FoldTailByMasking, VF, NeedToScalarize); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); @@ -7481,7 +7599,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + return {false, VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7498,7 +7616,8 @@ return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. 
- if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && + if (blockNeedsPredicationForAnyReason(CM.mayFoldTailByMasking(), Legal, + OrigLoop->getHeader()) && !useMaskedInterleavedAccesses(*TTI)) { LLVM_DEBUG( dbgs() @@ -7519,12 +7638,13 @@ "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (CM.selectUserVectorizationFactor(UserVF)) { + // TODOD: Should this compare tail folded vs non-tail-folded? + if (CM.selectUserVectorizationFactor(CM.mayFoldTailByMasking(), UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(CM.mayFoldTailByMasking(), UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + return {{CM.mayFoldTailByMasking(), UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7532,26 +7652,48 @@ // Populate the set of Vectorization Factor Candidates. ElementCountSet VFCandidates; - for (auto VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); - for (auto VF = ElementCount::getScalable(1); - ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); + VFCandidates.insert({false, ElementCount::getFixed(1)}); + if (CM.shouldGenerateNonFoldedVFs()) { + for (auto VF = ElementCount::getFixed(2); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert({false, VF}); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert({false, VF}); + } + if (CM.mayFoldTailByMasking()) { + for (auto VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert({true, VF}); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert({true, VF}); + } for (const auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); + CM.collectUniformsAndScalars(VF.FoldTailByMasking, VF.Width); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF.isVector()) - CM.collectInstsToScalarize(VF); + if (VF.Width.isVector()) + CM.collectInstsToScalarize(VF.FoldTailByMasking, VF.Width); } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(false, ElementCount::getFixed(1), + CM.shouldGenerateNonFoldedVFs() + ? MaxFactors.FixedVF + : ElementCount::getFixed(1)); + if (CM.shouldGenerateNonFoldedVFs()) + buildVPlansWithVPRecipes(false, ElementCount::getScalable(1), + MaxFactors.ScalableVF); + if (CM.mayFoldTailByMasking()) { + buildVPlansWithVPRecipes(true, ElementCount::getFixed(1), + MaxFactors.FixedVF); + buildVPlansWithVPRecipes(true, ElementCount::getScalable(1), + MaxFactors.ScalableVF); + } LLVM_DEBUG(printPlans(dbgs())); if (!MaxFactors.hasVector()) @@ -7559,18 +7701,22 @@ // Select the optimal vectorization factor. 
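The candidate set above now enumerates (fold, width) pairs rather than bare widths. A standalone sketch of the same enumeration for fixed widths only (the scalable-VF loops follow the same shape); the names are invented:

#include <cstdio>
#include <vector>

struct Scheme { bool FoldTailByMasking; unsigned Width; };

static std::vector<Scheme> buildCandidates(unsigned MaxFixedVF,
                                           bool GenerateNonFolded,
                                           bool MayFoldTail) {
  // The scalar {false, 1} baseline is always a candidate.
  std::vector<Scheme> Cands{{false, 1}};
  if (GenerateNonFolded)
    for (unsigned VF = 2; VF <= MaxFixedVF; VF *= 2)
      Cands.push_back({false, VF});
  if (MayFoldTail)
    for (unsigned VF = 1; VF <= MaxFixedVF; VF *= 2)
      Cands.push_back({true, VF});
  return Cands;
}

int main() {
  for (const Scheme &S : buildCandidates(8, true, true))
    std::printf("%s VF=%u\n", S.FoldTailByMasking ? "folded" : "unfolded",
                S.Width);
}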
VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + assert((VF.Scheme.Width.isScalar() || VF.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); return VF; } -VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { +VPlan &LoopVectorizationPlanner::getBestPlanFor(VectorizationScheme VF) const { assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && + [VF](const VPlanPtr &Plan) { + return Plan->hasVF(VF.Width) && + Plan->foldTailByMasking() == VF.FoldTailByMasking; + }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { - if (Plan->hasVF(VF)) + if (Plan->hasVF(VF.Width) && + Plan->foldTailByMasking() == VF.FoldTailByMasking) return *Plan.get(); } llvm_unreachable("No plan found!"); @@ -7620,8 +7766,9 @@ assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + LLVM_DEBUG(dbgs() << "Executing best plan with TailFold=" + << (BestVPlan.foldTailByMasking() ? "true" : "false") + << ", VF=" << BestVF << ", UF=" << BestUF << '\n'); // Workaround! Compute the trip count of the original loop and cache it // before we start modifying the CFG. This code has a systemic problem @@ -7642,7 +7789,7 @@ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(BestVPlan); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -7674,7 +7821,7 @@ // 2. Copy and widen instructions from the old loop into the new loop. BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr, BestVPlan), CanonicalIVStartValue, State, IsEpilogueVectorization); @@ -7730,7 +7877,7 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(VPlan &Plan) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7758,14 +7905,14 @@ emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. - EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); // Skip induction resume value creation here because they will be created in // the second pass for the scalar loop. The induction resume values for the // inductions in the epilogue loop are created before executing the plan for // the epilogue loop. - return {completeLoopSkeleton(), nullptr}; + return {completeLoopSkeleton(Plan), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7848,7 +7995,8 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
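getBestPlanFor must now match a plan on both the width and the fold-tail flag, since each width can have up to two plans. A minimal sketch of that lookup with an invented MiniPlan standing in for VPlan:

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

struct MiniPlan {
  bool FoldTailByMasking;
  std::vector<unsigned> VFs;
  bool hasVF(unsigned VF) const {
    return std::find(VFs.begin(), VFs.end(), VF) != VFs.end();
  }
};

static const MiniPlan &getBestPlanFor(const std::vector<MiniPlan> &Plans,
                                      bool FoldTail, unsigned VF) {
  auto It = std::find_if(Plans.begin(), Plans.end(), [&](const MiniPlan &P) {
    return P.FoldTailByMasking == FoldTail && P.hasVF(VF);
  });
  assert(It != Plans.end() && "No plan found!");
  return *It;
}

int main() {
  std::vector<MiniPlan> Plans = {{false, {4, 8}}, {true, {4, 8}}};
  std::printf("folded? %d\n", getBestPlanFor(Plans, true, 8).FoldTailByMasking);
}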
std::pair -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + VPlan &Plan) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7946,10 +8094,11 @@ // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues({VecEpilogueIterationCountCheck, + createInductionResumeValues(Plan, + {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(), EPResumeVal}; + return {completeLoopSkeleton(Plan), EPResumeVal}; } BasicBlock * @@ -8087,10 +8236,11 @@ VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) + if (!blockNeedsPredicationForAnyReason(Plan->foldTailByMasking(), Legal, + BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - assert(CM.foldTailByMasking() && "must fold the tail"); + assert(Plan->foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. @@ -8148,13 +8298,13 @@ auto willWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, VF); + CM.getWideningDecision(I, Plan->foldTailByMasking(), VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); if (Decision == LoopVectorizationCostModel::CM_Interleave) return true; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) + if (CM.isScalarAfterVectorization(I, Plan->foldTailByMasking(), VF) || + CM.isProfitableToScalarize(I, Plan->foldTailByMasking(), VF)) return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -8163,13 +8313,13 @@ return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), I)) Mask = createBlockInMask(I->getParent(), Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, Range.Start); + CM.getWideningDecision(I, Plan->foldTailByMasking(), Range.Start); bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; @@ -8191,9 +8341,10 @@ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { // Returns true if an instruction \p I should be scalarized instead of // vectorized for the chosen vectorization factor. 
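The header mask queried by createBlockInMask is what makes tail folding work: in the final, partial vector iteration only the in-bounds lanes stay active. A small illustration of that lane mask at runtime (plain C++, not VPlan recipes):

#include <cstdio>

int main() {
  const unsigned TripCount = 10, VF = 4;
  const unsigned BTC = TripCount - 1; // backedge-taken count
  for (unsigned IV = 0; IV < TripCount; IV += VF) {
    std::printf("iter at %2u: mask =", IV);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %d", IV + Lane <= BTC ? 1 : 0); // lane active while in bounds
    std::printf("\n");
  }
}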
- auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF); + auto ShouldScalarizeInstruction = [&CM, &Plan](Instruction *I, + ElementCount VF) { + return CM.isScalarAfterVectorization(I, Plan.foldTailByMasking(), VF) || + CM.isProfitableToScalarize(I, Plan.foldTailByMasking(), VF); }; bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8235,7 +8386,8 @@ Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { - return CM.isScalarAfterVectorization(Phi, VF); + return CM.isScalarAfterVectorization( + Phi, Plan.foldTailByMasking(), VF); }, Range)); } @@ -8317,11 +8469,12 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef Operands, - VFRange &Range) const { + VFRange &Range, + VPlanPtr &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); + [this, CI, &Plan](ElementCount VF) { + return CM.isScalarWithPredication(CI, Plan->foldTailByMasking(), VF); }, Range); @@ -8344,8 +8497,8 @@ bool NeedToScalarize = false; // Is it beneficial to perform intrinsic call compared to lib // call? - InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = CM.getVectorCallCost( + CI, Plan->foldTailByMasking(), VF, NeedToScalarize); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; @@ -8362,7 +8515,8 @@ // The flag shows whether we can use a usual Call for vectorized // version of the instruction. bool NeedToScalarize = false; - CM.getVectorCallCost(CI, VF, NeedToScalarize); + CM.getVectorCallCost(CI, Plan->foldTailByMasking(), VF, + NeedToScalarize); return !NeedToScalarize; }, Range); @@ -8373,15 +8527,16 @@ return nullptr; } -bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { +bool VPRecipeBuilder::shouldWiden(Instruction *I, bool FoldTailByMasking, + VFRange &Range) const { assert(!isa(I) && !isa(I) && !isa(I) && !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. - auto WillScalarize = [this, I](ElementCount VF) -> bool { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || - CM.isScalarWithPredication(I, VF); + auto WillScalarize = [this, I, FoldTailByMasking](ElementCount VF) -> bool { + return CM.isScalarAfterVectorization(I, FoldTailByMasking, VF) || + CM.isProfitableToScalarize(I, FoldTailByMasking, VF) || + CM.isScalarWithPredication(I, FoldTailByMasking, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8399,7 +8554,7 @@ case Instruction::URem: { // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. 
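shouldWiden and the willWiden-style lambdas keep using the decision-and-clamp-range idiom, now with the fold-tail flag threaded through the predicates. A standalone sketch of the idiom itself, using powers-of-two VFs and an invented predicate:

#include <cstdio>
#include <functional>

struct VFRange { unsigned Start, End; }; // [Start, End), powers of two

// Evaluate the predicate at Range.Start and shrink Range.End so every VF left
// in the range yields the same decision.
static bool
getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                         VFRange &Range) {
  bool Decision = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != Decision) {
      Range.End = VF;
      break;
    }
  return Decision;
}

int main() {
  VFRange Range{2, 16};
  // Pretend widening is only profitable below VF 8.
  bool Widen =
      getDecisionAndClampRange([](unsigned VF) { return VF < 8; }, Range);
  std::printf("widen=%d for VFs [%u, %u)\n", Widen, Range.Start, Range.End);
}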
- if (CM.isPredicatedInst(I)) { + if (CM.isPredicatedInst(Plan->foldTailByMasking(), I)) { SmallVector Ops(Operands.begin(), Operands.end()); VPValue *Mask = createBlockInMask(I->getParent(), Plan); VPValue *One = @@ -8462,10 +8617,12 @@ Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { + return CM.isUniformAfterVectorization(I, Plan->foldTailByMasking(), VF); + }, Range); - bool IsPredicated = CM.isPredicatedInst(I); + bool IsPredicated = CM.isPredicatedInst(Plan->foldTailByMasking(), I); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -8639,12 +8796,12 @@ return nullptr; if (auto *CI = dyn_cast(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); if (isa(Instr) || isa(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); - if (!shouldWiden(Instr, Range)) + if (!shouldWiden(Instr, Plan->foldTailByMasking(), Range)) return nullptr; if (auto GEP = dyn_cast(Instr)) @@ -8661,7 +8818,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(bool FoldTailByMasking, + ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -8670,7 +8828,7 @@ // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); + auto &ConditionalAssumes = Legal->getConditionalAssumes(FoldTailByMasking); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); MapVector &SinkAfter = Legal->getSinkAfter(); @@ -8699,8 +8857,8 @@ auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VPlans.push_back(buildVPlanWithVPRecipes(FoldTailByMasking, SubRange, + DeadInstructions, SinkAfter)); VF = SubRange.End; } } @@ -8806,7 +8964,8 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, + bool FoldTailByMasking, VFRange &Range, + SmallPtrSetImpl &DeadInstructions, const MapVector &SinkAfter) { SmallPtrSet *, 1> InterleaveGroups; @@ -8847,10 +9006,11 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
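For a div/rem sitting under a mask, the recipe builder inserts a select that swaps the divisor of inactive lanes for 1 before widening. A worked scalar-per-lane example of why that is safe:

#include <cstdio>

int main() {
  const int A[4] = {8, 9, 10, 11};
  const int D[4] = {2, 0, 5, 0};               // lanes 1 and 3 would divide by 0
  const bool Mask[4] = {true, false, true, false};
  int R[4];
  for (int Lane = 0; Lane < 4; ++Lane) {
    int SafeD = Mask[Lane] ? D[Lane] : 1;      // select(mask, divisor, 1)
    R[Lane] = A[Lane] / SafeD;                 // now unconditionally safe to widen
  }
  for (int Lane = 0; Lane < 4; ++Lane)
    std::printf("%d ", Mask[Lane] ? R[Lane] : 0); // masked-off results are unused
  std::printf("\n");
}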
for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 - CM.getWideningDecision(IG->getInsertPos(), VF) == - LoopVectorizationCostModel::CM_Interleave); + auto applyIG = [IG, this, &FoldTailByMasking](ElementCount VF) -> bool { + return ( + VF.isVector() && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), FoldTailByMasking, VF) == + LoopVectorizationCostModel::CM_Interleave); }; if (!getDecisionAndClampRange(applyIG, Range)) continue; @@ -8869,7 +9029,7 @@ // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + auto Plan = std::make_unique(Preheader, FoldTailByMasking); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); @@ -8883,8 +9043,8 @@ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + !Plan->foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow(FoldTailByMasking)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9194,7 +9354,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + true, CM.useActiveLaneMaskForControlFlow(false)); return Plan; } @@ -9247,7 +9407,8 @@ VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { + if (blockNeedsPredicationForAnyReason(Plan->foldTailByMasking(), Legal, + R->getParent())) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); @@ -9292,7 +9453,7 @@ // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { + if (Plan->foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9982,16 +10143,16 @@ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Scheme.Width, + VF.Scheme.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.executePlan(VF.Scheme.Width, 1, BestPlan, LB, DT, false); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -10055,7 +10216,7 @@ // When interleaving only scalar and vector cost will be equal, which in turn // would lead to a divide by 0. Fall back to hard threshold. 
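Each VPlan now records at construction whether it folds the tail, and the flag is also printed as part of the plan's name. A hypothetical miniature of that design, with MiniVPlan as an invented stand-in:

#include <cstdio>
#include <string>
#include <utility>

class MiniVPlan {
  bool FoldTailByMasking;
  std::string VFs;

public:
  MiniVPlan(bool FoldTail, std::string VFs)
      : FoldTailByMasking(FoldTail), VFs(std::move(VFs)) {}
  bool foldTailByMasking() const { return FoldTailByMasking; }
  void printName() const {
    // Mirrors the "Tail Folded VF={...}" naming the debug output uses.
    std::printf("Initial VPlan for %sVF={%s},UF>=1\n",
                FoldTailByMasking ? "Tail Folded " : "", VFs.c_str());
  }
};

int main() {
  MiniVPlan(true, "vscale x 1,vscale x 2,vscale x 4").printName();
}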
- if (VF.Width.isScalar()) { + if (VF.Scheme.Width.isScalar()) { if (CheckCost > VectorizeMemoryCheckThreshold) { LLVM_DEBUG( dbgs() @@ -10098,8 +10259,8 @@ // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = VF.Width.getKnownMinValue(); - if (VF.Width.isScalable()) { + unsigned IntVF = VF.Scheme.Width.getKnownMinValue(); + if (VF.Scheme.Width.isScalable()) { unsigned AssumedMinimumVscale = 1; if (VScale) AssumedMinimumVscale = *VScale; @@ -10326,13 +10487,14 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Scheme, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + if (VF.Scheme.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Scheme.Width, + SelectedIC); // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = @@ -10356,7 +10518,7 @@ // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (VF.Width.isScalar()) { + if (VF.Scheme.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -10422,7 +10584,7 @@ << VecDiagMsg.second; }); } else if (VectorizeLoop && !InterleaveLoop) { - LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Scheme.Width << ") in " << DebugLocStr << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, @@ -10430,7 +10592,7 @@ << IntDiagMsg.second; }); } else if (VectorizeLoop && InterleaveLoop) { - LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Scheme.Width << ") in " << DebugLocStr << '\n'); LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } @@ -10446,8 +10608,8 @@ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); + LVP.executePlan(VF.Scheme.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10460,17 +10622,19 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { + CM.selectEpilogueVectorizationFactor(VF.Scheme.Width, LVP); + if (EpilogueVF.Scheme.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); + // TODOD: Predicated remainders + EpilogueLoopVectorizationInfo EPI(VF.Scheme.Width, IC, + EpilogueVF.Scheme.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + VPlan &BestMainPlan = LVP.getBestPlanFor({false, EPI.MainLoopVF}); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; @@ -10483,7 +10647,7 @@ ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + VPlan &BestEpiPlan = LVP.getBestPlanFor({false, EPI.EpilogueVF}); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -10492,7 +10656,8 @@ // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated // before vectorizing the epilogue loop. for (VPRecipeBase &R : Header->phis()) { - if (isa(&R)) + if (isa(&R) || + isa(&R)) continue; Value *ResumeV = nullptr; @@ -10516,7 +10681,7 @@ } ResumeV = MainILV.createInductionResumeValue( - IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); + IndPhi, *ID, {EPI.MainLoopIterationCountCheck}, BestEpiPlan); } assert(ResumeV && "Must have a resume value"); VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); @@ -10530,12 +10695,12 @@ if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, + VF.Scheme.Width, VF.MinProfitableTripCount, IC, + &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); + LVP.executePlan(VF.Scheme.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10549,7 +10714,7 @@ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), L->getHeader()) << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) + << NV("VectorizationFactor", VF.Scheme.Width) << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; }); } Index: llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -64,7 +64,8 @@ /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. - bool shouldWiden(Instruction *I, VFRange &Range) const; + bool shouldWiden(Instruction *I, bool FoldTailByMasking, + VFRange &Range) const; /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a @@ -95,7 +96,7 @@ /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same /// decision from \p Range.Start to \p Range.End. 
VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef Operands, - VFRange &Range) const; + VFRange &Range, VPlanPtr &Plan) const; /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe /// if it can. The function should only be called if the cost-model indicates Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -2262,8 +2262,12 @@ /// Values used outside the plan. MapVector LiveOuts; + /// Whether this plan should foldTailByMasking + bool FoldTailByMasking = false; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(VPBlockBase *Entry = nullptr, bool FoldTailByMasking = false) + : Entry(Entry), FoldTailByMasking(FoldTailByMasking) { if (Entry) Entry->setPlan(this); } @@ -2446,6 +2450,8 @@ return LiveOuts; } + bool foldTailByMasking() const { return FoldTailByMasking; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -778,6 +778,8 @@ std::string Out; raw_string_ostream RSO(Out); RSO << Name << " for "; + if (FoldTailByMasking) + RSO << "Tail Folded "; if (!VFs.empty()) { RSO << "VF={" << VFs[0]; for (ElementCount VF : drop_begin(VFs)) Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS: VPlan 'Initial VPlan for Tail Folded VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { ; VPLANS-NEXT: vector.ph: ; VPLANS-NEXT: EMIT vp<%2> = VF * Part + ir<0> ; VPLANS-NEXT: EMIT vp<%3> = active lane mask vp<%2> Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,11 +1,13 @@ ; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT -; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=DEFAULT target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. +; When both are allowed, the VF=16 without tail folding should win out. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -8,9 +8,9 @@ define i32 @foo() { ; CHECK-LABEL: foo -; CHECK-PWR8: Executing best plan with VF=16, UF=4 +; CHECK-PWR8: Executing best plan with TailFold=false, VF=16, UF=4 -; CHECK-PWR9: Executing best plan with VF=8, UF=8 +; CHECK-PWR9: Executing best plan with TailFold=false, VF=8, UF=8 entry: @@ -46,7 +46,7 @@ ; CHECK-LABEL: goo -; CHECK: Executing best plan with VF=16, UF=4 +; CHECK: Executing best plan with TailFold=false, VF=16, UF=4 entry: br label %for.body @@ -79,7 +79,7 @@ define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar -; CHECK: Executing best plan with VF=2, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=2, UF=12 entry: br label %for.body @@ -107,7 +107,7 @@ define void @hoo(i32 %n) { ; CHECK-LABEL: hoo -; CHECK: Executing best plan with VF=1, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=1, UF=12 entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -108,11 +108,12 @@ ; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 2 ; CHECK-NEXT: LV: VF is vscale x 4 +; CHECK-NEXT: LV: Fold Tail is false ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: @@ -240,11 +241,12 @@ ; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 2 ; CHECK-NEXT: LV: VF is vscale x 4 +; CHECK-NEXT: LV: Fold Tail is false ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -14,7 +14,9 @@ ; ; -; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata. +; This loop will be vectorized, although the trip count is below the threshold, but +; vectorization is explicitly forced in metadata. A VF of 4 is chosen as it evenly +; divides the trip count of 20, producing a lower total cost.
; define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( @@ -27,20 +29,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 20 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -52,7 +54,7 @@ ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -98,7 +98,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, 
ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -167,7 +167,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -241,7 +241,7 @@ ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -339,7 +339,7 @@ ; Test case that requires sinking a recipe in a replicate region after another replicate region. define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -413,7 +413,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ ; Check for crash exposed by D76992. 
; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count Index: llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -16,50 +16,27 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[VEC_IV1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[VEC_IV2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IV3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV1]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV2]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV3]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: store i32 0, ptr [[TMP4]], align 4 ; CHECK-NEXT: store i32 0, ptr [[TMP5]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; CHECK: pred.store.if4: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: store i32 0, ptr [[TMP6]], align 4 ; CHECK-NEXT: store i32 0, ptr [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] -; CHECK: pred.store.continue5: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] -; CHECK: pred.store.if6: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]] -; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] -; CHECK: pred.store.continue7: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.if8: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP10]] -; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.continue9: -; 
CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 15, 12 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 12, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -69,7 +46,7 @@ ; CHECK-NEXT: store i32 0, ptr [[DST_PTR]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: br label %for.body @@ -92,55 +69,32 @@ ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds double, ptr [[PTR1:%.*]], i64 15 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 128 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 96 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 -; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP1]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP9]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP2]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] -; CHECK: pred.store.continue10: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP11]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP3]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 15, 12 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -151,7 +105,7 @@ ; CHECK-NEXT: store double 0.000000e+00, ptr [[ADDR]], align 8 ; CHECK-NEXT: [[PTR]] = getelementptr inbounds double, ptr [[ADDR]], i64 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[PTR]], [[PTR2]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; entry: %ptr2 = getelementptr inbounds double, ptr %ptr1, i64 15 Index: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -74,7 +74,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: 
VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -153,7 +153,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -234,7 +234,7 @@ ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -304,7 +304,7 @@ ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -399,7 +399,7 @@ ; loaded value. define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -503,7 +503,7 @@ ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -607,7 +607,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -707,7 +707,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -769,7 +769,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -980,7 +980,7 @@ ; need to be removed before merging. 
define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count