Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h =================================================================== --- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h +++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h @@ -375,8 +375,9 @@ /// Returns true if vector representation of the instruction \p I /// requires mask. - bool isMaskRequired(const Instruction *I) const { - return MaskedOp.contains(I); + bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const { + return MaskedOp.contains(I) || + (FoldTailByMasking && FoldTailMaskedOp.contains(I)); } unsigned getNumStores() const { return LAI->getNumStores(); } @@ -384,8 +385,9 @@ /// Returns all assume calls in predicated blocks. They need to be dropped /// when flattening the CFG. - const SmallPtrSetImpl &getConditionalAssumes() const { - return ConditionalAssumes; + const SmallPtrSetImpl & + getConditionalAssumes(bool FoldTailByMasking) const { + return FoldTailByMasking ? FoldTailConditionalAssumes : ConditionalAssumes; } PredicatedScalarEvolution *getPredicatedScalarEvolution() const { @@ -545,6 +547,11 @@ /// flattened. SmallPtrSet ConditionalAssumes; + /// Same as MaskedOp above when folding tail by masking. + SmallPtrSet FoldTailMaskedOp; + /// Same as ConditionalAssumes above when folding tail by masking. + SmallPtrSet FoldTailConditionalAssumes; + /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp @@ -1430,14 +1430,11 @@ // The list of pointers that we can safely read and write to remains empty. SmallPtrSet SafePointers; - SmallPtrSet TmpMaskedOp; - SmallPtrSet TmpConditionalAssumes; - // Check and mark all blocks for predication, including those that ordinarily // do not need predication such as the header block. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp, - TmpConditionalAssumes)) { + if (!blockCanBePredicated(BB, SafePointers, FoldTailMaskedOp, + FoldTailConditionalAssumes)) { LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n"); return false; } @@ -1445,10 +1442,6 @@ LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n"); - MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end()); - ConditionalAssumes.insert(TmpConditionalAssumes.begin(), - TmpConditionalAssumes.end()); - return true; } Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -189,7 +189,10 @@ /// Vector width with best cost. ElementCount Width; - /// Cost of the loop with that width. + /// Whether the entire loop is predicated. + bool FoldTailByMasking; + + /// Cost of the loop with that width and vectorization style. InstructionCost Cost; /// Cost of the scalar loop. @@ -199,17 +202,19 @@ /// to runtime checks. 
ElementCount MinProfitableTripCount; - VectorizationFactor(ElementCount Width, InstructionCost Cost, - InstructionCost ScalarCost) - : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {} + VectorizationFactor(ElementCount Width, bool FoldTailByMasking, + InstructionCost Cost, InstructionCost ScalarCost) + : Width(Width), FoldTailByMasking(FoldTailByMasking), Cost(Cost), + ScalarCost(ScalarCost) {} /// Width 1 means no vectorization, cost 0 means uncomputed cost. static VectorizationFactor Disabled() { - return {ElementCount::getFixed(1), 0, 0}; + return {ElementCount::getFixed(1), false, 0, 0}; } bool operator==(const VectorizationFactor &rhs) const { - return Width == rhs.Width && Cost == rhs.Cost; + return Width == rhs.Width && FoldTailByMasking == rhs.FoldTailByMasking && + Cost == rhs.Cost; } bool operator!=(const VectorizationFactor &rhs) const { @@ -266,9 +271,6 @@ /// The legality analysis. LoopVectorizationLegality *Legal; - /// The profitability analysis. - LoopVectorizationCostModel &CM; - /// The interleaved access analysis. InterleavedAccessInfo &IAI; @@ -283,28 +285,40 @@ /// A builder used to construct the current plan. VPBuilder Builder; + /// Profitable vector factors. + SmallVector ProfitableVFs; + public: LoopVectorizationPlanner(Loop *L, LoopInfo *LI, const TargetLibraryInfo *TLI, const TargetTransformInfo *TTI, LoopVectorizationLegality *Legal, - LoopVectorizationCostModel &CM, InterleavedAccessInfo &IAI, PredicatedScalarEvolution &PSE, const LoopVectorizeHints &Hints, OptimizationRemarkEmitter *ORE) - : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI), + : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), IAI(IAI), PSE(PSE), Hints(Hints), ORE(ORE) {} - /// Plan how to best vectorize, return the best VF and its cost, or - /// std::nullopt if vectorization and interleaving should be avoided up front. - std::optional plan(ElementCount UserVF, unsigned UserIC); + /// Plan how to best vectorize with a given cost model. + void plan(LoopVectorizationCostModel &CM, ElementCount UserVF, + unsigned UserIC); + + /// \return The most profitable vectorization factor and the cost of that VF. + /// This method checks every VF in the plans in \p VPlans. If UserVF is not + /// ZERO then this vectorization factor will be selected if vectorization is + /// possible. + std::optional selectVectorizationFactor(); + + VectorizationFactor + selectEpilogueVectorizationFactor(const VectorizationFactor &MainVF); /// Use the VPlan-native path to plan how to best vectorize, return the best /// VF and its cost. - VectorizationFactor planInVPlanNativePath(ElementCount UserVF); + VectorizationFactor planInVPlanNativePath(LoopVectorizationCostModel &CM, + ElementCount UserVF); /// Return the best VPlan for \p VF. - VPlan &getBestPlanFor(ElementCount VF) const; + VPlan &getBestPlanFor(ElementCount VF, bool FoldTailByMasking) const; /// Generate the IR code for the body of the vectorized loop according to the /// best selected \p VF, \p UF and VPlan \p BestPlan. @@ -321,9 +335,10 @@ /// Look through the existing plans and return true if we have one with all /// the vectorization factors in question. - bool hasPlanWithVF(ElementCount VF) const { - return any_of(VPlans, - [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); }); + bool hasPlanWithVF(ElementCount VF, bool FoldTailByMasking) const { + return any_of(VPlans, [&](const VPlanPtr &Plan) { + return Plan->hasVF(VF) && Plan->foldTailByMasking() == FoldTailByMasking; + }); } /// Test a \p Predicate on a \p Range of VF's. 
Return the value of applying @@ -340,13 +355,14 @@ /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. - void buildVPlans(ElementCount MinVF, ElementCount MaxVF); + void buildVPlans(LoopVectorizationCostModel &CM, ElementCount MinVF, + ElementCount MaxVF); private: /// Build a VPlan according to the information gathered by Legal. \return a /// VPlan for vectorization factors \p Range.Start and up to \p Range.End /// exclusive, possibly decreasing \p Range.End. - VPlanPtr buildVPlan(VFRange &Range); + VPlanPtr buildVPlan(LoopVectorizationCostModel &CM, VFRange &Range); /// Build a VPlan using VPRecipes according to the information gather by /// Legal. This method is only used for the legacy inner loop vectorizer. @@ -355,21 +371,39 @@ /// set the largest included VF to the maximum VF for which no plan could be /// built. std::optional tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions); + LoopVectorizationCostModel &CM, VFRange &Range, + SmallPtrSetImpl &DeadInstructions); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(LoopVectorizationCostModel &CM, + ElementCount MinVF, ElementCount MaxVF); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be // converted to reductions, with one operand being vector and the other being // the scalar reduction chain. For other reductions, a select is introduced // between the phi and live-out recipes when folding the tail. - void adjustRecipesForReductions(VPBasicBlock *LatchVPBB, VPlanPtr &Plan, + void adjustRecipesForReductions(LoopVectorizationCostModel &CM, + VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, ElementCount MinVF); + + /// Determines if we have the infrastructure to vectorize loop \p L and its + /// epilogue, assuming the main loop is vectorized by \p VF. + bool isCandidateForEpilogueVectorization(const Loop &L, + const ElementCount VF) const; + + /// Returns true if the per-lane cost of VectorizationFactor A is lower than + /// that of B. + bool isMoreProfitable(const VectorizationFactor &A, + const VectorizationFactor &B) const; + + /// Returns true if epilogue vectorization is considered profitable, and + /// false otherwise. + /// \p VF is the vectorization factor chosen for the original loop. + bool isEpilogueVectorizationProfitable(const ElementCount VF) const; }; } // namespace llvm Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -234,7 +234,6 @@ "force-tail-folding-style", cl::desc("Force the tail folding style"), cl::init(TailFoldingStyle::None), cl::values( - clEnumValN(TailFoldingStyle::None, "none", "Disable tail folding"), clEnumValN( TailFoldingStyle::Data, "data", "Create lane mask for data only, using active.lane.mask intrinsic"), @@ -1170,18 +1169,18 @@ /// different operations. 
 class LoopVectorizationCostModel {
 public:
-  LoopVectorizationCostModel(ScalarEpilogueLowering SEL, Loop *L,
-                             PredicatedScalarEvolution &PSE, LoopInfo *LI,
-                             LoopVectorizationLegality *Legal,
+  LoopVectorizationCostModel(bool FoldTailByMasking, ScalarEpilogueLowering SEL,
+                             Loop *L, PredicatedScalarEvolution &PSE,
+                             LoopInfo *LI, LoopVectorizationLegality *Legal,
                              const TargetTransformInfo &TTI,
                              const TargetLibraryInfo *TLI, DemandedBits *DB,
                              AssumptionCache *AC,
                              OptimizationRemarkEmitter *ORE, const Function *F,
                              const LoopVectorizeHints *Hints,
                              InterleavedAccessInfo &IAI)
-      : ScalarEpilogueStatus(SEL), TheLoop(L), PSE(PSE), LI(LI), Legal(Legal),
-        TTI(TTI), TLI(TLI), DB(DB), AC(AC), ORE(ORE), TheFunction(F),
-        Hints(Hints), InterleaveInfo(IAI) {}
+      : ScalarEpilogueStatus(SEL), FoldTailByMasking(FoldTailByMasking),
+        TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
+        AC(AC), ORE(ORE), TheFunction(F), Hints(Hints), InterleaveInfo(IAI) {}

   /// \return An upper bound for the vectorization factors (both fixed and
   /// scalable). If the factors are 0, vectorization and interleaving should be
   /// avoided up front.
@@ -1192,17 +1191,6 @@
   /// otherwise.
   bool runtimeChecksRequired();

-  /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every VF in \p CandidateVFs. If UserVF is not ZERO
-  /// then this vectorization factor will be selected if vectorization is
-  /// possible.
-  VectorizationFactor
-  selectVectorizationFactor(const ElementCountSet &CandidateVFs);
-
-  VectorizationFactor
-  selectEpilogueVectorizationFactor(const ElementCount MaxVF,
-                                    const LoopVectorizationPlanner &LVP);
-
   /// Setup cost-based decisions for user vectorization factor.
   /// \return true if the UserVF is a feasible VF to be chosen.
   bool selectUserVectorizationFactor(ElementCount UserVF) {
@@ -1560,16 +1548,20 @@
     return IsRequired;
   }

-  /// Returns true if a scalar epilogue is not allowed due to optsize or a
-  /// loop hint annotation.
+  /// Returns false if a scalar epilogue is not allowed due to, for example,
+  /// optsize or tail folding. It is used either as a check for when
+  /// interleaving/epilogue vectorization can occur, or for checking cases
+  /// where an epilogue would be required for correctness.
   bool isScalarEpilogueAllowed() const {
-    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed;
+    return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed ||
+           (!FoldTailByMasking &&
+            ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate);
   }

   /// Returns the TailFoldingStyle that is best for the current loop.
   TailFoldingStyle getTailFoldingStyle(bool IVUpdateMayOverflow = true) const {
-    if (!CanFoldTailByMasking)
+    if (!FoldTailByMasking)
       return TailFoldingStyle::None;

     if (ForceTailFoldingStyle.getNumOccurrences())
@@ -1579,9 +1571,7 @@
   }

   /// Returns true if all loop blocks should be masked to fold tail loop.
-  bool foldTailByMasking() const {
-    return getTailFoldingStyle() != TailFoldingStyle::None;
-  }
+  bool foldTailByMasking() const { return FoldTailByMasking; }

   /// Returns true if the instructions in this block requires predication
   /// for any reason, e.g. because tail folding now requires a predicate
@@ -1620,11 +1610,6 @@
                                Function **Variant,
                                bool *NeedsMask = nullptr) const;

-  /// Returns true if the per-lane cost of VectorizationFactor A is lower than
-  /// that of B.
-  bool isMoreProfitable(const VectorizationFactor &A,
-                        const VectorizationFactor &B) const;
-
   /// Invalidates decisions already taken by the cost model.
void invalidateCostModelingDecisions() { WideningDecisions.clear(); @@ -1632,6 +1617,26 @@ Scalars.clear(); } + /// The vectorization cost is a combination of the cost itself and a boolean + /// indicating whether any of the contributing operations will actually + /// operate on vector values after type legalization in the backend. If this + /// latter value is false, then all operations will be scalarized (i.e. no + /// vectorization has actually taken place). + using VectorizationCostTy = std::pair; + + /// Returns the expected execution cost. The unit of the cost does + /// not matter because we use the 'cost' units to compare different + /// vector widths. The cost that is returned is *not* normalized by + /// the factor width. If \p Invalid is not nullptr, this function + /// will add a pair(Instruction*, ElementCount) to \p Invalid for + /// each instruction that has an Invalid cost for the given VF. + VectorizationCostTy + expectedCost(ElementCount VF, + SmallVectorImpl *Invalid = nullptr); + + /// Return the NumPredStores, to be checked by the Planner. + unsigned getNumPredStores() { return NumPredStores; } + private: unsigned NumPredStores = 0; @@ -1657,23 +1662,6 @@ /// of elements. ElementCount getMaxLegalScalableVF(unsigned MaxSafeElements); - /// The vectorization cost is a combination of the cost itself and a boolean - /// indicating whether any of the contributing operations will actually - /// operate on vector values after type legalization in the backend. If this - /// latter value is false, then all operations will be scalarized (i.e. no - /// vectorization has actually taken place). - using VectorizationCostTy = std::pair; - - /// Returns the expected execution cost. The unit of the cost does - /// not matter because we use the 'cost' units to compare different - /// vector widths. The cost that is returned is *not* normalized by - /// the factor width. If \p Invalid is not nullptr, this function - /// will add a pair(Instruction*, ElementCount) to \p Invalid for - /// each instruction that has an Invalid cost for the given VF. - VectorizationCostTy - expectedCost(ElementCount VF, - SmallVectorImpl *Invalid = nullptr); - /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); @@ -1745,7 +1733,7 @@ ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool CanFoldTailByMasking = false; + bool FoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the @@ -1836,16 +1824,6 @@ Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); } - /// Determines if we have the infrastructure to vectorize loop \p L and its - /// epilogue, assuming the main loop is vectorized by \p VF. - bool isCandidateForEpilogueVectorization(const Loop &L, - const ElementCount VF) const; - - /// Returns true if epilogue vectorization is considered profitable, and - /// false otherwise. - /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; - public: /// The loop that we evaluate. Loop *TheLoop; @@ -1891,9 +1869,6 @@ /// All element types found in the loop. SmallPtrSet ElementTypesInLoop; - - /// Profitable vector factors. 
- SmallVector ProfitableVFs; }; } // end namespace llvm @@ -3455,7 +3430,7 @@ Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; - bool MaskRequired = Legal->isMaskRequired(CI); + bool MaskRequired = Legal->isMaskRequired(foldTailByMasking(), CI); for (auto &ArgOp : CI->args()) ScalarTys.push_back(ArgOp->getType()); @@ -3941,7 +3916,7 @@ // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { + if (State.Plan->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; @@ -4471,7 +4446,7 @@ return false; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) + if (!Legal->isMaskRequired(foldTailByMasking(), I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we @@ -4498,13 +4473,13 @@ // context sensitive reasoning return !isSafeToSpeculativelyExecute(I); case Instruction::Call: - return Legal->isMaskRequired(I); + return Legal->isMaskRequired(foldTailByMasking(), I); } } std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, - ElementCount VF) const { + ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || @@ -4610,7 +4585,7 @@ // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + Legal->isMaskRequired(foldTailByMasking(), I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -5110,12 +5085,22 @@ case CM_ScalarEpilogueAllowed: return computeFeasibleMaxVF(TC, UserVF, false); case CM_ScalarEpilogueNotAllowedUsePredicate: - [[fallthrough]]; + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Not allowing scalar epilogue, creating " + "predicated vector loop.\n"); + // We cannot add a scalar tail, but fall through to the code below both with + // and without FoldTailByMasking. FoldTailByMasking=false will only be + // allowed if the trip count is known to be a multiple of the VF. Otherwise + // FoldTailByMasking=true plans will be used. + break; case CM_ScalarEpilogueNotNeededUsePredicate: - LLVM_DEBUG( - dbgs() << "LV: vector predicate hint/switch found.\n" - << "LV: Not allowing scalar epilogue, creating predicated " - << "vector loop.\n"); + // If this cost model is for predicated plans then fall through to the + // prepareToFoldTailByMasking checks below, else return the unpredicated max + // size. + if (!FoldTailByMasking) + return computeFeasibleMaxVF(TC, UserVF, false); + LLVM_DEBUG(dbgs() << "LV: vector predicate hint/switch found.\n" + << "LV: Trying predicated vector loop.\n"); break; case CM_ScalarEpilogueNotAllowedLowTripLoop: // fallthrough as a special case of OptForSize @@ -5139,17 +5124,8 @@ // a bottom-test and a single exiting block. We'd have to handle the fact // that not every instruction executes on the last iteration. This will // require a lane mask which varies through the vector loop body. 
(TODO) - if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) { - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - return computeFeasibleMaxVF(TC, UserVF, false); - } + if (TheLoop->getExitingBlock() != TheLoop->getLoopLatch()) return FixedScalableVFPair::getNone(); - } // Now try the tail folding @@ -5194,25 +5170,22 @@ if (Rem->isZero()) { // Accept MaxFixedVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); - return MaxFactors; + return FoldTailByMasking ? FixedScalableVFPair::getNone() : MaxFactors; } } + // If this cost model is not for tail folding then return at this point and + // leave it for the other model. + if (!FoldTailByMasking && + ScalarEpilogueStatus != CM_ScalarEpilogueNotNeededUsePredicate) + return FixedScalableVFPair::getNone(); + // If we don't know the precise trip count, or if the trip count that we // found modulo the vectorization factor is not zero, try to fold the tail // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - CanFoldTailByMasking = true; - return MaxFactors; - } - - // If there was a tail-folding hint/switch, but we can't fold the tail by - // masking, fallback to a vectorization with a scalar epilogue. - if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) { - LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: vectorize with a " - "scalar epilogue instead.\n"); - ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; + assert(FoldTailByMasking); return MaxFactors; } @@ -5358,12 +5331,12 @@ return TTI.getVScaleForTuning(); } -bool LoopVectorizationCostModel::isMoreProfitable( +bool LoopVectorizationPlanner::isMoreProfitable( const VectorizationFactor &A, const VectorizationFactor &B) const { InstructionCost CostA = A.Cost; InstructionCost CostB = B.Cost; - unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); + unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(OrigLoop); if (!A.Width.isScalable() && !B.Width.isScalable() && MaxTripCount) { // If the trip count is a known (possibly small) constant, the trip count @@ -5374,15 +5347,20 @@ // some extra overheads, but for the purpose of comparing the costs of // different VFs we can use this to compare the total loop-body cost // expected after vectorization. - auto GetCostForTC = [MaxTripCount, this](unsigned VF, - InstructionCost VectorCost, - InstructionCost ScalarCost) { - return foldTailByMasking() ? VectorCost * divideCeil(MaxTripCount, VF) - : VectorCost * (MaxTripCount / VF) + - ScalarCost * (MaxTripCount % VF); + auto GetCostForTC = [MaxTripCount](bool FoldTailByMasking, unsigned VF, + InstructionCost VectorCost, + InstructionCost ScalarCost) { + return FoldTailByMasking ? 
VectorCost * divideCeil(MaxTripCount, VF) + : VectorCost * (MaxTripCount / VF) + + ScalarCost * (MaxTripCount % VF); }; - auto RTCostA = GetCostForTC(A.Width.getFixedValue(), CostA, A.ScalarCost); - auto RTCostB = GetCostForTC(B.Width.getFixedValue(), CostB, B.ScalarCost); + auto RTCostA = GetCostForTC(A.FoldTailByMasking, A.Width.getFixedValue(), + CostA, A.ScalarCost); + auto RTCostB = GetCostForTC(B.FoldTailByMasking, B.Width.getFixedValue(), + CostB, B.ScalarCost); + + if (A.FoldTailByMasking && !B.FoldTailByMasking) + return RTCostA <= RTCostB; return RTCostA < RTCostB; } @@ -5390,13 +5368,19 @@ // Improve estimate for the vector width if it is scalable. unsigned EstimatedWidthA = A.Width.getKnownMinValue(); unsigned EstimatedWidthB = B.Width.getKnownMinValue(); - if (std::optional VScale = getVScaleForTuning(TheFunction, TTI)) { + if (std::optional VScale = + getVScaleForTuning(OrigLoop->getHeader()->getParent(), *TTI)) { if (A.Width.isScalable()) EstimatedWidthA *= *VScale; if (B.Width.isScalable()) EstimatedWidthB *= *VScale; } + // If one plan is predicated and the other is not, opt for the predicated + // scheme on a tie. + if (A.FoldTailByMasking && !B.FoldTailByMasking) + return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. @@ -5473,20 +5457,45 @@ } while (!Tail.empty()); } -VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( - const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; +std::optional +LoopVectorizationPlanner::selectVectorizationFactor() { + LLVM_DEBUG(printPlans(dbgs())); + + // If we had no plans as they were all invalid, return the invalid cost + if (VPlans.size() == 0) + return std::nullopt; + + // If we only have one plan due to the UserVF, return it. We try with both + // predicated and unpredicated loops. 
+  ElementCount UserVF = Hints.getWidth();
+  bool UserPredicated = Hints.getPredicate();
+  if (UserVF && hasPlanWithVF(UserVF, UserPredicated)) {
+    VPlan &Plan = getBestPlanFor(UserVF, UserPredicated);
+    auto Cost = Plan.getCostModel()->expectedCost(UserVF);
+    if (Cost.first.isValid())
+      return VectorizationFactor(UserVF, UserPredicated, Cost.first, 0);
+  } else if (UserVF && hasPlanWithVF(UserVF, !UserPredicated)) {
+    VPlan &Plan = getBestPlanFor(UserVF, !UserPredicated);
+    auto Cost = Plan.getCostModel()->expectedCost(UserVF);
+    if (Cost.first.isValid())
+      return VectorizationFactor(UserVF, !UserPredicated, Cost.first, 0);
+  }
+
+  assert(VPlans[0]->hasScalarVFOnly() &&
+         "Expected Scalar VPlan to be the first candidate");
+
+  InstructionCost ExpectedCost =
+      VPlans[0]->getCostModel()->expectedCost(ElementCount::getFixed(1)).first;
   LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n");
   assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop");
-  assert(VFCandidates.count(ElementCount::getFixed(1)) &&
-         "Expected Scalar VF to be a candidate");

-  const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost,
-                                       ExpectedCost);
+  const VectorizationFactor ScalarCost(ElementCount::getFixed(1),
+                                       VPlans[0]->foldTailByMasking(),
+                                       ExpectedCost, ExpectedCost);
   VectorizationFactor ChosenFactor = ScalarCost;

-  bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
-  if (ForceVectorization && VFCandidates.size() > 1) {
+  bool ForceVectorization = Hints.getForce() == LoopVectorizeHints::FK_Enabled;
+  if (ForceVectorization && VPlans.size() > 1) {
     // Ignore scalar width, because the user explicitly wants vectorization.
     // Initialize cost to max so that VF = 2 is, at least, chosen during cost
     // evaluation.
@@ -5494,63 +5503,80 @@
   }

   SmallVector<InstructionVFPair> InvalidCosts;
-  for (const auto &i : VFCandidates) {
-    // The cost for scalar VF=1 is already calculated, so ignore it.
-    if (i.isScalar())
-      continue;
+  for (const VPlanPtr &VPlan : drop_begin(VPlans)) {
+    for (const ElementCount &i : VPlan->getVFs()) {
+      // The cost for scalar VF=1 is already calculated, so ignore it.
+      if (i.isScalar())
+        continue;

-    VectorizationCostTy C = expectedCost(i, &InvalidCosts);
-    VectorizationFactor Candidate(i, C.first, ScalarCost.ScalarCost);
+      LoopVectorizationCostModel::VectorizationCostTy C =
+          VPlan->getCostModel()->expectedCost(i, &InvalidCosts);
+      VectorizationFactor Candidate(i, VPlan->foldTailByMasking(), C.first,
+                                    ScalarCost.ScalarCost);

 #ifndef NDEBUG
-    unsigned AssumedMinimumVscale = 1;
-    if (std::optional<unsigned> VScale = getVScaleForTuning(TheFunction, TTI))
-      AssumedMinimumVscale = *VScale;
-    unsigned Width =
-        Candidate.Width.isScalable()
-            ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale
-            : Candidate.Width.getFixedValue();
-    LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i << " costs: "
-                      << Candidate.Cost << " => " << (Candidate.Cost / Width));
-    if (i.isScalable())
-      LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of "
-                        << AssumedMinimumVscale << ")");
-    LLVM_DEBUG(dbgs() << ".\n");
-#endif
-
-    if (!C.second && !ForceVectorization) {
+      unsigned AssumedMinimumVscale = 1;
+      if (std::optional<unsigned> VScale =
+              getVScaleForTuning(OrigLoop->getHeader()->getParent(), *TTI))
+        AssumedMinimumVscale = *VScale;
+      unsigned Width =
+          Candidate.Width.isScalable()
+              ?
Candidate.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Width.getFixedValue(); LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i - << " because it will not generate any vector instructions.\n"); - continue; - } + dbgs() << "LV: " << (VPlan->foldTailByMasking() ? "Tail folded " : "") + << "Vector loop of width " << i << " costs: " << Candidate.Cost + << " => " << (Candidate.Cost / Width)); + if (i.isScalable()) + LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " + << AssumedMinimumVscale << ")"); + LLVM_DEBUG(dbgs() << ".\n"); +#endif - // If profitable add it to ProfitableVF list. - if (isMoreProfitable(Candidate, ScalarCost)) - ProfitableVFs.push_back(Candidate); + if (!C.second && !ForceVectorization) { + LLVM_DEBUG( + dbgs() + << "LV: Not considering vector loop of width " << i + << " because it will not generate any vector instructions.\n"); + continue; + } - if (isMoreProfitable(Candidate, ChosenFactor)) - ChosenFactor = Candidate; - } + // FIXME: Possibly remove EnableCondStoresVectorization now. + if (!EnableCondStoresVectorization && + VPlan->getCostModel()->getNumPredStores()) { + reportVectorizationFailure( + "There are conditional stores.", + "store that is conditionally executed prevents vectorization", + "ConditionalStore", ORE, OrigLoop); + continue; + } - emitInvalidCostRemarks(InvalidCosts, ORE, TheLoop); + // If profitable add it to ProfitableVF list. + if (isMoreProfitable(Candidate, ScalarCost)) + ProfitableVFs.push_back(Candidate); - if (!EnableCondStoresVectorization && NumPredStores) { - reportVectorizationFailure("There are conditional stores.", - "store that is conditionally executed prevents vectorization", - "ConditionalStore", ORE, TheLoop); - ChosenFactor = ScalarCost; + if (isMoreProfitable(Candidate, ChosenFactor)) + ChosenFactor = Candidate; + } } - LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + emitInvalidCostRemarks(InvalidCosts, ORE, OrigLoop); + + LLVM_DEBUG({ + if (ForceVectorization && !ChosenFactor.Width.isScalar() && + !isMoreProfitable(ChosenFactor, ScalarCost)) + dbgs() << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"; + }); + LLVM_DEBUG(dbgs() << "LV: Selecting " + << (ChosenFactor.FoldTailByMasking ? "Tail folded " : "") + << "VF: " << ChosenFactor.Width << ".\n"); + assert((ChosenFactor.Width.isScalar() || ChosenFactor.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); return ChosenFactor; } -bool LoopVectorizationCostModel::isCandidateForEpilogueVectorization( +bool LoopVectorizationPlanner::isCandidateForEpilogueVectorization( const Loop &L, ElementCount VF) const { // Cross iteration phis such as reductions need special handling and are // currently unsupported. @@ -5581,7 +5607,7 @@ return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationPlanner::isEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5589,40 +5615,40 @@ // with vectorization factors larger than a certain value. // Allow the target to opt out entirely. 
- if (!TTI.preferEpilogueVectorization()) + if (!TTI->preferEpilogueVectorization()) return false; // We also consider epilogue vectorization unprofitable for targets that don't // consider interleaving beneficial (eg. MVE). - if (TTI.getMaxInterleaveFactor(VF) <= 1) + if (TTI->getMaxInterleaveFactor(VF) <= 1) return false; unsigned Multiplier = 1; if (VF.isScalable()) - Multiplier = getVScaleForTuning(TheFunction, TTI).value_or(1); + Multiplier = getVScaleForTuning(OrigLoop->getHeader()->getParent(), *TTI) + .value_or(1); if ((Multiplier * VF.getKnownMinValue()) >= EpilogueVectorizationMinVF) return true; return false; } -VectorizationFactor -LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { +VectorizationFactor LoopVectorizationPlanner::selectEpilogueVectorizationFactor( + const VectorizationFactor &MainLoopVF) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n"); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " - "epilogue is allowed.\n"); + if (MainLoopVF.FoldTailByMasking) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue not required as the vector loop is " + "predicated.\n";); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { + if (!isCandidateForEpilogueVectorization(*OrigLoop, MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " "is not a supported candidate.\n"); return Result; @@ -5631,8 +5657,8 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n"); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0, 0}; + if (hasPlanWithVF(ForcedEC, false)) + return {ForcedEC, false, 0, 0}; else { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization forced factor is not " "viable.\n"); @@ -5640,14 +5666,14 @@ } } - if (TheLoop->getHeader()->getParent()->hasOptSize() || - TheLoop->getHeader()->getParent()->hasMinSize()) { + Function *TheFunction = OrigLoop->getHeader()->getParent(); + if (TheFunction->hasOptSize() || TheFunction->hasMinSize()) { LLVM_DEBUG( dbgs() << "LEV: Epilogue vectorization skipped due to opt for size.\n"); return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (!isEpilogueVectorizationProfitable(MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5656,19 +5682,20 @@ // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4. 
- ElementCount EstimatedRuntimeVF = MainLoopVF; - if (MainLoopVF.isScalable()) { - EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); - if (std::optional VScale = getVScaleForTuning(TheFunction, TTI)) + ElementCount EstimatedRuntimeVF = MainLoopVF.Width; + if (MainLoopVF.Width.isScalable()) { + EstimatedRuntimeVF = + ElementCount::getFixed(MainLoopVF.Width.getKnownMinValue()); + if (std::optional VScale = getVScaleForTuning(TheFunction, *TTI)) EstimatedRuntimeVF *= *VScale; } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && + if (((!NextVF.Width.isScalable() && MainLoopVF.Width.isScalable() && ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && + ElementCount::isKnownLT(NextVF.Width, MainLoopVF.Width)) && (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + hasPlanWithVF(NextVF.Width, NextVF.FoldTailByMasking)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) @@ -6529,7 +6556,7 @@ "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (Legal->isMaskRequired(foldTailByMasking(), I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6583,7 +6610,8 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(foldTailByMasking(), I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } @@ -6618,11 +6646,12 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(foldTailByMasking(), I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(foldTailByMasking(), I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, @@ -7338,8 +7367,9 @@ return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return Legal->isMaskRequired(foldTailByMasking(), I) + ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -7508,7 +7538,8 @@ } VectorizationFactor -LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) { +LoopVectorizationPlanner::planInVPlanNativePath(LoopVectorizationCostModel &CM, + ElementCount UserVF) { assert(!UserVF.isScalable() && "scalable vectors not yet supported"); ElementCount VF = UserVF; // Outer loop handling: They may require CFG and instruction level @@ -7537,13 +7568,13 @@ "VF needs to be a power of two"); LLVM_DEBUG(dbgs() << "LV: Using " << (!UserVF.isZero() ? 
"user " : "") << "VF " << VF << " to build VPlans.\n"); - buildVPlans(VF, VF); + buildVPlans(CM, VF, VF); // For VPlan build stress testing, we bail out after VPlan construction. if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + return {VF, false /*TailFold*/, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7552,12 +7583,15 @@ return VectorizationFactor::Disabled(); } -std::optional -LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) { +void LoopVectorizationPlanner::plan(LoopVectorizationCostModel &CM, + ElementCount UserVF, unsigned UserIC) { + CM.collectValuesToIgnore(); + CM.collectElementTypesForWidening(); + assert(OrigLoop->isInnermost() && "Inner loop expected."); FixedScalableVFPair MaxFactors = CM.computeMaxVF(UserVF, UserIC); if (!MaxFactors) // Cases that should not to be vectorized nor interleaved. - return std::nullopt; + return; // Invalidate interleave groups if all blocks of loop will be predicated. if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && @@ -7584,66 +7618,29 @@ if (CM.selectUserVectorizationFactor(UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); - if (!hasPlanWithVF(UserVF)) { - LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF - << ".\n"); - return std::nullopt; - } - - LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + buildVPlansWithVPRecipes(CM, UserVF, UserVF); + return; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); } - // Populate the set of Vectorization Factor Candidates. - ElementCountSet VFCandidates; - for (auto VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); - for (auto VF = ElementCount::getScalable(1); - ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); - - for (const auto &VF : VFCandidates) { - // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); - - // Collect the instructions (and their associated costs) that will be more - // profitable to scalarize. - if (VF.isVector()) - CM.collectInstsToScalarize(VF); - } - CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); - - LLVM_DEBUG(printPlans(dbgs())); - if (!MaxFactors.hasVector()) - return VectorizationFactor::Disabled(); - - // Select the optimal vectorization factor. 
- VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); - if (!hasPlanWithVF(VF.Width)) { - LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << VF.Width - << ".\n"); - return std::nullopt; - } - return VF; + buildVPlansWithVPRecipes(CM, ElementCount::getFixed(1), MaxFactors.FixedVF); + buildVPlansWithVPRecipes(CM, ElementCount::getScalable(1), MaxFactors.ScalableVF); } -VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { +VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF, + bool FoldTailByMasking) const { assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && + [VF, FoldTailByMasking](const VPlanPtr &Plan) { + return Plan->hasVF(VF) && + Plan->foldTailByMasking() == FoldTailByMasking; + }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { - if (Plan->hasVF(VF)) + if (Plan->hasVF(VF) && Plan->foldTailByMasking() == FoldTailByMasking) return *Plan.get(); } llvm_unreachable("No plan found!"); @@ -7693,8 +7690,9 @@ assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + LLVM_DEBUG(dbgs() << "Executing best plan with TailFold=" + << (BestVPlan.foldTailByMasking() ? "true" : "false") + << ", VF=" << BestVF << ", UF=" << BestUF << '\n'); // Workaround! Compute the trip count of the original loop and cache it // before we start modifying the CFG. This code has a systemic problem @@ -8093,12 +8091,13 @@ /// of VF's starting at a given VF and extending it as much as possible. Each /// vectorization decision can potentially shorten this sub-range during /// buildVPlan(). -void LoopVectorizationPlanner::buildVPlans(ElementCount MinVF, +void LoopVectorizationPlanner::buildVPlans(LoopVectorizationCostModel &CM, + ElementCount MinVF, ElementCount MaxVF) { auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - VPlans.push_back(buildVPlan(SubRange)); + VPlans.push_back(buildVPlan(CM, SubRange)); VF = SubRange.End; } } @@ -8164,7 +8163,7 @@ if (!CM.blockNeedsPredicationForAnyReason(BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - assert(CM.foldTailByMasking() && "must fold the tail"); + assert(Plan.foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. @@ -8236,7 +8235,7 @@ return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), I)) Mask = createBlockInMask(I->getParent(), *Plan); // Determine if the pointer operand of the access is either consecutive or @@ -8452,7 +8451,7 @@ // vector variant at this VF requires a mask, so we synthesize an // all-true mask. 
VPValue *Mask = nullptr; - if (Legal->isMaskRequired(CI)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), CI)) Mask = createBlockInMask(CI->getParent(), *Plan); else Mask = Plan->getVPValueOrAddLiveIn(ConstantInt::getTrue( @@ -8703,22 +8702,33 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, - ElementCount MaxVF) { +void LoopVectorizationPlanner::buildVPlansWithVPRecipes( + LoopVectorizationCostModel &CM, ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); + for (ElementCount VF = MinVF; ElementCount::isKnownLE(VF, MaxVF); VF *= 2) { + // Collect Uniform and Scalar instructions after vectorization with VF. + CM.collectUniformsAndScalars(VF); + + // Collect the instructions (and their associated costs) that will be more + // profitable to scalarize. + if (VF.isVector()) + CM.collectInstsToScalarize(VF); + } + // Add assume instructions we need to drop to DeadInstructions, to prevent // them from being added to the VPlan. // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); + auto &ConditionalAssumes = + Legal->getConditionalAssumes(CM.foldTailByMasking()); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); auto MaxVFTimes2 = MaxVF * 2; for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) { VFRange SubRange = {VF, MaxVFTimes2}; - if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange, DeadInstructions)) + if (auto Plan = tryToBuildVPlanWithVPRecipes(CM, SubRange, DeadInstructions)) VPlans.push_back(std::move(*Plan)); VF = SubRange.End; } @@ -8851,7 +8861,8 @@ } std::optional LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions) { + LoopVectorizationCostModel &CM, VFRange &Range, + SmallPtrSetImpl &DeadInstructions) { SmallPtrSet *, 1> InterleaveGroups; @@ -8885,7 +8896,7 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { + auto applyIG = [IG, &CM](ElementCount VF) -> bool { return (VF.isVector() && // Query is illegal for VF == 1 CM.getWideningDecision(IG->getInsertPos(), VF) == LoopVectorizationCostModel::CM_Interleave); @@ -8907,7 +8918,7 @@ // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + auto Plan = std::make_unique(CM.foldTailByMasking(), &CM, Preheader); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); @@ -9031,8 +9042,8 @@ VPlanTransforms::removeRedundantInductionCasts(*Plan); // Adjust the recipes for any inloop reductions. - adjustRecipesForReductions(cast(TopRegion->getExiting()), Plan, - RecipeBuilder, Range.Start); + adjustRecipesForReductions(CM, cast(TopRegion->getExiting()), + Plan, RecipeBuilder, Range.Start); // Sink users of fixed-order recurrence past the recipe defining the previous // value and introduce FirstOrderRecurrenceSplice VPInstructions. 
@@ -9092,7 +9103,8 @@ return std::make_optional(std::move(Plan)); } -VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) { +VPlanPtr LoopVectorizationPlanner::buildVPlan(LoopVectorizationCostModel &CM, + VFRange &Range) { // Outer loop handling: They may require CFG and instruction level // transformations before even evaluating whether vectorization is profitable. // Since we cannot modify the incoming IR, we need to build VPlan upfront in @@ -9101,7 +9113,7 @@ assert(EnableVPlanNativePath && "VPlan-native path is not enabled."); // Create new empty VPlan - auto Plan = std::make_unique(); + auto Plan = std::make_unique(CM.foldTailByMasking(), &CM); // Build hierarchical CFG VPlanHCFGBuilder HCFGBuilder(OrigLoop, LI, *Plan); @@ -9123,7 +9135,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - CM.getTailFoldingStyle()); + CM.getTailFoldingStyle(false)); return Plan; } @@ -9133,8 +9145,8 @@ // reduction chain. For other reductions, a select is introduced between the phi // and live-out recipes when folding the tail. void LoopVectorizationPlanner::adjustRecipesForReductions( - VPBasicBlock *LatchVPBB, VPlanPtr &Plan, VPRecipeBuilder &RecipeBuilder, - ElementCount MinVF) { + LoopVectorizationCostModel &CM, VPBasicBlock *LatchVPBB, VPlanPtr &Plan, + VPRecipeBuilder &RecipeBuilder, ElementCount MinVF) { for (const auto &Reduction : CM.getInLoopReductionChains()) { PHINode *Phi = Reduction.first; const RecurrenceDescriptor &RdxDesc = @@ -9221,7 +9233,7 @@ // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { + if (Plan->foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9888,12 +9900,10 @@ ScalarEpilogueLowering SEL = getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, *LVL, &IAI); - LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, - &Hints, IAI); + LoopVectorizationCostModel CM(false, SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, + ORE, F, &Hints, IAI); // Use the planner for outer loop vectorization. - // TODO: CM is not used at this point inside the planner. Turn CM into an - // optional argument if we don't need it in the future. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, IAI, PSE, Hints, ORE); // Get user vectorization factor. ElementCount UserVF = Hints.getWidth(); @@ -9901,7 +9911,7 @@ CM.collectElementTypesForWidening(); // Plan how to best vectorize, return the best VF and its cost. - const VectorizationFactor VF = LVP.planInVPlanNativePath(UserVF); + const VectorizationFactor VF = LVP.planInVPlanNativePath(CM, UserVF); // If we are stress testing VPlan builds, do not attempt to generate vector // code. Masked vector code generation support will follow soon. 
@@ -9909,7 +9919,7 @@ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, @@ -10141,7 +10151,7 @@ assert(L->isInnermost() && "Inner loop expected."); - InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL.getLAI()); + InterleavedAccessInfo BaseIAI(PSE, L, DT, LI, LVL.getLAI()); bool UseInterleaved = TTI->enableInterleavedAccessVectorization(); // If an override option has been passed in for interleaved accesses, use it. @@ -10150,12 +10160,12 @@ // Analyze interleaved memory accesses. if (UseInterleaved) - IAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + BaseIAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = - getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &IAI); + getScalarEpilogueLowering(F, L, Hints, PSI, BFI, TTI, TLI, LVL, &BaseIAI); // Check the loop for a trip count threshold: vectorize loops with a tiny trip // count by optimizing for size, to minimize overheads. @@ -10229,21 +10239,31 @@ return false; } - // Use the cost model. - LoopVectorizationCostModel CM(SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, - F, &Hints, IAI); - CM.collectValuesToIgnore(); - CM.collectElementTypesForWidening(); - // Use the planner for vectorization. - LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE, Hints, ORE); + LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, BaseIAI, PSE, Hints, ORE); // Get user vectorization factor and interleave count. ElementCount UserVF = Hints.getWidth(); unsigned UserIC = Hints.getInterleave(); + // We plan with two different cost models with FoldTailByMasking = false and + // true, adding the useful vplans from each and picking the best below in + // selectVectorizationFactor. + LoopVectorizationCostModel BaseCM(false, SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, BaseIAI); + LVP.plan(BaseCM, UserVF, UserIC); + + InterleavedAccessInfo PredIAI(PSE, L, DT, LI, LVL.getLAI()); + if (UseInterleaved && SEL != CM_ScalarEpilogueAllowed) + PredIAI.analyzeInterleaving(useMaskedInterleavedAccesses(*TTI)); + LoopVectorizationCostModel PredCM(true, SEL, L, PSE, LI, &LVL, *TTI, TLI, DB, + AC, ORE, F, &Hints, PredIAI); + if (SEL != CM_ScalarEpilogueAllowed) + LVP.plan(PredCM, UserVF, UserIC); + // Plan how to best vectorize, return the best VF and its cost. - std::optional MaybeVF = LVP.plan(UserVF, UserIC); + // Use the cost model. + std::optional MaybeVF = LVP.selectVectorizationFactor(); VectorizationFactor VF = VectorizationFactor::Disabled(); unsigned IC = 1; @@ -10253,7 +10273,8 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = VF.FoldTailByMasking ? PredCM.selectInterleaveCount(VF.Width, VF.Cost) + : BaseCM.selectInterleaveCount(VF.Width, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if @@ -10370,10 +10391,9 @@ assert(IC > 1 && "interleave count should not be 1 or 0"); // If we decided that it is not legal to vectorize the loop, then // interleave it. 
+      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking);
       InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL,
-                                 &CM, BFI, PSI, Checks);
-
-      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+                                 BestPlan.getCostModel(), BFI, PSI, Checks);
       LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false);

       ORE->emit([&]() {
@@ -10387,17 +10407,18 @@

       // Consider vectorizing the epilogue too if it's profitable.
       VectorizationFactor EpilogueVF =
-          CM.selectEpilogueVectorizationFactor(VF.Width, LVP);
+          LVP.selectEpilogueVectorizationFactor(VF);
       if (EpilogueVF.Width.isVector()) {
-
         // The first pass vectorizes the main loop and creates a scalar epilogue
         // to be vectorized by executing the plan (potentially with a different
         // factor) again shortly afterwards.
+        // TODO: Predicated remainders
         EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1);
-        EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE,
-                                           EPI, &LVL, &CM, BFI, PSI, Checks);
-        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF);
+        VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF, false);
+        EpilogueVectorizerMainLoop MainILV(
+            L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL,
+            BestMainPlan.getCostModel(), BFI, PSI, Checks);
         LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV,
                         DT, true);
         ++LoopsVectorized;
@@ -10406,11 +10427,11 @@
         // edges from the first pass.
         EPI.MainLoopVF = EPI.EpilogueVF;
         EPI.MainLoopUF = EPI.EpilogueUF;
-        EpilogueVectorizerEpilogueLoop EpilogILV(L, PSE, LI, DT, TLI, TTI, AC,
-                                                 ORE, EPI, &LVL, &CM, BFI, PSI,
-                                                 Checks);
+        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF, false);
+        EpilogueVectorizerEpilogueLoop EpilogILV(
+            L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL,
+            BestEpiPlan.getCostModel(), BFI, PSI, Checks);

-        VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF);
         VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion();
         VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
         Header->setName("vec.epilog.vector.body");
@@ -10457,11 +10478,10 @@
       if (!MainILV.areSafetyChecksAdded())
         DisableRuntimeUnroll = true;
     } else {
+      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width, VF.FoldTailByMasking);
       InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width,
-                             VF.MinProfitableTripCount, IC, &LVL, &CM, BFI,
-                             PSI, Checks);
-
-      VPlan &BestPlan = LVP.getBestPlanFor(VF.Width);
+                             VF.MinProfitableTripCount, IC, &LVL,
+                             BestPlan.getCostModel(), BFI, PSI, Checks);
       LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false);
       ++LoopsVectorized;
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -49,6 +49,7 @@
 class InnerLoopVectorizer;
 class IRBuilderBase;
 class LoopInfo;
+class LoopVectorizationCostModel;
 class PredicatedScalarEvolution;
 class raw_ostream;
 class RecurrenceDescriptor;
@@ -2253,8 +2254,18 @@
   /// Values used outside the plan.
   MapVector<PHINode *, VPLiveOut *> LiveOuts;

+  /// Whether this plan should fold the tail by masking.
+  bool FoldTailByMasking = false;
+
+  /// The cost model used to construct and cost this VPlan.
+  /// TODO: Remove this and the dependencies on the cost model.
+ LoopVectorizationCostModel *Cost; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(bool FoldTailByMasking = false, + LoopVectorizationCostModel *Cost = nullptr, + VPBlockBase *Entry = nullptr) + : Entry(Entry), FoldTailByMasking(FoldTailByMasking), Cost(Cost) { if (Entry) Entry->setPlan(this); } @@ -2299,6 +2310,8 @@ /// longer, because it may be stale. void disableValue2VPValue() { Value2VPValueEnabled = false; } + const SmallSetVector &getVFs() const { return VFs; } + void addVF(ElementCount VF) { VFs.insert(VF); } void setVF(ElementCount VF) { @@ -2416,6 +2429,10 @@ return LiveOuts; } + bool foldTailByMasking() const { return FoldTailByMasking; } + + LoopVectorizationCostModel *getCostModel() const { return Cost; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -794,6 +794,8 @@ std::string Out; raw_string_ostream RSO(Out); RSO << Name << " for "; + if (FoldTailByMasking) + RSO << "Tail Folded "; if (!VFs.empty()) { RSO << "VF={" << VFs[0]; for (ElementCount VF : drop_begin(VFs)) Index: llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/maximize-bandwidth-invalidate.ll @@ -14,7 +14,7 @@ ; COST: LV: Found an estimated cost of 3000000 for VF 4 For instruction: %0 = load ; COST: LV: Found an estimated cost of 3000000 for VF 8 For instruction: %0 = load ; COST: LV: Found an estimated cost of 3000000 for VF 16 For instruction: %0 = load -; COST: LV: Selecting VF: 1. +; COST: LV: Selecting Tail folded VF: 1. 
define i32 @test(ptr nocapture noundef readonly %pInVec, ptr nocapture noundef readonly %pInA1, ptr nocapture noundef readonly %pInA2, ptr nocapture noundef readonly %pInA3, ptr nocapture noundef readonly %pInA4, i32 noundef %numCols) { ; CHECK-LABEL: @test( Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS: VPlan 'Initial VPlan for Tail Folded VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { ; VPLANS-NEXT: Live-in vp<[[TC:%[0-9]+]]> = original trip-count ; VPLANS-EMPTY: ; VPLANS-NEXT: vector.ph: Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding.ll @@ -778,28 +778,40 @@ ; CHECK-LABEL: @simple_memset_trip1024( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; CHECK-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i32 [[VAL]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; CHECK-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] -; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[TMP6]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[INDEX1]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] +; CHECK-NEXT: 
[[TMP11:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[TMP10]], i32 0 +; CHECK-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 4 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP10]], i64 [[TMP14]] +; CHECK-NEXT: store [[BROADCAST_SPLAT3]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 8 +; CHECK-NEXT: [[INDEX_NEXT4]] = add nuw i64 [[INDEX1]], [[TMP17]] +; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT4]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] Index: llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/tail-folding-styles.ll @@ -1,5 +1,4 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=none < %s | FileCheck %s --check-prefix=NONE ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data < %s | FileCheck %s --check-prefix=DATA ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-without-lane-mask < %s | FileCheck %s --check-prefix=DATA_NO_LANEMASK ; RUN: opt -S -passes=loop-vectorize -force-tail-folding-style=data-and-control < %s | FileCheck %s --check-prefix=DATA_AND_CONTROL @@ -10,48 +9,6 @@ ; Test the different tail folding styles. 
define void @simple_memset_tailfold(i32 %val, ptr %ptr, i64 %n) "target-features" = "+sve" { -; NONE-LABEL: @simple_memset_tailfold( -; NONE-NEXT: entry: -; NONE-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) -; NONE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; NONE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[UMAX]], [[TMP1]] -; NONE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; NONE: vector.ph: -; NONE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; NONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[UMAX]], [[TMP3]] -; NONE-NEXT: [[N_VEC:%.*]] = sub i64 [[UMAX]], [[N_MOD_VF]] -; NONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[VAL:%.*]], i64 0 -; NONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; NONE-NEXT: br label [[VECTOR_BODY:%.*]] -; NONE: vector.body: -; NONE-NEXT: [[INDEX1:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT2:%.*]], [[VECTOR_BODY]] ] -; NONE-NEXT: [[TMP4:%.*]] = add i64 [[INDEX1]], 0 -; NONE-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]] -; NONE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[TMP5]], i32 0 -; NONE-NEXT: store [[BROADCAST_SPLAT]], ptr [[TMP6]], align 4 -; NONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; NONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 4 -; NONE-NEXT: [[INDEX_NEXT2]] = add nuw i64 [[INDEX1]], [[TMP8]] -; NONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT2]], [[N_VEC]] -; NONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] -; NONE: middle.block: -; NONE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[UMAX]], [[N_VEC]] -; NONE-NEXT: br i1 [[CMP_N]], label [[WHILE_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; NONE: scalar.ph: -; NONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; NONE-NEXT: br label [[WHILE_BODY:%.*]] -; NONE: while.body: -; NONE-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; NONE-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[INDEX]] -; NONE-NEXT: store i32 [[VAL]], ptr [[GEP]], align 4 -; NONE-NEXT: [[INDEX_NEXT]] = add nsw i64 [[INDEX]], 1 -; NONE-NEXT: [[CMP10:%.*]] = icmp ult i64 [[INDEX_NEXT]], [[N]] -; NONE-NEXT: br i1 [[CMP10]], label [[WHILE_BODY]], label [[WHILE_END_LOOPEXIT]], !llvm.loop [[LOOP3:![0-9]+]] -; NONE: while.end.loopexit: -; NONE-NEXT: ret void -; ; DATA-LABEL: @simple_memset_tailfold( ; DATA-NEXT: entry: ; DATA-NEXT: [[UMAX:%.*]] = call i64 @llvm.umax.i64(i64 [[N:%.*]], i64 1) Index: llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-known-trip-count.ll @@ -6,7 +6,7 @@ ; Trip count of 5 - shouldn't be vectorized. 
; CHECK-LABEL: tripcount5 -; CHECK: LV: Selecting VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 define void @tripcount5(ptr nocapture readonly %in, ptr nocapture %out, ptr nocapture readonly %consts, i32 %n) #0 { entry: %arrayidx20 = getelementptr inbounds i32, ptr %out, i32 1 @@ -388,8 +388,8 @@ ; Larger example with predication that should also not be vectorized ; CHECK-LABEL: predicated -; CHECK: LV: Selecting VF: 1 -; CHECK: LV: Selecting VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 +; CHECK: LV: Selecting Tail folded VF: 1 define dso_local i32 @predicated(i32 noundef %0, ptr %glob) #0 { %2 = alloca [101 x i32], align 4 %3 = alloca [21 x i32], align 4 Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -133,7 +133,7 @@ br label %while.body while.body: - %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2049, %while.body.preheader ] + %N.addr.09 = phi i32 [ %dec, %while.body ], [ 2051, %while.body.preheader ] %c.addr.08 = phi i8* [ %incdec.ptr4, %while.body ], [ %c, %while.body.preheader ] %b.addr.07 = phi i8* [ %incdec.ptr1, %while.body ], [ %b, %while.body.preheader ] %a.addr.06 = phi i8* [ %incdec.ptr, %while.body ], [ %a, %while.body.preheader ] Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,4 +1,5 @@ ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" @@ -6,6 +7,7 @@ ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. +; When both are allowed, the VF=8 with tail folding should win out. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -8,9 +8,9 @@ define i32 @foo() { ; CHECK-LABEL: foo -; CHECK-PWR8: Executing best plan with VF=16, UF=4 +; CHECK-PWR8: Executing best plan with TailFold=false, VF=16, UF=4 -; CHECK-PWR9: Executing best plan with VF=8, UF=8 +; CHECK-PWR9: Executing best plan with TailFold=false, VF=8, UF=8 entry: @@ -46,7 +46,7 @@ ; CHECK-LABEL: goo -; CHECK: Executing best plan with VF=16, UF=4 +; CHECK: Executing best plan with TailFold=false, VF=16, UF=4 entry: br label %for.body @@ -79,7 +79,7 @@ define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar -; CHECK: Executing best plan with VF=2, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=2, UF=12 entry: br label %for.body @@ -107,7 +107,7 @@ define void @hoo(i32 %n) { ; CHECK-LABEL: hoo -; CHECK: Executing best plan with VF=1, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=1, UF=12 entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -112,7 +112,7 @@ ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: @@ -244,7 +244,7 @@ ; CHECK-NEXT: LV: Interleaving is not beneficial. 
; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -97,7 +97,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -165,7 +165,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -238,7 +238,7 @@ ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -335,7 +335,7 @@ ; Test case that requires sinking a recipe in a replicate region after another replicate region. 
define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -408,7 +408,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ ; Check for crash exposed by D76992. ; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: Index: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -73,7 +73,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -151,7 +151,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -231,7 +231,7 @@ ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -300,7 +300,7 @@ ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -394,7 +394,7 @@ ; loaded value. 
define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -497,7 +497,7 @@ ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -600,7 +600,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -699,7 +699,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -760,7 +760,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY: @@ -970,7 +970,7 @@ ; need to be removed before merging. define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count ; CHECK-EMPTY:
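For readers following the new control flow in LoopVectorize.cpp above: the planner is now run once per vectorization style (FoldTailByMasking = false and true), and the winner is picked afterwards by comparing the candidates gathered from both runs. The following standalone C++ sketch is only an illustration of that "plan twice, select once" pattern; the types, cost numbers, and per-lane tie-breaking policy below are hypothetical and are not the LLVM API, which compares InstructionCosts produced by the two cost models inside selectVectorizationFactor().

// Standalone sketch (hypothetical types, not the LLVM classes): candidates are
// collected for both FoldTailByMasking settings and the cheapest one per lane
// wins, mirroring how tail-folded and non-tail-folded VPlans are kept side by
// side and compared at selection time.
#include <iostream>
#include <optional>
#include <vector>

struct VFCandidate {
  unsigned Width;          // vectorization factor (number of lanes)
  bool FoldTailByMasking;  // style this candidate was planned with
  double Cost;             // estimated cost of one vector iteration
};

// Pick the candidate with the lowest cost per lane; on a tie the earlier
// (non-tail-folded) candidate is kept.
std::optional<VFCandidate>
selectBestCandidate(const std::vector<VFCandidate> &Candidates) {
  std::optional<VFCandidate> Best;
  for (const VFCandidate &C : Candidates) {
    double PerLane = C.Cost / C.Width;
    if (!Best || PerLane < Best->Cost / Best->Width)
      Best = C;
  }
  return Best;
}

int main() {
  // Candidates as they might come out of the two planning runs: first the
  // non-tail-folded ones, then the tail-folded ones (costs are made up).
  std::vector<VFCandidate> Candidates = {
      {4, false, 12.0}, {8, false, 26.0}, {4, true, 11.0}, {8, true, 20.0}};

  if (auto Best = selectBestCandidate(Candidates))
    std::cout << "best VF=" << Best->Width
              << " TailFold=" << (Best->FoldTailByMasking ? "true" : "false")
              << "\n";
  return 0;
}

With the toy numbers above the tail-folded VF=8 candidate wins, which is the same shape of outcome the updated ARM/tail-folding-reduces-vf.ll test expects when both styles are allowed to compete.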