Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -378,8 +378,9 @@
 
   /// Returns true if vector representation of the instruction \p I
   /// requires mask.
-  bool isMaskRequired(const Instruction *I) const {
-    return MaskedOp.contains(I);
+  bool isMaskRequired(bool FoldTailByMasking, const Instruction *I) const {
+    return MaskedOp.contains(I) ||
+           (FoldTailByMasking && FoldTailMaskedOp.contains(I));
   }
 
   unsigned getNumStores() const { return LAI->getNumStores(); }
@@ -387,8 +388,9 @@
 
   /// Returns all assume calls in predicated blocks. They need to be dropped
   /// when flattening the CFG.
-  const SmallPtrSetImpl<Instruction *> &getConditionalAssumes() const {
-    return ConditionalAssumes;
+  const SmallPtrSetImpl<Instruction *> &
+  getConditionalAssumes(bool FoldTailByMasking) const {
+    return FoldTailByMasking ? FoldTailConditionalAssumes : ConditionalAssumes;
   }
 
 private:
@@ -548,6 +550,11 @@
   /// flattened.
   SmallPtrSet<Instruction *, 8> ConditionalAssumes;
 
+  /// Same as MaskedOp above when folding tail by masking.
+  SmallPtrSet<const Instruction *, 8> FoldTailMaskedOp;
+  /// Same as ConditionalAssumes above when folding tail by masking.
+  SmallPtrSet<Instruction *, 8> FoldTailConditionalAssumes;
+
   /// BFI and PSI are used to check for profile guided size optimizations.
   BlockFrequencyInfo *BFI;
   ProfileSummaryInfo *PSI;
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1410,14 +1410,11 @@
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet<Value *, 8> SafePointers;
 
-  SmallPtrSet<const Instruction *, 8> TmpMaskedOp;
-  SmallPtrSet<Instruction *, 8> TmpConditionalAssumes;
-
   // Check and mark all blocks for predication, including those that ordinarily
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers, TmpMaskedOp,
-                              TmpConditionalAssumes)) {
+    if (!blockCanBePredicated(BB, SafePointers, FoldTailMaskedOp,
+                              FoldTailConditionalAssumes)) {
       LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking as requested.\n");
       return false;
     }
@@ -1425,10 +1422,6 @@
 
   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
 
-  MaskedOp.insert(TmpMaskedOp.begin(), TmpMaskedOp.end());
-  ConditionalAssumes.insert(TmpConditionalAssumes.begin(),
-                            TmpConditionalAssumes.end());
-
   return true;
 }
 
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -184,10 +184,47 @@
 /// VectorizerParams::VectorizationFactor and VectorizationCostTy.
 /// We need to streamline them.
+struct VectorizationScheme {
+  bool FoldTailByMasking;
+  ElementCount Width;
+
+  VectorizationScheme(bool F, ElementCount W)
+      : FoldTailByMasking(F), Width(W) {}
+
+  bool operator==(const VectorizationScheme &rhs) const {
+    return Width == rhs.Width && FoldTailByMasking == rhs.FoldTailByMasking;
+  }
+  bool operator!=(const VectorizationScheme &rhs) const {
+    return !(*this == rhs);
+  }
+};
+template <> struct DenseMapInfo<VectorizationScheme> {
+  static inline VectorizationScheme getEmptyKey() {
+    return {false, ElementCount::getScalable(~0U)};
+  }
+  static inline VectorizationScheme getTombstoneKey() {
+    return {false, ElementCount::getFixed(~0U - 1)};
+  }
+  static unsigned getHashValue(const VectorizationScheme &EltCnt) {
+    unsigned HashVal = EltCnt.Width.getKnownMinValue() * 37U;
+    if (EltCnt.FoldTailByMasking)
+      HashVal -= 2;
+    if (EltCnt.Width.isScalable())
+      return (HashVal - 1U);
+
+    return HashVal;
+  }
+
+  static bool isEqual(const VectorizationScheme &LHS,
+                      const VectorizationScheme &RHS) {
+    return LHS == RHS;
+  }
+};
+
 /// Information about vectorization costs.
 struct VectorizationFactor {
-  /// Vector width with best cost.
-  ElementCount Width;
+  /// FoldTailByMasking and Vector width.
+  VectorizationScheme Scheme;
 
   /// Cost of the loop with that width.
   InstructionCost Cost;
@@ -199,17 +236,20 @@
   /// to runtime checks.
   ElementCount MinProfitableTripCount;
 
-  VectorizationFactor(ElementCount Width, InstructionCost Cost,
+  VectorizationFactor(bool FoldTailByMasking, ElementCount Width,
+                      InstructionCost Cost, InstructionCost ScalarCost)
+      : Scheme(FoldTailByMasking, Width), Cost(Cost), ScalarCost(ScalarCost) {}
+  VectorizationFactor(VectorizationScheme Sc, InstructionCost Cost,
                       InstructionCost ScalarCost)
-      : Width(Width), Cost(Cost), ScalarCost(ScalarCost) {}
+      : Scheme(Sc), Cost(Cost), ScalarCost(ScalarCost) {}
 
   /// Width 1 means no vectorization, cost 0 means uncomputed cost.
   static VectorizationFactor Disabled() {
-    return {ElementCount::getFixed(1), 0, 0};
+    return {false, ElementCount::getFixed(1), 0, 0};
   }
 
   bool operator==(const VectorizationFactor &rhs) const {
-    return Width == rhs.Width && Cost == rhs.Cost;
+    return Scheme == rhs.Scheme && Cost == rhs.Cost;
   }
 
   bool operator!=(const VectorizationFactor &rhs) const {
@@ -304,7 +344,7 @@
   VectorizationFactor planInVPlanNativePath(ElementCount UserVF);
 
   /// Return the best VPlan for \p VF.
-  VPlan &getBestPlanFor(ElementCount VF) const;
+  VPlan &getBestPlanFor(VectorizationScheme VF) const;
 
   /// Generate the IR code for the body of the vectorized loop according to the
   /// best selected \p VF, \p UF and VPlan \p BestPlan.
@@ -321,9 +361,10 @@
 
   /// Look through the existing plans and return true if we have one with all
   /// the vectorization factors in question.
-  bool hasPlanWithVF(ElementCount VF) const {
-    return any_of(VPlans,
-                  [&](const VPlanPtr &Plan) { return Plan->hasVF(VF); });
+  bool hasPlanWithVF(bool FoldTailByMasking, ElementCount VF) const {
+    return any_of(VPlans, [&](const VPlanPtr &Plan) {
+      return Plan->foldTailByMasking() == FoldTailByMasking && Plan->hasVF(VF);
+    });
   }
 
   /// Test a \p Predicate on a \p Range of VF's. Return the value of applying
@@ -351,13 +392,15 @@
 
   /// Build a VPlan using VPRecipes according to the information gather by
   /// Legal. This method is only used for the legacy inner loop vectorizer.
VPlanPtr buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, + bool FoldTailByMasking, VFRange &Range, + SmallPtrSetImpl &DeadInstructions, const MapVector &SinkAfter); /// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive, /// according to the information gathered by Legal when it checked if it is /// legal to vectorize the loop. This method creates VPlans using VPRecipes. - void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF); + void buildVPlansWithVPRecipes(bool FoldTailByMasking, ElementCount MinVF, + ElementCount MaxVF); // Adjust the recipes for reductions. For in-loop reductions the chain of // instructions leading from the loop exit instr to the phi need to be Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -478,7 +478,8 @@ /// latter is the case when vectorizing the epilogue loop. In the case of /// epilogue vectorization, this function is overriden to handle the more /// complex control flow around the loops. - virtual std::pair createVectorizedLoopSkeleton(); + virtual std::pair + createVectorizedLoopSkeleton(VPlan &Plan); /// Fix the vectorized code, taking care of header phi's, live-outs, and more. void fixVectorizedLoop(VPTransformState &State, VPlan &Plan); @@ -541,7 +542,7 @@ /// bypass block and the end value on the edge from bypass to this loop. PHINode *createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &ID, - ArrayRef BypassBlocks, + ArrayRef BypassBlocks, VPlan &Plan, std::pair AdditionalBypass = {nullptr, nullptr}); protected: @@ -589,7 +590,7 @@ Value *getOrCreateTripCount(BasicBlock *InsertBlock); /// Returns (and creates if needed) the trip count of the widened loop. - Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock); + Value *getOrCreateVectorTripCount(BasicBlock *InsertBlock, VPlan &Plan); /// Returns a bitcasted value to the requested vector type. /// Also handles bitcasts of vector <-> vector types. @@ -598,7 +599,7 @@ /// Emit a bypass check to see if the vector trip count is zero, including if /// it overflows. - void emitIterationCountCheck(BasicBlock *Bypass); + void emitIterationCountCheck(BasicBlock *Bypass, VPlan &Plan); /// Emit a bypass check to see if all of the SCEV assumptions we've /// had to make are correct. Returns the block containing the checks or @@ -621,12 +622,13 @@ /// block, the \p AdditionalBypass pair provides information about the bypass /// block and the end value on the edge from bypass to this loop. void createInductionResumeValues( + VPlan &Plan, std::pair AdditionalBypass = {nullptr, nullptr}); /// Complete the loop skeleton by adding debug MDs, creating appropriate /// conditional branches in the middle block, preparing the builder and /// running the verifier. Return the preheader of the completed vector loop. - BasicBlock *completeLoopSkeleton(); + BasicBlock *completeLoopSkeleton(VPlan &Plan); /// Collect poison-generating recipes that may generate a poison value that is /// used after vectorization, even when their operands are not poison. Those @@ -810,15 +812,16 @@ // Override this function to handle the more complex control flow around the // three loops. 
- std::pair createVectorizedLoopSkeleton() final { - return createEpilogueVectorizedLoopSkeleton(); + std::pair + createVectorizedLoopSkeleton(VPlan &Plan) final { + return createEpilogueVectorizedLoopSkeleton(Plan); } /// The interface for creating a vectorized skeleton using one of two /// different strategies, each corresponding to one execution of the vplan /// as described above. virtual std::pair - createEpilogueVectorizedLoopSkeleton() = 0; + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) = 0; /// Holds and updates state information required to vectorize the main loop /// and its epilogue in two separate passes. This setup helps us avoid @@ -846,7 +849,8 @@ EPI, LVL, CM, BFI, PSI, Check) {} /// Implements the interface for creating a vectorized skeleton using the /// *main loop* strategy (ie the first pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) final; protected: /// Emits an iteration count bypass check once for the main loop (when \p @@ -876,7 +880,8 @@ } /// Implements the interface for creating a vectorized skeleton using the /// *epilogue loop* strategy (ie the second pass of vplan execution). - std::pair createEpilogueVectorizedLoopSkeleton() final; + std::pair + createEpilogueVectorizedLoopSkeleton(VPlan &Plan) final; protected: /// Emits an iteration count bypass check after the main vector loop has @@ -1144,15 +1149,19 @@ CM_ScalarEpilogueNotAllowedUsePredicate }; -/// ElementCountComparator creates a total ordering for ElementCount -/// for the purposes of using it in a set structure. -struct ElementCountComparator { - bool operator()(const ElementCount &LHS, const ElementCount &RHS) const { - return std::make_tuple(LHS.isScalable(), LHS.getKnownMinValue()) < - std::make_tuple(RHS.isScalable(), RHS.getKnownMinValue()); +/// VectorizationSchemeComparator creates a total ordering for +/// VectorizationScheme for the purposes of using it in a set structure. +struct VectorizationSchemeComparator { + bool operator()(const VectorizationScheme &LHS, + const VectorizationScheme &RHS) const { + return std::make_tuple(LHS.FoldTailByMasking, LHS.Width.isScalable(), + LHS.Width.getKnownMinValue()) < + std::make_tuple(RHS.FoldTailByMasking, RHS.Width.isScalable(), + RHS.Width.getKnownMinValue()); } }; -using ElementCountSet = SmallSet; +using ElementCountSet = + SmallSet; /// LoopVectorizationCostModel - estimates the expected speedups due to /// vectorization. @@ -1198,10 +1207,11 @@ /// Setup cost-based decisions for user vectorization factor. /// \return true if the UserVF is a feasible VF to be chosen. - bool selectUserVectorizationFactor(ElementCount UserVF) { - collectUniformsAndScalars(UserVF); - collectInstsToScalarize(UserVF); - return expectedCost(UserVF).first.isValid(); + bool selectUserVectorizationFactor(bool FoldTailByMasking, + ElementCount UserVF) { + collectUniformsAndScalars(FoldTailByMasking, UserVF); + collectInstsToScalarize(FoldTailByMasking, UserVF); + return expectedCost({FoldTailByMasking, UserVF}).first.isValid(); } /// \return The size (in bits) of the smallest and widest types in the code @@ -1213,7 +1223,8 @@ /// If interleave count has been specified by metadata it will be returned. /// Otherwise, the interleave count is computed and returned. VF and LoopCost /// are the selected vectorization factor and the cost of the selected VF. 
- unsigned selectInterleaveCount(ElementCount VF, InstructionCost LoopCost); + unsigned selectInterleaveCount(VectorizationScheme VF, + InstructionCost LoopCost); /// Memory access instruction may be vectorized in more than one way. /// Form of instruction after vectorization depends on cost. @@ -1222,7 +1233,7 @@ /// the lists of loop-uniform and loop-scalar instructions. /// The calculated cost is saved with widening decision in order to /// avoid redundant calculations. - void setCostBasedWideningDecision(ElementCount VF); + void setCostBasedWideningDecision(bool FoldTailByMasking, ElementCount VF); /// A struct that represents some properties of the register usage /// of a loop. @@ -1238,7 +1249,7 @@ /// \return Returns information about the register usages of the loop for the /// given vectorization factors. SmallVector - calculateRegisterUsage(ArrayRef VFs); + calculateRegisterUsage(ArrayRef VFs); /// Collect values we want to ignore in the cost model. void collectValuesToIgnore(); @@ -1267,7 +1278,8 @@ /// \returns True if it is more profitable to scalarize instruction \p I for /// vectorization factor \p VF. - bool isProfitableToScalarize(Instruction *I, ElementCount VF) const { + bool isProfitableToScalarize(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { assert(VF.isVector() && "Profitable to scalarize relevant only for VF > 1."); @@ -1276,14 +1288,15 @@ if (EnableVPlanNativePath) return false; - auto Scalars = InstsToScalarize.find(VF); + auto Scalars = InstsToScalarize.find({FoldTailByMasking, VF}); assert(Scalars != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return Scalars->second.find(I) != Scalars->second.end(); } /// Returns true if \p I is known to be uniform after vectorization. - bool isUniformAfterVectorization(Instruction *I, ElementCount VF) const { + bool isUniformAfterVectorization(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { if (VF.isScalar()) return true; @@ -1292,14 +1305,15 @@ if (EnableVPlanNativePath) return false; - auto UniformsPerVF = Uniforms.find(VF); + auto UniformsPerVF = Uniforms.find({FoldTailByMasking, VF}); assert(UniformsPerVF != Uniforms.end() && "VF not yet analyzed for uniformity"); return UniformsPerVF->second.count(I); } /// Returns true if \p I is known to be scalar after vectorization. - bool isScalarAfterVectorization(Instruction *I, ElementCount VF) const { + bool isScalarAfterVectorization(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { if (VF.isScalar()) return true; @@ -1308,7 +1322,7 @@ if (EnableVPlanNativePath) return false; - auto ScalarsPerVF = Scalars.find(VF); + auto ScalarsPerVF = Scalars.find({FoldTailByMasking, VF}); assert(ScalarsPerVF != Scalars.end() && "Scalar values are not calculated for VF"); return ScalarsPerVF->second.count(I); @@ -1316,10 +1330,11 @@ /// \returns True if instruction \p I can be truncated to a smaller bitwidth /// for vectorization factor \p VF. - bool canTruncateToMinimalBitwidth(Instruction *I, ElementCount VF) const { + bool canTruncateToMinimalBitwidth(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { return VF.isVector() && MinBWs.find(I) != MinBWs.end() && - !isProfitableToScalarize(I, VF) && - !isScalarAfterVectorization(I, VF); + !isProfitableToScalarize(I, FoldTailByMasking, VF) && + !isScalarAfterVectorization(I, FoldTailByMasking, VF); } /// Decision that was taken during cost calculation for memory instruction. 
@@ -1334,26 +1349,33 @@ /// Save vectorization decision \p W and \p Cost taken by the cost model for /// instruction \p I and vector width \p VF. - void setWideningDecision(Instruction *I, ElementCount VF, InstWidening W, + void setWideningDecision(Instruction *I, bool FoldTailByMasking, + ElementCount VF, InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, Cost); } /// Save vectorization decision \p W and \p Cost taken by the cost model for /// interleaving group \p Grp and vector width \p VF. void setWideningDecision(const InterleaveGroup *Grp, - ElementCount VF, InstWidening W, - InstructionCost Cost) { + bool FoldTailByMasking, ElementCount VF, + InstWidening W, InstructionCost Cost) { assert(VF.isVector() && "Expected VF >=2"); /// Broadcast this decicion to all instructions inside the group. /// But the cost will be assigned to one instruction only. for (unsigned i = 0; i < Grp->getFactor(); ++i) { if (auto *I = Grp->getMember(i)) { if (Grp->getInsertPos() == I) - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, Cost); else - WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0); + WideningDecisions[std::make_pair( + I, VectorizationScheme(FoldTailByMasking, VF))] = + std::make_pair(W, 0); } } } @@ -1361,14 +1383,16 @@ /// Return the cost model decision for the given instruction \p I and vector /// width \p VF. Return CM_Unknown if this instruction did not pass /// through the cost modeling. - InstWidening getWideningDecision(Instruction *I, ElementCount VF) const { + InstWidening getWideningDecision(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const { assert(VF.isVector() && "Expected VF to be a vector VF"); // Cost model is not run in the VPlan-native path - return conservative // result until this changes. if (EnableVPlanNativePath) return CM_GatherScatter; - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = + std::make_pair(I, VectorizationScheme(FoldTailByMasking, VF)); auto Itr = WideningDecisions.find(InstOnVF); if (Itr == WideningDecisions.end()) return CM_Unknown; @@ -1377,9 +1401,11 @@ /// Return the vectorization cost for the given instruction \p I and vector /// width \p VF. - InstructionCost getWideningCost(Instruction *I, ElementCount VF) { + InstructionCost getWideningCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF) { assert(VF.isVector() && "Expected VF >=2"); - std::pair InstOnVF = std::make_pair(I, VF); + std::pair InstOnVF = + std::make_pair(I, VectorizationScheme(FoldTailByMasking, VF)); assert(WideningDecisions.find(InstOnVF) != WideningDecisions.end() && "The cost is not calculated"); return WideningDecisions[InstOnVF].second; @@ -1413,18 +1439,19 @@ /// Collects the instructions to scalarize for each predicated instruction in /// the loop. - void collectInstsToScalarize(ElementCount VF); + void collectInstsToScalarize(bool FoldTailByMasking, ElementCount VF); - /// Collect Uniform and Scalar values for the given \p VF. - /// The sets depend on CM decision for Load/Store instructions - /// that may be vectorized as interleave, gather-scatter or scalarized. 
- void collectUniformsAndScalars(ElementCount VF) { + /// Collect Uniform and Scalar values for the given \p FoldTailByMasking and + /// \p VF. The sets depend on CM decision for Load/Store instructions that may + /// be vectorized as interleave, gather-scatter or scalarized. + void collectUniformsAndScalars(bool FoldTailByMasking, ElementCount VF) { // Do the analysis once. - if (VF.isScalar() || Uniforms.find(VF) != Uniforms.end()) + if (VF.isScalar() || + Uniforms.find({FoldTailByMasking, VF}) != Uniforms.end()) return; - setCostBasedWideningDecision(VF); - collectLoopUniforms(VF); - collectLoopScalars(VF); + setCostBasedWideningDecision(FoldTailByMasking, VF); + collectLoopUniforms(FoldTailByMasking, VF); + collectLoopScalars(FoldTailByMasking, VF); } /// Returns true if the target machine supports masked store operation @@ -1486,29 +1513,32 @@ /// for which our chosen predication strategy is scalarization (i.e. we /// don't have an alternate strategy such as masking available). /// \p VF is the vectorization factor that will be used to vectorize \p I. - bool isScalarWithPredication(Instruction *I, ElementCount VF) const; + bool isScalarWithPredication(Instruction *I, bool FoldTailByMasking, + ElementCount VF) const; /// Returns true if \p I is an instruction that needs to be predicated /// at runtime. The result is independent of the predication mechanism. /// Superset of instructions that return true for isScalarWithPredication. - bool isPredicatedInst(Instruction *I) const; + bool isPredicatedInst(bool FoldTailByMasking, Instruction *I) const; /// Return the costs for our two available strategies for lowering a /// div/rem operation which requires speculating at least one lane. /// First result is for scalarization (will be invalid for scalable /// vectors); second is for the safe-divisor strategy. std::pair - getDivRemSpeculationCost(Instruction *I, + getDivRemSpeculationCost(Instruction *I, bool FoldTailByMasking, ElementCount VF) const; /// Returns true if \p I is a memory instruction with consecutive memory /// access that can be widened. - bool memoryInstructionCanBeWidened(Instruction *I, ElementCount VF); + bool memoryInstructionCanBeWidened(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Returns true if \p I is a memory instruction in an interleaved-group /// of memory accesses that can be vectorized with wide vector loads/stores /// and shuffles. - bool interleavedAccessCanBeWidened(Instruction *I, ElementCount VF); + bool interleavedAccessCanBeWidened(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Check if \p Instr belongs to any interleaved access group. bool isAccessInterleaved(Instruction *Instr) { @@ -1539,23 +1569,22 @@ return ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; } + bool shouldGenerateNonFoldedVFs() const { + return !MayFoldTailByMasking || + ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate || + ScalarEpilogueStatus == CM_ScalarEpilogueAllowed; + } + /// Returns true if all loop blocks should be masked to fold tail loop. - bool foldTailByMasking() const { return FoldTailByMasking; } + bool mayFoldTailByMasking() const { return MayFoldTailByMasking; } /// Returns true if were tail-folding and want to use the active lane mask /// for vector loop control flow. 
- bool useActiveLaneMaskForControlFlow() const { + bool useActiveLaneMaskForControlFlow(bool FoldTailByMasking) const { return FoldTailByMasking && TTI.emitGetActiveLaneMask() == PredicationStyle::DataAndControlFlow; } - /// Returns true if the instructions in this block requires predication - /// for any reason, e.g. because tail folding now requires a predicate - /// or because the block in the original loop was predicated. - bool blockNeedsPredicationForAnyReason(BasicBlock *BB) const { - return foldTailByMasking() || Legal->blockNeedsPredication(BB); - } - /// A SmallMapVector to store the InLoop reduction op chains, mapping phi /// nodes to the chain of instructions representing the reductions. Uses a /// MapVector to ensure deterministic iteration order. @@ -1582,7 +1611,8 @@ /// if it's needed. The flag NeedToScalarize shows if the call needs to be /// scalarized - /// i.e. either vector version isn't available, or is too expensive. - InstructionCost getVectorCallCost(CallInst *CI, ElementCount VF, + InstructionCost getVectorCallCost(CallInst *CI, bool FoldTailByMasking, + ElementCount VF, bool &NeedToScalarize) const; /// Returns true if the per-lane cost of VectorizationFactor A is lower than @@ -1642,17 +1672,18 @@ /// each instruction that has an Invalid cost for the given VF. using InstructionVFPair = std::pair; VectorizationCostTy - expectedCost(ElementCount VF, + expectedCost(VectorizationScheme VF, SmallVectorImpl *Invalid = nullptr); /// Returns the execution time cost of an instruction for a given vector /// width. Vector width of one means scalar. - VectorizationCostTy getInstructionCost(Instruction *I, ElementCount VF); + VectorizationCostTy getInstructionCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost-computation logic from getInstructionCost which provides /// the vector type as an output parameter. - InstructionCost getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy); + InstructionCost getInstructionCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF, Type *&VectorTy); /// Return the cost of instructions in an inloop reduction pattern, if I is /// part of that pattern. @@ -1661,20 +1692,28 @@ TTI::TargetCostKind CostKind); /// Calculate vectorization cost of memory instruction \p I. - InstructionCost getMemoryInstructionCost(Instruction *I, ElementCount VF); + InstructionCost getMemoryInstructionCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost computation for scalarized memory instruction. - InstructionCost getMemInstScalarizationCost(Instruction *I, ElementCount VF); + InstructionCost getMemInstScalarizationCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost computation for interleaving group of memory instructions. - InstructionCost getInterleaveGroupCost(Instruction *I, ElementCount VF); + InstructionCost getInterleaveGroupCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost computation for Gather/Scatter instruction. - InstructionCost getGatherScatterCost(Instruction *I, ElementCount VF); + InstructionCost getGatherScatterCost(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// The cost computation for widening instruction \p I with consecutive /// memory access. 
- InstructionCost getConsecutiveMemOpCost(Instruction *I, ElementCount VF); + InstructionCost getConsecutiveMemOpCost(Instruction *I, + bool FoldTailByMasking, + ElementCount VF); /// The cost calculation for Load/Store instruction \p I with uniform pointer - /// Load: scalar load + broadcast. @@ -1685,11 +1724,13 @@ /// Estimate the overhead of scalarizing an instruction. This is a /// convenience wrapper for the type-based getScalarizationOverhead API. InstructionCost getScalarizationOverhead(Instruction *I, + bool FoldTailByMasking, ElementCount VF) const; /// Returns true if an artificially high cost for emulated masked memrefs /// should be used. - bool useEmulatedMaskMemRefHack(Instruction *I, ElementCount VF); + bool useEmulatedMaskMemRefHack(Instruction *I, bool FoldTailByMasking, + ElementCount VF); /// Map of scalar integer values to the smallest bitwidth they can be legally /// represented as. The vector equivalents of these values should be truncated @@ -1715,26 +1756,27 @@ /// iterations to execute in the scalar loop. ScalarEpilogueLowering ScalarEpilogueStatus = CM_ScalarEpilogueAllowed; - /// All blocks of loop are to be masked to fold tail of scalar iterations. - bool FoldTailByMasking = false; + /// The loop may be masked so that all blocks of loop are to be masked to fold + /// tail of scalar iterations. + bool MayFoldTailByMasking = false; /// A map holding scalar costs for different vectorization factors. The /// presence of a cost for an instruction in the mapping indicates that the /// instruction will be scalarized when vectorizing with the associated /// vectorization factor. The entries are VF-ScalarCostTy pairs. - DenseMap InstsToScalarize; + DenseMap InstsToScalarize; /// Holds the instructions known to be uniform after vectorization. /// The data is collected per VF. - DenseMap> Uniforms; + DenseMap> Uniforms; /// Holds the instructions known to be scalar after vectorization. /// The data is collected per VF. - DenseMap> Scalars; + DenseMap> Scalars; /// Holds the instructions (address computations) that are forced to be /// scalarized. - DenseMap> ForcedScalars; + DenseMap> ForcedScalars; /// PHINodes of the reductions that should be expanded in-loop along with /// their associated chains of reduction operations, in program order from top @@ -1754,6 +1796,7 @@ /// Currently, only single-use chains are considered for scalarization. InstructionCost computePredInstDiscount(Instruction *PredInst, ScalarCostsTy &ScalarCosts, + bool FoldTailByMasking, ElementCount VF); /// Collect the instructions that are uniform after vectorization. An @@ -1765,7 +1808,7 @@ /// scalarized instruction will be represented by VF scalar values in the /// vectorized loop, each corresponding to an iteration of the original /// scalar loop. - void collectLoopUniforms(ElementCount VF); + void collectLoopUniforms(bool FoldTailByMasking, ElementCount VF); /// Collect the instructions that are scalar after vectorization. An /// instruction is scalar if it is known to be uniform or will be scalarized @@ -1774,18 +1817,18 @@ /// CM_Scalarize. Non-uniform scalarized instructions will be represented by /// VF values in the vectorized loop, each corresponding to an iteration of /// the original scalar loop. - void collectLoopScalars(ElementCount VF); + void collectLoopScalars(bool FoldTailByMasking, ElementCount VF); /// Keeps cost model vectorization decision and cost for instructions. /// Right now it is used for memory instructions only. 
- using DecisionList = DenseMap, + using DecisionList = DenseMap, std::pair>; DecisionList WideningDecisions; /// Returns true if \p V is expected to be vectorized and it needs to be /// extracted. - bool needsExtract(Value *V, ElementCount VF) const { + bool needsExtract(Value *V, bool FoldTailByMasking, ElementCount VF) const { Instruction *I = dyn_cast(V); if (VF.isScalar() || !I || !TheLoop->contains(I) || TheLoop->isLoopInvariant(I)) @@ -1797,15 +1840,18 @@ // the scalars are collected. That should be a safe assumption in most // cases, because we check if the operands have vectorizable types // beforehand in LoopVectorizationLegality. - return Scalars.find(VF) == Scalars.end() || - !isScalarAfterVectorization(I, VF); + return Scalars.find({FoldTailByMasking, VF}) == Scalars.end() || + !isScalarAfterVectorization(I, FoldTailByMasking, VF); }; /// Returns a range containing only operands needing to be extracted. SmallVector filterExtractingOperands(Instruction::op_range Ops, + bool FoldTailByMasking, ElementCount VF) const { - return SmallVector(make_filter_range( - Ops, [this, VF](Value *V) { return this->needsExtract(V, VF); })); + return SmallVector( + make_filter_range(Ops, [this, FoldTailByMasking, VF](Value *V) { + return this->needsExtract(V, FoldTailByMasking, VF); + })); } /// Determines if we have the infrastructure to vectorize loop \p L and its @@ -2877,8 +2923,8 @@ return TripCount; } -Value * -InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock) { +Value *InnerLoopVectorizer::getOrCreateVectorTripCount(BasicBlock *InsertBlock, + VPlan &Plan) { if (VectorTripCount) return VectorTripCount; @@ -2897,7 +2943,7 @@ // exit, with the last early-exit vector comparison also producing all-true. // For scalable vectors the VF is not guaranteed to be a power of 2, but this // is accounted for in emitIterationCountCheck that adds an overflow check. - if (Cost->foldTailByMasking()) { + if (Plan.foldTailByMasking()) { assert(isPowerOf2_32(VF.getKnownMinValue() * UF) && "VF*UF must be a power of 2 when folding tail by masking"); Value *NumLanes = getRuntimeVF(Builder, Ty, VF * UF); @@ -2959,7 +3005,8 @@ return Builder.CreateBitOrPointerCast(CastVal, DstFVTy); } -void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { +void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass, + VPlan &Plan) { Value *Count = getOrCreateTripCount(LoopVectorPreHeader); // Reuse existing vector loop preheader for TC checks. // Note that new preheader block is generated for vector loop. 
@@ -2990,7 +3037,7 @@ Intrinsic::umax, MinProfTC, createStepForVF(Builder, CountTy, VF, UF)); }; - if (!Cost->foldTailByMasking()) + if (!Plan.foldTailByMasking()) CheckMinIters = Builder.CreateICmp(P, Count, CreateStep(), "min.iters.check"); else if (VF.isScalable()) { @@ -3137,9 +3184,10 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue( PHINode *OrigPhi, const InductionDescriptor &II, - ArrayRef BypassBlocks, + ArrayRef BypassBlocks, VPlan &Plan, std::pair AdditionalBypass) { - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + Value *VectorTripCount = + getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); assert(VectorTripCount && "Expected valid arguments"); Instruction *OldInduction = Legal->getPrimaryInduction(); @@ -3195,7 +3243,7 @@ } void InnerLoopVectorizer::createInductionResumeValues( - std::pair AdditionalBypass) { + VPlan &Plan, std::pair AdditionalBypass) { assert(((AdditionalBypass.first && AdditionalBypass.second) || (!AdditionalBypass.first && !AdditionalBypass.second)) && "Inconsistent information about additional bypass."); @@ -3210,15 +3258,16 @@ PHINode *OrigPhi = InductionEntry.first; const InductionDescriptor &II = InductionEntry.second; PHINode *BCResumeVal = createInductionResumeValue( - OrigPhi, II, LoopBypassBlocks, AdditionalBypass); + OrigPhi, II, LoopBypassBlocks, Plan, AdditionalBypass); OrigPhi->setIncomingValueForBlock(LoopScalarPreHeader, BCResumeVal); } } -BasicBlock *InnerLoopVectorizer::completeLoopSkeleton() { +BasicBlock *InnerLoopVectorizer::completeLoopSkeleton(VPlan &Plan) { // The trip counts should be cached by now. Value *Count = getOrCreateTripCount(LoopVectorPreHeader); - Value *VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + Value *VectorTripCount = + getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); auto *ScalarLatchTerm = OrigLoop->getLoopLatch()->getTerminator(); @@ -3230,7 +3279,7 @@ // Thus if tail is to be folded, we know we don't need to run the // remainder and we can use the previous value for the condition (true). // 3) Otherwise, construct a runtime check. - if (!Cost->requiresScalarEpilogue(VF) && !Cost->foldTailByMasking()) { + if (!Cost->requiresScalarEpilogue(VF) && !Plan.foldTailByMasking()) { Instruction *CmpN = CmpInst::Create(Instruction::ICmp, CmpInst::ICMP_EQ, Count, VectorTripCount, "cmp.n", LoopMiddleBlock->getTerminator()); @@ -3251,7 +3300,7 @@ } std::pair -InnerLoopVectorizer::createVectorizedLoopSkeleton() { +InnerLoopVectorizer::createVectorizedLoopSkeleton(VPlan &Plan) { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3293,7 +3342,7 @@ // backedge-taken count is uint##_max: adding one to it will overflow leading // to an incorrect trip count of zero. In this (rare) case we will also jump // to the scalar loop. - emitIterationCountCheck(LoopScalarPreHeader); + emitIterationCountCheck(LoopScalarPreHeader, Plan); // Generate the code to check any assumptions that we've made for SCEV // expressions. @@ -3305,9 +3354,9 @@ emitMemRuntimeChecks(LoopScalarPreHeader); // Emit phis for the new starting index of the scalar loop. - createInductionResumeValues(); + createInductionResumeValues(Plan); - return {completeLoopSkeleton(), nullptr}; + return {completeLoopSkeleton(Plan), nullptr}; } // Fix up external users of the induction variable. 
At this point, we are @@ -3431,9 +3480,9 @@ } } -InstructionCost -LoopVectorizationCostModel::getVectorCallCost(CallInst *CI, ElementCount VF, - bool &NeedToScalarize) const { +InstructionCost LoopVectorizationCostModel::getVectorCallCost( + CallInst *CI, bool FoldTailByMasking, ElementCount VF, + bool &NeedToScalarize) const { Function *F = CI->getCalledFunction(); Type *ScalarRetTy = CI->getType(); SmallVector Tys, ScalarTys; @@ -3456,7 +3505,8 @@ // Compute costs of unpacking argument values for the scalar calls and // packing the return values to a vector. - InstructionCost ScalarizationCost = getScalarizationOverhead(CI, VF); + InstructionCost ScalarizationCost = + getScalarizationOverhead(CI, FoldTailByMasking, VF); InstructionCost Cost = ScalarCallCost * VF.getKnownMinValue() + ScalarizationCost; @@ -3688,10 +3738,11 @@ // Fix-up external users of the induction variables. for (const auto &Entry : Legal->getInductionVars()) - fixupIVUsers(Entry.first, Entry.second, - getOrCreateVectorTripCount(VectorLoop->getLoopPreheader()), - IVEndValues[Entry.first], LoopMiddleBlock, - VectorLoop->getHeader(), Plan); + fixupIVUsers( + Entry.first, Entry.second, + getOrCreateVectorTripCount(VectorLoop->getLoopPreheader(), Plan), + IVEndValues[Entry.first], LoopMiddleBlock, VectorLoop->getHeader(), + Plan); } // Fix LCSSA phis not already fixed earlier. Extracts may need to be generated @@ -3891,7 +3942,7 @@ // a Select choosing between the vectorized LoopExitInst and vectorized Phi, // instead of the former. For an inloop reduction the reduction will already // be predicated, and does not need to be handled here. - if (Cost->foldTailByMasking() && !PhiR->isInLoop()) { + if (State.Plan->foldTailByMasking() && !PhiR->isInLoop()) { for (unsigned Part = 0; Part < UF; ++Part) { Value *VecLoopExitInst = State.get(LoopExitInstDef, Part); SelectInst *Sel = nullptr; @@ -4185,18 +4236,22 @@ return Cost->useOrderedReductions(RdxDesc); } -void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopScalars(bool FoldTailByMasking, + ElementCount VF) { // We should not collect Scalars more than once per VF. Right now, this // function is called from collectUniformsAndScalars(), which already does // this check. Collecting Scalars for VF=1 does not make any sense. - assert(VF.isVector() && Scalars.find(VF) == Scalars.end() && + assert(VF.isVector() && + Scalars.find({FoldTailByMasking, VF}) == Scalars.end() && "This function should not be visited twice for the same VF"); // This avoids any chances of creating a REPLICATE recipe during planning // since that would result in generation of scalarized code during execution, // which is not supported for scalable vectors. if (VF.isScalable()) { - Scalars[VF].insert(Uniforms[VF].begin(), Uniforms[VF].end()); + Scalars[{FoldTailByMasking, VF}].insert( + Uniforms[{FoldTailByMasking, VF}].begin(), + Uniforms[{FoldTailByMasking, VF}].end()); return; } @@ -4213,7 +4268,8 @@ // memory access is not a gather or scatter operation. The value operand of a // store will remain scalar if the store is scalarized. 
auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) { - InstWidening WideningDecision = getWideningDecision(MemAccess, VF); + InstWidening WideningDecision = + getWideningDecision(MemAccess, FoldTailByMasking, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); if (auto *Store = dyn_cast(MemAccess)) @@ -4266,7 +4322,8 @@ // // (1) Add to the worklist all instructions that have been identified as // uniform-after-vectorization. - Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end()); + Worklist.insert(Uniforms[{FoldTailByMasking, VF}].begin(), + Uniforms[{FoldTailByMasking, VF}].end()); // (2) Add to the worklist all bitcast and getelementptr instructions used by // memory accesses requiring a scalar use. The pointer operands of loads and @@ -4291,7 +4348,7 @@ // Insert the forced scalars. // FIXME: Currently VPWidenPHIRecipe() often creates a dead vector // induction variable when the PHI user is scalarized. - auto ForcedScalar = ForcedScalars.find(VF); + auto ForcedScalar = ForcedScalars.find({FoldTailByMasking, VF}); if (ForcedScalar != ForcedScalars.end()) for (auto *I : ForcedScalar->second) { LLVM_DEBUG(dbgs() << "LV: Found (forced) scalar instruction: " << *I << "\n"); @@ -4327,7 +4384,7 @@ // If tail-folding is applied, the primary induction variable will be used // to feed a vector compare. - if (Ind == Legal->getPrimaryInduction() && foldTailByMasking()) + if (Ind == Legal->getPrimaryInduction() && FoldTailByMasking) continue; // Returns true if \p Indvar is a pointer induction that is used directly by @@ -4369,12 +4426,12 @@ << "\n"); } - Scalars[VF].insert(Worklist.begin(), Worklist.end()); + Scalars[{FoldTailByMasking, VF}].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::isScalarWithPredication( - Instruction *I, ElementCount VF) const { - if (!isPredicatedInst(I)) + Instruction *I, bool FoldTailByMasking, ElementCount VF) const { + if (!isPredicatedInst(FoldTailByMasking, I)) return false; // Do we have a non-scalar lowering for this predicated @@ -4402,14 +4459,23 @@ // We have the option to use the safe-divisor idiom to avoid predication. // The cost based decision here will always select safe-divisor for // scalable vectors as scalarization isn't legal. - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + const auto [ScalarCost, SafeDivisorCost] = + getDivRemSpeculationCost(I, FoldTailByMasking, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost); } } } -bool LoopVectorizationCostModel::isPredicatedInst(Instruction *I) const { - if (!blockNeedsPredicationForAnyReason(I->getParent())) +static bool blockNeedsPredicationForAnyReason(bool FoldTailByMasking, + LoopVectorizationLegality *Legal, + BasicBlock *BB) { + return FoldTailByMasking || Legal->blockNeedsPredication(BB); +} + +bool LoopVectorizationCostModel::isPredicatedInst(bool FoldTailByMasking, + Instruction *I) const { + if (!blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, + I->getParent())) return false; // Can we prove this instruction is safe to unconditionally execute? 
@@ -4419,7 +4485,7 @@ return false; case Instruction::Load: case Instruction::Store: { - if (!Legal->isMaskRequired(I)) + if (!Legal->isMaskRequired(FoldTailByMasking, I)) return false; // When we know the load's address is loop invariant and the instruction // in the original scalar loop was unconditionally executed then we @@ -4450,7 +4516,8 @@ std::pair LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I, - ElementCount VF) const { + bool FoldTailByMasking, + ElementCount VF) const { assert(I->getOpcode() == Instruction::UDiv || I->getOpcode() == Instruction::SDiv || I->getOpcode() == Instruction::SRem || @@ -4480,7 +4547,7 @@ // The cost of insertelement and extractelement instructions needed for // scalarization. - ScalarizationCost += getScalarizationOverhead(I, VF); + ScalarizationCost += getScalarizationOverhead(I, FoldTailByMasking, VF); // Scale the cost by the probability of executing the predicated blocks. // This assumes the predicated block for each vector lane is equally @@ -4514,9 +4581,9 @@ } bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, bool FoldTailByMasking, ElementCount VF) { assert(isAccessInterleaved(I) && "Expecting interleaved access."); - assert(getWideningDecision(I, VF) == CM_Unknown && + assert(getWideningDecision(I, FoldTailByMasking, VF) == CM_Unknown && "Decision should not be set yet."); auto *Group = getInterleavedAccessGroup(I); assert(Group && "Must have a group."); @@ -4555,8 +4622,9 @@ // (either a gap at the end of a load-access that may result in a speculative // load, or any gaps in a store-access). bool PredicatedAccessRequiresMasking = - blockNeedsPredicationForAnyReason(I->getParent()) && - Legal->isMaskRequired(I); + blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, + I->getParent()) && + Legal->isMaskRequired(FoldTailByMasking, I); bool LoadAccessWithGapsRequiresEpilogMasking = isa(I) && Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed(); @@ -4583,7 +4651,7 @@ } bool LoopVectorizationCostModel::memoryInstructionCanBeWidened( - Instruction *I, ElementCount VF) { + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // Get and ensure we have a valid memory instruction. assert((isa(I)) && "Invalid memory instruction"); @@ -4596,7 +4664,7 @@ // If the instruction is a store located in a predicated block, it will be // scalarized. - if (isScalarWithPredication(I, VF)) + if (isScalarWithPredication(I, FoldTailByMasking, VF)) return false; // If the instruction's allocated size doesn't equal it's type size, it @@ -4608,18 +4676,20 @@ return true; } -void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { +void LoopVectorizationCostModel::collectLoopUniforms(bool FoldTailByMasking, + ElementCount VF) { // We should not collect Uniforms more than once per VF. Right now, // this function is called from collectUniformsAndScalars(), which // already does this check. Collecting Uniforms for VF=1 does not make any // sense. - assert(VF.isVector() && Uniforms.find(VF) == Uniforms.end() && + assert(VF.isVector() && + Uniforms.find({FoldTailByMasking, VF}) == Uniforms.end() && "This function should not be visited twice for the same VF"); // Visit the list of Uniforms. If we'll not find any uniform value, we'll // not analyze again. Uniforms.count(VF) will return 1. - Uniforms[VF].clear(); + Uniforms[{FoldTailByMasking, VF}].clear(); // We now know that the loop is vectorizable! 
// Collect instructions inside the loop that will remain uniform after @@ -4647,7 +4717,7 @@ << *I << "\n"); return; } - if (isScalarWithPredication(I, VF)) { + if (isScalarWithPredication(I, FoldTailByMasking, VF)) { LLVM_DEBUG(dbgs() << "LV: Found not uniform being ScalarWithPredication: " << *I << "\n"); return; @@ -4677,7 +4747,8 @@ }; auto isUniformDecision = [&](Instruction *I, ElementCount VF) { - InstWidening WideningDecision = getWideningDecision(I, VF); + InstWidening WideningDecision = + getWideningDecision(I, FoldTailByMasking, VF); assert(WideningDecision != CM_Unknown && "Widening decision should be ready at this moment"); @@ -4689,7 +4760,6 @@ WideningDecision == CM_Interleave); }; - // Returns true if Ptr is the pointer operand of a memory access instruction // I, and I is known to not require scalarization. auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool { @@ -4823,7 +4893,7 @@ addToWorklistIfAllowed(IndUpdate); } - Uniforms[VF].insert(Worklist.begin(), Worklist.end()); + Uniforms[{FoldTailByMasking, VF}].insert(Worklist.begin(), Worklist.end()); } bool LoopVectorizationCostModel::runtimeChecksRequired() { @@ -5134,7 +5204,7 @@ // by masking. // FIXME: look for a smaller MaxVF that does divide TC rather than masking. if (Legal->prepareToFoldTailByMasking()) { - FoldTailByMasking = true; + MayFoldTailByMasking = true; return MaxFactors; } @@ -5234,10 +5304,10 @@ // Collect all viable vectorization factors larger than the default MaxVF // (i.e. MaxVectorElementCount). - SmallVector VFs; + SmallVector VFs; for (ElementCount VS = MaxVectorElementCount * 2; ElementCount::isKnownLE(VS, MaxVectorElementCountMaxBW); VS *= 2) - VFs.push_back(VS); + VFs.push_back({FoldTailByMasking, VS}); // For each VF calculate its register usage. auto RUs = calculateRegisterUsage(VFs); @@ -5252,7 +5322,7 @@ Selected = false; } if (Selected) { - MaxVF = VFs[i]; + MaxVF = VFs[i].Width; break; } } @@ -5292,35 +5362,51 @@ unsigned MaxTripCount = PSE.getSE()->getSmallConstantMaxTripCount(TheLoop); - if (!A.Width.isScalable() && !B.Width.isScalable() && FoldTailByMasking && + if (!A.Scheme.Width.isScalable() && !B.Scheme.Width.isScalable() && MaxTripCount) { - // If we are folding the tail and the trip count is a known (possibly small) - // constant, the trip count will be rounded up to an integer number of - // iterations. The total cost will be PerIterationCost*ceil(TripCount/VF), - // which we compare directly. When not folding the tail, the total cost will - // be PerIterationCost*floor(TC/VF) + Scalar remainder cost, and so is - // approximated with the per-lane cost below instead of using the tripcount - // as here. - auto RTCostA = CostA * divideCeil(MaxTripCount, A.Width.getFixedValue()); - auto RTCostB = CostB * divideCeil(MaxTripCount, B.Width.getFixedValue()); + // If the trip count is a known (possibly small) constant, the trip count + // will be rounded up to an integer number of iterations under + // FoldTailByMasking. The total cost in that case will be + // PerIterationCost*ceil(TripCount/VF). When not folding the tail, the total + // cost will be PerIterationCost*floor(TC/VF) + Scalar remainder cost, where + // the scalar cost is approximated using the correct fraction of the vector + // cost. + auto RTCostA = + A.Scheme.Width.getFixedValue() + ? (CostA * divideCeil(MaxTripCount, A.Scheme.Width.getFixedValue())) + : (CostA * MaxTripCount) / A.Scheme.Width.getFixedValue(); + auto RTCostB = + B.Scheme.Width.getFixedValue() + ? 
(CostB * divideCeil(MaxTripCount, B.Scheme.Width.getFixedValue())) + : (CostB * MaxTripCount) / B.Scheme.Width.getFixedValue(); + + if (A.Scheme.FoldTailByMasking && !B.Scheme.FoldTailByMasking) + return RTCostA <= RTCostB; + return RTCostA < RTCostB; } // Improve estimate for the vector width if it is scalable. - unsigned EstimatedWidthA = A.Width.getKnownMinValue(); - unsigned EstimatedWidthB = B.Width.getKnownMinValue(); + unsigned EstimatedWidthA = A.Scheme.Width.getKnownMinValue(); + unsigned EstimatedWidthB = B.Scheme.Width.getKnownMinValue(); if (std::optional VScale = getVScaleForTuning()) { - if (A.Width.isScalable()) + if (A.Scheme.Width.isScalable()) EstimatedWidthA *= *VScale; - if (B.Width.isScalable()) + if (B.Scheme.Width.isScalable()) EstimatedWidthB *= *VScale; } + // If one plan is predicated and the other is not, opt for the predicated + // scheme on a tie. + if (A.Scheme.FoldTailByMasking && !B.Scheme.FoldTailByMasking) + return (CostA * EstimatedWidthB) <= (CostB * EstimatedWidthA); + // Assume vscale may be larger than 1 (or the value being tuned for), // so that scalable vectorization is slightly favorable over fixed-width // vectorization. - if (A.Width.isScalable() && !B.Width.isScalable()) - return (CostA * B.Width.getFixedValue()) <= (CostB * EstimatedWidthA); + if (A.Scheme.Width.isScalable() && !B.Scheme.Width.isScalable()) + return (CostA * B.Scheme.Width.getFixedValue()) <= + (CostB * EstimatedWidthA); // To avoid the need for FP division: // (CostA / A.Width) < (CostB / B.Width) @@ -5330,14 +5416,15 @@ VectorizationFactor LoopVectorizationCostModel::selectVectorizationFactor( const ElementCountSet &VFCandidates) { - InstructionCost ExpectedCost = expectedCost(ElementCount::getFixed(1)).first; + InstructionCost ExpectedCost = + expectedCost({false, ElementCount::getFixed(1)}).first; LLVM_DEBUG(dbgs() << "LV: Scalar loop costs: " << ExpectedCost << ".\n"); assert(ExpectedCost.isValid() && "Unexpected invalid cost for scalar loop"); - assert(VFCandidates.count(ElementCount::getFixed(1)) && + assert(VFCandidates.count({false, ElementCount::getFixed(1)}) && "Expected Scalar VF to be a candidate"); - const VectorizationFactor ScalarCost(ElementCount::getFixed(1), ExpectedCost, - ExpectedCost); + const VectorizationFactor ScalarCost(false, ElementCount::getFixed(1), + ExpectedCost, ExpectedCost); VectorizationFactor ChosenFactor = ScalarCost; bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled; @@ -5351,7 +5438,7 @@ SmallVector InvalidCosts; for (const auto &i : VFCandidates) { // The cost for scalar VF=1 is already calculated, so ignore it. - if (i.isScalar()) + if (i.Width.isScalar()) continue; VectorizationCostTy C = expectedCost(i, &InvalidCosts); @@ -5362,12 +5449,13 @@ if (std::optional VScale = getVScaleForTuning()) AssumedMinimumVscale = *VScale; unsigned Width = - Candidate.Width.isScalable() - ? Candidate.Width.getKnownMinValue() * AssumedMinimumVscale - : Candidate.Width.getFixedValue(); - LLVM_DEBUG(dbgs() << "LV: Vector loop of width " << i + Candidate.Scheme.Width.isScalable() + ? Candidate.Scheme.Width.getKnownMinValue() * AssumedMinimumVscale + : Candidate.Scheme.Width.getFixedValue(); + LLVM_DEBUG(dbgs() << "LV: " << (i.FoldTailByMasking ? 
"Tail folded " : "") + << "Vector loop of width " << i.Width << " costs: " << (Candidate.Cost / Width)); - if (i.isScalable()) + if (i.Width.isScalable()) LLVM_DEBUG(dbgs() << " (assuming a minimum vscale of " << AssumedMinimumVscale << ")"); LLVM_DEBUG(dbgs() << ".\n"); @@ -5375,7 +5463,7 @@ if (!C.second && !ForceVectorization) { LLVM_DEBUG( - dbgs() << "LV: Not considering vector loop of width " << i + dbgs() << "LV: Not considering vector loop of width " << i.Width << " because it will not generate any vector instructions.\n"); continue; } @@ -5403,8 +5491,8 @@ [&Numbering](InstructionVFPair &A, InstructionVFPair &B) { if (Numbering[A.first] != Numbering[B.first]) return Numbering[A.first] < Numbering[B.first]; - ElementCountComparator ECC; - return ECC(A.second, B.second); + VectorizationSchemeComparator ECC; + return ECC({false, A.second}, {false, B.second}); }); // For a list of ordered instruction-vf pairs: @@ -5455,11 +5543,16 @@ ChosenFactor = ScalarCost; } - LLVM_DEBUG(if (ForceVectorization && !ChosenFactor.Width.isScalar() && - !isMoreProfitable(ChosenFactor, ScalarCost)) dbgs() - << "LV: Vectorization seems to be not beneficial, " - << "but was forced by a user.\n"); - LLVM_DEBUG(dbgs() << "LV: Selecting VF: " << ChosenFactor.Width << ".\n"); + LLVM_DEBUG({ + if (ForceVectorization && !ChosenFactor.Scheme.Width.isScalar() && + !isMoreProfitable(ChosenFactor, ScalarCost)) + dbgs() << "LV: Vectorization seems to be not beneficial, " + << "but was forced by a user.\n"; + }); + LLVM_DEBUG( + dbgs() << "LV: Selecting " + << (ChosenFactor.Scheme.FoldTailByMasking ? "Tail folded " : "") + << "VF: " << ChosenFactor.Scheme.Width << ".\n"); return ChosenFactor; } @@ -5544,8 +5637,8 @@ if (EpilogueVectorizationForceVF > 1) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization factor is forced.\n";); ElementCount ForcedEC = ElementCount::getFixed(EpilogueVectorizationForceVF); - if (LVP.hasPlanWithVF(ForcedEC)) - return {ForcedEC, 0, 0}; + if (LVP.hasPlanWithVF(false, ForcedEC)) + return {false, ForcedEC, 0, 0}; else { LLVM_DEBUG( dbgs() @@ -5579,16 +5672,16 @@ } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Width.isScalable() && MainLoopVF.isScalable() && - ElementCount::isKnownLT(NextVF.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Width, MainLoopVF)) && - (Result.Width.isScalar() || isMoreProfitable(NextVF, Result)) && - LVP.hasPlanWithVF(NextVF.Width)) + if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.isScalable() && + ElementCount::isKnownLT(NextVF.Scheme.Width, EstimatedRuntimeVF)) || + ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF)) && + (Result.Scheme.Width.isScalar() || isMoreProfitable(NextVF, Result)) && + LVP.hasPlanWithVF(false, NextVF.Scheme.Width)) Result = NextVF; if (Result != VectorizationFactor::Disabled()) LLVM_DEBUG(dbgs() << "LEV: Vectorizing epilogue loop with VF = " - << Result.Width << "\n";); + << Result.Scheme.Width << "\n";); return Result; } @@ -5668,7 +5761,7 @@ } unsigned -LoopVectorizationCostModel::selectInterleaveCount(ElementCount VF, +LoopVectorizationCostModel::selectInterleaveCount(VectorizationScheme VF, InstructionCost LoopCost) { // -- The interleave heuristics -- // We interleave the loop in order to expose ILP and reduce the loop overhead. @@ -5684,7 +5777,8 @@ // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. 
- if (!isScalarEpilogueAllowed()) + if (!isScalarEpilogueAllowed() || + TheLoop->getHeader()->getParent()->hasOptSize()) return 1; // We used the distance for the interleave count. @@ -5699,7 +5793,8 @@ // because with the above conditions interleaving can expose ILP and break // cross iteration dependences for reductions. if (BestKnownTC && (*BestKnownTC < TinyTripCountInterleaveThreshold) && - !(InterleaveSmallLoopScalarReduction && HasReductions && VF.isScalar())) + !(InterleaveSmallLoopScalarReduction && HasReductions && + VF.Width.isScalar())) return 1; // If we did not calculate the cost for VF (because the user selected the VF) @@ -5738,7 +5833,7 @@ LLVM_DEBUG(dbgs() << "LV: The target has " << TargetNumRegisters << " registers of " << TTI.getRegisterClassName(pair.first) << " register class\n"); - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetNumScalarRegs.getNumOccurrences() > 0) TargetNumRegisters = ForceTargetNumScalarRegs; } else { @@ -5763,10 +5858,10 @@ // Clamp the interleave ranges to reasonable counts. unsigned MaxInterleaveCount = - TTI.getMaxInterleaveFactor(VF.getKnownMinValue()); + TTI.getMaxInterleaveFactor(VF.Width.getKnownMinValue()); // Check if the user has overridden the max. - if (VF.isScalar()) { + if (VF.Width.isScalar()) { if (ForceTargetMaxScalarInterleaveFactor.getNumOccurrences() > 0) MaxInterleaveCount = ForceTargetMaxScalarInterleaveFactor; } else { @@ -5785,8 +5880,8 @@ // the InterleaveCount as if vscale is '1', although if some information about // the vector is known (e.g. min vector size), we can make a better decision. if (BestKnownTC) { - MaxInterleaveCount = - std::min(*BestKnownTC / VF.getKnownMinValue(), MaxInterleaveCount); + MaxInterleaveCount = std::min(*BestKnownTC / VF.Width.getKnownMinValue(), + MaxInterleaveCount); // Make sure MaxInterleaveCount is greater than 0. MaxInterleaveCount = std::max(1u, MaxInterleaveCount); } @@ -5806,7 +5901,7 @@ // Interleave if we vectorized this loop and there is a reduction that could // benefit from interleaving. - if (VF.isVector() && HasReductions) { + if (VF.Width.isVector() && HasReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n"); return IC; } @@ -5816,17 +5911,19 @@ // vectorized the loop we will have done the runtime check and so interleaving // won't require further checks. bool ScalarInterleavingRequiresPredication = - (VF.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { + (VF.Width.isScalar() && any_of(TheLoop->blocks(), [this](BasicBlock *BB) { return Legal->blockNeedsPredication(BB); })); bool ScalarInterleavingRequiresRuntimePointerCheck = - (VF.isScalar() && Legal->getRuntimePointerChecking()->Need); + (VF.Width.isScalar() && Legal->getRuntimePointerChecking()->Need); // We want to interleave small loops in order to reduce the loop overhead and // potentially expose ILP opportunities. LLVM_DEBUG(dbgs() << "LV: Loop cost is " << LoopCost << '\n' << "LV: IC is " << IC << '\n' - << "LV: VF is " << VF << '\n'); + << "LV: VF is " << VF.Width << '\n' + << "LV: Fold Tail is " + << (VF.FoldTailByMasking ? "true" : "false") << '\n'); const bool AggressivelyInterleaveReductions = TTI.enableAggressiveInterleaving(HasReductions); if (!ScalarInterleavingRequiresRuntimePointerCheck && @@ -5892,7 +5989,7 @@ // If there are scalar reductions and TTI has enabled aggressive // interleaving for reductions, we will interleave to expose ILP. 
- if (InterleaveSmallLoopScalarReduction && VF.isScalar() && + if (InterleaveSmallLoopScalarReduction && VF.Width.isScalar() && AggressivelyInterleaveReductions) { LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n"); // Interleave no less than SmallIC but not as aggressive as the normal IC @@ -5916,7 +6013,8 @@ } SmallVector -LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef VFs) { +LoopVectorizationCostModel::calculateRegisterUsage( + ArrayRef VFs) { // This function calculates the register usage by measuring the highest number // of values that are alive at a single location. Obviously, this is a very // rough estimation. We scan the loop in a topological order in order and @@ -6029,7 +6127,7 @@ // there is no previous entry for ClassID. SmallMapVector RegUsage; - if (VFs[j].isScalar()) { + if (VFs[j].Width.isScalar()) { for (auto *Inst : OpenIntervals) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); @@ -6038,12 +6136,13 @@ RegUsage[ClassID] += 1; } } else { - collectUniformsAndScalars(VFs[j]); + collectUniformsAndScalars(VFs[j].FoldTailByMasking, VFs[j].Width); for (auto *Inst : OpenIntervals) { // Skip ignored values for VF > 1. if (VecValuesToIgnore.count(Inst)) continue; - if (isScalarAfterVectorization(Inst, VFs[j])) { + if (isScalarAfterVectorization(Inst, VFs[j].FoldTailByMasking, + VFs[j].Width)) { unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType()); // FIXME: The target might use more than one register for the type @@ -6052,7 +6151,7 @@ } else { unsigned ClassID = TTI.getRegisterClassForType(true, Inst->getType()); - RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j]); + RegUsage[ClassID] += GetRegUsage(Inst->getType(), VFs[j].Width); } } } @@ -6079,15 +6178,16 @@ for (auto *Inst : LoopInvariants) { // FIXME: The target might use more than one register for the type // even in the scalar case. - unsigned Usage = - VFs[i].isScalar() ? 1 : GetRegUsage(Inst->getType(), VFs[i]); + unsigned Usage = VFs[i].Width.isScalar() + ? 1 + : GetRegUsage(Inst->getType(), VFs[i].Width); unsigned ClassID = - TTI.getRegisterClassForType(VFs[i].isVector(), Inst->getType()); + TTI.getRegisterClassForType(VFs[i].Width.isVector(), Inst->getType()); Invariant[ClassID] += Usage; } LLVM_DEBUG({ - dbgs() << "LV(REG): VF = " << VFs[i] << '\n'; + dbgs() << "LV(REG): VF = " << VFs[i].Width << '\n'; dbgs() << "LV(REG): Found max usage: " << MaxUsages[i].size() << " item\n"; for (const auto &pair : MaxUsages[i]) { @@ -6112,8 +6212,8 @@ return RUs; } -bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack(Instruction *I, - ElementCount VF) { +bool LoopVectorizationCostModel::useEmulatedMaskMemRefHack( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // TODO: Cost model for emulated masked load/store is completely // broken. This hack guides the cost model to use an artificially // high enough value to practically disable vectorization with such @@ -6122,26 +6222,27 @@ // from moving "masked load/store" check from legality to cost model. // Masked Load/Gather emulation was previously never allowed. // Limited number of Masked Store/Scatter emulation was allowed. 
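calculateRegisterUsage is only re-keyed by the patch, but the estimate it produces is easiest to see on a toy example: the highest number of values that are simultaneously live. The sketch below is illustrative only and ignores register classes, uniforms, and per-scheme scalarization:

#include <algorithm>
#include <cstdio>
#include <utility>
#include <vector>

// Each value is represented by a (definition, last use) position in program
// order; the estimate is the maximum number of ranges open at any definition.
static unsigned
maxLiveValues(const std::vector<std::pair<unsigned, unsigned>> &LiveRanges) {
  unsigned Max = 0;
  for (const auto &R : LiveRanges) {
    unsigned OpenAtDef = 0;
    for (const auto &Other : LiveRanges)
      if (Other.first <= R.first && R.first <= Other.second)
        ++OpenAtDef;
    Max = std::max(Max, OpenAtDef);
  }
  return Max;
}

int main() {
  // Values defined at 0, 1, 2 and last used at 3, 2, 5: at most 3 live at once.
  std::printf("%u\n", maxLiveValues({{0, 3}, {1, 2}, {2, 5}}));
}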
- assert((isPredicatedInst(I)) && + assert(isPredicatedInst(FoldTailByMasking, I) && "Expecting a scalar emulated instruction"); return isa(I) || (isa(I) && NumPredStores > NumberOfStoresToPredicate); } -void LoopVectorizationCostModel::collectInstsToScalarize(ElementCount VF) { +void LoopVectorizationCostModel::collectInstsToScalarize(bool FoldTailByMasking, + ElementCount VF) { // If we aren't vectorizing the loop, or if we've already collected the // instructions to scalarize, there's nothing to do. Collection may already // have occurred if we have a user-selected VF and are now computing the // expected cost for interleaving. if (VF.isScalar() || VF.isZero() || - InstsToScalarize.find(VF) != InstsToScalarize.end()) + InstsToScalarize.find({FoldTailByMasking, VF}) != InstsToScalarize.end()) return; // Initialize a mapping for VF in InstsToScalalarize. If we find that it's // not profitable to scalarize any instructions, the presence of VF in the // map will indicate that we've analyzed it already. - ScalarCostsTy &ScalarCostsVF = InstsToScalarize[VF]; + ScalarCostsTy &ScalarCostsVF = InstsToScalarize[{FoldTailByMasking, VF}]; PredicatedBBsAfterVectorization[VF].clear(); @@ -6149,17 +6250,19 @@ // determine if it would be better to not if-convert the blocks they are in. // If so, we also record the instructions to scalarize. for (BasicBlock *BB : TheLoop->blocks()) { - if (!blockNeedsPredicationForAnyReason(BB)) + if (!blockNeedsPredicationForAnyReason(FoldTailByMasking, Legal, BB)) continue; for (Instruction &I : *BB) - if (isScalarWithPredication(&I, VF)) { + if (isScalarWithPredication(&I, FoldTailByMasking, VF)) { ScalarCostsTy ScalarCosts; // Do not apply discount if scalable, because that would lead to // invalid scalarization costs. // Do not apply discount logic if hacked cost is needed // for emulated masked memrefs. - if (!VF.isScalable() && !useEmulatedMaskMemRefHack(&I, VF) && - computePredInstDiscount(&I, ScalarCosts, VF) >= 0) + if (!VF.isScalable() && + !useEmulatedMaskMemRefHack(&I, FoldTailByMasking, VF) && + computePredInstDiscount(&I, ScalarCosts, FoldTailByMasking, VF) >= + 0) ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end()); // Remember that BB will remain after vectorization. PredicatedBBsAfterVectorization[VF].insert(BB); @@ -6168,8 +6271,9 @@ } InstructionCost LoopVectorizationCostModel::computePredInstDiscount( - Instruction *PredInst, ScalarCostsTy &ScalarCosts, ElementCount VF) { - assert(!isUniformAfterVectorization(PredInst, VF) && + Instruction *PredInst, ScalarCostsTy &ScalarCosts, bool FoldTailByMasking, + ElementCount VF) { + assert(!isUniformAfterVectorization(PredInst, FoldTailByMasking, VF) && "Instruction marked uniform-after-vectorization will be predicated"); // Initialize the discount to zero, meaning that the scalar version and the @@ -6189,12 +6293,12 @@ // already be scalar to avoid traversing chains that are unlikely to be // beneficial. if (!I->hasOneUse() || PredInst->getParent() != I->getParent() || - isScalarAfterVectorization(I, VF)) + isScalarAfterVectorization(I, FoldTailByMasking, VF)) return false; // If the instruction is scalar with predication, it will be analyzed // separately. We ignore it within the context of PredInst. - if (isScalarWithPredication(I, VF)) + if (isScalarWithPredication(I, FoldTailByMasking, VF)) return false; // If any of the instruction's operands are uniform after vectorization, @@ -6209,7 +6313,7 @@ // the lane zero values for uniforms rather than asserting. 
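Because the same VF can now be costed both with and without tail folding, per-VF caches such as InstsToScalarize are keyed by the (FoldTailByMasking, Width) pair. A minimal standalone sketch of that caching pattern, with invented names and std::map standing in for the LLVM containers:

#include <cstdio>
#include <map>
#include <string>
#include <utility>

using SchemeKey = std::pair<bool, unsigned>;       // (FoldTailByMasking, VF)
using ScalarCosts = std::map<std::string, int>;    // instruction -> discount

static std::map<SchemeKey, ScalarCosts> InstsToScalarize;

static void collectInstsToScalarize(bool FoldTailByMasking, unsigned VF) {
  SchemeKey Key{FoldTailByMasking, VF};
  if (VF <= 1 || InstsToScalarize.count(Key))
    return; // Already analyzed for this scheme; the entry's presence says so.
  ScalarCosts &Costs = InstsToScalarize[Key];
  Costs["udiv"] = 3; // stand-in for an instruction worth scalarizing
}

int main() {
  collectInstsToScalarize(false, 4);
  collectInstsToScalarize(true, 4); // analyzed separately from the call above
  std::printf("cached schemes: %zu\n", InstsToScalarize.size());
}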
for (Use &U : I->operands()) if (auto *J = dyn_cast(U.get())) - if (isUniformAfterVectorization(J, VF)) + if (isUniformAfterVectorization(J, FoldTailByMasking, VF)) return false; // Otherwise, we can scalarize the instruction. @@ -6229,7 +6333,8 @@ // Compute the cost of the vector instruction. Note that this cost already // includes the scalarization overhead of the predicated instruction. - InstructionCost VectorCost = getInstructionCost(I, VF).first; + InstructionCost VectorCost = + getInstructionCost(I, FoldTailByMasking, VF).first; // Compute the cost of the scalarized instruction. This cost is the cost of // the instruction as if it wasn't if-converted and instead remained in the @@ -6237,11 +6342,13 @@ // computing the scalarization overhead. InstructionCost ScalarCost = VF.getFixedValue() * - getInstructionCost(I, ElementCount::getFixed(1)).first; + getInstructionCost(I, FoldTailByMasking, ElementCount::getFixed(1)) + .first; // Compute the scalarization overhead of needed insertelement instructions // and phi nodes. - if (isScalarWithPredication(I, VF) && !I->getType()->isVoidTy()) { + if (isScalarWithPredication(I, FoldTailByMasking, VF) && + !I->getType()->isVoidTy()) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(I->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), true, false); @@ -6260,7 +6367,7 @@ "Instruction has non-scalar type"); if (canBeScalarized(J)) Worklist.push_back(J); - else if (needsExtract(J, VF)) { + else if (needsExtract(J, FoldTailByMasking, VF)) { ScalarCost += TTI.getScalarizationOverhead( cast(ToVectorTy(J->getType(), VF)), APInt::getAllOnes(VF.getFixedValue()), false, true); @@ -6281,7 +6388,7 @@ LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::expectedCost( - ElementCount VF, SmallVectorImpl *Invalid) { + VectorizationScheme VF, SmallVectorImpl *Invalid) { VectorizationCostTy Cost; // For each block. @@ -6292,10 +6399,11 @@ for (Instruction &I : BB->instructionsWithoutDebug()) { // Skip ignored values. if (ValuesToIgnore.count(&I) || - (VF.isVector() && VecValuesToIgnore.count(&I))) + (VF.Width.isVector() && VecValuesToIgnore.count(&I))) continue; - VectorizationCostTy C = getInstructionCost(&I, VF); + VectorizationCostTy C = + getInstructionCost(&I, VF.FoldTailByMasking, VF.Width); // Check if we should override the cost. if (C.first.isValid() && @@ -6304,12 +6412,12 @@ // Keep a list of instructions with invalid costs. if (Invalid && !C.first.isValid()) - Invalid->emplace_back(&I, VF); + Invalid->emplace_back(&I, VF.Width); BlockCost.first += C.first; BlockCost.second |= C.second; LLVM_DEBUG(dbgs() << "LV: Found an estimated cost of " << C.first - << " for VF " << VF << " For instruction: " << I + << " for VF " << VF.Width << " For instruction: " << I << '\n'); } @@ -6320,7 +6428,7 @@ // the predicated block, if it is an if-else block. Thus, scale the block's // cost by the probability of executing it. blockNeedsPredication from // Legal is used so as to not include all blocks in tail folded loops. 
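The discount computed here boils down to comparing the always-executed vector form against a scalarized form whose cost is scaled by the probability of the predicated block actually running. A worked example with made-up costs:

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const int ScalarInstrCost = 4;      // cost of one scalar copy
  const int VectorCost = 10;          // cost of the widened, predicated form
  const int ReciprocalBlockProb = 2;  // predicated block runs ~1 in 2 times

  int ScalarCost = VF * ScalarInstrCost;   // 16: VF scalar copies
  ScalarCost /= ReciprocalBlockProb;       // 8: scaled by block probability
  int Discount = VectorCost - ScalarCost;  // 2: positive, so scalarizing wins
  std::printf("discount = %d\n", Discount);
}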
- if (VF.isScalar() && Legal->blockNeedsPredication(BB)) + if (VF.Width.isScalar() && Legal->blockNeedsPredication(BB)) BlockCost.first /= getReciprocalPredBlockProb(); Cost.first += BlockCost.first; @@ -6365,9 +6473,8 @@ Legal->hasStride(I->getOperand(1)); } -InstructionCost -LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getMemInstScalarizationCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { assert(VF.isVector() && "Scalarization cost of instruction implies vectorization."); if (VF.isScalable()) @@ -6399,12 +6506,12 @@ // Get the overhead of the extractelement and insertelement instructions // we might create due to scalarization. - Cost += getScalarizationOverhead(I, VF); + Cost += getScalarizationOverhead(I, FoldTailByMasking, VF); // If we have a predicated load/store, it will need extra i1 extracts and // conditional branches, but may not be executed for each vector lane. Scale // the cost by the probability of executing the predicated block. - if (isPredicatedInst(I)) { + if (isPredicatedInst(FoldTailByMasking, I)) { Cost /= getReciprocalPredBlockProb(); // Add the cost of an i1 extract and a branch @@ -6415,7 +6522,7 @@ /*Insert=*/false, /*Extract=*/true); Cost += TTI.getCFInstrCost(Instruction::Br, TTI::TCK_RecipThroughput); - if (useEmulatedMaskMemRefHack(I, VF)) + if (useEmulatedMaskMemRefHack(I, FoldTailByMasking, VF)) // Artificially setting to a high enough value to practically disable // vectorization with such operations. Cost = 3000000; @@ -6424,9 +6531,8 @@ return Cost; } -InstructionCost -LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getConsecutiveMemOpCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); Value *Ptr = getLoadStorePointerOperand(I); @@ -6438,7 +6544,7 @@ "Stride should be 1 or -1 for consecutive memory access"); const Align Alignment = getLoadStoreAlignment(I); InstructionCost Cost = 0; - if (Legal->isMaskRequired(I)) { + if (Legal->isMaskRequired(FoldTailByMasking, I)) { Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, CostKind); } else { @@ -6482,9 +6588,8 @@ VF.getKnownMinValue() - 1)); } -InstructionCost -LoopVectorizationCostModel::getGatherScatterCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getGatherScatterCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { Type *ValTy = getLoadStoreType(I); auto *VectorTy = cast(ToVectorTy(ValTy, VF)); const Align Alignment = getLoadStoreAlignment(I); @@ -6492,13 +6597,13 @@ return TTI.getAddressComputationCost(VectorTy) + TTI.getGatherScatterOpCost( - I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment, + I->getOpcode(), VectorTy, Ptr, + Legal->isMaskRequired(FoldTailByMasking, I), Alignment, TargetTransformInfo::TCK_RecipThroughput, I); } -InstructionCost -LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getInterleaveGroupCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // TODO: Once we have support for interleaving with scalable vectors // we can calculate the cost properly here. 
if (VF.isScalable()) @@ -6527,11 +6632,12 @@ (isa(I) && (Group->getNumMembers() < Group->getFactor())); InstructionCost Cost = TTI.getInterleavedMemoryOpCost( I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(), - AS, CostKind, Legal->isMaskRequired(I), UseMaskForGaps); + AS, CostKind, Legal->isMaskRequired(FoldTailByMasking, I), + UseMaskForGaps); if (Group->isReverse()) { // TODO: Add support for reversed masked interleaved access. - assert(!Legal->isMaskRequired(I) && + assert(!Legal->isMaskRequired(FoldTailByMasking, I) && "Reverse masked interleaved access not supported."); Cost += Group->getNumMembers() * TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, @@ -6713,9 +6819,8 @@ return I == RetI ? std::optional(BaseCost) : std::nullopt; } -InstructionCost -LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I, - ElementCount VF) { +InstructionCost LoopVectorizationCostModel::getMemoryInstructionCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF) { // Calculate scalar cost only. Vectorization cost should be ready at this // moment. if (VF.isScalar()) { @@ -6728,33 +6833,36 @@ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, TTI::TCK_RecipThroughput, OpInfo, I); } - return getWideningCost(I, VF); + return getWideningCost(I, FoldTailByMasking, VF); } LoopVectorizationCostModel::VectorizationCostTy LoopVectorizationCostModel::getInstructionCost(Instruction *I, + bool FoldTailByMasking, ElementCount VF) { // If we know that this instruction will remain uniform, check the cost of // the scalar version. - if (isUniformAfterVectorization(I, VF)) + if (isUniformAfterVectorization(I, FoldTailByMasking, VF)) VF = ElementCount::getFixed(1); - if (VF.isVector() && isProfitableToScalarize(I, VF)) - return VectorizationCostTy(InstsToScalarize[VF][I], false); + if (VF.isVector() && isProfitableToScalarize(I, FoldTailByMasking, VF)) + return VectorizationCostTy(InstsToScalarize[{FoldTailByMasking, VF}][I], + false); // Forced scalars do not have any scalarization overhead. - auto ForcedScalar = ForcedScalars.find(VF); + auto ForcedScalar = ForcedScalars.find({FoldTailByMasking, VF}); if (VF.isVector() && ForcedScalar != ForcedScalars.end()) { auto InstSet = ForcedScalar->second; if (InstSet.count(I)) return VectorizationCostTy( - (getInstructionCost(I, ElementCount::getFixed(1)).first * + (getInstructionCost(I, FoldTailByMasking, ElementCount::getFixed(1)) + .first * VF.getKnownMinValue()), false); } Type *VectorTy; - InstructionCost C = getInstructionCost(I, VF, VectorTy); + InstructionCost C = getInstructionCost(I, FoldTailByMasking, VF, VectorTy); bool TypeNotScalarized = false; if (VF.isVector() && VectorTy->isVectorTy()) { @@ -6774,9 +6882,8 @@ return VectorizationCostTy(C, TypeNotScalarized); } -InstructionCost -LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I, - ElementCount VF) const { +InstructionCost LoopVectorizationCostModel::getScalarizationOverhead( + Instruction *I, bool FoldTailByMasking, ElementCount VF) const { // There is no mechanism yet to create a scalable scalarization loop, // so this is currently Invalid. @@ -6809,13 +6916,14 @@ // Skip operands that do not require extraction/scalarization and do not incur // any overhead. 
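Memory costing now asks isMaskRequired with the fold-tail flag, so an access that needs no mask in a normal vector body can still be costed as masked when the folded scheme is evaluated. A hedged sketch of that dispatch with invented costs:

#include <cstdio>

// Mirrors the shape of the isMaskRequired change: always-masked accesses stay
// masked, and accesses that only need a mask under tail folding count as
// masked only when the folded scheme is being costed.
static bool maskRequired(bool FoldTailByMasking, bool MaskedAnyway,
                         bool MaskedOnlyWhenFolded) {
  return MaskedAnyway || (FoldTailByMasking && MaskedOnlyWhenFolded);
}

static int consecutiveMemOpCost(bool FoldTailByMasking, bool MaskedAnyway,
                                bool MaskedOnlyWhenFolded) {
  const int PlainCost = 1, MaskedCost = 2; // made-up target costs
  return maskRequired(FoldTailByMasking, MaskedAnyway, MaskedOnlyWhenFolded)
             ? MaskedCost
             : PlainCost;
}

int main() {
  // The same access costed under the plain and the tail-folded scheme.
  std::printf("unfolded: %d, folded: %d\n",
              consecutiveMemOpCost(false, false, true),
              consecutiveMemOpCost(true, false, true));
}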
SmallVector Tys; - for (auto *V : filterExtractingOperands(Ops, VF)) + for (auto *V : filterExtractingOperands(Ops, FoldTailByMasking, VF)) Tys.push_back(MaybeVectorizeType(V->getType(), VF)); return Cost + TTI.getOperandsScalarizationOverhead( - filterExtractingOperands(Ops, VF), Tys); + filterExtractingOperands(Ops, FoldTailByMasking, VF), Tys); } -void LoopVectorizationCostModel::setCostBasedWideningDecision(ElementCount VF) { +void LoopVectorizationCostModel::setCostBasedWideningDecision( + bool FoldTailByMasking, ElementCount VF) { if (VF.isScalar()) return; NumPredStores = 0; @@ -6830,7 +6938,8 @@ // predicated uniform stores. Today they are treated as any other // predicated store (see added test cases in // invariant-store-vectorization.ll). - if (isa(&I) && isScalarWithPredication(&I, VF)) + if (isa(&I) && + isScalarWithPredication(&I, FoldTailByMasking, VF)) NumPredStores++; if (Legal->isUniformMemOp(I)) { @@ -6843,7 +6952,7 @@ // stores. Note that even with tail folding we know that at least // one lane is active (i.e. generalized predication is not possible // here), and the logic below depends on this fact. - if (!foldTailByMasking()) + if (!FoldTailByMasking) return true; // For scalable vectors, a uniform memop load is always @@ -6858,8 +6967,9 @@ }; const InstructionCost GatherScatterCost = - isLegalGatherOrScatter(&I, VF) ? - getGatherScatterCost(&I, VF) : InstructionCost::getInvalid(); + isLegalGatherOrScatter(&I, VF) + ? getGatherScatterCost(&I, FoldTailByMasking, VF) + : InstructionCost::getInvalid(); // Load: Scalar load + broadcast // Store: Scalar store + isLoopInvariantStoreValue ? 0 : extract @@ -6872,22 +6982,25 @@ // costs compare as maximumal large. If both are invalid, we get // scalable invalid which signals a failure and a vectorization abort. if (GatherScatterCost < ScalarizationCost) - setWideningDecision(&I, VF, CM_GatherScatter, GatherScatterCost); + setWideningDecision(&I, FoldTailByMasking, VF, CM_GatherScatter, + GatherScatterCost); else - setWideningDecision(&I, VF, CM_Scalarize, ScalarizationCost); + setWideningDecision(&I, FoldTailByMasking, VF, CM_Scalarize, + ScalarizationCost); continue; } // We assume that widening is the best solution when possible. - if (memoryInstructionCanBeWidened(&I, VF)) { - InstructionCost Cost = getConsecutiveMemOpCost(&I, VF); + if (memoryInstructionCanBeWidened(&I, FoldTailByMasking, VF)) { + InstructionCost Cost = + getConsecutiveMemOpCost(&I, FoldTailByMasking, VF); int ConsecutiveStride = Legal->isConsecutivePtr( getLoadStoreType(&I), getLoadStorePointerOperand(&I)); assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) && "Expected consecutive stride."); InstWidening Decision = ConsecutiveStride == 1 ? CM_Widen : CM_Widen_Reverse; - setWideningDecision(&I, VF, Decision, Cost); + setWideningDecision(&I, FoldTailByMasking, VF, Decision, Cost); continue; } @@ -6899,21 +7012,21 @@ assert(Group && "Fail to get an interleaved access group."); // Make one decision for the whole group. - if (getWideningDecision(&I, VF) != CM_Unknown) + if (getWideningDecision(&I, FoldTailByMasking, VF) != CM_Unknown) continue; NumAccesses = Group->getNumMembers(); - if (interleavedAccessCanBeWidened(&I, VF)) - InterleaveCost = getInterleaveGroupCost(&I, VF); + if (interleavedAccessCanBeWidened(&I, FoldTailByMasking, VF)) + InterleaveCost = getInterleaveGroupCost(&I, FoldTailByMasking, VF); } InstructionCost GatherScatterCost = isLegalGatherOrScatter(&I, VF) - ? getGatherScatterCost(&I, VF) * NumAccesses + ? 
getGatherScatterCost(&I, FoldTailByMasking, VF) * NumAccesses : InstructionCost::getInvalid(); InstructionCost ScalarizationCost = - getMemInstScalarizationCost(&I, VF) * NumAccesses; + getMemInstScalarizationCost(&I, FoldTailByMasking, VF) * NumAccesses; // Choose better solution for the current VF, // write down this decision and use it during vectorization. @@ -6934,9 +7047,9 @@ // receives the same decision. The whole group receives the cost, but // the cost will actually be assigned to one instruction. if (auto Group = getInterleavedAccessGroup(&I)) - setWideningDecision(Group, VF, Decision, Cost); + setWideningDecision(Group, FoldTailByMasking, VF, Decision, Cost); else - setWideningDecision(&I, VF, Decision, Cost); + setWideningDecision(&I, FoldTailByMasking, VF, Decision, Cost); } } @@ -6955,7 +7068,7 @@ Instruction *PtrDef = dyn_cast_or_null(getLoadStorePointerOperand(&I)); if (PtrDef && TheLoop->contains(PtrDef) && - getWideningDecision(&I, VF) != CM_GatherScatter) + getWideningDecision(&I, FoldTailByMasking, VF) != CM_GatherScatter) AddrDefs.insert(PtrDef); } @@ -6977,45 +7090,46 @@ // by cost functions, but since this involves the task of finding out // if the loaded register is involved in an address computation, it is // instead changed here when we know this is the case. - InstWidening Decision = getWideningDecision(I, VF); + InstWidening Decision = getWideningDecision(I, FoldTailByMasking, VF); if (Decision == CM_Widen || Decision == CM_Widen_Reverse) // Scalarize a widened load of address. setWideningDecision( - I, VF, CM_Scalarize, + I, FoldTailByMasking, VF, CM_Scalarize, (VF.getKnownMinValue() * - getMemoryInstructionCost(I, ElementCount::getFixed(1)))); + getMemoryInstructionCost(I, FoldTailByMasking, + ElementCount::getFixed(1)))); else if (auto Group = getInterleavedAccessGroup(I)) { // Scalarize an interleave group of address loads. for (unsigned I = 0; I < Group->getFactor(); ++I) { if (Instruction *Member = Group->getMember(I)) setWideningDecision( - Member, VF, CM_Scalarize, + Member, FoldTailByMasking, VF, CM_Scalarize, (VF.getKnownMinValue() * - getMemoryInstructionCost(Member, ElementCount::getFixed(1)))); + getMemoryInstructionCost(Member, FoldTailByMasking, + ElementCount::getFixed(1)))); } } } else // Make sure I gets scalarized and a cost estimate without // scalarization overhead. 
- ForcedScalars[VF].insert(I); + ForcedScalars[{FoldTailByMasking, VF}].insert(I); } } -InstructionCost -LoopVectorizationCostModel::getInstructionCost(Instruction *I, ElementCount VF, - Type *&VectorTy) { +InstructionCost LoopVectorizationCostModel::getInstructionCost( + Instruction *I, bool FoldTailByMasking, ElementCount VF, Type *&VectorTy) { Type *RetTy = I->getType(); - if (canTruncateToMinimalBitwidth(I, VF)) + if (canTruncateToMinimalBitwidth(I, FoldTailByMasking, VF)) RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]); auto SE = PSE.getSE(); TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto hasSingleCopyAfterVectorization = [this](Instruction *I, - ElementCount VF) -> bool { + auto hasSingleCopyAfterVectorization = + [this](Instruction *I, bool FoldTailByMasking, ElementCount VF) -> bool { if (VF.isScalar()) return true; - auto Scalarized = InstsToScalarize.find(VF); + auto Scalarized = InstsToScalarize.find({FoldTailByMasking, VF}); assert(Scalarized != InstsToScalarize.end() && "VF not yet analyzed for scalarization profitability"); return !Scalarized->second.count(I) && @@ -7026,7 +7140,7 @@ }; (void) hasSingleCopyAfterVectorization; - if (isScalarAfterVectorization(I, VF)) { + if (isScalarAfterVectorization(I, FoldTailByMasking, VF)) { // With the exception of GEPs and PHIs, after scalarization there should // only be one copy of the instruction generated in the loop. This is // because the VF is either 1, or any instructions that need scalarizing @@ -7036,7 +7150,7 @@ I->getOpcode() == Instruction::PHI || (I->getOpcode() == Instruction::BitCast && I->getType()->isPointerTy()) || - hasSingleCopyAfterVectorization(I, VF)); + hasSingleCopyAfterVectorization(I, FoldTailByMasking, VF)); VectorTy = RetTy; } else VectorTy = ToVectorTy(RetTy, VF); @@ -7109,8 +7223,9 @@ case Instruction::SDiv: case Instruction::URem: case Instruction::SRem: - if (VF.isVector() && isPredicatedInst(I)) { - const auto [ScalarCost, SafeDivisorCost] = getDivRemSpeculationCost(I, VF); + if (VF.isVector() && isPredicatedInst(FoldTailByMasking, I)) { + const auto [ScalarCost, SafeDivisorCost] = + getDivRemSpeculationCost(I, FoldTailByMasking, VF); return isDivRemScalarWithPredication(ScalarCost, SafeDivisorCost) ? 
ScalarCost : SafeDivisorCost; } @@ -7194,7 +7309,7 @@ case Instruction::FCmp: { Type *ValTy = I->getOperand(0)->getType(); Instruction *Op0AsInstruction = dyn_cast(I->getOperand(0)); - if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF)) + if (canTruncateToMinimalBitwidth(Op0AsInstruction, FoldTailByMasking, VF)) ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]); VectorTy = ToVectorTy(ValTy, VF); return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, @@ -7205,16 +7320,17 @@ case Instruction::Load: { ElementCount Width = VF; if (Width.isVector()) { - InstWidening Decision = getWideningDecision(I, Width); + InstWidening Decision = getWideningDecision(I, FoldTailByMasking, Width); assert(Decision != CM_Unknown && "CM decision should be taken at this point"); - if (getWideningCost(I, VF) == InstructionCost::getInvalid()) + if (getWideningCost(I, FoldTailByMasking, VF) == + InstructionCost::getInvalid()) return InstructionCost::getInvalid(); if (Decision == CM_Scalarize) Width = ElementCount::getFixed(1); } VectorTy = ToVectorTy(getLoadStoreType(I), Width); - return getMemoryInstructionCost(I, VF); + return getMemoryInstructionCost(I, FoldTailByMasking, VF); } case Instruction::BitCast: if (I->getType()->isPointerTy()) @@ -7239,15 +7355,16 @@ if (VF.isScalar() || !TheLoop->contains(I)) return TTI::CastContextHint::Normal; - switch (getWideningDecision(I, VF)) { + switch (getWideningDecision(I, FoldTailByMasking, VF)) { case LoopVectorizationCostModel::CM_GatherScatter: return TTI::CastContextHint::GatherScatter; case LoopVectorizationCostModel::CM_Interleave: return TTI::CastContextHint::Interleave; case LoopVectorizationCostModel::CM_Scalarize: case LoopVectorizationCostModel::CM_Widen: - return Legal->isMaskRequired(I) ? TTI::CastContextHint::Masked - : TTI::CastContextHint::Normal; + return Legal->isMaskRequired(FoldTailByMasking, I) + ? TTI::CastContextHint::Masked + : TTI::CastContextHint::Normal; case LoopVectorizationCostModel::CM_Widen_Reverse: return TTI::CastContextHint::Reversed; case LoopVectorizationCostModel::CM_Unknown: @@ -7288,7 +7405,7 @@ Type *SrcScalarTy = I->getOperand(0)->getType(); Type *SrcVecTy = VectorTy->isVectorTy() ? ToVectorTy(SrcScalarTy, VF) : SrcScalarTy; - if (canTruncateToMinimalBitwidth(I, VF)) { + if (canTruncateToMinimalBitwidth(I, FoldTailByMasking, VF)) { // This cast is going to be shrunk. This may remove the cast or it might // turn it into slightly different cast. For example, if MinBW == 16, // "zext i8 %1 to i32" becomes "zext i8 %1 to i16". @@ -7314,7 +7431,8 @@ return *RedCost; bool NeedToScalarize; CallInst *CI = cast(I); - InstructionCost CallCost = getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = + getVectorCallCost(CI, FoldTailByMasking, VF, NeedToScalarize); if (getVectorIntrinsicIDForCall(CI, TLI)) { InstructionCost IntrinsicCost = getVectorIntrinsicCost(CI, VF); return std::min(CallCost, IntrinsicCost); @@ -7481,7 +7599,7 @@ if (VPlanBuildStressTest) return VectorizationFactor::Disabled(); - return {VF, 0 /*Cost*/, 0 /* ScalarCost */}; + return {false, VF, 0 /*Cost*/, 0 /* ScalarCost */}; } LLVM_DEBUG( @@ -7498,7 +7616,8 @@ return std::nullopt; // Invalidate interleave groups if all blocks of loop will be predicated. 
- if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) && + if (blockNeedsPredicationForAnyReason(CM.mayFoldTailByMasking(), Legal, + OrigLoop->getHeader()) && !useMaskedInterleavedAccesses(*TTI)) { LLVM_DEBUG( dbgs() @@ -7519,12 +7638,13 @@ "VF needs to be a power of two"); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (CM.selectUserVectorizationFactor(UserVF)) { + // TODOD: Should this compare tail folded vs non-tail-folded? + if (CM.selectUserVectorizationFactor(CM.mayFoldTailByMasking(), UserVF)) { LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n"); CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(UserVF, UserVF); + buildVPlansWithVPRecipes(CM.mayFoldTailByMasking(), UserVF, UserVF); LLVM_DEBUG(printPlans(dbgs())); - return {{UserVF, 0, 0}}; + return {{CM.mayFoldTailByMasking(), UserVF, 0, 0}}; } else reportVectorizationInfo("UserVF ignored because of invalid costs.", "InvalidCost", ORE, OrigLoop); @@ -7532,26 +7652,48 @@ // Populate the set of Vectorization Factor Candidates. ElementCountSet VFCandidates; - for (auto VF = ElementCount::getFixed(1); - ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) - VFCandidates.insert(VF); - for (auto VF = ElementCount::getScalable(1); - ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) - VFCandidates.insert(VF); + VFCandidates.insert({false, ElementCount::getFixed(1)}); + if (CM.shouldGenerateNonFoldedVFs()) { + for (auto VF = ElementCount::getFixed(2); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert({false, VF}); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert({false, VF}); + } + if (CM.mayFoldTailByMasking()) { + for (auto VF = ElementCount::getFixed(1); + ElementCount::isKnownLE(VF, MaxFactors.FixedVF); VF *= 2) + VFCandidates.insert({true, VF}); + for (auto VF = ElementCount::getScalable(1); + ElementCount::isKnownLE(VF, MaxFactors.ScalableVF); VF *= 2) + VFCandidates.insert({true, VF}); + } for (const auto &VF : VFCandidates) { // Collect Uniform and Scalar instructions after vectorization with VF. - CM.collectUniformsAndScalars(VF); + CM.collectUniformsAndScalars(VF.FoldTailByMasking, VF.Width); // Collect the instructions (and their associated costs) that will be more // profitable to scalarize. - if (VF.isVector()) - CM.collectInstsToScalarize(VF); + if (VF.Width.isVector()) + CM.collectInstsToScalarize(VF.FoldTailByMasking, VF.Width); } CM.collectInLoopReductions(); - buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF); - buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF); + buildVPlansWithVPRecipes(false, ElementCount::getFixed(1), + CM.shouldGenerateNonFoldedVFs() + ? MaxFactors.FixedVF + : ElementCount::getFixed(1)); + if (CM.shouldGenerateNonFoldedVFs()) + buildVPlansWithVPRecipes(false, ElementCount::getScalable(1), + MaxFactors.ScalableVF); + if (CM.mayFoldTailByMasking()) { + buildVPlansWithVPRecipes(true, ElementCount::getFixed(1), + MaxFactors.FixedVF); + buildVPlansWithVPRecipes(true, ElementCount::getScalable(1), + MaxFactors.ScalableVF); + } LLVM_DEBUG(printPlans(dbgs())); if (!MaxFactors.hasVector()) @@ -7559,18 +7701,22 @@ // Select the optimal vectorization factor. 
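The candidate set above now enumerates (fold, width) pairs rather than bare widths. A standalone sketch of the same enumeration for fixed widths only (the scalable-VF loops follow the same shape); the names are invented:

#include <cstdio>
#include <vector>

struct Scheme { bool FoldTailByMasking; unsigned Width; };

static std::vector<Scheme> buildCandidates(unsigned MaxFixedVF,
                                           bool GenerateNonFolded,
                                           bool MayFoldTail) {
  // The scalar {false, 1} baseline is always a candidate.
  std::vector<Scheme> Cands{{false, 1}};
  if (GenerateNonFolded)
    for (unsigned VF = 2; VF <= MaxFixedVF; VF *= 2)
      Cands.push_back({false, VF});
  if (MayFoldTail)
    for (unsigned VF = 1; VF <= MaxFixedVF; VF *= 2)
      Cands.push_back({true, VF});
  return Cands;
}

int main() {
  for (const Scheme &S : buildCandidates(8, true, true))
    std::printf("%s VF=%u\n", S.FoldTailByMasking ? "folded" : "unfolded",
                S.Width);
}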
VectorizationFactor VF = CM.selectVectorizationFactor(VFCandidates); - assert((VF.Width.isScalar() || VF.ScalarCost > 0) && "when vectorizing, the scalar cost must be non-zero."); + assert((VF.Scheme.Width.isScalar() || VF.ScalarCost > 0) && + "when vectorizing, the scalar cost must be non-zero."); return VF; } -VPlan &LoopVectorizationPlanner::getBestPlanFor(ElementCount VF) const { +VPlan &LoopVectorizationPlanner::getBestPlanFor(VectorizationScheme VF) const { assert(count_if(VPlans, - [VF](const VPlanPtr &Plan) { return Plan->hasVF(VF); }) == - 1 && + [VF](const VPlanPtr &Plan) { + return Plan->hasVF(VF.Width) && + Plan->foldTailByMasking() == VF.FoldTailByMasking; + }) == 1 && "Best VF has not a single VPlan."); for (const VPlanPtr &Plan : VPlans) { - if (Plan->hasVF(VF)) + if (Plan->hasVF(VF.Width) && + Plan->foldTailByMasking() == VF.FoldTailByMasking) return *Plan.get(); } llvm_unreachable("No plan found!"); @@ -7620,8 +7766,9 @@ assert(BestVPlan.hasUF(BestUF) && "Trying to execute plan with unsupported UF"); - LLVM_DEBUG(dbgs() << "Executing best plan with VF=" << BestVF << ", UF=" << BestUF - << '\n'); + LLVM_DEBUG(dbgs() << "Executing best plan with TailFold=" + << (BestVPlan.foldTailByMasking() ? "true" : "false") + << ", VF=" << BestVF << ", UF=" << BestUF << '\n'); // Workaround! Compute the trip count of the original loop and cache it // before we start modifying the CFG. This code has a systemic problem @@ -7642,7 +7789,7 @@ VPTransformState State{BestVF, BestUF, LI, DT, ILV.Builder, &ILV, &BestVPlan}; Value *CanonicalIVStartValue; std::tie(State.CFG.PrevBB, CanonicalIVStartValue) = - ILV.createVectorizedLoopSkeleton(); + ILV.createVectorizedLoopSkeleton(BestVPlan); // Only use noalias metadata when using memory checks guaranteeing no overlap // across all iterations. @@ -7674,7 +7821,7 @@ // 2. Copy and widen instructions from the old loop into the new loop. BestVPlan.prepareToExecute(ILV.getOrCreateTripCount(nullptr), - ILV.getOrCreateVectorTripCount(nullptr), + ILV.getOrCreateVectorTripCount(nullptr, BestVPlan), CanonicalIVStartValue, State, IsEpilogueVectorization); @@ -7730,7 +7877,7 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. std::pair -EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(VPlan &Plan) { createVectorLoopSkeleton(""); // Generate the code to check the minimum iteration count of the vector @@ -7758,14 +7905,14 @@ emitIterationCountCheck(LoopScalarPreHeader, false); // Generate the induction variable. - EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader); + EPI.VectorTripCount = getOrCreateVectorTripCount(LoopVectorPreHeader, Plan); // Skip induction resume value creation here because they will be created in // the second pass for the scalar loop. The induction resume values for the // inductions in the epilogue loop are created before executing the plan for // the epilogue loop. - return {completeLoopSkeleton(), nullptr}; + return {completeLoopSkeleton(Plan), nullptr}; } void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { @@ -7848,7 +7995,8 @@ /// This function is partially responsible for generating the control flow /// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization. 
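getBestPlanFor must now match a plan on both the width and the fold-tail flag, since each width can have up to two plans. A minimal sketch of that lookup with an invented MiniPlan standing in for VPlan:

#include <algorithm>
#include <cassert>
#include <cstdio>
#include <vector>

struct MiniPlan {
  bool FoldTailByMasking;
  std::vector<unsigned> VFs;
  bool hasVF(unsigned VF) const {
    return std::find(VFs.begin(), VFs.end(), VF) != VFs.end();
  }
};

static const MiniPlan &getBestPlanFor(const std::vector<MiniPlan> &Plans,
                                      bool FoldTail, unsigned VF) {
  auto It = std::find_if(Plans.begin(), Plans.end(), [&](const MiniPlan &P) {
    return P.FoldTailByMasking == FoldTail && P.hasVF(VF);
  });
  assert(It != Plans.end() && "No plan found!");
  return *It;
}

int main() {
  std::vector<MiniPlan> Plans = {{false, {4, 8}}, {true, {4, 8}}};
  std::printf("folded? %d\n", getBestPlanFor(Plans, true, 8).FoldTailByMasking);
}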
std::pair -EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { +EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton( + VPlan &Plan) { createVectorLoopSkeleton("vec.epilog."); // Now, compare the remaining count and if there aren't enough iterations to @@ -7946,10 +8094,11 @@ // check, then the resume value for the induction variable comes from // the trip count of the main vector loop, hence passing the AdditionalBypass // argument. - createInductionResumeValues({VecEpilogueIterationCountCheck, + createInductionResumeValues(Plan, + {VecEpilogueIterationCountCheck, EPI.VectorTripCount} /* AdditionalBypass */); - return {completeLoopSkeleton(), EPResumeVal}; + return {completeLoopSkeleton(Plan), EPResumeVal}; } BasicBlock * @@ -8087,10 +8236,11 @@ VPValue *BlockMask = nullptr; if (OrigLoop->getHeader() == BB) { - if (!CM.blockNeedsPredicationForAnyReason(BB)) + if (!blockNeedsPredicationForAnyReason(Plan->foldTailByMasking(), Legal, + BB)) return BlockMaskCache[BB] = BlockMask; // Loop incoming mask is all-one. - assert(CM.foldTailByMasking() && "must fold the tail"); + assert(Plan->foldTailByMasking() && "must fold the tail"); // If we're using the active lane mask for control flow, then we get the // mask from the active lane mask PHI that is cached in the VPlan. @@ -8148,13 +8298,13 @@ auto willWiden = [&](ElementCount VF) -> bool { LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, VF); + CM.getWideningDecision(I, Plan->foldTailByMasking(), VF); assert(Decision != LoopVectorizationCostModel::CM_Unknown && "CM decision should be taken at this point."); if (Decision == LoopVectorizationCostModel::CM_Interleave) return true; - if (CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF)) + if (CM.isScalarAfterVectorization(I, Plan->foldTailByMasking(), VF) || + CM.isProfitableToScalarize(I, Plan->foldTailByMasking(), VF)) return false; return Decision != LoopVectorizationCostModel::CM_Scalarize; }; @@ -8163,13 +8313,13 @@ return nullptr; VPValue *Mask = nullptr; - if (Legal->isMaskRequired(I)) + if (Legal->isMaskRequired(Plan->foldTailByMasking(), I)) Mask = createBlockInMask(I->getParent(), Plan); // Determine if the pointer operand of the access is either consecutive or // reverse consecutive. LoopVectorizationCostModel::InstWidening Decision = - CM.getWideningDecision(I, Range.Start); + CM.getWideningDecision(I, Plan->foldTailByMasking(), Range.Start); bool Reverse = Decision == LoopVectorizationCostModel::CM_Widen_Reverse; bool Consecutive = Reverse || Decision == LoopVectorizationCostModel::CM_Widen; @@ -8191,9 +8341,10 @@ VPlan &Plan, ScalarEvolution &SE, Loop &OrigLoop, VFRange &Range) { // Returns true if an instruction \p I should be scalarized instead of // vectorized for the chosen vectorization factor. 
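The header mask queried by createBlockInMask is what makes tail folding work: in the final, partial vector iteration only the in-bounds lanes stay active. A small illustration of that lane mask at runtime (plain C++, not VPlan recipes):

#include <cstdio>

int main() {
  const unsigned TripCount = 10, VF = 4;
  const unsigned BTC = TripCount - 1; // backedge-taken count
  for (unsigned IV = 0; IV < TripCount; IV += VF) {
    std::printf("iter at %2u: mask =", IV);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %d", IV + Lane <= BTC ? 1 : 0); // lane active while in bounds
    std::printf("\n");
  }
}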
- auto ShouldScalarizeInstruction = [&CM](Instruction *I, ElementCount VF) { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF); + auto ShouldScalarizeInstruction = [&CM, &Plan](Instruction *I, + ElementCount VF) { + return CM.isScalarAfterVectorization(I, Plan.foldTailByMasking(), VF) || + CM.isProfitableToScalarize(I, Plan.foldTailByMasking(), VF); }; bool NeedsScalarIVOnly = LoopVectorizationPlanner::getDecisionAndClampRange( @@ -8235,7 +8386,8 @@ Phi, Operands[0], Step, *II, LoopVectorizationPlanner::getDecisionAndClampRange( [&](ElementCount VF) { - return CM.isScalarAfterVectorization(Phi, VF); + return CM.isScalarAfterVectorization( + Phi, Plan.foldTailByMasking(), VF); }, Range)); } @@ -8317,11 +8469,12 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, ArrayRef Operands, - VFRange &Range) const { + VFRange &Range, + VPlanPtr &Plan) const { bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange( - [this, CI](ElementCount VF) { - return CM.isScalarWithPredication(CI, VF); + [this, CI, &Plan](ElementCount VF) { + return CM.isScalarWithPredication(CI, Plan->foldTailByMasking(), VF); }, Range); @@ -8344,8 +8497,8 @@ bool NeedToScalarize = false; // Is it beneficial to perform intrinsic call compared to lib // call? - InstructionCost CallCost = - CM.getVectorCallCost(CI, VF, NeedToScalarize); + InstructionCost CallCost = CM.getVectorCallCost( + CI, Plan->foldTailByMasking(), VF, NeedToScalarize); InstructionCost IntrinsicCost = CM.getVectorIntrinsicCost(CI, VF); return IntrinsicCost <= CallCost; @@ -8362,7 +8515,8 @@ // The flag shows whether we can use a usual Call for vectorized // version of the instruction. bool NeedToScalarize = false; - CM.getVectorCallCost(CI, VF, NeedToScalarize); + CM.getVectorCallCost(CI, Plan->foldTailByMasking(), VF, + NeedToScalarize); return !NeedToScalarize; }, Range); @@ -8373,15 +8527,16 @@ return nullptr; } -bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const { +bool VPRecipeBuilder::shouldWiden(Instruction *I, bool FoldTailByMasking, + VFRange &Range) const { assert(!isa(I) && !isa(I) && !isa(I) && !isa(I) && "Instruction should have been handled earlier"); // Instruction should be widened, unless it is scalar after vectorization, // scalarization is profitable or it is predicated. - auto WillScalarize = [this, I](ElementCount VF) -> bool { - return CM.isScalarAfterVectorization(I, VF) || - CM.isProfitableToScalarize(I, VF) || - CM.isScalarWithPredication(I, VF); + auto WillScalarize = [this, I, FoldTailByMasking](ElementCount VF) -> bool { + return CM.isScalarAfterVectorization(I, FoldTailByMasking, VF) || + CM.isProfitableToScalarize(I, FoldTailByMasking, VF) || + CM.isScalarWithPredication(I, FoldTailByMasking, VF); }; return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize, Range); @@ -8399,7 +8554,7 @@ case Instruction::URem: { // If not provably safe, use a select to form a safe divisor before widening the // div/rem operation itself. Otherwise fall through to general handling below. 
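shouldWiden and the willWiden-style lambdas keep using the decision-and-clamp-range idiom, now with the fold-tail flag threaded through the predicates. A standalone sketch of the idiom itself, using powers-of-two VFs and an invented predicate:

#include <cstdio>
#include <functional>

struct VFRange { unsigned Start, End; }; // [Start, End), powers of two

// Evaluate the predicate at Range.Start and shrink Range.End so every VF left
// in the range yields the same decision.
static bool
getDecisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                         VFRange &Range) {
  bool Decision = Predicate(Range.Start);
  for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
    if (Predicate(VF) != Decision) {
      Range.End = VF;
      break;
    }
  return Decision;
}

int main() {
  VFRange Range{2, 16};
  // Pretend widening is only profitable below VF 8.
  bool Widen =
      getDecisionAndClampRange([](unsigned VF) { return VF < 8; }, Range);
  std::printf("widen=%d for VFs [%u, %u)\n", Widen, Range.Start, Range.End);
}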
- if (CM.isPredicatedInst(I)) { + if (CM.isPredicatedInst(Plan->foldTailByMasking(), I)) { SmallVector Ops(Operands.begin(), Operands.end()); VPValue *Mask = createBlockInMask(I->getParent(), Plan); VPValue *One = @@ -8462,10 +8617,12 @@ Instruction *I, VFRange &Range, VPBasicBlock *VPBB, VPlanPtr &Plan) { bool IsUniform = LoopVectorizationPlanner::getDecisionAndClampRange( - [&](ElementCount VF) { return CM.isUniformAfterVectorization(I, VF); }, + [&](ElementCount VF) { + return CM.isUniformAfterVectorization(I, Plan->foldTailByMasking(), VF); + }, Range); - bool IsPredicated = CM.isPredicatedInst(I); + bool IsPredicated = CM.isPredicatedInst(Plan->foldTailByMasking(), I); // Even if the instruction is not marked as uniform, there are certain // intrinsic calls that can be effectively treated as such, so we check for @@ -8639,12 +8796,12 @@ return nullptr; if (auto *CI = dyn_cast(Instr)) - return toVPRecipeResult(tryToWidenCall(CI, Operands, Range)); + return toVPRecipeResult(tryToWidenCall(CI, Operands, Range, Plan)); if (isa(Instr) || isa(Instr)) return toVPRecipeResult(tryToWidenMemory(Instr, Operands, Range, Plan)); - if (!shouldWiden(Instr, Range)) + if (!shouldWiden(Instr, Plan->foldTailByMasking(), Range)) return nullptr; if (auto GEP = dyn_cast(Instr)) @@ -8661,7 +8818,8 @@ return toVPRecipeResult(tryToWiden(Instr, Operands, VPBB, Plan)); } -void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF, +void LoopVectorizationPlanner::buildVPlansWithVPRecipes(bool FoldTailByMasking, + ElementCount MinVF, ElementCount MaxVF) { assert(OrigLoop->isInnermost() && "Inner loop expected."); @@ -8670,7 +8828,7 @@ // TODO: We only need to drop assumes in blocks that get flattend. If the // control flow is preserved, we should keep them. SmallPtrSet DeadInstructions; - auto &ConditionalAssumes = Legal->getConditionalAssumes(); + auto &ConditionalAssumes = Legal->getConditionalAssumes(FoldTailByMasking); DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end()); MapVector &SinkAfter = Legal->getSinkAfter(); @@ -8699,8 +8857,8 @@ auto MaxVFPlusOne = MaxVF.getWithIncrement(1); for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFPlusOne);) { VFRange SubRange = {VF, MaxVFPlusOne}; - VPlans.push_back( - buildVPlanWithVPRecipes(SubRange, DeadInstructions, SinkAfter)); + VPlans.push_back(buildVPlanWithVPRecipes(FoldTailByMasking, SubRange, + DeadInstructions, SinkAfter)); VF = SubRange.End; } } @@ -8806,7 +8964,8 @@ } VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes( - VFRange &Range, SmallPtrSetImpl &DeadInstructions, + bool FoldTailByMasking, VFRange &Range, + SmallPtrSetImpl &DeadInstructions, const MapVector &SinkAfter) { SmallPtrSet *, 1> InterleaveGroups; @@ -8847,10 +9006,11 @@ // placeholders for its members' Recipes which we'll be replacing with a // single VPInterleaveRecipe. 
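For a div/rem sitting under a mask, the recipe builder inserts a select that swaps the divisor of inactive lanes for 1 before widening. A worked scalar-per-lane example of why that is safe:

#include <cstdio>

int main() {
  const int A[4] = {8, 9, 10, 11};
  const int D[4] = {2, 0, 5, 0};               // lanes 1 and 3 would divide by 0
  const bool Mask[4] = {true, false, true, false};
  int R[4];
  for (int Lane = 0; Lane < 4; ++Lane) {
    int SafeD = Mask[Lane] ? D[Lane] : 1;      // select(mask, divisor, 1)
    R[Lane] = A[Lane] / SafeD;                 // now unconditionally safe to widen
  }
  for (int Lane = 0; Lane < 4; ++Lane)
    std::printf("%d ", Mask[Lane] ? R[Lane] : 0); // masked-off results are unused
  std::printf("\n");
}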
for (InterleaveGroup *IG : IAI.getInterleaveGroups()) { - auto applyIG = [IG, this](ElementCount VF) -> bool { - return (VF.isVector() && // Query is illegal for VF == 1 - CM.getWideningDecision(IG->getInsertPos(), VF) == - LoopVectorizationCostModel::CM_Interleave); + auto applyIG = [IG, this, &FoldTailByMasking](ElementCount VF) -> bool { + return ( + VF.isVector() && // Query is illegal for VF == 1 + CM.getWideningDecision(IG->getInsertPos(), FoldTailByMasking, VF) == + LoopVectorizationCostModel::CM_Interleave); }; if (!getDecisionAndClampRange(applyIG, Range)) continue; @@ -8869,7 +9029,7 @@ // followed by a region for the vector loop, followed by the middle block. The // skeleton vector loop region contains a header and latch block. VPBasicBlock *Preheader = new VPBasicBlock("vector.ph"); - auto Plan = std::make_unique(Preheader); + auto Plan = std::make_unique(Preheader, FoldTailByMasking); VPBasicBlock *HeaderVPBB = new VPBasicBlock("vector.body"); VPBasicBlock *LatchVPBB = new VPBasicBlock("vector.latch"); @@ -8883,8 +9043,8 @@ getDebugLocFromInstOrOperands(Legal->getPrimaryInduction()); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DLInst ? DLInst->getDebugLoc() : DebugLoc(), - !CM.foldTailByMasking(), - CM.useActiveLaneMaskForControlFlow()); + !Plan->foldTailByMasking(), + CM.useActiveLaneMaskForControlFlow(FoldTailByMasking)); // Scan the body of the loop in a topological order to visit each basic block // after having visited its predecessor basic blocks. @@ -9194,7 +9354,7 @@ Term->eraseFromParent(); addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), DebugLoc(), - true, CM.useActiveLaneMaskForControlFlow()); + true, CM.useActiveLaneMaskForControlFlow(false)); return Plan; } @@ -9247,7 +9407,8 @@ VPValue *VecOp = Plan->getVPValue(R->getOperand(VecOpId)); VPValue *CondOp = nullptr; - if (CM.blockNeedsPredicationForAnyReason(R->getParent())) { + if (blockNeedsPredicationForAnyReason(Plan->foldTailByMasking(), Legal, + R->getParent())) { VPBuilder::InsertPointGuard Guard(Builder); Builder.setInsertPoint(WidenRecipe->getParent(), WidenRecipe->getIterator()); @@ -9292,7 +9453,7 @@ // If tail is folded by masking, introduce selects between the phi // and the live-out instruction of each reduction, at the beginning of the // dedicated latch block. - if (CM.foldTailByMasking()) { + if (Plan->foldTailByMasking()) { Builder.setInsertPoint(LatchVPBB, LatchVPBB->begin()); for (VPRecipeBase &R : Plan->getVectorLoopRegion()->getEntryBasicBlock()->phis()) { @@ -9982,16 +10143,16 @@ if (VPlanBuildStressTest || VectorizationFactor::Disabled() == VF) return false; - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); { GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getParent()->getDataLayout()); - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.Width, 1, LVL, &CM, BFI, PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Scheme.Width, + VF.Scheme.Width, 1, LVL, &CM, BFI, PSI, Checks); LLVM_DEBUG(dbgs() << "Vectorizing outer loop in \"" << L->getHeader()->getParent()->getName() << "\"\n"); - LVP.executePlan(VF.Width, 1, BestPlan, LB, DT, false); + LVP.executePlan(VF.Scheme.Width, 1, BestPlan, LB, DT, false); } // Mark the loop as already vectorized to avoid vectorizing again. @@ -10055,7 +10216,7 @@ // When interleaving only scalar and vector cost will be equal, which in turn // would lead to a divide by 0. Fall back to hard threshold. 
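Each VPlan now records at construction whether it folds the tail, and the flag is also printed as part of the plan's name. A hypothetical miniature of that design, with MiniVPlan as an invented stand-in:

#include <cstdio>
#include <string>
#include <utility>

class MiniVPlan {
  bool FoldTailByMasking;
  std::string VFs;

public:
  MiniVPlan(bool FoldTail, std::string VFs)
      : FoldTailByMasking(FoldTail), VFs(std::move(VFs)) {}
  bool foldTailByMasking() const { return FoldTailByMasking; }
  void printName() const {
    // Mirrors the "Tail Folded VF={...}" naming the debug output uses.
    std::printf("Initial VPlan for %sVF={%s},UF>=1\n",
                FoldTailByMasking ? "Tail Folded " : "", VFs.c_str());
  }
};

int main() {
  MiniVPlan(true, "vscale x 1,vscale x 2,vscale x 4").printName();
}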
- if (VF.Width.isScalar()) { + if (VF.Scheme.Width.isScalar()) { if (CheckCost > VectorizeMemoryCheckThreshold) { LLVM_DEBUG( dbgs() @@ -10098,8 +10259,8 @@ // For now we assume the epilogue cost EpiC = 0 for simplicity. Note that // the computations are performed on doubles, not integers and the result // is rounded up, hence we get an upper estimate of the TC. - unsigned IntVF = VF.Width.getKnownMinValue(); - if (VF.Width.isScalable()) { + unsigned IntVF = VF.Scheme.Width.getKnownMinValue(); + if (VF.Scheme.Width.isScalable()) { unsigned AssumedMinimumVscale = 1; if (VScale) AssumedMinimumVscale = *VScale; @@ -10326,13 +10487,14 @@ if (MaybeVF) { VF = *MaybeVF; // Select the interleave count. - IC = CM.selectInterleaveCount(VF.Width, VF.Cost); + IC = CM.selectInterleaveCount(VF.Scheme, VF.Cost); unsigned SelectedIC = std::max(IC, UserIC); // Optimistically generate runtime checks if they are needed. Drop them if // they turn out to not be profitable. - if (VF.Width.isVector() || SelectedIC > 1) - Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Width, SelectedIC); + if (VF.Scheme.Width.isVector() || SelectedIC > 1) + Checks.Create(L, *LVL.getLAI(), PSE.getPredicate(), VF.Scheme.Width, + SelectedIC); // Check if it is profitable to vectorize with runtime checks. bool ForceVectorization = @@ -10356,7 +10518,7 @@ // Identify the diagnostic messages that should be produced. std::pair VecDiagMsg, IntDiagMsg; bool VectorizeLoop = true, InterleaveLoop = true; - if (VF.Width.isScalar()) { + if (VF.Scheme.Width.isScalar()) { LLVM_DEBUG(dbgs() << "LV: Vectorization is possible but not beneficial.\n"); VecDiagMsg = std::make_pair( "VectorizationNotBeneficial", @@ -10422,7 +10584,7 @@ << VecDiagMsg.second; }); } else if (VectorizeLoop && !InterleaveLoop) { - LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Scheme.Width << ") in " << DebugLocStr << '\n'); ORE->emit([&]() { return OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first, @@ -10430,7 +10592,7 @@ << IntDiagMsg.second; }); } else if (VectorizeLoop && InterleaveLoop) { - LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Width + LLVM_DEBUG(dbgs() << "LV: Found a vectorizable loop (" << VF.Scheme.Width << ") in " << DebugLocStr << '\n'); LLVM_DEBUG(dbgs() << "LV: Interleave Count is " << IC << '\n'); } @@ -10446,8 +10608,8 @@ InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, Unroller, DT, false); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); + LVP.executePlan(VF.Scheme.Width, IC, BestPlan, Unroller, DT, false); ORE->emit([&]() { return OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), @@ -10460,17 +10622,19 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Width, LVP); - if (EpilogueVF.Width.isVector()) { + CM.selectEpilogueVectorizationFactor(VF.Scheme.Width, LVP); + if (EpilogueVF.Scheme.Width.isVector()) { // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- EpilogueLoopVectorizationInfo EPI(VF.Width, IC, EpilogueVF.Width, 1); + // TODOD: Predicated remainders + EpilogueLoopVectorizationInfo EPI(VF.Scheme.Width, IC, + EpilogueVF.Scheme.Width, 1); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestMainPlan = LVP.getBestPlanFor(EPI.MainLoopVF); + VPlan &BestMainPlan = LVP.getBestPlanFor({false, EPI.MainLoopVF}); LVP.executePlan(EPI.MainLoopVF, EPI.MainLoopUF, BestMainPlan, MainILV, DT, true); ++LoopsVectorized; @@ -10483,7 +10647,7 @@ ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestEpiPlan = LVP.getBestPlanFor(EPI.EpilogueVF); + VPlan &BestEpiPlan = LVP.getBestPlanFor({false, EPI.EpilogueVF}); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); @@ -10492,7 +10656,8 @@ // VPWidenPointerInductionRecipe and VPReductionPHIRecipes are updated // before vectorizing the epilogue loop. for (VPRecipeBase &R : Header->phis()) { - if (isa(&R)) + if (isa(&R) || + isa(&R)) continue; Value *ResumeV = nullptr; @@ -10516,7 +10681,7 @@ } ResumeV = MainILV.createInductionResumeValue( - IndPhi, *ID, {EPI.MainLoopIterationCountCheck}); + IndPhi, *ID, {EPI.MainLoopIterationCountCheck}, BestEpiPlan); } assert(ResumeV && "Must have a resume value"); VPValue *StartVal = BestEpiPlan.getOrAddExternalDef(ResumeV); @@ -10530,12 +10695,12 @@ if (!MainILV.areSafetyChecksAdded()) DisableRuntimeUnroll = true; } else { - InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, - VF.MinProfitableTripCount, IC, &LVL, &CM, BFI, - PSI, Checks); + InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, + VF.Scheme.Width, VF.MinProfitableTripCount, IC, + &LVL, &CM, BFI, PSI, Checks); - VPlan &BestPlan = LVP.getBestPlanFor(VF.Width); - LVP.executePlan(VF.Width, IC, BestPlan, LB, DT, false); + VPlan &BestPlan = LVP.getBestPlanFor(VF.Scheme); + LVP.executePlan(VF.Scheme.Width, IC, BestPlan, LB, DT, false); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there @@ -10549,7 +10714,7 @@ return OptimizationRemark(LV_NAME, "Vectorized", L->getStartLoc(), L->getHeader()) << "vectorized loop (vectorization width: " - << NV("VectorizationFactor", VF.Width) + << NV("VectorizationFactor", VF.Scheme.Width) << ", interleaved count: " << NV("InterleaveCount", IC) << ")"; }); } Index: llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -64,7 +64,8 @@ /// Check if \p I can be widened at the start of \p Range and possibly /// decrease the range such that the returned value holds for the entire \p /// Range. The function should not be called for memory instructions or calls. - bool shouldWiden(Instruction *I, VFRange &Range) const; + bool shouldWiden(Instruction *I, bool FoldTailByMasking, + VFRange &Range) const; /// Check if the load or store instruction \p I should widened for \p /// Range.Start and potentially masked. Such instructions are handled by a @@ -95,7 +96,7 @@ /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure same /// decision from \p Range.Start to \p Range.End. 
VPWidenCallRecipe *tryToWidenCall(CallInst *CI, ArrayRef Operands, - VFRange &Range) const; + VFRange &Range, VPlanPtr &Plan) const; /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe /// if it can. The function should only be called if the cost-model indicates Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -2262,8 +2262,12 @@ /// Values used outside the plan. MapVector LiveOuts; + /// Whether this plan should foldTailByMasking + bool FoldTailByMasking = false; + public: - VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) { + VPlan(VPBlockBase *Entry = nullptr, bool FoldTailByMasking = false) + : Entry(Entry), FoldTailByMasking(FoldTailByMasking) { if (Entry) Entry->setPlan(this); } @@ -2446,6 +2450,8 @@ return LiveOuts; } + bool foldTailByMasking() const { return FoldTailByMasking; } + private: /// Add to the given dominator tree the header block and every new basic block /// that was created between it and the latch block, inclusive. Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -778,6 +778,8 @@ std::string Out; raw_string_ostream RSO(Out); RSO << Name << " for "; + if (FoldTailByMasking) + RSO << "Tail Folded "; if (!VFs.empty()) { RSO << "VF={" << VFs[0]; for (ElementCount VF : drop_begin(VFs)) Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll @@ -9,7 +9,7 @@ target triple = "aarch64-unknown-linux-gnu" ; VPLANS-LABEL: Checking a loop in 'simple_memset' -; VPLANS: VPlan 'Initial VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { +; VPLANS: VPlan 'Initial VPlan for Tail Folded VF={vscale x 1,vscale x 2,vscale x 4},UF>=1' { ; VPLANS-NEXT: vector.ph: ; VPLANS-NEXT: EMIT vp<%2> = VF * Part + ir<0> ; VPLANS-NEXT: EMIT vp<%3> = active lane mask vp<%2> Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,11 +1,13 @@ ; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT -; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED +; RUN: opt < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=DEFAULT target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. +; When both are allowed, the VF=16 without tail folding should win out. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> Index: llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll +++ llvm/test/Transforms/LoopVectorize/PowerPC/reg-usage.ll @@ -8,9 +8,9 @@ define i32 @foo() { ; CHECK-LABEL: foo -; CHECK-PWR8: Executing best plan with VF=16, UF=4 +; CHECK-PWR8: Executing best plan with TailFold=false, VF=16, UF=4 -; CHECK-PWR9: Executing best plan with VF=8, UF=8 +; CHECK-PWR9: Executing best plan with TailFold=false, VF=8, UF=8 entry: @@ -46,7 +46,7 @@ ; CHECK-LABEL: goo -; CHECK: Executing best plan with VF=16, UF=4 +; CHECK: Executing best plan with TailFold=false, VF=16, UF=4 entry: br label %for.body @@ -79,7 +79,7 @@ define i64 @bar(ptr nocapture %a) { ; CHECK-LABEL: bar -; CHECK: Executing best plan with VF=2, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=2, UF=12 entry: br label %for.body @@ -107,7 +107,7 @@ define void @hoo(i32 %n) { ; CHECK-LABEL: hoo -; CHECK: Executing best plan with VF=1, UF=12 +; CHECK: Executing best plan with TailFold=false, VF=1, UF=12 entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll +++ llvm/test/Transforms/LoopVectorize/RISCV/riscv-vector-reverse.ll @@ -108,11 +108,12 @@ ; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 2 ; CHECK-NEXT: LV: VF is vscale x 4 +; CHECK-NEXT: LV: Fold Tail is false ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: @@ -240,11 +241,12 @@ ; CHECK-NEXT: LV: Loop cost is 23 ; CHECK-NEXT: LV: IC is 2 ; CHECK-NEXT: LV: VF is vscale x 4 +; CHECK-NEXT: LV: Fold Tail is false ; CHECK-NEXT: LV: Not Interleaving. ; CHECK-NEXT: LV: Interleaving is not beneficial. ; CHECK-NEXT: LV: Found a vectorizable loop (vscale x 4) in ; CHECK-NEXT: LEV: Epilogue vectorization is not profitable for this loop -; CHECK-NEXT: Executing best plan with VF=vscale x 4, UF=1 +; CHECK-NEXT: Executing best plan with TailFold=false, VF=vscale x 4, UF=1 ; CHECK-NEXT: LV: Interleaving disabled by the pass manager ; entry: Index: llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll +++ llvm/test/Transforms/LoopVectorize/X86/vect.omp.force.small-tc.ll @@ -14,7 +14,9 @@ ; ; -; This loop will be vectorized, although the trip count is below the threshold, but vectorization is explicitly forced in metadata. +; This loop will be vectorized, although the trip count is below the threshold, but +; vectorization is explicitly forced in metadata. A VF of 4 is chosen as it evenly +; divides the trip count of 20, producing a lower total cost.
; define void @vectorized(ptr noalias nocapture %A, ptr noalias nocapture readonly %B) { ; CHECK-LABEL: @vectorized( @@ -27,20 +29,20 @@ ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP2]], align 4, !llvm.access.group [[ACC_GRP0:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP3]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <8 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <8 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] -; CHECK-NEXT: store <8 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 +; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[TMP5:%.*]] = fadd fast <4 x float> [[WIDE_LOAD]], [[WIDE_LOAD1]] +; CHECK-NEXT: store <4 x float> [[TMP5]], ptr [[TMP4]], align 4, !llvm.access.group [[ACC_GRP0]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 20 ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP1:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 16 +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 20, 20 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 20, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -52,7 +54,7 @@ ; CHECK-NEXT: store float [[ADD]], ptr [[ARRAYIDX2]], align 4, !llvm.access.group [[ACC_GRP0]] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 20 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; Index: llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll +++ llvm/test/Transforms/LoopVectorize/first-order-recurrence-sink-replicate-region.ll @@ -8,7 +8,7 @@ define void @sink_replicate_region_1(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_1 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -98,7 +98,7 @@ define void @sink_replicate_region_2(i32 %x, i8 %y, 
ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_2 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -167,7 +167,7 @@ define i32 @sink_replicate_region_3_reduction(i32 %x, i8 %y, ptr %ptr) optsize { ; CHECK-LABEL: sink_replicate_region_3_reduction -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -241,7 +241,7 @@ ; containing %conv at the end, because %conv is the last recipe in the block. define void @sink_replicate_region_4_requires_split_at_end_of_block(i32 %x, ptr %ptr, ptr noalias %dst) optsize { ; CHECK-LABEL: sink_replicate_region_4_requires_split_at_end_of_block -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -339,7 +339,7 @@ ; Test case that requires sinking a recipe in a replicate region after another replicate region. define void @sink_replicate_region_after_replicate_region(ptr %ptr, ptr noalias %dst.2, i32 %x, i8 %y) optsize { ; CHECK-LABEL: sink_replicate_region_after_replicate_region -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -413,7 +413,7 @@ define void @need_new_block_after_sinking_pr56146(i32 %x, ptr %src, ptr noalias %dst) { ; CHECK-LABEL: need_new_block_after_sinking_pr56146 -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count Index: llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll +++ llvm/test/Transforms/LoopVectorize/icmp-uniforms.ll @@ -36,7 +36,7 @@ ; Check for crash exposed by D76992. 
; CHECK-LABEL: 'test' -; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={4},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count Index: llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll +++ llvm/test/Transforms/LoopVectorize/tail-folding-vectorization-factor-1.ll @@ -16,50 +16,27 @@ ; CHECK: vector.ph: ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE9:%.*]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[VEC_IV1:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[VEC_IV2:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IV3:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV1]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV2]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV3]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP4]] +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 2 +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDEX]], 3 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[DST:%.*]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP2]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP3]] +; CHECK-NEXT: store i32 0, ptr [[TMP4]], align 4 ; CHECK-NEXT: store i32 0, ptr [[TMP5]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF4:%.*]], label [[PRED_STORE_CONTINUE5:%.*]] -; CHECK: pred.store.if4: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP6]] +; CHECK-NEXT: store i32 0, ptr [[TMP6]], align 4 ; CHECK-NEXT: store i32 0, ptr [[TMP7]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE5]] -; CHECK: pred.store.continue5: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF6:%.*]], label [[PRED_STORE_CONTINUE7:%.*]] -; CHECK: pred.store.if6: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP8]] -; CHECK-NEXT: store i32 0, ptr [[TMP9]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE7]] -; CHECK: pred.store.continue7: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF8:%.*]], label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.if8: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP10]] -; CHECK-NEXT: store i32 0, ptr [[TMP11]], align 4 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE9]] -; CHECK: pred.store.continue9: -; 
CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 15, 12 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 16, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 12, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void @@ -69,7 +46,7 @@ ; CHECK-NEXT: store i32 0, ptr [[DST_PTR]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 15 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; entry: br label %for.body @@ -92,55 +69,32 @@ ; CHECK-NEXT: [[PTR2:%.*]] = getelementptr inbounds double, ptr [[PTR1:%.*]], i64 15 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 128 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[PTR1]], i64 96 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE12:%.*]] ] -; CHECK-NEXT: [[VEC_IV:%.*]] = add i64 [[INDEX]], 0 -; CHECK-NEXT: [[VEC_IV4:%.*]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[VEC_IV5:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[VEC_IV6:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP0:%.*]] = icmp ule i64 [[VEC_IV]], 14 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ule i64 [[VEC_IV4]], 14 -; CHECK-NEXT: [[TMP2:%.*]] = icmp ule i64 [[VEC_IV5]], 14 -; CHECK-NEXT: [[TMP3:%.*]] = icmp ule i64 [[VEC_IV6]], 14 -; CHECK-NEXT: br i1 [[TMP0]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] -; CHECK: pred.store.if: -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 8 +; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP3]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDEX]], 2 ; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 -; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] -; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] -; CHECK: pred.store.continue: -; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF7:%.*]], label [[PRED_STORE_CONTINUE8:%.*]] -; CHECK: pred.store.if7: -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 1 
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[INDEX]], 3 ; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 -; CHECK-NEXT: [[NEXT_GEP1:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] +; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP7]] +; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP]], align 8 ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP1]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE8]] -; CHECK: pred.store.continue8: -; CHECK-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF9:%.*]], label [[PRED_STORE_CONTINUE10:%.*]] -; CHECK: pred.store.if9: -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 8 -; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP9]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP2]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE10]] -; CHECK: pred.store.continue10: -; CHECK-NEXT: br i1 [[TMP3]], label [[PRED_STORE_IF11:%.*]], label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.if11: -; CHECK-NEXT: [[TMP10:%.*]] = add i64 [[INDEX]], 3 -; CHECK-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 8 -; CHECK-NEXT: [[NEXT_GEP3:%.*]] = getelementptr i8, ptr [[PTR1]], i64 [[TMP11]] ; CHECK-NEXT: store double 0.000000e+00, ptr [[NEXT_GEP3]], align 8 -; CHECK-NEXT: br label [[PRED_STORE_CONTINUE12]] -; CHECK: pred.store.continue12: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 4 -; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 16 -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12 +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 15, 12 +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -151,7 +105,7 @@ ; CHECK-NEXT: store double 0.000000e+00, ptr [[ADDR]], align 8 ; CHECK-NEXT: [[PTR]] = getelementptr inbounds double, ptr [[ADDR]], i64 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq ptr [[PTR]], [[PTR2]] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; entry: %ptr2 = getelementptr inbounds double, ptr %ptr1, i64 15 Index: llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll +++ llvm/test/Transforms/LoopVectorize/vplan-sink-scalars-and-merge.ll @@ -10,7 +10,7 @@ ; CHECK-LABEL: LV: Checking a loop in 'sink1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -74,7 +74,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: 
VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -153,7 +153,7 @@ } ; CHECK-LABEL: LV: Checking a loop in 'sink3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -234,7 +234,7 @@ ; Make sure we do not sink uniform instructions. define void @uniform_gep(i64 %k, ptr noalias %A, ptr noalias %B) { ; CHECK-LABEL: LV: Checking a loop in 'uniform_gep' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -304,7 +304,7 @@ ; Loop with predicated load. define void @pred_cfg1(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg1' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -399,7 +399,7 @@ ; loaded value. define void @pred_cfg2(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg2' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -503,7 +503,7 @@ ; on loaded value. define void @pred_cfg3(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'pred_cfg3' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -607,7 +607,7 @@ define void @merge_3_replicate_region(i32 %k, i32 %j) { ; CHECK-LABEL: LV: Checking a loop in 'merge_3_replicate_region' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -707,7 +707,7 @@ define void @update_2_uses_in_same_recipe_in_merged_block(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'update_2_uses_in_same_recipe_in_merged_block' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -769,7 +769,7 @@ define void @recipe_in_merge_candidate_used_by_first_order_recurrence(i32 %k) { ; CHECK-LABEL: LV: Checking a loop in 'recipe_in_merge_candidate_used_by_first_order_recurrence' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count @@ -980,7 +980,7 @@ ; need to be removed before merging. 
define void @merge_with_dead_gep_between_regions(i32 %n, ptr noalias %src, ptr noalias %dst) optsize { ; CHECK-LABEL: LV: Checking a loop in 'merge_with_dead_gep_between_regions' -; CHECK: VPlan 'Initial VPlan for VF={2},UF>=1' { +; CHECK: VPlan 'Initial VPlan for Tail Folded VF={2},UF>=1' { ; CHECK-NEXT: Live-in vp<[[VEC_TC:%.+]]> = vector-trip-count ; CHECK-EMPTY: ; CHECK-NEXT: Live-in vp<[[BTC:%.+]]> = backedge-taken count