Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -795,6 +795,7 @@ unsigned MainLoopUF = 0; ElementCount EpilogueVF = ElementCount::getFixed(0); unsigned EpilogueUF = 0; + bool TailFoldEpilogue = false; BasicBlock *MainLoopIterationCountCheck = nullptr; BasicBlock *EpilogueIterationCountCheck = nullptr; BasicBlock *SCEVSafetyCheck = nullptr; @@ -803,8 +804,9 @@ Value *VectorTripCount = nullptr; EpilogueLoopVectorizationInfo(ElementCount MVF, unsigned MUF, - ElementCount EVF, unsigned EUF) - : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF) { + ElementCount EVF, unsigned EUF, bool TFE) + : MainLoopVF(MVF), MainLoopUF(MUF), EpilogueVF(EVF), EpilogueUF(EUF), + TailFoldEpilogue(TFE) { assert(EUF == 1 && "A high UF for the epilogue loop is likely not beneficial."); } @@ -1225,7 +1227,7 @@ selectVectorizationFactor(const ElementCountSet &CandidateVFs); VectorizationFactor - selectEpilogueVectorizationFactor(const ElementCount MaxVF, + selectEpilogueVectorizationFactor(const VectorizationScheme MaxVF, const LoopVectorizationPlanner &LVP); /// Setup cost-based decisions for user vectorization factor. @@ -1895,7 +1897,8 @@ /// Returns true if epilogue vectorization is considered profitable, and /// false otherwise. /// \p VF is the vectorization factor chosen for the original loop. - bool isEpilogueVectorizationProfitable(const ElementCount VF) const; + bool + isUnpredicatedEpilogueVectorizationProfitable(const ElementCount VF) const; public: /// The loop that we evaluate. 
@@ -5639,7 +5642,7 @@ return true; } -bool LoopVectorizationCostModel::isEpilogueVectorizationProfitable( +bool LoopVectorizationCostModel::isUnpredicatedEpilogueVectorizationProfitable( const ElementCount VF) const { // FIXME: We need a much better cost-model to take different parameters such // as register pressure, code size increase and cost of extra branches into @@ -5663,26 +5666,23 @@ VectorizationFactor LoopVectorizationCostModel::selectEpilogueVectorizationFactor( - const ElementCount MainLoopVF, const LoopVectorizationPlanner &LVP) { + const VectorizationScheme MainLoopVF, const LoopVectorizationPlanner &LVP) { VectorizationFactor Result = VectorizationFactor::Disabled(); if (!EnableEpilogueVectorization) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is disabled.\n";); return Result; } - if (!isScalarEpilogueAllowed()) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because no epilogue is " - "allowed.\n";); + if (MainLoopVF.FoldTailByMasking) { + LLVM_DEBUG(dbgs() << "LEV: Epilogue not required due to tail folding.\n";); return Result; } // Not really a cost consideration, but check for unsupported cases here to // simplify the logic. - if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF)) { - LLVM_DEBUG( - dbgs() << "LEV: Unable to vectorize epilogue because the loop is " - "not a supported candidate.\n";); + if (!isCandidateForEpilogueVectorization(*TheLoop, MainLoopVF.Width)) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because the loop " + "is not a supported candidate.\n";); return Result; } @@ -5707,7 +5707,33 @@ return Result; } - if (!isEpilogueVectorizationProfitable(MainLoopVF)) { + if (mayFoldTailByMasking()) { + // If we can fold the tail by masking to produce a predicated epilog, + // attempt to pick the scheme with the lowest cost providing it is more + // profitable than scalar. 
+ VectorizationFactor BestFoldedVF = VectorizationFactor::Disabled(); + for (auto &VF : ProfitableVFs) { + if (VF.Scheme.FoldTailByMasking && + VF.Scheme.Width.isScalable() == MainLoopVF.Width.isScalable() && + ElementCount::isKnownLE(VF.Scheme.Width, MainLoopVF.Width) && + (BestFoldedVF.Scheme.Width.isScalar() || isMoreProfitable(VF, BestFoldedVF))) + BestFoldedVF = VF; + } + if (BestFoldedVF != VectorizationFactor::Disabled()) { + LLVM_DEBUG( + dbgs() << "LEV: Vectorizing predicated epilogue loop with VF = " + << BestFoldedVF.Scheme.Width << "\n";); + return BestFoldedVF; + } + } + + if (!isScalarEpilogueAllowed()) { + LLVM_DEBUG(dbgs() << "LEV: Unable to vectorize epilogue because no " + "epilogue is allowed.\n";); + return Result; + } + + if (!isUnpredicatedEpilogueVectorizationProfitable(MainLoopVF.Width)) { LLVM_DEBUG(dbgs() << "LEV: Epilogue vectorization is not profitable for " "this loop\n"); return Result; @@ -5716,17 +5742,18 @@ // If MainLoopVF = vscale x 2, and vscale is expected to be 4, then we know // the main loop handles 8 lanes per iteration. We could still benefit from // vectorizing the epilogue loop with VF=4.
- ElementCount EstimatedRuntimeVF = MainLoopVF; - if (MainLoopVF.isScalable()) { - EstimatedRuntimeVF = ElementCount::getFixed(MainLoopVF.getKnownMinValue()); + ElementCount EstimatedRuntimeVF = MainLoopVF.Width; + if (MainLoopVF.Width.isScalable()) { + EstimatedRuntimeVF = + ElementCount::getFixed(MainLoopVF.Width.getKnownMinValue()); if (std::optional VScale = getVScaleForTuning()) EstimatedRuntimeVF *= *VScale; } for (auto &NextVF : ProfitableVFs) - if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.isScalable() && + if (((!NextVF.Scheme.Width.isScalable() && MainLoopVF.Width.isScalable() && ElementCount::isKnownLT(NextVF.Scheme.Width, EstimatedRuntimeVF)) || - ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF)) && + ElementCount::isKnownLT(NextVF.Scheme.Width, MainLoopVF.Width)) && (Result.Scheme.Width.isScalar() || isMoreProfitable(NextVF, Result)) && LVP.hasPlanWithVF(false, NextVF.Scheme.Width)) Result = NextVF; @@ -5828,9 +5855,8 @@ // overhead. // 3. We don't interleave if we think that we will spill registers to memory // due to the increased register pressure. - - if (!isScalarEpilogueAllowed() || - TheLoop->getHeader()->getParent()->hasOptSize()) + if (ScalarEpilogueStatus == CM_ScalarEpilogueNotAllowedUsePredicate || + VF.FoldTailByMasking || TheLoop->getHeader()->getParent()->hasOptSize()) return 1; // We used the distance for the interleave count. @@ -7949,10 +7975,10 @@ void EpilogueVectorizerMainLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (first pass)\n" - << "Main Loop VF:" << EPI.MainLoopVF - << ", Main Loop UF:" << EPI.MainLoopUF + << "Main Loop VF:" << EPI.MainLoopVF << ", UF:" << EPI.MainLoopUF << ", Epilogue Loop VF:" << EPI.EpilogueVF - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + << ", UF:" << EPI.EpilogueUF + << (EPI.TailFoldEpilogue ? ", with predication" : "") << "\n"; }); } @@ -7980,9 +8006,13 @@ auto P = Cost->requiresScalarEpilogue(ForEpilogue ? 
EPI.EpilogueVF : VF) ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT; - Value *CheckMinIters = Builder.CreateICmp( - P, Count, createStepForVF(Builder, Count->getType(), VFactor, UFactor), - "min.iters.check"); + Value *CheckMinIters = + (ForEpilogue && EPI.TailFoldEpilogue) + ? Builder.getFalse() + : Builder.CreateICmp( + P, Count, + createStepForVF(Builder, Count->getType(), VFactor, UFactor), + "min.iters.check"); if (!ForEpilogue) TCCheckBlock->setName("vector.main.loop.iter.check"); @@ -8136,6 +8166,17 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert) { + // If we are creating a predicated epilogue loop, always jump to it. + if (EPI.TailFoldEpilogue) { + ReplaceInstWithInst( + Insert->getTerminator(), + BranchInst::Create(Bypass, LoopVectorPreHeader, + ConstantInt::getFalse(Insert->getContext()))); + + LoopBypassBlocks.push_back(Insert); + return Insert; + } + assert(EPI.TripCount && "Expected trip count to have been safed in the first pass."); assert( @@ -8168,8 +8209,8 @@ void EpilogueVectorizerEpilogueLoop::printDebugTracesAtStart() { LLVM_DEBUG({ dbgs() << "Create Skeleton for epilogue vectorized loop (second pass)\n" - << "Epilogue Loop VF:" << EPI.EpilogueVF - << ", Epilogue Loop UF:" << EPI.EpilogueUF << "\n"; + << "Epilogue Loop VF:" << EPI.EpilogueVF << ", UF:" << EPI.EpilogueUF + << (EPI.TailFoldEpilogue ? ", with predication" : "") << "\n"; }); } @@ -10554,15 +10595,14 @@ // Consider vectorizing the epilogue too if it's profitable. VectorizationFactor EpilogueVF = - CM.selectEpilogueVectorizationFactor(VF.Scheme.Width, LVP); + CM.selectEpilogueVectorizationFactor(VF.Scheme, LVP); if (EpilogueVF.Scheme.Width.isVector()) { - // The first pass vectorizes the main loop and creates a scalar epilogue // to be vectorized by executing the plan (potentially with a different // factor) again shortly afterwards. 
- // TODOD: Predicated remainders EpilogueLoopVectorizationInfo EPI(VF.Scheme.Width, IC, - EpilogueVF.Scheme.Width, 1); + EpilogueVF.Scheme.Width, 1, + EpilogueVF.Scheme.FoldTailByMasking); EpilogueVectorizerMainLoop MainILV(L, PSE, LI, DT, TLI, TTI, AC, ORE, EPI, &LVL, &CM, BFI, PSI, Checks); @@ -10579,7 +10619,8 @@ ORE, EPI, &LVL, &CM, BFI, PSI, Checks); - VPlan &BestEpiPlan = LVP.getBestPlanFor({false, EPI.EpilogueVF}); + VPlan &BestEpiPlan = LVP.getBestPlanFor( + {EpilogueVF.Scheme.FoldTailByMasking, EPI.EpilogueVF}); VPRegionBlock *VectorLoop = BestEpiPlan.getVectorLoopRegion(); VPBasicBlock *Header = VectorLoop->getEntryBasicBlock(); Header->setName("vec.epilog.vector.body"); Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -644,7 +644,8 @@ assert(all_of(IV->users(), [](const VPUser *U) { if (isa(U) || - isa(U)) + isa(U) || + isa(U)) return true; auto *VPI = cast(U); return VPI->getOpcode() == @@ -653,8 +654,7 @@ VPInstruction::CanonicalIVIncrementNUW; }) && "the canonical IV should only be used by its increments or " - "ScalarIVSteps when " - "resetting the start value"); + "ScalarIVSteps when resetting the start value"); IV->setOperand(0, VPV); } } Index: llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll +++ llvm/test/Transforms/LoopVectorize/AArch64/sve-epilog-vect.ll @@ -11,12 +11,12 @@ ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_16' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:vscale x 8, Epilogue Loop UF:1 +; DEBUG: Main Loop VF:vscale x 16, UF:2, Epilogue Loop VF:vscale x 8, UF:1 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_16' ; DEBUG-FORCED: LEV: 
Epilogue vectorization factor is forced. ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG-FORCED: Main Loop VF:vscale x 16, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG-FORCED: Main Loop VF:vscale x 16, UF:2, Epilogue Loop VF:8, UF:1 define void @main_vf_vscale_x_16(ptr %A) #0 { ; CHECK-LABEL: @main_vf_vscale_x_16( @@ -188,12 +188,12 @@ ; DEBUG: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG: Main Loop VF:vscale x 2, UF:2, Epilogue Loop VF:8, UF:1 ; DEBUG-FORCED: LV: Checking a loop in 'main_vf_vscale_x_2' ; DEBUG-FORCED: LEV: Epilogue vectorization factor is forced. ; DEBUG-FORCED: Create Skeleton for epilogue vectorized loop (first pass) -; DEBUG-FORCED: Main Loop VF:vscale x 2, Main Loop UF:2, Epilogue Loop VF:8, Epilogue Loop UF:1 +; DEBUG-FORCED: Main Loop VF:vscale x 2, UF:2, Epilogue Loop VF:8, UF:1 ; When the vector.body uses VF=vscale x 1 (or VF=vscale x 2 because ; that's the minimum supported VF by SVE), we could still use a wide Index: llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll +++ llvm/test/Transforms/LoopVectorize/ARM/mve-epilogs.ll @@ -8,10 +8,12 @@ ; CHECK-LABEL: @raddshift2( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP10]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: 
br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -39,18 +41,50 @@ ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP14]], i32 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP16]], i32 1, <8 x i1> 
[[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP19]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD5]] to <8 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i16> [[TMP17]], +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw <8 x i16> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = lshr <8 x i16> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP24]], ptr [[TMP26]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT6]] = add i32 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr 
[[SRC1]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i16 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP29]] to i16 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV]], 2 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD]], [[CONV2]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ADD3]], 2 @@ -59,7 +93,7 @@ ; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: @@ -103,10 +137,12 @@ ; CHECK-LABEL: @rhadd( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[N:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: +; CHECK-NEXT: br i1 [[CMP10]], label [[ITER_CHECK:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 16 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br 
i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N]], 16 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N]], [[N_MOD_VF]] @@ -131,21 +167,53 @@ ; CHECK-NEXT: store <16 x i8> [[TMP10]], ptr [[TMP12]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 7 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 8 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP14:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 [[TMP14]], i32 [[N]]) +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 
[[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP16]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD]] to <8 x i16> +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0(ptr [[TMP19]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]], <8 x i8> poison) +; CHECK-NEXT: [[TMP20:%.*]] = zext <8 x i8> [[WIDE_MASKED_LOAD5]] to <8 x i16> +; CHECK-NEXT: [[TMP21:%.*]] = add nuw nsw <8 x i16> [[TMP17]], +; CHECK-NEXT: [[TMP22:%.*]] = add nuw nsw <8 x i16> [[TMP21]], [[TMP20]] +; CHECK-NEXT: [[TMP23:%.*]] = lshr <8 x i16> [[TMP22]], +; CHECK-NEXT: [[TMP24:%.*]] = trunc <8 x i16> [[TMP23]] to <8 x i8> +; CHECK-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[DST]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP25]], i32 0 +; CHECK-NEXT: call void @llvm.masked.store.v8i8.p0(<8 x i8> [[TMP24]], ptr [[TMP26]], i32 1, <8 x i1> [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT6]] = add i32 [[INDEX4]], 8 +; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i32 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP27]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], 
[[SCALAR_PH]] ] +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[SRC1]], i32 [[I_011]] -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP14]] to i16 +; CHECK-NEXT: [[TMP28:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[TMP28]] to i16 ; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[SRC2]], i32 [[I_011]] -; CHECK-NEXT: [[TMP15:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CONV2:%.*]] = zext i8 [[TMP29]] to i16 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i16 [[CONV]], 1 ; CHECK-NEXT: [[ADD3:%.*]] = add nuw nsw i16 [[ADD]], [[CONV2]] ; CHECK-NEXT: [[SHR:%.*]] = lshr i16 [[ADD3]], 1 @@ -154,7 +222,7 @@ ; CHECK-NEXT: store i8 [[CONV4]], ptr [[ARRAYIDX5]], align 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_011]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: for.cond.cleanup.loopexit: ; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] ; CHECK: for.cond.cleanup: @@ -207,10 +275,12 @@ ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i32 [ [[CONV2]], [[FOR_BODY_LR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_END:%.*]] ] ; CHECK-NEXT: [[LAG_032:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC16:%.*]], [[FOR_END]] ] ; CHECK-NEXT: [[CMP428:%.*]] = icmp slt i32 [[LAG_032]], [[CONV2]] -; CHECK-NEXT: br i1 [[CMP428]], label [[FOR_BODY6_PREHEADER:%.*]], label [[FOR_END]] -; CHECK: for.body6.preheader: +; CHECK-NEXT: br i1 [[CMP428]], label 
[[ITER_CHECK:%.*]], label [[FOR_END]] +; CHECK: iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[INDVARS_IV]], 8 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[INDVARS_IV]], 8 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[INDVARS_IV]], [[N_MOD_VF]] @@ -236,37 +306,72 @@ ; CHECK-NEXT: [[TMP11]] = add i32 [[TMP10]], [[VEC_PHI]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[INDVARS_IV]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY6_PREHEADER]] ] -; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: br i1 false, label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ] +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 
[[INDVARS_IV]], 3 +; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[N_RND_UP]], 4 +; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF2]] +; CHECK-NEXT: [[BROADCAST_SPLATINSERT7:%.*]] = insertelement <4 x i32> poison, i32 [[CONV1027]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT8:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT7]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i32 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT9:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[VEC_EPILOG_PH]] ], [ [[TMP25:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = add i32 [[INDEX4]], 0 +; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 [[TMP13]], i32 [[INDVARS_IV]]) +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i16, ptr [[TMP14]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP15]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison) +; CHECK-NEXT: [[TMP16:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD]] to <4 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw i32 [[TMP13]], [[LAG_032]] +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[TMP17]] +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i16, ptr [[TMP18]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_LOAD6:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0(ptr [[TMP19]], i32 2, <4 x i1> [[ACTIVE_LANE_MASK]], <4 x i16> poison) +; CHECK-NEXT: [[TMP20:%.*]] = sext <4 x i16> [[WIDE_MASKED_LOAD6]] to <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = mul nsw <4 x i32> [[TMP20]], [[TMP16]] +; CHECK-NEXT: [[TMP22:%.*]] = ashr <4 x i32> [[TMP21]], [[BROADCAST_SPLAT8]] +; CHECK-NEXT: [[TMP23:%.*]] = select 
<4 x i1> [[ACTIVE_LANE_MASK]], <4 x i32> [[TMP22]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP24:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP23]]) +; CHECK-NEXT: [[TMP25]] = add i32 [[TMP24]], [[VEC_PHI5]] +; CHECK-NEXT: [[INDEX_NEXT9]] = add i32 [[INDEX4]], 4 +; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[INDEX_NEXT9]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP26]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ] +; CHECK-NEXT: [[BC_MERGE_RDX10:%.*]] = phi i32 [ 0, [[ITER_CHECK]] ], [ [[TMP11]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[TMP25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK: for.body6: -; CHECK-NEXT: [[ACCUMULATOR_030:%.*]] = phi i32 [ [[ADD11:%.*]], [[FOR_BODY6]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[ACCUMULATOR_030:%.*]] = phi i32 [ [[ADD11:%.*]], [[FOR_BODY6]] ], [ [[BC_MERGE_RDX10]], [[VEC_EPILOG_SCALAR_PH]] ] +; CHECK-NEXT: [[I_029:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[I_029]] -; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 -; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP13]] to i32 +; CHECK-NEXT: [[TMP27:%.*]] = load i16, ptr [[ARRAYIDX]], align 2 +; CHECK-NEXT: [[CONV7:%.*]] = sext i16 [[TMP27]] to i32 ; CHECK-NEXT: [[ADD:%.*]] = add nuw nsw i32 [[I_029]], [[LAG_032]] ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i16, ptr [[INPUTDATA]], i32 [[ADD]] -; 
CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2 -; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP28:%.*]] = load i16, ptr [[ARRAYIDX8]], align 2 +; CHECK-NEXT: [[CONV9:%.*]] = sext i16 [[TMP28]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV9]], [[CONV7]] ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], [[CONV1027]] ; CHECK-NEXT: [[ADD11]] = add nsw i32 [[SHR]], [[ACCUMULATOR_030]] ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_029]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC]], [[INDVARS_IV]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY6]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY6]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.end.loopexit: -; CHECK-NEXT: [[ADD11_LCSSA:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY6]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD11_LCSSA:%.*]] = phi i32 [ [[ADD11]], [[FOR_BODY6]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ], [ [[TMP25]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] ; CHECK-NEXT: br label [[FOR_END]] ; CHECK: for.end: ; CHECK-NEXT: [[ACCUMULATOR_0_LCSSA:%.*]] = phi i32 [ 0, [[FOR_BODY]] ], [ [[ADD11_LCSSA]], [[FOR_END_LOOPEXIT]] ] -; CHECK-NEXT: [[TMP15:%.*]] = lshr i32 [[ACCUMULATOR_0_LCSSA]], 16 -; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP15]] to i16 +; CHECK-NEXT: [[TMP29:%.*]] = lshr i32 [[ACCUMULATOR_0_LCSSA]], 16 +; CHECK-NEXT: [[CONV13:%.*]] = trunc i32 [[TMP29]] to i16 ; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, ptr [[AUTOCORRDATA:%.*]], i32 [[LAG_032]] ; CHECK-NEXT: store i16 [[CONV13]], ptr [[ARRAYIDX14]], align 2 ; CHECK-NEXT: [[INC16]] = add nuw nsw i32 [[LAG_032]], 1 Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll +++ 
llvm/test/Transforms/LoopVectorize/ARM/tail-folding-reduces-vf.ll @@ -1,19 +1,25 @@ -; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=DEFAULT +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -tail-predication=disabled -S | FileCheck %s --check-prefixes=NOTAILPRED ; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TAILPRED -; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=DEFAULT - +; RUN: opt -opaque-pointers=0 < %s -mattr=+mve,+mve.fp -passes=loop-vectorize -S | FileCheck %s --check-prefixes=DEFAULT target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "thumbv8.1m.main-arm-none-eabi" ; When TP is disabled, this test can vectorize with a VF of 16. ; When TP is enabled, this test should vectorize with a VF of 8. -; When both are allowed, the VF=16 without tail folding should win out. +; When both are allowed, the VF=16 without tail folding should win out with a +; predicated remainder. 
; ; DEFAULT: load <16 x i8>, <16 x i8>* ; DEFAULT: sext <16 x i8> %{{.*}} to <16 x i16> ; DEFAULT: add <16 x i16> -; DEFAULT-NOT: llvm.masked.load -; DEFAULT-NOT: llvm.masked.store +; DEFAULT: vec.epilog.vector.body: +; DEFAULT: llvm.masked.load +; DEFAULT: llvm.masked.store +; +; NOTAILPRED: load <16 x i8>, <16 x i8>* +; NOTAILPRED: sext <16 x i8> %{{.*}} to <16 x i16> +; NOTAILPRED: add <16 x i16> +; NOTAILPRED-NOT: vec.epilog.vector.body: ; ; TAILPRED: llvm.masked.load.v8i8.p0v8i8 ; TAILPRED: sext <8 x i8> %{{.*}} to <8 x i16> Index: llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll +++ llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization-scalable.ll @@ -6,7 +6,7 @@ ; Currently we cannot handle scalable vectorization factors. ; CHECK: LV: Checking a loop in 'f1' ; CHECK: LEV: Epilogue vectorization factor is forced. -; CHECK: Epilogue Loop VF:2, Epilogue Loop UF:1 +; CHECK: Epilogue Loop VF:2, UF:1 define void @f1(ptr %A) { entry: