Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -229,7 +229,16 @@
 
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking, and mark all respective loads/stores for masking.
-  bool prepareToFoldTailByMasking();
+  /// If SoftFailure is true, a failure to fold the tail is only logged as a
+  /// debug message instead of being reported as a vectorization failure.
+  bool prepareToFoldTailByMasking(bool SoftFailure = false);
+
+  /// Abandons tail-folding by masking. Clears the MaskedOp and
+  /// ConditionalAssumes sets entirely, regardless of what populated them.
+  void abandonTailFoldingByMasking() {
+    MaskedOp.clear();
+    ConditionalAssumes.clear();
+  }
 
   /// Returns the primary induction variable.
   PHINode *getPrimaryInduction() { return PrimaryInduction; }
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -1230,7 +1230,7 @@
   return Result;
 }
 
-bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
+bool LoopVectorizationLegality::prepareToFoldTailByMasking(bool SoftFailure) {
 
   LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
 
@@ -1249,10 +1249,17 @@
       Instruction *UI = cast<Instruction>(U);
       if (TheLoop->contains(UI))
         continue;
-      reportVectorizationFailure(
-          "Cannot fold tail by masking, loop has an outside user for",
-          "Cannot fold tail by masking in the presence of live outs.",
-          "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+      if (SoftFailure) {
+        LLVM_DEBUG(
+            dbgs()
+            << "LV: Cannot fold tail by masking, loop has an outside user for "
+            << *UI << "\n");
+      } else {
+        reportVectorizationFailure(
+            "Cannot fold tail by masking, loop has an outside user for",
+            "Cannot fold tail by masking in the presence of live outs.",
+            "LiveOutFoldingTailByMasking", ORE, TheLoop, UI);
+      }
       return false;
     }
   }
@@ -1264,11 +1271,14 @@
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
     if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
-      reportVectorizationFailure(
-          "Cannot fold tail by masking as required",
-          "control flow cannot be substituted for a select",
-          "NoCFGForSelect", ORE, TheLoop,
-          BB->getTerminator());
+      if (SoftFailure) {
+        LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking\n");
+      } else {
+        reportVectorizationFailure(
+            "Cannot fold tail by masking as required",
+            "control flow cannot be substituted for a select", "NoCFGForSelect",
+            ORE, TheLoop, BB->getTerminator());
+      }
       return false;
     }
   }
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4960,13 +4960,28 @@
     return None;
   }
 
+  // If a hint/switch requests tail-predication, check early whether it is
+  // actually possible. Otherwise, fall back to a scalar epilogue.
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+    LLVM_DEBUG(dbgs() << "LV: Vector predicate hint/switch found.\n");
+
+    if (Legal->prepareToFoldTailByMasking(/*SoftFailure=*/true))
+      FoldTailByMasking = true;
+    else {
+      LLVM_DEBUG(
+          dbgs() << "LV: Loop does not support tail-predication, ignoring "
+                    "hint/switch and falling back to a scalar epilogue.\n");
+      Legal->abandonTailFoldingByMasking();
+      ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+    }
+  }
+
   switch (ScalarEpilogueStatus) {
   case CM_ScalarEpilogueAllowed:
     return computeFeasibleMaxVF(TC);
   case CM_ScalarEpilogueNotNeededUsePredicate:
     LLVM_DEBUG(
-        dbgs() << "LV: vector predicate hint/switch found.\n"
-               << "LV: Not allowing scalar epilogue, creating predicated "
+        dbgs() << "LV: Not allowing scalar epilogue, creating predicated "
                << "vector loop.\n");
     break;
   case CM_ScalarEpilogueNotAllowedLowTripLoop:
@@ -5002,9 +5017,17 @@
   if (TC > 0 && TC % MaxVF == 0) {
     // Accept MaxVF if we do not have a tail.
     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
+    // Cleanup if we already planned to fold the tail.
+    if (FoldTailByMasking) {
+      Legal->abandonTailFoldingByMasking();
+      FoldTailByMasking = false;
+    }
     return MaxVF;
   }
 
+  if (FoldTailByMasking)
+    return MaxVF;
+
   // If we don't know the precise trip count, or if the trip count that we
   // found modulo the vectorization factor is not zero, try to fold the tail
   // by masking.
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -0,0 +1,194 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -disable-mve-tail-predication=false < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -disable-mve-tail-predication=true < %s | FileCheck %s
+
+; This test should produce the same result (vectorized loop + scalar epilogue)
+; whether MVE tail predication is enabled or explicitly disabled.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @foo(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  cond.preheader:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; CHECK-NEXT:    [[BC:%.*]] = bitcast i8** [[POS:%.*]] to i32*
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[BC]], align 4
+; CHECK-NEXT:    [[ITO:%.*]] = inttoptr i32 [[LD]] to i8*
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[END:%.*]], label [[BODY_PREHEADER:%.*]]
+; CHECK:       body.preheader:
+; CHECK-NEXT:    [[DEC62:%.*]] = add nsw i32 [[SIZE]], -1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR1363:%.*]] = getelementptr inbounds i8, i8* [[ITO]], i32 1
+; CHECK-NEXT:    store i8* [[INCDEC_PTR1363]], i8** [[POS]], align 4
+; CHECK-NEXT:    store i8 [[TMP0]], i8* [[ITO]], align 1
+; CHECK-NEXT:    [[TOBOOL1164:%.*]] = icmp eq i32 [[DEC62]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL1164]], label [[END_LOOPEXIT:%.*]], label [[BODY_CRIT_EDGE_CHECK:%.*]]
+; CHECK:       body_crit_edge.check:
+; CHECK-NEXT:    [[SCEVGEP67:%.*]] = bitcast i8** [[POS]] to i8*
+; CHECK-NEXT:    [[SCEVGEP69:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[SIZE]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[SCEVGEP69]], [[SCEVGEP67]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP67]], [[PTR]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND1]], [[BOUND0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SIZE]], -2
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS]], align 4
+; CHECK-NEXT:    [[PTI:%.*]] = ptrtoint i8* [[PTR0]] to i32
+; CHECK-NEXT:    [[PA:%.*]] = and i32 [[PTI]], -4
+; CHECK-NEXT:    [[P0S:%.*]] = inttoptr i32 [[PA]] to i8*
+; CHECK-NEXT:    [[INCDEC_PTR13_P0E:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP1]]
+; CHECK-NEXT:    [[BOUND070:%.*]] = icmp ule i8* [[P0S]], [[SCEVGEP67]]
+; CHECK-NEXT:    [[BOUND171:%.*]] = icmp uge i8* [[INCDEC_PTR13_P0E]], [[SCEVGEP67]]
+; CHECK-NEXT:    [[FOUND_CONFLICT72:%.*]] = and i1 [[BOUND171]], [[BOUND070]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT72]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[BODY_CRIT_EDGE_ORIG:%.*]], label [[BODY_CRIT_EDGE_PREHEADER:%.*]]
+; CHECK:       body_crit_edge.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP3]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[PTR]], i32 1
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[SIZE]]
+; CHECK-NEXT:    [[BOUND03:%.*]] = icmp ult i8* [[PTR0]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND14:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT5]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END7:%.*]] = sub i32 [[DEC62]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP4]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[DEC62]], [[INDEX]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> undef, i32 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <16 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 -1, i32 -2, i32 -3, i32 -4, i32 -5, i32 -6, i32 -7, i32 -8, i32 -9, i32 -10, i32 -11, i32 -12, i32 -13, i32 -14, i32 -15>
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP10]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP9]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP12]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END_LOOPEXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR0]], [[BODY_CRIT_EDGE_PREHEADER]] ], [ [[PTR0]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEC62]], [[BODY_CRIT_EDGE_PREHEADER]] ], [ [[DEC62]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi i8* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[BODY_CRIT_EDGE_PREHEADER]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[BODY_CRIT_EDGE:%.*]]
+; CHECK:       body_crit_edge.orig:
+; CHECK-NEXT:    [[INCDEC_PTR_ORIG:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i32 1
+; CHECK-NEXT:    [[DEC_ORIG:%.*]] = add nsw i32 [[DEC62]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[INCDEC_PTR_ORIG]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR13_ORIG:%.*]] = getelementptr inbounds i8, i8* [[PTR0]], i32 1
+; CHECK-NEXT:    store i8* [[INCDEC_PTR13_ORIG]], i8** [[POS]], align 4
+; CHECK-NEXT:    store i8 [[TMP14]], i8* [[PTR0]], align 1
+; CHECK-NEXT:    [[TOBOOL11_ORIG:%.*]] = icmp eq i32 [[DEC_ORIG]], 0
+; CHECK-NEXT:    br label [[END_LOOPEXIT]]
+; CHECK:       body_crit_edge:
+; CHECK-NEXT:    [[DOTPRE74:%.*]] = phi i8* [ [[INCDEC_PTR13:%.*]], [[BODY_CRIT_EDGE]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY_CRIT_EDGE]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF_06065:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY_CRIT_EDGE]] ], [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF_06065]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR13]] = getelementptr inbounds i8, i8* [[DOTPRE74]], i32 1
+; CHECK-NEXT:    store i8 [[TMP15]], i8* [[DOTPRE74]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END_LOOPEXIT_LOOPEXIT]], label [[BODY_CRIT_EDGE]], !llvm.loop !7
+; CHECK:       bodyloopexit_crit_edge:
+; CHECK-NEXT:    store i8* [[INCDEC_PTR13_LCSSA:%.*]], i8** [[POS]], align 4
+; CHECK-NEXT:    br label [[END_LOOPEXIT]]
+; CHECK:       end.loopexit.loopexit:
+; CHECK-NEXT:    [[INCDEC_PTR13_LCSSA]] = phi i8* [ [[INCDEC_PTR13]], [[BODY_CRIT_EDGE]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[END_LOOPEXIT]]
+; CHECK:       end.loopexit:
+; CHECK-NEXT:    [[DOTPRE61:%.*]] = load i8*, i8** [[POS]], align 4
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+
+cond.preheader:
+  %tobool = icmp eq i32 %size, 0
+  %bc = bitcast i8** %pos to i32*
+  %ld = load i32, i32* %bc, align 4
+  %ito = inttoptr i32 %ld to i8*
+  br i1 %tobool, label %end, label %body.preheader
+
+body.preheader:
+  %dec62 = add nsw i32 %size, -1
+  %0 = load i8, i8* %ptr, align 1
+  %incdec.ptr1363 = getelementptr inbounds i8, i8* %ito, i32 1
+  store i8* %incdec.ptr1363, i8** %pos, align 4
+  store i8 %0, i8* %ito, align 1
+  %tobool1164 = icmp eq i32 %dec62, 0
+  br i1 %tobool1164, label %end.loopexit, label %body_crit_edge.check
+
+body_crit_edge.check:
+  %scevgep67 = bitcast i8** %pos to i8*
+  %scevgep69 = getelementptr i8, i8* %ptr, i32 %size
+  %bound0 = icmp ugt i8* %scevgep69, %scevgep67
+  %bound1 = icmp ugt i8* %scevgep67, %ptr
+  %found.conflict = and i1 %bound1, %bound0
+  %1 = add i32 %size, -2
+  %ptr0 = load i8*, i8** %pos, align 4
+  %pti = ptrtoint i8* %ptr0 to i32
+  %pa = and i32 %pti, -4
+  %p0s = inttoptr i32 %pa to i8*
+  %incdec.ptr13.p0e = getelementptr i8, i8* %ptr0, i32 %1
+  %bound070 = icmp ule i8* %p0s, %scevgep67
+  %bound171 = icmp uge i8* %incdec.ptr13.p0e, %scevgep67
+  %found.conflict72 = and i1 %bound171, %bound070
+  %conflict.rdx = or i1 %found.conflict, %found.conflict72
+  br i1 %conflict.rdx, label %body_crit_edge.orig, label %body_crit_edge
+
+body_crit_edge.orig:
+  %incdec.ptr.orig = getelementptr inbounds i8, i8* %ptr, i32 1
+  %dec.orig = add nsw i32 %dec62, -1
+  %2 = load i8, i8* %incdec.ptr.orig, align 1
+  %incdec.ptr13.orig = getelementptr inbounds i8, i8* %ptr0, i32 1
+  store i8* %incdec.ptr13.orig, i8** %pos, align 4
+  store i8 %2, i8* %ptr0, align 1
+  %tobool11.orig = icmp eq i32 %dec.orig, 0
+  br label %end.loopexit
+
+body_crit_edge:
+  %.pre74 = phi i8* [ %incdec.ptr13, %body_crit_edge ], [ %ptr0, %body_crit_edge.check ]
+  %dec66 = phi i32 [ %dec, %body_crit_edge ], [ %dec62, %body_crit_edge.check ]
+  %buff.06065 = phi i8* [ %incdec.ptr, %body_crit_edge ], [ %ptr, %body_crit_edge.check ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff.06065, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %3 = load i8, i8* %incdec.ptr, align 1
+  %incdec.ptr13 = getelementptr inbounds i8, i8* %.pre74, i32 1
+  store i8 %3, i8* %.pre74, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end.loopexit, label %body_crit_edge
+
+bodyloopexit_crit_edge:
+  store i8* %incdec.ptr13, i8** %pos, align 4
+  br label %end.loopexit
+
+end.loopexit:
+  %.pre61 = load i8*, i8** %pos, align 4
+  br label %end
+
+end:
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -0,0 +1,193 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilog < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+; Forcing tail-predication (TP) must not change the output: this loop can't be
+; tail-predicated, so the vectorizer should fall back to a scalar epilogue.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @foo(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  cond.preheader:
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp eq i32 [[SIZE:%.*]], 0
+; CHECK-NEXT:    [[BC:%.*]] = bitcast i8** [[POS:%.*]] to i32*
+; CHECK-NEXT:    [[LD:%.*]] = load i32, i32* [[BC]], align 4
+; CHECK-NEXT:    [[ITO:%.*]] = inttoptr i32 [[LD]] to i8*
+; CHECK-NEXT:    br i1 [[TOBOOL]], label [[END:%.*]], label [[BODY_PREHEADER:%.*]]
+; CHECK:       body.preheader:
+; CHECK-NEXT:    [[DEC62:%.*]] = add nsw i32 [[SIZE]], -1
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[PTR:%.*]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR1363:%.*]] = getelementptr inbounds i8, i8* [[ITO]], i32 1
+; CHECK-NEXT:    store i8* [[INCDEC_PTR1363]], i8** [[POS]], align 4
+; CHECK-NEXT:    store i8 [[TMP0]], i8* [[ITO]], align 1
+; CHECK-NEXT:    [[TOBOOL1164:%.*]] = icmp eq i32 [[DEC62]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL1164]], label [[END_LOOPEXIT:%.*]], label [[BODY_CRIT_EDGE_CHECK:%.*]]
+; CHECK:       body_crit_edge.check:
+; CHECK-NEXT:    [[SCEVGEP67:%.*]] = bitcast i8** [[POS]] to i8*
+; CHECK-NEXT:    [[SCEVGEP69:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[SIZE]]
+; CHECK-NEXT:    [[BOUND0:%.*]] = icmp ugt i8* [[SCEVGEP69]], [[SCEVGEP67]]
+; CHECK-NEXT:    [[BOUND1:%.*]] = icmp ugt i8* [[SCEVGEP67]], [[PTR]]
+; CHECK-NEXT:    [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND1]], [[BOUND0]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SIZE]], -2
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS]], align 4
+; CHECK-NEXT:    [[PTI:%.*]] = ptrtoint i8* [[PTR0]] to i32
+; CHECK-NEXT:    [[PA:%.*]] = and i32 [[PTI]], -4
+; CHECK-NEXT:    [[P0S:%.*]] = inttoptr i32 [[PA]] to i8*
+; CHECK-NEXT:    [[INCDEC_PTR13_P0E:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP1]]
+; CHECK-NEXT:    [[BOUND070:%.*]] = icmp ule i8* [[P0S]], [[SCEVGEP67]]
+; CHECK-NEXT:    [[BOUND171:%.*]] = icmp uge i8* [[INCDEC_PTR13_P0E]], [[SCEVGEP67]]
+; CHECK-NEXT:    [[FOUND_CONFLICT72:%.*]] = and i1 [[BOUND171]], [[BOUND070]]
+; CHECK-NEXT:    [[CONFLICT_RDX:%.*]] = or i1 [[FOUND_CONFLICT]], [[FOUND_CONFLICT72]]
+; CHECK-NEXT:    br i1 [[CONFLICT_RDX]], label [[BODY_CRIT_EDGE_ORIG:%.*]], label [[BODY_CRIT_EDGE_PREHEADER:%.*]]
+; CHECK:       body_crit_edge.preheader:
+; CHECK-NEXT:    [[TMP2:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]]
+; CHECK:       vector.memcheck:
+; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[SIZE]], -1
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP3]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = getelementptr i8, i8* [[PTR]], i32 1
+; CHECK-NEXT:    [[SCEVGEP2:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[SIZE]]
+; CHECK-NEXT:    [[BOUND03:%.*]] = icmp ult i8* [[PTR0]], [[SCEVGEP2]]
+; CHECK-NEXT:    [[BOUND14:%.*]] = icmp ult i8* [[SCEVGEP1]], [[SCEVGEP]]
+; CHECK-NEXT:    [[FOUND_CONFLICT5:%.*]] = and i1 [[BOUND03]], [[BOUND14]]
+; CHECK-NEXT:    [[MEMCHECK_CONFLICT:%.*]] = and i1 [[FOUND_CONFLICT5]], true
+; CHECK-NEXT:    br i1 [[MEMCHECK_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[TMP2]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[N_VEC]]
+; CHECK-NEXT:    [[IND_END7:%.*]] = sub i32 [[DEC62]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END9:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR0]], i32 [[TMP4]]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[DEC62]], [[INDEX]]
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[OFFSET_IDX]], i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 -1, i32 -2, i32 -3>
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP10:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP10]], i32 1
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i8, i8* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[TMP8]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP9]], align 1, !alias.scope !0
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP12:%.*]] = bitcast i8* [[TMP11]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP12]], align 1, !alias.scope !3, !noalias !0
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !5
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END_LOOPEXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i8* [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[PTR0]], [[BODY_CRIT_EDGE_PREHEADER]] ], [ [[PTR0]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL6:%.*]] = phi i32 [ [[IND_END7]], [[MIDDLE_BLOCK]] ], [ [[DEC62]], [[BODY_CRIT_EDGE_PREHEADER]] ], [ [[DEC62]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL8:%.*]] = phi i8* [ [[IND_END9]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[BODY_CRIT_EDGE_PREHEADER]] ], [ [[PTR]], [[VECTOR_MEMCHECK]] ]
+; CHECK-NEXT:    br label [[BODY_CRIT_EDGE:%.*]]
+; CHECK:       body_crit_edge.orig:
+; CHECK-NEXT:    [[INCDEC_PTR_ORIG:%.*]] = getelementptr inbounds i8, i8* [[PTR]], i32 1
+; CHECK-NEXT:    [[DEC_ORIG:%.*]] = add nsw i32 [[DEC62]], -1
+; CHECK-NEXT:    [[TMP14:%.*]] = load i8, i8* [[INCDEC_PTR_ORIG]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR13_ORIG:%.*]] = getelementptr inbounds i8, i8* [[PTR0]], i32 1
+; CHECK-NEXT:    store i8* [[INCDEC_PTR13_ORIG]], i8** [[POS]], align 4
+; CHECK-NEXT:    store i8 [[TMP14]], i8* [[PTR0]], align 1
+; CHECK-NEXT:    [[TOBOOL11_ORIG:%.*]] = icmp eq i32 [[DEC_ORIG]], 0
+; CHECK-NEXT:    br label [[END_LOOPEXIT]]
+; CHECK:       body_crit_edge:
+; CHECK-NEXT:    [[DOTPRE74:%.*]] = phi i8* [ [[INCDEC_PTR13:%.*]], [[BODY_CRIT_EDGE]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY_CRIT_EDGE]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF_06065:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY_CRIT_EDGE]] ], [ [[BC_RESUME_VAL8]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF_06065]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    [[INCDEC_PTR13]] = getelementptr inbounds i8, i8* [[DOTPRE74]], i32 1
+; CHECK-NEXT:    store i8 [[TMP15]], i8* [[DOTPRE74]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END_LOOPEXIT_LOOPEXIT]], label [[BODY_CRIT_EDGE]], !llvm.loop !7
+; CHECK:       bodyloopexit_crit_edge:
+; CHECK-NEXT:    store i8* [[INCDEC_PTR13_LCSSA:%.*]], i8** [[POS]], align 4
+; CHECK-NEXT:    br label [[END_LOOPEXIT]]
+; CHECK:       end.loopexit.loopexit:
+; CHECK-NEXT:    [[INCDEC_PTR13_LCSSA]] = phi i8* [ [[INCDEC_PTR13]], [[BODY_CRIT_EDGE]] ], [ [[IND_END]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    br label [[END_LOOPEXIT]]
+; CHECK:       end.loopexit:
+; CHECK-NEXT:    [[DOTPRE61:%.*]] = load i8*, i8** [[POS]], align 4
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+
+cond.preheader:
+  %tobool = icmp eq i32 %size, 0
+  %bc = bitcast i8** %pos to i32*
+  %ld = load i32, i32* %bc, align 4
+  %ito = inttoptr i32 %ld to i8*
+  br i1 %tobool, label %end, label %body.preheader
+
+body.preheader:
+  %dec62 = add nsw i32 %size, -1
+  %0 = load i8, i8* %ptr, align 1
+  %incdec.ptr1363 = getelementptr inbounds i8, i8* %ito, i32 1
+  store i8* %incdec.ptr1363, i8** %pos, align 4
+  store i8 %0, i8* %ito, align 1
+  %tobool1164 = icmp eq i32 %dec62, 0
+  br i1 %tobool1164, label %end.loopexit, label %body_crit_edge.check
+
+body_crit_edge.check:
+  %scevgep67 = bitcast i8** %pos to i8*
+  %scevgep69 = getelementptr i8, i8* %ptr, i32 %size
+  %bound0 = icmp ugt i8* %scevgep69, %scevgep67
+  %bound1 = icmp ugt i8* %scevgep67, %ptr
+  %found.conflict = and i1 %bound1, %bound0
+  %1 = add i32 %size, -2
+  %ptr0 = load i8*, i8** %pos, align 4
+  %pti = ptrtoint i8* %ptr0 to i32
+  %pa = and i32 %pti, -4
+  %p0s = inttoptr i32 %pa to i8*
+  %incdec.ptr13.p0e = getelementptr i8, i8* %ptr0, i32 %1
+  %bound070 = icmp ule i8* %p0s, %scevgep67
+  %bound171 = icmp uge i8* %incdec.ptr13.p0e, %scevgep67
+  %found.conflict72 = and i1 %bound171, %bound070
+  %conflict.rdx = or i1 %found.conflict, %found.conflict72
+  br i1 %conflict.rdx, label %body_crit_edge.orig, label %body_crit_edge
+
+body_crit_edge.orig:
+  %incdec.ptr.orig = getelementptr inbounds i8, i8* %ptr, i32 1
+  %dec.orig = add nsw i32 %dec62, -1
+  %2 = load i8, i8* %incdec.ptr.orig, align 1
+  %incdec.ptr13.orig = getelementptr inbounds i8, i8* %ptr0, i32 1
+  store i8* %incdec.ptr13.orig, i8** %pos, align 4
+  store i8 %2, i8* %ptr0, align 1
+  %tobool11.orig = icmp eq i32 %dec.orig, 0
+  br label %end.loopexit
+
+body_crit_edge:
+  %.pre74 = phi i8* [ %incdec.ptr13, %body_crit_edge ], [ %ptr0, %body_crit_edge.check ]
+  %dec66 = phi i32 [ %dec, %body_crit_edge ], [ %dec62, %body_crit_edge.check ]
+  %buff.06065 = phi i8* [ %incdec.ptr, %body_crit_edge ], [ %ptr, %body_crit_edge.check ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff.06065, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %3 = load i8, i8* %incdec.ptr, align 1
+  %incdec.ptr13 = getelementptr inbounds i8, i8* %.pre74, i32 1
+  store i8 %3, i8* %.pre74, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end.loopexit, label %body_crit_edge
+
+bodyloopexit_crit_edge:
+  store i8* %incdec.ptr13, i8** %pos, align 4
+  br label %end.loopexit
+
+end.loopexit:
+  %.pre61 = load i8*, i8** %pos, align 4
+  br label %end
+
+end:
+  ret void
+}