Index: llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
===================================================================
--- llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
+++ llvm/include/llvm/Transforms/Vectorize/LoopVectorizationLegality.h
@@ -229,6 +229,7 @@
 
   /// Return true if we can vectorize this loop while folding its tail by
   /// masking, and mark all respective loads/stores for masking.
+  /// This object's state is only modified iff this function returns true.
   bool prepareToFoldTailByMasking();
 
   /// Returns the primary induction variable.
@@ -369,8 +370,14 @@
   /// its original trip-count, under a proper guard, which should be preserved.
   /// \p SafePtrs is a list of addresses that are known to be legal and we know
   /// that we can read from them without segfault.
+  /// \p MaskedOp is a list of instructions that have to be transformed into
+  /// calls to the appropriate masked intrinsic when the loop is vectorized.
+  /// \p ConditionalAssumes is a list of assume instructions in predicated
+  /// blocks that must be dropped if the CFG gets flattened.
   bool blockCanBePredicated(BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
-                            bool PreserveGuards = false);
+                            SmallPtrSetImpl<const Instruction *> &MaskedOp,
+                            SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
+                            bool PreserveGuards = false) const;
 
   /// Updates the vectorization state by adding \p Phi to the inductions list.
   /// This can set \p Phi as the main induction of the loop if \p Phi is a
Index: llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -911,7 +911,10 @@
 }
 
 bool LoopVectorizationLegality::blockCanBePredicated(
-    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs, bool PreserveGuards) {
+    BasicBlock *BB, SmallPtrSetImpl<Value *> &SafePtrs,
+    SmallPtrSetImpl<const Instruction *> &MaskedOp,
+    SmallPtrSetImpl<Instruction *> &ConditionalAssumes,
+    bool PreserveGuards) const {
   const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
 
   for (Instruction &I : *BB) {
@@ -1018,7 +1021,8 @@
 
     // We must be able to predicate all blocks that need to be predicated.
     if (blockNeedsPredication(BB)) {
-      if (!blockCanBePredicated(BB, SafePointers)) {
+      if (!blockCanBePredicated(BB, SafePointers, MaskedOp,
+                                ConditionalAssumes)) {
         reportVectorizationFailure(
             "Control flow cannot be substituted for a select",
             "control flow cannot be substituted for a select",
@@ -1256,10 +1260,14 @@
   // The list of pointers that we can safely read and write to remains empty.
   SmallPtrSet<Value *, 8> SafePointers;
 
+  SmallPtrSet<const Instruction *, 8> MaskedOp;
+  SmallPtrSet<Instruction *, 8> ConditionalAssumes;
+
   // Check and mark all blocks for predication, including those that ordinarily
   // do not need predication such as the header block.
   for (BasicBlock *BB : TheLoop->blocks()) {
-    if (!blockCanBePredicated(BB, SafePointers, /* MaskAllLoads= */ true)) {
+    if (!blockCanBePredicated(BB, SafePointers, MaskedOp, ConditionalAssumes,
+                              /* MaskAllLoads= */ true)) {
       reportVectorizationFailure(
           "Cannot fold tail by masking as required",
           "control flow cannot be substituted for a select",
@@ -1270,6 +1278,11 @@
   }
 
   LLVM_DEBUG(dbgs() << "LV: can fold tail by masking.\n");
+
+  this->MaskedOp.insert(MaskedOp.begin(), MaskedOp.end());
+  this->ConditionalAssumes.insert(ConditionalAssumes.begin(),
+                                  ConditionalAssumes.end());
+
   return true;
 }
 
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -180,11 +180,12 @@
 
 // Indicates that an epilogue is undesired, predication is preferred.
 // This means that the vectorizer will try to fold the loop-tail (epilogue)
-// into the loop and predicate the loop body accordingly.
+// into the loop and predicate the loop body accordingly. If that fails, the
+// vectorizer will fallback to a scalar epilogue.
 static cl::opt<bool> PreferPredicateOverEpilog(
     "prefer-predicate-over-epilog", cl::init(false), cl::Hidden,
     cl::desc("Indicate that an epilogue is undesired, predication should be "
-             "used instead."));
+             "used instead when it is possible"));
 
 static cl::opt<bool> MaximizeBandwidth(
     "vectorizer-maximize-bandwidth", cl::init(false), cl::Hidden,
@@ -5018,6 +5019,15 @@
     return MaxVF;
   }
 
+  // If there was a tail-folding hint/switch, but we can't fold the tail by
+  // masking, fallback to a scalar epilogue.
+  if (ScalarEpilogueStatus == CM_ScalarEpilogueNotNeededUsePredicate) {
+    LLVM_DEBUG(dbgs() << "LV: Cannot fold tail by masking: Ignoring "
+                         "hint/switch and using a scalar epilogue instead.\n");
+    ScalarEpilogueStatus = CM_ScalarEpilogueAllowed;
+    return MaxVF;
+  }
+
   if (TC == 0) {
     reportVectorizationFailure(
         "Unable to calculate the loop count due to complex control flow",
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-scalar-epilogue-fallback.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -disable-mve-tail-predication=false < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize -mattr=+armv8.1-m.main,+mve.fp -disable-mve-tail-predication=true < %s | FileCheck %s
+
+; This test should produce the same result (vectorized loop + scalar epilogue) with
+; default options and when MVE Tail Predication is enabled, as this loop's tail cannot be folded
+; by masking due to an outside user of %incdec.ptr in %end.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8.1m.main-arm-unknown-eabihf"
+
+define void @outside_user_blocks_tail_folding(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @outside_user_blocks_tail_folding(
+; CHECK-NEXT:  header:
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 16
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 16
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <16 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <16 x i8>, <16 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <16 x i8>*
+; CHECK-NEXT:    store <16 x i8> [[WIDE_LOAD]], <16 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 16
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2
+; CHECK:       end:
+; CHECK-NEXT:    [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT:    ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  br label %body
+
+body:
+  %dec66 = phi i32 [ %dec, %body ], [ %size, %header ]
+  %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  store i8 %0, i8* %buff, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body
+
+end:
+  store i8* %incdec.ptr, i8** %pos, align 4
+  ret void
+}
Index: llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/use-scalar-epilogue-if-tp-fails.ll
@@ -0,0 +1,154 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -loop-vectorize -prefer-predicate-over-epilog < %s | FileCheck %s
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+; This test should produce the same result when TP is forced/disabled, because it
+; can't be tail-predicated (due to an outside user of %incdec.ptr in %end),
+; so the vectorizer should fall back to a scalar epilogue.
+;
+; The first test (@basic_loop) simply relies on the command-line switches.
+; The second test (@metadata) forces tail-folding using metadata.
+; Both tests should always generate a scalar epilogue.
+
+target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64"
+
+define void @basic_loop(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @basic_loop(
+; CHECK-NEXT:  header:
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !0
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !2
+; CHECK:       end:
+; CHECK-NEXT:    [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT:    ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  br label %body
+
+body:
+  %dec66 = phi i32 [ %dec, %body ], [ %size, %header ]
+  %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  store i8 %0, i8* %buff, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body
+
+end:
+  store i8* %incdec.ptr, i8** %pos, align 4
+  ret void
+}
+
+define void @metadata(i8* nocapture readonly %ptr, i32 %size, i8** %pos) {
+; CHECK-LABEL: @metadata(
+; CHECK-NEXT:  header:
+; CHECK-NEXT:    [[PTR0:%.*]] = load i8*, i8** [[POS:%.*]], align 4
+; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[SIZE:%.*]], 4
+; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i32 [[SIZE]], 4
+; CHECK-NEXT:    [[N_VEC:%.*]] = sub i32 [[SIZE]], [[N_MOD_VF]]
+; CHECK-NEXT:    [[IND_END:%.*]] = sub i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    [[IND_END2:%.*]] = getelementptr i8, i8* [[PTR:%.*]], i32 [[N_VEC]]
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = sub i32 [[SIZE]], [[INDEX]]
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, i8* [[PTR]], i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, i8* [[NEXT_GEP]], i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds i8, i8* [[TMP2]], i32 0
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[TMP3]] to <4 x i8>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i8>, <4 x i8>* [[TMP4]], align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i8, i8* [[NEXT_GEP]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[TMP5]] to <4 x i8>*
+; CHECK-NEXT:    store <4 x i8> [[WIDE_LOAD]], <4 x i8>* [[TMP6]], align 1
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT:    [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop !4
+; CHECK:       middle.block:
+; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[SIZE]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[CMP_N]], label [[END:%.*]], label [[SCALAR_PH]]
+; CHECK:       scalar.ph:
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[SIZE]], [[HEADER:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL1:%.*]] = phi i8* [ [[IND_END2]], [[MIDDLE_BLOCK]] ], [ [[PTR]], [[HEADER]] ]
+; CHECK-NEXT:    br label [[BODY:%.*]]
+; CHECK:       body:
+; CHECK-NEXT:    [[DEC66:%.*]] = phi i32 [ [[DEC:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[BUFF:%.*]] = phi i8* [ [[INCDEC_PTR:%.*]], [[BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
+; CHECK-NEXT:    [[INCDEC_PTR]] = getelementptr inbounds i8, i8* [[BUFF]], i32 1
+; CHECK-NEXT:    [[DEC]] = add nsw i32 [[DEC66]], -1
+; CHECK-NEXT:    [[TMP8:%.*]] = load i8, i8* [[INCDEC_PTR]], align 1
+; CHECK-NEXT:    store i8 [[TMP8]], i8* [[BUFF]], align 1
+; CHECK-NEXT:    [[TOBOOL11:%.*]] = icmp eq i32 [[DEC]], 0
+; CHECK-NEXT:    br i1 [[TOBOOL11]], label [[END]], label [[BODY]], !llvm.loop !5
+; CHECK:       end:
+; CHECK-NEXT:    [[INCDEC_PTR_LCSSA:%.*]] = phi i8* [ [[INCDEC_PTR]], [[BODY]] ], [ [[IND_END2]], [[MIDDLE_BLOCK]] ]
+; CHECK-NEXT:    store i8* [[INCDEC_PTR_LCSSA]], i8** [[POS]], align 4
+; CHECK-NEXT:    ret void
+;
+header:
+  %ptr0 = load i8*, i8** %pos, align 4
+  br label %body
+
+body:
+  %dec66 = phi i32 [ %dec, %body ], [ %size, %header ]
+  %buff = phi i8* [ %incdec.ptr, %body ], [ %ptr, %header ]
+  %incdec.ptr = getelementptr inbounds i8, i8* %buff, i32 1
+  %dec = add nsw i32 %dec66, -1
+  %0 = load i8, i8* %incdec.ptr, align 1
+  store i8 %0, i8* %buff, align 1
+  %tobool11 = icmp eq i32 %dec, 0
+  br i1 %tobool11, label %end, label %body, !llvm.loop !1
+
+end:
+  store i8* %incdec.ptr, i8** %pos, align 4
+  ret void
+}
+
+!1 = distinct !{!1, !2, !3}
+!2 = !{!"llvm.loop.vectorize.predicate.enable", i1 true}
+!3 = !{!"llvm.loop.vectorize.enable", i1 true}
+