Index: llvm/include/llvm/Analysis/LoopInfo.h =================================================================== --- llvm/include/llvm/Analysis/LoopInfo.h +++ llvm/include/llvm/Analysis/LoopInfo.h @@ -780,6 +780,10 @@ /// unrolling pass is run more than once (which it generally is). void setLoopAlreadyUnrolled(); + /// Return true if the loop is annotated with pragma + /// llvm.loop.vectorize.predicate.enable, and false otherwise. + bool isAnnotatedVectorPredicate() const; + void dump() const; void dumpVerbose() const; Index: llvm/lib/Analysis/LoopInfo.cpp =================================================================== --- llvm/lib/Analysis/LoopInfo.cpp +++ llvm/lib/Analysis/LoopInfo.cpp @@ -494,6 +494,34 @@ setLoopID(NewLoopID); } +bool Loop::isAnnotatedVectorPredicate() const { + MDNode *LoopID = getLoopID(); + if (!LoopID) + return false; + + StringRef Name = "llvm.loop.vectorize.predicate.enable"; + // First operand should refer to the loop id itself. + assert(LoopID->getNumOperands() > 0 && "requires at least one operand"); + assert(LoopID->getOperand(0) == LoopID && "invalid loop id"); + + for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) { + MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i)); + if (!MD) + continue; + + MDString *S = dyn_cast<MDString>(MD->getOperand(0)); + if (!S) + continue; + + if (Name.equals(S->getString()) && + mdconst::extract<ConstantInt>(MD->getOperand(1))->getZExtValue()) + return true; + else + return false; + } + return false; +} + bool Loop::isAnnotatedParallel() const { MDNode *DesiredLoopIdMetadata = getLoopID(); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -841,7 +841,8 @@ enum ScalarEpilogueLowering { CM_ScalarEpilogueAllowed, CM_ScalarEpilogueNotAllowedOptSize, - CM_ScalarEpilogueNotAllowedLowTripLoop + CM_ScalarEpilogueNotAllowedLowTripLoop, + 
CM_ScalarEpilogueNotAllowedPredicatePragma }; /// LoopVectorizationCostModel - estimates the expected speedups due to @@ -870,6 +871,10 @@ /// vectorization and interleaving should be avoided up front. Optional<unsigned> computeMaxVF(); + /// \return True if runtime checks are required for vectorization, and false + /// otherwise. + bool runtimeChecksRequired(); + /// \return The most profitable vectorization factor and the cost of that VF. /// This method checks every power of two up to MaxVF. If UserVF is not ZERO /// then this vectorization factor will be selected if vectorization is @@ -4686,26 +4691,8 @@ Uniforms[VF].insert(Worklist.begin(), Worklist.end()); } -Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { - if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically - // uniform if the target can skip. - LLVM_DEBUG( - dbgs() << "LV: Not inserting runtime ptr check for divergent target"); - - ORE->emit( - createMissedAnalysis("CantVersionLoopWithDivergentTarget") - << "runtime pointer checks needed. Not enabled for divergent target"); - - return None; - } - - unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); - if (isScalarEpilogueAllowed()) - return computeFeasibleMaxVF(TC); - - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue.\n" << - "LV: Performing code size checks.\n"); +bool LoopVectorizationCostModel::runtimeChecksRequired() { + LLVM_DEBUG(dbgs() << "LV: Performing code size checks.\n"); if (Legal->getRuntimePointerChecking()->Need) { ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize") @@ -4715,7 +4702,7 @@ LLVM_DEBUG( dbgs() << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n"); - return None; + return true; } if (!PSE.getUnionPredicate().getPredicates().empty()) { @@ -4726,7 +4713,7 @@ LLVM_DEBUG( dbgs() << "LV: Aborting. 
Runtime SCEV check is required with -Os/-Oz.\n"); - return None; + return true; } // FIXME: Avoid specializing for stride==1 instead of bailing out. @@ -4738,12 +4725,28 @@ LLVM_DEBUG( dbgs() << "LV: Aborting. Runtime stride check is required with -Os/-Oz.\n"); + return true; + } + + return false; +} + +Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() { + if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { + // TODO: It may by useful to do since it's still likely to be dynamically + // uniform if the target can skip. + LLVM_DEBUG( + dbgs() << "LV: Not inserting runtime ptr check for divergent target"); + + ORE->emit( + createMissedAnalysis("CantVersionLoopWithDivergentTarget") + << "runtime pointer checks needed. Not enabled for divergent target"); + return None; } - // If we optimize the program for size, avoid creating the tail loop. + unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop); LLVM_DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n'); - if (TC == 1) { ORE->emit(createMissedAnalysis("SingleIterationLoop") << "loop trip count is one, irrelevant for vectorization"); @@ -4751,17 +4754,35 @@ return None; } - // Record that scalar epilogue is not allowed. - LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + switch (IsScalarEpilogueAllowed) { + default: return None; + case CM_ScalarEpilogueAllowed: + return computeFeasibleMaxVF(TC); + case CM_ScalarEpilogueNotAllowedPredicatePragma: + LLVM_DEBUG(dbgs() << "LV: vector predicate pragma found.\n" + << "LV: creating predicated vector loop.\n"); + break; + case CM_ScalarEpilogueNotAllowedLowTripLoop: + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to low trip " + << "count.\n"); + case CM_ScalarEpilogueNotAllowedOptSize: + LLVM_DEBUG(dbgs() << "LV: Not allowing scalar epilogue due to -Os/-Oz.\n"); + // Bail if runtime checks are required, which are not good when optimising + // for size. 
+ if (runtimeChecksRequired()) + return None; + break; + } + + // Now try the tail folding - // We don't create an epilogue when optimizing for size. // Invalidate interleave groups that require an epilogue if we can't mask // the interleave-group. if (!useMaskedInterleavedAccesses(TTI)) InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); + // Bail if we don't have a tail at all. unsigned MaxVF = computeFeasibleMaxVF(TC); - if (TC > 0 && TC % MaxVF == 0) { LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); return MaxVF; @@ -7226,6 +7247,8 @@ (F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (L->isAnnotatedVectorPredicate()) + SEL = CM_ScalarEpilogueNotAllowedPredicatePragma; LoopVectorizationCostModel CM(SEL, L, PSE, LI, LVL, *TTI, TLI, DB, AC, ORE, F, &Hints, IAI); @@ -7318,10 +7341,13 @@ // Check the function attributes and profiles to find out if this function // should be optimized for size. ScalarEpilogueLowering SEL = CM_ScalarEpilogueAllowed; + if (Hints.getForce() != LoopVectorizeHints::FK_Enabled && (F->hasOptSize() || llvm::shouldOptimizeForSize(L->getHeader(), PSI, BFI))) SEL = CM_ScalarEpilogueNotAllowedOptSize; + else if (L->isAnnotatedVectorPredicate()) + SEL = CM_ScalarEpilogueNotAllowedPredicatePragma; // Entrance to the VPlan-native vectorization path. Outer loops are processed // here. 
They may require CFG and instruction level transformations before Index: llvm/test/Transforms/LoopVectorize/tail_loop_folding.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopVectorize/tail_loop_folding.ll @@ -0,0 +1,79 @@ +; REQUIRES: asserts +; RUN: opt < %s -loop-vectorize -S | FileCheck %s + +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +; CHECK-LABEL: tail_folding_enabled( +; CHECK: vector.body: +; CHECK: %wide.masked.load = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32( +; CHECK: %wide.masked.load1 = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32( +; CHECK: %8 = add nsw <8 x i32> %wide.masked.load1, %wide.masked.load +; CHECK: call void @llvm.masked.store.v8i32.p0v8i32( +; CHECK: %index.next = add i64 %index, 8 +; CHECK: %12 = icmp eq i64 %index.next, 432 +; CHECK: br i1 %12, label %middle.block, label %vector.body, !llvm.loop !0 + +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 430 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !6 +} + +define dso_local void @tail_folding_disabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { +; 
CHECK-LABEL: tail_folding_disabled( +; CHECK: vector.body: +; CHECK-NOT: @llvm.masked.load.v8i32.p0v8i32( +; CHECK-NOT: @llvm.masked.store.v8i32.p0v8i32( +; CHECK: br i1 %44, label {{.*}}, label %vector.body +entry: + br label %for.body + +for.cond.cleanup: + ret void + +for.body: + %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ] + %arrayidx = getelementptr inbounds i32, i32* %B, i64 %indvars.iv + %0 = load i32, i32* %arrayidx, align 4 + %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %indvars.iv + %1 = load i32, i32* %arrayidx2, align 4 + %add = add nsw i32 %1, %0 + %arrayidx4 = getelementptr inbounds i32, i32* %A, i64 %indvars.iv + store i32 %add, i32* %arrayidx4, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond = icmp eq i64 %indvars.iv.next, 430 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !llvm.loop !10 +} + +; CHECK: !0 = distinct !{!0, !1} +; CHECK-NEXT: !1 = !{!"llvm.loop.isvectorized", i32 1} +; CHECK-NEXT: !2 = distinct !{!2, !3, !1} +; CHECK-NEXT: !3 = !{!"llvm.loop.unroll.runtime.disable"} +; CHECK-NEXT: !4 = distinct !{!4, !1} +; CHECK-NEXT: !5 = distinct !{!5, !3, !1} + +attributes #0 = { nounwind optsize uwtable "target-cpu"="core-avx2" "target-features"="+avx,+avx2" } + +!6 = distinct !{!6, !7, !8} +!7 = !{!"llvm.loop.vectorize.predicate.enable", i1 true} +!8 = !{!"llvm.loop.vectorize.enable", i1 true} + +!10 = distinct !{!10, !11, !12} +!11 = !{!"llvm.loop.vectorize.predicate.enable", i1 false} +!12 = !{!"llvm.loop.vectorize.enable", i1 true}