Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -472,6 +472,11 @@
                                    DominatorTree *DT,
                                    const LoopAccessInfo *LAI) const;
 
+  /// Query the target whether lowering of the llvm.set.loop.elements intrinsic
+  /// is supported and desired for this loop.
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFolded) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1195,6 +1200,8 @@
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
+  virtual bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                                      bool TailFolded) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1472,6 +1479,10 @@
                                    const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFolded) override {
+    return Impl.emitNumElementsVecLoop(L, LI, SE, TailFolded);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -143,6 +143,11 @@
     return false;
   }
 
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFold) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -492,6 +492,11 @@
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
 
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFold) {
+    return BaseT::emitNumElementsVecLoop(L, LI, SE, TailFold);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1413,6 +1413,11 @@
 def int_set_loop_iterations :
   Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
 
+// Specify the number of elements processed by this (vector) loop, which
+// typically corresponds to the iteration count of the scalar loop.
+def int_set_loop_elements :
+  Intrinsic<[], [llvm_anyint_ty], [IntrNoDuplicate]>;
+
 // Specify that the value given is the number of iterations that the next loop
 // will execute. Also test that the given count is not zero, allowing it to
 // control entry to a 'while' loop.
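For illustration of the intended use of the new intrinsic, here is a minimal sketch (hypothetical function and value names, not part of this patch): the vectorizer places a single call in the vector preheader, passing the number of elements the vector loop will process, i.e. the scalar loop's trip count.

; Hypothetical example: %N is the scalar trip count, i.e. the number of
; data elements processed by the vector loop.
define void @example(i32* %p, i32 %N) {
entry:
  br label %vector.ph

vector.ph:                                        ; the new intrinsic goes here
  call void @llvm.set.loop.elements.i32(i32 %N)
  br label %vector.body

vector.body:                                      ; masked vector loads/stores when tail-folded
  br label %exit

exit:
  ret void
}

declare void @llvm.set.loop.elements.i32(i32)

The intrinsic itself has no result and is only a marker; a later target pass (MVETailPredication in this patch) consumes it to recover the element count.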
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -233,6 +233,11 @@
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
 
+bool TargetTransformInfo::emitNumElementsVecLoop(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, bool TailFolded) const {
+  return TTIImpl->emitNumElementsVecLoop(L, LI, SE, TailFolded);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -237,9 +237,13 @@
                                    TargetLibraryInfo *TLI, DominatorTree *DT,
                                    const LoopAccessInfo *LAI);
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
+  bool emitNumElementsVecLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                              bool TailFolded) const;
+
   bool shouldBuildLookupTablesForConstant(Constant *C) const {
     // In the ROPI and RWPI relocation models we can't have pointers to global
     // variables or functions in constant data, so don't convert switches to
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1309,7 +1309,16 @@
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
-
+bool ARMTTIImpl::emitNumElementsVecLoop(Loop *L, LoopInfo *LI,
+    ScalarEvolution &SE, bool TailFolded) const {
+  // If this loop is tail-folded, we always want to emit the
+  // llvm.set.loop.elements intrinsic, so that it can be picked up by the
+  // MVETailPredication pass, which needs to know the number of elements
+  // processed by this vector loop.
+  if (TailFolded)
+    return true;
+  return false;
+}
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -639,6 +639,10 @@
   /// Emit bypass checks to check any memory assumptions we may have made.
   void emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass);
 
+  /// Emit the llvm.set.loop.elements IR intrinsic that models the number of
+  /// data elements processed by the vector loop.
+  void emitNumElementsVecLoop(BasicBlock *Bypass, Value *Count);
+
   /// Compute the transformed value of Index at offset StartValue using step
   /// StepValue.
   /// For integer induction, returns StartValue + Index * StepValue.
@@ -2830,6 +2834,22 @@
   LVer->prepareNoAliasMetadata();
 }
 
+void InnerLoopVectorizer::emitNumElementsVecLoop(BasicBlock *Bypass,
+                                                 Value *Count) {
+  if (EnableVPlanNativePath)
+    return;
+
+  if (!TTI->emitNumElementsVecLoop(OrigLoop, LI, *PSE.getSE(),
+                                   Cost->foldTailByMasking()))
+    return;
+
+  IRBuilder<> Builder(Bypass->getTerminator());
+  Function *NumElems = Intrinsic::getDeclaration(
+      Bypass->getParent()->getParent(), Intrinsic::set_loop_elements,
+      Count->getType());
+  Builder.CreateCall(NumElems, Count);
+}
+
 Value *InnerLoopVectorizer::emitTransformedIndex(
     IRBuilder<> &B, Value *Index, ScalarEvolution *SE, const DataLayout &DL,
     const InductionDescriptor &ID) const {
@@ -3024,6 +3044,12 @@
   // faster.
   emitMemRuntimeChecks(Lp, LoopScalarPreHeader);
 
+  // Emit an intrinsic in the vector preheader that represents the number of
+  // data elements processed by the vector loop, which corresponds to the
+  // trip count of the scalar loop. TTI is queried first to check that the
+  // target supports and wants this intrinsic for this loop.
+  emitNumElementsVecLoop(LoopVectorPreHeader, Count);
+
   // Generate the induction variable.
   // The loop step is equal to the vectorization factor (num of SIMD elements)
   // times the unroll factor (num of SIMD instructions).
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll
@@ -14,6 +14,10 @@
 ;
 define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 {
 ; COMMON-LABEL: @sgt_loopguard(
+; CHECK-TF: %[[TC:.*]] = sub i32 %0, %[[SMIN:.*]]
+; CHECK-TF: br i1 false, label %scalar.ph, label %vector.ph
+; CHECK-TF: vector.ph:
+; CHECK-TF: call void @llvm.set.loop.elements.i32(i32 %[[TC]])
 ; COMMON: vector.body:
 ; CHECK-TF: masked.load
 ; CHECK-TF: masked.load
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -10,10 +10,7 @@
 define dso_local void @tail_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 {
 ; CHECK-LABEL: tail_folding(
 ; CHECK: vector.body:
-;
-; This needs implementation of TTI::preferPredicateOverEpilogue,
-; then this will be tail-folded too:
-;
+; CHECK-NOT: call void @llvm.set.loop.elements
 ; CHECK-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
 ; CHECK-NOT: call void @llvm.masked.store.v4i32.p0v4i32(
 ; CHECK: br i1 %{{.*}}, label %{{.*}}, label %vector.body
@@ -40,6 +37,9 @@
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
+; COMMON: vector.ph:
+; COMMON: call void @llvm.set.loop.elements.i64(i64 430)
+; COMMON: br label %vector.body
 ; COMMON: vector.body:
 ; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
 ; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
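For context, a rough sketch of the tail-folded loop shape that the updated tail_folding_enabled test now expects. This is illustrative only: names are simplified and the mask computation, GEPs and constants are an assumption based on typical fold-tail output for a 430-element, VF=4 loop; the CHECK lines above are authoritative.

define void @tail_folded_shape(i32* %A, i32* %B, i32* %C) {
entry:
  br label %vector.ph

vector.ph:
  ; The new intrinsic records the element count (the scalar trip count).
  call void @llvm.set.loop.elements.i64(i64 430)
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
  ; Build the lane mask: lanes whose element index exceeds 429 (the backedge
  ; taken count) are disabled, so the final iteration is safely masked.
  %splatins = insertelement <4 x i64> undef, i64 %index, i32 0
  %splat = shufflevector <4 x i64> %splatins, <4 x i64> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i64> %splat, <i64 0, i64 1, i64 2, i64 3>
  %mask = icmp ule <4 x i64> %induction, <i64 429, i64 429, i64 429, i64 429>
  %gep.b = getelementptr inbounds i32, i32* %B, i64 %index
  %bc.b = bitcast i32* %gep.b to <4 x i32>*
  %load.b = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %bc.b, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %gep.c = getelementptr inbounds i32, i32* %C, i64 %index
  %bc.c = bitcast i32* %gep.c to <4 x i32>*
  %load.c = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %bc.c, i32 4, <4 x i1> %mask, <4 x i32> undef)
  %sum = add nsw <4 x i32> %load.c, %load.b
  %gep.a = getelementptr inbounds i32, i32* %A, i64 %index
  %bc.a = bitcast i32* %gep.a to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %sum, <4 x i32>* %bc.a, i32 4, <4 x i1> %mask)
  %index.next = add i64 %index, 4
  %done = icmp eq i64 %index.next, 432
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
declare void @llvm.set.loop.elements.i64(i64)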