Index: llvm/docs/LangRef.rst
===================================================================
--- llvm/docs/LangRef.rst
+++ llvm/docs/LangRef.rst
@@ -16158,6 +16158,56 @@
        %also.r = select <4 x i1> %mask, <4 x i32> %t, <4 x i32> undef
 
 
+.. _int_get_active_lane_mask:
+
+'``llvm.get.active.lane.mask.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <4 x i1> @llvm.get.active.lane.mask.v4i1.v4i32.v4i32(<4 x i32>, <4 x i32>)
+      declare <8 x i1> @llvm.get.active.lane.mask.v8i1.v8i64.v8i64(<8 x i64>, <8 x i64>)
+      declare <16 x i1> @llvm.get.active.lane.mask.v16i1.v16i64.v16i64(<16 x i64>, <16 x i64>)
+
+Overview:
+"""""""""
+
+Create a mask representing active and inactive vector lanes.
+
+
+Arguments:
+""""""""""
+
+Both operands have the same vector integer type. The first operand is the
+vector induction step. The second operand is the loop back-edge taken
+iteration count splatted into a vector. The result is a vector with the same
+number of elements as the operands, but with the i1 element type.
+
+
+Semantics:
+""""""""""
+
+The '``llvm.get.active.lane.mask``' intrinsic performs an element-wise
+unsigned less-than-or-equal comparison of the induction step value with the
+back-edge taken iteration count, producing a mask of true/false values
+representing active/inactive vector lanes. This mask can, for example, be
+used by the masked load/store intrinsics.
+
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
+      %get.active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.v4i64.v4i64(<4 x i64> %induction, <4 x i64> %btc.splat)
+      %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %3, i32 4, <4 x i1> %get.active.lane.mask, <4 x i32> undef)
+
+
 .. _int_mload_mstore:
 
 Masked Vector Load and Store Intrinsics
Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -478,6 +478,11 @@
                                      DominatorTree *DT,
                                      const LoopAccessInfo *LAI) const;
 
+  /// Query the target whether lowering of the llvm.get.active.lane.mask
+  /// intrinsic is supported and if emitting it is desired for this loop.
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFolded) const;
+
   /// @}
 
   /// \name Scalar Target Information
@@ -1228,6 +1233,8 @@
   preferPredicateOverEpilogue(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
                               AssumptionCache &AC, TargetLibraryInfo *TLI,
                               DominatorTree *DT, const LoopAccessInfo *LAI) = 0;
+  virtual bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI,
+                                     ScalarEvolution &SE, bool TailFolded) = 0;
   virtual bool isLegalAddImmediate(int64_t Imm) = 0;
   virtual bool isLegalICmpImmediate(int64_t Imm) = 0;
   virtual bool isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
@@ -1530,6 +1537,10 @@
                                    const LoopAccessInfo *LAI) override {
     return Impl.preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFolded) override {
+    return Impl.emitGetActiveLaneMask(L, LI, SE, TailFolded);
+  }
   bool isLegalAddImmediate(int64_t Imm) override {
     return Impl.isLegalAddImmediate(Imm);
   }
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -144,6 +144,11 @@
     return false;
   }
 
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFold) const {
+    return false;
+  }
+
   void getUnrollingPreferences(Loop *, ScalarEvolution &,
                                TTI::UnrollingPreferences &) {}
 
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -498,6 +498,11 @@
     return BaseT::preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
   }
 
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFold) {
+    return BaseT::emitGetActiveLaneMask(L, LI, SE, TailFold);
+  }
+
   int getInstructionLatency(const Instruction *I) {
     if (isa<LoadInst>(I))
       return getST()->getSchedModel().DefaultLoadLatency;
Index: llvm/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/include/llvm/IR/Intrinsics.td
+++ llvm/include/llvm/IR/Intrinsics.td
@@ -1235,6 +1235,9 @@
 }
 
+def int_get_active_lane_mask:
+  Intrinsic<[llvm_anyvector_ty],
+            [llvm_anyvector_ty, llvm_anyvector_ty], [IntrNoDuplicate]>;
 
 //===-------------------------- Masked Intrinsics -------------------------===//
 //
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -235,6 +235,11 @@
   return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
 }
 
+bool TargetTransformInfo::emitGetActiveLaneMask(
+    Loop *L, LoopInfo *LI, ScalarEvolution &SE, bool TailFolded) const {
+  return TTIImpl->emitGetActiveLaneMask(L, LI, SE, TailFolded);
+}
+
 void TargetTransformInfo::getUnrollingPreferences(
     Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
   return TTIImpl->getUnrollingPreferences(L, SE, UP);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -250,6 +250,9 @@
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP);
 
+  bool emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
+                             bool TailFolded) const;
+
   bool shouldBuildLookupTablesForConstant(Constant *C) const {
     // In the ROPI and RWPI relocation models we can't have pointers to global
     // variables or functions in constant data, so don't convert switches to
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1385,7 +1385,16 @@
   return canTailPredicateLoop(L, LI, SE, DL, LAI);
 }
 
-
+bool ARMTTIImpl::emitGetActiveLaneMask(Loop *L, LoopInfo *LI,
+                                       ScalarEvolution &SE,
+                                       bool TailFolded) const {
+  // If this loop is tail-folded, we always want to emit the
+  // llvm.get.active.lane.mask intrinsic, so that this can be picked up in the
+  // MVETailPredication pass that needs to know the number of elements
+  // processed by this vector loop.
+  if (TailFolded)
+    return true;
+  return false;
+}
+
 void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                          TTI::UnrollingPreferences &UP) {
   // Only currently enable these preferences for M-Class cores.
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6820,7 +6820,11 @@
     IV = IVRecipe->getVPValue();
   }
   VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
-  BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+  if (CM.TTI.emitGetActiveLaneMask(CM.TheLoop, CM.LI, *CM.PSE.getSE(),
+                                   !CM.isScalarEpilogueAllowed()))
+    BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
+  else
+    BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
   return BlockMaskCache[BB] = BlockMask;
 }
Index: llvm/lib/Transforms/Vectorize/VPlan.h
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.h
+++ llvm/lib/Transforms/Vectorize/VPlan.h
@@ -675,6 +675,7 @@
     ICmpULE,
     SLPLoad,
     SLPStore,
+    ActiveLaneMask,
   };
 
 private:
Index: llvm/lib/Transforms/Vectorize/VPlan.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -380,6 +380,26 @@
     State.set(this, V, Part);
     break;
   }
+  case VPInstruction::ActiveLaneMask: {
+    BasicBlock *BB = Builder.GetInsertBlock();
+    // The induction step.
+    Value *IV = State.get(getOperand(0), Part);
+    // The back-edge taken count.
+    Value *BTC = State.get(getOperand(1), Part);
+
+    unsigned NumElems = cast<VectorType>(BTC->getType())->getNumElements();
+    Type *Int1Ty = Type::getInt1Ty(BB->getParent()->getContext());
+    Type *PredVecTy = VectorType::get(Int1Ty, NumElems);
+
+    // Create the intrinsic call which models the IV <= BTC comparison
+    // and generates the predicate of active/inactive lanes.
+    Instruction *Call = Builder.CreateIntrinsic(
+        Intrinsic::get_active_lane_mask,
+        {PredVecTy, IV->getType(), BTC->getType()}, {IV, BTC}, nullptr,
+        "get.active.lane.mask");
+    State.set(this, Call, Part);
+    break;
+  }
   default:
     llvm_unreachable("Unsupported opcode for instruction");
   }
@@ -421,6 +441,10 @@
   case VPInstruction::SLPStore:
     O << "combined store";
     break;
+  case VPInstruction::ActiveLaneMask:
+    O << "active lane mask";
+    break;
+
   default:
     O << Instruction::getOpcodeName(getOpcode());
   }
Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
+++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll
@@ -41,11 +41,13 @@
 define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 {
 ; COMMON-LABEL: tail_folding_enabled(
 ; COMMON: vector.body:
-; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; COMMON: %[[INDUCTION:.*]] = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
+; COMMON: %get.active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.v4i64.v4i64(<4 x i64> %[[INDUCTION]], <4 x i64> <i64 429, i64 429, i64 429, i64 429>)
+; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %get.active.lane.mask
+; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %get.active.lane.mask
 ; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]]
-; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]]
-; COMMON: br i1 %12, label %{{.*}}, label %vector.body
+; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %get.active.lane.mask
+; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body
 entry:
   br label %for.body
 
@@ -75,13 +77,15 @@
 
 ; PREDFLAG-LABEL: tail_folding_disabled(
 ; PREDFLAG: vector.body:
-; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
-; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(
+; PREDFLAG: %[[INDUCTION:induction.*]] = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
+; PREDFLAG: %get.active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.v4i64.v4i64(<4 x i64> %[[INDUCTION]], <4 x i64> <i64 429, i64 429, i64 429, i64 429>)
+; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %get.active.lane.mask
+; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %get.active.lane.mask
 ; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load
-; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32(
+; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %get.active.lane.mask
 ; PREDFLAG: %index.next = add i64 %index, 4
-; PREDFLAG: %12 = icmp eq i64 %index.next, 432
-; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6
+; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432
+; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6
 entry:
   br label %for.body
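
For illustration only, and not part of the patch itself: below is a minimal sketch of the tail-folded vector loop shape this change produces, assuming VF=4, an i64 induction variable, and a scalar trip count of 430 (hence a back-edge taken count of 429, as in the test above). Value and block names are illustrative rather than the vectorizer's verbatim output.

; Illustrative sketch: a VF=4 tail-folded loop copying B into A for 430
; elements, predicated by llvm.get.active.lane.mask.
define void @sketch(i32* noalias %A, i32* noalias %B) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  ; Splat the scalar induction variable and add the lane offsets <0,1,2,3>.
  %broadcast.splatinsert = insertelement <4 x i64> undef, i64 %index, i32 0
  %broadcast.splat = shufflevector <4 x i64> %broadcast.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
  %induction = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3>
  ; Lanes whose induction value is <= the back-edge taken count (429) are active.
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.v4i64.v4i64(<4 x i64> %induction, <4 x i64> <i64 429, i64 429, i64 429, i64 429>)
  ; The same mask predicates the load and the store, so the final partial
  ; iteration does not access elements beyond the scalar trip count.
  %gep.b = getelementptr inbounds i32, i32* %B, i64 %index
  %cast.b = bitcast i32* %gep.b to <4 x i32>*
  %wide.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %cast.b, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
  %gep.a = getelementptr inbounds i32, i32* %A, i64 %index
  %cast.a = bitcast i32* %gep.a to <4 x i32>*
  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %wide.load, <4 x i32>* %cast.a, i32 4, <4 x i1> %active.lane.mask)
  %index.next = add i64 %index, 4
  ; 432 is the scalar trip count 430 rounded up to a multiple of VF=4.
  %done = icmp eq i64 %index.next, 432
  br i1 %done, label %exit, label %vector.body

exit:
  ret void
}

declare <4 x i1> @llvm.get.active.lane.mask.v4i1.v4i64.v4i64(<4 x i64>, <4 x i64>)
declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>)
declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)

The point of routing the mask through a real intrinsic rather than a plain vector compare is that a target pass such as MVETailPredication can recognise the call and recover the number of elements processed per iteration, as noted in the ARMTTIImpl comment above.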