Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -1387,10 +1387,12 @@ bool ARMTTIImpl::emitGetActiveLaneMask(Loop *L, LoopInfo *LI, ScalarEvolution &SE, bool TailFolded) const { - // TODO: if this loop is tail-folded, we want to emit the + // If this loop is tail-folded, we want to emit the // llvm.get.active.lane.mask intrinsic so that this can be picked up in the // MVETailPredication pass that needs to know the number of elements // processed by this vector loop. + if (TailFolded) + return true; return false; } void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE, Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -6821,7 +6821,11 @@ IV = IVRecipe->getVPValue(); } VPValue *BTC = Plan->getOrCreateBackedgeTakenCount(); - BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); + if (CM.TTI.emitGetActiveLaneMask(CM.TheLoop, CM.LI, *CM.PSE.getSE(), + !CM.isScalarEpilogueAllowed())) + BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC}); + else + BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC}); return BlockMaskCache[BB] = BlockMask; } Index: llvm/lib/Transforms/Vectorize/VPlan.h =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.h +++ llvm/lib/Transforms/Vectorize/VPlan.h @@ -675,6 +675,7 @@ ICmpULE, SLPLoad, SLPStore, + ActiveLaneMask, }; private: Index: llvm/lib/Transforms/Vectorize/VPlan.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/VPlan.cpp +++ llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -380,6 +380,55 @@ State.set(this, V, 
Part); break; } + case VPInstruction::ActiveLaneMask: { + // The vector induction variable. + Value *VIV = State.get(getOperand(0), Part); + // The Back-edge Taken Count (BTC) splat to a vector. + Value *SplatBTC = State.get(getOperand(1), Part); + + // Create the intrinsic call, which has equivalent semantics to + // icmp ule %VIV, %SplatBTC + // generating the mask of active/inactive lanes. + // + // We have 2 cases: incrementing loops, and decrementing loops, which means + // the BTC looks slightly different. For incrementing loops, it is this: + // + // VIV: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> + // BTC: <4 x i32> <i32 429, i32 429, i32 429, i32 429> + // + // and for decrementing loops, it is: + // + // VIV: %vec.iv = add <16 x i32> %broadcast.splat, <i32 0, i32 -1, i32 -2, ...> + // BTC: %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + + Value *ScalarBTC = nullptr; + if (auto *CDV = dyn_cast<ConstantDataVector>(SplatBTC)) + ScalarBTC = CDV->getSplatValue(); + else if (auto *ShuffleVec = dyn_cast<ShuffleVectorInst>(SplatBTC)) { + // Extract the scalar BTC from the insertelement instruction, which is its + // 2nd operand. 
+ Instruction *SplatInsert = dyn_cast<InsertElementInst>( + ShuffleVec->getOperand(0)); + assert(SplatInsert && "Unexpected backedge taken count form"); + ScalarBTC = SplatInsert->getOperand(1); + } + else + llvm_unreachable("Unexpected vector induction variable"); + + auto *VIVElem0 = Builder.CreateExtractElement( + VIV, ConstantInt::get(ScalarBTC->getType(), 0)); + auto *I = dyn_cast<Instruction>(VIV); + auto *Int1Ty = Type::getInt1Ty(I->getParent()->getContext()); + auto *VecTy = dyn_cast<VectorType>(VIV->getType()); + auto *PredTy = VectorType::get(Int1Ty, VecTy->getNumElements()); + + Instruction *Call = Builder.CreateIntrinsic( + Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()}, + {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask"); + + State.set(this, Call, Part); + break; + } default: llvm_unreachable("Unsupported opcode for instruction"); } @@ -421,6 +470,10 @@ case VPInstruction::SLPStore: O << "combined store"; break; + case VPInstruction::ActiveLaneMask: + O << "active lane mask"; + break; + default: O << Instruction::getOpcodeName(getOpcode()); } Index: llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll +++ llvm/test/Transforms/LoopVectorize/ARM/prefer-tail-loop-folding.ll @@ -45,9 +45,12 @@ define void @prefer_folding(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) #0 { ; CHECK-LABEL: prefer_folding( ; PREFER-FOLDING: vector.body: -; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32 -; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32 -; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32 +; PREFER-FOLDING: %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3> +; PREFER-FOLDING: %[[ELEM0:.*]] = extractelement <4 x i32> %induction, i32 0 +; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %[[ELEM0]], i32 430) +; 
PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask, +; PREFER-FOLDING: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask, +; PREFER-FOLDING: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body ; ; NO-FOLDING-NOT: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( @@ -507,9 +510,10 @@ define void @float(float* noalias nocapture %A, float* noalias nocapture readonly %B, float* noalias nocapture readonly %C) #0 { ; CHECK-LABEL: float( ; PREFER-FOLDING: vector.body: -; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32 -; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32 -; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32 +; PREFER-FOLDING: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %2, i32 430) +; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask +; PREFER-FOLDING: call <4 x float> @llvm.masked.load.v4f32.p0v4f32({{.*}}%active.lane.mask +; PREFER-FOLDING: call void @llvm.masked.store.v4f32.p0v4f32({{.*}}%active.lane.mask ; PREFER-FOLDING: br i1 %{{.*}}, label %{{.*}}, label %vector.body entry: br label %for.body Index: llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-folding-counting-down.ll @@ -15,9 +15,12 @@ define dso_local void @sgt_loopguard(i8* noalias nocapture readonly %a, i8* noalias nocapture readonly %b, i8* noalias nocapture %c, i32 %N) local_unnamed_addr #0 { ; COMMON-LABEL: @sgt_loopguard( ; COMMON: vector.body: -; CHECK-TF: masked.load -; CHECK-TF: masked.load -; CHECK-TF: masked.store + +; CHECK-TF: %[[ELEM:.*]] = extractelement <16 x i32> %vec.iv, i32 0 +; CHECK-TF: 
%active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %[[ELEM]], i32 %trip.count.minus{{.*}}) +; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask +; CHECK-TF: llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask +; CHECK-TF: llvm.masked.store.v16i8.p0v16i8(<16 x i8> %{{.*}}, <16 x i8>* %{{.*}}, i32 1, <16 x i1> %active.lane.mask) entry: %cmp5 = icmp sgt i32 %N, 0 br i1 %cmp5, label %while.body.preheader, label %while.end Index: llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll +++ llvm/test/Transforms/LoopVectorize/ARM/tail-loop-folding.ll @@ -41,11 +41,14 @@ define dso_local void @tail_folding_enabled(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C) local_unnamed_addr #0 { ; COMMON-LABEL: tail_folding_enabled( ; COMMON: vector.body: -; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( -; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( +; COMMON: %[[INDUCTION:.*]] = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> +; COMMON: %[[ELEM0:.*]] = extractelement <4 x i64> %[[INDUCTION]], i64 0 +; COMMON: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429) +; COMMON: %[[WML1:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask +; COMMON: %[[WML2:.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}<4 x i1> %active.lane.mask ; COMMON: %[[ADD:.*]] = add nsw <4 x i32> %[[WML2]], %[[WML1]] -; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]] -; COMMON: br i1 %12, label %{{.*}}, label %vector.body +; COMMON: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %[[ADD]], {{.*}}<4 x i1> %active.lane.mask +; COMMON: br i1 %{{.*}}, label %{{.*}}, label %vector.body 
entry: br label %for.body @@ -75,13 +78,16 @@ ; PREDFLAG-LABEL: tail_folding_disabled( ; PREDFLAG: vector.body: -; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( -; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32( +; PREDFLAG: %[[INDUCTION:induction.*]] = add <4 x i64> %broadcast.splat, <i64 0, i64 1, i64 2, i64 3> +; PREDFLAG: %[[ELEM0:.*]] = extractelement <4 x i64> %[[INDUCTION]], i64 0 +; PREDFLAG: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i64(i64 %[[ELEM0]], i64 429) +; PREDFLAG: %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask +; PREDFLAG: %wide.masked.load1 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask ; PREDFLAG: %{{.*}} = add nsw <4 x i32> %wide.masked.load1, %wide.masked.load -; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32( +; PREDFLAG: call void @llvm.masked.store.v4i32.p0v4i32({{.*}}, <4 x i1> %active.lane.mask ; PREDFLAG: %index.next = add i64 %index, 4 -; PREDFLAG: %12 = icmp eq i64 %index.next, 432 -; PREDFLAG: br i1 %{{.*}}, label %middle.block, label %vector.body, !llvm.loop !6 +; PREDFLAG: %[[CMP:.*]] = icmp eq i64 %index.next, 432 +; PREDFLAG: br i1 %[[CMP]], label %middle.block, label %vector.body, !llvm.loop !6 entry: br label %for.body