diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -86,8 +86,6 @@
   TargetTransformInfo *TTI = nullptr;
   TargetLibraryInfo *TLI = nullptr;
   bool ClonedVCTPInExitBlock = false;
-  IntrinsicInst *ActiveLaneMask = nullptr;
-  FixedVectorType *VecTy = nullptr;
 
 public:
   static char ID;
@@ -119,7 +117,8 @@
   /// intrinsic: check if the first is a loop induction variable, and for the
   /// the second check that no overflow can occur in the expression that use
   /// this backedge-taken count.
-  bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy);
+  bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+                        FixedVectorType *VecTy);
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
@@ -130,10 +129,6 @@
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
-
-  /// If it is not safe to lower @llvm.get.active.lane.mask to a VCTP, it needs
-  /// to be lowered to an icmp.
-  void RevertActiveLaneMask();
 };
 
 } // end namespace
@@ -167,83 +162,6 @@
                        DeadInsts);
 }
 
-void MVETailPredication::RevertActiveLaneMask() {
-  if (!ActiveLaneMask)
-    return;
-
-  int VectorWidth = VecTy->getElementCount().Min;
-  IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI());
-
-  // 1. Create the vector induction step. This %induction will be the LHS of
-  // the icmp:
-  //
-  // %splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0
-  // %induction = add <4 x i32> %splat, <i32 0, i32 1, i32 2, i32 3>
-  //
-  Value *Index = ActiveLaneMask->getOperand(0);
-  Value *SplatIndex =
-      Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask");
-
-  SmallVector<Constant *, 8> Indices;
-  for (int i = 0; i < VectorWidth; ++i)
-    Indices.push_back(ConstantInt::get(Index->getType(), i));
-
-  Constant *CV = ConstantVector::get(Indices);
-  Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction");
-
-  LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n";
-             dbgs() << "ARM TP: New Induction: " << *Induction << "\n");
-
-  // 2. In the Preheader, first look if the splat BTC already exists. Find this
-  // %splat, which will be the RHS of the icmp:
-  //
-  // %TC.minus.1 = add i32 %N, -1
-  // %splatinsert = insertelement <4 x i32> undef, i32 %TC.minus.1, i32 0
-  // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0
-  //
-  auto *Preheader = L->getLoopPreheader();
-  auto *BTC = ActiveLaneMask->getOperand(1);
-  Value *SplatBTC = nullptr;
-
-  if (auto *C = dyn_cast<ConstantInt>(BTC)) {
-    Builder.SetInsertPoint(Preheader->getTerminator());
-    SplatBTC = Builder.CreateVectorSplat(VectorWidth, C);
-    LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
-  } else {
-    Instruction *InsertElem;
-    for (auto &V : *Preheader) {
-      InsertElem = dyn_cast<InsertElementInst>(&V);
-      if (!InsertElem)
-        continue;
-      ConstantInt *CI = dyn_cast<ConstantInt>(InsertElem->getOperand(2));
-      if (!CI)
-        continue;
-      if (InsertElem->getOperand(1) != BTC || CI->getSExtValue() != 0)
-        continue;
-      if ((SplatBTC = dyn_cast<ShuffleVectorInst>(*InsertElem->users().begin())))
-        break;
-    }
-  }
-  // Or create the splat BTC if it doesn't exist.
-  if (!SplatBTC) {
-    Builder.SetInsertPoint(Preheader->getTerminator());
-    Value *Undef =
-        UndefValue::get(FixedVectorType::get(BTC->getType(), VectorWidth));
-    Value *Insert = Builder.CreateInsertElement(Undef,
-        BTC, Builder.getInt32(0), "insert.btc");
-    Value *Zero = ConstantInt::get(Insert->getType(), 0);
-    SplatBTC = Builder.CreateShuffleVector(Insert, Undef, Zero, "splat.btc");
-    LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n");
-  }
-
-  Builder.SetInsertPoint(ActiveLaneMask);
-  Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC);
-  LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n");
-  ActiveLaneMask->replaceAllUsesWith(ICmp);
-  ActiveLaneMask->eraseFromParent();
-}
-
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
   if (skipLoop(L) || DisableTailPredication)
     return false;
@@ -261,7 +179,6 @@
   TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr;
   DL = &L->getHeader()->getModule()->getDataLayout();
   this->L = L;
-  ActiveLaneMask = nullptr;
 
   // The MVE and LOB extensions are combined to enable tail-predication, but
   // there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -318,15 +235,14 @@
   LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
                     << *Decrement << "\n");
 
-  if (TryConvert(Setup->getArgOperand(0))) {
-    if (ClonedVCTPInExitBlock)
-      RematerializeIterCount();
-    return true;
-  } else
-    RevertActiveLaneMask();
+  if (!TryConvert(Setup->getArgOperand(0))) {
+    LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
+    return false;
+  }
 
-  LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
-  return false;
+  if (ClonedVCTPInExitBlock)
+    RematerializeIterCount();
+  return true;
 }
 
 static FixedVectorType *getVectorType(IntrinsicInst *I) {
@@ -341,10 +257,27 @@
   // Check that the loop contains at least one masked load/store intrinsic.
   // We only support 'normal' vector instructions - other than masked
   // load/stores.
+  bool ActiveLaneMask = false;
   for (auto *BB : L->getBlocks()) {
     for (auto &I : *BB) {
+      auto *Int = dyn_cast<IntrinsicInst>(&I);
+      if (!Int)
+        continue;
+
+      switch (Int->getIntrinsicID()) {
+      case Intrinsic::get_active_lane_mask:
+        ActiveLaneMask = true;
+        LLVM_FALLTHROUGH;
+      case Intrinsic::fma:
+      case Intrinsic::sadd_sat:
+      case Intrinsic::uadd_sat:
+        continue;
+      default:
+        break;
+      }
+
       if (IsMasked(&I)) {
-        FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+        auto *VecTy = getVectorType(Int);
         unsigned Lanes = VecTy->getNumElements();
         unsigned ElementWidth = VecTy->getScalarSizeInBits();
         // MVE vectors are 128-bit, but don't support 128 x i1.
@@ -353,23 +286,20 @@
         if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
           return false;
         MaskedInsts.push_back(cast<IntrinsicInst>(&I));
-      } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
-        switch (Int->getIntrinsicID()) {
-        case Intrinsic::fma:
-        case Intrinsic::sadd_sat:
-        case Intrinsic::uadd_sat:
-          continue;
-        default:
-          break;
-        }
-        for (auto &U : Int->args()) {
-          if (isa<VectorType>(U->getType()))
-            return false;
-        }
+        continue;
+      }
+
+      for (const Use &U : Int->args()) {
+        if (isa<VectorType>(U->getType()))
+          return false;
       }
     }
   }
+
+  if (!ActiveLaneMask) {
+    LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
+    return false;
+  }
   return !MaskedInsts.empty();
 }
 
@@ -451,14 +381,15 @@
 // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
 // 3) The IV must be an induction phi with an increment equal to the
 // vector width.
-bool MVETailPredication::IsSafeActiveMask(Value *TripCount,
-                                          FixedVectorType *VecTy) {
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+                                          Value *TripCount, FixedVectorType *VecTy) {
   // 1) Test whether entry to the loop is protected by a conditional
   // BTC + 1 < 0. In other words, if the scalar trip count overflows,
   // becomes negative, we shouldn't enter the loop and creating
   // tripcount expression BTC + 1 is not safe. So, check that BTC
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
+  int VectorWidth = VecTy->getNumElements();
   auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
   auto *BTC = SE->getSCEV(BackedgeTakenCount);
 
@@ -570,8 +501,8 @@
   if (VectorWidth == StepValue)
     return true;
 
-  LLVM_DEBUG(dbgs() << "ARM TP: step value " << StepValue << " doesn't match "
-             "vector width : " << VectorWidth << "\n");
+  LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+             "vector width " << VectorWidth << "\n");
 
   return false;
 }
@@ -614,6 +545,7 @@
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
+  unsigned VectorWidth = VecTy->getNumElements();
 
   // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
   // is one less than the trip count. So we need to find or create
@@ -631,10 +563,10 @@
   // represent the effect of tail predication.
   Builder.SetInsertPoint(ActiveLaneMask);
   ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+      ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
 
   Intrinsic::ID VCTPID;
-  switch (VecTy->getNumElements()) {
+  switch (VectorWidth) {
   default:
     llvm_unreachable("unexpected number of lanes");
   case 4:  VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -680,7 +612,7 @@
     if (!Predicate || Predicates.count(Predicate))
       continue;
 
-    ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+    auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
     if (!ActiveLaneMask ||
         ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
       continue;
@@ -689,8 +621,8 @@
     LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask
                       << "\n");
 
-    VecTy = getVectorType(I);
-    if (!IsSafeActiveMask(TripCount, VecTy)) {
+    auto *VecTy = getVectorType(I);
+    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
       LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
       return false;
     }
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -49,7 +49,7 @@
   %tmp7 = bitcast i8* %tmp6 to <16 x i8>*
   tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask)
   %index.next = add i32 %index, 16
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
   br i1 %tmp16, label %vector.body, label %for.cond.cleanup
 
@@ -106,7 +106,7 @@
   %tmp7 = bitcast i16* %tmp6 to <8 x i16>*
   tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask)
   %index.next = add i32 %index, 8
-  %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1)
+  %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1)
   %tmp16 = icmp ne i32 %tmp15, 0
br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -160,7 +160,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -221,7 +221,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -277,7 +277,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -336,7 +336,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -344,6 +344,92 @@ ret void } +; TODO: Multiple intrinsics not yet supported. 
+; This is currently rejected, because if the vector body is unrolled, the step +; is not what we expect: +; +; Step value 16 doesn't match vector width 4 +; +; CHECK-LABEL: interleave4 +; CHECK: vector.body: +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) +; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) +; +define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %v0 = add i32 %N, 15 + %v1 = lshr i32 %v0, 4 + %v2 = shl nuw i32 %v1, 4 + %v3 = add i32 %v2, -16 + %v4 = lshr i32 %v3, 4 + %v5 = add nuw nsw i32 %v4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + +vector.ph: + %trip.count.minus.1 = add i32 %N, -1 + %scevgep = getelementptr i32, i32* %A, i32 8 + %scevgep30 = getelementptr i32, i32* %C, i32 8 + %scevgep37 = getelementptr i32, i32* %B, i32 8 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] + %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* + %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %v7 = add i32 %index, 4 + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %v8 = add i32 %v7, 4 + %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %v9 = add i32 %v8, 4 + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 + %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1 + %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2 + %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1 + %wide.masked.load22 = call 
<4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1 + %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load + %v11 = add nsw <4 x i32> %wide.masked.load22, %wide.masked.load18 + %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 + %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 + %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask) + %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16) + %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17) + %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16 + %v14 = add i32 %v9, 4 + %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v16 = icmp ne i32 %v15, 0 + br i1 %v16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -353,7 +439,7 @@ declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) -declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -266,16 +266,9 @@ } ; CHECK-LABEL: @overflow_BTC_plus_1( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> 
undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -316,8 +309,9 @@ } ; CHECK-LABEL: @overflow_in_sub( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -366,8 +360,9 @@ } ; CHECK-LABEL: @overflow_in_rounding_tripcount( +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -413,15 +408,9 @@ ; CHECK-LABEL: @IV_not_an_induction( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_not_an_induction(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -462,15 +451,9 @@ } ; CHECK-LABEL: @IV_wrong_step( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @IV_wrong_step(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -514,15 +497,9 @@ } ; CHECK-LABEL: @IV_step_not_constant( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void 
@IV_step_not_constant(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 { @@ -563,15 +540,9 @@ } ; CHECK-LABEL: @outerloop_phi( -; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp32 -; CHECK-NOT: @llvm.get.active.lane.mask -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, -; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) -; +; CHECK: @llvm.get.active.lane.mask ; CHECK: ret void ; define dso_local void @outerloop_phi(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -143,21 +143,10 @@ ; ; CHECK-LABEL: @reduction_not_guarded ; +; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK-NOT: @llvm.get.active.lane.mask.v8i1.i32 -; -; CHECK: entry: -; CHECK: %[[ELEMCOUNT:.*]] = add i32 %N, -1 -; CHECK: %broadcast.splatinsert1 = insertelement <8 x i32> undef, i32 %[[ELEMCOUNT]], i32 0 -; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer -; -; CHECK: vector.body: -; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 -; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) -; CHECK: ret +; CHECK: @llvm.get.active.lane.mask.v8i1.i32 +; CHECK: ret ; define i16 @reduction_not_guarded(i16* nocapture readonly %A, i16 %B, i32 %N) local_unnamed_addr { entry: @@ -213,20 +202,9 @@ ; ; CHECK-LABEL: @Correlation ; -; CHECK: entry: -; CHECK: for.body.lr.ph: ; preds = %entry -; CHECK: for.body: ; preds = %for.end, %for.body.lr.ph -; CHECK: vector.ph: ; preds = %for.body -; CHECK: %trip.count.minus.1 = add i32 %8, -1 -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %7) -; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 -; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: br label %vector.body ; CHECK: vector.body: -; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, %splat.btc -; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} -; +; CHECK-NOT: @llvm.arm.mve.vctp +; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) ; ; FORCE-LABEL: @Correlation ; FORCE: vector.ph: ; preds = %for.body
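
For reference, a minimal sketch (not part of the patch) of the rewrite that
TryConvert/InsertVCTPIntrinsic performs on a 4 x i32 loop once IsSafeActiveMask
holds. The value names %index, %btc and %elts are illustrative: %btc is the
backedge-taken count (trip count - 1), and %elts stands for the element count
set up in the preheader and decremented by the vector width each iteration:

  ; Before: the vectorizer's lane mask, taking the vector IV and the
  ; backedge-taken count:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)

  ; After: the MVE tail-predication intrinsic (4 lanes select
  ; Intrinsic::arm_mve_vctp32), which the backend selects to a VCTP32
  ; instruction:
  %active.lane.mask = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elts)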