Index: llvm/lib/Target/ARM/MVETailPredication.cpp =================================================================== --- llvm/lib/Target/ARM/MVETailPredication.cpp +++ llvm/lib/Target/ARM/MVETailPredication.cpp @@ -85,8 +85,7 @@ TargetTransformInfo *TTI = nullptr; TargetLibraryInfo *TLI = nullptr; bool ClonedVCTPInExitBlock = false; - IntrinsicInst *ActiveLaneMask = nullptr; - FixedVectorType *VecTy = nullptr; + std::vector<IntrinsicInst *> ActiveLaneMasks; public: static char ID; @@ -118,11 +117,12 @@ /// intrinsic: check if the first is a loop induction variable, and for the /// the second check that no overflow can occur in the expression that use /// this backedge-taken count. - bool IsSafeActiveMask(Value *TripCount, FixedVectorType *VecTy); + bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount, + FixedVectorType *VecTy); /// Insert the intrinsic to represent the effect of tail predication. void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount, - FixedVectorType *VecTy, + FixedVectorType *vecTy, DenseMap<Instruction *, Instruction *> &NewPredicates); /// Rematerialize the iteration count in exit blocks, which enables @@ -167,11 +167,21 @@ } void MVETailPredication::RevertActiveLaneMask() { - if (!ActiveLaneMask) + if (ActiveLaneMasks.empty()) return; - int VectorWidth = VecTy->getElementCount().Min; - IRBuilder<> Builder(ActiveLaneMask->getParent()->getFirstNonPHI()); + // Perform some sanity checks on the intrinsics: they should at least all have + // the same 2nd argument, i.e. the BTC should be the same. + auto *BTC = ActiveLaneMasks.front()->getOperand(1); + auto *VecTy = dyn_cast<FixedVectorType>(ActiveLaneMasks.front()->getType()); + for (auto *I : ActiveLaneMasks) { + assert(I->getOperand(1) == BTC && + "Same BTC expected for all get.active.lane.mask intrinsics"); + assert(dyn_cast<FixedVectorType>(I->getType()) == VecTy && + "Same types expected"); + } + + unsigned VectorWidth = VecTy->getNumElements(); // 1. Create the vector induction step.
This %induction will be the LHS of // the icmp: @@ -180,19 +190,25 @@ // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <4 x i32> 0 // %induction = add <4 x i32> %splat, // - Value *Index = ActiveLaneMask->getOperand(0); - Value *SplatIndex = - Builder.CreateVectorSplat(VectorWidth, Index, "lane.mask"); - - SmallVector<Constant *, 8> Indices; - for (int i = 0; i < VectorWidth; ++i) - Indices.push_back(ConstantInt::get(Index->getType(), i)); - - Constant *CV = ConstantVector::get(Indices); - Value *Induction = Builder.CreateAdd(SplatIndex, CV, "lane.mask.induction"); - - LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; - dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); + IRBuilder<> Builder(ActiveLaneMasks.front()->getParent()->getFirstNonPHI()); + std::vector<Value *> Inductions; + for (auto *ActiveLaneMask : ActiveLaneMasks) { + Builder.SetInsertPoint(ActiveLaneMask); + Value *Index = ActiveLaneMask->getOperand(0); + Value *SplatIndex = + Builder.CreateVectorSplat(VectorWidth, Index, "index"); + + SmallVector<Constant *, 8> Indices; + for (unsigned i = 0; i < VectorWidth; ++i) + Indices.push_back(ConstantInt::get(Index->getType(), i)); + + Constant *CV = ConstantVector::get(Indices); + Value *Induction = Builder.CreateAdd(SplatIndex, CV, "viv.induction"); + Inductions.push_back(Induction); + + LLVM_DEBUG(dbgs() << "ARM TP: New index: " << *SplatIndex << "\n"; + dbgs() << "ARM TP: New Induction: " << *Induction << "\n"); + } // 2. In the Preheader, first look if the splat BTC already exists.
Find this // %splat, which will be the RHS of the icmp: @@ -202,7 +218,6 @@ // %splat = shufflevector <4 x i32> %splatinsert, <4 x i32> undef, <16 x i32> 0 // auto *Preheader = L->getLoopPreheader(); - auto *BTC = ActiveLaneMask->getOperand(1); Value *SplatBTC = nullptr; if (auto *C = dyn_cast<ConstantInt>(BTC)) { @@ -236,11 +251,15 @@ LLVM_DEBUG(dbgs() << "ARM TCP: New splat BTC: " << *SplatBTC << "\n"); } - Builder.SetInsertPoint(ActiveLaneMask); - Value *ICmp = Builder.CreateICmp(ICmpInst::ICMP_ULE, Induction, SplatBTC); - LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n"); - ActiveLaneMask->replaceAllUsesWith(ICmp); - ActiveLaneMask->eraseFromParent(); + int i = 0; + for (auto *ActiveLaneMask : ActiveLaneMasks) { + Builder.SetInsertPoint(ActiveLaneMask); + Value *ICmp = + Builder.CreateICmp(ICmpInst::ICMP_ULE, Inductions[i++], SplatBTC); + LLVM_DEBUG(dbgs() << "ARM TP: New compare: " << *ICmp << "\n"); + ActiveLaneMask->replaceAllUsesWith(ICmp); + ActiveLaneMask->eraseFromParent(); + } } bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) { @@ -260,7 +279,7 @@ TLI = TLIP ? &TLIP->getTLI(*L->getHeader()->getParent()) : nullptr; DL = &L->getHeader()->getModule()->getDataLayout(); this->L = L; - ActiveLaneMask = nullptr; + ActiveLaneMasks.clear(); // The MVE and LOB extensions are combined to enable tail-predication, but // there's nothing preventing us from generating VCTP instructions for v8.1m.
@@ -317,15 +336,15 @@ LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n" << *Decrement << "\n"); - if (TryConvert(Setup->getArgOperand(0))) { - if (ClonedVCTPInExitBlock) - RematerializeIterCount(); - return true; - } else + if (!TryConvert(Setup->getArgOperand(0))) { + LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); RevertActiveLaneMask(); + return false; + } - LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n"); - return false; + if (ClonedVCTPInExitBlock) + RematerializeIterCount(); + return true; } static FixedVectorType *getVectorType(IntrinsicInst *I) { @@ -342,8 +361,19 @@ // load/stores. for (auto *BB : L->getBlocks()) { for (auto &I : *BB) { + auto *Int = dyn_cast<IntrinsicInst>(&I); + if (!Int) + continue; + + if (Int->getIntrinsicID() == Intrinsic::get_active_lane_mask) { + ActiveLaneMasks.push_back(Int); + continue; + } + if (Int->getIntrinsicID() == Intrinsic::fma) + continue; + if (IsMasked(&I)) { - FixedVectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I)); + auto *VecTy = getVectorType(Int); unsigned Lanes = VecTy->getNumElements(); unsigned ElementWidth = VecTy->getScalarSizeInBits(); // MVE vectors are 128-bit, but don't support 128 x i1. @@ -352,17 +382,27 @@ if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth) return false; MaskedInsts.push_back(cast<IntrinsicInst>(&I)); - } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) { - if (Int->getIntrinsicID() == Intrinsic::fma) - continue; - for (auto &U : Int->args()) { - if (isa<VectorType>(U->getType())) - return false; - } + continue; + } + + for (const Use &U : Int->args()) { + if (isa<VectorType>(U->getType())) + return false; } } } + if (ActiveLaneMasks.empty()) { + LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n"); + return false; + } + // TODO: we only expect/support 1 lane intrinsic, revert if we find more.
+ if (ActiveLaneMasks.size() > 1) { + LLVM_DEBUG(dbgs() + << "ARM TP: Multiple lane intrinsics not yet supported.\n"); + return false; + } + return !MaskedInsts.empty(); } @@ -444,14 +484,15 @@ // (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount // 3) The IV must be an induction phi with an increment equal to the // vector width. -bool MVETailPredication::IsSafeActiveMask(Value *TripCount, - FixedVectorType *VecTy) { +bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, + Value *TripCount, FixedVectorType *VecTy) { // 1) Test whether entry to the loop is protected by a conditional // BTC + 1 < 0. In other words, if the scalar trip count overflows, // becomes negative, we shouldn't enter the loop and creating // tripcount expression BTC + 1 is not safe. So, check that BTC // isn't max. This is evaluated in unsigned, because the semantics // of @get.active.lane.mask is a ULE comparison. + int VectorWidth = VecTy->getNumElements(); auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1); auto *BTC = SE->getSCEV(BackedgeTakenCount); @@ -607,6 +648,7 @@ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); Module *M = L->getHeader()->getModule(); Type *Ty = IntegerType::get(M->getContext(), 32); + unsigned VectorWidth = VecTy->getNumElements(); // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand, // is one less than the trip count. So we need to find or create @@ -624,10 +666,10 @@ // represent the effect of tail predication. 
Builder.SetInsertPoint(ActiveLaneMask); ConstantInt *Factor = - ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements()); + ConstantInt::get(cast<IntegerType>(Ty), VectorWidth); Intrinsic::ID VCTPID; - switch (VecTy->getNumElements()) { + switch (VectorWidth) { default: llvm_unreachable("unexpected number of lanes"); case 4: VCTPID = Intrinsic::arm_mve_vctp32; break; @@ -673,7 +715,7 @@ if (!Predicate || Predicates.count(Predicate)) continue; - ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); + auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate); if (!ActiveLaneMask || ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask) continue; @@ -682,8 +724,8 @@ LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: " << *ActiveLaneMask << "\n"); - VecTy = getVectorType(I); - if (!IsSafeActiveMask(TripCount, VecTy)) { + auto *VecTy = getVectorType(I); + if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) { LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n"); return false; } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll @@ -49,7 +49,7 @@ %tmp7 = bitcast i8* %tmp6 to <16 x i8>* tail call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %mul, <16 x i8>* %tmp7, i32 4, <16 x i1> %active.lane.mask) %index.next = add i32 %index, 16 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -106,7 +106,7 @@ %tmp7 = bitcast i16* %tmp6 to <8 x i16>* tail call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %mul, <8 x i16>* %tmp7, i32 4, <8 x i1> %active.lane.mask) %index.next = add i32 %index, 8 - %tmp15 = call i32
@llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -160,7 +160,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %mul, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -221,7 +221,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %combine, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -277,7 +277,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %active.lane.mask) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -336,7 +336,7 @@ %tmp7 = bitcast i32* %tmp6 to <4 x i32>* tail call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %tmp5, <4 x i32>* %tmp7, i32 4, <4 x i1> %wrong) %index.next = add i32 %index, 4 - %tmp15 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %tmp14, i32 1) + %tmp15 = call i32 @llvm.loop.decrement.reg.i32(i32 %tmp14, i32 1) %tmp16 = icmp ne i32 %tmp15, 0 br i1 %tmp16, label %vector.body, label %for.cond.cleanup @@ -344,6 +344,115 @@ ret void } +; CHECK-LABEL: interleave4 +; +; CHECK: vector.ph: ; preds = 
%entry +; CHECK: %insert.btc = insertelement <4 x i32> undef, i32 %trip.count.minus.1, i32 0 +; CHECK: %splat.btc = shufflevector <4 x i32> %insert.btc, <4 x i32> undef, <4 x i32> zeroinitializer +; +; CHECK: vector.body: ; preds = %vector.body, %vector.ph +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP1:.*]] = icmp ule <4 x i32> %viv.induction, %splat.btc +; CHECK: %v7 = add i32 %index, 4 +; CHECK: %index.splatinsert1 = insertelement <4 x i32> undef, i32 %v7, i32 0 +; CHECK: %index.splat2 = shufflevector <4 x i32> %index.splatinsert1, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction3 = add <4 x i32> %index.splat2, +; CHECK: %[[ICMP2:.*]] = icmp ule <4 x i32> %viv.induction3, %splat.btc +; CHECK: %v8 = add i32 %v7, 4 +; CHECK: %index.splatinsert4 = insertelement <4 x i32> undef, i32 %v8, i32 0 +; CHECK: %index.splat5 = shufflevector <4 x i32> %index.splatinsert4, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction6 = add <4 x i32> %index.splat5, +; CHECK: %[[ICMP3:.*]] = icmp ule <4 x i32> %viv.induction6, %splat.btc +; CHECK: %v9 = add i32 %v8, 4 +; CHECK: %index.splatinsert7 = insertelement <4 x i32> undef, i32 %v9, i32 0 +; CHECK: %index.splat8 = shufflevector <4 x i32> %index.splatinsert7, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction9 = add <4 x i32> %index.splat8, +; CHECK: %[[ICMP4:.*]] = icmp ule <4 x i32> %viv.induction9, %splat.btc +; +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP1]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP2]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP3]], <4 x i32> undef) +; CHECK: call <4 x i32> 
@llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP4]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP1]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP2]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP3]], <4 x i32> undef) +; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP4]], <4 x i32> undef) +; +define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 { +entry: + %cmp8 = icmp sgt i32 %N, 0 + %v0 = add i32 %N, 15 + %v1 = lshr i32 %v0, 4 + %v2 = shl nuw i32 %v1, 4 + %v3 = add i32 %v2, -16 + %v4 = lshr i32 %v3, 4 + %v5 = add nuw nsw i32 %v4, 1 + br i1 %cmp8, label %vector.ph, label %for.cond.cleanup + + +vector.ph: + %trip.count.minus.1 = add i32 %N, -1 + %scevgep = getelementptr i32, i32* %A, i32 8 + %scevgep30 = getelementptr i32, i32* %C, i32 8 + %scevgep37 = getelementptr i32, i32* %B, i32 8 + call void @llvm.set.loop.iterations.i32(i32 %v5) + br label %vector.body + +vector.body: + %lsr.iv38 = phi i32* [ %scevgep39, %vector.body ], [ %scevgep37, %vector.ph ] + %lsr.iv31 = phi i32* [ %scevgep32, %vector.body ], [ %scevgep30, %vector.ph ] + %lsr.iv = phi i32* [ %scevgep25, %vector.body ], [ %scevgep, %vector.ph ] + %index = phi i32 [ 0, %vector.ph ], [ %v14, %vector.body ] + %v6 = phi i32 [ %v5, %vector.ph ], [ %v15, %vector.body ] + %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>* + %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>* + %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>* + %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1) + %v7 = add i32 %index, 4 + %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1) + %v8 = add i32 %v7, 4 + %active.lane.mask16 
= call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1) + %v9 = add i32 %v8, 4 + %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1) + %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1 + %wide.masked.load18 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep43, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load19 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3840, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep41 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 1 + %wide.masked.load20 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep41, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %scevgep34 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -2 + %wide.masked.load21 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep34, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef) + %scevgep35 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 -1 + %wide.masked.load22 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep35, i32 4, <4 x i1> %active.lane.mask15, <4 x i32> undef) + %wide.masked.load23 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %lsr.iv3133, i32 4, <4 x i1> %active.lane.mask16, <4 x i32> undef) + %scevgep36 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3133, i32 1 + %wide.masked.load24 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* nonnull %scevgep36, i32 4, <4 x i1> %active.lane.mask17, <4 x i32> undef) + %v10 = add nsw <4 x i32> %wide.masked.load21, %wide.masked.load + %v11 = add nsw <4 x i32> %wide.masked.load22, 
%wide.masked.load18 + %v12 = add nsw <4 x i32> %wide.masked.load23, %wide.masked.load19 + %v13 = add nsw <4 x i32> %wide.masked.load24, %wide.masked.load20 + %scevgep27 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -2 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v10, <4 x i32>* %scevgep27, i32 4, <4 x i1> %active.lane.mask) + %scevgep28 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 -1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v11, <4 x i32>* %scevgep28, i32 4, <4 x i1> %active.lane.mask15) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v12, <4 x i32>* %lsr.iv26, i32 4, <4 x i1> %active.lane.mask16) + %scevgep29 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv26, i32 1 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %v13, <4 x i32>* %scevgep29, i32 4, <4 x i1> %active.lane.mask17) + %scevgep25 = getelementptr i32, i32* %lsr.iv, i32 16 + %scevgep32 = getelementptr i32, i32* %lsr.iv31, i32 16 + %scevgep39 = getelementptr i32, i32* %lsr.iv38, i32 16 + %v14 = add i32 %v9, 4 + %v15 = call i32 @llvm.loop.decrement.reg.i32(i32 %v6, i32 1) + %v16 = icmp ne i32 %v15, 0 + br i1 %v16, label %vector.body, label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) @@ -353,7 +462,7 @@ declare <2 x i64> @llvm.masked.load.v2i64.p0v2i64(<2 x i64>*, i32 immarg, <2 x i1>, <2 x i64>) declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare void @llvm.set.loop.iterations.i32(i32) -declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) +declare i32 @llvm.loop.decrement.reg.i32(i32, i32) declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) declare <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32, i32) 
declare <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32, i32) Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll @@ -270,10 +270,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; ; CHECK: ret void @@ -417,10 +417,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %N, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> 
%[[ICMP]], <4 x i32> undef) ; CHECK: ret void ; @@ -466,10 +466,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; CHECK: ret void ; @@ -518,10 +518,10 @@ ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask ; -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; CHECK: ret void ; @@ -566,10 +566,10 @@ ; ; CHECK-NOT: @llvm.arm.mve.vctp32 ; CHECK-NOT: @llvm.get.active.lane.mask -; CHECK: %lane.mask.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 -; CHECK: %lane.mask.splat = shufflevector <4 x 
i32> %lane.mask.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <4 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %lane.mask.induction, +; CHECK: %index.splatinsert = insertelement <4 x i32> undef, i32 %j.025, i32 0 +; CHECK: %index.splat = shufflevector <4 x i32> %index.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer +; CHECK: %viv.induction = add <4 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, ; CHECK: call <4 x i32> @llvm.masked.load.v4i32.p0v4i32({{.*}}, <4 x i1> %[[ICMP]], <4 x i32> undef) ; ; CHECK: ret void Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-reduce.ll @@ -152,10 +152,10 @@ ; CHECK %broadcast.splat2 = shufflevector <8 x i32> %broadcast.splatinsert1, <8 x i32> undef, <8 x i32> zeroinitializer ; ; CHECK: vector.body: -; CHECK: %lane.mask.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 -; CHECK: %lane.mask.splat = shufflevector <8 x i32> %lane.mask.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer -; CHECK: %lane.mask.induction = add <8 x i32> %lane.mask.splat, -; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %lane.mask.induction, %broadcast.splat2 +; CHECK: %index.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 +; CHECK: %index.splat = shufflevector <8 x i32> %index.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer +; CHECK: %viv.induction = add <8 x i32> %index.splat, +; CHECK: %[[ICMP:.*]] = icmp ule <8 x i32> %viv.induction, %broadcast.splat2 ; CHECK: call <8 x i16> @llvm.masked.load.v8i16.p0v8i16({{.*}}, <8 x i1> %[[ICMP]], <8 x i16> undef) ; CHECK: ret ; @@ -224,7 +224,7 @@ ; CHECK: br label %vector.body ; CHECK: vector.body: ; CHECK-NOT: @llvm.arm.mve.vctp -; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> 
%lane.mask.induction, %splat.btc +; CHECK: %[[ICMP:.*]] = icmp ule <4 x i32> %viv.induction, %splat.btc ; CHECK: call <4 x i16> @llvm.masked.load.v4i16.p0v4i16({{.*}}, <4 x i1> %[[ICMP]],{{.*}} ; ;