Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -124,16 +124,21 @@
   /// the second check that no overflow can occur in the expression that use
   /// this backedge-taken count.
   bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                        FixedVectorType *VecTy);
+                        FixedVectorType *VecTy, Instruction *DefBTC);
 
   /// Insert the intrinsic to represent the effect of tail predication.
   void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
-                           FixedVectorType *VecTy);
+                           FixedVectorType *VecTy, Instruction *DefBTC);
 
   /// Rematerialize the iteration count in exit blocks, which enables
   /// ARMLowOverheadLoops to better optimise away loop update statements inside
   /// hardware-loops.
   void RematerializeIterCount();
+
+  /// Given the backedge taken count (BTC) use from get.active.lane.mask,
+  /// find its definition from which we can extract the number of elements
+  /// processed by the loop.
+  Instruction *MatchDefBTC(Value *BTC);
 };
 
 } // end namespace
@@ -347,6 +352,7 @@
 // Here we perform checks that this intrinsic behaves as expected,
 // which means:
 //
+// 0) Check that BTC is indeed the loop's BTC.
 // 1) The element count, which is calculated with BTC + 1, cannot overflow.
 // 2) The element count needs to be sufficiently large that the decrement of
 //    element counter doesn't overflow, which means that we need to prove:
@@ -358,19 +364,58 @@
 // 3) The IV must be an induction phi with an increment equal to the
 //    vector width.
 bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy) {
+    Value *TripCount, FixedVectorType *VecTy, Instruction *DefBTC) {
   bool ForceTailPredication =
     EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
     EnableTailPredication == TailPredication::ForceEnabled;
+  auto *IntrinsicBTC = ActiveLaneMask->getOperand(1);
+  int VectorWidth = VecTy->getNumElements();
+  ConstantInt *BTCValue = nullptr;
+
+  // 0) Check that this BTC is indeed the loop's BTC.
+  if ((BTCValue = dyn_cast<ConstantInt>(IntrinsicBTC))) {
+    ConstantInt *TC = dyn_cast<ConstantInt>(TripCount);
+    if (!TC) {
+      LLVM_DEBUG(dbgs() << "ARM TP: Constant tripcount expected in "
+                           "set.loop.iterations\n");
+      return false;
+    }
+
+    // Calculate 2 tripcount values and check that they are consistent with
+    // each other:
+    // - The number of loop iterations extracted from the set.loop.iterations
+    //   intrinsic, multiplied by the vector width:
+    uint64_t TC1 = TC->getZExtValue() * VectorWidth;
+
+    // - TC1 has to be equal to BTC + 1 + 1, where BTC + 1 is the loop
+    //   tripcount and the extra + 1 compensates for starting the count at 0.
+    uint64_t TC2 = BTCValue->getZExtValue() + 1 + 1;
+
+    if (TC1 != TC2) {
+      LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
+                        << TC1 << " from set.loop.iterations, and "
+                        << TC2 << " from get.active.lane.mask\n");
+      return false;
+    }
+  } else if (DefBTC) {
+    // Because we look for the definition of BTC in a preheader, we know it is
+    // loop invariant and don't need to check that again here.
+    LLVM_DEBUG(dbgs() << "ARM TP: BTC found in the preheader block: "
+                      << *DefBTC << "\n");
+  } else {
+    LLVM_DEBUG(dbgs() << "ARM TP: Couldn't verify that get.active.lane.mask "
+                         "second argument is the backedge taken count.\n");
+    return false;
+  }
+
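To illustrate the new step 0 on a hypothetical loop over N = 7999 i32 elements with a vector width of 4 (these numbers are made up for this sketch and do not come from the tests below), both intrinsics carry a constant and the two derived trip counts must agree:

  ; ceil(7999 / 4) = 2000 hardware-loop iterations:
  call void @llvm.set.loop.iterations.i32(i32 2000)
  ...
  ; the backedge-taken count is 7999 - 1 = 7998:
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 7998)

  ; TC1 = 2000 * 4     = 8000
  ; TC2 = 7998 + 1 + 1 = 8000   ; consistent, so the check passes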
   // 1) Test whether entry to the loop is protected by a conditional
   // BTC + 1 < 0. In other words, if the scalar trip count overflows,
   // becomes negative, we shouldn't enter the loop and creating
   // tripcount expression BTC + 1 is not safe. So, check that BTC
   // isn't max. This is evaluated in unsigned, because the semantics
   // of @get.active.lane.mask is a ULE comparison.
-  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
-  auto *BTC = SE->getSCEV(BackedgeTakenCount);
+  auto *BTC = SE->getSCEV(IntrinsicBTC);
   auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
 
   if (isa<SCEVCouldNotCompute>(MaxBTC)) {
@@ -404,7 +449,6 @@
   //
   auto *TC = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
-  int VectorWidth = VecTy->getNumElements();
   auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
   uint64_t MaxMinusVW = Diff.getZExtValue();
   uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
@@ -494,40 +538,23 @@
   return false;
 }
 
-// Materialize NumElements in the preheader block.
-static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
-  // First, check the preheader if it not already exist:
-  //
-  // preheader:
-  //    %BTC = add i32 %N, -1
-  //    ..
-  // vector.body:
-  //
-  // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1,
-  // but instead can just return %N.
-  for (auto &I : *Preheader) {
-    if (I.getOpcode() != Instruction::Add || &I != BTC)
-      continue;
-    ConstantInt *MinusOne = nullptr;
-    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
-      continue;
-    if (MinusOne->getSExtValue() == -1) {
-      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
-      return I.getOperand(0);
-    }
-  }
+// Materialize NumElements in the preheader block if necessary.
+static Value *getNumElements(BasicBlock *Preheader, Instruction *DefBTC,
+                             Value *UseBTC) {
+  if (DefBTC)
+    return DefBTC->getOperand(0);
 
-  // But we do need to materialise BTC if it is not already there,
-  // e.g. if it is a constant.
+  // But we do need to materialise it if the definition of BTC is not already
+  // present, e.g. if it is a constant.
   IRBuilder<> Builder(Preheader->getTerminator());
-  Value *NumElements = Builder.CreateAdd(BTC,
-        ConstantInt::get(BTC->getType(), 1), "num.elements");
+  Value *NumElements = Builder.CreateAdd(UseBTC,
+        ConstantInt::get(UseBTC->getType(), 1), "num.elements");
   LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
   return NumElements;
 }
 
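A sketch of the two paths through the rewritten getNumElements, reusing the hypothetical %N and 7998 from the sketch above: when MatchDefBTC found a definition, the element count is simply that instruction's first operand, otherwise an add is materialised:

  ; DefBTC found in a preheader:
  ;   %trip.count.minus.1 = add i32 %N, -1
  ; => NumElements = DefBTC->getOperand(0) = %N, no new instruction needed.
  ;
  ; No DefBTC, e.g. a constant BTC of 7998:
  ;   %num.elements = add i32 7998, 1   ; created before the preheader terminator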
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
-    Value *TripCount, FixedVectorType *VecTy) {
+    Value *TripCount, FixedVectorType *VecTy, Instruction *DefBTC) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
   Module *M = L->getHeader()->getModule();
   Type *Ty = IntegerType::get(M->getContext(), 32);
@@ -536,9 +563,9 @@
   // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
   // is one less than the trip count. So we need to find or create
   // %num.elements = %BTC + 1 in the preheader.
-  Value *BTC = ActiveLaneMask->getOperand(1);
+  Value *UseBTC = ActiveLaneMask->getOperand(1);
   Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
-  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
+  Value *NumElements = getNumElements(L->getLoopPreheader(), DefBTC, UseBTC);
 
   // Insert a phi to count the number of elements processed by the loop.
   Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
@@ -578,6 +605,55 @@
                     << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
 }
 
+
+// If it is not a constant value, we assume that the backedge taken count
+// expression exists in this form in the loop-nest hierarchy:
+//
+// preheader:
+//    %BTC = add i32 %N, -1
+//    ..
+// vector.body:
+//
+// We use this to materialise the value that represents the "number of elements
+// processed", and also in the sanity checks for get.active.lane.mask to check
+// that the BTC is in fact the loop's BTC.
+//
+Instruction *MVETailPredication::MatchDefBTC(Value *BTC) {
+  auto getBTC = [&] (BasicBlock *BB) -> Instruction * {
+    for (auto &I : *BB) {
+      if (I.getOpcode() != Instruction::Add || &I != BTC)
+        continue;
+      ConstantInt *MinusOne = nullptr;
+      if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+        continue;
+      if (MinusOne->getSExtValue() == -1) {
+        LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+        return &I;
+      }
+    }
+    return nullptr;
+  };
+
+  Loop *L = this->L;
+
+  do {
+    if (!L->getLoopPreheader())
+      return nullptr;
+
+    Instruction *I = nullptr;
+    if ((I = getBTC(L->getLoopPreheader())))
+      return I;
+
+    BasicBlock *Pred = nullptr;
+    if ((Pred = L->getLoopPreheader()->getSinglePredecessor()))
+      if ((I = getBTC(Pred)))
+        return I;
+
+  } while ((L = L->getParentLoop()));
+
+  return nullptr;
+}
+
 bool MVETailPredication::TryConvert(Value *TripCount) {
   if (!IsPredicatedVectorLoop()) {
     LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
@@ -605,12 +681,14 @@
                       << *ActiveLaneMask << "\n");
 
     auto *VecTy = getVectorType(I);
-    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+    auto *DefBTC = MatchDefBTC(ActiveLaneMask->getOperand(1));
+
+    if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy, DefBTC)) {
       LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
       return false;
     }
    LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
-    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
+    InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy, DefBTC);
   }
 
   Cleanup(Predicates, L);
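MatchDefBTC walks up the loop nest: it checks the preheader of each loop and, failing that, the preheader's single predecessor. A minimal sketch of the nested case it is meant to handle (the block names are illustrative, loosely modelled on the nested.ll changes further down):

  ; for.cond1.preheader.us:                 ; preheader of the inner vector loop
  ;   %trip.count.minus.1 = add i32 %N, -1  ; DefBTC found here, or in a parent loop
  ;   ...
  ; vector.body:
  ;   %mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)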
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -430,6 +430,211 @@
   ret void
 }
 
+; CHECK-LABEL: BTC_not_N_minus_1
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @BTC_not_N_minus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+
+; BTC is not of the form BTC = N - 1 here:
+
+  %trip.count.minus.1 = add i32 %N, -2
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+; CHECK-LABEL: BTC_not_N_minus_1_v2
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @BTC_not_N_minus_1_v2(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+
+; BTC is not of the form BTC = N - 1 here:
+
+  %trip.count.minus.1 = sub i32 %N, -1
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
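In @BTC_not_N_minus_1 above, the matcher correctly rejects %trip.count.minus.1 = add i32 %N, -2. The v2 variant above probes the other half of the pattern: the value is numerically close, but the instruction shape is wrong, so it must not match either:

  ; %btc = add i32 %N, -1   ; matches: an add with a constant -1 operand
  ; %btc = sub i32 %N, -1   ; no match: a sub, and it computes %N + 1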
+; CHECK-LABEL: BTC_not_N_minus_1_v3
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @BTC_not_N_minus_1_v3(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+
+; We don't have a BTC = N - 1 instruction here (or anywhere else).
+
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
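@BTC_not_N_minus_1_v3 above passes %N itself as the mask's second operand, so there is no defining add to find at all. The next test, @const_expected_in_set_loop, exercises the constant path of step 0: the mask's BTC is the constant 42, but set.loop.iterations receives the non-constant %5, so IsSafeActiveMask bails out with "Constant tripcount expected in set.loop.iterations":

  call void @llvm.set.loop.iterations.i32(i32 %5)   ; not a ConstantInt
  ...
  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)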
+; CHECK-LABEL: const_expected_in_set_loop
+; CHECK: call <4 x i1> @llvm.get.active.lane.mask
+; CHECK-NOT: vctp
+; CHECK: ret void
+;
+define dso_local void @const_expected_in_set_loop(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
+entry:
+  %cmp8 = icmp sgt i32 %N, 0
+  %0 = add i32 %N, 3
+  %1 = lshr i32 %0, 2
+  %2 = shl nuw i32 %1, 2
+  %3 = add i32 %2, -4
+  %4 = lshr i32 %3, 2
+  %5 = add nuw nsw i32 %4, 1
+  br i1 %cmp8, label %vector.ph, label %for.cond.cleanup
+
+vector.ph:
+  call void @llvm.set.loop.iterations.i32(i32 %5)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.ph
+  %lsr.iv17 = phi i32* [ %scevgep18, %vector.body ], [ %A, %vector.ph ]
+  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %C, %vector.ph ]
+  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %vector.ph ]
+  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
+  %lsr.iv13 = bitcast i32* %lsr.iv to <4 x i32>*
+  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
+  %lsr.iv1719 = bitcast i32* %lsr.iv17 to <4 x i32>*
+
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 42)
+
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv13, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %wide.masked.load12 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %7 = add nsw <4 x i32> %wide.masked.load12, %wide.masked.load
+  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv1719, i32 4, <4 x i1> %active.lane.mask)
+  %index.next = add i32 %index, 4
+  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
+  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
+  %scevgep18 = getelementptr i32, i32* %lsr.iv17, i32 4
+  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
+  %9 = icmp ne i32 %8, 0
+  br i1 %9, label %vector.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+
 declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>)
 declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>)
 declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>)
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/clear-maskedinsts.ll
@@ -23,13 +23,12 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT71:%.*]] = insertelement <4 x i32> undef, i32 [[X]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT72:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT71]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP3]])
-; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_183]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[LSR_IV9:%.*]] = phi i32* [ [[SCEVGEP10:%.*]], [[VECTOR_BODY]] ], [ [[D:%.*]], [[VECTOR_PH]] ]
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP4:%.*]] = phi i32 [ [[TMP3]], [[VECTOR_PH]] ], [ [[TMP10:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ %n, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[LSR_IV911:%.*]] = bitcast i32* [[LSR_IV9]] to <4 x i32>*
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
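The CHECK updates above, and in nested.ll below, show the payoff of MatchDefBTC. A condensed before/after sketch of what the updated checks express: since the preheader already defines the backedge-taken count, the pass no longer materialises a separate element count, and the counting phi starts at the trip count directly:

  ; before: %num.elements = add i32 %trip.count.minus.1, 1
  ;         %phi = phi i32 [ %num.elements, %vector.ph ], ...
  ; after:  %phi = phi i32 [ %n, %vector.ph ], ...   ; %n taken from the add's operand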
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/nested.ll
@@ -24,13 +24,12 @@
 ; CHECK-NEXT:    [[ARRAYIDX8_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX8_US]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX8_PROMOTED_US]], i32 0
 ; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ %N, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
@@ -146,13 +145,12 @@
 ; CHECK-NEXT:    [[ARRAYIDX7_PROMOTED_US:%.*]] = load i32, i32* [[ARRAYIDX7_US]], align 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> <i32 0, i32 0, i32 0, i32 0>, i32 [[ARRAYIDX7_PROMOTED_US]], i32 0
 ; CHECK-NEXT:    call void @llvm.set.loop.iterations.i32(i32 [[TMP2]])
-; CHECK-NEXT:    [[NUM_ELEMENTS:%.*]] = add i32 [[TRIP_COUNT_MINUS_1]], 1
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <4 x i32> [ [[TMP4]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[TMP5:%.*]] = phi i32 [ [[TMP2]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ [[NUM_ELEMENTS]], [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = phi i32 [ %N, [[FOR_COND1_PREHEADER_US]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> undef, i32 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:    [[INDUCTION:%.*]] = add <4 x i32> [[BROADCAST_SPLAT]], <i32 0, i32 1, i32 2, i32 3>
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-const.ll
@@ -265,13 +265,13 @@
   ret void
 }
 
-; CHECK-LABEL: @overflow_BTC_plus_1(
+; CHECK-LABEL: @inconsistent_tripcounts(
 ; CHECK: vector.body:
 ; CHECK-NOT: @llvm.arm.mve.vctp32
 ; CHECK: @llvm.get.active.lane.mask
 ; CHECK: ret void
 ;
-define dso_local void @overflow_BTC_plus_1(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
+define dso_local void @inconsistent_tripcounts(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
   call void @llvm.set.loop.iterations.i32(i32 8001)
   br label %vector.body
@@ -316,63 +316,7 @@
 ;
 define dso_local void @overflow_in_sub(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
 entry:
-  call void @llvm.set.loop.iterations.i32(i32 8001)
-  br label %vector.body
-
-vector.body:
-  %lsr.iv14 = phi i32* [ %scevgep15, %vector.body ], [ %A, %entry ]
-  %lsr.iv11 = phi i32* [ %scevgep12, %vector.body ], [ %C, %entry ]
-  %lsr.iv = phi i32* [ %scevgep, %vector.body ], [ %B, %entry ]
-  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
-  %0 = phi i32 [ 8001, %entry ], [ %3, %vector.body ]
-  %lsr.iv1416 = bitcast i32* %lsr.iv14 to <4 x i32>*
-  %lsr.iv1113 = bitcast i32* %lsr.iv11 to <4 x i32>*
-  %lsr.iv10 = bitcast i32* %lsr.iv to <4 x i32>*
-  %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
-  %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
-  %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
-
-; Overflow in the substraction. This should hold:
-;
-;   ceil(ElementCount / VectorWidth) >= TripCount
-;
-; But we have:
-;
-;   ceil(3200 / 4) >= 8001
-;   8000 >= 8001
-;
-  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 31999)
-
-  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
-  %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)
-  %2 = add nsw <4 x i32> %wide.masked.load9, %wide.masked.load
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %2, <4 x i32>* %lsr.iv1416, i32 4, <4 x i1> %1)
-  %index.next = add i32 %index, 4
-  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
-  %scevgep12 = getelementptr i32, i32* %lsr.iv11, i32 4
-  %scevgep15 = getelementptr i32, i32* %lsr.iv14, i32 4
-  %3 = call i32 @llvm.loop.decrement.reg.i32(i32 %0, i32 1)
-  %4 = icmp ne i32 %3, 0
-  br i1 %4, label %vector.body, label %for.cond.cleanup
-
-for.cond.cleanup:
-  ret void
-}
-
-; CHECK-LABEL: @overflow_in_rounding_tripcount(
-; CHECK: vector.body:
-; CHECK-NOT: @llvm.arm.mve.vctp32
-; CHECK: @llvm.get.active.lane.mask
-; CHECK: ret void
-;
-define dso_local void @overflow_in_rounding_tripcount(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture readnone %D, i32 %N) local_unnamed_addr #0 {
-entry:
-
-; TC = 4294967292
-; 4294967292 <= 4294967291 (MAX - vectorwidth)
-; False
-;
-  call void @llvm.set.loop.iterations.i32(i32 4294967291)
+  call void @llvm.set.loop.iterations.i32(i32 1073741824)
   br label %vector.body
 
 vector.body:
@@ -388,7 +332,7 @@
   %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
 
-  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 32002)
+  %1 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 4294967294)
 
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv10, i32 4, <4 x i1> %1, <4 x i32> undef)
   %wide.masked.load9 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv1113, i32 4, <4 x i1> %1, <4 x i32> undef)