Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -352,14 +352,14 @@
 
 // The active lane intrinsic has this form:
 //
-//    @llvm.get.active.lane.mask(IV, BTC)
+//    @llvm.get.active.lane.mask(IV, TC)
 //
 // Here we perform checks that this intrinsic behaves as expected,
 // which means:
 //
-// 1) The element count, which is calculated with BTC + 1, cannot overflow.
-// 2) The element count needs to be sufficiently large that the decrement of
-//    element counter doesn't overflow, which means that we need to prove:
+// 1) Check that the TripCount (TC) belongs to this loop (originally).
+// 2) The element count (TC) needs to be sufficiently large that the decrement
+//    of element counter doesn't overflow, which means that we need to prove:
 //        ceil(ElementCount / VectorWidth) >= TripCount
 //    by rounding up ElementCount up:
 //        ((ElementCount + (VectorWidth - 1)) / VectorWidth
@@ -373,29 +373,8 @@
       EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
       EnableTailPredication == TailPredication::ForceEnabled;
 
-  // 1) Test whether entry to the loop is protected by a conditional
-  // BTC + 1 < 0. In other words, if the scalar trip count overflows,
-  // becomes negative, we shouldn't enter the loop and creating
-  // tripcount expression BTC + 1 is not safe. So, check that BTC
-  // isn't max. This is evaluated in unsigned, because the semantics
-  // of @get.active.lane.mask is a ULE comparison.
-  auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
-  auto *BTC = SE->getSCEV(BackedgeTakenCount);
-  auto *MaxBTC = SE->getConstantMaxBackedgeTakenCount(L);
-
-  if (isa<SCEVCouldNotCompute>(MaxBTC)) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Can't compute SCEV BTC expression: ";
-               BTC->dump());
-    return false;
-  }
-
-  APInt MaxInt = APInt(BTC->getType()->getScalarSizeInBits(), ~0);
-  if (cast<SCEVConstant>(MaxBTC)->getAPInt().eq(MaxInt) &&
-      !ForceTailPredication) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be int max: ";
-               BTC->dump());
-    return false;
-  }
+  // 1) TODO: Check that the TripCount (TC) belongs to this loop (originally).
+  auto *TCUse = ActiveLaneMask->getOperand(1);
 
   // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
   //
@@ -412,12 +391,12 @@
   //
   //      upperbound(TC) <= UINT_MAX - VectorWidth
   //
-  auto *TC = SE->getSCEV(TripCount);
+  auto *TCDef = SE->getSCEV(TripCount);
   unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
   int VectorWidth = VecTy->getNumElements();
   auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
   uint64_t MaxMinusVW = Diff.getZExtValue();
-  uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+  uint64_t UpperboundTC = SE->getSignedRange(TCDef).getUpper().getZExtValue();
 
   if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
     LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
@@ -434,7 +413,7 @@
   //
   // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime
   // values (and not constants), we have to compensate for the lowerbound value
-  // range to be off by 1. The reason is that BTC lives in the preheader in
+  // range to be off by 1. The reason is that the TC lives in the preheader in
   // this form:
   //
   //     %trip.count.minus = add nsw nuw i32 %N, -1
@@ -449,9 +428,7 @@
   // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
   // we first add 0 to TC such that we can do the <= comparison on both sets.
   //
-  auto *One = SE->getOne(TripCount->getType());
-  // ElementCount = BTC + 1
-  auto *ElementCount = SE->getAddExpr(BTC, One);
+  auto *ElementCount = SE->getSCEV(TCUse);
   // Tmp = ElementCount + (VW-1)
   auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
       SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
@@ -460,7 +437,7 @@
       SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
 
   ConstantRange RangeCeil = SE->getSignedRange(Ceil) ;
-  ConstantRange RangeTC = SE->getSignedRange(TC) ;
+  ConstantRange RangeTC = SE->getSignedRange(TCDef);
   if (!RangeTC.isSingleElement()) {
     auto ZeroRange =
         ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
@@ -504,38 +481,6 @@
   return false;
 }
 
-// Materialize NumElements in the preheader block.
-static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
-  // First, check the preheader if it not already exist:
-  //
-  // preheader:
-  //    %BTC = add i32 %N, -1
-  //    ..
-  // vector.body:
-  //
-  // if %BTC already exists. We don't need to emit %NumElems = %BTC + 1,
-  // but instead can just return %N.
-  for (auto &I : *Preheader) {
-    if (I.getOpcode() != Instruction::Add || &I != BTC)
-      continue;
-    ConstantInt *MinusOne = nullptr;
-    if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
-      continue;
-    if (MinusOne->getSExtValue() == -1) {
-      LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
-      return I.getOperand(0);
-    }
-  }
-
-  // But we do need to materialise BTC if it is not already there,
-  // e.g. if it is a constant.
-  IRBuilder<> Builder(Preheader->getTerminator());
-  Value *NumElements = Builder.CreateAdd(BTC,
-      ConstantInt::get(BTC->getType(), 1), "num.elements");
-  LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
-  return NumElements;
-}
-
 void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
     Value *TripCount, FixedVectorType *VecTy) {
   IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
@@ -543,23 +488,15 @@
   Type *Ty = IntegerType::get(M->getContext(), 32);
   unsigned VectorWidth = VecTy->getNumElements();
 
-  // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
-  // is one less than the trip count. So we need to find or create
-  // %num.elements = %BTC + 1 in the preheader.
-  Value *BTC = ActiveLaneMask->getOperand(1);
-  Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
-  Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
-
   // Insert a phi to count the number of elements processed by the loop.
   Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
   PHINode *Processed = Builder.CreatePHI(Ty, 2);
-  Processed->addIncoming(NumElements, L->getLoopPreheader());
+  Processed->addIncoming(ActiveLaneMask->getOperand(1), L->getLoopPreheader());
 
-  // Replace @llvm.get.active.mask() with the ARM specific VCTP intrinic, and thus
-  // represent the effect of tail predication.
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
   Builder.SetInsertPoint(ActiveLaneMask);
-  ConstantInt *Factor =
-      ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
+  ConstantInt *Factor = ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
 
   Intrinsic::ID VCTPID;
   switch (VectorWidth) {
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/basic-tail-pred.ll
@@ -37,7 +37,7 @@
   %tmp = getelementptr inbounds i8, i8* %a, i32 %index
 
 ; %tmp1 = icmp ule <16 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <16 x i1> @llvm.get.active.lane.mask.v16i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i8* %tmp to <16 x i8>*
   %wide.masked.load = tail call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %tmp2, i32 4, <16 x i1> %active.lane.mask, <16 x i8> undef)
@@ -94,7 +94,7 @@
   %tmp = getelementptr inbounds i16, i16* %a, i32 %index
 
 ; %tmp1 = icmp ule <8 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <8 x i1> @llvm.get.active.lane.mask.v8i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i16* %tmp to <8 x i16>*
   %wide.masked.load = tail call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %tmp2, i32 4, <8 x i1> %active.lane.mask, <8 x i16> undef)
@@ -150,7 +150,7 @@
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %tmp3 = getelementptr inbounds i32, i32* %b, i32 %index
   %tmp4 = bitcast i32* %tmp3 to <4 x i32>*
@@ -204,7 +204,7 @@
   %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
   %wide.masked.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %tmp2, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %extract.1.low = shufflevector <4 x i32> %wide.masked.load, <4 x i32> undef, <2 x i32> <i32 0, i32 2>
@@ -264,7 +264,7 @@
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
@@ -323,7 +323,7 @@
   %tmp = getelementptr inbounds i32, i32* %a, i32 %index
 
 ; %tmp1 = icmp ule <4 x i32> %induction, %broadcast.splat11
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %wrong = icmp ult <4 x i32> %induction, %broadcast.splat11
   %tmp2 = bitcast i32* %tmp to <4 x i32>*
@@ -352,10 +352,10 @@
 ;
 ; CHECK-LABEL: interleave4
 ; CHECK: vector.body:
-; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
-; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
+; CHECK: %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
+; CHECK: %active.lane.mask{{.*}} = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
 ;
 define dso_local void @interleave4(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32 %N) local_unnamed_addr #0 {
 entry:
@@ -386,13 +386,13 @@
   %lsr.iv3840 = bitcast i32* %lsr.iv38 to <4 x i32>*
   %lsr.iv3133 = bitcast i32* %lsr.iv31 to <4 x i32>*
   %lsr.iv26 = bitcast i32* %lsr.iv to <4 x i32>*
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %trip.count.minus.1)
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N)
   %v7 = add i32 %index, 4
-  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %trip.count.minus.1)
+  %active.lane.mask15 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v7, i32 %N)
   %v8 = add i32 %v7, 4
-  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %trip.count.minus.1)
+  %active.lane.mask16 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v8, i32 %N)
   %v9 = add i32 %v8, 4
-  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %trip.count.minus.1)
+  %active.lane.mask17 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %v9, i32 %N)
   %scevgep42 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -2
   %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %scevgep42, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
   %scevgep43 = getelementptr <4 x i32>, <4 x i32>* %lsr.iv3840, i32 -1
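
For reference, the overflow reasoning in the comments above can be checked with a standalone sketch. This is not part of the patch and uses plain 32-bit unsigned arithmetic instead of SCEV constant ranges; the helper names are hypothetical, chosen only to mirror the formulas `upperbound(TC) <= UINT_MAX - VectorWidth` and `ceil(ElementCount / VectorWidth) >= TripCount` quoted in the pass.

```cpp
// Standalone sketch (not part of the patch): models the two overflow
// checks from MVETailPredication with uint32_t instead of SCEV/APInt.
#include <cassert>
#include <cstdint>

// Mirrors "upperbound(TC) <= UINT_MAX - VectorWidth": if this holds,
// ElementCount + (VectorWidth - 1) cannot wrap around in 32 bits.
static bool roundingCannotOverflow(uint32_t ElementCount,
                                   uint32_t VectorWidth) {
  return ElementCount <= UINT32_MAX - VectorWidth;
}

// Mirrors "Ceil = (ElementCount + (VW - 1)) / VW": the number of vector
// iterations needed to process ElementCount elements.
static uint32_t ceilIterations(uint32_t ElementCount, uint32_t VectorWidth) {
  assert(roundingCannotOverflow(ElementCount, VectorWidth));
  return (ElementCount + (VectorWidth - 1)) / VectorWidth;
}

// The property the pass must prove before replacing the active lane mask
// with VCTP: ceil(ElementCount / VectorWidth) >= TripCount, so that the
// element counter decremented by VectorWidth each iteration never wraps.
static bool decrementCannotUnderflow(uint32_t ElementCount,
                                     uint32_t VectorWidth,
                                     uint32_t TripCount) {
  return ceilIterations(ElementCount, VectorWidth) >= TripCount;
}
```

For example, with `%N` = 10 elements and a vector width of 4, Ceil = (10 + 3) / 4 = 3, so a vector trip count of 3 satisfies the condition and tail predication is safe.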