Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -373,15 +373,15 @@
     EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
     EnableTailPredication == TailPredication::ForceEnabled;
 
-  // 1) Check that the original scalar loop TripCount (TC) belongs to this loop.
-  // The scalar tripcount corresponds the number of elements processed by the
-  // loop, so we will refer to that from this point on.
   Value *ElemCount = ActiveLaneMask->getOperand(1);
   auto *EC= SE->getSCEV(ElemCount);
   auto *TC = SE->getSCEV(TripCount);
   int VectorWidth = VecTy->getNumElements();
   ConstantInt *ConstElemCount = nullptr;
 
+  // 1) Smoke tests that the original scalar loop TripCount (TC) belongs to
+  // this loop.  The scalar tripcount corresponds to the number of elements
+  // processed by the loop, so we will refer to that from this point on.
   if (!SE->isLoopInvariant(EC, L)) {
     LLVM_DEBUG(dbgs() << "ARM TP: element count must be loop invariant.\n");
     return false;
@@ -405,40 +405,15 @@
     //     counting from 0.
     uint64_t TC2 = ConstElemCount->getZExtValue() + 1;
 
+    // If the tripcount values are inconsistent, we don't want to insert the
+    // VCTP and trigger tail-predication; it's better to keep the
+    // get.active.lane.mask intrinsic and legalize it instead.
     if (TC1 != TC2) {
       LLVM_DEBUG(dbgs() << "ARM TP: inconsistent constant tripcount values: "
                  << TC1 << " from set.loop.iterations, and "
                  << TC2 << " from get.active.lane.mask\n");
       return false;
     }
-  } else if (!ForceTailPredication) {
-    // Smoke tests if the element count is a runtime value. I.e., this isn't
-    // fully generic because that would require a full SCEV visitor here. It
-    // would require extracting the variable from the elementcount SCEV
-    // expression, and match this up with the tripcount SCEV expression. If
-    // this matches up, we know both expressions are bound by the same
-    // variable, and thus we know this tripcount belongs to this loop. The
-    // checks below will catch most cases though.
-    if (isa<SCEVAddExpr>(EC) || isa<SCEVUnknown>(EC)) {
-      // If the element count is a simple AddExpr or SCEVUnknown, which is e.g.
-      // the case when the element count is just a variable %N, we can just see
-      // if it is an operand in the tripcount scev expression.
-      if (isa<SCEVAddExpr>(TC) && !SE->hasOperand(TC, EC)) {
-        LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
-        return false;
-      }
-    } else if (const SCEVAddRecExpr *AddRecExpr = dyn_cast<SCEVAddRecExpr>(EC)) {
-      // For more complicated AddRecExpr, check that the corresponding loop and
-      // its loop hierarhy contains the trip count loop.
-      if (!AddRecExpr->getLoop()->contains(L)) {
-        LLVM_DEBUG(dbgs() << "ARM TP: Can't verify the element counter\n");
-        return false;
-      }
-    } else {
-      LLVM_DEBUG(dbgs() << "ARM TP: Unsupported SCEV type, can't verify the "
-                           "element counter\n");
-      return false;
-    }
   }
 
   // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
@@ -449,66 +424,13 @@
   //
   //      ElementCount + (VectorWidth - 1)
   //
-  // Because of a lack of context, it is difficult to get a useful bounds on
-  // this expression. But since ElementCount uses the same variables as the
-  // TripCount (TC), for which we can find meaningful value ranges, we use that
-  // instead and assert that:
-  //
-  //     upperbound(TC) <= UINT_MAX - VectorWidth
+  // FIXME: this overflow check is currently not implemented; should it be?
   //
-  unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
-  auto MaxMinusVW = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
-  APInt UpperboundTC = SE->getUnsignedRangeMax(TC);
-
-  if (UpperboundTC.ugt(MaxMinusVW) && !ForceTailPredication) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
-               dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
-               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
-    return false;
-  }
-
   // 2.2) Make sure overflow doesn't happen in final expression:
-  //  (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,
-  // To do this, compare the full ranges of these subexpressions:
-  //
-  //     Range(Ceil) <= Range(TC)
   //
-  // where Ceil = ElementCount + (VW-1) / VW. If Ceil and TC are runtime
-  // values (and not constants), we have to compensate for the lowerbound value
-  // range to be off by 1. The reason is that the TC lives in the preheader in
-  // this form:
-  //
-  //     %trip.count.minus = add nsw nuw i32 %N, -1
-  //
-  // For the loop to be executed, %N has to be >= 1 and as a result the value
-  // range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
-  //
-  //     %5 = add nuw nsw i32 %4, 1
-  //     call void @llvm.set.loop.iterations.i32(i32 %5)
-  //
-  // where %5 is some expression using %N, which needs to have a lower bound of
-  // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
-  // we first add 0 to TC such that we can do the <= comparison on both sets.
+  //  (((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,
   //
-
-  // Tmp = ElementCount + (VW-1)
-  auto *ECPlusVWMinus1 = SE->getAddExpr(EC,
-      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
-  // Ceil = ElementCount + (VW-1) / VW
-  auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
-      SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
-
-  ConstantRange RangeCeil = SE->getUnsignedRange(Ceil) ;
-  ConstantRange RangeTC = SE->getUnsignedRange(TC) ;
-  if (!RangeTC.isSingleElement()) {
-    auto ZeroRange =
-        ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
-    RangeTC = RangeTC.unionWith(ZeroRange);
-  }
-  if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
-    LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
-    return false;
-  }
+  // FIXME: this overflow check is currently not implemented.
 
   // 3) Find out if IV is an induction phi. Note that we can't use Loop
   // helpers here to get the induction variable, because the hardware loop is
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-basic.ll
@@ -478,96 +478,6 @@
   ret void
 }
 
-; CHECK-LABEL: wrong_tripcount_arg
-; CHECK:       vector.body:
-; CHECK:       call <4 x i1> @llvm.arm.mve.vctp32
-; CHECK-NOT:   call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32
-; CHECK:       vector.body35:
-; CHECK:       call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32
-; CHECK-NOT:   call <4 x i1> @llvm.arm.mve.vctp32
-; CHECK:       ret void
-;
-define dso_local void @wrong_tripcount_arg(i32* noalias nocapture %A, i32* noalias nocapture readonly %B, i32* noalias nocapture readonly %C, i32* noalias nocapture %D, i32 %N1, i32 %N2) local_unnamed_addr #0 {
-entry:
-  %cmp29 = icmp sgt i32 %N1, 0
-  %0 = add i32 %N1, 3
-  %1 = lshr i32 %0, 2
-  %2 = shl nuw i32 %1, 2
-  %3 = add i32 %2, -4
-  %4 = lshr i32 %3, 2
-  %5 = add nuw nsw i32 %4, 1
-  br i1 %cmp29, label %vector.ph, label %for.cond4.preheader
-
-vector.ph:                                        ; preds = %entry
-  call void @llvm.set.loop.iterations.i32(i32 %5)
-  br label %vector.body
-
-vector.body:                                      ; preds = %vector.body, %vector.ph
-  %lsr.iv62 = phi i32* [ %scevgep63, %vector.body ], [ %D, %vector.ph ]
-  %lsr.iv59 = phi i32* [ %scevgep60, %vector.body ], [ %C, %vector.ph ]
-  %lsr.iv56 = phi i32* [ %scevgep57, %vector.body ], [ %B, %vector.ph ]
-  %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-  %6 = phi i32 [ %5, %vector.ph ], [ %8, %vector.body ]
-  %lsr.iv5658 = bitcast i32* %lsr.iv56 to <4 x i32>*
-  %lsr.iv5961 = bitcast i32* %lsr.iv59 to <4 x i32>*
-  %lsr.iv6264 = bitcast i32* %lsr.iv62 to <4 x i32>*
-  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %N1)
-  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5658, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
-  %wide.masked.load32 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5961, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
-  %7 = add nsw <4 x i32> %wide.masked.load32, %wide.masked.load
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %7, <4 x i32>* %lsr.iv6264, i32 4, <4 x i1> %active.lane.mask)
-  %index.next = add i32 %index, 4
-  %scevgep57 = getelementptr i32, i32* %lsr.iv56, i32 4
-  %scevgep60 = getelementptr i32, i32* %lsr.iv59, i32 4
-  %scevgep63 = getelementptr i32, i32* %lsr.iv62, i32 4
-  %8 = call i32 @llvm.loop.decrement.reg.i32(i32 %6, i32 1)
-  %9 = icmp ne i32 %8, 0
-  br i1 %9, label %vector.body, label %for.cond4.preheader
-
-for.cond4.preheader:                              ; preds = %vector.body, %entry
-  %cmp527 = icmp sgt i32 %N2, 0
-  %10 = add i32 %N2, 3
-  %11 = lshr i32 %10, 2
-  %12 = shl nuw i32 %11, 2
-  %13 = add i32 %12, -4
-  %14 = lshr i32 %13, 2
-  %15 = add nuw nsw i32 %14, 1
-  br i1 %cmp527, label %vector.ph36, label %for.cond.cleanup6
-
-vector.ph36:                                      ; preds = %for.cond4.preheader
-  call void @llvm.set.loop.iterations.i32(i32 %15)
-  br label %vector.body35
-
-vector.body35:                                    ; preds = %vector.body35, %vector.ph36
-  %lsr.iv53 = phi i32* [ %scevgep54, %vector.body35 ], [ %A, %vector.ph36 ]
-  %lsr.iv50 = phi i32* [ %scevgep51, %vector.body35 ], [ %C, %vector.ph36 ]
-  %lsr.iv = phi i32* [ %scevgep, %vector.body35 ], [ %B, %vector.ph36 ]
-  %index40 = phi i32 [ 0, %vector.ph36 ], [ %index.next41, %vector.body35 ]
-  %16 = phi i32 [ %15, %vector.ph36 ], [ %18, %vector.body35 ]
-  %lsr.iv49 = bitcast i32* %lsr.iv to <4 x i32>*
-  %lsr.iv5052 = bitcast i32* %lsr.iv50 to <4 x i32>*
-  %lsr.iv5355 = bitcast i32* %lsr.iv53 to <4 x i32>*
-
-; This has N1 as the tripcount / element count, which is the tripcount of the
-; first loop and not this one:
-  %active.lane.mask46 = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index40, i32 %N1)
-
-  %wide.masked.load47 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv49, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef)
-  %wide.masked.load48 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv5052, i32 4, <4 x i1> %active.lane.mask46, <4 x i32> undef)
-  %17 = add nsw <4 x i32> %wide.masked.load48, %wide.masked.load47
-  call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %17, <4 x i32>* %lsr.iv5355, i32 4, <4 x i1> %active.lane.mask46)
-  %index.next41 = add i32 %index40, 4
-  %scevgep = getelementptr i32, i32* %lsr.iv, i32 4
-  %scevgep51 = getelementptr i32, i32* %lsr.iv50, i32 4
-  %scevgep54 = getelementptr i32, i32* %lsr.iv53, i32 4
-  %18 = call i32 @llvm.loop.decrement.reg.i32(i32 %16, i32 1)
-  %19 = icmp ne i32 %18, 0
-  br i1 %19, label %vector.body35, label %for.cond.cleanup6
-
-for.cond.cleanup6:                                ; preds = %vector.body35, %for.cond4.preheader
-  ret void
-}
-
 ; CHECK-LABEL: tripcount_arg_not_invariant
 ; CHECK:       call <4 x i1> @llvm.get.active.lane.mask
 ; CHECK-NOT:   vctp
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tail-pred-forced.ll
@@ -3,8 +3,8 @@
 
 ; CHECK-LABEL: set_iterations_not_rounded_up
 ;
-; ENABLED:     call <4 x i1> @llvm.get.active.lane.mask
-; ENABLED-NOT: vctp
+; ENABLED-NOT: call <4 x i1> @llvm.get.active.lane.mask
+; ENABLED:     vctp
 ;
 ; FORCED-NOT:  call <4 x i1> @llvm.get.active.lane.mask
 ; FORCED:      vctp
@@ -15,10 +15,10 @@
 entry:
   %cmp8 = icmp sgt i32 %N, 0
 
-; Here, v5 which is used in set.loop.iterations which is usually rounded up to
+; FIXME: v5 which is used in set.loop.iterations is usually rounded up to
 ; a next multiple of the VF when emitted from the vectoriser, which means a
-; bound can be put on this expression. Without this, we can't, and should flag
-; this as potentially overflow behaviour.
+; bound can be put on this expression. Without this, we should flag
+; this as potential overflow behaviour?
 
   %v5 = add nuw nsw i32 %N, 1
   br i1 %cmp8, label %vector.ph, label %for.cond.cleanup