diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -2023,11 +2023,38 @@
   createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI);
 
   /// Compute the backedge taken count knowing the interval difference, and
-  /// the stride for an inequality.  Result takes the form:
-  /// (Delta + (Stride - 1)) udiv Stride.
-  /// Caller must ensure that this expression either does not overflow or
-  /// that the result is undefined if it does.
-  const SCEV *computeBECount(const SCEV *Delta, const SCEV *Stride);
+  /// the stride for an inequality.
+  ///
+  /// Caller must ensure that non-negative N exists such that
+  /// (Start + Stride * N) >= End, and that computing "(Start + Stride * N)"
+  /// doesn't overflow. In other words:
+  /// 1. If IsSigned is true, Start <=s End. Otherwise, Start <=u End.
+  /// 2. If End is not equal to start and IsSigned is true, Stride >s 0. If
+  ///    End is not equal to start and IsSigned is false, Stride >u 0.
+  /// 3. The index variable doesn't overflow.
+  ///
+  /// If the preconditions hold, the backedge taken count is N.
+  ///
+  /// IsSigned determines whether End, Start, and Stride are treated as
+  /// signed values, for the purpose of optimizing the form of the result.
+  ///
+  /// This function tries to use an optimized form:
+  /// ((End - Start) + (Stride - 1)) /u Stride
+  ///
+  /// If it can't prove the addition doesn't overflow in that form, it uses
+  /// getUDivCeilSCEV.
+  const SCEV *computeBECount(bool IsSigned, const SCEV *Start, const SCEV *End,
+                             const SCEV *Stride);
+
+  /// Compute ceil(N / D). N and D are treated as unsigned values.
+  ///
+  /// Since SCEV doesn't have native ceiling division, this generates a
+  /// SCEV expression of the following form:
+  ///
+  /// umin(N, 1) + floor((N - umin(N, 1)) / D)
+  ///
+  /// A denominator of zero or poison is handled the same way as getUDivExpr().
+  const SCEV *getUDivCeilSCEV(const SCEV *N, const SCEV *D);
 
   /// Compute the maximum backedge count based on the range of values
   /// permitted by Start, End, and Stride. This is for loops of the form
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -11497,11 +11497,100 @@
   return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
 }
 
-const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta,
-                                            const SCEV *Step) {
-  const SCEV *One = getOne(Step->getType());
-  Delta = getAddExpr(Delta, getMinusSCEV(Step, One));
-  return getUDivExpr(Delta, Step);
+const SCEV *ScalarEvolution::computeBECount(bool IsSigned, const SCEV *Start,
+                                            const SCEV *End,
+                                            const SCEV *Stride) {
+  // The basic formula here is ceil((End - Start) / Stride).  Since SCEV
+  // doesn't natively have division that rounds up, we need to convert to
+  // floor division.
+  //
+  // MayOverflow is whether adding (End - Start) + (Stride - 1)
+  // can overflow if Stride is positive. It's a precondition of the
+  // function that "End - Start" doesn't overflow. We handle the case where
+  // Stride isn't positive later.
+  //
+  // In practice, the arithmetic almost never overflows, but we have to prove
+  // it.  We have a variety of ways to come up with a proof.
+  const SCEV *One = getOne(Stride->getType());
+  bool MayOverflow = [&] {
+    if (auto *StrideC = dyn_cast<SCEVConstant>(Stride)) {
+      if (StrideC->getAPInt().isPowerOf2()) {
+        // Suppose Stride is a power of two, and Start/End are unsigned
+        // integers.  Let UMAX be the largest representable unsigned
+        // integer.
+        //
+        // By the preconditions of this function (see comment in header), we
+        // know "(Start + Stride * N)" >= End, and this doesn't overflow.
+        // As a formula:
+        //
+        //   End <= (Start + Stride * N) <= UMAX
+        //
+        // Subtracting Start from all the terms:
+        //
+        //   End - Start <= Stride * N <= UMAX - Start
+        //
+        // Since Start is unsigned, UMAX - Start <= UMAX.  Therefore:
+        //
+        //   End - Start <= Stride * N <= UMAX
+        //
+        // Stride * N is a multiple of Stride. Therefore,
+        //
+        //   End - Start <= Stride * N <= UMAX - (UMAX mod Stride)
+        //
+        // Since Stride is a power of two, UMAX + 1 is divisible by Stride.
+        // Therefore, UMAX mod Stride == Stride - 1.  So we can write:
+        //
+        //   End - Start <= Stride * N <= UMAX - Stride - 1
+        //
+        // Dropping the middle term:
+        //
+        //   End - Start <= UMAX - Stride - 1
+        //
+        // Adding Stride - 1 to both sides:
+        //
+        //   (End - Start) + (Stride - 1) <= UMAX
+        //
+        // In other words, the addition doesn't have unsigned overflow.
+        //
+        // A similar proof works if we treat Start/End as signed values.
+        // Just rewrite steps before "End - Start <= Stride * N <= UMAX" to
+        // use signed max instead of unsigned max. Note that we're trying
+        // to prove a lack of unsigned overflow in either case.
+        return false;
+      }
+    }
+    if (Start == Stride || Start == getMinusSCEV(Stride, One)) {
+      // If Start is equal to Stride, (End - Start) + (Stride - 1) == End - 1.
+      // If !IsSigned, 0 <u Stride == Start <=u End; so 0 <u End - 1 <u End.
+      // If IsSigned, 0 <s Stride == Start <=s End; so 0 <s End - 1 <s End.
+      //
+      // If Start is equal to Stride - 1, (End - Start) + Stride - 1 == End.
+      return false;
+    }
+    return true;
+  }();
+
+  // Force the stride to at least one, so we don't divide by zero. The stride
+  // can be zero if Delta is zero. We don't actually care what value we use
+  // for Stride in this case, as long as it isn't zero.
+  Stride = getUMaxExpr(Stride, One);
+
+  const SCEV *Delta = getMinusSCEV(End, Start);
+  if (!MayOverflow) {
+    // floor((D + (S - 1)) / S)
+    // We prefer this formulation if it's legal because it's fewer operations.
+    return getUDivExpr(getAddExpr(Delta, getMinusSCEV(Stride, One)), Stride);
+  }
+  return getUDivCeilSCEV(Delta, Stride);
+}
+
+const SCEV *ScalarEvolution::getUDivCeilSCEV(const SCEV *N, const SCEV *D) {
+  // umin(N, 1) + floor((N - umin(N, 1)) / D)
+  // This is equivalent to "1 + floor((N - 1) / D)" for N != 0. The umin
+  // expression fixes the case of N=0.
+  const SCEV *MinNOne = getUMinExpr(N, getOne(N->getType()));
+  const SCEV *NMinusOne = getMinusSCEV(N, MinNOne);
+  return getAddExpr(MinNOne, getUDivExpr(NMinusOne, D));
 }
 
 const SCEV *ScalarEvolution::computeMaxBECountForLT(const SCEV *Start,
@@ -11540,8 +11629,8 @@
   APInt MaxEnd = IsSigned ? APIntOps::smin(getSignedRangeMax(End), Limit)
                           : APIntOps::umin(getUnsignedRangeMax(End), Limit);
 
-  MaxBECount = computeBECount(getConstant(MaxEnd - MinStart) /* Delta */,
-                              getConstant(StrideForMaxBECount) /* Step */);
+  MaxBECount = getUDivCeilSCEV(getConstant(MaxEnd - MinStart) /* Delta */,
+                               getConstant(StrideForMaxBECount) /* Step */);
 
   return MaxBECount;
 }
@@ -11569,7 +11658,6 @@
 
   auto WrapType = IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW;
   bool NoWrap = ControlsExit && IV->getNoWrapFlags(WrapType);
-  ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT;
 
   const SCEV *Stride = IV->getStepRecurrence(*this);
 
@@ -11682,7 +11770,6 @@
       return RHS;
   }
 
-  const SCEV *End = RHS;
   // When the RHS is not invariant, we do not know the end bound of the loop and
   // cannot calculate the ExactBECount needed by ExitLimit. However, we can
   // calculate the MaxBECount, given the start, stride and max value for the end
@@ -11694,42 +11781,83 @@
     return ExitLimit(getCouldNotCompute() /* ExactNotTaken */, MaxBECount,
                      false /*MaxOrZero*/, Predicates);
   }
-  // If the backedge is taken at least once, then it will be taken
-  // (End-Start)/Stride times (rounded up to a multiple of Stride), where Start
-  // is the LHS value of the less-than comparison the first time it is evaluated
-  // and End is the RHS.
-  const SCEV *BECountIfBackedgeTaken =
-    computeBECount(getMinusSCEV(End, Start), Stride);
-  // If the loop entry is guarded by the result of the backedge test of the
-  // first loop iteration, then we know the backedge will be taken at least
-  // once and so the backedge taken count is as above. If not then we use the
-  // expression (max(End,Start)-Start)/Stride to describe the backedge count,
-  // as if the backedge is taken at least once max(End,Start) is End and so the
-  // result is as above, and if not max(End,Start) is Start so we get a backedge
-  // count of zero.
+
+  // If RHS >= Start, the backedge will be taken ceil(RHS - Start) / Stride
+  // times.
+  const SCEV *BECountIfEndGEStart =
+      computeBECount(IsSigned, Start, RHS, Stride);
+
+  // Try to prove RHS >= Start.
+  bool RHSGEStart = false;
+  ICmpInst::Predicate CondGE =
+      IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE;
+  ICmpInst::Predicate CondGT =
+      IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+  if (isLoopEntryGuardedByCond(L, CondGE, OrigRHS, OrigStart)) {
+    RHSGEStart = true;
+  } else if (isLoopEntryGuardedByCond(
+                 L, CondGT, OrigRHS,
+                 getAddExpr(OrigStart, getMinusOne(OrigStart->getType())))) {
+    // "RHS >= Start" is trivially equivalent to "RHS > Start - 1" if
+    // "Start - 1" doesn't overflow.  If Start - 1 does overflow, it's equal
+    // to INT_MAX, and "RHS > INT_MAX" is trivially false.  So we can just
+    // check for "RHS > Start - 1".
+    //
+    // FIXME: Should isLoopEntryGuardedByCond do this for us?
+    RHSGEStart = true;
+  }
+
   const SCEV *BECount;
-  if (isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(OrigStart, Stride), OrigRHS))
-    BECount = BECountIfBackedgeTaken;
-  else {
-    // If we know that RHS >= Start in the context of loop, then we know that
-    // max(RHS, Start) = RHS at this point.
-    if (isLoopEntryGuardedByCond(
-            L, IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, OrigRHS, OrigStart))
-      End = RHS;
-    else
-      End = IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start);
-    BECount = computeBECount(getMinusSCEV(End, Start), Stride);
+  if (RHSGEStart) {
+    BECount = BECountIfEndGEStart;
+  } else {
+    const SCEV *StartMinusStride = getMinusSCEV(Start, Stride);
+    if (isLoopEntryGuardedByCond(L, CondGT, Start, StartMinusStride) &&
+        isLoopEntryGuardedByCond(L, CondGT, RHS, StartMinusStride)) {
+      // Suppose "Stride > 0", "Start - Stride" doesn't overflow, and
+      // "RHS > Start - Stride".
+      //
+      // If RHS >= Start, in general, the backedge-taken count is
+      // "ceil(RHS - Start) / Stride".  This is equvalent to
+      // "((RHS - Start) + (Stride - 1)) /u Stride" in arbitrary-precision math.
+      // We can reassociate that to "((RHS - 1) - (Start - Stride)) /u Stride".
+      // According to our preconditions, "RHS - 1" and "Start - Stride" can't
+      // overflow, and "((RHS - 1) - (Start - Stride))" fits into an unsigned
+      // integer, so we can just emit that expression directly.
+      //
+      // If "RHS < Start", in general, the backedge-taken count is zero.
+      // "RHS < Start" implies "((RHS - 1) - (Start - Stride)) < Stride",
+      // so the above formula evaluates to zero.
+      //
+      // FIXME: We're missing some cases because of limitations of
+      // isLoopEntryGuardedByCond.
+      const SCEV *MinusOne = getMinusOne(Stride->getType());
+      const SCEV *Numerator =
+          getMinusSCEV(getAddExpr(RHS, MinusOne), StartMinusStride);
+      BECount = getUDivExpr(Numerator, Stride);
+    } else {
+      // If RHS < Start, the backedge will be taken zero times.  So in general,
+      // we can write the backedge-taken count as:
+      //
+      //     RHS >= Start ? ceil(RHS - Start) / Stride : 0
+      //
+      // We convert it to the following to make it more convenient for SCEV:
+      //
+      //     ceil(max(RHS, Start) - Start) / Stride
+      const SCEV *End =
+          IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start);
+      BECount = computeBECount(IsSigned, Start, End, Stride);
+    }
   }
 
   const SCEV *MaxBECount;
   bool MaxOrZero = false;
-  if (isa<SCEVConstant>(BECount))
+  if (isa<SCEVConstant>(BECount)) {
     MaxBECount = BECount;
-  else if (isa<SCEVConstant>(BECountIfBackedgeTaken)) {
-    // If we know exactly how many times the backedge will be taken if it's
-    // taken at least once, then the backedge count will either be that or
-    // zero.
-    MaxBECount = BECountIfBackedgeTaken;
+  } else if (isa<SCEVConstant>(BECountIfEndGEStart)) {
+    // If we know exactly how many times the backedge will be taken if
+    // End >= Start, then the backedge count will either be that or zero.
+    MaxBECount = BECountIfEndGEStart;
     MaxOrZero = true;
   } else {
     MaxBECount = computeMaxBECountForLT(
@@ -11804,7 +11932,7 @@
       return End;
   }
 
-  const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride);
+  const SCEV *BECount = computeBECount(IsSigned, End, Start, Stride);
 
   APInt MaxStart = IsSigned ? getSignedRangeMax(Start)
                             : getUnsignedRangeMax(Start);
@@ -11825,11 +11953,8 @@
 
   const SCEV *MaxBECount = isa<SCEVConstant>(BECount)
                                ? BECount
-                               : computeBECount(getConstant(MaxStart - MinEnd),
-                                                getConstant(MinStride));
-
-  if (isa<SCEVCouldNotCompute>(MaxBECount))
-    MaxBECount = BECount;
+                               : getUDivCeilSCEV(getConstant(MaxStart - MinEnd),
+                                                 getConstant(MinStride));
 
   return ExitLimit(BECount, MaxBECount, false, Predicates);
 }
diff --git a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
--- a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
+++ b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll
@@ -1,14 +1,9 @@
 ; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution | FileCheck %s
 ; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 | FileCheck %s
 
-; CHECK: Loop %bb: backedge-taken count is ((-5 + %x) /u 3)
+; CHECK: Loop %bb: backedge-taken count is (((-7 + (-1 * (1 umin (-7 + %x)))<nuw><nsw> + %x) /u 3) + (1 umin (-7 + %x)))
 ; CHECK: Loop %bb: max backedge-taken count is 1431655764
 
-
-; ScalarEvolution can't compute a trip count because it doesn't know if
-; dividing by the stride will have a remainder. This could theoretically
-; be teaching it how to use a more elaborate trip count computation.
-
 define i32 @f(i32 %x) nounwind readnone {
 entry:
 	%0 = icmp ugt i32 %x, 4		; <i1> [#uses=1]
diff --git a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
--- a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
+++ b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll
@@ -1,12 +1,10 @@
 ; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution 2>&1 | FileCheck %s
 ; RUN: opt < %s -disable-output "-passes=print<scalar-evolution>" 2>&1 2>&1 | FileCheck %s
 
-; CHECK: Loop %bb: backedge-taken count is ((999 + (-1 * %x)) /u 3)
+; CHECK: Loop %bb: backedge-taken count is (((-3 + (-1 * (1 umin (-3 + (-1 * %x) + (1000 umax (3 + %x)))))<nuw><nsw> + (-1 * %x) + (1000 umax (3 + %x))) /u 3) + (1 umin (-3 + (-1 * %x) + (1000 umax (3 + %x)))))
 ; CHECK: Loop %bb: max backedge-taken count is 334
 
-
-; This is a tricky testcase for unsigned wrap detection which ScalarEvolution
-; doesn't yet know how to do.
+; This is a tricky testcase for unsigned wrap detection.
 
 define i32 @f(i32 %x) nounwind readnone {
 entry:
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
--- a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll
@@ -4,8 +4,8 @@
 ; ScalarEvolution should be able to compute trip count of the loop by proving
 ; that this is not an infinite loop with side effects.
 
-; CHECK: Determining loop execution counts for: @foo1
-; CHECK: backedge-taken count is ((-1 + %n) /u %s)
+; CHECK-LABEL: Determining loop execution counts for: @foo1
+; CHECK: backedge-taken count is ((-1 + (-1 * %s) + (1 umax %s) + (%n smax %s)) /u (1 umax %s))
 
 ; We should have a conservative estimate for the max backedge taken count for
 ; loops with unknown stride.
@@ -34,8 +34,8 @@
 
 
 ; Check that we are able to compute trip count of a loop without an entry guard.
-; CHECK: Determining loop execution counts for: @foo2
-; CHECK: backedge-taken count is ((-1 + (%n smax %s)) /u %s)
+; CHECK-LABEL: Determining loop execution counts for: @foo2
+; CHECK: backedge-taken count is ((-1 + (-1 * %s) + (1 umax %s) + (%n smax %s)) /u (1 umax %s))
 
 ; We should have a conservative estimate for the max backedge taken count for
 ; loops with unknown stride.
@@ -61,7 +61,7 @@
 
 ; Check that without mustprogress we don't make assumptions about infinite
 ; loops being UB.
-; CHECK: Determining loop execution counts for: @foo3
+; CHECK-LABEL: Determining loop execution counts for: @foo3
 ; CHECK: Loop %for.body: Unpredictable backedge-taken count.
 ; CHECK: Loop %for.body: Unpredictable max backedge-taken count.
 
@@ -84,8 +84,8 @@
 }
 
 ; Same as foo2, but with mustprogress on loop, not function
-; CHECK: Determining loop execution counts for: @foo4
-; CHECK: backedge-taken count is ((-1 + (%n smax %s)) /u %s)
+; CHECK-LABEL: Determining loop execution counts for: @foo4
+; CHECK: backedge-taken count is ((-1 + (-1 * %s) + (1 umax %s) + (%n smax %s)) /u (1 umax %s))
 ; CHECK: max backedge-taken count is -1
 
 define void @foo4(i32* nocapture %A, i32 %n, i32 %s) {
@@ -106,5 +106,31 @@
   ret void
 }
 
+; A more complex case with pre-increment compare instead of post-increment.
+; CHECK-LABEL: Determining loop execution counts for: @foo5
+; CHECK: Loop %for.body: backedge-taken count is ((((-1 * (1 umin ((-1 * %start) + (%n smax %start))))<nuw><nsw> + (-1 * %start) + (%n smax %start)) /u (1 umax %s)) + (1 umin ((-1 * %start) + (%n smax %start))))
+
+; We should have a conservative estimate for the max backedge taken count for
+; loops with unknown stride.
+; CHECK: max backedge-taken count is -1
+
+define void @foo5(i32* nocapture %A, i32 %n, i32 %s, i32 %start) mustprogress {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.05 = phi i32 [ %add, %for.body ], [ %start, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.05
+  %0 = load i32, i32* %arrayidx, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* %arrayidx, align 4
+  %add = add nsw i32 %i.05, %s
+  %cmp = icmp slt i32 %i.05, %n
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %entry
+  ret void
+}
+
 !8 = distinct !{!8, !9}
 !9 = !{!"llvm.loop.mustprogress"}
diff --git a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll
--- a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll
+++ b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll
@@ -17,22 +17,24 @@
 ; CHECK-NEXT:    [[CMP34:%.*]] = icmp slt i32 [[M:%.*]], [[N:%.*]]
 ; CHECK-NEXT:    br i1 [[CMP34]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.preheader:
-; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[N]], -1
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[TMP0]], [[M]]
-; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 3
+; CHECK-NEXT:    [[TMP0:%.*]] = add i32 [[M]], 4
+; CHECK-NEXT:    [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]])
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[TMP1]], [[M]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[TMP2]], 2
+; CHECK-NEXT:    [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[M]], [[INDVAR]]
-; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP5]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
-; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP6]], 2
-; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP5]]
+; CHECK-NEXT:    [[TMP6:%.*]] = add i32 [[M]], [[INDVAR]]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[MUL:%.*]] = shl nsw i32 [[TMP7]], 2
+; CHECK-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]]
 ; CHECK-NEXT:    store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4
 ; CHECK-NEXT:    [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1
-; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP4]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP5]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end.loopexit:
 ; CHECK-NEXT:    br label [[FOR_END]]
diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
--- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
+++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll
@@ -15,16 +15,15 @@
 ; CHECK:       loop.preheader:
 ; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 [[Y:%.*]], [[ADD]]
 ; CHECK-NEXT:    [[INC:%.*]] = add i64 [[DIV]], 1
-; CHECK-NEXT:    [[TMP0:%.*]] = add nuw nsw i64 [[DIV]], 4
-; CHECK-NEXT:    [[TMP1:%.*]] = udiv i64 [[TMP0]], [[INC]]
-; CHECK-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 3, [[INC]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 2
 ; CHECK-NEXT:    br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
-; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP2]], 1
+; CHECK-NEXT:    [[N_RND_UP:%.*]] = add i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2
 ; CHECK-NEXT:    [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]]
 ; CHECK-NEXT:    [[IND_END:%.*]] = mul i64 [[N_VEC]], [[INC]]
-; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1
+; CHECK-NEXT:    [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
@@ -35,41 +34,41 @@
 ; CHECK-NEXT:    [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INC]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP3:%.*]] = mul <2 x i64> <i64 0, i64 1>, [[DOTSPLAT]]
-; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], [[TMP3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 0, [[INC]]
-; CHECK-NEXT:    [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], [[TMP4]]
+; CHECK-NEXT:    [[TMP2:%.*]] = mul <2 x i64> <i64 0, i64 1>, [[DOTSPLAT]]
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = mul i64 0, [[INC]]
+; CHECK-NEXT:    [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]]
 ; CHECK-NEXT:    [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer
 ; CHECK-NEXT:    [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT4]], <i64 0, i64 1>
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0
-; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0
+; CHECK-NEXT:    br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
 ; CHECK:       pred.store.if:
 ; CHECK-NEXT:    store i32 0, i32* [[PTR:%.*]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE]]
 ; CHECK:       pred.store.continue:
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1
-; CHECK-NEXT:    br i1 [[TMP8]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1
+; CHECK-NEXT:    br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.if5:
 ; CHECK-NEXT:    store i32 0, i32* [[PTR]], align 4
 ; CHECK-NEXT:    br label [[PRED_STORE_CONTINUE6]]
 ; CHECK:       pred.store.continue6:
 ; CHECK-NEXT:    [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], [[INC]]
-; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[OFFSET_IDX7]] to i8
-; CHECK-NEXT:    [[TMP10:%.*]] = trunc i64 [[INC]] to i8
-; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = trunc i64 [[OFFSET_IDX7]] to i8
+; CHECK-NEXT:    [[TMP9:%.*]] = trunc i64 [[INC]] to i8
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[TMP8]], i32 0
 ; CHECK-NEXT:    [[BROADCAST_SPLAT9:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0
+; CHECK-NEXT:    [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i32 0
 ; CHECK-NEXT:    [[DOTSPLAT11:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT10]], <2 x i8> poison, <2 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP11:%.*]] = mul <2 x i8> <i8 0, i8 1>, [[DOTSPLAT11]]
-; CHECK-NEXT:    [[INDUCTION12:%.*]] = add <2 x i8> [[BROADCAST_SPLAT9]], [[TMP11]]
-; CHECK-NEXT:    [[TMP12:%.*]] = mul i8 0, [[TMP10]]
-; CHECK-NEXT:    [[TMP13:%.*]] = add i8 [[TMP9]], [[TMP12]]
-; CHECK-NEXT:    [[TMP14:%.*]] = add i8 [[TMP13]], 1
+; CHECK-NEXT:    [[TMP10:%.*]] = mul <2 x i8> <i8 0, i8 1>, [[DOTSPLAT11]]
+; CHECK-NEXT:    [[INDUCTION12:%.*]] = add <2 x i8> [[BROADCAST_SPLAT9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP11:%.*]] = mul i8 0, [[TMP9]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add i8 [[TMP8]], [[TMP11]]
+; CHECK-NEXT:    [[TMP13:%.*]] = add i8 [[TMP12]], 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
-; CHECK-NEXT:    [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
@@ -82,7 +81,7 @@
 ; CHECK-NEXT:    [[V3:%.*]] = add i8 [[V2]], 1
 ; CHECK-NEXT:    [[CMP15:%.*]] = icmp slt i8 [[V3]], 5
 ; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], [[INC]]
-; CHECK-NEXT:    br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       loop.exit:
 ; CHECK-NEXT:    [[DIV_1:%.*]] = udiv i64 [[Y]], [[ADD]]
 ; CHECK-NEXT:    [[V1:%.*]] = add i64 [[DIV_1]], 1