Index: llvm/include/llvm/Analysis/ScalarEvolution.h
===================================================================
--- llvm/include/llvm/Analysis/ScalarEvolution.h
+++ llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1177,6 +1177,9 @@
   /// sharpen it.
   void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags);
 
+  /// Try to apply information from loop guards for \p L to \p Expr.
+  const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
+
 private:
   /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
   /// Value is deleted.
@@ -2021,9 +2024,6 @@
   /// Assign A and B to LHS and RHS, respectively.
   bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
 
-  /// Try to apply information from loop guards for \p L to \p Expr.
-  const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
-
   /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
   /// `UniqueSCEVs`.
   ///
Index: llvm/lib/Analysis/ScalarEvolution.cpp
===================================================================
--- llvm/lib/Analysis/ScalarEvolution.cpp
+++ llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6875,7 +6875,8 @@
     // Attempt to factor more general cases. Returns the greatest power of
     // two divisor. If overflow happens, the trip count expression is still
    // divisible by the greatest power of 2 divisor returned.
-    return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
+    return 1U << std::min((uint32_t)31,
+                          GetMinTrailingZeros(applyLoopGuards(TCExpr, L)));
 
   ConstantInt *Result = TC->getValue();
 
@@ -13241,6 +13242,37 @@
 const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
   auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
                               const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
+    // If we have LHS == 0, check if LHS is computing a property of some
+    // unknown SCEV %v which we can rewrite %v to express explicitly.
+    const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+    if (Predicate == CmpInst::ICMP_EQ && RHSC &&
+        RHSC->getValue()->isNullValue()) {
+      // Check if LHS is zext(trunc(%v, iD), %v->getType()), proving %v to be a
+      // multiple of 2^D. If so, rewrite %v to (%v /u 2^D) * 2^D to explicitly
+      // zero %v's D low bits.
+      if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(LHS)) {
+        if (auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0)))
+          if (const SCEVUnknown *LHSUnknown =
+                  dyn_cast<SCEVUnknown>(Trunc->getOperand(0))) {
+            Value *V = LHSUnknown->getValue();
+            if (ZExt->getType() == LHSUnknown->getType()) {
+              // The condition is proving some low bits to be zero.
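+              // For example, if %v is i32 and D is 2, a guard of the form
+              // zext(trunc(%v, i2), i32) == 0 shows %v's low 2 bits are zero,
+              // hence %v == (%v /u 4) * 4.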
+              auto *Divisor = getConstant(
+                  V->getType(), 1ULL << Trunc->getType()->getScalarSizeInBits(),
+                  false);
+              auto *ClearLowBits = getMulExpr(
+                  getUDivExpr(LHSUnknown, Divisor), Divisor,
+                  (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
+              RewriteMap[V] = ClearLowBits;
+              return;
+            }
+          }
+      }
+    }
+
     if (!isa<SCEVUnknown>(LHS)) {
       std::swap(LHS, RHS);
       Predicate = CmpInst::getSwappedPredicate(Predicate);
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5576,7 +5576,8 @@
   const SCEV *ExitCount = SE->getAddExpr(
       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
   const SCEV *Rem = SE->getURemExpr(
-      ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+      SE->applyLoopGuards(ExitCount, TheLoop),
+      SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
   if (Rem->isZero()) {
     // Accept MaxVF if we do not have a tail.
     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
Index: llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -9,7 +9,7 @@
 ; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)
 ; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2
 ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)
-; CHECK: Loop %for.body: Trip multiple is 1
+; CHECK: Loop %for.body: Trip multiple is 4
 ;
 entry:
   %u = urem i32 %num, 4
Index: llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s
+
+; Make sure the loop is unrolled without a remainder loop based on an assumption
+; that the least significant bit is known to be zero.
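+;
+; The assume makes (%n & 1) == 0 a loop guard; SCEV models that test as
+; zext(trunc(%n, i1), i32) == 0, so applyLoopGuards can rewrite %n as
+; (%n /u 2) * 2, proving the trip count is a multiple of the unroll factor 2.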
+
+define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %n, i1 %c) local_unnamed_addr {
+; CHECK-LABEL: @assumeDivisibleTC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BAIL_OUT:%.*]], label [[DO_WORK:%.*]]
+; CHECK:       bail.out:
+; CHECK-NEXT:    ret void
+; CHECK:       do.work:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[N:%.*]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_011]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP0]], 3
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT:    [[IDXPROM_1:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[IDXPROM_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP1]], 3
+; CHECK-NEXT:    [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IDXPROM_1]]
+; CHECK-NEXT:    store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT:    [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP1_1]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]]
+;
+entry:
+  br i1 %c, label %bail.out, label %do.work
+
+bail.out:
+  ret void
+
+do.work:
+  %and = and i32 %n, 1
+  %cmp = icmp eq i32 %and, 0
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp110 = icmp sgt i32 %n, 0
+  br i1 %cmp110, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %do.work
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.011 to i64
+  %arrayidx = getelementptr inbounds i8, i8* %b, i64 %idxprom
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = add i8 %0, 3
+  %arrayidx4 = getelementptr inbounds i8, i8* %a, i64 %idxprom
+  store i8 %add, i8* %arrayidx4, align 1
+  %inc = add nuw nsw i32 %i.011, 1
+  %cmp1 = icmp slt i32 %inc, %n
+  br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.unroll.count", i32 4}
Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -64,13 +64,21 @@
 
 ; Make sure the loop is vectorized under -Os without folding its tail based on
 ; its trip-count's lower bits assumed to be zero.
-define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32* %p) optsize {
+define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q, i1 %c) optsize {
 ; CHECK-LABEL: @assumeAlignedTC(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[N:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[N]], 3
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BAIL_OUT:%.*]], label [[DO_WORK:%.*]]
+; CHECK:       bail.out:
+; CHECK-NEXT:    ret void
+; CHECK:       do.work:
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[P:%.*]], 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[Q:%.*]], 8
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT:    [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -89,12 +97,12 @@
 ; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DO_WORK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
@@ -102,19 +110,29 @@
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %n = load i32, i32* %p
-  %and = and i32 %n, 3
-  %cmp = icmp eq i32 %and, 0
-  tail call void @llvm.assume(i1 %cmp)
+  br i1 %c, label %bail.out, label %do.work
+
+bail.out:
+  ret void
+
+do.work:
+  %and1 = and i32 %p, 3
+  %cmp1 = icmp eq i32 %and1, 0
+  tail call void @llvm.assume(i1 %cmp1)
+  %rem = urem i32 %q, 8
+  %cmp2 = icmp eq i32 %rem, 0
+  tail call void @llvm.assume(i1 %cmp2)
+  %gt = icmp sgt i32 %p, %q
+  %n = select i1 %gt, i32 %p, i32 %q
   br label %loop
 
 loop:
-  %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
+  %riv = phi i32 [ 0, %do.work ], [ %rivPlus1, %loop ]
   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
   store i32 13, i32* %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1