Index: llvm/include/llvm/Analysis/ScalarEvolution.h
===================================================================
--- llvm/include/llvm/Analysis/ScalarEvolution.h
+++ llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1177,6 +1177,9 @@
   /// sharpen it.
   void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags);
 
+  /// Try to apply information from loop guards for \p L to \p Expr.
+  const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
+
 private:
   /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
   /// Value is deleted.
@@ -2021,9 +2024,6 @@
   /// Assign A and B to LHS and RHS, respectively.
   bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
 
-  /// Try to apply information from loop guards for \p L to \p Expr.
-  const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
-
   /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
   /// `UniqueSCEVs`.
   ///
Index: llvm/lib/Analysis/ScalarEvolution.cpp
===================================================================
--- llvm/lib/Analysis/ScalarEvolution.cpp
+++ llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6875,7 +6875,8 @@
     // Attempt to factor more general cases. Returns the greatest power of
     // two divisor. If overflow happens, the trip count expression is still
    // divisible by the greatest power of 2 divisor returned.
-    return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
+    return 1U << std::min((uint32_t)31,
+                          GetMinTrailingZeros(applyLoopGuards(TCExpr, L)));
 
   ConstantInt *Result = TC->getValue();
 
@@ -13241,6 +13242,37 @@
 const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
   auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
                               const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
+    // If we have LHS == 0, check if LHS is computing a property of some
+    // unknown SCEV %v which we can rewrite %v to express explicitly.
+    const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+    if (Predicate == CmpInst::ICMP_EQ && RHSC &&
+        RHSC->getValue()->isNullValue()) {
+      // Check if LHS is zext(trunc(%v, iD), %v->getType()), proving %v to be a
+      // multiple of 2^D. If so, rewrite %v to (%v /u 2^D) * 2^D to explicitly
+      // zero %v's D low bits.
+      if (auto *ZExt = dyn_cast<SCEVZeroExtendExpr>(LHS)) {
+        if (auto *Trunc = dyn_cast<SCEVTruncateExpr>(ZExt->getOperand(0)))
+          if (const SCEVUnknown *LHSUnknown =
+                  dyn_cast<SCEVUnknown>(Trunc->getOperand(0))) {
+            Value *V = LHSUnknown->getValue();
+            if (ZExt->getType() == LHSUnknown->getType()) {
+              // The condition is proving some low bits to be zero.
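+              // For example, if %v is i32 and D is 2, a guard of the form
+              // zext(trunc(%v, i2), i32) == 0 shows %v's low 2 bits are zero,
+              // hence %v == (%v /u 4) * 4.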
+              auto *Divisor = getConstant(
+                  V->getType(), 1ULL << Trunc->getType()->getScalarSizeInBits(),
+                  false);
+              auto *ClearLowBits = getMulExpr(
+                  getUDivExpr(LHSUnknown, Divisor), Divisor,
+                  (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
+              RewriteMap[V] = ClearLowBits;
+              return;
+            }
+          }
+      }
+    }
+
     if (!isa<SCEVUnknown>(LHS)) {
       std::swap(LHS, RHS);
       Predicate = CmpInst::getSwappedPredicate(Predicate);
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5576,7 +5576,8 @@
   const SCEV *ExitCount = SE->getAddExpr(
       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
   const SCEV *Rem = SE->getURemExpr(
-      ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+      SE->applyLoopGuards(ExitCount, TheLoop),
+      SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
   if (Rem->isZero()) {
     // Accept MaxVF if we do not have a tail.
     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
Index: llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -9,7 +9,7 @@
 ; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)
 ; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2
 ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)
-; CHECK: Loop %for.body: Trip multiple is 1
+; CHECK: Loop %for.body: Trip multiple is 4
 ;
 entry:
   %u = urem i32 %num, 4
Index: llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s
+
+; Make sure the loop is unrolled without a remainder loop based on an assumption
+; that the least significant bit is known to be zero.
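+;
+; The assume makes (%n & 1) == 0 a loop guard; SCEV models that test as
+; zext(trunc(%n, i1), i32) == 0, so applyLoopGuards can rewrite %n as
+; (%n /u 2) * 2, proving the trip count is a multiple of the unroll factor 2.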
+
+define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %n, i1 %c) local_unnamed_addr {
+; CHECK-LABEL: @assumeDivisibleTC(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BAIL_OUT:%.*]], label [[DO_WORK:%.*]]
+; CHECK:       bail.out:
+; CHECK-NEXT:    ret void
+; CHECK:       do.work:
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[N:%.*]], 1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT:    br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond.cleanup.loopexit:
+; CHECK-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK:       for.cond.cleanup:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body:
+; CHECK-NEXT:    [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = zext i32 [[I_011]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[ADD:%.*]] = add i8 [[TMP0]], 3
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT:    [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT:    [[IDXPROM_1:%.*]] = zext i32 [[INC]] to i64
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i64 [[IDXPROM_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT:    [[ADD_1:%.*]] = add i8 [[TMP1]], 3
+; CHECK-NEXT:    [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i64 [[IDXPROM_1]]
+; CHECK-NEXT:    store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT:    [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT:    [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]]
+; CHECK-NEXT:    br i1 [[CMP1_1]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]]
+;
+entry:
+  br i1 %c, label %bail.out, label %do.work
+
+bail.out:
+  ret void
+
+do.work:
+  %and = and i32 %n, 1
+  %cmp = icmp eq i32 %and, 0
+  tail call void @llvm.assume(i1 %cmp)
+  %cmp110 = icmp sgt i32 %n, 0
+  br i1 %cmp110, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %do.work
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %idxprom = zext i32 %i.011 to i64
+  %arrayidx = getelementptr inbounds i8, i8* %b, i64 %idxprom
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = add i8 %0, 3
+  %arrayidx4 = getelementptr inbounds i8, i8* %a, i64 %idxprom
+  store i8 %add, i8* %arrayidx4, align 1
+  %inc = add nuw nsw i32 %i.011, 1
+  %cmp1 = icmp slt i32 %inc, %n
+  br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit, !llvm.loop !0
+}
+
+declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.mustprogress"}
+!2 = !{!"llvm.loop.unroll.count", i32 4}
Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -64,13 +64,21 @@
 
 ; Make sure the loop is vectorized under -Os without folding its tail based on
 ; its trip-count's lower bits assumed to be zero.
-define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32* %p) optsize {
+define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q, i1 %c) optsize {
 ; CHECK-LABEL: @assumeAlignedTC(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[N:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT:    [[AND:%.*]] = and i32 [[N]], 3
-; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[AND]], 0
-; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT:    br i1 [[C:%.*]], label [[BAIL_OUT:%.*]], label [[DO_WORK:%.*]]
+; CHECK:       bail.out:
+; CHECK-NEXT:    ret void
+; CHECK:       do.work:
+; CHECK-NEXT:    [[AND1:%.*]] = and i32 [[P:%.*]], 3
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[Q:%.*]], 8
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT:    tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT:    [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
 ; CHECK-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK:       vector.ph:
@@ -89,12 +97,12 @@
 ; CHECK-NEXT:    store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK:       scalar.ph:
-; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[DO_WORK]] ]
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
 ; CHECK-NEXT:    [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
@@ -102,19 +110,29 @@
 ; CHECK-NEXT:    store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT:    [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT:    [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT:    br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
 ; CHECK:       exit:
 ; CHECK-NEXT:    ret void
 ;
 entry:
-  %n = load i32, i32* %p
-  %and = and i32 %n, 3
-  %cmp = icmp eq i32 %and, 0
-  tail call void @llvm.assume(i1 %cmp)
+  br i1 %c, label %bail.out, label %do.work
+
+bail.out:
+  ret void
+
+do.work:
+  %and1 = and i32 %p, 3
+  %cmp1 = icmp eq i32 %and1, 0
+  tail call void @llvm.assume(i1 %cmp1)
+  %rem = urem i32 %q, 8
+  %cmp2 = icmp eq i32 %rem, 0
+  tail call void @llvm.assume(i1 %cmp2)
+  %gt = icmp sgt i32 %p, %q
+  %n = select i1 %gt, i32 %p, i32 %q
   br label %loop
 
 loop:
-  %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
+  %riv = phi i32 [ 0, %do.work ], [ %rivPlus1, %loop ]
   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
   store i32 13, i32* %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1