Index: llvm/include/llvm/Analysis/ScalarEvolution.h =================================================================== --- llvm/include/llvm/Analysis/ScalarEvolution.h +++ llvm/include/llvm/Analysis/ScalarEvolution.h @@ -1177,6 +1177,9 @@ /// sharpen it. void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags); + /// Try to apply information from loop guards for \p L to \p Expr. + const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L); + private: /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a /// Value is deleted. @@ -2021,9 +2024,6 @@ /// Assign A and B to LHS and RHS, respectively. bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS); - /// Try to apply information from loop guards for \p L to \p Expr. - const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L); - /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in /// `UniqueSCEVs`. /// Index: llvm/lib/Analysis/ScalarEvolution.cpp =================================================================== --- llvm/lib/Analysis/ScalarEvolution.cpp +++ llvm/lib/Analysis/ScalarEvolution.cpp @@ -6887,7 +6887,8 @@ // Attempt to factor more general cases. Returns the greatest power of // two divisor. If overflow happens, the trip count expression is still // divisible by the greatest power of 2 divisor returned. - return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr)); + return 1U << std::min((uint32_t)31, + GetMinTrailingZeros(applyLoopGuards(TCExpr, L))); ConstantInt *Result = TC->getValue(); @@ -13254,6 +13255,27 @@ const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) { auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS, const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) { + // If we have LHS == 0, check if LHS is computing a property of some unknown + // SCEV %v which we can rewrite %v to express explicitly. + const SCEVConstant *RHSC = dyn_cast(RHS); + if (Predicate == CmpInst::ICMP_EQ && RHSC && + RHSC->getValue()->isNullValue()) { + // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to + // explicitly express that. + const SCEV *URemLHS = nullptr; + const SCEV *URemRHS = nullptr; + if (matchURem(LHS, URemLHS, URemRHS)) { + if (const SCEVUnknown *LHSUnknown = dyn_cast(URemLHS)) { + Value *V = LHSUnknown->getValue(); + auto Multiple = getMulExpr( + getUDivExpr(URemLHS, URemRHS), URemRHS, + (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW)); + RewriteMap[V] = Multiple; + return; + } + } + } + if (!isa(LHS)) { std::swap(LHS, RHS); Predicate = CmpInst::getSwappedPredicate(Predicate); Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -5577,7 +5577,8 @@ const SCEV *ExitCount = SE->getAddExpr( BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType())); const SCEV *Rem = SE->getURemExpr( - ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); + SE->applyLoopGuards(ExitCount, TheLoop), + SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC)); if (Rem->isZero()) { // Accept MaxVF if we do not have a tail. LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n"); Index: llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll =================================================================== --- llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll +++ llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll @@ -9,7 +9,7 @@ ; CHECK: Loop %for.body: backedge-taken count is (-1 + %num) ; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2 ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num) -; CHECK: Loop %for.body: Trip multiple is 1 +; CHECK: Loop %for.body: Trip multiple is 4 ; entry: %u = urem i32 %num, 4 Index: llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll =================================================================== --- /dev/null +++ llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s + +; Make sure the loop is unrolled without a remainder loop based on an assumption +; that the least significant bit is known to be zero. + +define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr { +; CHECK-LABEL: @assumeDivisibleTC( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 +; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]]) +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]] +; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]] +; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]] +; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1 +; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]] +; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1 +; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP1]], 3 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]] +; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1 +; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1 +; CHECK-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]] +; CHECK-NEXT: br i1 [[CMP1_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + %and = and i32 %p, 1 + %cmp = icmp eq i32 %and, 0 + br i1 %cmp, label %guarded, label %exit + +guarded: + %rem = urem i32 %q, 2 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %gt = icmp sgt i32 %p, %q + %n = select i1 %gt, i32 %p, i32 %q + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %for.body, label %exit + +for.body: + %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ] + %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011 + %0 = load i8, i8* %arrayidx, align 1 + %add = add i8 %0, 3 + %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011 + store i8 %add, i8* %arrayidx4, align 1 + %inc = add nuw nsw i32 %i.011, 1 + %cmp1 = icmp slt i32 %inc, %n + br i1 %cmp1, label %for.body, label %exit + +exit: + ret void +} + +declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn Index: llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll +++ llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll @@ -64,13 +64,21 @@ ; Make sure the loop is vectorized under -Os without folding its tail based on ; its trip-count's lower bits assumed to be zero. -define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32* %p) optsize { +define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize { ; CHECK-LABEL: @assumeAlignedTC( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[N:%.*]] = load i32, i32* [[P:%.*]], align 4 -; CHECK-NEXT: [[AND:%.*]] = and i32 [[N]], 3 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]]) +; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]] +; CHECK: guarded: +; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 8 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]]) +; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]] +; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]] +; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0 +; CHECK-NEXT: br i1 [[CMP110]], label [[LOOP_PREHEADER:%.*]], label [[EXIT]] +; CHECK: loop.preheader: ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: @@ -89,32 +97,41 @@ ; CHECK-NEXT: store <4 x i32> , <4 x i32>* [[TMP3]], align 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ] ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]] ; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]] -; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]] +; CHECK: exit.loopexit: +; CHECK-NEXT: br label [[EXIT]] ; CHECK: exit: ; CHECK-NEXT: ret void ; entry: - %n = load i32, i32* %p - %and = and i32 %n, 3 - %cmp = icmp eq i32 %and, 0 - tail call void @llvm.assume(i1 %cmp) - br label %loop + %and1 = and i32 %p, 3 + %cmp1 = icmp eq i32 %and1, 0 + br i1 %cmp1, label %guarded, label %exit + +guarded: + %rem = urem i32 %q, 8 + %cmp2 = icmp eq i32 %rem, 0 + tail call void @llvm.assume(i1 %cmp2) + %gt = icmp sgt i32 %p, %q + %n = select i1 %gt, i32 %p, i32 %q + %cmp110 = icmp sgt i32 %n, 0 + br i1 %cmp110, label %loop, label %exit loop: - %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ] + %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ] %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv store i32 13, i32* %arrayidx, align 1 %rivPlus1 = add nuw nsw i32 %riv, 1