diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h
--- a/llvm/include/llvm/Analysis/ScalarEvolution.h
+++ b/llvm/include/llvm/Analysis/ScalarEvolution.h
@@ -1177,6 +1177,9 @@
   /// sharpen it.
   void setNoWrapFlags(SCEVAddRecExpr *AddRec, SCEV::NoWrapFlags Flags);
 
+  /// Try to apply information from loop guards for \p L to \p Expr.
+  const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
+
 private:
   /// A CallbackVH to arrange for ScalarEvolution to be notified whenever a
   /// Value is deleted.
@@ -2021,9 +2024,6 @@
   /// Assign A and B to LHS and RHS, respectively.
   bool matchURem(const SCEV *Expr, const SCEV *&LHS, const SCEV *&RHS);
 
-  /// Try to apply information from loop guards for \p L to \p Expr.
-  const SCEV *applyLoopGuards(const SCEV *Expr, const Loop *L);
-
   /// Look for a SCEV expression with type `SCEVType` and operands `Ops` in
   /// `UniqueSCEVs`.
   ///
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -6887,7 +6887,8 @@
     // Attempt to factor more general cases. Returns the greatest power of
     // two divisor. If overflow happens, the trip count expression is still
     // divisible by the greatest power of 2 divisor returned.
-    return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
+    return 1U << std::min((uint32_t)31,
+                          GetMinTrailingZeros(applyLoopGuards(TCExpr, L)));
 
   ConstantInt *Result = TC->getValue();
 
@@ -13259,6 +13260,27 @@
 const SCEV *ScalarEvolution::applyLoopGuards(const SCEV *Expr, const Loop *L) {
   auto CollectCondition = [&](ICmpInst::Predicate Predicate, const SCEV *LHS,
                               const SCEV *RHS, ValueToSCEVMapTy &RewriteMap) {
+    // If we have LHS == 0, check if LHS is computing a property of some
+    // unknown SCEV %v which we can rewrite %v to express explicitly.
+    const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+    if (Predicate == CmpInst::ICMP_EQ && RHSC &&
+        RHSC->getValue()->isNullValue()) {
+      // If LHS is A % B, i.e. A % B == 0, rewrite A to (A /u B) * B to
+      // explicitly express that.
+      const SCEV *URemLHS = nullptr;
+      const SCEV *URemRHS = nullptr;
+      if (matchURem(LHS, URemLHS, URemRHS)) {
+        if (const SCEVUnknown *LHSUnknown = dyn_cast<SCEVUnknown>(URemLHS)) {
+          Value *V = LHSUnknown->getValue();
+          auto Multiple =
+              getMulExpr(getUDivExpr(URemLHS, URemRHS), URemRHS,
+                         (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
+          RewriteMap[V] = Multiple;
+          return;
+        }
+      }
+    }
+
     if (!isa<SCEVUnknown>(LHS)) {
       std::swap(LHS, RHS);
       Predicate = CmpInst::getSwappedPredicate(Predicate);
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -5573,7 +5573,8 @@
   const SCEV *ExitCount = SE->getAddExpr(
       BackedgeTakenCount, SE->getOne(BackedgeTakenCount->getType()));
   const SCEV *Rem = SE->getURemExpr(
-      ExitCount, SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
+      SE->applyLoopGuards(ExitCount, TheLoop),
+      SE->getConstant(BackedgeTakenCount->getType(), MaxVFtimesIC));
   if (Rem->isZero()) {
     // Accept MaxVF if we do not have a tail.
     LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
diff --git a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
--- a/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
+++ b/llvm/test/Analysis/ScalarEvolution/trip-multiple-guard-info.ll
@@ -9,7 +9,7 @@
 ; CHECK: Loop %for.body: backedge-taken count is (-1 + %num)
 ; CHECK-NEXT: Loop %for.body: max backedge-taken count is -2
 ; CHECK-NEXT: Loop %for.body: Predicated backedge-taken count is (-1 + %num)
-; CHECK: Loop %for.body: Trip multiple is 1
+; CHECK: Loop %for.body: Trip multiple is 4
 ;
 entry:
   %u = urem i32 %num, 4
diff --git a/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll b/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/runtime-unroll-assume-no-remainder.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true -unroll-count=2 | FileCheck %s
+
+; Make sure the loop is unrolled without a remainder loop based on an assumption
+; that the least significant bit is known to be zero.
+
+define dso_local void @assumeDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
+; CHECK-LABEL: @assumeDivisibleTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
+; CHECK-NEXT: [[TMP0:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP0]], 3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
+; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
+; CHECK-NEXT: [[TMP1:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP1]], 3
+; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
+; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT: [[CMP1_1:%.*]] = icmp slt i32 [[INC_1]], [[N]]
+; CHECK-NEXT: br i1 [[CMP1_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT:%.*]], [[LOOP0:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  %and = and i32 %p, 1
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %guarded, label %exit
+
+guarded:
+  %rem = urem i32 %q, 2
+  %cmp2 = icmp eq i32 %rem, 0
+  tail call void @llvm.assume(i1 %cmp2)
+  %gt = icmp sgt i32 %p, %q
+  %n = select i1 %gt, i32 %p, i32 %q
+  %cmp110 = icmp sgt i32 %n, 0
+  br i1 %cmp110, label %for.body, label %exit
+
+for.body:
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
+  %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = add i8 %0, 3
+  %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
+  store i8 %add, i8* %arrayidx4, align 1
+  %inc = add nuw nsw i32 %i.011, 1
+  %cmp1 = icmp slt i32 %inc, %n
+  br i1 %cmp1, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+; Make sure the loop is unrolled with a remainder loop when the trip-count
+; is not provably divisible by the unroll factor.
+
+define dso_local void @cannotProveDivisibleTC(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i32 %p, i32 %q) local_unnamed_addr {
+; CHECK-LABEL: @cannotProveDivisibleTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND:%.*]] = and i32 [[P:%.*]], 6
+; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
+; CHECK-NEXT: br i1 [[CMP]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 2
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[FOR_BODY_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: for.body.preheader:
+; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT: [[XTRAITER:%.*]] = and i32 [[N]], 1
+; CHECK-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 1
+; CHECK-NEXT: br i1 [[TMP1]], label [[EXIT_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY_PREHEADER_NEW:%.*]]
+; CHECK: for.body.preheader.new:
+; CHECK-NEXT: [[UNROLL_ITER:%.*]] = sub i32 [[N]], [[XTRAITER]]
+; CHECK-NEXT: br label [[FOR_BODY:%.*]]
+; CHECK: for.body:
+; CHECK-NEXT: [[I_011:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER_NEW]] ], [ [[INC_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[NITER:%.*]] = phi i32 [ [[UNROLL_ITER]], [[FOR_BODY_PREHEADER_NEW]] ], [ [[NITER_NSUB_1:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[B:%.*]], i32 [[I_011]]
+; CHECK-NEXT: [[TMP2:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ADD:%.*]] = add i8 [[TMP2]], 3
+; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, i8* [[A:%.*]], i32 [[I_011]]
+; CHECK-NEXT: store i8 [[ADD]], i8* [[ARRAYIDX4]], align 1
+; CHECK-NEXT: [[INC:%.*]] = add nuw nsw i32 [[I_011]], 1
+; CHECK-NEXT: [[NITER_NSUB:%.*]] = sub i32 [[NITER]], 1
+; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[INC]]
+; CHECK-NEXT: [[TMP3:%.*]] = load i8, i8* [[ARRAYIDX_1]], align 1
+; CHECK-NEXT: [[ADD_1:%.*]] = add i8 [[TMP3]], 3
+; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[INC]]
+; CHECK-NEXT: store i8 [[ADD_1]], i8* [[ARRAYIDX4_1]], align 1
+; CHECK-NEXT: [[INC_1]] = add nuw nsw i32 [[INC]], 1
+; CHECK-NEXT: [[NITER_NSUB_1]] = sub i32 [[NITER_NSUB]], 1
+; CHECK-NEXT: [[NITER_NCMP_1:%.*]] = icmp ne i32 [[NITER_NSUB_1]], 0
+; CHECK-NEXT: br i1 [[NITER_NCMP_1]], label [[FOR_BODY]], label [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT:%.*]], [[LOOP2:!llvm.loop !.*]]
+; CHECK: exit.loopexit.unr-lcssa.loopexit:
+; CHECK-NEXT: [[I_011_UNR_PH:%.*]] = phi i32 [ [[INC_1]], [[FOR_BODY]] ]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT_UNR_LCSSA]]
+; CHECK: exit.loopexit.unr-lcssa:
+; CHECK-NEXT: [[I_011_UNR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[I_011_UNR_PH]], [[EXIT_LOOPEXIT_UNR_LCSSA_LOOPEXIT]] ]
+; CHECK-NEXT: [[LCMP_MOD:%.*]] = icmp ne i32 [[XTRAITER]], 0
+; CHECK-NEXT: br i1 [[LCMP_MOD]], label [[FOR_BODY_EPIL_PREHEADER:%.*]], label [[EXIT_LOOPEXIT:%.*]]
+; CHECK: for.body.epil.preheader:
+; CHECK-NEXT: br label [[FOR_BODY_EPIL:%.*]]
+; CHECK: for.body.epil:
+; CHECK-NEXT: [[I_011_EPIL:%.*]] = phi i32 [ [[I_011_UNR]], [[FOR_BODY_EPIL_PREHEADER]] ]
+; CHECK-NEXT: [[ARRAYIDX_EPIL:%.*]] = getelementptr inbounds i8, i8* [[B]], i32 [[I_011_EPIL]]
+; CHECK-NEXT: [[TMP4:%.*]] = load i8, i8* [[ARRAYIDX_EPIL]], align 1
+; CHECK-NEXT: [[ADD_EPIL:%.*]] = add i8 [[TMP4]], 3
+; CHECK-NEXT: [[ARRAYIDX4_EPIL:%.*]] = getelementptr inbounds i8, i8* [[A]], i32 [[I_011_EPIL]]
+; CHECK-NEXT: store i8 [[ADD_EPIL]], i8* [[ARRAYIDX4_EPIL]], align 1
+; CHECK-NEXT: [[INC_EPIL:%.*]] = add nuw nsw i32 [[I_011_EPIL]], 1
+; CHECK-NEXT: [[CMP1_EPIL:%.*]] = icmp slt i32 [[INC_EPIL]], [[N]]
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT_EPILOG_LCSSA:%.*]]
+; CHECK: exit.loopexit.epilog-lcssa:
+; CHECK-NEXT: br label [[EXIT_LOOPEXIT]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  %and = and i32 %p, 6
+  %cmp = icmp eq i32 %and, 0
+  br i1 %cmp, label %guarded, label %exit
+
+guarded:
+  %rem = urem i32 %q, 2
+  %cmp2 = icmp eq i32 %rem, 0
+  tail call void @llvm.assume(i1 %cmp2)
+  %gt = icmp sgt i32 %p, %q
+  %n = select i1 %gt, i32 %p, i32 %q
+  %cmp110 = icmp sgt i32 %n, 0
+  br i1 %cmp110, label %for.body, label %exit
+
+for.body:
+  %i.011 = phi i32 [ %inc, %for.body ], [ 0, %guarded ]
+  %arrayidx = getelementptr inbounds i8, i8* %b, i32 %i.011
+  %0 = load i8, i8* %arrayidx, align 1
+  %add = add i8 %0, 3
+  %arrayidx4 = getelementptr inbounds i8, i8* %a, i32 %i.011
+  store i8 %add, i8* %arrayidx4, align 1
+  %inc = add nuw nsw i32 %i.011, 1
+  %cmp1 = icmp slt i32 %inc, %n
+  br i1 %cmp1, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+declare void @llvm.assume(i1 noundef) nofree nosync nounwind willreturn
diff --git a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
--- a/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
+++ b/llvm/test/Transforms/LoopVectorize/dont-fold-tail-for-divisible-TC.ll
@@ -64,13 +64,21 @@
 ; Make sure the loop is vectorized under -Os without folding its tail based on
 ; its trip-count's lower bits assumed to be zero.
 
-define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32* %p) optsize {
+define dso_local void @assumeAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize {
 ; CHECK-LABEL: @assumeAlignedTC(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[N:%.*]] = load i32, i32* [[P:%.*]], align 4
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[N]], 3
-; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[AND]], 0
-; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP]])
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 8
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[LOOP_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: loop.preheader:
 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[N]], 4
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
@@ -89,32 +97,151 @@
 ; CHECK-NEXT: store <4 x i32> <i32 13, i32 13, i32 13, i32 13>, <4 x i32>* [[TMP3]], align 1
 ; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP4:!llvm.loop !.*]]
 ; CHECK: middle.block:
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
-; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
 ; CHECK-NEXT: br label [[LOOP:%.*]]
 ; CHECK: loop:
-; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[RIVPLUS1:%.*]], [[LOOP]] ]
+; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
 ; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
 ; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
-; CHECK-NEXT: br i1 [[COND]], label [[EXIT]], label [[LOOP]], [[LOOP2:!llvm.loop !.*]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP5:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
 ; CHECK: exit:
 ; CHECK-NEXT: ret void
 ;
 entry:
-  %n = load i32, i32* %p
-  %and = and i32 %n, 3
-  %cmp = icmp eq i32 %and, 0
-  tail call void @llvm.assume(i1 %cmp)
-  br label %loop
+  %and1 = and i32 %p, 3
+  %cmp1 = icmp eq i32 %and1, 0
+  br i1 %cmp1, label %guarded, label %exit
+
+guarded:
+  %rem = urem i32 %q, 8
+  %cmp2 = icmp eq i32 %rem, 0
+  tail call void @llvm.assume(i1 %cmp2)
+  %gt = icmp sgt i32 %p, %q
+  %n = select i1 %gt, i32 %p, i32 %q
+  %cmp110 = icmp sgt i32 %n, 0
+  br i1 %cmp110, label %loop, label %exit
 
 loop:
-  %riv = phi i32 [ 0, %entry ], [ %rivPlus1, %loop ]
+  %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
+  store i32 13, i32* %arrayidx, align 1
+  %rivPlus1 = add nuw nsw i32 %riv, 1
+  %cond = icmp eq i32 %rivPlus1, %n
+  br i1 %cond, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Make sure the loop's tail is folded when vectorized under -Os based on its trip-count's
+; not being provably divisible by chosen VF.
+
+define dso_local void @cannotProveAlignedTC(i32* noalias nocapture %A, i32 %p, i32 %q) optsize {
+; CHECK-LABEL: @cannotProveAlignedTC(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[AND1:%.*]] = and i32 [[P:%.*]], 3
+; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i32 [[AND1]], 0
+; CHECK-NEXT: br i1 [[CMP1]], label [[GUARDED:%.*]], label [[EXIT:%.*]]
+; CHECK: guarded:
+; CHECK-NEXT: [[REM:%.*]] = urem i32 [[Q:%.*]], 3
+; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[REM]], 0
+; CHECK-NEXT: tail call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT: [[GT:%.*]] = icmp sgt i32 [[P]], [[Q]]
+; CHECK-NEXT: [[N:%.*]] = select i1 [[GT]], i32 [[P]], i32 [[Q]]
+; CHECK-NEXT: [[CMP110:%.*]] = icmp sgt i32 [[N]], 0
+; CHECK-NEXT: br i1 [[CMP110]], label [[LOOP_PREHEADER:%.*]], label [[EXIT]]
+; CHECK: loop.preheader:
+; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK: vector.ph:
+; CHECK-NEXT: [[N_RND_UP:%.*]] = add i32 [[N]], 3
+; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i32 [[N_RND_UP]], 4
+; CHECK-NEXT: [[N_VEC:%.*]] = sub i32 [[N_RND_UP]], [[N_MOD_VF]]
+; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i32> poison, i32 [[TRIP_COUNT_MINUS_1]], i32 0
+; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLATINSERT]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK: vector.body:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE6:%.*]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ <i32 0, i32 1, i32 2, i32 3>, [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[PRED_STORE_CONTINUE6]] ]
+; CHECK-NEXT: [[TMP0:%.*]] = icmp ule <4 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT: [[TMP1:%.*]] = extractelement <4 x i1> [[TMP0]], i32 0
+; CHECK-NEXT: br i1 [[TMP1]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]]
+; CHECK: pred.store.if:
+; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[INDEX]], 0
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP2]]
+; CHECK-NEXT: store i32 13, i32* [[TMP3]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]]
+; CHECK: pred.store.continue:
+; CHECK-NEXT: [[TMP4:%.*]] = extractelement <4 x i1> [[TMP0]], i32 1
+; CHECK-NEXT: br i1 [[TMP4]], label [[PRED_STORE_IF1:%.*]], label [[PRED_STORE_CONTINUE2:%.*]]
+; CHECK: pred.store.if1:
+; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 1
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP5]]
+; CHECK-NEXT: store i32 13, i32* [[TMP6]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE2]]
+; CHECK: pred.store.continue2:
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x i1> [[TMP0]], i32 2
+; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4:%.*]]
+; CHECK: pred.store.if3:
+; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[INDEX]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP8]]
+; CHECK-NEXT: store i32 13, i32* [[TMP9]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE4]]
+; CHECK: pred.store.continue4:
+; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x i1> [[TMP0]], i32 3
+; CHECK-NEXT: br i1 [[TMP10]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.if5:
+; CHECK-NEXT: [[TMP11:%.*]] = add i32 [[INDEX]], 3
+; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[TMP11]]
+; CHECK-NEXT: store i32 13, i32* [[TMP12]], align 1
+; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]]
+; CHECK: pred.store.continue6:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]], <i32 4, i32 4, i32 4, i32 4>
+; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP6:!llvm.loop !.*]]
+; CHECK: middle.block:
+; CHECK-NEXT: br i1 true, label [[EXIT_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; CHECK: scalar.ph:
+; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_PREHEADER]] ]
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[RIV:%.*]] = phi i32 [ [[RIVPLUS1:%.*]], [[LOOP]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i32 [[RIV]]
+; CHECK-NEXT: store i32 13, i32* [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[RIVPLUS1]] = add nuw nsw i32 [[RIV]], 1
+; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[RIVPLUS1]], [[N]]
+; CHECK-NEXT: br i1 [[COND]], label [[EXIT_LOOPEXIT]], label [[LOOP]], [[LOOP7:!llvm.loop !.*]]
+; CHECK: exit.loopexit:
+; CHECK-NEXT: br label [[EXIT]]
+; CHECK: exit:
+; CHECK-NEXT: ret void
+;
+entry:
+  %and1 = and i32 %p, 3
+  %cmp1 = icmp eq i32 %and1, 0
+  br i1 %cmp1, label %guarded, label %exit
+
+guarded:
+  %rem = urem i32 %q, 3
+  %cmp2 = icmp eq i32 %rem, 0
+  tail call void @llvm.assume(i1 %cmp2)
+  %gt = icmp sgt i32 %p, %q
+  %n = select i1 %gt, i32 %p, i32 %q
+  %cmp110 = icmp sgt i32 %n, 0
+  br i1 %cmp110, label %loop, label %exit
+
+loop:
+  %riv = phi i32 [ 0, %guarded ], [ %rivPlus1, %loop ]
   %arrayidx = getelementptr inbounds i32, i32* %A, i32 %riv
   store i32 13, i32* %arrayidx, align 1
   %rivPlus1 = add nuw nsw i32 %riv, 1
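
For reference, a minimal sketch of how a ScalarEvolution client can combine the
newly public applyLoopGuards with GetMinTrailingZeros, mirroring the
getSmallConstantTripMultiple change above. This is illustrative only and not
part of the patch; the helper name is hypothetical, and SE/L are assumed to be
a valid ScalarEvolution instance and an analyzable loop.

  #include "llvm/Analysis/ScalarEvolution.h"
  #include <algorithm>
  using namespace llvm;

  // Hypothetical client helper, not part of this patch. Given a guard such as
  // "%n % 4 == 0" dominating L, applyLoopGuards rewrites the unknown %n in the
  // trip count expression to ((%n /u 4) * 4)<nuw><nsw>, which has two known
  // trailing zero bits, so the returned multiple becomes 4 instead of 1.
  static unsigned getGuardedTripMultiple(ScalarEvolution &SE, const Loop *L) {
    const SCEV *BTC = SE.getBackedgeTakenCount(L);
    if (isa<SCEVCouldNotCompute>(BTC))
      return 1;
    // Trip count = backedge-taken count + 1.
    const SCEV *TC = SE.getAddExpr(BTC, SE.getOne(BTC->getType()));
    const SCEV *Guarded = SE.applyLoopGuards(TC, L);
    return 1U << std::min(31u, SE.GetMinTrailingZeros(Guarded));
  }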