diff --git a/llvm/include/llvm/Analysis/ScalarEvolution.h b/llvm/include/llvm/Analysis/ScalarEvolution.h --- a/llvm/include/llvm/Analysis/ScalarEvolution.h +++ b/llvm/include/llvm/Analysis/ScalarEvolution.h @@ -2023,11 +2023,38 @@ createAddRecFromPHIWithCastsImpl(const SCEVUnknown *SymbolicPHI); /// Compute the backedge taken count knowing the interval difference, and - /// the stride for an inequality. Result takes the form: - /// (Delta + (Stride - 1)) udiv Stride. - /// Caller must ensure that this expression either does not overflow or - /// that the result is undefined if it does. - const SCEV *computeBECount(const SCEV *Delta, const SCEV *Stride); + /// the stride for an inequality. + /// + /// Caller must ensure that non-negative N exists such that + /// (Start + Stride * N) >= End, and that computing "(Start + Stride * N)" + /// doesn't overflow. In other words: + /// 1. If IsSigned is true, Start <=s End. Otherwise, Start <=u End. + /// 2. If End is not equal to start and IsSigned is true, Stride >s 0. If + /// End is not equal to start and IsSigned is false, Stride >u 0. + /// 3. The index variable doesn't overflow. + /// + /// If the preconditions hold, the backedge taken count is N. + /// + /// IsSigned determines whether End, Start, and Stride are treated as + /// signed values, for the purpose of optimizing the form of the result. + /// + /// This function tries to use an optimized form: + /// ((End - Start) + (Stride - 1)) /u Stride + /// + /// If it can't prove the addition doesn't overflow in that form, it uses + /// getUDivCeilSCEV. + const SCEV *computeBECount(bool IsSigned, const SCEV *Start, const SCEV *End, + const SCEV *Stride); + + /// Compute ceil(N / D). N and D are treated as unsigned values. + /// + /// Since SCEV doesn't have native ceiling division, this generates a + /// SCEV expression of the following form: + /// + /// umin(N, 1) + floor((N - umin(N, 1)) / D) + /// + /// A denominator of zero or poison is handled the same way as getUDivExpr(). + const SCEV *getUDivCeilSCEV(const SCEV *N, const SCEV *D); /// Compute the maximum backedge count based on the range of values /// permitted by Start, End, and Stride. This is for loops of the form diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -11497,11 +11497,100 @@ return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS); } -const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, - const SCEV *Step) { - const SCEV *One = getOne(Step->getType()); - Delta = getAddExpr(Delta, getMinusSCEV(Step, One)); - return getUDivExpr(Delta, Step); +const SCEV *ScalarEvolution::computeBECount(bool IsSigned, const SCEV *Start, + const SCEV *End, + const SCEV *Stride) { + // The basic formula here is ceil((End - Start) / Stride). Since SCEV + // doesn't natively have division that rounds up, we need to convert to + // floor division. + // + // MayOverflow is whether adding (End - Start) + (Stride - 1) + // can overflow if Stride is positive. It's a precondition of the + // function that "End - Start" doesn't overflow. We handle the case where + // Stride isn't positive later. + // + // In practice, the arithmetic almost never overflows, but we have to prove + // it. We have a variety of ways to come up with a proof. + const SCEV *One = getOne(Stride->getType()); + bool MayOverflow = [&] { + if (auto *StrideC = dyn_cast(Stride)) { + if (StrideC->getAPInt().isPowerOf2()) { + // Suppose Stride is a power of two, and Start/End are unsigned + // integers. Let UMAX be the largest representable unsigned + // integer. + // + // By the preconditions of this function (see comment in header), we + // know "(Start + Stride * N)" >= End, and this doesn't overflow. + // As a formula: + // + // End <= (Start + Stride * N) <= UMAX + // + // Subtracting Start from all the terms: + // + // End - Start <= Stride * N <= UMAX - Start + // + // Since Start is unsigned, UMAX - Start <= UMAX. Therefore: + // + // End - Start <= Stride * N <= UMAX + // + // Stride * N is a multiple of Stride. Therefore, + // + // End - Start <= Stride * N <= UMAX - (UMAX mod Stride) + // + // Since Stride is a power of two, UMAX + 1 is divisible by Stride. + // Therefore, UMAX mod Stride == Stride - 1. So we can write: + // + // End - Start <= Stride * N <= UMAX - Stride - 1 + // + // Dropping the middle term: + // + // End - Start <= UMAX - Stride - 1 + // + // Adding Stride - 1 to both sides: + // + // (End - Start) + (Stride - 1) <= UMAX + // + // In other words, the addition doesn't have unsigned overflow. + // + // A similar proof works if we treat Start/End as signed values. + // Just rewrite steps before "End - Start <= Stride * N <= UMAX" to + // use signed max instead of unsigned max. Note that we're trying + // to prove a lack of unsigned overflow in either case. + return false; + } + } + if (Start == Stride || Start == getMinusSCEV(Stride, One)) { + // If Start is equal to Stride, (End - Start) + (Stride - 1) == End - 1. + // If !IsSigned, 0 getType())); + const SCEV *NMinusOne = getMinusSCEV(N, MinNOne); + return getAddExpr(MinNOne, getUDivExpr(NMinusOne, D)); } const SCEV *ScalarEvolution::computeMaxBECountForLT(const SCEV *Start, @@ -11540,8 +11629,8 @@ APInt MaxEnd = IsSigned ? APIntOps::smin(getSignedRangeMax(End), Limit) : APIntOps::umin(getUnsignedRangeMax(End), Limit); - MaxBECount = computeBECount(getConstant(MaxEnd - MinStart) /* Delta */, - getConstant(StrideForMaxBECount) /* Step */); + MaxBECount = getUDivCeilSCEV(getConstant(MaxEnd - MinStart) /* Delta */, + getConstant(StrideForMaxBECount) /* Step */); return MaxBECount; } @@ -11569,7 +11658,6 @@ auto WrapType = IsSigned ? SCEV::FlagNSW : SCEV::FlagNUW; bool NoWrap = ControlsExit && IV->getNoWrapFlags(WrapType); - ICmpInst::Predicate Cond = IsSigned ? ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT; const SCEV *Stride = IV->getStepRecurrence(*this); @@ -11682,7 +11770,6 @@ return RHS; } - const SCEV *End = RHS; // When the RHS is not invariant, we do not know the end bound of the loop and // cannot calculate the ExactBECount needed by ExitLimit. However, we can // calculate the MaxBECount, given the start, stride and max value for the end @@ -11694,42 +11781,83 @@ return ExitLimit(getCouldNotCompute() /* ExactNotTaken */, MaxBECount, false /*MaxOrZero*/, Predicates); } - // If the backedge is taken at least once, then it will be taken - // (End-Start)/Stride times (rounded up to a multiple of Stride), where Start - // is the LHS value of the less-than comparison the first time it is evaluated - // and End is the RHS. - const SCEV *BECountIfBackedgeTaken = - computeBECount(getMinusSCEV(End, Start), Stride); - // If the loop entry is guarded by the result of the backedge test of the - // first loop iteration, then we know the backedge will be taken at least - // once and so the backedge taken count is as above. If not then we use the - // expression (max(End,Start)-Start)/Stride to describe the backedge count, - // as if the backedge is taken at least once max(End,Start) is End and so the - // result is as above, and if not max(End,Start) is Start so we get a backedge - // count of zero. + + // If RHS >= Start, the backedge will be taken ceil(RHS - Start) / Stride + // times. + const SCEV *BECountIfEndGEStart = + computeBECount(IsSigned, Start, RHS, Stride); + + // Try to prove RHS >= Start. + bool RHSGEStart = false; + ICmpInst::Predicate CondGE = + IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE; + ICmpInst::Predicate CondGT = + IsSigned ? ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT; + if (isLoopEntryGuardedByCond(L, CondGE, OrigRHS, OrigStart)) { + RHSGEStart = true; + } else if (isLoopEntryGuardedByCond( + L, CondGT, OrigRHS, + getAddExpr(OrigStart, getMinusOne(OrigStart->getType())))) { + // "RHS >= Start" is trivially equivalent to "RHS > Start - 1" if + // "Start - 1" doesn't overflow. If Start - 1 does overflow, it's equal + // to INT_MAX, and "RHS > INT_MAX" is trivially false. So we can just + // check for "RHS > Start - 1". + // + // FIXME: Should isLoopEntryGuardedByCond do this for us? + RHSGEStart = true; + } + const SCEV *BECount; - if (isLoopEntryGuardedByCond(L, Cond, getMinusSCEV(OrigStart, Stride), OrigRHS)) - BECount = BECountIfBackedgeTaken; - else { - // If we know that RHS >= Start in the context of loop, then we know that - // max(RHS, Start) = RHS at this point. - if (isLoopEntryGuardedByCond( - L, IsSigned ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_UGE, OrigRHS, OrigStart)) - End = RHS; - else - End = IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start); - BECount = computeBECount(getMinusSCEV(End, Start), Stride); + if (RHSGEStart) { + BECount = BECountIfEndGEStart; + } else { + const SCEV *StartMinusStride = getMinusSCEV(Start, Stride); + if (isLoopEntryGuardedByCond(L, CondGT, Start, StartMinusStride) && + isLoopEntryGuardedByCond(L, CondGT, RHS, StartMinusStride)) { + // Suppose "Stride > 0", "Start - Stride" doesn't overflow, and + // "RHS > Start - Stride". + // + // If RHS >= Start, in general, the backedge-taken count is + // "ceil(RHS - Start) / Stride". This is equvalent to + // "((RHS - Start) + (Stride - 1)) /u Stride" in arbitrary-precision math. + // We can reassociate that to "((RHS - 1) - (Start - Stride)) /u Stride". + // According to our preconditions, "RHS - 1" and "Start - Stride" can't + // overflow, and "((RHS - 1) - (Start - Stride))" fits into an unsigned + // integer, so we can just emit that expression directly. + // + // If "RHS < Start", in general, the backedge-taken count is zero. + // "RHS < Start" implies "((RHS - 1) - (Start - Stride)) < Stride", + // so the above formula evaluates to zero. + // + // FIXME: We're missing some cases because of limitations of + // isLoopEntryGuardedByCond. + const SCEV *MinusOne = getMinusOne(Stride->getType()); + const SCEV *Numerator = + getMinusSCEV(getAddExpr(RHS, MinusOne), StartMinusStride); + BECount = getUDivExpr(Numerator, Stride); + } else { + // If RHS < Start, the backedge will be taken zero times. So in general, + // we can write the backedge-taken count as: + // + // RHS >= Start ? ceil(RHS - Start) / Stride : 0 + // + // We convert it to the following to make it more convenient for SCEV: + // + // ceil(max(RHS, Start) - Start) / Stride + const SCEV *End = + IsSigned ? getSMaxExpr(RHS, Start) : getUMaxExpr(RHS, Start); + BECount = computeBECount(IsSigned, Start, End, Stride); + } } const SCEV *MaxBECount; bool MaxOrZero = false; - if (isa(BECount)) + if (isa(BECount)) { MaxBECount = BECount; - else if (isa(BECountIfBackedgeTaken)) { - // If we know exactly how many times the backedge will be taken if it's - // taken at least once, then the backedge count will either be that or - // zero. - MaxBECount = BECountIfBackedgeTaken; + } else if (isa(BECountIfEndGEStart)) { + // If we know exactly how many times the backedge will be taken if + // End >= Start, then the backedge count will either be that or zero. + MaxBECount = BECountIfEndGEStart; MaxOrZero = true; } else { MaxBECount = computeMaxBECountForLT( @@ -11804,7 +11932,7 @@ return End; } - const SCEV *BECount = computeBECount(getMinusSCEV(Start, End), Stride); + const SCEV *BECount = computeBECount(IsSigned, End, Start, Stride); APInt MaxStart = IsSigned ? getSignedRangeMax(Start) : getUnsignedRangeMax(Start); @@ -11825,11 +11953,8 @@ const SCEV *MaxBECount = isa(BECount) ? BECount - : computeBECount(getConstant(MaxStart - MinEnd), - getConstant(MinStride)); - - if (isa(MaxBECount)) - MaxBECount = BECount; + : getUDivCeilSCEV(getConstant(MaxStart - MinEnd), + getConstant(MinStride)); return ExitLimit(BECount, MaxBECount, false, Predicates); } diff --git a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll --- a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll +++ b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride1.ll @@ -1,14 +1,9 @@ ; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution | FileCheck %s ; RUN: opt < %s -disable-output "-passes=print" 2>&1 | FileCheck %s -; CHECK: Loop %bb: backedge-taken count is ((-5 + %x) /u 3) +; CHECK: Loop %bb: backedge-taken count is (((-7 + (-1 * (1 umin (-7 + %x))) + %x) /u 3) + (1 umin (-7 + %x))) ; CHECK: Loop %bb: max backedge-taken count is 1431655764 - -; ScalarEvolution can't compute a trip count because it doesn't know if -; dividing by the stride will have a remainder. This could theoretically -; be teaching it how to use a more elaborate trip count computation. - define i32 @f(i32 %x) nounwind readnone { entry: %0 = icmp ugt i32 %x, 4 ; [#uses=1] diff --git a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll --- a/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll +++ b/llvm/test/Analysis/ScalarEvolution/2008-11-18-Stride2.ll @@ -1,12 +1,10 @@ ; RUN: opt < %s -analyze -enable-new-pm=0 -scalar-evolution 2>&1 | FileCheck %s ; RUN: opt < %s -disable-output "-passes=print" 2>&1 2>&1 | FileCheck %s -; CHECK: Loop %bb: backedge-taken count is ((999 + (-1 * %x)) /u 3) +; CHECK: Loop %bb: backedge-taken count is (((-3 + (-1 * (1 umin (-3 + (-1 * %x) + (1000 umax (3 + %x))))) + (-1 * %x) + (1000 umax (3 + %x))) /u 3) + (1 umin (-3 + (-1 * %x) + (1000 umax (3 + %x))))) ; CHECK: Loop %bb: max backedge-taken count is 334 - -; This is a tricky testcase for unsigned wrap detection which ScalarEvolution -; doesn't yet know how to do. +; This is a tricky testcase for unsigned wrap detection. define i32 @f(i32 %x) nounwind readnone { entry: diff --git a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll --- a/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll +++ b/llvm/test/Analysis/ScalarEvolution/trip-count-unknown-stride.ll @@ -4,8 +4,8 @@ ; ScalarEvolution should be able to compute trip count of the loop by proving ; that this is not an infinite loop with side effects. -; CHECK: Determining loop execution counts for: @foo1 -; CHECK: backedge-taken count is ((-1 + %n) /u %s) +; CHECK-LABEL: Determining loop execution counts for: @foo1 +; CHECK: backedge-taken count is ((-1 + (-1 * %s) + (1 umax %s) + (%n smax %s)) /u (1 umax %s)) ; We should have a conservative estimate for the max backedge taken count for ; loops with unknown stride. @@ -34,8 +34,8 @@ ; Check that we are able to compute trip count of a loop without an entry guard. -; CHECK: Determining loop execution counts for: @foo2 -; CHECK: backedge-taken count is ((-1 + (%n smax %s)) /u %s) +; CHECK-LABEL: Determining loop execution counts for: @foo2 +; CHECK: backedge-taken count is ((-1 + (-1 * %s) + (1 umax %s) + (%n smax %s)) /u (1 umax %s)) ; We should have a conservative estimate for the max backedge taken count for ; loops with unknown stride. @@ -61,7 +61,7 @@ ; Check that without mustprogress we don't make assumptions about infinite ; loops being UB. -; CHECK: Determining loop execution counts for: @foo3 +; CHECK-LABEL: Determining loop execution counts for: @foo3 ; CHECK: Loop %for.body: Unpredictable backedge-taken count. ; CHECK: Loop %for.body: Unpredictable max backedge-taken count. @@ -84,8 +84,8 @@ } ; Same as foo2, but with mustprogress on loop, not function -; CHECK: Determining loop execution counts for: @foo4 -; CHECK: backedge-taken count is ((-1 + (%n smax %s)) /u %s) +; CHECK-LABEL: Determining loop execution counts for: @foo4 +; CHECK: backedge-taken count is ((-1 + (-1 * %s) + (1 umax %s) + (%n smax %s)) /u (1 umax %s)) ; CHECK: max backedge-taken count is -1 define void @foo4(i32* nocapture %A, i32 %n, i32 %s) { @@ -106,5 +106,31 @@ ret void } +; A more complex case with pre-increment compare instead of post-increment. +; CHECK-LABEL: Determining loop execution counts for: @foo5 +; CHECK: Loop %for.body: backedge-taken count is ((((-1 * (1 umin ((-1 * %start) + (%n smax %start)))) + (-1 * %start) + (%n smax %start)) /u (1 umax %s)) + (1 umin ((-1 * %start) + (%n smax %start)))) + +; We should have a conservative estimate for the max backedge taken count for +; loops with unknown stride. +; CHECK: max backedge-taken count is -1 + +define void @foo5(i32* nocapture %A, i32 %n, i32 %s, i32 %start) mustprogress { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i.05 = phi i32 [ %add, %for.body ], [ %start, %entry ] + %arrayidx = getelementptr inbounds i32, i32* %A, i32 %i.05 + %0 = load i32, i32* %arrayidx, align 4 + %inc = add nsw i32 %0, 1 + store i32 %inc, i32* %arrayidx, align 4 + %add = add nsw i32 %i.05, %s + %cmp = icmp slt i32 %i.05, %n + br i1 %cmp, label %for.body, label %for.end + +for.end: ; preds = %for.body, %entry + ret void +} + !8 = distinct !{!8, !9} !9 = !{!"llvm.loop.mustprogress"} diff --git a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll --- a/llvm/test/Transforms/LoopReroll/nonconst_lb.ll +++ b/llvm/test/Transforms/LoopReroll/nonconst_lb.ll @@ -17,22 +17,24 @@ ; CHECK-NEXT: [[CMP34:%.*]] = icmp slt i32 [[M:%.*]], [[N:%.*]] ; CHECK-NEXT: br i1 [[CMP34]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[N]], -1 -; CHECK-NEXT: [[TMP1:%.*]] = sub i32 [[TMP0]], [[M]] -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 2 -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i32 [[TMP2]], 2 -; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i32 [[TMP3]], 3 +; CHECK-NEXT: [[TMP0:%.*]] = add i32 [[M]], 4 +; CHECK-NEXT: [[SMAX:%.*]] = call i32 @llvm.smax.i32(i32 [[N]], i32 [[TMP0]]) +; CHECK-NEXT: [[TMP1:%.*]] = add i32 [[SMAX]], -1 +; CHECK-NEXT: [[TMP2:%.*]] = sub i32 [[TMP1]], [[M]] +; CHECK-NEXT: [[TMP3:%.*]] = lshr i32 [[TMP2]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i32 [[TMP3]], 2 +; CHECK-NEXT: [[TMP5:%.*]] = add nuw nsw i32 [[TMP4]], 3 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[M]], [[INDVAR]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP6]], 2 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP5]] +; CHECK-NEXT: [[TMP6:%.*]] = add i32 [[M]], [[INDVAR]] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i32 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = load i32, i32* [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP7]], 2 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i32 [[TMP6]] ; CHECK-NEXT: store i32 [[MUL]], i32* [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP4]] +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INDVAR]], [[TMP5]] ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_BODY]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: br label [[FOR_END]] diff --git a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll --- a/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll +++ b/llvm/test/Transforms/LoopVectorize/pr46525-expander-insertpoint.ll @@ -15,16 +15,15 @@ ; CHECK: loop.preheader: ; CHECK-NEXT: [[DIV:%.*]] = udiv i64 [[Y:%.*]], [[ADD]] ; CHECK-NEXT: [[INC:%.*]] = add i64 [[DIV]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[DIV]], 4 -; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], [[INC]] -; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 3, [[INC]] +; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP2]], 1 +; CHECK-NEXT: [[N_RND_UP:%.*]] = add i64 [[TMP1]], 1 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] ; CHECK-NEXT: [[IND_END:%.*]] = mul i64 [[N_VEC]], [[INC]] -; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP2]], 1 +; CHECK-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = sub i64 [[TMP1]], 1 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -35,41 +34,41 @@ ; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT1]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <2 x i64> poison, i64 [[INC]], i32 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <2 x i64> [[DOTSPLATINSERT]], <2 x i64> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = mul <2 x i64> , [[DOTSPLAT]] -; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], [[TMP3]] -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 0, [[INC]] -; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[OFFSET_IDX]], [[TMP4]] +; CHECK-NEXT: [[TMP2:%.*]] = mul <2 x i64> , [[DOTSPLAT]] +; CHECK-NEXT: [[INDUCTION:%.*]] = add <2 x i64> [[BROADCAST_SPLAT2]], [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = mul i64 0, [[INC]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], [[TMP3]] ; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <2 x i64> poison, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <2 x i64> [[BROADCAST_SPLATINSERT3]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[VEC_IV:%.*]] = add <2 x i64> [[BROADCAST_SPLAT4]], -; CHECK-NEXT: [[TMP6:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP6]], i32 0 -; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp ule <2 x i64> [[VEC_IV]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP6:%.*]] = extractelement <2 x i1> [[TMP5]], i32 0 +; CHECK-NEXT: br i1 [[TMP6]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] ; CHECK: pred.store.if: ; CHECK-NEXT: store i32 0, i32* [[PTR:%.*]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE]] ; CHECK: pred.store.continue: -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i1> [[TMP6]], i32 1 -; CHECK-NEXT: br i1 [[TMP8]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] +; CHECK-NEXT: [[TMP7:%.*]] = extractelement <2 x i1> [[TMP5]], i32 1 +; CHECK-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF5:%.*]], label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.if5: ; CHECK-NEXT: store i32 0, i32* [[PTR]], align 4 ; CHECK-NEXT: br label [[PRED_STORE_CONTINUE6]] ; CHECK: pred.store.continue6: ; CHECK-NEXT: [[OFFSET_IDX7:%.*]] = mul i64 [[INDEX]], [[INC]] -; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[OFFSET_IDX7]] to i8 -; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[INC]] to i8 -; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = trunc i64 [[OFFSET_IDX7]] to i8 +; CHECK-NEXT: [[TMP9:%.*]] = trunc i64 [[INC]] to i8 +; CHECK-NEXT: [[BROADCAST_SPLATINSERT8:%.*]] = insertelement <2 x i8> poison, i8 [[TMP8]], i32 0 ; CHECK-NEXT: [[BROADCAST_SPLAT9:%.*]] = shufflevector <2 x i8> [[BROADCAST_SPLATINSERT8]], <2 x i8> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> poison, i8 [[TMP10]], i32 0 +; CHECK-NEXT: [[DOTSPLATINSERT10:%.*]] = insertelement <2 x i8> poison, i8 [[TMP9]], i32 0 ; CHECK-NEXT: [[DOTSPLAT11:%.*]] = shufflevector <2 x i8> [[DOTSPLATINSERT10]], <2 x i8> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP11:%.*]] = mul <2 x i8> , [[DOTSPLAT11]] -; CHECK-NEXT: [[INDUCTION12:%.*]] = add <2 x i8> [[BROADCAST_SPLAT9]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = mul i8 0, [[TMP10]] -; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[TMP9]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = add i8 [[TMP13]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = mul <2 x i8> , [[DOTSPLAT11]] +; CHECK-NEXT: [[INDUCTION12:%.*]] = add <2 x i8> [[BROADCAST_SPLAT9]], [[TMP10]] +; CHECK-NEXT: [[TMP11:%.*]] = mul i8 0, [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = add i8 [[TMP8]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = add i8 [[TMP12]], 1 ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 -; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[LOOP_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -82,7 +81,7 @@ ; CHECK-NEXT: [[V3:%.*]] = add i8 [[V2]], 1 ; CHECK-NEXT: [[CMP15:%.*]] = icmp slt i8 [[V3]], 5 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], [[INC]] -; CHECK-NEXT: br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]], [[LOOP2:!llvm.loop !.*]] +; CHECK-NEXT: br i1 [[CMP15]], label [[LOOP]], label [[LOOP_EXIT]], !llvm.loop [[LOOP2:![0-9]+]] ; CHECK: loop.exit: ; CHECK-NEXT: [[DIV_1:%.*]] = udiv i64 [[Y]], [[ADD]] ; CHECK-NEXT: [[V1:%.*]] = add i64 [[DIV_1]], 1