Index: llvm/include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- llvm/include/llvm/Transforms/Utils/UnrollLoop.h +++ llvm/include/llvm/Transforms/Utils/UnrollLoop.h @@ -70,7 +70,6 @@ bool Force; bool AllowRuntime; bool AllowExpensiveTripCount; - bool PreserveCondBr; unsigned TripMultiple; unsigned PeelCount; bool UnrollRemainder; Index: llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -1166,8 +1166,7 @@ LoopUnrollResult UnrollResult = UnrollLoop( L, {UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount, - UseUpperBound, TripMultiple, PP.PeelCount, UP.UnrollRemainder, - ForgetAllSCEV}, + TripMultiple, PP.PeelCount, UP.UnrollRemainder, ForgetAllSCEV}, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop); if (UnrollResult == LoopUnrollResult::Unmodified) return LoopUnrollResult::Unmodified; Index: llvm/lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopUnroll.cpp +++ llvm/lib/Transforms/Utils/LoopUnroll.cpp @@ -245,18 +245,9 @@ /// branch instruction. However, if the trip count (and multiple) are not known, /// loop unrolling will mostly produce more code that is no faster. /// -/// TripCount is the upper bound of the iteration on which control exits -/// LatchBlock. Control may exit the loop prior to TripCount iterations either -/// via an early branch in other loop block or via LatchBlock terminator. This -/// is relaxed from the general definition of trip count which is the number of -/// times the loop header executes. Note that UnrollLoop assumes that the loop -/// counter test is in LatchBlock in order to remove unnecesssary instances of -/// the test. If control can exit the loop from the LatchBlock's terminator -/// prior to TripCount iterations, flag PreserveCondBr needs to be set. -/// -/// PreserveCondBr indicates whether the conditional branch of the LatchBlock -/// needs to be preserved. It is needed when we use trip count upper bound to -/// fully unroll the loop. +/// TripCount is an upper bound on the number of times the loop header runs. +/// Note that the trip count does not need to be exact, it can be any upper +/// bound on the true trip count. /// /// Similarly, TripMultiple divides the number of times that the LatchBlock may /// execute without exiting the loop. @@ -329,18 +320,6 @@ assert(ULO.TripMultiple > 0); assert(ULO.TripCount == 0 || ULO.TripCount % ULO.TripMultiple == 0); - // Are we eliminating the loop control altogether? - bool CompletelyUnroll = ULO.Count == ULO.TripCount; - - // We assume a run-time trip count if the compiler cannot - // figure out the loop trip count and the unroll-runtime - // flag is specified. - bool RuntimeTripCount = - (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime); - - assert((!RuntimeTripCount || !ULO.PeelCount) && - "Did not expect runtime trip-count unrolling " - "and peeling for the same loop"); bool Peeled = false; if (ULO.PeelCount) { @@ -360,6 +339,21 @@ } } + // Are we eliminating the loop control altogether? Note that we can know + // we're eliminating the backedge without knowing exactly which iteration + // of the unrolled body exits. + const bool CompletelyUnroll = ULO.Count == ULO.TripCount; + + // We assume a run-time trip count if the compiler cannot + // figure out the loop trip count and the unroll-runtime + // flag is specified. + bool RuntimeTripCount = + (ULO.TripCount == 0 && ULO.Count > 0 && ULO.AllowRuntime); + + assert((!RuntimeTripCount || !ULO.PeelCount) && + "Did not expect runtime trip-count unrolling " + "and peeling for the same loop"); + // All these values should be taken only after peeling because they might have // changed. BasicBlock *Preheader = L->getLoopPreheader(); @@ -382,6 +376,8 @@ const unsigned MaxTripCount = SE->getSmallConstantMaxTripCount(L); const bool MaxOrZero = SE->isBackedgeTakenCountMaxOrZero(L); + const unsigned ExactTripCount = SE->getSmallConstantTripCount(L); + const bool ExactUnroll = (ExactTripCount && ExactTripCount == ULO.Count); const bool PreserveOnlyFirst = ULO.Count == MaxTripCount && MaxOrZero; @@ -740,7 +736,7 @@ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - if (ExitingBI) { + if (ExitingBI) { auto SetDest = [&](BasicBlock *Src, bool WillExit, bool ExitOnTrue) { auto *Term = cast(Src->getTerminator()); const unsigned Idx = ExitOnTrue ^ WillExit; @@ -759,9 +755,22 @@ auto WillExit = [&](unsigned i, unsigned j) -> Optional { if (CompletelyUnroll) { - if (ULO.PreserveCondBr && j && !(PreserveOnlyFirst && i != 0)) - return None; - return j == 0; + ; + if (PreserveOnlyFirst) { + if (i == 0) + return None; + return j == 0; + } + if (ExactUnroll) + return j == 0; + // Full, but non-exact unrolling + if (j == 0) + return true; + if (MaxTripCount && j >= MaxTripCount) + return false; + if (ExactTripCount && j != ExactTripCount) + return false; + return None; } if (RuntimeTripCount && j != 0) Index: llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp +++ llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp @@ -986,8 +986,7 @@ UnrollLoop(remainderLoop, {/*Count*/ Count - 1, /*TripCount*/ Count - 1, /*Force*/ false, /*AllowRuntime*/ false, - /*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true, - /*TripMultiple*/ 1, + /*AllowExpensiveTripCount*/ false, /*TripMultiple*/ 1, /*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV}, LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA); } Index: llvm/test/Transforms/LoopUnroll/pr45939-peel-count-and-complete-unroll.ll =================================================================== --- llvm/test/Transforms/LoopUnroll/pr45939-peel-count-and-complete-unroll.ll +++ llvm/test/Transforms/LoopUnroll/pr45939-peel-count-and-complete-unroll.ll @@ -36,19 +36,47 @@ ; PEEL2: entry.peel.newph: ; PEEL2-NEXT: br label [[FOR_BODY:%.*]] ; PEEL2: for.body: -; PEEL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_PEEL4]] -; PEEL2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV_NEXT_PEEL4]] to i32 +; PEEL2-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PEEL4]], [[ENTRY_PEEL_NEWPH]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY_6:%.*]] ] +; PEEL2-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV]] +; PEEL2-NEXT: [[TMP2:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; PEEL2-NEXT: store i32 [[TMP2]], i32* [[ARRAYIDX]], align 4 -; PEEL2-NEXT: store i32 3, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 3), align 4 -; PEEL2-NEXT: store i32 4, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 4), align 4 -; PEEL2-NEXT: store i32 5, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 5), align 4 -; PEEL2-NEXT: store i32 6, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 6), align 4 -; PEEL2-NEXT: store i32 7, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 0, i64 7), align 4 -; PEEL2-NEXT: store i32 8, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @a, i64 1, i64 0), align 4 -; PEEL2-NEXT: store i32 9, i32* getelementptr ([8 x i32], [8 x i32]* @a, i64 1, i64 1), align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; PEEL2-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT]] +; PEEL2-NEXT: [[TMP3:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 +; PEEL2-NEXT: store i32 [[TMP3]], i32* [[ARRAYIDX_1]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT]], 1 +; PEEL2-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_1]] +; PEEL2-NEXT: [[TMP4:%.*]] = trunc i64 [[INDVARS_IV_NEXT_1]] to i32 +; PEEL2-NEXT: store i32 [[TMP4]], i32* [[ARRAYIDX_2]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1 +; PEEL2-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_2]] +; PEEL2-NEXT: [[TMP5:%.*]] = trunc i64 [[INDVARS_IV_NEXT_2]] to i32 +; PEEL2-NEXT: store i32 [[TMP5]], i32* [[ARRAYIDX_3]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 +; PEEL2-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_3]] +; PEEL2-NEXT: [[TMP6:%.*]] = trunc i64 [[INDVARS_IV_NEXT_3]] to i32 +; PEEL2-NEXT: store i32 [[TMP6]], i32* [[ARRAYIDX_4]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_3]], 1 +; PEEL2-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_4]] +; PEEL2-NEXT: [[TMP7:%.*]] = trunc i64 [[INDVARS_IV_NEXT_4]] to i32 +; PEEL2-NEXT: store i32 [[TMP7]], i32* [[ARRAYIDX_5]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_4]], 1 +; PEEL2-NEXT: [[EXITCOND_5:%.*]] = icmp ne i64 [[INDVARS_IV_NEXT_5]], 8 +; PEEL2-NEXT: br i1 [[EXITCOND_5]], label [[FOR_BODY_6]], label [[FOR_EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PEEL2: for.exit.loopexit: ; PEEL2-NEXT: br label [[FOR_EXIT]] ; PEEL2: for.exit: ; PEEL2-NEXT: ret void +; PEEL2: for.body.6: +; PEEL2-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_5]] +; PEEL2-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT_5]] to i32 +; PEEL2-NEXT: store i32 [[TMP8]], i32* [[ARRAYIDX_6]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_5]], 1 +; PEEL2-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_6]] +; PEEL2-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV_NEXT_6]] to i32 +; PEEL2-NEXT: store i32 [[TMP9]], i32* [[ARRAYIDX_7]], align 4 +; PEEL2-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV_NEXT_6]], 1 +; PEEL2-NEXT: br label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; ; PEEL8-LABEL: @test1( ; PEEL8-NEXT: entry: @@ -132,40 +160,58 @@ ; PEEL8: entry.peel.newph: ; PEEL8-NEXT: br label [[FOR_BODY:%.*]] ; PEEL8: for.body: -; PEEL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_PEEL34]] -; PEEL8-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV_NEXT_PEEL34]] to i32 +; PEEL8-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_PEEL34]], [[ENTRY_PEEL_NEWPH]] ], [ [[INDVARS_IV_NEXT_7:%.*]], [[FOR_BODY_7:%.*]] ] +; PEEL8-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV]] +; PEEL8-NEXT: [[TMP8:%.*]] = trunc i64 [[INDVARS_IV]] to i32 ; PEEL8-NEXT: store i32 [[TMP8]], i32* [[ARRAYIDX]], align 4 -; PEEL8-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_PEEL34]], 1 +; PEEL8-NEXT: [[INDVARS_IV_NEXT:%.*]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_1:%.*]], label [[FOR_EXIT_LOOPEXIT:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PEEL8: for.exit.loopexit: +; PEEL8-NEXT: br label [[FOR_EXIT]] +; PEEL8: for.exit: +; PEEL8-NEXT: ret void +; PEEL8: for.body.1: ; PEEL8-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT]] ; PEEL8-NEXT: [[TMP9:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 ; PEEL8-NEXT: store i32 [[TMP9]], i32* [[ARRAYIDX_1]], align 4 ; PEEL8-NEXT: [[INDVARS_IV_NEXT_1:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_2:%.*]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP0]] +; PEEL8: for.body.2: ; PEEL8-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_1]] ; PEEL8-NEXT: [[TMP10:%.*]] = trunc i64 [[INDVARS_IV_NEXT_1]] to i32 ; PEEL8-NEXT: store i32 [[TMP10]], i32* [[ARRAYIDX_2]], align 4 ; PEEL8-NEXT: [[INDVARS_IV_NEXT_2:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_1]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_3:%.*]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP0]] +; PEEL8: for.body.3: ; PEEL8-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_2]] ; PEEL8-NEXT: [[TMP11:%.*]] = trunc i64 [[INDVARS_IV_NEXT_2]] to i32 ; PEEL8-NEXT: store i32 [[TMP11]], i32* [[ARRAYIDX_3]], align 4 ; PEEL8-NEXT: [[INDVARS_IV_NEXT_3:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_2]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_4:%.*]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP0]] +; PEEL8: for.body.4: ; PEEL8-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_3]] ; PEEL8-NEXT: [[TMP12:%.*]] = trunc i64 [[INDVARS_IV_NEXT_3]] to i32 ; PEEL8-NEXT: store i32 [[TMP12]], i32* [[ARRAYIDX_4]], align 4 ; PEEL8-NEXT: [[INDVARS_IV_NEXT_4:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_3]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_5:%.*]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP0]] +; PEEL8: for.body.5: ; PEEL8-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_4]] ; PEEL8-NEXT: [[TMP13:%.*]] = trunc i64 [[INDVARS_IV_NEXT_4]] to i32 ; PEEL8-NEXT: store i32 [[TMP13]], i32* [[ARRAYIDX_5]], align 4 ; PEEL8-NEXT: [[INDVARS_IV_NEXT_5:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_4]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_6:%.*]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP0]] +; PEEL8: for.body.6: ; PEEL8-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_5]] ; PEEL8-NEXT: [[TMP14:%.*]] = trunc i64 [[INDVARS_IV_NEXT_5]] to i32 ; PEEL8-NEXT: store i32 [[TMP14]], i32* [[ARRAYIDX_6]], align 4 ; PEEL8-NEXT: [[INDVARS_IV_NEXT_6:%.*]] = add nuw nsw i64 [[INDVARS_IV_NEXT_5]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY_7]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP0]] +; PEEL8: for.body.7: ; PEEL8-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds [8 x i32], [8 x i32]* @a, i64 0, i64 [[INDVARS_IV_NEXT_6]] ; PEEL8-NEXT: [[TMP15:%.*]] = trunc i64 [[INDVARS_IV_NEXT_6]] to i32 ; PEEL8-NEXT: store i32 [[TMP15]], i32* [[ARRAYIDX_7]], align 4 -; PEEL8-NEXT: br label [[FOR_EXIT]] -; PEEL8: for.exit: -; PEEL8-NEXT: ret void +; PEEL8-NEXT: [[INDVARS_IV_NEXT_7]] = add nuw nsw i64 [[INDVARS_IV_NEXT_6]], 1 +; PEEL8-NEXT: br i1 true, label [[FOR_BODY]], label [[FOR_EXIT_LOOPEXIT]], !llvm.loop [[LOOP2:![0-9]+]] ; ; PEEL2UNROLL2-LABEL: @test1( ; PEEL2UNROLL2-NEXT: entry: