Index: llvm/trunk/include/llvm/Analysis/ScalarEvolution.h =================================================================== --- llvm/trunk/include/llvm/Analysis/ScalarEvolution.h +++ llvm/trunk/include/llvm/Analysis/ScalarEvolution.h @@ -1330,6 +1330,11 @@ /// prematurely via another branch. unsigned getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock); + /// Returns the upper bound of the loop trip count as a normal unsigned + /// value. + /// Returns 0 if the trip count is unknown or not constant. + unsigned getSmallConstantMaxTripCount(Loop *L); + /// Returns the largest constant divisor of the trip count of the /// loop if it is a single-exit loop and we can compute a small maximum for /// that loop. Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h @@ -290,6 +290,8 @@ /// Apply loop unroll on any kind of loop /// (mainly to loops that fail runtime unrolling). bool Force; + /// Allow using trip count upper bound to unroll loops. + bool UpperBound; }; /// \brief Get target-customized preferences for the generic loop unrolling Index: llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h +++ llvm/trunk/include/llvm/CodeGen/BasicTTIImpl.h @@ -284,7 +284,8 @@ } // Enable runtime and partial unrolling up to the specified size. - UP.Partial = UP.Runtime = true; + // Enable using trip count upper bound to unroll loops. + UP.Partial = UP.Runtime = UP.UpperBound = true; UP.PartialThreshold = MaxOps; // Avoid unrolling when optimizing for size. Index: llvm/trunk/include/llvm/Transforms/Scalar.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Scalar.h +++ llvm/trunk/include/llvm/Transforms/Scalar.h @@ -169,8 +169,9 @@ // LoopUnroll - This pass is a simple loop unrolling pass. // Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1, - int AllowPartial = -1, int Runtime = -1); -// Create an unrolling pass for full unrolling only. + int AllowPartial = -1, int Runtime = -1, + int UpperBound = -1); +// Create an unrolling pass for full unrolling that uses exact trip count only. Pass *createSimpleLoopUnrollPass(); //===----------------------------------------------------------------------===// Index: llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollPass.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollPass.h +++ llvm/trunk/include/llvm/Transforms/Scalar/LoopUnrollPass.h @@ -21,6 +21,7 @@ Optional ProvidedThreshold; Optional ProvidedAllowPartial; Optional ProvidedRuntime; + Optional ProvidedUpperBound; PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); }; Index: llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h +++ llvm/trunk/include/llvm/Transforms/Utils/UnrollLoop.h @@ -32,8 +32,8 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, + bool UseUpperBound, unsigned TripMultiple, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA); bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, Index: llvm/trunk/lib/Analysis/ScalarEvolution.cpp =================================================================== --- llvm/trunk/lib/Analysis/ScalarEvolution.cpp +++ llvm/trunk/lib/Analysis/ScalarEvolution.cpp @@ -5292,6 +5292,20 @@ // Iteration Count Computation Code // +static unsigned getConstantTripCount(const SCEVConstant *ExitCount) { + if (!ExitCount) + return 0; + + ConstantInt *ExitConst = ExitCount->getValue(); + + // Guard against huge trip counts. + if (ExitConst->getValue().getActiveBits() > 32) + return 0; + + // In case of integer overflow, this returns 0, which is correct. + return ((unsigned)ExitConst->getZExtValue()) + 1; +} + unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) { if (BasicBlock *ExitingBB = L->getExitingBlock()) return getSmallConstantTripCount(L, ExitingBB); @@ -5307,17 +5321,13 @@ "Exiting block must actually branch out of the loop!"); const SCEVConstant *ExitCount = dyn_cast(getExitCount(L, ExitingBlock)); - if (!ExitCount) - return 0; - - ConstantInt *ExitConst = ExitCount->getValue(); - - // Guard against huge trip counts. - if (ExitConst->getValue().getActiveBits() > 32) - return 0; + return getConstantTripCount(ExitCount); +} - // In case of integer overflow, this returns 0, which is correct. - return ((unsigned)ExitConst->getZExtValue()) + 1; +unsigned ScalarEvolution::getSmallConstantMaxTripCount(Loop *L) { + const auto *MaxExitCount = + dyn_cast(getMaxBackedgeTakenCount(L)); + return getConstantTripCount(MaxExitCount); } unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) { Index: llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp +++ llvm/trunk/lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -92,6 +92,11 @@ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt UnrollMaxUpperBound( + "unroll-max-upperbound", cl::init(8), cl::Hidden, + cl::desc( + "The max of trip count upper bound that is considered in unrolling")); + static cl::opt PragmaUnrollThreshold( "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " @@ -107,7 +112,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, const TargetTransformInfo &TTI, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, - Optional UserRuntime) { + Optional UserRuntime, Optional UserUpperBound) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -126,6 +131,7 @@ UP.AllowRemainder = true; UP.AllowExpensiveTripCount = false; UP.Force = false; + UP.UpperBound = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, UP); @@ -156,6 +162,8 @@ UP.AllowRemainder = UnrollAllowRemainder; if (UnrollRuntime.getNumOccurrences() > 0) UP.Runtime = UnrollRuntime; + if (UnrollMaxUpperBound == 0) + UP.UpperBound = false; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -168,6 +176,8 @@ UP.Partial = *UserAllowPartial; if (UserRuntime.hasValue()) UP.Runtime = *UserRuntime; + if (UserUpperBound.hasValue()) + UP.UpperBound = *UserUpperBound; return UP; } @@ -691,13 +701,11 @@ // Returns true if unroll count was set explicitly. // Calculates unroll count and writes it to UP.Count. -static bool computeUnrollCount(Loop *L, const TargetTransformInfo &TTI, - DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, - OptimizationRemarkEmitter *ORE, - unsigned TripCount, unsigned TripMultiple, - unsigned LoopSize, - TargetTransformInfo::UnrollingPreferences &UP) { +static bool computeUnrollCount( + Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, + ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, + unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, + TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // BEInsns represents number of instructions optimized when "back edge" // becomes "fall through" in unrolled loop. // For now we count a conditional branch on a backedge and a comparison @@ -749,14 +757,27 @@ } // 3rd priority is full unroll count. - // Full unroll make sense only when TripCount could be staticaly calculated. + // Full unroll makes sense only when TripCount or its upper bound could be + // statically calculated. // Also we need to check if we exceed FullUnrollMaxCount. - if (TripCount && TripCount <= UP.FullUnrollMaxCount) { + // If using the upper bound to unroll, TripMultiple should be set to 1 because + // we do not know when loop may exit. + // MaxTripCount and ExactTripCount cannot both be non zero since we only + // compute the former when the latter is zero. + unsigned ExactTripCount = TripCount; + assert((ExactTripCount == 0 || MaxTripCount == 0) && + "ExtractTripCound and MaxTripCount cannot both be non zero."); + unsigned FullUnrollTripCount = ExactTripCount ? ExactTripCount : MaxTripCount; + if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. - UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns; + UnrolledSize = + (uint64_t)(LoopSize - BEInsns) * FullUnrollTripCount + BEInsns; if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, UnrolledSize, UnrolledSize)) { + UseUpperBound = (MaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; UP.Count = TripCount; return ExplicitUnroll; } else { @@ -764,12 +785,15 @@ // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional Cost = analyzeLoopUnrollCost( - L, TripCount, DT, *SE, TTI, + L, FullUnrollTripCount, DT, *SE, TTI, UP.Threshold + UP.DynamicCostSavingsDiscount)) if (canUnrollCompletely(L, UP.Threshold, UP.PercentDynamicCostSavedThreshold, UP.DynamicCostSavingsDiscount, Cost->UnrolledCost, Cost->RolledDynamicCost)) { + UseUpperBound = (MaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; UP.Count = TripCount; return ExplicitUnroll; } @@ -909,7 +933,8 @@ Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, - Optional ProvidedRuntime) { + Optional ProvidedRuntime, + Optional ProvidedUpperBound) { DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); if (HasUnrollDisablePragma(L)) { @@ -939,6 +964,7 @@ // Find trip count and trip multiple if count is not available unsigned TripCount = 0; + unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -953,7 +979,7 @@ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime); + ProvidedRuntime, ProvidedUpperBound); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) @@ -974,8 +1000,23 @@ if (Convergent) UP.AllowRemainder = false; - bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, &ORE, TripCount, TripMultiple, LoopSize, UP); + // Try to find the trip count upper bound if it is allowed and we cannot find + // exact trip count. + if (UP.UpperBound) { + if (!TripCount) { + MaxTripCount = SE->getSmallConstantMaxTripCount(L); + // Only unroll with small upper bound. + if (MaxTripCount > UnrollMaxUpperBound) + MaxTripCount = 0; + } + } + + // computeUnrollCount() decides whether it is beneficial to use upper bound to + // fully unroll the loop. + bool UseUpperBound = false; + bool IsCountSetExplicitly = + computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, + TripMultiple, LoopSize, UP, UseUpperBound); if (!UP.Count) return false; // Unroll factor (Count) must be less or equal to TripCount. @@ -984,8 +1025,8 @@ // Unroll the loop. if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, - UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC, - &ORE, PreserveLCSSA)) + UP.AllowExpensiveTripCount, UseUpperBound, TripMultiple, LI, + SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; // If loop has an unroll count pragma or unrolled by explicitly set count @@ -1001,10 +1042,11 @@ static char ID; // Pass ID, replacement for typeid LoopUnroll(Optional Threshold = None, Optional Count = None, - Optional AllowPartial = None, Optional Runtime = None) + Optional AllowPartial = None, Optional Runtime = None, + Optional UpperBound = None) : LoopPass(ID), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), - ProvidedRuntime(Runtime) { + ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1012,6 +1054,7 @@ Optional ProvidedThreshold; Optional ProvidedAllowPartial; Optional ProvidedRuntime; + Optional ProvidedUpperBound; bool runOnLoop(Loop *L, LPPassManager &) override { if (skipLoop(L)) @@ -1033,7 +1076,8 @@ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, ProvidedCount, ProvidedThreshold, - ProvidedAllowPartial, ProvidedRuntime); + ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound); } /// This transformation requires natural loop information & requires that @@ -1057,7 +1101,7 @@ INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, - int Runtime) { + int Runtime, int UpperBound) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. @@ -1065,11 +1109,12 @@ Count == -1 ? None : Optional(Count), AllowPartial == -1 ? None : Optional(AllowPartial), - Runtime == -1 ? None : Optional(Runtime)); + Runtime == -1 ? None : Optional(Runtime), + UpperBound == -1 ? None : Optional(UpperBound)); } Pass *llvm::createSimpleLoopUnrollPass() { - return llvm::createLoopUnrollPass(-1, -1, 0, 0); + return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0); } PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) { @@ -1098,9 +1143,10 @@ report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); - bool Changed = tryToUnrollLoop( - &L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, ProvidedCount, - ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime); + bool Changed = + tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, + ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound); if (!Changed) return PreservedAnalyses::all(); Index: llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp +++ llvm/trunk/lib/Transforms/Utils/LoopUnroll.cpp @@ -187,6 +187,10 @@ /// iterations via an early branch, but control may not exit the loop from the /// LatchBlock's terminator prior to TripCount iterations. /// +/// PreserveCondBr indicates whether the conditional branch of the LatchBlock +/// needs to be preserved. It is needed when we use trip count upper bound to +/// fully unroll the loop. +/// /// Similarly, TripMultiple divides the number of times that the LatchBlock may /// execute without exiting the loop. /// @@ -203,9 +207,10 @@ /// DominatorTree if they are non-null. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) { + bool PreserveCondBr, unsigned TripMultiple, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, OptimizationRemarkEmitter *ORE, + bool PreserveLCSSA) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -539,12 +544,16 @@ if (CompletelyUnroll) { if (j == 0) Dest = LoopExit; - NeedConditional = false; - } - - // If we know the trip count or a multiple of it, we can safely use an - // unconditional branch for some iterations. - if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { + // If using trip count upper bound to completely unroll, we need to keep + // the conditional branch except the last one because the loop may exit + // after any iteration. + assert(NeedConditional && + "NeedCondition cannot be modified by both complete " + "unrolling and runtime unrolling"); + NeedConditional = (PreserveCondBr && j); + } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { + // If we know the trip count or a multiple of it, we can safely use an + // unconditional branch for some iterations. NeedConditional = false; } Index: llvm/trunk/test/Transforms/LoopUnroll/AArch64/full-unroll-trip-count-upper-bound.ll =================================================================== --- llvm/trunk/test/Transforms/LoopUnroll/AArch64/full-unroll-trip-count-upper-bound.ll +++ llvm/trunk/test/Transforms/LoopUnroll/AArch64/full-unroll-trip-count-upper-bound.ll @@ -0,0 +1,43 @@ +; RUN: opt -loop-unroll -S -mtriple aarch64 -mcpu=cortex-a57 %s | FileCheck %s -check-prefix=UNROLL +; RUN: opt -loop-unroll -unroll-max-upperbound=0 -S -mtriple aarch64 -mcpu=cortex-a57 %s | FileCheck %s -check-prefix=NOUNROLL + +; This IR comes from this C code: +; +; for (int i = 0; i < 4; i++) { +; if (src[i] == 1) { +; *dst = i; +; break; +; } +; } +; +; This test is meant to check that this loop is unrolled into four iterations. + +; UNROLL-LABEL: @test +; UNROLL: load i32, i32* +; UNROLL: load i32, i32* +; UNROLL: load i32, i32* +; UNROLL: load i32, i32* +; UNROLL-NOT: load i32, i32* +; NOUNROLL-LABEL: @test +; NOUNROLL: load i32, i32* +; NOUNROLL-NOT: load i32, i32* + +define void @test(i32* %dst, i32* %src) { +entry: + br label %for.body + +for.body: ; preds = %entry, %for.body + %i = phi i32 [ 0, %entry ], [ %inc, %for.body ] + %0 = sext i32 %i to i64 + %1 = getelementptr inbounds i32, i32* %src, i64 %0 + %2 = load i32, i32* %1 + %inc = add nsw i32 %i, 1 + %cmp1 = icmp slt i32 %inc, 4 + %cmp3 = icmp eq i32 %2, 1 + %or.cond = and i1 %cmp3, %cmp1 + br i1 %or.cond, label %for.body, label %exit + +exit: ; preds = %for.body + store i32 %i, i32* %dst + ret void +}