Index: include/llvm/Analysis/ScalarEvolution.h =================================================================== --- include/llvm/Analysis/ScalarEvolution.h +++ include/llvm/Analysis/ScalarEvolution.h @@ -1392,6 +1392,11 @@ /// prematurely via another branch. unsigned getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock); + /// Returns the upper bound of the loop trip count as a normal unsigned + /// value. + /// Returns 0 if the trip count is unknown or not constant. + unsigned getSmallConstantMaxTripCount(Loop *L); + /// Returns the largest constant divisor of the trip count of the /// loop if it is a single-exit loop and we can compute a small maximum for /// that loop. Index: include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- include/llvm/Analysis/TargetTransformInfo.h +++ include/llvm/Analysis/TargetTransformInfo.h @@ -288,6 +288,8 @@ /// Apply loop unroll on any kind of loop /// (mainly to loops that fail runtime unrolling). bool Force; + /// Allow using trip count upper bound to unroll loops. + bool UpperBound; }; /// \brief Get target-customized preferences for the generic loop unrolling Index: include/llvm/CodeGen/BasicTTIImpl.h =================================================================== --- include/llvm/CodeGen/BasicTTIImpl.h +++ include/llvm/CodeGen/BasicTTIImpl.h @@ -284,7 +284,8 @@ } // Enable runtime and partial unrolling up to the specified size. - UP.Partial = UP.Runtime = true; + // Enable using trip count upper bound to unroll loops. + UP.Partial = UP.Runtime = UP.UpperBound = true; UP.PartialThreshold = MaxOps; // Avoid unrolling when optimizing for size. Index: include/llvm/Transforms/Scalar.h =================================================================== --- include/llvm/Transforms/Scalar.h +++ include/llvm/Transforms/Scalar.h @@ -169,8 +169,9 @@ // LoopUnroll - This pass is a simple loop unrolling pass. // Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1, - int AllowPartial = -1, int Runtime = -1); -// Create an unrolling pass for full unrolling only. + int AllowPartial = -1, int Runtime = -1, + int UpperBound = -1); +// Create an unrolling pass for full unrolling that uses exact trip count only. Pass *createSimpleLoopUnrollPass(); //===----------------------------------------------------------------------===// Index: include/llvm/Transforms/Scalar/LoopUnrollPass.h =================================================================== --- include/llvm/Transforms/Scalar/LoopUnrollPass.h +++ include/llvm/Transforms/Scalar/LoopUnrollPass.h @@ -21,6 +21,7 @@ Optional ProvidedThreshold; Optional ProvidedAllowPartial; Optional ProvidedRuntime; + Optional ProvidedUpperBound; PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM); }; Index: include/llvm/Transforms/Utils/UnrollLoop.h =================================================================== --- include/llvm/Transforms/Utils/UnrollLoop.h +++ include/llvm/Transforms/Utils/UnrollLoop.h @@ -32,8 +32,8 @@ bool UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, + bool UseUpperBound, unsigned TripMultiple, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC, OptimizationRemarkEmitter *ORE, bool PreserveLCSSA); bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count, Index: lib/Analysis/ScalarEvolution.cpp =================================================================== --- lib/Analysis/ScalarEvolution.cpp +++ lib/Analysis/ScalarEvolution.cpp @@ -5313,6 +5313,20 @@ // Iteration Count Computation Code // +static unsigned TranslateToConstantTripCount(const SCEVConstant *ExitCount) { + if (!ExitCount) + return 0; + + ConstantInt *ExitConst = ExitCount->getValue(); + + // Guard against huge trip counts. + if (ExitConst->getValue().getActiveBits() > 32) + return 0; + + // In case of integer overflow, this returns 0, which is correct. + return ((unsigned)ExitConst->getZExtValue()) + 1; +} + unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) { if (BasicBlock *ExitingBB = L->getExitingBlock()) return getSmallConstantTripCount(L, ExitingBB); @@ -5328,17 +5342,13 @@ "Exiting block must actually branch out of the loop!"); const SCEVConstant *ExitCount = dyn_cast(getExitCount(L, ExitingBlock)); - if (!ExitCount) - return 0; - - ConstantInt *ExitConst = ExitCount->getValue(); - - // Guard against huge trip counts. - if (ExitConst->getValue().getActiveBits() > 32) - return 0; + return TranslateToConstantTripCount(ExitCount); +} - // In case of integer overflow, this returns 0, which is correct. - return ((unsigned)ExitConst->getZExtValue()) + 1; +unsigned ScalarEvolution::getSmallConstantMaxTripCount(Loop *L) { + const auto *MaxExitCount = + dyn_cast(getMaxBackedgeTakenCount(L)); + return TranslateToConstantTripCount(MaxExitCount); } unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) { Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -92,6 +92,15 @@ UnrollRuntime("unroll-runtime", cl::ZeroOrMore, cl::Hidden, cl::desc("Unroll loops with run-time trip counts")); +static cl::opt UnrollUpperBound( + "unroll-upperbound", cl::ZeroOrMore, cl::Hidden, + cl::desc("Unroll loops with the upper bounds of the trip counts")); + +static cl::opt UnrollMaxUpperBound( + "unroll-max-upperbound", cl::init(8), cl::Hidden, + cl::desc( + "The max of trip count upper bound that is considered in unrolling")); + static cl::opt PragmaUnrollThreshold( "pragma-unroll-threshold", cl::init(16 * 1024), cl::Hidden, cl::desc("Unrolled size limit for loops with an unroll(full) or " @@ -111,7 +120,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences( Loop *L, const TargetTransformInfo &TTI, Optional UserThreshold, Optional UserCount, Optional UserAllowPartial, - Optional UserRuntime) { + Optional UserRuntime, Optional UserUpperBound) { TargetTransformInfo::UnrollingPreferences UP; // Set up the defaults @@ -129,6 +138,7 @@ UP.AllowRemainder = true; UP.AllowExpensiveTripCount = false; UP.Force = false; + UP.UpperBound = false; // Override with any target specific settings TTI.getUnrollingPreferences(L, UP); @@ -159,6 +169,8 @@ UP.AllowRemainder = UnrollAllowRemainder; if (UnrollRuntime.getNumOccurrences() > 0) UP.Runtime = UnrollRuntime; + if (UnrollUpperBound.getNumOccurrences() > 0) + UP.UpperBound = UnrollUpperBound; // Apply user values provided by argument if (UserThreshold.hasValue()) { @@ -171,6 +183,8 @@ UP.Partial = *UserAllowPartial; if (UserRuntime.hasValue()) UP.Runtime = *UserRuntime; + if (UserUpperBound.hasValue()) + UP.UpperBound = *UserUpperBound; return UP; } @@ -695,8 +709,8 @@ DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, - unsigned TripCount, unsigned TripMultiple, - unsigned LoopSize, + unsigned &TripCount, unsigned MaxTripCount, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP) { // BEInsns represents number of instructions optimized when "back edge" // becomes "fall through" in unrolled loop. @@ -749,14 +763,22 @@ } // 3rd priority is full unroll count. - // Full unroll make sense only when TripCount could be staticaly calculated. + // Full unroll make sense only when TripCount or its upper bound could be + // staticaly calculated. // Also we need to check if we exceed FullUnrollMaxCount. - if (TripCount && TripCount <= UP.FullUnrollMaxCount) { + // If using the upper bound to unroll, TripMultiple should be set to 1 because + // we do not know when loop may exit. + unsigned FullUnrollTripCount = TripCount ? TripCount : MaxTripCount; + if (FullUnrollTripCount && FullUnrollTripCount <= UP.FullUnrollMaxCount) { // When computing the unrolled size, note that BEInsns are not replicated // like the rest of the loop body. - UnrolledSize = (uint64_t)(LoopSize - BEInsns) * TripCount + BEInsns; + UnrolledSize = + (uint64_t)(LoopSize - BEInsns) * FullUnrollTripCount + BEInsns; if (canUnrollCompletely(L, UP.Threshold, 100, UP.DynamicCostSavingsDiscount, UnrolledSize, UnrolledSize)) { + UP.UpperBound = (MaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; UP.Count = TripCount; return ExplicitUnroll; } else { @@ -764,12 +786,15 @@ // helps to remove a significant number of instructions. // To check that, run additional analysis on the loop. if (Optional Cost = analyzeLoopUnrollCost( - L, TripCount, DT, *SE, TTI, + L, FullUnrollTripCount, DT, *SE, TTI, UP.Threshold + UP.DynamicCostSavingsDiscount)) if (canUnrollCompletely(L, UP.Threshold, UP.PercentDynamicCostSavedThreshold, UP.DynamicCostSavingsDiscount, Cost->UnrolledCost, Cost->RolledDynamicCost)) { + UP.UpperBound = (MaxTripCount == FullUnrollTripCount); + TripCount = FullUnrollTripCount; + TripMultiple = UP.UpperBound ? 1 : TripMultiple; UP.Count = TripCount; return ExplicitUnroll; } @@ -902,7 +927,8 @@ Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, - Optional ProvidedRuntime) { + Optional ProvidedRuntime, + Optional ProvidedUpperBound) { DEBUG(dbgs() << "Loop Unroll: F[" << L->getHeader()->getParent()->getName() << "] Loop %" << L->getHeader()->getName() << "\n"); if (HasUnrollDisablePragma(L)) { @@ -932,6 +958,7 @@ // Find trip count and trip multiple if count is not available unsigned TripCount = 0; + unsigned MaxTripCount = 0; unsigned TripMultiple = 1; // If there are multiple exiting blocks but one of them is the latch, use the // latch for the trip count estimation. Otherwise insist on a single exiting @@ -946,7 +973,7 @@ TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences( L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial, - ProvidedRuntime); + ProvidedRuntime, ProvidedUpperBound); // Exit early if unrolling is disabled. if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0)) @@ -967,8 +994,22 @@ if (Convergent) UP.AllowRemainder = false; - bool IsCountSetExplicitly = computeUnrollCount( - L, TTI, DT, LI, SE, &ORE, TripCount, TripMultiple, LoopSize, UP); + // Try to find the trip count upper bound if it is allowed. + if (UP.UpperBound) { + if (!TripCount) { + MaxTripCount = SE->getSmallConstantMaxTripCount(L); + // Only unroll with small upper bound. + if (MaxTripCount > UnrollMaxUpperBound) + MaxTripCount = 0; + } + // computeUnrollCount() will set UP.UpperBound to true later if using the + // upper bound to unroll meets the heuristics. + UP.UpperBound = false; + } + + bool IsCountSetExplicitly = + computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, + TripMultiple, LoopSize, UP); if (!UP.Count) return false; // Unroll factor (Count) must be less or equal to TripCount. @@ -977,8 +1018,8 @@ // Unroll the loop. if (!UnrollLoop(L, UP.Count, TripCount, UP.Force, UP.Runtime, - UP.AllowExpensiveTripCount, TripMultiple, LI, SE, &DT, &AC, - &ORE, PreserveLCSSA)) + UP.AllowExpensiveTripCount, UP.UpperBound, TripMultiple, LI, + SE, &DT, &AC, &ORE, PreserveLCSSA)) return false; // If loop has an unroll count pragma or unrolled by explicitly set count @@ -994,10 +1035,11 @@ static char ID; // Pass ID, replacement for typeid LoopUnroll(Optional Threshold = None, Optional Count = None, - Optional AllowPartial = None, Optional Runtime = None) + Optional AllowPartial = None, Optional Runtime = None, + Optional UpperBound = None) : LoopPass(ID), ProvidedCount(std::move(Count)), ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial), - ProvidedRuntime(Runtime) { + ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) { initializeLoopUnrollPass(*PassRegistry::getPassRegistry()); } @@ -1005,6 +1047,7 @@ Optional ProvidedThreshold; Optional ProvidedAllowPartial; Optional ProvidedRuntime; + Optional ProvidedUpperBound; bool runOnLoop(Loop *L, LPPassManager &) override { if (skipLoop(L)) @@ -1026,7 +1069,8 @@ return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, ProvidedCount, ProvidedThreshold, - ProvidedAllowPartial, ProvidedRuntime); + ProvidedAllowPartial, ProvidedRuntime, + ProvidedUpperBound); } /// This transformation requires natural loop information & requires that @@ -1050,7 +1094,7 @@ INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial, - int Runtime) { + int Runtime, int UpperBound) { // TODO: It would make more sense for this function to take the optionals // directly, but that's dangerous since it would silently break out of tree // callers. @@ -1058,11 +1102,12 @@ Count == -1 ? None : Optional(Count), AllowPartial == -1 ? None : Optional(AllowPartial), - Runtime == -1 ? None : Optional(Runtime)); + Runtime == -1 ? None : Optional(Runtime), + UpperBound == -1 ? None : Optional(UpperBound)); } Pass *llvm::createSimpleLoopUnrollPass() { - return llvm::createLoopUnrollPass(-1, -1, 0, 0); + return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0); } PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM) { @@ -1091,9 +1136,10 @@ report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); - bool Changed = tryToUnrollLoop( - &L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, ProvidedCount, - ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime); + bool Changed = + tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, + ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound); if (!Changed) return PreservedAnalyses::all(); Index: lib/Transforms/Utils/LoopUnroll.cpp =================================================================== --- lib/Transforms/Utils/LoopUnroll.cpp +++ lib/Transforms/Utils/LoopUnroll.cpp @@ -203,9 +203,10 @@ /// DominatorTree if they are non-null. bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force, bool AllowRuntime, bool AllowExpensiveTripCount, - unsigned TripMultiple, LoopInfo *LI, ScalarEvolution *SE, - DominatorTree *DT, AssumptionCache *AC, - OptimizationRemarkEmitter *ORE, bool PreserveLCSSA) { + bool UseUpperBound, unsigned TripMultiple, LoopInfo *LI, + ScalarEvolution *SE, DominatorTree *DT, + AssumptionCache *AC, OptimizationRemarkEmitter *ORE, + bool PreserveLCSSA) { BasicBlock *Preheader = L->getLoopPreheader(); if (!Preheader) { DEBUG(dbgs() << " Can't unroll; loop preheader-insertion failed.\n"); @@ -536,12 +537,13 @@ if (CompletelyUnroll) { if (j == 0) Dest = LoopExit; - NeedConditional = false; - } - - // If we know the trip count or a multiple of it, we can safely use an - // unconditional branch for some iterations. - if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { + // If using trip count upper bound to completely unroll, we need to keep + // the conditional branch except the last one because the loop may exit + // after any iteration. + NeedConditional = (UseUpperBound && j); + } else if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) { + // If we know the trip count or a multiple of it, we can safely use an + // unconditional branch for some iterations. NeedConditional = false; } Index: test/CodeGen/AMDGPU/tti-unroll-prefs.ll =================================================================== --- test/CodeGen/AMDGPU/tti-unroll-prefs.ll +++ test/CodeGen/AMDGPU/tti-unroll-prefs.ll @@ -1,4 +1,4 @@ -; RUN: opt -loop-unroll -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s +; RUN: opt -loop-unroll -unroll-upperbound -S -mtriple=amdgcn-- -mcpu=SI %s | FileCheck %s ; This IR comes from this OpenCL C code: ; @@ -12,13 +12,14 @@ ; } ; ; This test is meant to check that this loop isn't unrolled into more than -; four iterations. The loop unrolling preferences we currently use cause this -; loop to not be unrolled at all, but that may change in the future. +; four iterations. ; CHECK-LABEL: @test ; CHECK: store i8 0, i8 addrspace(1)* +; CHECK: store i8 0, i8 addrspace(1)* +; CHECK: store i8 0, i8 addrspace(1)* +; CHECK: store i8 0, i8 addrspace(1)* ; CHECK-NOT: store i8 0, i8 addrspace(1)* -; CHECK: ret void define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) { entry: %add = add nsw i32 %b, 4