Index: include/llvm/Transforms/Scalar/LoopUnrollPass.h
===================================================================
--- include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 #define LLVM_TRANSFORMS_SCALAR_LOOPUNROLLPASS_H
 
+#include "llvm/ADT/Optional.h"
 #include "llvm/Analysis/LoopAnalysisManager.h"
 #include "llvm/IR/PassManager.h"
 
@@ -30,16 +31,69 @@
                         LoopStandardAnalysisResults &AR, LPMUpdater &U);
 };
 
+/// Parameters controlling the transforms performed by the LoopUnroll pass.
+/// Each of the boolean parameters can be set to:
+///      true - enabling the transformation.
+///      false - disabling the transformation.
+///      None - relying on a global default.
+///
+/// There is also an OptLevel parameter, used for additional unroll tuning.
+///
+/// Intended use: create a default object, adjust it with the chained setters,
+/// then pass it to LoopUnrollPass.
+///
+struct LoopUnrollOptions {
+  Optional<bool> AllowPartial;
+  Optional<bool> AllowPeeling;
+  Optional<bool> AllowRuntime;
+  Optional<bool> AllowUpperBound;
+  int OptLevel;
+
+  LoopUnrollOptions(int OptLevel = 2) : OptLevel(OptLevel) {}
+
+  /// Sets whether partial unrolling is allowed (false: full unrolling only).
+  LoopUnrollOptions &setPartial(bool Partial) {
+    AllowPartial = Partial;
+    return *this;
+  }
+
+  /// Enables or disables unrolling of loops with a runtime trip count.
+  LoopUnrollOptions &setRuntime(bool Runtime) {
+    AllowRuntime = Runtime;
+    return *this;
+  }
+
+  /// Enables or disables loop peeling.
+  LoopUnrollOptions &setPeeling(bool Peeling) {
+    AllowPeeling = Peeling;
+    return *this;
+  }
+
+  /// Enables or disables the use of the trip count upper bound
+  /// in loop unrolling.
+  LoopUnrollOptions &setUpperBound(bool UpperBound) {
+    AllowUpperBound = UpperBound;
+    return *this;
+  }
+
+  /// Sets the "optimization level" tuning parameter for loop unrolling.
+  LoopUnrollOptions &setOptLevel(int O) {
+    OptLevel = O;
+    return *this;
+  }
+};
+
 /// Loop unroll pass that will support both full and partial unrolling.
 /// It is a function pass to have access to function and module analyses.
 /// It will also put loops into canonical form (simplified and LCSSA).
 class LoopUnrollPass : public PassInfoMixin<LoopUnrollPass> {
-  const int OptLevel;
+  LoopUnrollOptions UnrollOpts;
 
 public:
   /// This uses the target information (or flags) to control the thresholds for
   /// different unrolling stategies but supports all of them.
-  explicit LoopUnrollPass(int OptLevel = 2) : OptLevel(OptLevel) {}
+  explicit LoopUnrollPass(LoopUnrollOptions UnrollOpts = {})
+      : UnrollOpts(UnrollOpts) {}
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
Index: lib/Passes/PassBuilder.cpp
===================================================================
--- lib/Passes/PassBuilder.cpp
+++ lib/Passes/PassBuilder.cpp
@@ -830,7 +830,7 @@
     OptimizePM.addPass(
         createFunctionToLoopPassAdaptor(LoopUnrollAndJamPass(Level)));
   }
-  OptimizePM.addPass(LoopUnrollPass(Level));
+  OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(Level)));
   OptimizePM.addPass(InstCombinePass());
   OptimizePM.addPass(RequireAnalysisPass());
   OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass(), DebugLogging));
Index: lib/Passes/PassRegistry.def
===================================================================
--- lib/Passes/PassRegistry.def
+++ lib/Passes/PassRegistry.def
@@ -215,6 +215,7 @@
 FUNCTION_PASS("tailcallelim", TailCallElimPass())
 FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
 FUNCTION_PASS("unroll", LoopUnrollPass())
+FUNCTION_PASS("unroll",LoopUnrollPass(LoopUnrollOptions().setPartial(false).setPeeling(true).setRuntime(false).setUpperBound(false)))
 FUNCTION_PASS("verify", VerifierPass())
 FUNCTION_PASS("verify", DominatorTreeVerifierPass())
 FUNCTION_PASS("verify", LoopVerifierPass())
Index: lib/Transforms/Scalar/LoopUnrollPass.cpp
===================================================================
--- lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -1333,23 +1333,20 @@
     Loop *ParentL = L.getParentLoop();
 #endif
 
-    // The API here is quite complex to call, but there are only two interesting
-    // states we support: partial and full (or "simple") unrolling. However, to
-    // enable these things we actually pass "None" in for the optional to avoid
-    // providing an explicit choice.
-    Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam,
-        AllowPeeling;
     // Check if the profile summary indicates that the profiled application
     // has a huge working set size, in which case we disable peeling to avoid
     // bloating it further.
+    Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
     if (PSI && PSI->hasHugeWorkingSetSize())
-      AllowPeeling = false;
+      LocalAllowPeeling = false;
     std::string LoopName = L.getName();
-    LoopUnrollResult Result =
-        tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE,
-                        /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
-                        /*Threshold*/ None, AllowPartialParam, RuntimeParam,
-                        UpperBoundParam, AllowPeeling);
+    // The API here is quite complex to call; the flavors of unrolling to allow
+    // are selected at construction time through UnrollOpts.
+    LoopUnrollResult Result = tryToUnrollLoop(
+        &L, DT, &LI, SE, TTI, AC, ORE,
+        /*PreserveLCSSA*/ true, UnrollOpts.OptLevel, /*Count*/ None,
+        /*Threshold*/ None, UnrollOpts.AllowPartial, UnrollOpts.AllowRuntime,
+        UnrollOpts.AllowUpperBound, LocalAllowPeeling);
     Changed |= Result != LoopUnrollResult::Unmodified;
 
     // The parent must not be damaged by unrolling!
Index: test/Transforms/LoopUnroll/peel-loop.ll =================================================================== --- test/Transforms/LoopUnroll/peel-loop.ll +++ test/Transforms/LoopUnroll/peel-loop.ll @@ -1,4 +1,6 @@ ; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=3 -verify-dom-info -simplifycfg -instcombine | FileCheck %s +; RUN: opt < %s -S -passes='require,unroll,simplify-cfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s +; RUN: opt < %s -S -passes='require,unroll,simplify-cfg,instcombine' -unroll-force-peel-count=3 -verify-dom-info | FileCheck %s ; Basic loop peeling - check that we can peel-off the first 3 loop iterations ; when explicitly requested. Index: test/Transforms/LoopUnroll/runtime-loop.ll =================================================================== --- test/Transforms/LoopUnroll/runtime-loop.ll +++ test/Transforms/LoopUnroll/runtime-loop.ll @@ -1,8 +1,16 @@ ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefixes=EPILOG,COMMON ; RUN: opt < %s -S -loop-unroll -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON - +; ; RUN: opt < %s -S -passes='require,unroll' -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefixes=EPILOG,COMMON ; RUN: opt < %s -S -passes='require,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=PROLOG,COMMON +; +; Restricted versions of unroll (unroll-noruntime, unroll-full) should not be doing runtime unrolling +; even if it is globally enabled through -unroll-runtime option +; +; RUN: opt < %s -S -passes='require,unroll' -unroll-runtime=true -unroll-runtime-epilog=true | FileCheck %s -check-prefixes=NOEPILOG,COMMON +; RUN: opt < %s -S -passes='require,unroll' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=NOPROLOG,COMMON +; RUN: opt < %s -S -passes='require,loop(unroll-full)' -unroll-runtime=true 
-unroll-runtime-epilog=true | FileCheck %s -check-prefixes=NOEPILOG,COMMON +; RUN: opt < %s -S -passes='require,loop(unroll-full)' -unroll-runtime=true -unroll-runtime-epilog=false | FileCheck %s -check-prefixes=NOPROLOG,COMMON target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" @@ -14,22 +22,32 @@ ; EPILOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 ; EPILOG: br i1 %lcmp.mod, label %for.body.epil.preheader, label %for.end.loopexit +; NOEPILOG-NOT: %xtraiter = and i32 %n + ; PROLOG: %xtraiter = and i32 %n ; PROLOG: %lcmp.mod = icmp ne i32 %xtraiter, 0 ; PROLOG: br i1 %lcmp.mod, label %for.body.prol.preheader, label %for.body.prol.loopexit +; NOPROLOG-NOT: %xtraiter = and i32 %n + ; EPILOG: for.body.epil: ; EPILOG: %indvars.iv.epil = phi i64 [ %indvars.iv.next.epil, %for.body.epil ], [ %indvars.iv.unr, %for.body.epil.preheader ] ; EPILOG: %epil.iter.sub = sub i32 %epil.iter, 1 ; EPILOG: %epil.iter.cmp = icmp ne i32 %epil.iter.sub, 0 ; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label %for.end.loopexit.epilog-lcssa, !llvm.loop !0 +; NOEPILOG: for.body: +; NOEPILOG-NOT: for.body.epil: + ; PROLOG: for.body.prol: ; PROLOG: %indvars.iv.prol = phi i64 [ %indvars.iv.next.prol, %for.body.prol ], [ 0, %for.body.prol.preheader ] ; PROLOG: %prol.iter.sub = sub i32 %prol.iter, 1 ; PROLOG: %prol.iter.cmp = icmp ne i32 %prol.iter.sub, 0 ; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit.unr-lcssa, !llvm.loop !0 +; NOPROLOG: for.body: +; NOPROLOG-NOT: for.body.prol: + define i32 @test(i32* nocapture %a, i32 %n) nounwind uwtable readonly { entry: @@ -86,6 +104,8 @@ ; COMMON-LABEL: @foo( ; EPILOG: bb72.2: ; PROLOG: bb72.2: +; NOEPILOG-NOT: bb72.2: +; NOPROLOG-NOT: bb72.2: define void @foo(i32 %trips) { entry: @@ -111,9 +131,15 @@ ; EPILOG: for.body.epil: ; EPILOG: br i1 %epil.iter.cmp, label %for.body.epil, label 
%for.cond.for.end_crit_edge.epilog-lcssa +; NOEPILOG: for.body: +; NOEPILOG-NOT: for.body.epil: + ; PROLOG: for.body.prol: ; PROLOG: br i1 %prol.iter.cmp, label %for.body.prol, label %for.body.prol.loopexit +; NOPROLOG: for.body: +; NOPROLOG-NOT: for.body.prol: + define zeroext i16 @down(i16* nocapture %p, i32 %len) nounwind uwtable readonly { entry: %cmp2 = icmp eq i32 %len, 0 @@ -146,9 +172,15 @@ ; EPILOG: for.body: ; EPILOG-NOT: for.body.epil: +; NOEPILOG: for.body: +; NOEPILOG-NOT: for.body.epil: + ; PROLOG: for.body: ; PROLOG-NOT: for.body.prol: +; NOPROLOG: for.body: +; NOPROLOG-NOT: for.body.prol: + define zeroext i16 @test2(i16* nocapture %p, i32 %len) nounwind uwtable readonly { entry: %cmp2 = icmp eq i32 %len, 0