Index: include/llvm/Analysis/LoopPassManager.h =================================================================== --- include/llvm/Analysis/LoopPassManager.h +++ include/llvm/Analysis/LoopPassManager.h @@ -48,6 +48,9 @@ typedef OuterAnalysisManagerProxy FunctionAnalysisManagerLoopProxy; +typedef OuterAnalysisManagerProxy + ModuleAnalysisManagerLoopProxy; + /// Returns the minimum set of Analyses that all loop passes must preserve. PreservedAnalyses getLoopPassPreservedAnalyses(); Index: lib/Passes/PassBuilder.cpp =================================================================== --- lib/Passes/PassBuilder.cpp +++ lib/Passes/PassBuilder.cpp @@ -774,6 +774,7 @@ FAM.registerPass([&] { return ModuleAnalysisManagerFunctionProxy(MAM); }); FAM.registerPass([&] { return LoopAnalysisManagerFunctionProxy(LAM); }); LAM.registerPass([&] { return FunctionAnalysisManagerLoopProxy(FAM); }); + LAM.registerPass([&] { return ModuleAnalysisManagerLoopProxy(MAM); }); } bool PassBuilder::parseModulePassPipeline(ModulePassManager &MPM, Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -15,6 +15,7 @@ #include "llvm/Transforms/Scalar/LoopUnrollPass.h" #include "llvm/ADT/SetVector.h" #include "llvm/Analysis/AssumptionCache.h" +#include "llvm/Analysis/BlockFrequencyInfo.h" #include "llvm/Analysis/CodeMetrics.h" #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/InstructionSimplify.h" @@ -22,6 +23,7 @@ #include "llvm/Analysis/LoopPassManager.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" @@ -102,6 +104,12 @@ cl::desc("Unrolled size limit for loops with an unroll(full) or " "unroll_count pragma.")); +static cl::opt FlatLoopTripCountThreshold( + "flat-loop-tripcount-threshold", cl::init(5), cl::Hidden, + cl::desc("If the runtime tripcount for the loop is lower than the " + "threshold, the loop is considered as flat and will be less " + "aggressively unrolled.")); + /// A magic value for use with the Threshold parameter to indicate /// that the loop unroll should be performed regardless of how much /// code expansion would result. @@ -712,8 +720,9 @@ // Calculates unroll count and writes it to UP.Count. static bool computeUnrollCount( Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT, LoopInfo *LI, - ScalarEvolution *SE, OptimizationRemarkEmitter *ORE, unsigned &TripCount, - unsigned MaxTripCount, unsigned &TripMultiple, unsigned LoopSize, + ScalarEvolution *SE, BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, + OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount, + unsigned &TripMultiple, unsigned LoopSize, TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) { // Check for explicit Count. // 1st priority is unroll count set by "unroll-count" option. @@ -748,6 +757,19 @@ bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll || PragmaEnableUnroll || UserUnrollCount; + if (L->getHeader()->getParent()->getEntryCount()) { + if (TripCount == 0 && + BFI->getBlockFreq(L->getHeader()) < + FlatLoopTripCountThreshold * + BFI->getBlockFreq(L->getLoopPreheader()).getFrequency()) { + UP.PartialThreshold /= 4; + } else if (PSI->isHotBB(L->getHeader(), BFI)) { + UP.Threshold *= 4; + UP.PartialThreshold *= 4; + UP.AllowExpensiveTripCount = true; + } + } + if (ExplicitUnroll && TripCount != 0) { // If the loop has an unrolling pragma, we want to be more aggressive with // unrolling limits. Set thresholds to at least the PragmaThreshold value @@ -923,8 +945,9 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution *SE, const TargetTransformInfo &TTI, - AssumptionCache &AC, OptimizationRemarkEmitter &ORE, - bool PreserveLCSSA, + AssumptionCache &AC, BlockFrequencyInfo *BFI, + ProfileSummaryInfo *PSI, + OptimizationRemarkEmitter &ORE, bool PreserveLCSSA, Optional ProvidedCount, Optional ProvidedThreshold, Optional ProvidedAllowPartial, @@ -1016,9 +1039,9 @@ // computeUnrollCount() decides whether it is beneficial to use upper bound to // fully unroll the loop. bool UseUpperBound = false; - bool IsCountSetExplicitly = - computeUnrollCount(L, TTI, DT, LI, SE, &ORE, TripCount, MaxTripCount, - TripMultiple, LoopSize, UP, UseUpperBound); + bool IsCountSetExplicitly = computeUnrollCount( + L, TTI, DT, LI, SE, BFI, PSI, &ORE, TripCount, MaxTripCount, TripMultiple, + LoopSize, UP, UseUpperBound); if (!UP.Count) return false; // Unroll factor (Count) must be less or equal to TripCount. @@ -1067,6 +1090,10 @@ auto &DT = getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); + BlockFrequencyInfo *BFI = + &getAnalysis().getBFI(); + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); const TargetTransformInfo &TTI = getAnalysis().getTTI(F); auto &AC = getAnalysis().getAssumptionCache(F); @@ -1076,7 +1103,7 @@ OptimizationRemarkEmitter ORE(&F); bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID); - return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, + return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, BFI, PSI, ORE, PreserveLCSSA, ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound); @@ -1087,6 +1114,8 @@ /// void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); + AU.addRequired(); + AU.addRequired(); AU.addRequired(); // FIXME: Loop passes are required to preserve domtree, and for now we just // recreate dom info if anything gets unrolled. @@ -1098,6 +1127,8 @@ char LoopUnroll::ID = 0; INITIALIZE_PASS_BEGIN(LoopUnroll, "loop-unroll", "Unroll loops", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) +INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopPass) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false) @@ -1123,13 +1154,16 @@ const auto &FAM = AM.getResult(L).getManager(); Function *F = L.getHeader()->getParent(); - + const auto &MAM = + AM.getResult(L).getManager(); DominatorTree *DT = FAM.getCachedResult(*F); LoopInfo *LI = FAM.getCachedResult(*F); ScalarEvolution *SE = FAM.getCachedResult(*F); auto *TTI = FAM.getCachedResult(*F); auto *AC = FAM.getCachedResult(*F); + auto *BFI = FAM.getCachedResult(*F); + auto *PSI = MAM.getCachedResult(*F->getParent()); auto *ORE = FAM.getCachedResult(*F); if (!DT) report_fatal_error( @@ -1146,14 +1180,20 @@ if (!AC) report_fatal_error( "LoopUnrollPass: AssumptionAnalysis not cached at a higher level"); + if (!BFI) + report_fatal_error( + "LoopUnrollPass: BlockFrequencyAnalysis not cached at a higher level"); + if (!PSI) + report_fatal_error( + "LoopUnrollPass: ProfileSummaryAnalysis not cached at a higher level"); if (!ORE) report_fatal_error("LoopUnrollPass: OptimizationRemarkEmitterAnalysis not " "cached at a higher level"); - bool Changed = - tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, *ORE, /*PreserveLCSSA*/ true, - ProvidedCount, ProvidedThreshold, ProvidedAllowPartial, - ProvidedRuntime, ProvidedUpperBound); + bool Changed = tryToUnrollLoop(&L, *DT, LI, SE, *TTI, *AC, BFI, PSI, + *ORE, /*PreserveLCSSA*/ true, ProvidedCount, + ProvidedThreshold, ProvidedAllowPartial, + ProvidedRuntime, ProvidedUpperBound); if (!Changed) return PreservedAnalyses::all(); Index: test/Other/pass-pipelines.ll =================================================================== --- test/Other/pass-pipelines.ll +++ test/Other/pass-pipelines.ll @@ -46,6 +46,8 @@ ; CHECK-O2-NOT: Manager ; CHECK-O2: Loop Pass Manager ; CHECK-O2-NOT: Manager +; CHECK-O2: Loop Pass Manager +; CHECK-O2-NEXT: Unroll loops ; FIXME: It isn't clear that we need yet another loop pass pipeline ; and run of LICM here. ; CHECK-O2-NOT: Manager Index: test/Transforms/LoopUnroll/unloop.ll =================================================================== --- test/Transforms/LoopUnroll/unloop.ll +++ test/Transforms/LoopUnroll/unloop.ll @@ -1,5 +1,5 @@ ; RUN: opt < %s -S -loop-unroll -verify-loop-info | FileCheck %s -; RUN: opt < %s -S -passes='function(require,require,require,loop(unroll),verify)' | FileCheck %s +; RUN: opt < %s -S -passes='module(require,function(require,require,require,require,loop(unroll),verify))' | FileCheck %s ; ; Unit tests for LoopInfo::markAsRemoved. Index: test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll =================================================================== --- /dev/null +++ test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll @@ -0,0 +1,142 @@ +; RUN: opt < %s -S -loop-unroll -unroll-runtime -unroll-threshold=10 -unroll-dynamic-cost-savings-discount=0 | FileCheck %s + +@known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16 + +; CHECK-LABEL: foo +; CHECK: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv +define i32 @foo(i32* noalias nocapture readonly %src) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, 9 + br i1 %exitcond86.i, label %loop.end, label %loop + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +; CHECK-LABEL: foo_prof +; CHECK-NOT: %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv +define i32 @foo_prof(i32* noalias nocapture readonly %src) !prof !15 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, 9 + br i1 %exitcond86.i, label %loop.end, label %loop, !prof !16 + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +; CHECK-LABEL: bar +; CHECK-NOT: loop.prol +define i32 @bar(i32* noalias nocapture readonly %src, i64 %c) { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, %c + br i1 %exitcond86.i, label %loop.end, label %loop + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +; CHECK-LABEL: bar_prof +; CHECK: loop.prol +define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !15 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, %c + br i1 %exitcond86.i, label %loop.end, label %loop, !prof !16 + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +; CHECK-LABEL: bar_prof_flat +; CHECK-NOT: loop.prol +define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !15 { +entry: + br label %loop + +loop: + %iv = phi i64 [ 0, %entry ], [ %inc, %loop ] + %r = phi i32 [ 0, %entry ], [ %add, %loop ] + %arrayidx = getelementptr inbounds i32, i32* %src, i64 %iv + %src_element = load i32, i32* %arrayidx, align 4 + %array_const_idx = getelementptr inbounds [9 x i32], [9 x i32]* @known_constant, i64 0, i64 %iv + %const_array_element = load i32, i32* %array_const_idx, align 4 + %mul = mul nsw i32 %src_element, %const_array_element + %add = add nsw i32 %mul, %r + %inc = add nuw nsw i64 %iv, 1 + %exitcond86.i = icmp eq i64 %inc, %c + br i1 %exitcond86.i, label %loop, label %loop.end, !prof !16 + +loop.end: + %r.lcssa = phi i32 [ %r, %loop ] + ret i32 %r.lcssa +} + +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 1000} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 1, i32 1000}