Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -49,6 +49,10 @@ void computeThresholds(); // Count thresholds to answer isHotCount and isColdCount queries. Optional HotCountThreshold, ColdCountThreshold; + // True if the working set size of the code is considered huge, + // because the number of profile counts required to reach the hot + // percentile is above a huge threshold. + Optional HasHugeWorkingSetSize; public: ProfileSummaryInfo(Module &M) : M(M) {} @@ -84,6 +88,8 @@ /// Returns the profile count for \p CallInst. Optional getProfileCount(const Instruction *CallInst, BlockFrequencyInfo *BFI); + /// Returns true if the working set size of the code is considered huge. + bool hasHugeWorkingSetSize(); /// \brief Returns true if \p F has hot function entry. bool isFunctionEntryHot(const Function *F); /// Returns true if \p F has hot function entry or hot call edge. Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -44,10 +44,16 @@ cl::desc("If the sample profile is accurate, we will mark all un-sampled " "callsite as cold. Otherwise, treat un-sampled callsites as if " "we have no profile.")); +static cl::opt ProfileSummaryHugeWorkingSetSizeThreshold( + "profile-summary-huge-working-set-size-threshold", cl::Hidden, + cl::init(15000), cl::ZeroOrMore, + cl::desc("The code working set size is considered huge if the number of" + " blocks required to reach the -profile-summary-cutoff-hot" + " percentile exceeds this count.")); -// Find the minimum count to reach a desired percentile of counts. -static uint64_t getMinCountForPercentile(SummaryEntryVector &DS, - uint64_t Percentile) { +// Find the summary entry for a desired percentile of counts. +static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS, + uint64_t Percentile) { auto Compare = [](const ProfileSummaryEntry &Entry, uint64_t Percentile) { return Entry.Cutoff < Percentile; }; @@ -56,7 +62,7 @@ // detailed summary. if (It == DS.end()) report_fatal_error("Desired percentile exceeds the maximum cutoff"); - return It->MinCount; + return *It; } // The profile summary metadata may be attached either by the frontend or by @@ -169,10 +175,20 @@ if (!computeSummary()) return; auto &DetailedSummary = Summary->getDetailedSummary(); - HotCountThreshold = - getMinCountForPercentile(DetailedSummary, ProfileSummaryCutoffHot); - ColdCountThreshold = - getMinCountForPercentile(DetailedSummary, ProfileSummaryCutoffCold); + auto &HotEntry = + getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffHot); + HotCountThreshold = HotEntry.MinCount; + auto &ColdEntry = + getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffCold); + ColdCountThreshold = ColdEntry.MinCount; + HasHugeWorkingSetSize = + HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold; +} + +bool ProfileSummaryInfo::hasHugeWorkingSetSize() { + if (!HasHugeWorkingSetSize) + computeThresholds(); + return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue(); } bool ProfileSummaryInfo::isHotCount(uint64_t C) { Index: lib/Transforms/Scalar/LoopUnrollPass.cpp =================================================================== --- lib/Transforms/Scalar/LoopUnrollPass.cpp +++ lib/Transforms/Scalar/LoopUnrollPass.cpp @@ -21,6 +21,7 @@ #include "llvm/Analysis/LoopPass.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" #include "llvm/Analysis/OptimizationDiagnosticInfo.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/IR/DataLayout.h" @@ -1252,6 +1253,11 @@ auto &AC = AM.getResult(F); auto &ORE = AM.getResult(F); + const ModuleAnalysisManager &MAM = + AM.getResult(F).getManager(); + ProfileSummaryInfo *PSI = + MAM.getCachedResult(*F.getParent()); + bool Changed = false; // The unroller requires loops to be in simplified form, and also needs LCSSA. @@ -1280,12 +1286,18 @@ // states we support: partial and full (or "simple") unrolling. However, to // enable these things we actually pass "None" in for the optional to avoid // providing an explicit choice. - Optional AllowPartialParam, RuntimeParam, UpperBoundParam; - bool CurChanged = tryToUnrollLoop( - &L, DT, &LI, SE, TTI, AC, ORE, - /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, - /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam, - /*AllowPeeling*/ None); + Optional AllowPartialParam, RuntimeParam, UpperBoundParam, + AllowPeeling; + // Check if the profile summary indicates that the profiled application + // has a huge working set size, in which case we disable peeling to avoid + // bloating it further. + if (PSI && PSI->hasHugeWorkingSetSize()) + AllowPeeling = false; + bool CurChanged = + tryToUnrollLoop(&L, DT, &LI, SE, TTI, AC, ORE, + /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None, + /*Threshold*/ None, AllowPartialParam, RuntimeParam, + UpperBoundParam, AllowPeeling); Changed |= CurChanged; // The parent must not be damaged by unrolling! Index: test/Other/new-pm-defaults.ll =================================================================== --- test/Other/new-pm-defaults.ll +++ test/Other/new-pm-defaults.ll @@ -198,6 +198,7 @@ ; CHECK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-O-NEXT: Running pass: InstCombinePass ; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass Index: test/Other/new-pm-thinlto-defaults.ll =================================================================== --- test/Other/new-pm-thinlto-defaults.ll +++ test/Other/new-pm-thinlto-defaults.ll @@ -185,6 +185,7 @@ ; CHECK-POSTLINK-O-NEXT: Running pass: SimplifyCFGPass ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: LoopUnrollPass +; CHECK-POSTLINK-O-NEXT: Running analysis: OuterAnalysisManagerProxy ; CHECK-POSTLINK-O-NEXT: Running pass: InstCombinePass ; CHECK-POSTLINK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis ; CHECK-POSTLINK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass Index: test/Transforms/LoopUnroll/peel-loop-pgo.ll =================================================================== --- test/Transforms/LoopUnroll/peel-loop-pgo.ll +++ test/Transforms/LoopUnroll/peel-loop-pgo.ll @@ -1,5 +1,8 @@ ; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll 2>&1 | FileCheck %s -; RUN: opt < %s -S -debug-only=loop-unroll -passes='require,unroll' 2>&1 | FileCheck %s +; RUN: opt < %s -S -debug-only=loop-unroll -passes='require,function(require,unroll)' 2>&1 | FileCheck %s +; Confirm that peeling is disabled if the number of counts required to reach +; the hot percentile is above the threshold. +; RUN: opt < %s -S -profile-summary-huge-working-set-size-threshold=9 -debug-only=loop-unroll -passes='require,function(require,unroll)' 2>&1 | FileCheck %s --check-prefix=NOPEEL ; REQUIRES: asserts ; Make sure we use the profile information correctly to peel-off 3 iterations @@ -11,19 +14,19 @@ ; CHECK-NOT: PEELING ; Confirm that no peeling occurs when we are performing full unrolling. -; RUN: opt < %s -S -debug-only=loop-unroll -passes='require,loop(unroll-full)' 2>&1 | FileCheck %s --check-prefix=FULLUNROLL -; FULLUNROLL-NOT: PEELING +; RUN: opt < %s -S -debug-only=loop-unroll -passes='require,loop(unroll-full)' 2>&1 | FileCheck %s --check-prefix=NOPEEL +; NOPEEL-NOT: PEELING ; CHECK-LABEL: @basic -; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !1 +; CHECK: br i1 %{{.*}}, label %[[NEXT0:.*]], label %for.cond.for.end_crit_edge, !prof !15 ; CHECK: [[NEXT0]]: -; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !2 +; CHECK: br i1 %{{.*}}, label %[[NEXT1:.*]], label %for.cond.for.end_crit_edge, !prof !16 ; CHECK: [[NEXT1]]: -; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !3 +; CHECK: br i1 %{{.*}}, label %[[NEXT2:.*]], label %for.cond.for.end_crit_edge, !prof !17 ; CHECK: [[NEXT2]]: -; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !4 +; CHECK: br i1 %{{.*}}, label %for.body, label %{{.*}}, !prof !18 -define void @basic(i32* %p, i32 %k) #0 !prof !0 { +define void @basic(i32* %p, i32 %k) #0 !prof !15 { entry: %cmp3 = icmp slt i32 0, %k br i1 %cmp3, label %for.body.lr.ph, label %for.end @@ -38,7 +41,7 @@ store i32 %i.05, i32* %p.addr.04, align 4 %inc = add nsw i32 %i.05, 1 %cmp = icmp slt i32 %inc, %k - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !1 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !16 for.cond.for.end_crit_edge: ; preds = %for.body br label %for.end @@ -54,7 +57,7 @@ ; CHECK: for.body: ; CHECK-NOT: br ; CHECK: br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge -define void @optsize(i32* %p, i32 %k) #1 !prof !0 { +define void @optsize(i32* %p, i32 %k) #1 !prof !15 { entry: %cmp3 = icmp slt i32 0, %k br i1 %cmp3, label %for.body.lr.ph, label %for.end @@ -69,7 +72,7 @@ store i32 %i.05, i32* %p.addr.04, align 4 %inc = add nsw i32 %i.05, 1 %cmp = icmp slt i32 %inc, %k - br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !1 + br i1 %cmp, label %for.body, label %for.cond.for.end_crit_edge, !prof !16 for.cond.for.end_crit_edge: ; preds = %for.body br label %for.end @@ -81,11 +84,27 @@ attributes #0 = { nounwind } attributes #1 = { nounwind optsize } -!0 = !{!"function_entry_count", i64 1} -!1 = !{!"branch_weights", i32 3001, i32 1001} - -;CHECK: !1 = !{!"branch_weights", i32 900, i32 101} -;CHECK: !2 = !{!"branch_weights", i32 540, i32 360} -;CHECK: !3 = !{!"branch_weights", i32 162, i32 378} -;CHECK: !4 = !{!"branch_weights", i32 1399, i32 162} +!llvm.module.flags = !{!1} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"InstrProf"} +!4 = !{!"TotalCount", i64 10} +!5 = !{!"MaxCount", i64 3} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 3} +!8 = !{!"NumCounts", i64 2} +!9 = !{!"NumFunctions", i64 2} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 3, i32 2} +!13 = !{i32 999000, i64 1, i32 10} +!14 = !{i32 999999, i64 1, i32 10} +!15 = !{!"function_entry_count", i64 1} +!16 = !{!"branch_weights", i32 3001, i32 1001} + +;CHECK: !15 = !{!"branch_weights", i32 900, i32 101} +;CHECK: !16 = !{!"branch_weights", i32 540, i32 360} +;CHECK: !17 = !{!"branch_weights", i32 162, i32 378} +;CHECK: !18 = !{!"branch_weights", i32 1399, i32 162}