diff --git a/llvm/include/llvm/ProfileData/ProfileCommon.h b/llvm/include/llvm/ProfileData/ProfileCommon.h --- a/llvm/include/llvm/ProfileData/ProfileCommon.h +++ b/llvm/include/llvm/ProfileData/ProfileCommon.h @@ -17,6 +17,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/ProfileSummary.h" #include "llvm/ProfileData/InstrProf.h" +#include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/Error.h" #include #include @@ -89,6 +90,8 @@ void addRecord(const sampleprof::FunctionSamples &FS, bool isCallsiteSample = false); + std::unique_ptr computeSummaryForProfiles( + const StringMap &Profiles); std::unique_ptr getSummary(); }; diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp --- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp +++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp @@ -18,9 +18,14 @@ #include "llvm/ProfileData/ProfileCommon.h" #include "llvm/ProfileData/SampleProf.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/CommandLine.h" using namespace llvm; +cl::opt UseContextLessSummary( + "profile-summary-contextless", cl::Hidden, cl::init(false), cl::ZeroOrMore, + cl::desc("Merge context profiles before calculating thresholds.")); + // A set of cutoff values. Each value, when divided by ProfileSummary::Scale // (which is 1000000) is a desired percentile of total counts. static const uint32_t DefaultCutoffsData[] = { @@ -111,6 +116,35 @@ MaxFunctionCount, NumCounts, NumFunctions); } +std::unique_ptr +SampleProfileSummaryBuilder::computeSummaryForProfiles( + const StringMap &Profiles) { + assert(NumFunctions == 0 && + "This can only be called on an empty summary builder"); + StringMap ContextLessProfiles; + const StringMap *ProfilesToUse = &Profiles; + // For CSSPGO, context-sensitive profile effectively split a function profile + // into many copies each representing the CFG profile of a particular calling + // context. That makes the count distribution looks more flat as we now have + // more function profiles each with lower counts, which in turn leads to lower + // hot thresholds. To compensate for that, by defauly we merge context + // profiles before coumputing profile summary. + if (UseContextLessSummary || (sampleprof::FunctionSamples::ProfileIsCS && + !UseContextLessSummary.getNumOccurrences())) { + for (const auto &I : Profiles) { + ContextLessProfiles[I.second.getName()].merge(I.second); + } + ProfilesToUse = &ContextLessProfiles; + } + + for (const auto &I : *ProfilesToUse) { + const sampleprof::FunctionSamples &Profile = I.second; + addRecord(Profile); + } + + return getSummary(); +} + std::unique_ptr InstrProfSummaryBuilder::getSummary() { computeDetailedSummary(); return std::make_unique( diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -1610,9 +1610,5 @@ // profile. Binary format has the profile summary in its header. void SampleProfileReader::computeSummary() { SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); - for (const auto &I : Profiles) { - const FunctionSamples &Profile = I.second; - Builder.addRecord(Profile); - } - Summary = Builder.getSummary(); + Summary = Builder.computeSummaryForProfiles(Profiles); } diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -752,9 +752,5 @@ void SampleProfileWriter::computeSummary( const StringMap &ProfileMap) { SampleProfileSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs); - for (const auto &I : ProfileMap) { - const FunctionSamples &Profile = I.second; - Builder.addRecord(Profile); - } - Summary = Builder.getSummary(); + Summary = Builder.computeSummaryForProfiles(ProfileMap); } diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll --- a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll +++ b/llvm/test/Transforms/SampleProfile/csspgo-inline.ll @@ -30,7 +30,6 @@ ; INLINE-NEW-LIMIT1-NOT: remark -; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 ; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11 ; INLINE-NEW-LIMIT2-NOT: remark diff --git a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll copy from llvm/test/Transforms/SampleProfile/csspgo-inline.ll copy to llvm/test/Transforms/SampleProfile/csspgo-summary.ll --- a/llvm/test/Transforms/SampleProfile/csspgo-inline.ll +++ b/llvm/test/Transforms/SampleProfile/csspgo-summary.ll @@ -1,38 +1,11 @@ -; Test for CSSPGO's new early inliner using priority queue - -; Note that we need new pass manager to enable top-down processing for sample profile loader -; Test we inlined the following in top-down order with old inliner -; main:3 @ _Z5funcAi -; main:3 @ _Z5funcAi:1 @ _Z8funcLeafi -; _Z5funcBi:1 @ _Z8funcLeafi -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-prioritized-inline=0 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE -; -; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, so we get less inlining for given profile -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW -; -; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning hot cutoff can get us the same inlining -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE -; -; With new FDO early inliner, callee entry count is used to drive inlining instead of callee total samples, tuning cold sample profile inline threshold can get us the same inlining -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -sample-profile-cold-inline-threshold=200 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-BASE -; -; With new FDO early inliner and tuned cutoff, we can control inlining through size growth tuning knob. -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=0 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --allow-empty --check-prefix=INLINE-NEW-LIMIT1 -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-context-tracker.prof -sample-profile-inline-size -profile-summary-cutoff-hot=999900 -sample-profile-inline-limit-min=10 -sample-profile-inline-growth-limit=1 -profile-sample-accurate -S -pass-remarks=inline -o /dev/null 2>&1 | FileCheck %s --check-prefix=INLINE-NEW-LIMIT2 - - -; INLINE-BASE: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10 -; INLINE-BASE: remark: merged.cpp:27:11: _Z8funcLeafi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 @ main:3:10 -; INLINE-BASE: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11 - -; INLINE-NEW: remark: merged.cpp:14:10: _Z5funcAi inlined into main to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite main:3:10 -; INLINE-NEW-NOT: remark - -; INLINE-NEW-LIMIT1-NOT: remark - -; INLINE-NEW-LIMIT2: remark: merged.cpp:27:11: _Z8funcLeafi inlined into _Z5funcAi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcAi:1:11 -; INLINE-NEW-LIMIT2: remark: merged.cpp:33:11: _Z8funcLeafi inlined into _Z5funcBi to match profiling context with (cost={{[0-9]+}}, threshold={{[0-9]+}}) at callsite _Z5funcBi:1:11 -; INLINE-NEW-LIMIT2-NOT: remark +; Test for CSSPGO's profile summary computation with and without pre-merging context profiles + +; RUN: opt < %s -passes=sample-profile,print-profile-summary -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-summary-cutoff-hot=999900 -profile-sample-accurate -profile-summary-contextless=0 -S -o /dev/null 2>&1 | FileCheck %s --check-prefix=SUMMARY-UNMERGED +; RUN: opt < %s -passes=sample-profile,print-profile-summary -sample-profile-file=%S/Inputs/profile-context-tracker.prof -profile-summary-cutoff-hot=999900 -profile-sample-accurate -profile-summary-contextless=1 -S -o /dev/null 2>&1 | FileCheck %s --check-prefix=SUMMARY-MERGED + +; SUMMARY-UNMERGED: main :hot entry +; SUMMARY-MERGED-NOT: main :hot entry + @factor = dso_local global i32 3, align 4, !dbg !0