Index: include/llvm/Analysis/ProfileSummaryInfo.h =================================================================== --- include/llvm/Analysis/ProfileSummaryInfo.h +++ include/llvm/Analysis/ProfileSummaryInfo.h @@ -110,6 +110,12 @@ bool isHotCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); /// \brief Returns true if Callsite \p CS is considered cold. bool isColdCallSite(const CallSite &CS, BlockFrequencyInfo *BFI); + /// \brief Returns HotCountThreshold if set. Recompute HotCountThreshold + /// if not set. + uint64_t getOrCompHotCountThreshold(); + /// \brief Returns ColdCountThreshold if set. Recompute ColdCountThreshold + /// if not set. + uint64_t getOrCompColdCountThreshold(); /// \brief Returns HotCountThreshold if set. uint64_t getHotCountThreshold() { return HotCountThreshold ? HotCountThreshold.getValue() : 0; Index: lib/Analysis/ProfileSummaryInfo.cpp =================================================================== --- lib/Analysis/ProfileSummaryInfo.cpp +++ lib/Analysis/ProfileSummaryInfo.cpp @@ -223,6 +223,18 @@ return ColdCountThreshold && C <= ColdCountThreshold.getValue(); } +uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() { + if (!HotCountThreshold) + computeThresholds(); + return HotCountThreshold ? HotCountThreshold.getValue() : 0; +} + +uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() { + if (!ColdCountThreshold) + computeThresholds(); + return ColdCountThreshold ? ColdCountThreshold.getValue() : 0; +} + bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) { auto Count = BFI->getBlockProfileCount(B); return Count && isHotCount(*Count); Index: lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- lib/Transforms/IPO/SampleProfile.cpp +++ lib/Transforms/IPO/SampleProfile.cpp @@ -37,6 +37,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include
"llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -109,11 +110,6 @@ cl::desc("Emit a warning if less than N% of samples in the input profile " "are matched to the IR.")); -static cl::opt<double> SampleProfileHotThreshold( - "sample-profile-inline-hot-threshold", cl::init(0.1), cl::value_desc("N"), - cl::desc("Inlined functions that account for more than N% of all samples " - "collected in the parent function, will be inlined again.")); - namespace { using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; @@ -130,10 +126,13 @@ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); unsigned computeCoverage(unsigned Used, unsigned Total) const; - unsigned countUsedRecords(const FunctionSamples *FS) const; - unsigned countBodyRecords(const FunctionSamples *FS) const; + unsigned countUsedRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const; + unsigned countBodyRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const; uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } - uint64_t countBodySamples(const FunctionSamples *FS) const; + uint64_t countBodySamples(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const; void clear() { SampleCoverage.clear(); @@ -186,7 +185,8 @@ IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); - bool runOnModule(Module &M, ModuleAnalysisManager *AM); + bool runOnModule(Module &M, ModuleAnalysisManager *AM, + ProfileSummaryInfo *SummaryInfo); void dump() { Reader->dump(); } @@ -285,6 +285,9 @@ /// Instead, we will mark GUIDs that needs to be annotated to the function. bool IsThinLTOPreLink; + /// \brief Profile Summary Info computed from sample profile. + ProfileSummaryInfo *PSI = nullptr; + /// \brief Total number of samples collected in this profile.
/// /// This is the sum of all the samples collected in all the functions executed @@ -325,6 +328,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired<AssumptionCacheTracker>(); AU.addRequired<TargetTransformInfoWrapperPass>(); + AU.addRequired<ProfileSummaryInfoWrapperPass>(); } private: @@ -335,7 +339,7 @@ } // end anonymous namespace -/// Return true if the given callsite is hot wrt to its caller. +/// Return true if the given callsite is hot wrt the hot cutoff threshold. /// /// Functions that were inlined in the original binary will be represented /// in the inline stack in the sample profile. If the profile shows that @@ -343,28 +347,17 @@ /// frequently), then we will recreate the inline decision and apply the /// profile from the inlined callsite. /// -/// To decide whether an inlined callsite is hot, we compute the fraction -/// of samples used by the callsite with respect to the total number of samples -/// collected in the caller. -/// -/// If that fraction is larger than the default given by -/// SampleProfileHotThreshold, the callsite will be inlined again. -static bool callsiteIsHot(const FunctionSamples *CallerFS, - const FunctionSamples *CallsiteFS) { +/// To decide whether an inlined callsite is hot, we compare the callsite +/// sample count with the hot cutoff computed by ProfileSummaryInfo; it is +/// regarded as hot if the count is above the cutoff value. +static bool callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. - uint64_t ParentTotalSamples = CallerFS->getTotalSamples(); - if (ParentTotalSamples == 0) - return false; // Avoid division by zero. - + assert(PSI && "PSI is expected to be non null"); uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); - if (CallsiteTotalSamples == 0) - return false; // Callsite is trivially cold.
- - double PercentSamples = - (double)CallsiteTotalSamples / (double)ParentTotalSamples * 100.0; - return PercentSamples >= SampleProfileHotThreshold; + return PSI->isHotCount(CallsiteTotalSamples); } /// Mark as used the sample record for the given function samples at @@ -387,7 +380,8 @@ /// /// This count does not include records from cold inlined callsites. unsigned -SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { +SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const { auto I = SampleCoverage.find(FS); // The size of the coverage map for FS represents the number of records @@ -400,8 +394,8 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countUsedRecords(CalleeSamples); + if (callsiteIsHot(CalleeSamples, PSI)) + Count += countUsedRecords(CalleeSamples, PSI); } return Count; @@ -411,15 +405,16 @@ /// /// This count does not include records from cold inlined callsites. unsigned -SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { +SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const { unsigned Count = FS->getBodySamples().size(); // Only count records in hot callsites. for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countBodyRecords(CalleeSamples); + if (callsiteIsHot(CalleeSamples, PSI)) + Count += countBodyRecords(CalleeSamples, PSI); } return Count; @@ -429,7 +424,8 @@ /// /// This count does not include samples from cold inlined callsites. 
uint64_t -SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { +SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const { uint64_t Total = 0; for (const auto &I : FS->getBodySamples()) Total += I.second.getSamples(); @@ -438,8 +434,8 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(FS, CalleeSamples)) - Total += countBodySamples(CalleeSamples); + if (callsiteIsHot(CalleeSamples, PSI)) + Total += countBodySamples(CalleeSamples, PSI); } return Total; @@ -767,7 +763,7 @@ if ((isa<CallInst>(I) || isa<InvokeInst>(I)) && !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) { Candidates.push_back(&I); - if (callsiteIsHot(Samples, FS)) + if (callsiteIsHot(FS, PSI)) Hot = true; } } @@ -787,8 +783,7 @@ for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { if (IsThinLTOPreLink) { FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), - Samples->getTotalSamples() * - SampleProfileHotThreshold / 100); + PSI->getOrCompHotCountThreshold()); continue; } auto CalleeFunctionName = FS->getName(); @@ -827,8 +822,7 @@ LocalChanged = true; } else if (IsThinLTOPreLink) { findCalleeFunctionSamples(*I)->findInlinedFunctions( - InlinedGUIDs, F.getParent(), - Samples->getTotalSamples() * SampleProfileHotThreshold / 100); + InlinedGUIDs, F.getParent(), PSI->getOrCompHotCountThreshold()); } } if (LocalChanged) { @@ -1463,8 +1457,8 @@ // If coverage checking was requested, compute it now.
if (SampleProfileRecordCoverage) { - unsigned Used = CoverageTracker.countUsedRecords(Samples); - unsigned Total = CoverageTracker.countBodyRecords(Samples); + unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI); + unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI); unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); if (Coverage < SampleProfileRecordCoverage) { F.getContext().diagnose(DiagnosticInfoSampleProfile( @@ -1477,7 +1471,7 @@ if (SampleProfileSampleCoverage) { uint64_t Used = CoverageTracker.getTotalUsedSamples(); - uint64_t Total = CoverageTracker.countBodySamples(Samples); + uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI); unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); if (Coverage < SampleProfileSampleCoverage) { F.getContext().diagnose(DiagnosticInfoSampleProfile( @@ -1496,6 +1490,7 @@ "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) +INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) @@ -1520,10 +1515,15 @@ return new SampleProfileLoaderLegacyPass(Name); } -bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) { +bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, + ProfileSummaryInfo *SummaryInfo) { if (!ProfileIsValid) return false; + PSI = SummaryInfo; + if (M.getProfileSummary() == nullptr) + M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); + // Compute the total number of samples collected in this profile.
for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); @@ -1554,15 +1554,15 @@ clearFunctionData(); retval |= runOnFunction(F, AM); } - if (M.getProfileSummary() == nullptr) - M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); return retval; } bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { ACT = &getAnalysis<AssumptionCacheTracker>(); TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>(); - return SampleLoader.runOnModule(M, nullptr); + ProfileSummaryInfo *PSI = + getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI(); + return SampleLoader.runOnModule(M, nullptr, PSI); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { @@ -1604,7 +1604,8 @@ SampleLoader.doInitialization(M); - if (!SampleLoader.runOnModule(M, &AM)) + ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M); + if (!SampleLoader.runOnModule(M, &AM, PSI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); Index: test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof =================================================================== --- test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof +++ test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof @@ -0,0 +1,11 @@ +main:2257150:0 + 2.1: 5553 + 3: 5391 + 3.1: foo:5860 + 0: 5279 + 1: 5279 + 2: 5279 + 4.1: goo:60 + 0: 20 + 1: 20 + 2: 20 Index: test/Transforms/SampleProfile/function_metadata.ll =================================================================== --- test/Transforms/SampleProfile/function_metadata.ll +++ test/Transforms/SampleProfile/function_metadata.ll @@ -28,7 +28,7 @@ ; GUIDs of foo, bar, foo1, foo2 and foo3 should be included in the metadata to ; make sure hot inline stacks are imported.
-; CHECK: ![[ENTRY_TEST]] = !{!"function_entry_count", i64 1, i64 2494702099028631698, i64 6699318081062747564, i64 7682762345278052905, i64 -7908226060800700466, i64 -2012135647395072713} +; CHECK: ![[ENTRY_TEST]] = !{!"function_entry_count", i64 1, i64 2494702099028631698, i64 6699318081062747564, i64 7546896869197086323, i64 7682762345278052905, i64 -7908226060800700466, i64 -2012135647395072713} ; Check GUIDs for both foo and foo_available are included in the metadata to ; make sure the liveness analysis can capture the dependency from test_liveness Index: test/Transforms/SampleProfile/inline.ll =================================================================== --- test/Transforms/SampleProfile/inline.ll +++ test/Transforms/SampleProfile/inline.ll @@ -1,5 +1,5 @@ -; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -sample-profile-inline-hot-threshold=1 -S | FileCheck %s -; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof -sample-profile-inline-hot-threshold=1 -S | FileCheck %s +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline.prof -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline.prof -S | FileCheck %s ; Original C++ test case ; Index: test/Transforms/SampleProfile/warm-inline-instance.ll =================================================================== --- test/Transforms/SampleProfile/warm-inline-instance.ll +++ test/Transforms/SampleProfile/warm-inline-instance.ll @@ -0,0 +1,115 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -S | FileCheck %s + +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %x, i32 %y) !dbg !4 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store 
i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %t0 = load i32, i32* %x.addr, align 4, !dbg !11 + %t1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %t0, %t1, !dbg !11 + ret i32 %add, !dbg !11 +} + +define i32 @goo(i32 %x, i32 %y) { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %t0 = load i32, i32* %x.addr, align 4, !dbg !11 + %t1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %t0, %t1, !dbg !11 + ret i32 %add, !dbg !11 +} + +; Function Attrs: uwtable +define i32 @main() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %i, align 4, !dbg !12 + br label %while.cond, !dbg !13 + +while.cond: ; preds = %if.end, %entry + %t0 = load i32, i32* %i, align 4, !dbg !14 + %inc = add nsw i32 %t0, 1, !dbg !14 + store i32 %inc, i32* %i, align 4, !dbg !14 + %cmp = icmp slt i32 %t0, 400000000, !dbg !14 + br i1 %cmp, label %while.body, label %while.end, !dbg !14 + +while.body: ; preds = %while.cond + %t1 = load i32, i32* %i, align 4, !dbg !16 + %cmp1 = icmp ne i32 %t1, 100, !dbg !16 + br i1 %cmp1, label %if.then, label %if.else, !dbg !16 + +if.then: ; preds = %while.body + %t2 = load i32, i32* %i, align 4, !dbg !18 + %t3 = load i32, i32* %s, align 4, !dbg !18 +; Although the ratio of total samples of @foo vs total samples of @main is +; small, since the total samples count is larger than hot cutoff computed by +; ProfileSummaryInfo, we will still regard the callsite of foo as hot and +; early inlining will inline it. 
+; CHECK-LABEL: @main( +; CHECK-NOT: call i32 @foo(i32 %t2, i32 %t3) + %call1 = call i32 @foo(i32 %t2, i32 %t3), !dbg !18 + store i32 %call1, i32* %s, align 4, !dbg !18 + br label %if.end, !dbg !18 + +if.else: ; preds = %while.body +; call @goo 's basicblock doesn't get any sample, so no profile will be annotated. +; CHECK: call i32 @goo(i32 2, i32 3), !dbg !{{[0-9]+}} +; CHECK-NOT: !prof +; CHECK-SAME: {{$}} + %call2 = call i32 @goo(i32 2, i32 3), !dbg !26 + store i32 %call2, i32* %s, align 4, !dbg !20 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond, !dbg !22 + +while.end: ; preds = %while.cond + %t4 = load i32, i32* %s, align 4, !dbg !24 + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %t4), !dbg !24 + ret i32 0, !dbg !25 +} + +declare i32 @printf(i8*, ...) #2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, variables: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, variables: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!11 = !DILocation(line: 4, scope: !4) +!12 = !DILocation(line: 8, scope: !7) +!13 = !DILocation(line: 9, scope: !7) +!14 = 
!DILocation(line: 9, scope: !15) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) +!16 = !DILocation(line: 10, scope: !17) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = !DILocation(line: 10, scope: !19) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!20 = !DILocation(line: 10, scope: !21) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) +!22 = !DILocation(line: 10, scope: !23) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) +!24 = !DILocation(line: 11, scope: !7) +!25 = !DILocation(line: 12, scope: !7) +!26 = !DILocation(line: 11, scope: !19)