Index: lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- lib/Transforms/IPO/SampleProfile.cpp +++ lib/Transforms/IPO/SampleProfile.cpp @@ -37,6 +37,7 @@ #include "llvm/Analysis/InlineCost.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" @@ -130,10 +131,13 @@ bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); unsigned computeCoverage(unsigned Used, unsigned Total) const; - unsigned countUsedRecords(const FunctionSamples *FS) const; - unsigned countBodyRecords(const FunctionSamples *FS) const; + unsigned countUsedRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const; + unsigned countBodyRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const; uint64_t getTotalUsedSamples() const { return TotalUsedSamples; } - uint64_t countBodySamples(const FunctionSamples *FS) const; + uint64_t countBodySamples(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const; void clear() { SampleCoverage.clear(); @@ -186,7 +190,8 @@ IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); - bool runOnModule(Module &M, ModuleAnalysisManager *AM); + bool runOnModule(Module &M, ModuleAnalysisManager *AM, + ProfileSummaryInfo *_PSI); void dump() { Reader->dump(); } @@ -285,6 +290,9 @@ /// Instead, we will mark GUIDs that needs to be annotated to the function. bool IsThinLTOPreLink; + /// \brief Profile Summary Info computed from sample profile. + ProfileSummaryInfo *PSI = nullptr; + /// \brief Total number of samples collected in this profile. 
/// /// This is the sum of all the samples collected in all the functions executed @@ -325,6 +333,7 @@ void getAnalysisUsage(AnalysisUsage &AU) const override { AU.addRequired(); AU.addRequired(); + AU.addRequired(); } private: @@ -343,22 +352,26 @@ /// frequently), then we will recreate the inline decision and apply the /// profile from the inlined callsite. /// -/// To decide whether an inlined callsite is hot, we compute the fraction -/// of samples used by the callsite with respect to the total number of samples -/// collected in the caller. -/// -/// If that fraction is larger than the default given by -/// SampleProfileHotThreshold, the callsite will be inlined again. +/// To decide whether an inlined callsite is hot, we apply two rules in order: +/// 1. If the callsite's total sample count is above the hot cutoff computed by +/// ProfileSummaryInfo, the callsite is hot. +/// 2. Otherwise, compute the fraction of the caller's total samples that the +/// callsite accounts for. If that fraction is larger than the default given +/// by SampleProfileHotThreshold, the callsite is hot. static bool callsiteIsHot(const FunctionSamples *CallerFS, - const FunctionSamples *CallsiteFS) { + const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. + uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); + if (PSI && PSI->isHotCount(CallsiteTotalSamples)) + return true; + uint64_t ParentTotalSamples = CallerFS->getTotalSamples(); if (ParentTotalSamples == 0) return false; // Avoid division by zero. - uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); if (CallsiteTotalSamples == 0) return false; // Callsite is trivially cold. @@ -387,7 +400,8 @@ /// /// This count does not include records from cold inlined callsites. 
unsigned -SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS) const { +SampleCoverageTracker::countUsedRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const { auto I = SampleCoverage.find(FS); // The size of the coverage map for FS represents the number of records @@ -400,8 +414,8 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countUsedRecords(CalleeSamples); + if (callsiteIsHot(FS, CalleeSamples, PSI)) + Count += countUsedRecords(CalleeSamples, PSI); } return Count; @@ -411,15 +425,16 @@ /// /// This count does not include records from cold inlined callsites. unsigned -SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS) const { +SampleCoverageTracker::countBodyRecords(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const { unsigned Count = FS->getBodySamples().size(); // Only count records in hot callsites. for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(FS, CalleeSamples)) - Count += countBodyRecords(CalleeSamples); + if (callsiteIsHot(FS, CalleeSamples, PSI)) + Count += countBodyRecords(CalleeSamples, PSI); } return Count; @@ -429,7 +444,8 @@ /// /// This count does not include samples from cold inlined callsites. 
uint64_t -SampleCoverageTracker::countBodySamples(const FunctionSamples *FS) const { +SampleCoverageTracker::countBodySamples(const FunctionSamples *FS, + ProfileSummaryInfo *PSI) const { uint64_t Total = 0; for (const auto &I : FS->getBodySamples()) Total += I.second.getSamples(); @@ -438,8 +454,8 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(FS, CalleeSamples)) - Total += countBodySamples(CalleeSamples); + if (callsiteIsHot(FS, CalleeSamples, PSI)) + Total += countBodySamples(CalleeSamples, PSI); } return Total; @@ -767,7 +783,7 @@ if ((isa(I) || isa(I)) && !isa(I) && (FS = findCalleeFunctionSamples(I))) { Candidates.push_back(&I); - if (callsiteIsHot(Samples, FS)) + if (callsiteIsHot(Samples, FS, PSI)) Hot = true; } } @@ -1463,8 +1479,8 @@ // If coverage checking was requested, compute it now. if (SampleProfileRecordCoverage) { - unsigned Used = CoverageTracker.countUsedRecords(Samples); - unsigned Total = CoverageTracker.countBodyRecords(Samples); + unsigned Used = CoverageTracker.countUsedRecords(Samples, PSI); + unsigned Total = CoverageTracker.countBodyRecords(Samples, PSI); unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); if (Coverage < SampleProfileRecordCoverage) { F.getContext().diagnose(DiagnosticInfoSampleProfile( @@ -1477,7 +1493,7 @@ if (SampleProfileSampleCoverage) { uint64_t Used = CoverageTracker.getTotalUsedSamples(); - uint64_t Total = CoverageTracker.countBodySamples(Samples); + uint64_t Total = CoverageTracker.countBodySamples(Samples, PSI); unsigned Coverage = CoverageTracker.computeCoverage(Used, Total); if (Coverage < SampleProfileSampleCoverage) { F.getContext().diagnose(DiagnosticInfoSampleProfile( @@ -1496,6 +1512,7 @@ "Sample Profile loader", false, false) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) 
+INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass) INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) @@ -1520,10 +1537,15 @@ return new SampleProfileLoaderLegacyPass(Name); } -bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM) { +bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM, + ProfileSummaryInfo *_PSI) { if (!ProfileIsValid) return false; + PSI = _PSI; + if (M.getProfileSummary() == nullptr) + M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); + // Compute the total number of samples collected in this profile. for (const auto &I : Reader->getProfiles()) TotalCollectedSamples += I.second.getTotalSamples(); @@ -1554,15 +1576,15 @@ clearFunctionData(); retval |= runOnFunction(F, AM); } - if (M.getProfileSummary() == nullptr) - M.setProfileSummary(Reader->getSummary().getMD(M.getContext())); return retval; } bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) { ACT = &getAnalysis(); TTIWP = &getAnalysis(); - return SampleLoader.runOnModule(M, nullptr); + ProfileSummaryInfo *PSI = + getAnalysis().getPSI(); + return SampleLoader.runOnModule(M, nullptr, PSI); } bool SampleProfileLoader::runOnFunction(Function &F, ModuleAnalysisManager *AM) { @@ -1604,7 +1626,8 @@ SampleLoader.doInitialization(M); - if (!SampleLoader.runOnModule(M, &AM)) + ProfileSummaryInfo *PSI = &AM.getResult(M); + if (!SampleLoader.runOnModule(M, &AM, PSI)) return PreservedAnalyses::all(); return PreservedAnalyses::none(); Index: test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof =================================================================== --- test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof +++ test/Transforms/SampleProfile/Inputs/warm-inline-instance.prof @@ -0,0 +1,11 @@ +main:2257150:0 + 2.1: 5553 + 3: 5391 + 3.1: foo:5860 + 0: 5279 + 1: 5279 + 2: 5279 + 4.1: goo:60 + 0: 20 + 1: 20 + 2: 20 Index: 
test/Transforms/SampleProfile/warm-inline-instance.ll =================================================================== --- test/Transforms/SampleProfile/warm-inline-instance.ll +++ test/Transforms/SampleProfile/warm-inline-instance.ll @@ -0,0 +1,115 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -sample-profile-inline-hot-threshold=1 -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/warm-inline-instance.prof -sample-profile-inline-hot-threshold=1 -S | FileCheck %s + +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +; Function Attrs: nounwind uwtable +define i32 @foo(i32 %x, i32 %y) !dbg !4 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %t0 = load i32, i32* %x.addr, align 4, !dbg !11 + %t1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %t0, %t1, !dbg !11 + ret i32 %add, !dbg !11 +} + +define i32 @goo(i32 %x, i32 %y) { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %t0 = load i32, i32* %x.addr, align 4, !dbg !11 + %t1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %t0, %t1, !dbg !11 + ret i32 %add, !dbg !11 +} + +; Function Attrs: uwtable +define i32 @main() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %i, align 4, !dbg !12 + br label %while.cond, !dbg !13 + +while.cond: ; preds = %if.end, %entry + %t0 = load i32, i32* %i, align 4, !dbg !14 + %inc = add nsw i32 %t0, 1, !dbg !14 + store i32 %inc, i32* %i, align 4, !dbg !14 + %cmp = icmp slt i32 %t0, 400000000, !dbg !14 + br i1 %cmp, label %while.body, label %while.end, !dbg !14 + +while.body: ; preds = %while.cond + %t1 = load i32, i32* %i, align 4, !dbg !16 
+ %cmp1 = icmp ne i32 %t1, 100, !dbg !16 + br i1 %cmp1, label %if.then, label %if.else, !dbg !16 + +if.then: ; preds = %while.body + %t2 = load i32, i32* %i, align 4, !dbg !18 + %t3 = load i32, i32* %s, align 4, !dbg !18 +; Although the ratio of @foo's total samples to @main's total samples is +; small, @foo's total sample count is larger than the hot cutoff computed by +; ProfileSummaryInfo, so the callsite of @foo is still regarded as hot and +; early inlining will inline it. +; CHECK-LABEL: @main( +; CHECK-NOT: call i32 @foo(i32 %t2, i32 %t3) + %call1 = call i32 @foo(i32 %t2, i32 %t3), !dbg !18 + store i32 %call1, i32* %s, align 4, !dbg !18 + br label %if.end, !dbg !18 + +if.else: ; preds = %while.body +; The basic block that calls @goo does not get any samples, so no profile will be annotated. +; CHECK: call i32 @goo(i32 2, i32 3), !dbg !{{[0-9]+}} +; CHECK-NOT: !prof +; CHECK-SAME: {{$}} + %call2 = call i32 @goo(i32 2, i32 3), !dbg !26 + store i32 %call2, i32* %s, align 4, !dbg !20 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond, !dbg !22 + +while.end: ; preds = %while.cond + %t4 = load i32, i32* %s, align 4, !dbg !24 + %call3 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %t4), !dbg !24 + ret i32 0, !dbg !25 +} + +declare i32 @printf(i8*, ...) 
#2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "foo", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, variables: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, variables: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!11 = !DILocation(line: 4, scope: !4) +!12 = !DILocation(line: 8, scope: !7) +!13 = !DILocation(line: 9, scope: !7) +!14 = !DILocation(line: 9, scope: !15) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) +!16 = !DILocation(line: 10, scope: !17) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = !DILocation(line: 10, scope: !19) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!20 = !DILocation(line: 10, scope: !21) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) +!22 = !DILocation(line: 10, scope: !23) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) +!24 = !DILocation(line: 11, scope: !7) +!25 = !DILocation(line: 12, scope: !7) +!26 = !DILocation(line: 11, scope: !19)