diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -387,14 +387,17 @@ if (FS != iter->second.end()) return &FS->second; // If we cannot find exact match of the callee name, return the FS with - // the max total count. + // the max total count. Only do this when CalleeName is not provided, + // i.e., only for indirect calls. uint64_t MaxTotalSamples = 0; const FunctionSamples *R = nullptr; - for (const auto &NameFS : iter->second) - if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { - MaxTotalSamples = NameFS.second.getTotalSamples(); - R = &NameFS.second; - } + if (CalleeName.empty()) { + for (const auto &NameFS : iter->second) + if (NameFS.second.getTotalSamples() >= MaxTotalSamples) { + MaxTotalSamples = NameFS.second.getTotalSamples(); + R = &NameFS.second; + } + } return R; } diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -358,6 +358,15 @@ return getSamplesFor(CanonName); } + /// Return the samples collected for function \p F, create empty + /// FunctionSamples if it doesn't exist. + FunctionSamples *getOrCreateSamplesFor(const Function &F) { + std::string FGUID; + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + CanonName = getRepInFormat(CanonName, getFormat(), FGUID); + return &Profiles[CanonName]; + } + /// Return the samples collected for function \p F. 
virtual FunctionSamples *getSamplesFor(StringRef Fname) { if (Remapper) { diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -137,6 +137,11 @@ cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. ")); +static cl::opt<bool> ProfileMergeInlinee( + "sample-profile-merge-inlinee", cl::Hidden, cl::init(false), + cl::desc("Merge past inlinee's profile to outline version if sample " + "profile loader decided not to inline a call site.")); + namespace { using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; @@ -1008,9 +1013,26 @@ if (!Callee || Callee->isDeclaration()) continue; const FunctionSamples *FS = Pair.getSecond(); - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee"); + const_cast<FunctionSamples *>(FS)->addHeadSamples(FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. 
+ FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS); + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } } return Changed; } diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo +++ b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo @@ -1,4 +1,4 @@ caller:0:0 - 2:sum:0 + 2: sum:0 3: 0 __prefetch_nta_0:23456 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/Inputs/einline.prof b/llvm/test/Transforms/SampleProfile/Inputs/einline.prof --- a/llvm/test/Transforms/SampleProfile/Inputs/einline.prof +++ b/llvm/test/Transforms/SampleProfile/Inputs/einline.prof @@ -1,7 +1,7 @@ _Z3foov:200:100 - 1: _Z3barv:0 + 1: _ZL3barv:0 2: no_inline:100 - 3: _Z3barv:100 + 3: _ZL3barv:100 recursive:200:100 1: recursive:100 2: recursive:100 diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof --- a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof @@ -4,7 +4,7 @@ 1: direct_leaf_func:35000 11: 3000 test_cgscc_inline:63067:0 - 1: sample_loader_inlinee:1 + 1: cgscc_inlinee:1 cgscc_inlinee:3000:0 1: direct_leaf_func:35000 11: 3000 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof @@ -0,0 +1,13 @@ +main:225715:0 + 2.1: 5553 + 3: 5391 + 3.1: _Z3sumii:46 + 1: 23 + 2: _Z3subii:2 + 1: 2 + 3: 21 + +_Z3sumii:11:22 + 1: 11 + 2: 10 _Z3subii:10 + 3: 1 \ No 
newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll @@ -0,0 +1,109 @@ +; Test we lose details of not inlined profile without '-sample-profile-merge-inlinee' +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -S | FileCheck -check-prefix=SCALE %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -S | FileCheck -check-prefix=SCALE %s + +; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee' +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee -S | FileCheck -check-prefix=MERGE %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee -S | FileCheck -check-prefix=MERGE %s + +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +; Function Attrs: uwtable +define i32 @main() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + %0 = load i32, i32* %i, align 4, !dbg !18 + %1 = load i32, i32* %s, align 4, !dbg !18 + %call = call i32 @_Z3sumii(i32 %0, i32 %1), !dbg !18 +; SCALE: call i32 @_Z3sumii +; MERGE: call i32 @_Z3sumii + store i32 %call, i32* %s, align 4, !dbg !18 + ret i32 0, !dbg !25 +} + +; Function Attrs: nounwind uwtable +define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !4 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %0 = load i32, i32* %x.addr, align 4, !dbg !11 + %1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %0, %1, !dbg !11 + %2 = load i32, i32* %x.addr, align 4, !dbg !11 + %3 = load i32, i32* %y.addr, align 4, !dbg !11 + %cmp1 = icmp 
ne i32 %3, 100, !dbg !11 + br i1 %cmp1, label %if.then, label %if.else, !dbg !11 +if.then: + %call = call i32 @_Z3subii(i32 %2, i32 %3), !dbg !29 + ret i32 %add, !dbg !29 +if.else: + ret i32 %add, !dbg !30 +} + +; Function Attrs: nounwind uwtable +define i32 @_Z3subii(i32 %x, i32 %y) !dbg !26 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %0 = load i32, i32* %x.addr, align 4, !dbg !27 + %1 = load i32, i32* %y.addr, align 4, !dbg !27 + %add = sub nsw i32 %0, %1, !dbg !27 + ret i32 %add, !dbg !28 +} + +declare i32 @printf(i8*, ...) #2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "sum", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, retainedNodes: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!11 = !DILocation(line: 4, scope: !4) +!12 = !DILocation(line: 8, scope: !7) +!13 = !DILocation(line: 9, scope: !7) +!14 = !DILocation(line: 9, scope: !15) +!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) +!16 = !DILocation(line: 10, scope: !17) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = 
!DILocation(line: 10, scope: !19) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!20 = !DILocation(line: 10, scope: !21) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) +!22 = !DILocation(line: 10, scope: !23) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) +!24 = !DILocation(line: 11, scope: !7) +!25 = !DILocation(line: 12, scope: !7) +!26 = distinct !DISubprogram(name: "sub", line: 20, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 20, file: !1, scope: !5, type: !6, retainedNodes: !2) +!27 = !DILocation(line: 20, scope: !26) +!28 = !DILocation(line: 21, scope: !26) +!29 = !DILocation(line: 5, scope: !4) +!30 = !DILocation(line: 6, scope: !4) + +; SCALE: name: "sum" +; SCALE-NEXT: {!"function_entry_count", i64 46} +; SCALE: !{!"branch_weights", i32 11, i32 2} +; SCALE: !{!"branch_weights", i64 20} +; SCALE: name: "sub" +; SCALE-NEXT: {!"function_entry_count", i64 -1} + +; MERGE: name: "sum" +; MERGE-NEXT: {!"function_entry_count", i64 46} +; MERGE: !{!"branch_weights", i32 11, i32 23} +; MERGE: !{!"branch_weights", i32 10} +; MERGE: name: "sub" +; MERGE-NEXT: {!"function_entry_count", i64 3} \ No newline at end of file