diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -387,7 +387,10 @@ if (FS != iter->second.end()) return &FS->second; // If we cannot find exact match of the callee name, return the FS with - // the max total count. + // the max total count. Only do this when CalleeName is not provided, + // i.e., only for indirect calls. + if (!CalleeName.empty()) + return nullptr; uint64_t MaxTotalSamples = 0; const FunctionSamples *R = nullptr; for (const auto &NameFS : iter->second) diff --git a/llvm/include/llvm/ProfileData/SampleProfReader.h b/llvm/include/llvm/ProfileData/SampleProfReader.h --- a/llvm/include/llvm/ProfileData/SampleProfReader.h +++ b/llvm/include/llvm/ProfileData/SampleProfReader.h @@ -358,6 +358,15 @@ return getSamplesFor(CanonName); } + /// Return the samples for \p F, keyed by its canonical name in the reader's + /// format; an empty FunctionSamples is created if none exists yet. + FunctionSamples *getOrCreateSamplesFor(const Function &F) { + std::string FGUID; + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + CanonName = getRepInFormat(CanonName, getFormat(), FGUID); + return &Profiles[CanonName]; + } + /// Return the samples collected for function \p F. virtual FunctionSamples *getSamplesFor(StringRef Fname) { if (Remapper) { diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -137,6 +137,11 @@ cl::desc("For symbols in profile symbol list, regard their profiles to " "be accurate. It may be overriden by profile-sample-accurate. 
")); +static cl::opt<bool> ProfileMergeInlinee( + "sample-profile-merge-inlinee", cl::Hidden, cl::init(false), + cl::desc("Merge past inlinee's profile to outline version if sample " + "profile loader decided not to inline a call site.")); + namespace { using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>; @@ -1008,9 +1013,26 @@ if (!Callee || Callee->isDeclaration()) continue; const FunctionSamples *FS = Pair.getSecond(); - auto pair = - notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); - pair.first->second.entryCount += FS->getEntrySamples(); + if (FS->getTotalSamples() == 0 && FS->getEntrySamples() == 0) { + continue; + } + + if (ProfileMergeInlinee) { + // Use entry samples as head samples during the merge, as inlinees + // don't have head samples. + assert(FS->getHeadSamples() == 0 && "Expect 0 head sample for inlinee"); + const_cast<FunctionSamples *>(FS)->addHeadSamples(FS->getEntrySamples()); + + // Note that we have to do the merge right after processing function. + // This allows OutlineFS's profile to be used for annotation during + // top-down processing of functions' annotation. 
+ FunctionSamples *OutlineFS = Reader->getOrCreateSamplesFor(*Callee); + OutlineFS->merge(*FS); + } else { + auto pair = + notInlinedCallInfo.try_emplace(Callee, NotInlinedProfileInfo{0}); + pair.first->second.entryCount += FS->getEntrySamples(); + } } return Changed; } diff --git a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo --- a/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo +++ b/llvm/test/CodeGen/X86/insert-prefetch-inline.afdo @@ -1,4 +1,4 @@ caller:0:0 - 2:sum:0 + 2: sum:0 3: 0 __prefetch_nta_0:23456 3.1: 0 __prefetch_nta_0:8764 __prefetch_nta_1:64 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/Inputs/einline.prof b/llvm/test/Transforms/SampleProfile/Inputs/einline.prof --- a/llvm/test/Transforms/SampleProfile/Inputs/einline.prof +++ b/llvm/test/Transforms/SampleProfile/Inputs/einline.prof @@ -1,7 +1,7 @@ _Z3foov:200:100 - 1: _Z3barv:0 + 1: _ZL3barv:0 2: no_inline:100 - 3: _Z3barv:100 + 3: _ZL3barv:100 recursive:200:100 1: recursive:100 2: recursive:100 diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof --- a/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-callee-update.prof @@ -4,7 +4,7 @@ 1: direct_leaf_func:35000 11: 3000 test_cgscc_inline:63067:0 - 1: sample_loader_inlinee:1 + 1: cgscc_inlinee:1 cgscc_inlinee:3000:0 1: direct_leaf_func:35000 11: 3000 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof b/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/inline-mergeprof.prof @@ -0,0 +1,13 @@ +main:225715:0 + 2.1: 5553 + 3: 5391 + 3.1: _Z3sumii:46 + 1: 23 + 2: _Z3subii:2 + 1: 2 + 3: 21 + +_Z3sumii:11:22 + 1: 11 + 2: 10 _Z3subii:10 + 3: 1 \ No 
newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/inline-mergeprof.ll @@ -0,0 +1,97 @@ +; Test we lose details of not inlined profile without '-sample-profile-merge-inlinee' +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -S | FileCheck -check-prefix=SCALE %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -S | FileCheck -check-prefix=SCALE %s + +; Test we properly merge not inlined profile with '-sample-profile-merge-inlinee' +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee -S | FileCheck -check-prefix=MERGE %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/inline-mergeprof.prof -sample-profile-merge-inlinee -S | FileCheck -check-prefix=MERGE %s + +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +define i32 @main() !dbg !6 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + %tmp = load i32, i32* %i, align 4, !dbg !8 + %tmp1 = load i32, i32* %s, align 4, !dbg !8 + %call = call i32 @_Z3sumii(i32 %tmp, i32 %tmp1), !dbg !8 +; SCALE: call i32 @_Z3sumii +; MERGE: call i32 @_Z3sumii + store i32 %call, i32* %s, align 4, !dbg !8 + ret i32 0, !dbg !11 +} + +define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !12 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %tmp = load i32, i32* %x.addr, align 4, !dbg !13 + %tmp1 = load i32, i32* %y.addr, align 4, !dbg !13 + %add = add nsw i32 %tmp, %tmp1, !dbg !13 + %tmp2 = load i32, i32* %x.addr, align 4, !dbg !13 + %tmp3 = load i32, i32* %y.addr, align 4, !dbg !13 + %cmp1 = icmp ne i32 %tmp3, 100, !dbg !13 + br i1 
%cmp1, label %if.then, label %if.else, !dbg !13 + +if.then: ; preds = %entry + %call = call i32 @_Z3subii(i32 %tmp2, i32 %tmp3), !dbg !14 + ret i32 %add, !dbg !14 + +if.else: ; preds = %entry + ret i32 %add, !dbg !15 +} + +define i32 @_Z3subii(i32 %x, i32 %y) !dbg !16 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %tmp = load i32, i32* %x.addr, align 4, !dbg !17 + %tmp1 = load i32, i32* %y.addr, align 4, !dbg !17 + %add = sub nsw i32 %tmp, %tmp1, !dbg !17 + ret i32 %add, !dbg !18 +} + +declare i32 @printf(i8*, ...) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4} +!llvm.ident = !{!5} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.5 ", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 1, !"Debug Info Version", i32 3} +!5 = !{!"clang version 3.5 "} +!6 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 7, type: !7, scopeLine: 7, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!7 = !DISubroutineType(types: !2) +!8 = !DILocation(line: 10, scope: !9) +!9 = !DILexicalBlockFile(scope: !10, file: !1, discriminator: 2) +!10 = distinct !DILexicalBlock(scope: !6, file: !1, line: 10) +!11 = !DILocation(line: 12, scope: !6) +!12 = distinct !DISubprogram(name: "sum", scope: !1, file: !1, line: 3, type: !7, scopeLine: 3, virtualIndex: 6, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!13 = !DILocation(line: 4, scope: !12) +!14 = !DILocation(line: 5, scope: !12) +!15 = !DILocation(line: 6, scope: !12) +!16 = distinct !DISubprogram(name: "sub", scope: !1, file: !1, line: 20, type: !7, scopeLine: 20, virtualIndex: 6, flags: 
DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!17 = !DILocation(line: 20, scope: !16) +!18 = !DILocation(line: 21, scope: !16) + +; SCALE: name: "sum" +; SCALE-NEXT: {!"function_entry_count", i64 46} +; SCALE: !{!"branch_weights", i32 11, i32 2} +; SCALE: !{!"branch_weights", i64 20} +; SCALE: name: "sub" +; SCALE-NEXT: {!"function_entry_count", i64 -1} + +; MERGE: name: "sum" +; MERGE-NEXT: {!"function_entry_count", i64 46} +; MERGE: !{!"branch_weights", i32 11, i32 23} +; MERGE: !{!"branch_weights", i32 10} +; MERGE: name: "sub" +; MERGE-NEXT: {!"function_entry_count", i64 3} \ No newline at end of file