Index: llvm/trunk/include/llvm/ProfileData/SampleProfReader.h =================================================================== --- llvm/trunk/include/llvm/ProfileData/SampleProfReader.h +++ llvm/trunk/include/llvm/ProfileData/SampleProfReader.h @@ -330,6 +330,10 @@ return nullptr; }; + /// It includes all the names that have samples either in outline instance + /// or inline instance. + virtual std::vector *getNameTable() { return nullptr; } + protected: /// Map every function to its associated profile. /// @@ -387,6 +391,10 @@ /// Read sample profiles from the associated file. std::error_code read() override; + /// It includes all the names that have samples either in outline instance + /// or inline instance. + virtual std::vector *getNameTable() override { return &NameTable; } + protected: /// Read a numeric value of type T from the profile. /// Index: llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp =================================================================== --- llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp +++ llvm/trunk/lib/Transforms/IPO/SampleProfile.cpp @@ -139,9 +139,11 @@ using BlockEdgeMap = DenseMap>; +class SampleProfileLoader; + class SampleCoverageTracker { public: - SampleCoverageTracker() = default; + SampleCoverageTracker(SampleProfileLoader &SPL) : SPLoader(SPL){}; bool markSamplesUsed(const FunctionSamples *FS, uint32_t LineOffset, uint32_t Discriminator, uint64_t Samples); @@ -187,6 +189,8 @@ /// keyed by FunctionSamples pointers, but these stats are cleared after /// every function, so we just need to keep a single counter. 
uint64_t TotalUsedSamples = 0; + + SampleProfileLoader &SPLoader; }; class GUIDToFuncNameMapper { @@ -269,8 +273,9 @@ std::function GetAssumptionCache, std::function GetTargetTransformInfo) : GetAC(std::move(GetAssumptionCache)), - GetTTI(std::move(GetTargetTransformInfo)), Filename(Name), - RemappingFilename(RemapName), IsThinLTOPreLink(IsThinLTOPreLink) {} + GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this), + Filename(Name), RemappingFilename(RemapName), + IsThinLTOPreLink(IsThinLTOPreLink) {} bool doInitialization(Module &M); bool runOnModule(Module &M, ModuleAnalysisManager *AM, @@ -279,6 +284,8 @@ void dump() { Reader->dump(); } protected: + friend class SampleCoverageTracker; + bool runOnFunction(Function &F, ModuleAnalysisManager *AM); unsigned getFunctionLoc(Function &F); bool emitAnnotations(Function &F); @@ -307,6 +314,8 @@ bool propagateThroughEdges(Function &F, bool UpdateBlockCount); void computeDominanceAndLoopInfo(Function &F); void clearFunctionData(); + bool callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI); /// Map basic blocks to their computed weights. /// @@ -404,6 +413,13 @@ // GUIDToFuncNameMap saves the mapping from GUID to the symbol name, for // all the function symbols defined or declared in current module. DenseMap GUIDToFuncNameMap; + + // All the Names used in FunctionSamples including outline function + // names, inline instance names and call target names. + StringSet<> NamesInProfile; + + // Showing whether ProfileSampleAccurate is enabled for current function. + bool ProfSampleAccEnabled = false; }; class SampleProfileLoaderLegacyPass : public ModulePass { @@ -459,14 +475,22 @@ /// To decide whether an inlined callsite is hot, we compare the callsite /// sample count with the hot cutoff computed by ProfileSummaryInfo, it is /// regarded as hot if the count is above the cutoff value. 
-static bool callsiteIsHot(const FunctionSamples *CallsiteFS, - ProfileSummaryInfo *PSI) { +/// +/// When profile-sample-accurate is enabled, functions without profile will +/// be regarded as cold and much less inlining will happen in CGSCC inlining +/// pass, so we tend to lower the hot criteria here to allow more early +/// inlining to happen for warm callsites and it is helpful for performance. +bool SampleProfileLoader::callsiteIsHot(const FunctionSamples *CallsiteFS, + ProfileSummaryInfo *PSI) { if (!CallsiteFS) return false; // The callsite was not inlined in the original binary. assert(PSI && "PSI is expected to be non null"); uint64_t CallsiteTotalSamples = CallsiteFS->getTotalSamples(); - return PSI->isHotCount(CallsiteTotalSamples); + if (ProfSampleAccEnabled) + return !PSI->isColdCount(CallsiteTotalSamples); + else + return PSI->isHotCount(CallsiteTotalSamples); } /// Mark as used the sample record for the given function samples at @@ -503,7 +527,7 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countUsedRecords(CalleeSamples, PSI); } @@ -522,7 +546,7 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Count += countBodyRecords(CalleeSamples, PSI); } @@ -543,7 +567,7 @@ for (const auto &I : FS->getCallsiteSamples()) for (const auto &J : I.second) { const FunctionSamples *CalleeSamples = &J.second; - if (callsiteIsHot(CalleeSamples, PSI)) + if (SPLoader.callsiteIsHot(CalleeSamples, PSI)) Total += countBodySamples(CalleeSamples, PSI); } @@ -1643,6 +1667,12 @@ ProfileIsValid = (Reader->read() == sampleprof_error::success); PSL = Reader->getProfileSymbolList(); + if (ProfileSampleAccurate) { + 
NamesInProfile.clear(); + if (auto NameTable = Reader->getNameTable()) + NamesInProfile.insert(NameTable->begin(), NameTable->end()); + } + if (!RemappingFilename.empty()) { // Apply profile remappings to the loaded profile data if requested. // For now, we only support remapping symbols encoded using the Itanium @@ -1733,17 +1763,36 @@ // conservatively by getEntryCount as the same as unknown (None). This is // to avoid newly added code to be treated as cold. If we have samples // this will be overwritten in emitAnnotations. - // - // PSL -- profile symbol list include all the symbols in sampled binary. - // If ProfileSampleAccurate is true or F has profile-sample-accurate - // attribute, and if there is no profile symbol list read in, initialize - // all the function entry counts to 0; if there is profile symbol list, only - // initialize the entry count to 0 when current function is in the list. - uint64_t initialEntryCount = - ((ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate")) && - (!PSL || PSL->contains(F.getName()))) - ? 0 - : -1; + uint64_t initialEntryCount = -1; + + ProfSampleAccEnabled = + ProfileSampleAccurate || F.hasFnAttribute("profile-sample-accurate"); + if (ProfSampleAccEnabled) { + // PSL -- profile symbol list includes all the symbols in sampled binary. + // It is used to prevent new functions from being treated as cold. + // If ProfileSampleAccurate is true or F has profile-sample-accurate + // attribute, and if there is no profile symbol list read in, initialize + // all the function entry counts to 0; if there is profile symbol list, only + // initialize the entry count to 0 when current function is in the list. + if (!PSL || PSL->contains(F.getName())) + initialEntryCount = 0; + + // When ProfileSampleAccurate is true, a function without samples will be + // regarded as cold. 
To minimize the potential negative performance + // impact it could have, we want to be a little conservative here: + // if a function shows up in the profile, whether as an outline + // function, an inline instance or a call target, treat the function as not + // being cold. This will handle cases where most callsites of a + // function are inlined in sampled binary but not inlined in current + // build (because of source code drift, imprecise debug information, or + // the callsites are all cold individually but not cold accumulatively...), + // so the outline function showing up as cold in sampled binary will + // actually not be cold after current build. + StringRef CanonName = FunctionSamples::getCanonicalFnName(F); + if (NamesInProfile.count(CanonName)) + initialEntryCount = -1; + } + F.setEntryCount(ProfileCount(initialEntryCount, Function::PCT_Real)); std::unique_ptr OwnedORE; if (AM) { Index: llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll =================================================================== --- llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll +++ llvm/trunk/test/Transforms/SampleProfile/profile-sample-accurate.ll @@ -0,0 +1,118 @@ +; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profsampleacc.extbinary.afdo -profile-summary-cutoff-hot=600000 -profile-sample-accurate -S | FileCheck %s +; With the hot cutoff being set to 600000, the inline instance of _Z3sumii +; in main is neither hot nor cold. Check it will still be inlined when +; profile-sample-accurate is enabled, and check _Z3sumii's function entry +; count won't be initialized to 0 because it shows up in the profile as +; an inline instance. 
+ +; Original C++ test case +; +; #include <stdio.h> +; +; int sum(int x, int y) { +; return x + y; +; } +; +; int main() { +; int s, i = 0; +; while (i++ < 20000 * 20000) +; if (i != 100) s = sum(i, s); else s = 30; +; printf("sum is %d\n", s); +; return 0; +; } +; +@.str = private unnamed_addr constant [11 x i8] c"sum is %d\0A\00", align 1 + +; CHECK: define i32 @_Z3sumii{{.*}}!prof ![[UNKNOWN_ID:[0-9]+]] +; Function Attrs: nounwind uwtable +define i32 @_Z3sumii(i32 %x, i32 %y) !dbg !4 { +entry: + %x.addr = alloca i32, align 4 + %y.addr = alloca i32, align 4 + store i32 %x, i32* %x.addr, align 4 + store i32 %y, i32* %y.addr, align 4 + %0 = load i32, i32* %x.addr, align 4, !dbg !11 + %1 = load i32, i32* %y.addr, align 4, !dbg !11 + %add = add nsw i32 %0, %1, !dbg !11 + ret i32 %add, !dbg !11 +} + +; Function Attrs: uwtable +define i32 @main() !dbg !7 { +entry: + %retval = alloca i32, align 4 + %s = alloca i32, align 4 + %i = alloca i32, align 4 + store i32 0, i32* %retval + store i32 0, i32* %i, align 4, !dbg !12 + br label %while.cond, !dbg !13 + +while.cond: ; preds = %if.end, %entry + %0 = load i32, i32* %i, align 4, !dbg !14 + %inc = add nsw i32 %0, 1, !dbg !14 + store i32 %inc, i32* %i, align 4, !dbg !14 + %cmp = icmp slt i32 %0, 400000000, !dbg !14 + br i1 %cmp, label %while.body, label %while.end, !dbg !14 + +while.body: ; preds = %while.cond + %1 = load i32, i32* %i, align 4, !dbg !16 + %cmp1 = icmp ne i32 %1, 100, !dbg !16 + br i1 %cmp1, label %if.then, label %if.else, !dbg !16 + +; Check _Z3sumii is inlined at this callsite. 
+; CHECK: if.then: +; CHECK-NOT: call i32 @_Z3sumii +; CHECK: if.else: +if.then: ; preds = %while.body + %2 = load i32, i32* %i, align 4, !dbg !18 + %3 = load i32, i32* %s, align 4, !dbg !18 + %call = call i32 @_Z3sumii(i32 %2, i32 %3), !dbg !18 + store i32 %call, i32* %s, align 4, !dbg !18 + br label %if.end, !dbg !18 + +if.else: ; preds = %while.body + store i32 30, i32* %s, align 4, !dbg !20 + br label %if.end + +if.end: ; preds = %if.else, %if.then + br label %while.cond, !dbg !22 + +while.end: ; preds = %while.cond + %4 = load i32, i32* %s, align 4, !dbg !24 + %call2 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([11 x i8], [11 x i8]* @.str, i32 0, i32 0), i32 %4), !dbg !24 + ret i32 0, !dbg !25 +} + +declare i32 @printf(i8*, ...) #2 + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} +!llvm.ident = !{!10} + +; CHECK: ![[UNKNOWN_ID]] = !{!"function_entry_count", i64 -1} +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2) +!1 = !DIFile(filename: "calls.cc", directory: ".") +!2 = !{} +!4 = distinct !DISubprogram(name: "sum", line: 3, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 3, file: !1, scope: !5, type: !6, retainedNodes: !2) +!5 = !DIFile(filename: "calls.cc", directory: ".") +!6 = !DISubroutineType(types: !2) +!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !5, type: !6, retainedNodes: !2) +!8 = !{i32 2, !"Dwarf Version", i32 4} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = !{!"clang version 3.5 "} +!11 = !DILocation(line: 4, scope: !4) +!12 = !DILocation(line: 8, scope: !7) +!13 = !DILocation(line: 9, scope: !7) +!14 = !DILocation(line: 9, scope: !15) +!15 = 
!DILexicalBlockFile(discriminator: 2, file: !1, scope: !7) +!16 = !DILocation(line: 10, scope: !17) +!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7) +!18 = !DILocation(line: 10, scope: !19) +!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17) +!20 = !DILocation(line: 10, scope: !21) +!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17) +!22 = !DILocation(line: 10, scope: !23) +!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17) +!24 = !DILocation(line: 11, scope: !7) +!25 = !DILocation(line: 12, scope: !7)