diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h
--- a/llvm/include/llvm/ProfileData/SampleProf.h
+++ b/llvm/include/llvm/ProfileData/SampleProf.h
@@ -187,7 +187,10 @@
   /// SecFlagPartial means the profile is for common/shared code.
   /// The common profile is usually merged from profiles collected
   /// from running other targets.
-  SecFlagPartial = (1 << 0)
+  SecFlagPartial = (1 << 0),
+  /// SecFlagFullContext means this is a context-sensitive profile for
+  /// CSSPGO.
+  SecFlagFullContext = (1 << 1)
 };
 
 enum class SecFuncMetadataFlags : uint32_t {
@@ -730,7 +733,7 @@
   /// corresponding function is no less than \p Threshold, add its corresponding
   /// GUID to \p S. Also traverse the BodySamples to add hot CallTarget's GUID
   /// to \p S.
-  void findInlinedFunctions(DenseSet<GlobalValue::GUID> &S, const Module *M,
+  void findInlinedFunctions(DenseSet<GlobalValue::GUID> &S,
                             const StringMap<Function *> &SymbolMap,
                             uint64_t Threshold) const {
     if (TotalSamples <= Threshold)
@@ -753,7 +756,7 @@
     }
     for (const auto &CS : CallsiteSamples)
       for (const auto &NameFS : CS.second)
-        NameFS.second.findInlinedFunctions(S, M, SymbolMap, Threshold);
+        NameFS.second.findInlinedFunctions(S, SymbolMap, Threshold);
   }
 
   /// Set the name of the function.
diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
--- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
+++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h
@@ -115,6 +115,8 @@
                                      bool MergeContext = true);
   // Query base profile for a given function by name.
   FunctionSamples *getBaseSamplesFor(StringRef Name, bool MergeContext);
+  // Retrieve the context trie node for given profile context
+  ContextTrieNode *getContextFor(const SampleContext &Context);
   // Mark a context profile as inlined when function is inlined.
   // This makes sure that inlined context profile will be excluded in
   // function's base profile.
@@ -127,7 +129,6 @@
 
 private:
   ContextTrieNode *getContextFor(const DILocation *DIL);
-  ContextTrieNode *getContextFor(const SampleContext &Context);
   ContextTrieNode *getCalleeContextFor(const DILocation *DIL,
                                        StringRef CalleeName);
   ContextTrieNode *getOrCreateContextPath(const SampleContext &Context,
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -38,6 +38,7 @@
 #include <cstdint>
 #include <limits>
 #include <memory>
+#include <set>
 #include <system_error>
 #include <vector>
 
@@ -577,6 +578,8 @@
       return EC;
     if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial))
       Summary->setPartialProfile(true);
+    if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext))
+      FunctionSamples::ProfileIsCS = ProfileIsCS = true;
     break;
   case SecNameTable: {
     FixedLengthMD5 =
@@ -687,6 +690,46 @@
       if (std::error_code EC = readFuncProfile(FuncProfileAddr))
         return EC;
     }
+  } else if (FunctionSamples::ProfileIsCS) {
+    // Compute the ordered set of names, so we can
+    // get all context profiles under a subtree by
+    // iterating through the ordered names.
+    struct Comparer {
+      // Ignore the closing ']' when ordering context
+      bool operator()(const StringRef &L, const StringRef &R) const {
+        return L.substr(0, L.size() - 1) < R.substr(0, R.size() - 1);
+      }
+    };
+    std::set<StringRef, Comparer> OrderedNames;
+    for (auto Name : FuncOffsetTable) {
+      OrderedNames.insert(Name.first);
+    }
+
+    // For each function in current module, load all
+    // context profiles for the function.
+    for (auto NameOffset : FuncOffsetTable) {
+      StringRef ContextName = NameOffset.first;
+      SampleContext FContext(ContextName);
+      auto FuncName = FContext.getNameWithoutContext();
+      if (!FuncsToUse.count(FuncName) &&
+          (!Remapper || !Remapper->exist(FuncName)))
+        continue;
+
+      // For each context profile we need, try to load
+      // all context profile in the subtree. This can
+      // help profile guided importing for ThinLTO.
+      auto It = OrderedNames.find(ContextName);
+      while (It != OrderedNames.end() &&
+             It->startswith(ContextName.substr(0, ContextName.size() - 1))) {
+        const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It];
+        assert(FuncProfileAddr < End && "out of LBRProfile section");
+        if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+          return EC;
+        // Remove loaded context profile so we won't
+        // load it repeatedly.
+        OrderedNames.erase(It++);
+      }
+    }
   } else {
     for (auto NameOffset : FuncOffsetTable) {
       SampleContext FContext(NameOffset.first);
@@ -704,8 +747,8 @@
   }
   assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) &&
          "Cannot have both context-sensitive and regular profile");
-  ProfileIsCS = (CSProfileCount > 0);
-  FunctionSamples::ProfileIsCS = ProfileIsCS;
+  assert((CSProfileCount == 0 || ProfileIsCS) &&
+         "Section flag should be consistent with actual profile");
   return sampleprof_error::success;
 }
 
@@ -1034,6 +1077,8 @@
     case SecProfSummary:
       if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial))
         Flags.append("partial,");
+      if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext))
+        Flags.append("context,");
       break;
     default:
       break;
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -237,6 +237,8 @@
     setToCompressSection(SecProfileSymbolList);
   if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased)
     addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased);
+  if (Type == SecProfSummary && FunctionSamples::ProfileIsCS)
+    addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext);
 
   uint64_t SectionStart = markSectionStart(Type, LayoutIdx);
   switch (Type) {
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -365,6 +365,10 @@
   findFunctionSamples(const Instruction &I) const override;
   std::vector<const FunctionSamples *>
   findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
+  void findExternalInlineCandidate(const FunctionSamples *Samples,
+                                   DenseSet<GlobalValue::GUID> &InlinedGUIDs,
+                                   const StringMap<Function *> &SymbolMap,
+                                   uint64_t Threshold);
   // Attempt to promote indirect call and also inline the promoted call
   bool tryPromoteAndInlineCandidate(
       Function &F, InlineCandidate &Candidate, uint64_t SumOrigin,
@@ -922,6 +926,63 @@
   }
 }
 
+/// Collect GUIDs of functions defined outside the current module that are
+/// candidates for ThinLTO importing, based on the callee profile \p Samples.
+/// A function is added to \p InlinedGUIDs when \p SymbolMap has no definition
+/// for it; \p Threshold is the hotness cutoff applied to sample counts.
+void SampleProfileLoader::findExternalInlineCandidate(
+    const FunctionSamples *Samples, DenseSet<GlobalValue::GUID> &InlinedGUIDs,
+    const StringMap<Function *> &SymbolMap, uint64_t Threshold) {
+  assert(Samples && "expect non-null caller profile");
+
+  // For AutoFDO profile, retrieve candidate profiles by walking over
+  // the nested inlinee profiles.
+  if (!ProfileIsCS) {
+    Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold);
+    return;
+  }
+
+  // For CSSPGO profile, retrieve candidate profiles by walking over the
+  // trie built for context profile, starting from the node of \p Samples.
+  ContextTrieNode *Caller =
+      ContextTracker->getContextFor(Samples->getContext());
+  std::queue<ContextTrieNode *> CalleeList;
+  CalleeList.push(Caller);
+  while (!CalleeList.empty()) {
+    ContextTrieNode *Node = CalleeList.front();
+    CalleeList.pop();
+    // Judge each node by its own profile. Hot call targets are still taken
+    // even if the callee doesn't have a corresponding context profile.
+    FunctionSamples *CalleeSample = Node->getFunctionSamples();
+    if (!CalleeSample || CalleeSample->getEntrySamples() < Threshold)
+      continue;
+
+    StringRef Name = Node->getFuncName();
+    Function *Func = SymbolMap.lookup(Name);
+    // Add to the import list only when it's defined out of module.
+    if (!Func || Func->isDeclaration())
+      InlinedGUIDs.insert(FunctionSamples::getGUID(Name));
+
+    // Import hot CallTargets, which may not be available in IR because full
+    // profile annotation cannot be done until backend compilation in ThinLTO.
+    for (const auto &BS : CalleeSample->getBodySamples())
+      for (const auto &TS : BS.second.getCallTargets())
+        if (TS.getValue() > Threshold) {
+          StringRef CalleeName = CalleeSample->getFuncName(TS.getKey());
+          const Function *Callee = SymbolMap.lookup(CalleeName);
+          if (!Callee || Callee->isDeclaration())
+            InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeName));
+        }
+
+    // Import hot child context profile associated with callees. Note that this
+    // may have some overlap with the call target loop above, but doing this
+    // based on child context profile again effectively allows us to use the
+    // max of entry count and call target count to determine importing.
+    for (auto &Child : Node->getAllChildContext())
+      CalleeList.push(&Child.second);
+  }
+}
+
 /// Iteratively inline hot callsites of a function.
 ///
 /// Iteratively traverse all callsites of the function \p F, and find if
@@ -994,8 +1055,8 @@
         for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) {
           uint64_t SumOrigin = Sum;
           if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
-            FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), SymbolMap,
-                                     PSI->getOrCompHotCountThreshold());
+            findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap,
+                                        PSI->getOrCompHotCountThreshold());
             continue;
           }
           if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList))
@@ -1014,9 +1075,9 @@
           LocalChanged = true;
         }
       } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
-        findCalleeFunctionSamples(*I)->findInlinedFunctions(
-            InlinedGUIDs, F.getParent(), SymbolMap,
-            PSI->getOrCompHotCountThreshold());
+        findExternalInlineCandidate(findCalleeFunctionSamples(*I), InlinedGUIDs,
+                                    SymbolMap,
+                                    PSI->getOrCompHotCountThreshold());
       }
     }
     Changed |= LocalChanged;
@@ -1268,8 +1329,8 @@
         for (const auto *FS : CalleeSamples) {
           // TODO: Consider disable pre-lTO ICP for MonoLTO as well
           if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
-            FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), SymbolMap,
-                                     PSI->getOrCompHotCountThreshold());
+            findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap,
+                                        PSI->getOrCompHotCountThreshold());
             continue;
           }
           uint64_t EntryCountDistributed =
@@ -1314,9 +1375,8 @@
         Changed = true;
       }
     } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) {
-      findCalleeFunctionSamples(*I)->findInlinedFunctions(
-          InlinedGUIDs, F.getParent(), SymbolMap,
-          PSI->getOrCompHotCountThreshold());
+      findExternalInlineCandidate(Candidate.CalleeSamples, InlinedGUIDs,
+                                  SymbolMap, PSI->getOrCompHotCountThreshold());
     }
   }
 
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof
@@ -0,0 +1,27 @@
+[main]:154:2
+ 2: 12
+ 3: 18 _Z5funcAi:11
+ 3.1: 18 _Z5funcBi:19
+[main:3.1 @ _Z5funcBi]:120:7040
+ 0: 7001
+ 1: 19 _Z8funcLeafi:9999
+ 3: 12
+[main:3.1 @ _Z5funcBi @ _Z5funcBiLeaf2]:1:9010
+ 0: 7001
+ 1: 19 _Z8funcLeafi3:9999
+ 3: 12
+[main:2 @ _Z5funcAi]:99:11
+ 0: 10
+ 1: 10 _Z8funcLeafi:11
+ 2: 287864 _Z3fibi:315608
+ 3: 24
+[main:3 @ _Z5funcCi]:23254:11
+ 0: 10
+ 1: 23250
+[main:3 @ _Z5funcDi]:23:45201
+ 0: 10
+ 1: 23250
+[main:2 @ _Z5funcAi:2 @ _Z3fibi]:120:101
+ 0: 99
+ 1: 6
+ 3: 97
diff --git a/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/csspgo-import-list.ll
@@ -0,0 +1,77 @@
+; Make sure Import GUID list for ThinLTO properly set for CSSPGO
+; RUN: opt < %s -passes='thinlto-pre-link<O2>' -pgo-kind=pgo-sample-use-pipeline -sample-profile-file=%S/Inputs/csspgo-import-list.prof -S | FileCheck %s
+
+declare i32 @_Z5funcBi(i32 %x)
+declare i32 @_Z5funcAi(i32 %x)
+
+define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
+entry:
+  br label %for.body, !dbg !25
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret i32 %add3, !dbg !27
+
+for.body:                                         ; preds = %for.body, %entry
+  %x.011 = phi i32 [ 300000, %entry ], [ %dec, %for.body ]
+  %r.010 = phi i32 [ 0, %entry ], [ %add3, %for.body ]
+  %call = tail call i32 @_Z5funcBi(i32 %x.011), !dbg !32
+  %add = add nuw nsw i32 %x.011, 1, !dbg !31
+  %call1 = tail call i32 @_Z5funcAi(i32 %add), !dbg !28
+  %add2 = add i32 %call, %r.010, !dbg !34
+  %add3 = add i32 %add2, %call1, !dbg !35
+  %dec = add nsw i32 %x.011, -1, !dbg !36
+  %cmp = icmp eq i32 %x.011, 0, !dbg !38
+  br i1 %cmp, label %for.cond.cleanup, label %for.body, !dbg !25
+}
+
+; Make sure the ImportGUID stays with entry count metadata for ThinLTO-PreLink; the last GUID is the hot call target of the nested context profile.
+; CHECK: distinct !DISubprogram(name: "main"
+; CHECK: !{!"function_entry_count", i64 3, i64 446061515086924981, i64 3815895320998406042, i64 7102633082150537521, i64 {{-?[0-9]+}}}
+
+
+attributes #0 = { nofree noinline norecurse nounwind uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "frame-pointer"="none" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" "use-sample-profile" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!14, !15, !16}
+!llvm.ident = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression())
+!1 = distinct !DIGlobalVariable(name: "factor", scope: !2, file: !3, line: 21, type: !13, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "clang version 11.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !12, splitDebugInlining: false, debugInfoForProfiling: true, nameTableKind: None)
+!3 = !DIFile(filename: "merged.cpp", directory: "/local/autofdo")
+!4 = !{}
+!5 = !{!6, !10, !11}
+!6 = !DISubprogram(name: "funcA", linkageName: "_Z5funcAi", scope: !3, file: !3, line: 6, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DISubprogram(name: "funcB", linkageName: "_Z5funcBi", scope: !3, file: !3, line: 7, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!11 = !DISubprogram(name: "funcLeaf", linkageName: "_Z8funcLeafi", scope: !3, file: !3, line: 22, type: !7, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !4)
+!12 = !{!0}
+!13 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !9)
+!14 = !{i32 7, !"Dwarf Version", i32 4}
+!15 = !{i32 2, !"Debug Info Version", i32 3}
+!16 = !{i32 1, !"wchar_size", i32 4}
+!17 = !{!"clang version 11.0.0"}
+!18 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 11, type: !19, scopeLine: 11, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !2, retainedNodes: !21)
+!19 = !DISubroutineType(types: !20)
+!20 = !{!9}
+!21 = !{!22, !23}
+!22 = !DILocalVariable(name: "r", scope: !18, file: !3, line: 12, type: !9)
+!23 = !DILocalVariable(name: "x", scope: !24, file: !3, line: 13, type: !9)
+!24 = distinct !DILexicalBlock(scope: !18, file: !3, line: 13, column: 3)
+!25 = !DILocation(line: 13, column: 3, scope: !26)
+!26 = !DILexicalBlockFile(scope: !24, file: !3, discriminator: 2)
+!27 = !DILocation(line: 17, column: 3, scope: !18)
+!28 = !DILocation(line: 13, column: 10, scope: !29)
+!29 = distinct !DILexicalBlock(scope: !30, file: !3, line: 13, column: 37)
+!30 = distinct !DILexicalBlock(scope: !24, file: !3, line: 13, column: 3)
+!31 = !DILocation(line: 14, column: 29, scope: !29)
+!32 = !DILocation(line: 14, column: 21, scope: !33)
+!33 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!34 = !DILocation(line: 14, column: 19, scope: !29)
+!35 = !DILocation(line: 14, column: 7, scope: !29)
+!36 = !DILocation(line: 13, column: 33, scope: !37)
+!37 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 6)
+!38 = !DILocation(line: 13, column: 26, scope: !39)
+!39 = !DILexicalBlockFile(scope: !30, file: !3, discriminator: 2)
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.h b/llvm/tools/llvm-profgen/ProfileGenerator.h
--- a/llvm/tools/llvm-profgen/ProfileGenerator.h
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.h
@@ -65,31 +65,7 @@
       : BinarySampleCounters(Counters){};
 
 public:
-  void generateProfile() override {
-    for (const auto &BI : BinarySampleCounters) {
-      ProfiledBinary *Binary = BI.first;
-      for (const auto &CI : BI.second) {
-        const StringBasedCtxKey *CtxKey =
-            dyn_cast<StringBasedCtxKey>(CI.first.getPtr());
-        StringRef ContextId(CtxKey->Context);
-        // Get or create function profile for the range
-        FunctionSamples &FunctionProfile =
-            getFunctionProfileForContext(ContextId);
-
-        // Fill in function body samples
-        populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter,
-                                    Binary);
-        // Fill in boundary sample counts as well as call site samples for calls
-        populateFunctionBoundarySamples(ContextId, FunctionProfile,
-                                        CI.second.BranchCounter, Binary);
-      }
-    }
-    // Fill in call site value sample for inlined calls and also use context to
-    // infer missing samples. Since we don't have call count for inlined
-    // functions, we estimate it from inlinee's profile using the entry of the
-    // body sample.
-    populateInferredFunctionSamples();
-  }
+  void generateProfile() override;
 
   // Remove adjacent repeated context sequences up to a given sequence length,
   // -1 means no size limit. Note that repeated sequences are identified based
diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
--- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp
+++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp
@@ -198,6 +198,33 @@
   return Ret.first->second;
 }
 
+void CSProfileGenerator::generateProfile() {
+  FunctionSamples::ProfileIsCS = true;
+  for (const auto &BI : BinarySampleCounters) {
+    ProfiledBinary *Binary = BI.first;
+    for (const auto &CI : BI.second) {
+      const StringBasedCtxKey *CtxKey =
+          dyn_cast<StringBasedCtxKey>(CI.first.getPtr());
+      StringRef ContextId(CtxKey->Context);
+      // Get or create function profile for the range
+      FunctionSamples &FunctionProfile =
+          getFunctionProfileForContext(ContextId);
+
+      // Fill in function body samples
+      populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter,
+                                  Binary);
+      // Fill in boundary sample counts as well as call site samples for calls
+      populateFunctionBoundarySamples(ContextId, FunctionProfile,
+                                      CI.second.BranchCounter, Binary);
+    }
+  }
+  // Fill in call site value sample for inlined calls and also use context to
+  // infer missing samples. Since we don't have call count for inlined
+  // functions, we estimate it from inlinee's profile using the entry of the
+  // body sample.
+  populateInferredFunctionSamples();
+}
+
 void CSProfileGenerator::updateBodySamplesforFunctionProfile(
     FunctionSamples &FunctionProfile, const FrameLocation &LeafLoc,
     uint64_t Count) {
@@ -422,6 +449,7 @@
 void PseudoProbeCSProfileGenerator::generateProfile() {
   // Enable pseudo probe functionalities in SampleProf
   FunctionSamples::ProfileIsProbeBased = true;
+  FunctionSamples::ProfileIsCS = true;
   for (const auto &BI : BinarySampleCounters) {
     ProfiledBinary *Binary = BI.first;
     for (const auto &CI : BI.second) {