diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -187,7 +187,10 @@ /// SecFlagPartial means the profile is for common/shared code. /// The common profile is usually merged from profiles collected /// from running other targets. - SecFlagPartial = (1 << 0) + SecFlagPartial = (1 << 0), + /// SecFlagFullContext means this is a context-sensitive profile for + /// CSSPGO + SecFlagFullContext = (1 << 1) }; enum class SecFuncMetadataFlags : uint32_t { @@ -730,7 +733,7 @@ /// corresponding function is no less than \p Threshold, add its corresponding /// GUID to \p S. Also traverse the BodySamples to add hot CallTarget's GUID /// to \p S. - void findInlinedFunctions(DenseSet &S, const Module *M, + void findInlinedFunctions(DenseSet &S, const StringMap &SymbolMap, uint64_t Threshold) const { if (TotalSamples <= Threshold) @@ -753,7 +756,7 @@ } for (const auto &CS : CallsiteSamples) for (const auto &NameFS : CS.second) - NameFS.second.findInlinedFunctions(S, M, SymbolMap, Threshold); + NameFS.second.findInlinedFunctions(S, SymbolMap, Threshold); } /// Set the name of the function. diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h --- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h +++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h @@ -115,6 +115,8 @@ bool MergeContext = true); // Query base profile for a given function by name. FunctionSamples *getBaseSamplesFor(StringRef Name, bool MergeContext); + // Retrieve the context trie node for given profile context + ContextTrieNode *getContextFor(const SampleContext &Context); // Mark a context profile as inlined when function is inlined. // This makes sure that inlined context profile will be excluded in // function's base profile. 
@@ -127,7 +129,6 @@ private: ContextTrieNode *getContextFor(const DILocation *DIL); - ContextTrieNode *getContextFor(const SampleContext &Context); ContextTrieNode *getCalleeContextFor(const DILocation *DIL, StringRef CalleeName); ContextTrieNode *getOrCreateContextPath(const SampleContext &Context, diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp --- a/llvm/lib/ProfileData/SampleProfReader.cpp +++ b/llvm/lib/ProfileData/SampleProfReader.cpp @@ -38,6 +38,7 @@ #include #include #include +#include #include #include @@ -577,6 +578,8 @@ return EC; if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial)) Summary->setPartialProfile(true); + if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext)) + FunctionSamples::ProfileIsCS = ProfileIsCS = true; break; case SecNameTable: { FixedLengthMD5 = @@ -687,6 +690,46 @@ if (std::error_code EC = readFuncProfile(FuncProfileAddr)) return EC; } + } else if (FunctionSamples::ProfileIsCS) { + // Compute the ordered set of names, so we can + // get all context profiles under a subtree by + // iterating through the ordered names. + struct Comparer { + // Ignore the closing ']' when ordering context + bool operator()(const StringRef &L, const StringRef &R) const { + return L.substr(0, L.size() - 1) < R.substr(0, R.size() - 1); + } + }; + std::set OrderedNames; + for (auto Name : FuncOffsetTable) { + OrderedNames.insert(Name.first); + } + + // For each function in current module, load all + // context profiles for the function. + for (auto NameOffset : FuncOffsetTable) { + StringRef ContextName = NameOffset.first; + SampleContext FContext(ContextName); + auto FuncName = FContext.getNameWithoutContext(); + if (!FuncsToUse.count(FuncName) && + (!Remapper || !Remapper->exist(FuncName))) + continue; + + // For each context profile we need, try to load + // all context profile in the subtree. This can + // help profile guided importing for ThinLTO. 
+ auto It = OrderedNames.find(ContextName); + while (It != OrderedNames.end() && + It->startswith(ContextName.substr(0, ContextName.size() - 1))) { + const uint8_t *FuncProfileAddr = Start + FuncOffsetTable[*It]; + assert(FuncProfileAddr < End && "out of LBRProfile section"); + if (std::error_code EC = readFuncProfile(FuncProfileAddr)) + return EC; + // Remove loaded context profile so we won't + // load it repeatedly. + It = OrderedNames.erase(It); + } + } } else { for (auto NameOffset : FuncOffsetTable) { SampleContext FContext(NameOffset.first); @@ -704,8 +747,8 @@ } assert((CSProfileCount == 0 || CSProfileCount == Profiles.size()) && "Cannot have both context-sensitive and regular profile"); - ProfileIsCS = (CSProfileCount > 0); - FunctionSamples::ProfileIsCS = ProfileIsCS; + assert(ProfileIsCS == (CSProfileCount > 0) && + "Section flag should be consistent with actual profile"); return sampleprof_error::success; } @@ -1034,6 +1077,8 @@ case SecProfSummary: if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial)) Flags.append("partial,"); + if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagFullContext)) + Flags.append("context,"); break; default: break; diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp --- a/llvm/lib/ProfileData/SampleProfWriter.cpp +++ b/llvm/lib/ProfileData/SampleProfWriter.cpp @@ -237,6 +237,8 @@ setToCompressSection(SecProfileSymbolList); if (Type == SecFuncMetadata && FunctionSamples::ProfileIsProbeBased) addSectionFlag(SecFuncMetadata, SecFuncMetadataFlags::SecFlagIsProbeBased); + if (Type == SecProfSummary && FunctionSamples::ProfileIsCS) + addSectionFlag(SecProfSummary, SecProfSummaryFlags::SecFlagFullContext); uint64_t SectionStart = markSectionStart(Type, LayoutIdx); switch (Type) { diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -365,6 
+365,10 @@ findFunctionSamples(const Instruction &I) const override; std::vector findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const; + void findExternalInlineCandidate(const FunctionSamples *Samples, + DenseSet &InlinedGUIDs, + const StringMap &SymbolMap, + uint64_t Threshold); // Attempt to promote indirect call and also inline the promoted call bool tryPromoteAndInlineCandidate( Function &F, InlineCandidate &Candidate, uint64_t SumOrigin, @@ -922,6 +926,60 @@ } } +void SampleProfileLoader::findExternalInlineCandidate( + const FunctionSamples *Samples, DenseSet &InlinedGUIDs, + const StringMap &SymbolMap, uint64_t Threshold) { + assert(Samples && "expect non-null caller profile"); + + // For AutoFDO profile, retrieve candidate profiles by walking over + // the nested inlinee profiles. + if (!ProfileIsCS) { + Samples->findInlinedFunctions(InlinedGUIDs, SymbolMap, Threshold); + return; + } + + ContextTrieNode *Caller = + ContextTracker->getContextFor(Samples->getContext()); + std::queue CalleeList; + CalleeList.push(Caller); + while (!CalleeList.empty()) { + ContextTrieNode *Node = CalleeList.front(); + CalleeList.pop(); + FunctionSamples *CalleeSample = Node->getFunctionSamples(); + // For CSSPGO profile, retrieve candidate profile by walking over the + // trie built for context profile. Note that we also take call targets + // even if callee doesn't have a corresponding context profile. + if (!CalleeSample || CalleeSample->getEntrySamples() < Threshold) + continue; + + StringRef Name = CalleeSample->getFuncName(); + Function *Func = SymbolMap.lookup(Name); + // Add to the import list only when it's defined out of module. + if (!Func || Func->isDeclaration()) + InlinedGUIDs.insert(FunctionSamples::getGUID(Name)); + + // Import hot CallTargets, which may not be available in IR because full + // profile annotation cannot be done until backend compilation in ThinLTO. 
+ for (const auto &BS : CalleeSample->getBodySamples()) + for (const auto &TS : BS.second.getCallTargets()) + if (TS.getValue() > Threshold) { + StringRef CalleeName = CalleeSample->getFuncName(TS.getKey()); + const Function *Callee = SymbolMap.lookup(CalleeName); + if (!Callee || Callee->isDeclaration()) + InlinedGUIDs.insert(FunctionSamples::getGUID(CalleeName)); + } + + // Import hot child context profile associated with callees. Note that this + // may have some overlap with the call target loop above, but doing this + // based on child context profile again effectively allows us to use the max + // of entry count and call target count to determine importing. + for (auto &Child : Node->getAllChildContext()) { + ContextTrieNode *CalleeNode = &Child.second; + CalleeList.push(CalleeNode); + } + } +} + /// Iteratively inline hot callsites of a function. /// /// Iteratively traverse all callsites of the function \p F, and find if @@ -994,8 +1052,8 @@ for (const auto *FS : findIndirectCallFunctionSamples(*I, Sum)) { uint64_t SumOrigin = Sum; if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), SymbolMap, - PSI->getOrCompHotCountThreshold()); + findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap, + PSI->getOrCompHotCountThreshold()); continue; } if (!callsiteIsHot(FS, PSI, ProfAccForSymsInList)) @@ -1014,9 +1072,9 @@ LocalChanged = true; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findCalleeFunctionSamples(*I)->findInlinedFunctions( - InlinedGUIDs, F.getParent(), SymbolMap, - PSI->getOrCompHotCountThreshold()); + findExternalInlineCandidate(findCalleeFunctionSamples(*I), InlinedGUIDs, + SymbolMap, + PSI->getOrCompHotCountThreshold()); } } Changed |= LocalChanged; @@ -1268,8 +1326,8 @@ for (const auto *FS : CalleeSamples) { // TODO: Consider disable pre-lTO ICP for MonoLTO as well if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - FS->findInlinedFunctions(InlinedGUIDs, F.getParent(), 
SymbolMap, - PSI->getOrCompHotCountThreshold()); + findExternalInlineCandidate(FS, InlinedGUIDs, SymbolMap, + PSI->getOrCompHotCountThreshold()); continue; } uint64_t EntryCountDistributed = @@ -1314,9 +1372,8 @@ Changed = true; } } else if (LTOPhase == ThinOrFullLTOPhase::ThinLTOPreLink) { - findCalleeFunctionSamples(*I)->findInlinedFunctions( - InlinedGUIDs, F.getParent(), SymbolMap, - PSI->getOrCompHotCountThreshold()); + findExternalInlineCandidate(Candidate.CalleeSamples, InlinedGUIDs, + SymbolMap, PSI->getOrCompHotCountThreshold()); } } diff --git a/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof @@ -0,0 +1,27 @@ +[main]:154:2 + 2: 12 + 3: 18 _Z5funcAi:11 + 3.1: 18 _Z5funcBi:19 +[main:3.1 @ _Z5funcBi]:120:7040 + 0: 7001 + 1: 19 _Z8funcLeafi:9999 + 3: 12 +[main:3.1 @ _Z5funcBi @ _Z5funcBiLeaf2]:1:9010 + 0: 7001 + 1: 19 _Z8funcLeafi3:9999 + 3: 12 +[main:2 @ _Z5funcAi]:99:11 + 0: 10 + 1: 10 _Z8funcLeafi:11 + 2: 287864 _Z3fibi:315608 + 3: 24 +[main:3 @ _Z5funcCi]:23254:11 + 0: 10 + 1: 23250 +[main:3 @ _Z5funcDi]:23:45201 + 0: 10 + 1: 23250 +[main:2 @ _Z5funcAi:2 @ _Z3fibi]:120:101 + 0: 99 + 1: 6 + 3: 97 diff --git a/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof.extbin b/llvm/test/Transforms/SampleProfile/Inputs/csspgo-import-list.prof.extbin new file mode 100644 index 0000000000000000000000000000000000000000..0000000000000000000000000000000000000000 GIT binary patch literal 0 Hc$@(CI.first.getPtr()); - StringRef ContextId(CtxKey->Context); - // Get or create function profile for the range - FunctionSamples &FunctionProfile = - getFunctionProfileForContext(ContextId); - - // Fill in function body samples - populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter, - Binary); - // Fill in boundary sample counts as well as call site samples for 
calls - populateFunctionBoundarySamples(ContextId, FunctionProfile, - CI.second.BranchCounter, Binary); - } - } - // Fill in call site value sample for inlined calls and also use context to - // infer missing samples. Since we don't have call count for inlined - // functions, we estimate it from inlinee's profile using the entry of the - // body sample. - populateInferredFunctionSamples(); - } + void generateProfile() override; // Remove adjacent repeated context sequences up to a given sequence length, // -1 means no size limit. Note that repeated sequences are identified based diff --git a/llvm/tools/llvm-profgen/ProfileGenerator.cpp b/llvm/tools/llvm-profgen/ProfileGenerator.cpp --- a/llvm/tools/llvm-profgen/ProfileGenerator.cpp +++ b/llvm/tools/llvm-profgen/ProfileGenerator.cpp @@ -198,6 +198,33 @@ return Ret.first->second; } +void CSProfileGenerator::generateProfile() { + FunctionSamples::ProfileIsCS = true; + for (const auto &BI : BinarySampleCounters) { + ProfiledBinary *Binary = BI.first; + for (const auto &CI : BI.second) { + const StringBasedCtxKey *CtxKey = + dyn_cast(CI.first.getPtr()); + StringRef ContextId(CtxKey->Context); + // Get or create function profile for the range + FunctionSamples &FunctionProfile = + getFunctionProfileForContext(ContextId); + + // Fill in function body samples + populateFunctionBodySamples(FunctionProfile, CI.second.RangeCounter, + Binary); + // Fill in boundary sample counts as well as call site samples for calls + populateFunctionBoundarySamples(ContextId, FunctionProfile, + CI.second.BranchCounter, Binary); + } + } + // Fill in call site value sample for inlined calls and also use context to + // infer missing samples. Since we don't have call count for inlined + // functions, we estimate it from inlinee's profile using the entry of the + // body sample. 
+ populateInferredFunctionSamples(); +} + void CSProfileGenerator::updateBodySamplesforFunctionProfile( FunctionSamples &FunctionProfile, const FrameLocation &LeafLoc, uint64_t Count) { @@ -422,6 +449,7 @@ void PseudoProbeCSProfileGenerator::generateProfile() { // Enable pseudo probe functionalities in SampleProf FunctionSamples::ProfileIsProbeBased = true; + FunctionSamples::ProfileIsCS = true; for (const auto &BI : BinarySampleCounters) { ProfiledBinary *Binary = BI.first; for (const auto &CI : BI.second) {