diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h --- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h +++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h @@ -13,6 +13,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/ProfileData/SampleProf.h" +#include "llvm/ProfileData/SampleProfReader.h" #include "llvm/Transforms/IPO/SampleContextTracker.h" #include #include @@ -40,6 +41,34 @@ class ProfiledCallGraph { public: using iterator = std::set::iterator; + + // Constructor for non-CS profile. + ProfiledCallGraph(SampleProfileReader &Reader) { + assert(!Reader.profileIsCS() && "CS profile is not handled here"); + for (const auto &Samples : Reader.getProfiles()) { + addProfiledCallEdges(Samples.second); + } + } + + // Constructor for CS profile. + ProfiledCallGraph(SampleContextTracker &ContextTracker) { + // BFS traverse the context profile trie to add call edges for calls shown + // in context. + std::queue Queue; + Queue.push(&ContextTracker.getRootContext()); + while (!Queue.empty()) { + ContextTrieNode *Node = Queue.front(); + addProfiledFunction(Node->getFuncName()); + Queue.pop(); + for (auto &I : Node->getAllChildContext()) { + ContextTrieNode *ChildNode = &I.second; + addProfiledFunction(ChildNode->getFuncName()); + Queue.push(ChildNode); + addProfiledCall(Node->getFuncName(), ChildNode->getFuncName()); + } + } + } + ProfiledCallGraph(StringMap &ProfileMap, SampleContextTracker &ContextTracker) { // Add all profiled functions into profiled call graph. @@ -89,6 +118,7 @@ ProfiledFunctions[Name] = ProfiledCallGraphNode(Name); } } + void addProfiledCall(StringRef CallerName, StringRef CalleeName) { assert(ProfiledFunctions.count(CallerName)); auto CalleeIt = ProfiledFunctions.find(CalleeName); @@ -98,6 +128,25 @@ ProfiledFunctions[CallerName].Callees.insert(&CalleeIt->second); } + void addProfiledCallEdges(const FunctionSamples &Samples) { + addProfiledFunction(Samples.getFuncName()); + + for (const auto &Sample : Samples.getBodySamples()) { + for (const auto &Target : Sample.second.getCallTargets()) { + addProfiledFunction(Target.first()); + addProfiledCall(Samples.getFuncName(), Target.first()); + } + } + + for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { + for (const auto &InlinedSamples : CallsiteSamples.second) { + addProfiledFunction(InlinedSamples.first); + addProfiledCall(Samples.getFuncName(), InlinedSamples.first); + addProfiledCallEdges(InlinedSamples.second); + } + } + } + private: ProfiledCallGraphNode Root; StringMap ProfiledFunctions; diff --git a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h --- a/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h +++ b/llvm/include/llvm/Transforms/IPO/SampleContextTracker.h @@ -124,7 +124,6 @@ ContextTrieNode &getRootContext(); void promoteMergeContextSamplesTree(const Instruction &Inst, StringRef CalleeName); - void addCallGraphEdges(CallGraph &CG, StringMap &SymbolMap); // Dump the internal context profile trie. void dump(); diff --git a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp --- a/llvm/lib/Transforms/IPO/SampleContextTracker.cpp +++ b/llvm/lib/Transforms/IPO/SampleContextTracker.cpp @@ -568,26 +568,4 @@ return *ToNode; } - -// Replace call graph edges with dynamic call edges from the profile. -void SampleContextTracker::addCallGraphEdges(CallGraph &CG, - StringMap &SymbolMap) { - // Add profile call edges to the call graph. - std::queue NodeQueue; - NodeQueue.push(&RootContext); - while (!NodeQueue.empty()) { - ContextTrieNode *Node = NodeQueue.front(); - NodeQueue.pop(); - Function *F = SymbolMap.lookup(Node->getFuncName()); - for (auto &I : Node->getAllChildContext()) { - ContextTrieNode *ChildNode = &I.second; - NodeQueue.push(ChildNode); - if (F && !F->isDeclaration()) { - Function *Callee = SymbolMap.lookup(ChildNode->getFuncName()); - if (Callee && !Callee->isDeclaration()) - CG[F]->addCalledFunction(nullptr, CG[Callee]); - } - } - } -} } // namespace llvm diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -77,6 +77,7 @@ #include "llvm/Support/GenericDomTree.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" #include "llvm/Transforms/IPO/SampleContextTracker.h" #include "llvm/Transforms/IPO/SampleProfileProbe.h" #include "llvm/Transforms/Instrumentation.h" @@ -160,15 +161,11 @@ "order of call graph during sample profile loading. It only " "works for new pass manager. ")); -static cl::opt UseProfileIndirectCallEdges( - "use-profile-indirect-call-edges", cl::init(true), cl::Hidden, - cl::desc("Considering indirect call samples from profile when top-down " - "processing functions. Only CSSPGO is supported.")); - -static cl::opt UseProfileTopDownOrder( - "use-profile-top-down-order", cl::init(false), cl::Hidden, - cl::desc("Process functions in one SCC in a top-down order " - "based on the input profile.")); +static cl::opt + UseProfiledCallGraph("use-profiled-call-graph", cl::init(false), cl::Hidden, + cl::desc("Process functions in a top-down order " + "defined by the profiled call graph when " + "-sample-profile-top-down-load is on.")); static cl::opt ProfileSizeInline( "sample-profile-inline-size", cl::Hidden, cl::init(false), @@ -395,8 +392,7 @@ const SmallVectorImpl &Candidates, const Function &F, bool Hot); std::vector buildFunctionOrder(Module &M, CallGraph *CG); - void addCallGraphEdges(CallGraph &CG, const FunctionSamples &Samples); - void replaceCallGraphEdges(CallGraph &CG, StringMap &SymbolMap); + std::unique_ptr buildProfiledCallGraph(CallGraph &CG); void generateMDProfMetadata(Function &F); /// Map from function name to Function *. Used to find the function from @@ -1579,43 +1575,25 @@ INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile", "Sample Profile loader", false, false) -// Add inlined profile call edges to the call graph. -void SampleProfileLoader::addCallGraphEdges(CallGraph &CG, - const FunctionSamples &Samples) { - Function *Caller = SymbolMap.lookup(Samples.getFuncName()); - if (!Caller || Caller->isDeclaration()) - return; - - // Skip non-inlined call edges which are not important since top down inlining - // for non-CS profile is to get more precise profile matching, not to enable - // more inlining. - - for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) { - for (const auto &InlinedSamples : CallsiteSamples.second) { - Function *Callee = SymbolMap.lookup(InlinedSamples.first); - if (Callee && !Callee->isDeclaration()) - CG[Caller]->addCalledFunction(nullptr, CG[Callee]); - addCallGraphEdges(CG, InlinedSamples.second); - } +std::unique_ptr +SampleProfileLoader::buildProfiledCallGraph(CallGraph &CG) { + std::unique_ptr ProfiledCG; + if (ProfileIsCS) + ProfiledCG = std::make_unique(*ContextTracker); + else + ProfiledCG = std::make_unique(*Reader); + + // Add all functions into the profiled call graph even if they are not in + // the profile. This makes sure functions missing from the profile still + // gets a chance to be processed. + for (auto &Node : CG) { + const auto *F = Node.first; + if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile")) + continue; + ProfiledCG->addProfiledFunction(FunctionSamples::getCanonicalFnName(*F)); } -} -// Replace call graph edges with dynamic call edges from the profile. -void SampleProfileLoader::replaceCallGraphEdges( - CallGraph &CG, StringMap &SymbolMap) { - // Remove static call edges from the call graph except for the ones from the - // root which make the call graph connected. - for (const auto &Node : CG) - if (Node.second.get() != CG.getExternalCallingNode()) - Node.second->removeAllCalledFunctions(); - - // Add profile call edges to the call graph. - if (ProfileIsCS) { - ContextTracker->addCallGraphEdges(CG, SymbolMap); - } else { - for (const auto &Samples : Reader->getProfiles()) - addCallGraphEdges(CG, Samples.second); - } + return ProfiledCG; } std::vector @@ -1641,89 +1619,78 @@ assert(&CG->getModule() == &M); - // Add indirect call edges from profile to augment the static call graph. - // Functions will be processed in a top-down order defined by the static call - // graph. Adjusting the order by considering indirect call edges from the - // profile (which don't exist in the static call graph) can enable the - // inlining of indirect call targets by processing the caller before them. - // TODO: enable this for non-CS profile and fix the counts returning logic to - // have a full support for indirect calls. - if (UseProfileIndirectCallEdges && ProfileIsCS) { - for (auto &Entry : *CG) { - const auto *F = Entry.first; - if (!F || F->isDeclaration() || !F->hasFnAttribute("use-sample-profile")) - continue; - auto &AllContexts = ContextTracker->getAllContextSamplesFor(F->getName()); - if (AllContexts.empty()) - continue; - - for (const auto &BB : *F) { - for (const auto &I : BB.getInstList()) { - const auto *CB = dyn_cast(&I); - if (!CB || !CB->isIndirectCall()) - continue; - const DebugLoc &DLoc = I.getDebugLoc(); - if (!DLoc) - continue; - auto CallSite = FunctionSamples::getCallSiteIdentifier(DLoc); - for (FunctionSamples *Samples : AllContexts) { - if (auto CallTargets = Samples->findCallTargetMapAt(CallSite)) { - for (const auto &Target : CallTargets.get()) { - Function *Callee = SymbolMap.lookup(Target.first()); - if (Callee && !Callee->isDeclaration()) - Entry.second->addCalledFunction(nullptr, (*CG)[Callee]); - } - } - } - } + if (UseProfiledCallGraph || + (ProfileIsCS && !UseProfiledCallGraph.getNumOccurrences())) { + // Use profiled call edges to augment the top-down order. There are cases + // that the top-down order computed based on the static call graph doesn't + // reflect real execution order. For example + // + // 1. Incomplete static call graph due to unknown indirect call targets. + // Adjusting the order by considering indirect call edges from the + // profile can enable the inlining of indirect call targets by allowing + // the caller processed before them. + // 2. Mutual call edges in an SCC. The static processing order computed for + // an SCC may not reflect the call contexts in the context-sensitive + // profile, thus may cause potential inlining to be overlooked. The + // function order in one SCC is being adjusted to a top-down order based + // on the profile to favor more inlining. This is only a problem with CS + // profile. + // 3. Transitive indirect call edges due to inlining. When a callee function + // (say B) is inlined into into a caller function (say A) in LTO prelink, + // every call edge originated from the callee B will be transferred to + // the caller A. If any transferred edge (say A->C) is indirect, the + // original profiled indirect edge B->C, even if considered, would not + // enforce a top-down order from the caller A to the potential indirect + // call target C in LTO postlink since the inlined callee B is gone from + // the static call graph. + // 4. #3 can happen even for direct call targets, due to functions defined + // in header files. A header function (say A), when included into source + // files, is defined multiple times but only one definition survives due + // to ODR. Therefore, the LTO prelink inlining done on those dropped + // definitions can be useless based on a local file scope. More + // importantly, the inlinee (say B), once fully inlined to a + // to-be-dropped A, will have no profile to consume when its outlined + // version is compiled. This can lead to a profile-less prelink + // compilation for the outlined version of B which may be called from + // external modules. while this isn't easy to fix, we rely on the + // postlink AutoFDO pipeline to optimize B. Since the survived copy of + // the A can be inlined in its local scope in prelink, it may not exist + // in the merged IR in postlink, and we'll need the profiled call edges + // to enforce a top-down order for the rest of the functions. + // + // Considering those cases, a profiled call graph completely independent of + // the static call graph is constructed based on profile data, where + // function objects are not even needed to handle case #3 and case 4. + // + // Note that static callgraph edges are completely ignored since they + // can be conflicting with profiled edges for cyclic SCCs and may result in + // an SCC order incompatible with profile-defined one. Using strictly + // profile order ensures a maximum inlining experience. On the other hand, + // static call edges are not so important when they don't correspond to a + // context in the profile. + + std::unique_ptr ProfiledCG = buildProfiledCallGraph(*CG); + scc_iterator CGI = scc_begin(ProfiledCG.get()); + while (!CGI.isAtEnd()) { + for (ProfiledCallGraphNode *Node : *CGI) { + Function *F = SymbolMap.lookup(Node->Name); + if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) + FunctionOrderList.push_back(F); } + ++CGI; } - } - - // Compute a top-down order the profile which is used to sort functions in - // one SCC later. The static processing order computed for an SCC may not - // reflect the call contexts in the context-sensitive profile, thus may cause - // potential inlining to be overlooked. The function order in one SCC is being - // adjusted to a top-down order based on the profile to favor more inlining. - DenseMap ProfileOrderMap; - if (UseProfileTopDownOrder || - (ProfileIsCS && !UseProfileTopDownOrder.getNumOccurrences())) { - // Create a static call graph. The call edges are not important since they - // will be replaced by dynamic edges from the profile. - CallGraph ProfileCG(M); - replaceCallGraphEdges(ProfileCG, SymbolMap); - scc_iterator CGI = scc_begin(&ProfileCG); - uint64_t I = 0; + } else { + scc_iterator CGI = scc_begin(CG); while (!CGI.isAtEnd()) { for (CallGraphNode *Node : *CGI) { - if (auto *F = Node->getFunction()) - ProfileOrderMap[F] = ++I; + auto *F = Node->getFunction(); + if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) + FunctionOrderList.push_back(F); } ++CGI; } } - scc_iterator CGI = scc_begin(CG); - while (!CGI.isAtEnd()) { - uint64_t Start = FunctionOrderList.size(); - for (CallGraphNode *Node : *CGI) { - auto *F = Node->getFunction(); - if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) - FunctionOrderList.push_back(F); - } - - // Sort nodes in SCC based on the profile top-down order. - if (!ProfileOrderMap.empty()) { - std::stable_sort(FunctionOrderList.begin() + Start, - FunctionOrderList.end(), - [&ProfileOrderMap](Function *Left, Function *Right) { - return ProfileOrderMap[Left] < ProfileOrderMap[Right]; - }); - } - - ++CGI; - } - LLVM_DEBUG({ dbgs() << "Function processing order:\n"; for (auto F : reverse(FunctionOrderList)) { diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-order.ll +++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll @@ -1,21 +1,21 @@ ;; Test for different function processing orders affecting inlining in sample profile loader. ;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program. -;; With -use-profile-top-down-order=0, the top-down processing order of +;; With -use-profiled-call-graph=0, the top-down processing order of ;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determinined based on -;; the static call graph. With -use-profile-top-down-order=1, call edges +;; the static call graph. With -use-profiled-call-graph=1, call edges ;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi) ;; which leads to _Z8funcLeafi inlined into _Z5funcAi. -; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=INLINE -; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=NOINLINE +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=INLINE +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=0 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=NOINLINE ;; There is an indirect call _Z5funcAi -> _Z3fibi in the program. -;; With -use-profile-indirect-call-edges=0, the processing order computed +;; With -use-profiled-call-graph=0, the processing order computed ;; based on the static call graph is (_Z3fibi, _Z5funcAi). With -;; -use-profile-top-down-order=1, the indirect call edge from profile is +;; -use-profiled-call-graph=1, the indirect call edge from profile is ;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to ;; _Z3fibi inlined into _Z5funcAi. -; RUN: opt < %s -passes=sample-profile -use-profile-indirect-call-edges=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE @factor = dso_local global i32 3, align 4, !dbg !0 @fp = dso_local global i32 (i32)* null, align 8 diff --git a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll --- a/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll +++ b/llvm/test/Transforms/SampleProfile/profile-topdown-order.ll @@ -1,14 +1,14 @@ ;; Test for different function processing orders affecting inlining in sample profile loader. ;; There is an SCC _Z5funcAi -> _Z8funcLeafi -> _Z5funcAi in the program. -;; With -use-profile-top-down-order=0, the top-down processing order of +;; With -use-profiled-call-graph=0, the top-down processing order of ;; that SCC is (_Z8funcLeafi, _Z5funcAi), which is determinined based on -;; the static call graph. With -use-profile-top-down-order=1, call edges +;; the static call graph. With -use-profiled-call-graph=1, call edges ;; from profile are considered, thus the order becomes (_Z5funcAi, _Z8funcLeafi). ;; While _Z8funcLeafi is not supposed to be inlined, the outlined entry counts ;; are affected. -; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=0 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=STATIC -; RUN: opt < %s -passes=sample-profile -use-profile-top-down-order=1 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=DYNAMIC +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=0 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=STATIC +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-topdown-order.prof -S | FileCheck %s -check-prefix=DYNAMIC ; STATIC: define dso_local i32 @_Z8funcLeafi{{.*}} !prof ![[#PROF:]]