diff --git a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h --- a/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h +++ b/llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h @@ -24,22 +24,45 @@ namespace llvm { namespace sampleprof { +struct ProfiledCallGraphNode; + +struct ProfiledCallGraphEdge { + ProfiledCallGraphEdge(ProfiledCallGraphNode *Source, + ProfiledCallGraphNode *Target, uint64_t Weight) + : Source(Source), Target(Target), Weight(Weight) {} + ProfiledCallGraphNode *Source; + ProfiledCallGraphNode *Target; + uint64_t Weight; + + // The call destination is the only important data here, + // allow to transparently unwrap into it. + operator ProfiledCallGraphNode *() const { return Target; } +}; + struct ProfiledCallGraphNode { - ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {} - StringRef Name; - struct ProfiledCallGraphNodeComparer { - bool operator()(const ProfiledCallGraphNode *L, - const ProfiledCallGraphNode *R) const { - return L->Name < R->Name; + struct ProfiledCallGraphEdgeComparer { + bool operator()(const ProfiledCallGraphEdge &L, + const ProfiledCallGraphEdge &R) const { + if (L.Weight == R.Weight) + return L.Target->Name < R.Target->Name; + return L.Weight > R.Weight; } }; - std::set Callees; + + using iterator = std::set::iterator; + using const_iterator = std::set::const_iterator; + using edges = std::set; + + ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {} + + StringRef Name; + edges Edges; }; class ProfiledCallGraph { public: - using iterator = std::set::iterator; + using iterator = std::set::iterator; // Constructor for non-CS profile. ProfiledCallGraph(SampleProfileMap &ProfileMap) { @@ -50,55 +73,29 @@ } // Constructor for CS profile. - ProfiledCallGraph(SampleContextTracker &ContextTracker) { - // BFS traverse the context profile trie to add call edges for calls shown - // in context. - std::queue Queue; - for (auto &Child : ContextTracker.getRootContext().getAllChildContext()) { - ContextTrieNode *Callee = &Child.second; - addProfiledFunction(ContextTracker.getFuncNameFor(Callee)); - Queue.push(Callee); - } + ProfiledCallGraph(SampleContextTracker &ContextTracker); - while (!Queue.empty()) { - ContextTrieNode *Caller = Queue.front(); - Queue.pop(); - // Add calls for context. When AddNodeWithSamplesOnly is true, both caller - // and callee need to have context profile. - // Note that callsite target samples are completely ignored since they can - // conflict with the context edges, which are formed by context - // compression during profile generation, for cyclic SCCs. This may - // further result in an SCC order incompatible with the purely - // context-based one, which may in turn block context-based inlining. - for (auto &Child : Caller->getAllChildContext()) { - ContextTrieNode *Callee = &Child.second; - addProfiledFunction(ContextTracker.getFuncNameFor(Callee)); - Queue.push(Callee); - addProfiledCall(ContextTracker.getFuncNameFor(Caller), - ContextTracker.getFuncNameFor(Callee)); - } - } - } - - iterator begin() { return Root.Callees.begin(); } - iterator end() { return Root.Callees.end(); } + iterator begin() { return Root.Edges.begin(); } + iterator end() { return Root.Edges.end(); } ProfiledCallGraphNode *getEntryNode() { return &Root; } void addProfiledFunction(StringRef Name) { if (!ProfiledFunctions.count(Name)) { // Link to synthetic root to make sure every node is reachable // from root. This does not affect SCC order. ProfiledFunctions[Name] = ProfiledCallGraphNode(Name); - Root.Callees.insert(&ProfiledFunctions[Name]); + Root.Edges.emplace(&Root, &ProfiledFunctions[Name], 0); } } - void addProfiledCall(StringRef CallerName, StringRef CalleeName) { + void addProfiledCall(StringRef CallerName, StringRef CalleeName, + uint64_t Weight = 0) { assert(ProfiledFunctions.count(CallerName)); auto CalleeIt = ProfiledFunctions.find(CalleeName); if (CalleeIt == ProfiledFunctions.end()) { return; } - ProfiledFunctions[CallerName].Callees.insert(&CalleeIt->second); + ProfiledFunctions[CallerName].Edges.emplace(&ProfiledFunctions[CallerName], + &CalleeIt->second, Weight); } void addProfiledCalls(const FunctionSamples &Samples) { @@ -125,15 +122,37 @@ StringMap ProfiledFunctions; }; +class ProfiledCallGraphNodeSorter { + struct NodeInfo { + NodeInfo *Group = this; + uint32_t Rank = 0; + bool Visited = true; + }; + + // Find the root group of the node and compress the path from node to the + // root. + NodeInfo *find(NodeInfo *Node); + + // Union caller and callee into the same group and return true. + // Returns false if caller and callee are already in the same group. + bool unionGroups(const ProfiledCallGraphEdge *Edge); + + std::unordered_map NodeInfoMap; + +public: + void sort(const std::vector &InputNodes, + std::vector &OutputNodes); +}; } // end namespace sampleprof template <> struct GraphTraits { + using NodeType = ProfiledCallGraphNode; using NodeRef = ProfiledCallGraphNode *; - using ChildIteratorType = std::set::iterator; + using ChildIteratorType = NodeType::const_iterator; static NodeRef getEntryNode(NodeRef PCGN) { return PCGN; } - static ChildIteratorType child_begin(NodeRef N) { return N->Callees.begin(); } - static ChildIteratorType child_end(NodeRef N) { return N->Callees.end(); } + static ChildIteratorType child_begin(NodeRef N) { return N->Edges.begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->Edges.end(); } }; template <> diff --git a/llvm/lib/Transforms/IPO/CMakeLists.txt b/llvm/lib/Transforms/IPO/CMakeLists.txt --- a/llvm/lib/Transforms/IPO/CMakeLists.txt +++ b/llvm/lib/Transforms/IPO/CMakeLists.txt @@ -33,6 +33,7 @@ OpenMPOpt.cpp PartialInlining.cpp PassManagerBuilder.cpp + ProfiledCallGraph.cpp PruneEH.cpp SampleContextTracker.cpp SampleProfile.cpp diff --git a/llvm/lib/Transforms/IPO/ProfiledCallGraph.cpp b/llvm/lib/Transforms/IPO/ProfiledCallGraph.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Transforms/IPO/ProfiledCallGraph.cpp @@ -0,0 +1,176 @@ +//===- ProfiledCallGraph.cpp - Pseudo probe Instrumentation -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the ProfiledCallGraph and its sorting support. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/IPO/ProfiledCallGraph.h" +#include "llvm/Support/CommandLine.h" + +using namespace llvm; +#define DEBUG_TYPE "profiled-call-graph" + +static cl::opt + SortProfiledSCC("sort-profiled-scc", cl::init(true), cl::Hidden, + cl::desc("Sort profiled recursion by edge weights.")); + +namespace llvm { +namespace sampleprof { + +// Constructor for CS profile. +ProfiledCallGraph::ProfiledCallGraph(SampleContextTracker &ContextTracker) { + // BFS traverse the context profile trie to add call edges for calls shown + // in context. + std::queue Queue; + for (auto &Child : ContextTracker.getRootContext().getAllChildContext()) { + ContextTrieNode *Callee = &Child.second; + addProfiledFunction(ContextTracker.getFuncNameFor(Callee)); + Queue.push(Callee); + } + + while (!Queue.empty()) { + ContextTrieNode *Caller = Queue.front(); + Queue.pop(); + FunctionSamples *CallerSamples = Caller->getFunctionSamples(); + + // Add calls for context. + // Note that callsite target samples are completely ignored since they can + // conflict with the context edges, which are formed by context + // compression during profile generation, for cyclic SCCs. This may + // further result in an SCC order incompatible with the purely + // context-based one, which may in turn block context-based inlining. + for (auto &Child : Caller->getAllChildContext()) { + ContextTrieNode *Callee = &Child.second; + addProfiledFunction(ContextTracker.getFuncNameFor(Callee)); + Queue.push(Callee); + + // Fetch edge weight from the profile. + uint64_t Weight; + FunctionSamples *CalleeSamples = Callee->getFunctionSamples(); + if (!CalleeSamples || !CallerSamples) { + Weight = 0; + } else { + uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples(); + uint64_t CallsiteCount = 0; + LineLocation Callsite = Callee->getCallSiteLoc(); + if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) { + SampleRecord::CallTargetMap &TargetCounts = CallTargets.get(); + auto It = TargetCounts.find(CalleeSamples->getName()); + if (It != TargetCounts.end()) + CallsiteCount = It->second; + } + Weight = std::max(CallsiteCount, CalleeEntryCount); + } + + addProfiledCall(ContextTracker.getFuncNameFor(Caller), + ContextTracker.getFuncNameFor(Callee), Weight); + } + } +} + +ProfiledCallGraphNodeSorter::NodeInfo * +ProfiledCallGraphNodeSorter::find(ProfiledCallGraphNodeSorter::NodeInfo *Node) { + if (Node->Group != Node) + Node->Group = find(Node->Group); + return Node->Group; +} + +// Union BB1 and BB2 into the same group and return true. +// Returns false if BB1 and BB2 are already in the same group. +bool ProfiledCallGraphNodeSorter::unionGroups( + const ProfiledCallGraphEdge *Edge) { + NodeInfo *G1 = find(&NodeInfoMap[Edge->Source]); + NodeInfo *G2 = find(&NodeInfoMap[Edge->Target]); + + // If the edge forms a cycle, do not add it to MST + if (G1 == G2) + return false; + + // Make the smaller rank tree a direct child or the root of high rank tree. + if (G1->Rank < G1->Rank) + G1->Group = G2; + else { + G2->Group = G1; + // If the ranks are the same, increment root of one tree by one. + if (G1->Rank == G2->Rank) + G2->Rank++; + } + return true; +} + +void ProfiledCallGraphNodeSorter::sort( + const std::vector &InputNodes, + std::vector &OutputNodes) { + if (!SortProfiledSCC) { + OutputNodes = InputNodes; + return; + } + + // Initialize auxilary node information. + NodeInfoMap.clear(); + for (auto *Node : InputNodes) + (void)NodeInfoMap[Node].Group; + + // Sort edges by weights. + struct ProfiledCallGraphEdgeComparer { + bool operator()(const ProfiledCallGraphEdge *L, + const ProfiledCallGraphEdge *R) const { + return L->Weight > R->Weight; + } + }; + + std::set + SortedEdges; + for (auto *Node : InputNodes) { + for (auto &Edge : Node->Edges) { + if (NodeInfoMap.count(Edge.Target)) + SortedEdges.insert(&Edge); + } + } + + // Traverse all the edges and compute the Minimum Weight Spanning Tree + // using union-find algorithm. + std::unordered_set MSTEdges; + for (auto *Edge : SortedEdges) { + if (unionGroups(Edge)) + MSTEdges.insert(Edge); + } + + LLVM_DEBUG(for (const auto *Edge + : MSTEdges) { + outs() << " MST Edge " << Edge->Source->Name << "\n"; + outs() << " " << Edge->Target->Name << "\n"; + outs() << " " << Edge->Weight << "\n"; + }); + + // Do BFS on MST, starting from nodes that have no incoming edge. + for (const auto *Edge : MSTEdges) + NodeInfoMap[Edge->Target].Visited = false; + + std::queue Queue; + for (auto &Node : NodeInfoMap) + if (Node.second.Visited) + Queue.push(const_cast(Node.first)); + + while (!Queue.empty()) { + auto *Node = Queue.front(); + Queue.pop(); + OutputNodes.push_back(Node); + for (auto &Edge : Node->Edges) { + if (MSTEdges.count(&Edge) && !NodeInfoMap[Edge.Target].Visited) { + NodeInfoMap[Edge.Target].Visited = true; + Queue.push(Edge.Target); + } + } + } + + assert(InputNodes.size() == OutputNodes.size() && "missing nodes in MST"); +} +} // namespace sampleprof +} // namespace llvm \ No newline at end of file diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp --- a/llvm/lib/Transforms/IPO/SampleProfile.cpp +++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp @@ -1851,9 +1851,13 @@ // context in the profile. std::unique_ptr ProfiledCG = buildProfiledCallGraph(*CG); + ProfiledCallGraphNodeSorter NodeSorter; scc_iterator CGI = scc_begin(ProfiledCG.get()); while (!CGI.isAtEnd()) { - for (ProfiledCallGraphNode *Node : *CGI) { + // Sort nodes in one SCC based on callsite hotness. + std::vector SortedNodes; + NodeSorter.sort(*CGI, SortedNodes); + for (auto *Node : reverse(SortedNodes)) { Function *F = SymbolMap.lookup(Node->Name); if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile")) FunctionOrderList.push_back(F); diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof @@ -0,0 +1,43 @@ +[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11 + 0: 6 + 1: 6 + 3: 287884 + 15: 23 +[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20 + 0: 15 + 1: 15 + 3: 74946 + 10: 23324 + 15: 11 +[main]:154:0 + 2: 12 + 3: 18 _Z5funcAi:11 + 3.1: 18 _Z5funcBi:19 +[external:12 @ main]:154:12 + 2: 12 + 3: 10 _Z5funcAi:7 + 3.1: 10 _Z5funcBi:11 +[main:3.1 @ _Z5funcBi]:120:19 + 0: 19 + 1: 19 _Z8funcLeafi:20 + 3: 12 +[externalA:17 @ _Z5funcBi]:120:3 + 0: 3 + 1: 3 +[external:10 @ _Z5funcBi]:120:10 + 0: 10 + 1: 10 +[main:3 @ _Z5funcAi]:99:11 + 0: 10 + 1: 10 _Z8funcLeafi:11 + 2: 287864 _Z3fibi:315608 + 3: 24 +[main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608 + 0: 362839 + 1: 6 + 3: 287884 +[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi:1 @ _Z5funcAi]:1467299:6 + 0: 6 + 1: 6 + 3: 287884 + 15: 23 \ No newline at end of file diff --git a/llvm/test/Transforms/SampleProfile/profile-context-order.ll b/llvm/test/Transforms/SampleProfile/profile-context-order.ll --- a/llvm/test/Transforms/SampleProfile/profile-context-order.ll +++ b/llvm/test/Transforms/SampleProfile/profile-context-order.ll @@ -17,6 +17,14 @@ ;; _Z3fibi inlined into _Z5funcAi. ; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S | FileCheck %s -check-prefix=ICALL-INLINE +;; When a cycle is formed by profiled edges between _Z5funcAi and _Z8funcLeafi, +;; the function processing order matters. Without considering call edge weights +;; _Z8funcLeafi can be processed before _Z5funcAi, thus leads to suboptimal +;; inlining. +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sort-profiled-scc=0 -sample-profile-file=%S/Inputs/profile-context-order-scc.prof -S | FileCheck %s -check-prefix=NOINLINE +; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sort-profiled-scc=1 -sample-profile-file=%S/Inputs/profile-context-order-scc.prof -S | FileCheck %s -check-prefix=INLINE + + @factor = dso_local global i32 3, align 4, !dbg !0 @fp = dso_local global i32 (i32)* null, align 8 diff --git a/llvm/tools/llvm-profgen/CSPreInliner.cpp b/llvm/tools/llvm-profgen/CSPreInliner.cpp --- a/llvm/tools/llvm-profgen/CSPreInliner.cpp +++ b/llvm/tools/llvm-profgen/CSPreInliner.cpp @@ -65,12 +65,16 @@ std::vector CSPreInliner::buildTopDownOrder() { std::vector Order; ProfiledCallGraph ProfiledCG(ContextTracker); + ProfiledCallGraphNodeSorter NodeSorter; // Now that we have a profiled call graph, construct top-down order // by building up SCC and reversing SCC order. scc_iterator I = scc_begin(&ProfiledCG); while (!I.isAtEnd()) { - for (ProfiledCallGraphNode *Node : *I) { + // Sort nodes in one SCC based on callsite hotness. + std::vector SortedNodes; + NodeSorter.sort(*I, SortedNodes); + for (auto *Node : reverse(SortedNodes)) { if (Node != ProfiledCG.getEntryNode()) Order.push_back(Node->Name); }