Diff 390735

llvm/include/llvm/ADT/SCCIterator.h

Show All 22 Lines
#define LLVM_ADT_SCCITERATOR_H		#define LLVM_ADT_SCCITERATOR_H

#include "llvm/ADT/DenseMap.h"		#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/GraphTraits.h"		#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/iterator.h"		#include "llvm/ADT/iterator.h"
#include <cassert>		#include <cassert>
#include <cstddef>		#include <cstddef>
#include <iterator>		#include <iterator>
		#include <queue>
		#include <set>
		#include <unordered_map>
		#include <unordered_set>
#include <vector>		#include <vector>

namespace llvm {		namespace llvm {

/// Enumerate the SCCs of a directed graph in reverse topological order		/// Enumerate the SCCs of a directed graph in reverse topological order
/// of the SCC DAG.		/// of the SCC DAG.
///		///
/// This is implemented using Tarjan's DFS algorithm using an internal stack to		/// This is implemented using Tarjan's DFS algorithm using an internal stack to
▲ Show 20 Lines • Show All 190 Lines • ▼ Show 20 Lines	template <class T> scc_iterator<T> scc_begin(const T &G) {
return scc_iterator<T>::begin(G);		return scc_iterator<T>::begin(G);
}		}

/// Construct the end iterator for a deduced graph type T.		/// Construct the end iterator for a deduced graph type T.
template <class T> scc_iterator<T> scc_end(const T &G) {		template <class T> scc_iterator<T> scc_end(const T &G) {
return scc_iterator<T>::end(G);		return scc_iterator<T>::end(G);
}		}

		/// Sort the nodes of a directed SCC in the decreasing order of the edge
		/// weights. The instantiating GraphT type should have weighted edge type
		/// declared in its graph traits in order to use this iterator.
		///
		/// This is implemented using Kruskal's minimal spanning tree algorithm followed
		/// by a BFS walk. First a maximum spanning tree (forest) is built based on all
		/// edges within the SCC collection. Then a BFS walk is initiated on tree nodes
		/// that do not have a predecessor. Finally, the BFS order computed is the
		/// traversal order of the nodes of the SCC. Such order ensures that
		/// high-weighted edges are visited first during the traversal.
		///
		/// The code in this class relies on `GT::NodeType` exposing an iterable
		/// `Edges` member and on `GT::EdgeType` exposing `Source`, `Target` and
		/// `Weight` members.
		template <class GraphT, class GT = GraphTraits<GraphT>>
		class scc_member_iterator {
		  using NodeType = typename GT::NodeType;
		  using EdgeType = typename GT::EdgeType;
		  using NodesType = std::vector<NodeType *>;

		  // Auxiliary node information used during the MST calculation.
		  struct NodeInfo {
		    // Parent link of the disjoint-set forest. A node that points at itself is
		    // the representative (root) of its group. The default initializer refers
		    // to `this`, so a NodeInfo must be constructed in place: a copy would
		    // carry over a pointer to the object it was copied from.
		    NodeInfo *Group = this;
		    // Rank of the tree rooted at this node; only meaningful for group roots.
		    uint32_t Rank = 0;
		    // BFS bookkeeping. Starts out true and is cleared by the constructor for
		    // every node that is the target of an MST edge.
		    bool Visited = true;
		  };

		  // Find the root group of the node and compress the path from node to the
		  // root.
		  NodeInfo *find(NodeInfo *Node) {
		    if (Node->Group != Node)
		      Node->Group = find(Node->Group);
		    return Node->Group;
		  }

		  // Union the source and target node into the same group and return true.
		  // Returns false if they are already in the same group.
		  bool unionGroups(const EdgeType *Edge) {
		    NodeInfo *G1 = find(&NodeInfoMap[Edge->Source]);
		    NodeInfo *G2 = find(&NodeInfoMap[Edge->Target]);

		    // If the edge forms a cycle, do not add it to MST
		    if (G1 == G2)
		      return false;

		    // Make the smaller rank tree a direct child of the root of the higher
		    // rank tree.
		    if (G1->Rank < G2->Rank)
		      G1->Group = G2;
		    else {
		      G2->Group = G1;
		      // If the ranks are the same, the merged tree is one level taller, so
		      // increment the rank of the surviving root.
		      if (G1->Rank == G2->Rank)
		        G1->Rank++;
		    }
		    return true;
		  }

		  // Per-node union-find / BFS state, keyed by node pointer.
		  std::unordered_map<NodeType *, NodeInfo> NodeInfoMap;
		  // The sorted node sequence produced by the constructor.
		  NodesType Nodes;

		public:
		  scc_member_iterator(const NodesType &InputNodes);

		  // Returns the sorted nodes computed at construction time.
		  NodesType &operator*() { return Nodes; }
		};

		/// Compute the traversal order for the nodes of one SCC.
		///
		/// \param InputNodes the nodes of the SCC. Inputs with zero or one node are
		/// copied to the result unchanged.
		///
		/// Edges whose target is outside \p InputNodes are ignored. The resulting
		/// sequence is the reverse of the BFS order over the maximum weight spanning
		/// forest and is available through operator*.
		template <class GraphT, class GT>
		scc_member_iterator<GraphT, GT>::scc_member_iterator(
		    const NodesType &InputNodes) {
		  if (InputNodes.size() <= 1) {
		    Nodes = InputNodes;
		    return;
		  }

		  // Initialize auxiliary node information.
		  NodeInfoMap.clear();
		  for (auto *Node : InputNodes) {
		    // This is specifically used to construct a `NodeInfo` object in place. An
		    // insert operation will involve a copy construction which invalidate the
		    // initial value of the `Group` field which should be `this`.
		    (void)NodeInfoMap[Node].Group;
		  }

		  // Sort edges by weights, heaviest first. The container stores edge
		  // pointers, so the comparer must take pointers as well.
		  struct EdgeComparer {
		    bool operator()(const EdgeType *L, const EdgeType *R) const {
		      return L->Weight > R->Weight;
		    }
		  };

		  std::multiset<const EdgeType *, EdgeComparer> SortedEdges;
		  for (auto *Node : InputNodes) {
		    for (auto &Edge : Node->Edges) {
		      // Only edges that stay inside the SCC take part in the MST.
		      if (NodeInfoMap.count(Edge.Target))
		        SortedEdges.insert(&Edge);
		    }
		  }

		  // Traverse all the edges and compute the Maximum Weight Spanning Tree
		  // using Kruskal's algorithm.
		  std::unordered_set<const EdgeType *> MSTEdges;
		  for (auto *Edge : SortedEdges) {
		    if (unionGroups(Edge))
		      MSTEdges.insert(Edge);
		  }

		  // Do BFS on MST, starting from nodes that have no incoming edge. These nodes
		  // are "roots" of the MST forest. This ensures that nodes are visited before
		  // their descendants are, thus ensures hot edges are processed before cold
		  // edges, based on how MST is computed.
		  for (const auto *Edge : MSTEdges)
		    NodeInfoMap[Edge->Target].Visited = false;

		  // Seed the queue by walking InputNodes rather than NodeInfoMap: the map is
		  // an unordered container keyed by pointer, so iterating it would make the
		  // relative order of the roots depend on node addresses.
		  std::queue<NodeType *> Queue;
		  for (auto *Node : InputNodes)
		    if (NodeInfoMap[Node].Visited)
		      Queue.push(Node);

		  while (!Queue.empty()) {
		    auto *Node = Queue.front();
		    Queue.pop();
		    Nodes.push_back(Node);
		    for (auto &Edge : Node->Edges) {
		      if (!MSTEdges.count(&Edge))
		        continue;
		      NodeInfo &TargetInfo = NodeInfoMap[Edge.Target];
		      if (!TargetInfo.Visited) {
		        TargetInfo.Visited = true;
		        Queue.push(Edge.Target);
		      }
		    }
		  }

		  assert(InputNodes.size() == Nodes.size() && "missing nodes in MST");
		  std::reverse(Nodes.begin(), Nodes.end());
		}
} // end namespace llvm		} // end namespace llvm

#endif // LLVM_ADT_SCCITERATOR_H		#endif // LLVM_ADT_SCCITERATOR_H

llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h

	Show All 18 Lines
	#include <set>			#include <set>

	using namespace llvm;			using namespace llvm;
	using namespace sampleprof;			using namespace sampleprof;

	namespace llvm {			namespace llvm {
	namespace sampleprof {			namespace sampleprof {

				struct ProfiledCallGraphNode;

				/// A directed edge of the profiled call graph: a pair of node pointers
				/// plus a 64-bit weight.
				struct ProfiledCallGraphEdge {
				  ProfiledCallGraphNode *Source;
				  ProfiledCallGraphNode *Target;
				  uint64_t Weight;

				  /// Build an edge from \p Source to \p Target carrying \p Weight.
				  ProfiledCallGraphEdge(ProfiledCallGraphNode *Source,
				                        ProfiledCallGraphNode *Target, uint64_t Weight)
				      : Source{Source}, Target{Target}, Weight{Weight} {}

				  // An edge converts implicitly to its destination node, so code that only
				  // cares about the call target can use the edge directly.
				  operator ProfiledCallGraphNode *() const { return Target; }
				};

	struct ProfiledCallGraphNode {			struct ProfiledCallGraphNode {
	ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {}
	StringRef Name;

	struct ProfiledCallGraphNodeComparer {			// Sort edges by callee names only since all edges to be compared are from
	bool operator()(const ProfiledCallGraphNode *L,			// same caller. Edge weights are not considered either because for the same
	const ProfiledCallGraphNode *R) const {			// callee only the edge with the largest weight is added to the edge set.
	return L->Name < R->Name;			struct ProfiledCallGraphEdgeComparer {
				bool operator()(const ProfiledCallGraphEdge &L,
				wenleiUnsubmitted Not Done Reply Inline Actions Can target still be the same, in which case do we want to check source name? wenlei: Can target still be the same, in which case do we want to check source name?
				hoyAuthorUnsubmitted Done Reply Inline Actions Edges to be compared are from same caller, so the source should be the same. Call targets can also be the same since a caller may have different callsites to the same callee. The current model allows such edges with same callee but different weights to coexist. I think we should optimize it to just keep the edge with largest weight. hoy: Edges to be compared are from same caller, so the source should be the same. Call targets can…
				wenleiUnsubmitted Not Done Reply Inline Actions Ok, then add a comment please so others can understand why only weight is used and tier breaker is not needed. wenlei: Ok, then add a comment please so others can understand why only weight is used and tier breaker…
				hoyAuthorUnsubmitted Done Reply Inline Actions comment added. hoy: comment added.
				const ProfiledCallGraphEdge &R) const {
				return L.Target->Name < R.Target->Name;
	}			}
	};			};
	std::set<ProfiledCallGraphNode *, ProfiledCallGraphNodeComparer> Callees;
				using iterator = std::set<ProfiledCallGraphEdge>::iterator;
				using const_iterator = std::set<ProfiledCallGraphEdge>::const_iterator;
				using edge = ProfiledCallGraphEdge;
				using edges = std::set<ProfiledCallGraphEdge, ProfiledCallGraphEdgeComparer>;

				ProfiledCallGraphNode(StringRef FName = StringRef()) : Name(FName) {}

				StringRef Name;
				edges Edges;
	};			};

	class ProfiledCallGraph {			class ProfiledCallGraph {
	public:			public:
	using iterator = std::set<ProfiledCallGraphNode *>::iterator;			using iterator = std::set<ProfiledCallGraphEdge>::iterator;

	// Constructor for non-CS profile.			// Constructor for non-CS profile.
	ProfiledCallGraph(SampleProfileMap &ProfileMap) {			ProfiledCallGraph(SampleProfileMap &ProfileMap) {
	assert(!FunctionSamples::ProfileIsCS && "CS profile is not handled here");			assert(!FunctionSamples::ProfileIsCS && "CS profile is not handled here");
	for (const auto &Samples : ProfileMap) {			for (const auto &Samples : ProfileMap) {
	addProfiledCalls(Samples.second);			addProfiledCalls(Samples.second);
	}			}
	}			}

	// Constructor for CS profile.			// Constructor for CS profile.
	ProfiledCallGraph(SampleContextTracker &ContextTracker) {			ProfiledCallGraph(SampleContextTracker &ContextTracker) {
	// BFS traverse the context profile trie to add call edges for calls shown			// BFS traverse the context profile trie to add call edges for calls shown
	// in context.			// in context.
	std::queue<ContextTrieNode *> Queue;			std::queue<ContextTrieNode *> Queue;
	for (auto &Child : ContextTracker.getRootContext().getAllChildContext()) {			for (auto &Child : ContextTracker.getRootContext().getAllChildContext()) {
	ContextTrieNode *Callee = &Child.second;			ContextTrieNode *Callee = &Child.second;
	addProfiledFunction(ContextTracker.getFuncNameFor(Callee));			addProfiledFunction(ContextTracker.getFuncNameFor(Callee));
	Queue.push(Callee);			Queue.push(Callee);
	}			}

	while (!Queue.empty()) {			while (!Queue.empty()) {
	ContextTrieNode *Caller = Queue.front();			ContextTrieNode *Caller = Queue.front();
	Queue.pop();			Queue.pop();
	// Add calls for context. When AddNodeWithSamplesOnly is true, both caller			FunctionSamples *CallerSamples = Caller->getFunctionSamples();
	// and callee need to have context profile.
				// Add calls for context.
	// Note that callsite target samples are completely ignored since they can			// Note that callsite target samples are completely ignored since they can
	// conflict with the context edges, which are formed by context			// conflict with the context edges, which are formed by context
	// compression during profile generation, for cyclic SCCs. This may			// compression during profile generation, for cyclic SCCs. This may
	// further result in an SCC order incompatible with the purely			// further result in an SCC order incompatible with the purely
	// context-based one, which may in turn block context-based inlining.			// context-based one, which may in turn block context-based inlining.
	for (auto &Child : Caller->getAllChildContext()) {			for (auto &Child : Caller->getAllChildContext()) {
	ContextTrieNode *Callee = &Child.second;			ContextTrieNode *Callee = &Child.second;
	addProfiledFunction(ContextTracker.getFuncNameFor(Callee));			addProfiledFunction(ContextTracker.getFuncNameFor(Callee));
	Queue.push(Callee);			Queue.push(Callee);

				// Fetch edge weight from the profile.
				uint64_t Weight;
				FunctionSamples *CalleeSamples = Callee->getFunctionSamples();
				if (!CalleeSamples \|\| !CallerSamples) {
				Weight = 0;
				} else {
				uint64_t CalleeEntryCount = CalleeSamples->getEntrySamples();
				uint64_t CallsiteCount = 0;
				LineLocation Callsite = Callee->getCallSiteLoc();
				if (auto CallTargets = CallerSamples->findCallTargetMapAt(Callsite)) {
				SampleRecord::CallTargetMap &TargetCounts = CallTargets.get();
				auto It = TargetCounts.find(CalleeSamples->getName());
				if (It != TargetCounts.end())
				CallsiteCount = It->second;
				}
				Weight = std::max(CallsiteCount, CalleeEntryCount);
				}

	addProfiledCall(ContextTracker.getFuncNameFor(Caller),			addProfiledCall(ContextTracker.getFuncNameFor(Caller),
	ContextTracker.getFuncNameFor(Callee));			ContextTracker.getFuncNameFor(Callee), Weight);
	}			}
	}			}
	}			}

	iterator begin() { return Root.Callees.begin(); }			iterator begin() { return Root.Edges.begin(); }
	iterator end() { return Root.Callees.end(); }			iterator end() { return Root.Edges.end(); }
	ProfiledCallGraphNode *getEntryNode() { return &Root; }			ProfiledCallGraphNode *getEntryNode() { return &Root; }
	void addProfiledFunction(StringRef Name) {			void addProfiledFunction(StringRef Name) {
	if (!ProfiledFunctions.count(Name)) {			if (!ProfiledFunctions.count(Name)) {
	// Link to synthetic root to make sure every node is reachable			// Link to synthetic root to make sure every node is reachable
	// from root. This does not affect SCC order.			// from root. This does not affect SCC order.
	ProfiledFunctions[Name] = ProfiledCallGraphNode(Name);			ProfiledFunctions[Name] = ProfiledCallGraphNode(Name);
	Root.Callees.insert(&ProfiledFunctions[Name]);			Root.Edges.emplace(&Root, &ProfiledFunctions[Name], 0);
	}			}
	}			}

	void addProfiledCall(StringRef CallerName, StringRef CalleeName) {			private:
				void addProfiledCall(StringRef CallerName, StringRef CalleeName,
				uint64_t Weight = 0) {
	assert(ProfiledFunctions.count(CallerName));			assert(ProfiledFunctions.count(CallerName));
	auto CalleeIt = ProfiledFunctions.find(CalleeName);			auto CalleeIt = ProfiledFunctions.find(CalleeName);
	if (CalleeIt == ProfiledFunctions.end()) {			if (CalleeIt == ProfiledFunctions.end())
	return;			return;
				ProfiledCallGraphEdge Edge(&ProfiledFunctions[CallerName],
				&CalleeIt->second, Weight);
				auto &Edges = ProfiledFunctions[CallerName].Edges;
				auto EdgeIt = Edges.find(Edge);
				if (EdgeIt == Edges.end()) {
				Edges.insert(Edge);
				} else if (EdgeIt->Weight < Edge.Weight) {
				// Replace existing call edges with same target but smaller weight.
				Edges.erase(EdgeIt);
				Edges.insert(Edge);
				wenleiUnsubmitted Not Done Reply Inline Actions Do you actually need to erase/insert, instead of updating the weight? wenlei: Do you actually need to erase/insert, instead of updating the weight?
				hoyAuthorUnsubmitted Done Reply Inline Actions `Weight` is a field of the key so it is immutable. hoy: `Weight` is a field of the key so it is immutable.
	}			}
	ProfiledFunctions[CallerName].Callees.insert(&CalleeIt->second);
	}			}

	void addProfiledCalls(const FunctionSamples &Samples) {			void addProfiledCalls(const FunctionSamples &Samples) {
	addProfiledFunction(Samples.getFuncName());			addProfiledFunction(Samples.getFuncName());

	for (const auto &Sample : Samples.getBodySamples()) {			for (const auto &Sample : Samples.getBodySamples()) {
	for (const auto &Target : Sample.second.getCallTargets()) {			for (const auto &Target : Sample.second.getCallTargets()) {
	addProfiledFunction(Target.first());			addProfiledFunction(Target.first());
	addProfiledCall(Samples.getFuncName(), Target.first());			addProfiledCall(Samples.getFuncName(), Target.first(), Target.second);
	}			}
	}			}

	for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {			for (const auto &CallsiteSamples : Samples.getCallsiteSamples()) {
	for (const auto &InlinedSamples : CallsiteSamples.second) {			for (const auto &InlinedSamples : CallsiteSamples.second) {
	addProfiledFunction(InlinedSamples.first);			addProfiledFunction(InlinedSamples.first);
	addProfiledCall(Samples.getFuncName(), InlinedSamples.first);			addProfiledCall(Samples.getFuncName(), InlinedSamples.first,
				InlinedSamples.second.getEntrySamples());
	addProfiledCalls(InlinedSamples.second);			addProfiledCalls(InlinedSamples.second);
	}			}
	}			}
	}			}

	private:
	ProfiledCallGraphNode Root;			ProfiledCallGraphNode Root;
	StringMap<ProfiledCallGraphNode> ProfiledFunctions;			StringMap<ProfiledCallGraphNode> ProfiledFunctions;
	};			};

	} // end namespace sampleprof			} // end namespace sampleprof
				wenleiUnsubmitted Not Done Reply Inline Actions This is only for sorting SCC nodes in an SCC and it doesn't order all nodes in CG, so perhaps it's better to call it SCCNodeSorter. This also feels like a generic functionality that doesn't tie to ProfiledCallGraph, so we can perhaps make it a generic template similar to scc_iterator, e.g. scc_member_iterator. wenlei: This is only for sorting SCC nodes in an SCC and it doesn't order all nodes in CG, so perhaps…
				hoyAuthorUnsubmitted Done Reply Inline Actions A generic template class sounds better. That will require the `GraphTraits` definition for nodes to come with a weighted edge list. Nodes with that can use this sorter. hoy: A generic template class sounds better. That will require the `GraphTraits` definition for…

	template <> struct GraphTraits<ProfiledCallGraphNode *> {			template <> struct GraphTraits<ProfiledCallGraphNode *> {
				using NodeType = ProfiledCallGraphNode;
	using NodeRef = ProfiledCallGraphNode *;			using NodeRef = ProfiledCallGraphNode *;
	using ChildIteratorType = std::set<ProfiledCallGraphNode *>::iterator;			using EdgeType = NodeType::edge;
				using ChildIteratorType = NodeType::const_iterator;

	static NodeRef getEntryNode(NodeRef PCGN) { return PCGN; }			static NodeRef getEntryNode(NodeRef PCGN) { return PCGN; }
	static ChildIteratorType child_begin(NodeRef N) { return N->Callees.begin(); }			static ChildIteratorType child_begin(NodeRef N) { return N->Edges.begin(); }
	static ChildIteratorType child_end(NodeRef N) { return N->Callees.end(); }			static ChildIteratorType child_end(NodeRef N) { return N->Edges.end(); }
	};			};

	template <>			template <>
	struct GraphTraits<ProfiledCallGraph *>			struct GraphTraits<ProfiledCallGraph *>
	: public GraphTraits<ProfiledCallGraphNode *> {			: public GraphTraits<ProfiledCallGraphNode *> {
	static NodeRef getEntryNode(ProfiledCallGraph *PCG) {			static NodeRef getEntryNode(ProfiledCallGraph *PCG) {
	return PCG->getEntryNode();			return PCG->getEntryNode();
	}			}
	Show All 13 Lines

llvm/lib/Transforms/IPO/SampleProfile.cpp

Show First 20 Lines • Show All 167 Lines • ▼ Show 20 Lines	cl::desc("Do profile annotation and inlining for functions in top-down "
"order of call graph during sample profile loading. It only "		"order of call graph during sample profile loading. It only "
"works for new pass manager. "));		"works for new pass manager. "));

static cl::opt<bool>		static cl::opt<bool>
UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,		UseProfiledCallGraph("use-profiled-call-graph", cl::init(true), cl::Hidden,
cl::desc("Process functions in a top-down order "		cl::desc("Process functions in a top-down order "
"defined by the profiled call graph when "		"defined by the profiled call graph when "
"-sample-profile-top-down-load is on."));		"-sample-profile-top-down-load is on."));
		cl::opt<bool>
		SortProfiledSCC("sort-profiled-scc-member", cl::init(true), cl::Hidden,
		wenleiUnsubmitted Not Done Reply Inline Actions nit: sort-profiled-scc-member? wenlei: nit: sort-profiled-scc-member?
		hoyAuthorUnsubmitted Done Reply Inline Actions fixed. hoy: fixed.
		cl::desc("Sort profiled recursion by edge weights."));

static cl::opt<bool> ProfileSizeInline(		static cl::opt<bool> ProfileSizeInline(
"sample-profile-inline-size", cl::Hidden, cl::init(false),		"sample-profile-inline-size", cl::Hidden, cl::init(false),
cl::desc("Inline cold call sites in profile loader if it's beneficial "		cl::desc("Inline cold call sites in profile loader if it's beneficial "
"for code size."));		"for code size."));

cl::opt<int> ProfileInlineGrowthLimit(		cl::opt<int> ProfileInlineGrowthLimit(
"sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),		"sample-profile-inline-growth-limit", cl::Hidden, cl::init(12),
▲ Show 20 Lines • Show All 1,664 Lines • ▼ Show 20 Lines	if (UseProfiledCallGraph \|\|
// an SCC order incompatible with profile-defined one. Using strictly		// an SCC order incompatible with profile-defined one. Using strictly
// profile order ensures a maximum inlining experience. On the other hand,		// profile order ensures a maximum inlining experience. On the other hand,
// static call edges are not so important when they don't correspond to a		// static call edges are not so important when they don't correspond to a
// context in the profile.		// context in the profile.

std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG);		std::unique_ptr<ProfiledCallGraph> ProfiledCG = buildProfiledCallGraph(*CG);
scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());		scc_iterator<ProfiledCallGraph *> CGI = scc_begin(ProfiledCG.get());
while (!CGI.isAtEnd()) {		while (!CGI.isAtEnd()) {
for (ProfiledCallGraphNode Node : CGI) {		auto Range = *CGI;
		if (SortProfiledSCC) {
		// Sort nodes in one SCC based on callsite hotness.
		scc_member_iterator<ProfiledCallGraph > SI(CGI);
		Range = *SI;
		}
		for (auto *Node : Range) {
Function *F = SymbolMap.lookup(Node->Name);		Function *F = SymbolMap.lookup(Node->Name);
if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))		if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(F);		FunctionOrderList.push_back(F);
}		}
++CGI;		++CGI;
}		}
} else {		} else {
scc_iterator<CallGraph *> CGI = scc_begin(CG);		scc_iterator<CallGraph *> CGI = scc_begin(CG);
▲ Show 20 Lines • Show All 284 Lines • Show Last 20 Lines

llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof

This file was added.

				[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi]:1467299:11
				0: 6
				1: 6
				3: 287884
				15: 23
				[main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20
				0: 15
				1: 15
				3: 74946
				10: 23324
				15: 11
				[main]:154:0
				2: 12
				3: 18 _Z5funcAi:11
				3.1: 18 _Z5funcBi:19
				[external:12 @ main]:154:12
				2: 12
				3: 10 _Z5funcAi:7
				3.1: 10 _Z5funcBi:11
				[main:3.1 @ _Z5funcBi]:120:19
				0: 19
				1: 19 _Z8funcLeafi:20
				3: 12
				[externalA:17 @ _Z5funcBi]:120:3
				0: 3
				1: 3
				[external:10 @ _Z5funcBi]:120:10
				0: 10
				1: 10
				[main:3 @ _Z5funcAi]:99:11
				0: 10
				1: 10 _Z8funcLeafi:11
				2: 287864 _Z3fibi:315608
				3: 24
				[main:3 @ _Z5funcAi:2 @ _Z3fibi]:287864:315608
				0: 362839
				1: 6
				3: 287884
				[main:3 @ _Z5funcAi:1 @ _Z8funcLeafi:1 @ _Z5funcBi]:1467299:6
				0: 6
				1: 6
				3: 287884
				15: 23
				No newline at end of file

llvm/test/Transforms/SampleProfile/profile-context-order.ll

	Show All 11 Lines
	;; There is an indirect call _Z5funcAi -> _Z3fibi in the program.			;; There is an indirect call _Z5funcAi -> _Z3fibi in the program.
	;; With -use-profiled-call-graph=0, the processing order computed			;; With -use-profiled-call-graph=0, the processing order computed
	;; based on the static call graph is (_Z3fibi, _Z5funcAi). With			;; based on the static call graph is (_Z3fibi, _Z5funcAi). With
	;; -use-profiled-call-graph=1, the indirect call edge from profile is			;; -use-profiled-call-graph=1, the indirect call edge from profile is
	;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to			;; considered, thus the order becomes (_Z5funcAi, _Z3fibi) which leads to
	;; _Z3fibi inlined into _Z5funcAi.			;; _Z3fibi inlined into _Z5funcAi.
	; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S \| FileCheck %s -check-prefix=ICALL-INLINE			; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sample-profile-file=%S/Inputs/profile-context-order.prof -S \| FileCheck %s -check-prefix=ICALL-INLINE

				;; When a cycle is formed by profiled edges between _Z5funcBi and _Z8funcLeafi,
				;; the function processing order matters. Without considering call edge weights,
				;; _Z8funcLeafi can be processed before _Z5funcBi, which leads to suboptimal
				;; inlining.
				; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sort-profiled-scc-member=0 -sample-profile-file=%S/Inputs/profile-context-order-scc.prof -S \| FileCheck %s -check-prefix=NOINLINEB
				; RUN: opt < %s -passes=sample-profile -use-profiled-call-graph=1 -sort-profiled-scc-member=1 -sample-profile-file=%S/Inputs/profile-context-order-scc.prof -S \| FileCheck %s -check-prefix=INLINEB


	@factor = dso_local global i32 3, align 4, !dbg !0			@factor = dso_local global i32 3, align 4, !dbg !0
	@fp = dso_local global i32 (i32)* null, align 8			@fp = dso_local global i32 (i32)* null, align 8

	define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {			define dso_local i32 @main() local_unnamed_addr #0 !dbg !18 {
	entry:			entry:
	store i32 (i32)* @_Z3fibi, i32 (i32)** @fp, align 8, !dbg !25			store i32 (i32)* @_Z3fibi, i32 (i32)** @fp, align 8, !dbg !25
	br label %for.body, !dbg !25			br label %for.body, !dbg !25

	Show All 14 Lines
	}			}

	; INLINE: define dso_local i32 @_Z5funcAi			; INLINE: define dso_local i32 @_Z5funcAi
	; INLINE-NOT: call i32 @_Z8funcLeafi			; INLINE-NOT: call i32 @_Z8funcLeafi
	; NOINLINE: define dso_local i32 @_Z5funcAi			; NOINLINE: define dso_local i32 @_Z5funcAi
	; NOINLINE: call i32 @_Z8funcLeafi			; NOINLINE: call i32 @_Z8funcLeafi
	; ICALL-INLINE: define dso_local i32 @_Z5funcAi			; ICALL-INLINE: define dso_local i32 @_Z5funcAi
	; ICALL-INLINE: call i32 @_Z3foo			; ICALL-INLINE: call i32 @_Z3foo
				; INLINEB: define dso_local i32 @_Z5funcBi
				; INLINEB-NOT: call i32 @_Z8funcLeafi
				; NOINLINEB: define dso_local i32 @_Z5funcBi
				; NOINLINEB: call i32 @_Z8funcLeafi
	define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 {			define dso_local i32 @_Z5funcAi(i32 %x) local_unnamed_addr #0 !dbg !40 {
	entry:			entry:
	%add = add nsw i32 %x, 100000, !dbg !44			%add = add nsw i32 %x, 100000, !dbg !44
	%0 = load i32 (i32), i32 (i32)* @fp, align 8			%0 = load i32 (i32), i32 (i32)* @fp, align 8
	%call = call i32 %0(i32 8), !dbg !45			%call = call i32 %0(i32 8), !dbg !45
	%call1 = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !46			%call1 = tail call i32 @_Z8funcLeafi(i32 %add), !dbg !46
	ret i32 %call, !dbg !46			ret i32 %call, !dbg !46
	}			}
	▲ Show 20 Lines • Show All 129 Lines • Show Last 20 Lines

llvm/tools/llvm-profgen/CSPreInliner.cpp

	Show All 32 Lines
	// The switches specify inline thresholds used in SampleProfileLoader inlining.			// The switches specify inline thresholds used in SampleProfileLoader inlining.
	// TODO: the actual threshold to be tuned here because the size here is based			// TODO: the actual threshold to be tuned here because the size here is based
	// on machine code not LLVM IR.			// on machine code not LLVM IR.
	extern cl::opt<int> SampleHotCallSiteThreshold;			extern cl::opt<int> SampleHotCallSiteThreshold;
	extern cl::opt<int> SampleColdCallSiteThreshold;			extern cl::opt<int> SampleColdCallSiteThreshold;
	extern cl::opt<int> ProfileInlineGrowthLimit;			extern cl::opt<int> ProfileInlineGrowthLimit;
	extern cl::opt<int> ProfileInlineLimitMin;			extern cl::opt<int> ProfileInlineLimitMin;
	extern cl::opt<int> ProfileInlineLimitMax;			extern cl::opt<int> ProfileInlineLimitMax;
				extern cl::opt<bool> SortProfiledSCC;

	cl::opt<bool> EnableCSPreInliner(			cl::opt<bool> EnableCSPreInliner(
	"csspgo-preinliner", cl::Hidden, cl::init(false),			"csspgo-preinliner", cl::Hidden, cl::init(false),
	cl::desc("Run a global pre-inliner to merge context profile based on "			cl::desc("Run a global pre-inliner to merge context profile based on "
	"estimated global top-down inline decisions"));			"estimated global top-down inline decisions"));

	cl::opt<bool> UseContextCostForPreInliner(			cl::opt<bool> UseContextCostForPreInliner(
	"use-context-cost-for-preinliner", cl::Hidden, cl::init(true),			"use-context-cost-for-preinliner", cl::Hidden, cl::init(true),
	Show All 16 Lines
	std::vector<StringRef> CSPreInliner::buildTopDownOrder() {			std::vector<StringRef> CSPreInliner::buildTopDownOrder() {
	std::vector<StringRef> Order;			std::vector<StringRef> Order;
	ProfiledCallGraph ProfiledCG(ContextTracker);			ProfiledCallGraph ProfiledCG(ContextTracker);

	// Now that we have a profiled call graph, construct top-down order			// Now that we have a profiled call graph, construct top-down order
	// by building up SCC and reversing SCC order.			// by building up SCC and reversing SCC order.
	scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);			scc_iterator<ProfiledCallGraph *> I = scc_begin(&ProfiledCG);
	while (!I.isAtEnd()) {			while (!I.isAtEnd()) {
	for (ProfiledCallGraphNode Node : I) {			auto Range = *I;
				if (SortProfiledSCC) {
				// Sort nodes in one SCC based on callsite hotness.
				scc_member_iterator<ProfiledCallGraph > SI(I);
				Range = *SI;
				}
				for (auto *Node : Range) {
	if (Node != ProfiledCG.getEntryNode())			if (Node != ProfiledCG.getEntryNode())
	Order.push_back(Node->Name);			Order.push_back(Node->Name);
	}			}
	++I;			++I;
	}			}
	std::reverse(Order.begin(), Order.end());			std::reverse(Order.begin(), Order.end());

	return Order;			return Order;
	▲ Show 20 Lines • Show All 190 Lines • Show Last 20 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[CSSPGO] Sorting nodes in a cycle of profiled call graph.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 390735

llvm/include/llvm/ADT/SCCIterator.h

llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h

llvm/lib/Transforms/IPO/SampleProfile.cpp

llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof

llvm/test/Transforms/SampleProfile/profile-context-order.ll

llvm/tools/llvm-profgen/CSPreInliner.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[CSSPGO] Sorting nodes in a cycle of profiled call graph.ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 390735

llvm/include/llvm/ADT/SCCIterator.h

llvm/include/llvm/Transforms/IPO/ProfiledCallGraph.h

llvm/lib/Transforms/IPO/SampleProfile.cpp

llvm/test/Transforms/SampleProfile/Inputs/profile-context-order-scc.prof

llvm/test/Transforms/SampleProfile/profile-context-order.ll

llvm/tools/llvm-profgen/CSPreInliner.cpp

[CSSPGO] Sorting nodes in a cycle of profiled call graph.
ClosedPublic