diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h --- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h +++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h @@ -53,6 +53,34 @@ const std::vector &NodeCounts, const std::vector &EdgeCounts); +/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for +/// the best performance of large-scale front-end bound binaries. +struct CDSortConfig { + /// The number of entries in the cache. + unsigned CacheEntries = 16; + /// The size of a line in the cache. + unsigned CacheSize = 2048; + /// The power exponent for the distance-locality. + double DistancePower = 0.25; + /// The scale factor for the frequency-locality. + double FrequencyScale = 0.25; +}; + +/// Apply a Cache-Directed Sort for functions represented by a call graph. +/// The placement is done by optimizing the call locality by co-locating +/// frequently executed functions. +/// \p FuncSizes: The sizes of the nodes (in bytes). +/// \p FuncCounts: The execution counts of the nodes in the profile. +/// \p CallCounts: The execution counts of every edge (call) in the profile. The +/// vector also defines the call graph edges and should include 0-count edges. +/// \p CallOffsets: Offsets of the calls from their source nodes, one per call. +/// \returns The best function order found. +std::vector applyCDSLayout(const CDSortConfig &Config, + const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -985,6 +985,400 @@ std::vector HotChains; }; +/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering +/// functions represented by a call graph. 
+class CDSortImpl { +public: + CDSortImpl(const CDSortConfig &Config, const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts, + const std::vector &EdgeOffsets) + : Config(Config), NumNodes(NodeSizes.size()) { + initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets); + } + + /// Run the algorithm and store the resulting order of node indices in Result. + void run(std::vector &Result) { + // Merge pairs of chains while improving the objective + mergeChainPairs(); + + LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number" + << " of chains from " << NumNodes << " to " + << HotChains.size() << "\n"); + + // Collect nodes from all the chains + concatChains(Result); + } + +private: + /// Initialize the algorithm's data structures. + void initialize(const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts, + const std::vector &EdgeOffsets) { + // Initialize nodes (zero sizes are clamped to 1) + AllNodes.reserve(NumNodes); + for (uint64_t Node = 0; Node < NumNodes; Node++) { + uint64_t Size = std::max(NodeSizes[Node], 1ULL); + uint64_t ExecutionCount = NodeCounts[Node]; + AllNodes.emplace_back(Node, Size, ExecutionCount); + TotalSamples += ExecutionCount; + if (ExecutionCount > 0) + TotalSize += Size; + } + + // Initialize jumps between the nodes + SuccNodes.resize(NumNodes); + PredNodes.resize(NumNodes); + AllJumps.reserve(EdgeCounts.size()); + for (size_t I = 0; I < EdgeCounts.size(); I++) { + auto It = EdgeCounts[I]; + uint64_t Pred = It.first.first; + uint64_t Succ = It.first.second; + // Ignore self-recursive calls + if (Pred == Succ) + continue; + + SuccNodes[Pred].push_back(Succ); + PredNodes[Succ].push_back(Pred); + uint64_t ExecutionCount = It.second; + if (ExecutionCount > 0) { + NodeT &PredNode = AllNodes[Pred]; + NodeT &SuccNode = AllNodes[Succ]; + AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + AllJumps.back().Offset = EdgeOffsets[I]; + SuccNode.InJumps.push_back(&AllJumps.back()); 
+ PredNode.OutJumps.push_back(&AllJumps.back()); + } + } + + // Initialize chains: one single-node chain per node + AllChains.reserve(NumNodes); + HotChains.reserve(NumNodes); + for (NodeT &Node : AllNodes) { + // Adjust execution counts + Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount()); + Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount()); + // Create chain + AllChains.emplace_back(Node.Index, &Node); + Node.CurChain = &AllChains.back(); + if (Node.ExecutionCount > 0) { + HotChains.push_back(&AllChains.back()); + } + } + + // Initialize chain edges + AllEdges.reserve(AllJumps.size()); + for (NodeT &PredNode : AllNodes) { + for (JumpT *Jump : PredNode.OutJumps) { + NodeT *SuccNode = Jump->Target; + ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); + // This edge is already present in the graph + if (CurEdge != nullptr) { + assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); + CurEdge->appendJump(Jump); + continue; + } + // This is a new edge + AllEdges.emplace_back(Jump); + PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); + SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); + } + } + } + + /// Greedily merge pairs of chains: repeatedly take the chain edge with the + /// largest merge gain, merge its two chains, and recompute the gains of the + /// edges of the merged chain. The process stops once no queued edge has a + /// gain exceeding EPS. 
+ void mergeChainPairs() { + // Create a queue of edges ordered by decreasing merge gain + auto GainComparator = [](ChainEdge *L, ChainEdge *R) { + if (L->gain() != R->gain()) + return L->gain() > R->gain(); + + // Break ties by chain ids to make the comparison deterministic + if (L->srcChain()->Id != R->srcChain()->Id) + return L->srcChain()->Id < R->srcChain()->Id; + return L->dstChain()->Id < R->dstChain()->Id; + }; + std::set Queue(GainComparator); + + // Inserting the edges into the queue + for (ChainT *ChainPred : HotChains) { + for (auto EdgeIt : ChainPred->Edges) { + ChainEdge *Edge = EdgeIt.second; + // Ignore self-edges + if (Edge->isSelfEdge()) + continue; + // Ignore already processed edges (-1.0 presumably marks unprocessed ones) + if (Edge->gain() != -1.0) + continue; + + // Compute the gain of merging the two chains + MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + + // Merge the chains while the gain of merging is positive + while (!Queue.empty()) { + // Extract the best (top) edge for merging + ChainEdge *BestEdge = *Queue.begin(); + Queue.erase(Queue.begin()); + // Ignore self-edges + if (BestEdge->isSelfEdge()) + continue; + // Ignore edges with non-positive gains + if (BestEdge->gain() <= EPS) + continue; + + ChainT *BestSrcChain = BestEdge->srcChain(); + ChainT *BestDstChain = BestEdge->dstChain(); + + // Remove edges of both chains from the queue; their gains are outdated + for (std::pair EdgeIt : BestSrcChain->Edges) + Queue.erase(EdgeIt.second); + for (std::pair EdgeIt : BestDstChain->Edges) + Queue.erase(EdgeIt.second); + + // Merge the best pair of chains + MergeGainT BestGain = BestEdge->getMergeGain(); + mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(), + BestGain.mergeType()); + + // Recompute the gains of the merged chain's edges and re-insert them + for (auto EdgeIt : BestSrcChain->Edges) { + ChainEdge *Edge = EdgeIt.second; + // Ignore self-edges + if (Edge->isSelfEdge()) + continue; + + // Compute the gain of merging the two chains + 
MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + } + + /// Compute the gain of merging two chains. + /// + /// The function tries concatenating the two chains in both orders (X_Y and + /// Y_X, without splitting) and selects the variant with the largest score. + /// The result is a MergeGainT holding the score together with the + /// corresponding merge offset and merge type. + MergeGainT getBestMergeGain(ChainEdge *Edge) const { + // Collect the jumps between SrcChain and DstChain + auto Jumps = Edge->jumps(); + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + ChainT *SrcChain = Edge->srcChain(); + ChainT *DstChain = Edge->dstChain(); + + // The object holds the best currently chosen gain of merging the two chains + MergeGainT Gain = MergeGainT(); + + /// Given a merge offset and a list of merge types, try to merge two chains + /// and update Gain with a better alternative + auto tryChainMerging = [&](size_t Offset, + const std::vector &MergeTypes) { + // Evaluate the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial + for (const MergeTypeT &MergeType : MergeTypes) { + MergeGainT NewGain = + computeMergeGain(SrcChain, DstChain, Jumps, Offset, MergeType); + + // When forward and backward gains are the same, prioritize merging that + // preserves the original order of the functions in the binary + if (std::abs(Gain.score() - NewGain.score()) < EPS) { + if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) || + (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) { + Gain = NewGain; + } + } else if (NewGain.score() > Gain.score() + EPS) { + Gain = NewGain; + } + } + }; + + // Try to concatenate two chains w/o splitting + tryChainMerging(0, {MergeTypeT::X_Y, MergeTypeT::Y_X}); + + return Gain; + } + + /// Compute the score gain of merging two chains, respecting a given + /// merge 'type' and 'offset'. 
+ /// + /// The two chains are not modified in the method. + MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc, + const std::vector &Jumps, + size_t MergeOffset, MergeTypeT MergeType) const { + // The frequency gain doesn't depend on the ordering of the nodes + double FreqGain = mergeGainFreq(ChainPred, ChainSucc); + + auto MergedBlocks = + mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); + double DistGain = mergeGainDist(MergedBlocks, Jumps); + + double GainScore = DistGain + Config.FrequencyScale * FreqGain; + // Scale the result to increase the importance of merging short chains + if (GainScore >= 0.0) + GainScore /= std::min(ChainPred->Size, ChainSucc->Size); + + return MergeGainT(GainScore, MergeOffset, MergeType); + } + + /// Compute the change of the frequency locality after merging the chains. + double mergeGainFreq(ChainT *ChainPred, ChainT *ChainSucc) const { + auto missProbability = [&](double ChainDensity) { + double PageSamples = ChainDensity * Config.CacheSize; + if (PageSamples >= TotalSamples) + return 0.0; + double P = PageSamples / TotalSamples; + return pow(1.0 - P, static_cast(Config.CacheEntries)); + }; + + // Cache misses on the chains before merging + double CurScore = + ChainPred->ExecutionCount * missProbability(ChainPred->density()) + + ChainSucc->ExecutionCount * missProbability(ChainSucc->density()); + + // Cache misses on the merged chain + double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; + double MergedSize = ChainPred->Size + ChainSucc->Size; + double MergedDensity = static_cast(MergedCounts) / MergedSize; + double NewScore = MergedCounts * missProbability(MergedDensity); + + return CurScore - NewScore; + } + + /// Compute the distance locality for a call: Count * Dist^(-DistancePower). + double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const { + uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr; + double D = Dist == 0 ? 
0.1 : static_cast(Dist); + return static_cast(Count) * std::pow(D, -Config.DistancePower); + } + + /// Compute the change of the distance locality after merging the chains. + double mergeGainDist(const MergedChain &MergedBlocks, + const std::vector &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const NodeT *Node) { + Node->EstimatedAddr = CurAddr; + CurAddr += Node->Size; + }); + + double CurScore = 0; + double NewScore = 0; + for (const JumpT *Arc : Jumps) { + uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset; + uint64_t DstAddr = Arc->Target->EstimatedAddr; + NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount); + CurScore += distScore(0, TotalSize, Arc->ExecutionCount); + } + return NewScore - CurScore; + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. + void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, + MergeTypeT MergeType) { + assert(Into != From && "a chain cannot be merged with itself"); + + // Merge the nodes + MergedChain MergedNodes = + mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); + Into->merge(From, MergedNodes.getNodes()); + + // Merge the edges + Into->mergeEdges(From); + From->clear(); + + // Remove the chain from the list of active chains + llvm::erase_value(HotChains, From); + } + + /// Concatenate all chains into the final order. 
+ void concatChains(std::vector &Order) { + // Collect non-empty chains and compute their densities (count / size) + std::vector SortedChains; + DenseMap ChainDensity; + for (ChainT &Chain : AllChains) { + if (!Chain.Nodes.empty()) { + SortedChains.push_back(&Chain); + // Using doubles to avoid overflow of ExecutionCounts + double Size = 0; + double ExecutionCount = 0; + for (NodeT *Node : Chain.Nodes) { + Size += static_cast(Node->Size); + ExecutionCount += static_cast(Node->ExecutionCount); + } + assert(Size > 0 && "a chain of zero size"); + ChainDensity[&Chain] = ExecutionCount / Size; + } + } + + // Sorting chains by density in the decreasing order + std::stable_sort(SortedChains.begin(), SortedChains.end(), + [&](const ChainT *L, const ChainT *R) { + const double DL = ChainDensity[L]; + const double DR = ChainDensity[R]; + // Compare by density and break ties by chain identifiers + return (DL != DR) ? (DL > DR) : (L->Id < R->Id); + }); + + // Collect the nodes in the order specified by their chains + Order.reserve(NumNodes); + for (const ChainT *Chain : SortedChains) { + for (NodeT *Node : Chain->Nodes) { + Order.push_back(Node->Index); + } + } + } + +private: + /// Config for the algorithm. + const CDSortConfig &Config; + + /// The number of nodes in the graph. + const size_t NumNodes; + + /// Successors of each node. + std::vector> SuccNodes; + + /// Predecessors of each node. + std::vector> PredNodes; + + /// All nodes (functions) in the graph. + std::vector AllNodes; + + /// All jumps (function calls) between the nodes. + std::vector AllJumps; + + /// All chains of nodes. + std::vector AllChains; + + /// All edges between the chains. + std::vector AllEdges; + + /// Active chains. The vector gets updated at runtime when chains are merged. + std::vector HotChains; + + /// The total number of samples in the graph (the sum of the node counts). + uint64_t TotalSamples{0}; + + /// The total size of the nodes with a non-zero input execution count. 
+ uint64_t TotalSize{0}; +}; + } // end of anonymous namespace std::vector @@ -1043,3 +1437,22 @@ } return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } + +std::vector +llvm::applyCDSLayout(const CDSortConfig &Config, + const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets) { + // Verify the input sizes. NOTE(review): CallOffsets.size() is not checked + assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); + + // Apply the reordering algorithm + CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result; + Alg.run(Result); + + // Verify correctness of the output + assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); + return Result; +}