diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp --- a/bolt/lib/Passes/ReorderAlgorithm.cpp +++ b/bolt/lib/Passes/ReorderAlgorithm.cpp @@ -531,21 +531,21 @@ } // Initialize CFG edges - using JumpT = std::pair; - std::vector> JumpCounts; + std::vector JumpCounts; for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { auto BI = BB->branch_info_begin(); for (BinaryBasicBlock *SuccBB : BB->successors()) { assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && "missing profile for a jump"); - auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex()); - JumpCounts.push_back(std::make_pair(It, BI->Count)); + JumpCounts.push_back( + {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count}); ++BI; } } // Run the layout algorithm - auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto Result = + codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); Order.reserve(BF.getLayout().block_size()); for (uint64_t R : Result) Order.push_back(OrigOrder[R]); diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -331,23 +331,21 @@ // Initialize CFG nodes and their data std::vector FuncSizes; std::vector FuncCounts; - using JumpT = std::pair; - std::vector> CallCounts; + std::vector CallCounts; std::vector CallOffsets; for (NodeId F = 0; F < Cg.numNodes(); ++F) { FuncSizes.push_back(Cg.size(F)); FuncCounts.push_back(Cg.samples(F)); for (NodeId Succ : Cg.successors(F)) { const Arc &Arc = *Cg.findArc(F, Succ); - auto It = std::make_pair(F, Succ); - CallCounts.push_back(std::make_pair(It, Arc.weight())); + CallCounts.push_back({F, Succ, uint64_t(Arc.weight())}); CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); } } // Run the layout algorithm. - std::vector Result = - applyCDSLayout(FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result = codelayout::computeCacheDirectedLayout( + FuncSizes, FuncCounts, CallCounts, CallOffsets); // Create a single cluster from the computed order of hot functions. std::vector NodeOrder(Result.begin(), Result.end()); diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h --- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h +++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h @@ -14,14 +14,22 @@ #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include +#include #include -namespace llvm { +namespace llvm::codelayout { using EdgeT = std::pair; -using EdgeCountT = std::pair; + +struct EdgeCount { + uint64_t src; + uint64_t dst; + uint64_t count; +}; /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump /// locality and thus processor I-cache utilization. This is achieved via @@ -34,24 +42,22 @@ /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The /// map also defines the edges in CFG and should include 0-count edges. /// \returns The best block order found. -std::vector -applyExtTspLayout(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +std::vector computeExtTspLayout(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Estimate the "quality" of a given node order in CFG. The higher the score, /// the better the order is. The score is designed to reflect the locality of /// the given order, which is anti-correlated with the number of I-cache misses /// in a typical execution of the function. -double calcExtTspScore(const std::vector &Order, - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +double calcExtTspScore(ArrayRef Order, ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Estimate the "quality" of the current node order in CFG. -double calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +double calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Algorithm-specific params for Cache-Directed Sort. The values are tuned for /// the best performance of large-scale front-end bound binaries. @@ -75,18 +81,16 @@ /// map also defines the edges in CFG and should include 0-count edges. /// \p CallOffsets: The offsets of the calls from their source nodes. /// \returns The best function order found. -std::vector applyCDSLayout(const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets); +std::vector computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets); /// Apply a Cache-Directed Sort with a custom config. -std::vector applyCDSLayout(const CDSortConfig &Config, - const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets); +std::vector computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets); -} // end namespace llvm +} // namespace llvm::codelayout #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3501,7 +3501,7 @@ auto BlockSizes = std::vector(F->size()); auto BlockCounts = std::vector(F->size()); - std::vector JumpCounts; + std::vector JumpCounts; for (MachineBasicBlock &MBB : *F) { // Getting the block frequency. BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); @@ -3520,8 +3520,8 @@ for (MachineBasicBlock *Succ : MBB.successors()) { auto EP = MBPI->getEdgeProbability(&MBB, Succ); BlockFrequency JumpFreq = BlockFreq * EP; - auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); - JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency())); + JumpCounts.push_back( + {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); } } @@ -3534,7 +3534,7 @@ calcExtTspScore(BlockSizes, BlockCounts, JumpCounts))); // Run the layout algorithm. - auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); std::vector NewBlockOrder; NewBlockOrder.reserve(F->size()); for (uint64_t Node : NewOrder) { diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -48,6 +48,8 @@ #include using namespace llvm; +using namespace llvm::codelayout; + #define DEBUG_TYPE "code-layout" namespace llvm { @@ -318,8 +320,8 @@ Edges.push_back(std::make_pair(Other, Edge)); } - void merge(ChainT *Other, const std::vector &MergedBlocks) { - Nodes = MergedBlocks; + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; @@ -549,15 +551,14 @@ /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { public: - ExtTSPImpl(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } /// Run the algorithm and return an optimized ordering of nodes. - void run(std::vector &Result) { + std::vector run() { // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); @@ -568,14 +569,14 @@ mergeColdChains(); // Collect nodes from all chains - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { // Initialize nodes AllNodes.reserve(NumNodes); for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { @@ -593,8 +594,7 @@ std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; + auto [Pred, Succ, ExecutionCount] = It; OutDegree[Pred]++; // Ignore self-edges. if (Pred == Succ) @@ -602,7 +602,6 @@ SuccNodes[Pred].push_back(Succ); PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; if (ExecutionCount > 0) { NodeT &PredNode = AllNodes[Pred]; NodeT &SuccNode = AllNodes[Succ]; @@ -923,7 +922,7 @@ } /// Concatenate all chains into the final order. - void concatChains(std::vector &Order) { + std::vector concatChains() { // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; @@ -957,12 +956,12 @@ }); // Collect the nodes in the order specified by their chains. + std::vector Order; Order.reserve(NumNodes); - for (const ChainT *Chain : SortedChains) { - for (NodeT *Node : Chain->Nodes) { + for (const ChainT *Chain : SortedChains) + for (NodeT *Node : Chain->Nodes) Order.push_back(Node->Index); - } - } + return Order; } private: @@ -995,16 +994,15 @@ /// functions represented by a call graph. class CDSortImpl { public: - CDSortImpl(const CDSortConfig &Config, const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts, - const std::vector &EdgeOffsets) + CDSortImpl(const CDSortConfig &Config, ArrayRef NodeSizes, + ArrayRef NodeCounts, ArrayRef EdgeCounts, + ArrayRef EdgeOffsets) : Config(Config), NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets); } /// Run the algorithm and return an ordered set of function clusters. - void run(std::vector &Result) { + std::vector run() { // Merge pairs of chains while improving the objective. mergeChainPairs(); @@ -1013,15 +1011,15 @@ << HotChains.size() << "\n"); // Collect nodes from all the chains. - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts, - const std::vector &EdgeOffsets) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts, + const ArrayRef &EdgeOffsets) { // Initialize nodes. AllNodes.reserve(NumNodes); for (uint64_t Node = 0; Node < NumNodes; Node++) { @@ -1038,20 +1036,17 @@ PredNodes.resize(NumNodes); AllJumps.reserve(EdgeCounts.size()); for (size_t I = 0; I < EdgeCounts.size(); I++) { - auto It = EdgeCounts[I]; - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; + auto [Pred, Succ, Count] = EdgeCounts[I]; // Ignore recursive calls. if (Pred == Succ) continue; SuccNodes[Pred].push_back(Succ); PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { + if (Count > 0) { NodeT &PredNode = AllNodes[Pred]; NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + AllJumps.emplace_back(&PredNode, &SuccNode, Count); AllJumps.back().Offset = EdgeOffsets[I]; SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); @@ -1302,7 +1297,7 @@ } /// Concatenate all chains into the final order. - void concatChains(std::vector &Order) { + std::vector concatChains() { // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; @@ -1332,10 +1327,12 @@ }); // Collect the nodes in the order specified by their chains. + std::vector Order; Order.reserve(NumNodes); for (const ChainT *Chain : SortedChains) for (NodeT *Node : Chain->Nodes) Order.push_back(Node->Index); + return Order; } private: @@ -1376,17 +1373,16 @@ } // end of anonymous namespace std::vector -llvm::applyExtTspLayout(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +codelayout::computeExtTspLayout(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { // Verify correctness of the input data. assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); assert(NodeSizes.size() > 2 && "Incorrect input"); // Apply the reordering algorithm. ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts); - std::vector Result; - Alg.run(Result); + std::vector Result = Alg.run(); // Verify correctness of the output. assert(Result.front() == 0 && "Original entry point is not preserved"); @@ -1394,27 +1390,22 @@ return Result; } -double llvm::calcExtTspScore(const std::vector &Order, - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef Order, + ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { // Estimate addresses of the blocks in memory. std::vector Addr(NodeSizes.size(), 0); for (size_t Idx = 1; Idx < Order.size(); Idx++) { Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; } std::vector OutDegree(NodeSizes.size(), 0); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - OutDegree[Pred]++; - } + for (auto [Pred, Succ, Count] : EdgeCounts) + ++OutDegree[Pred]; // Increase the score for each jump. double Score = 0; - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - uint64_t Count = It.second; + for (auto [Pred, Succ, Count] : EdgeCounts) { bool IsConditional = OutDegree[Pred] > 1; Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count, IsConditional); @@ -1422,9 +1413,9 @@ return Score; } -double llvm::calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { std::vector Order(NodeSizes.size()); for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { Order[Idx] = Idx; @@ -1432,30 +1423,23 @@ return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } -std::vector -llvm::applyCDSLayout(const CDSortConfig &Config, - const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets) { +std::vector codelayout::computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets) { // Verify correctness of the input data. assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); // Apply the reordering algorithm. CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); - std::vector Result; - Alg.run(Result); - - // Verify correctness of the output. + std::vector Result = Alg.run(); assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); return Result; } -std::vector -llvm::applyCDSLayout(const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets) { +std::vector codelayout::computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets) { CDSortConfig Config; // Populate the config from the command-line options. if (CacheEntries.getNumOccurrences() > 0) @@ -1466,5 +1450,6 @@ Config.DistancePower = DistancePower; if (FrequencyScale.getNumOccurrences() > 0) Config.FrequencyScale = FrequencyScale; - return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts, + CallOffsets); }