diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h --- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h +++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h @@ -20,6 +20,9 @@ namespace llvm { +using EdgeT = std::pair; +using EdgeCountT = std::pair; + /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump /// locality and thus processor I-cache utilization. This is achieved via /// increasing the number of fall-through jumps and co-locating frequently @@ -31,25 +34,24 @@ /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The /// map also defines the edges in CFG and should include 0-count edges. /// \returns The best block order found. -std::vector applyExtTspLayout( - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const DenseMap, uint64_t> &EdgeCounts); +std::vector +applyExtTspLayout(const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts); /// Estimate the "quality" of a given node order in CFG. The higher the score, /// the better the order is. The score is designed to reflect the locality of /// the given order, which is anti-correlated with the number of I-cache misses /// in a typical execution of the function. -double calcExtTspScore( - const std::vector &Order, const std::vector &NodeSizes, - const std::vector &NodeCounts, - const DenseMap, uint64_t> &EdgeCounts); +double calcExtTspScore(const std::vector &Order, + const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts); /// Estimate the "quality" of the current node order in CFG. 
-double calcExtTspScore( - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const DenseMap, uint64_t> &EdgeCounts); +double calcExtTspScore(const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts); } // end namespace llvm diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3488,7 +3488,7 @@ auto BlockSizes = std::vector(F->size()); auto BlockCounts = std::vector(F->size()); - DenseMap, uint64_t> JumpCounts; + std::vector JumpCounts; for (MachineBasicBlock &MBB : *F) { // Getting the block frequency. BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); @@ -3506,9 +3506,9 @@ // Getting jump frequencies. for (MachineBasicBlock *Succ : MBB.successors()) { auto EP = MBPI->getEdgeProbability(&MBB, Succ); - BlockFrequency EdgeFreq = BlockFreq * EP; - auto Edge = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); - JumpCounts[Edge] = EdgeFreq.getFrequency(); + BlockFrequency JumpFreq = BlockFreq * EP; + auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); + JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency())); } } diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -35,6 +35,7 @@ // Reference: // * A. Newell and S. Pupyrev, Improved Basic Block Reordering, // IEEE Transactions on Computers, 2020 +// https://arxiv.org/abs/1809.04676 // //===----------------------------------------------------------------------===// @@ -54,40 +55,56 @@ cl::desc("Whether to apply ext-tsp placement for instances w/o profile"), cl::init(true), cl::Hidden); -// Algorithm-specific constants. The values are tuned for the best performance +// Algorithm-specific params. 
The values are tuned for the best performance // of large-scale front-end bound binaries. -static cl::opt -    ForwardWeight("ext-tsp-forward-weight", cl::Hidden, cl::init(0.1), -                  cl::desc("The weight of forward jumps for ExtTSP value")); +static cl::opt ForwardWeightCond( +    "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1), +    cl::desc("The weight of conditional forward jumps for ExtTSP value")); -static cl::opt -    BackwardWeight("ext-tsp-backward-weight", cl::Hidden, cl::init(0.1), -                   cl::desc("The weight of backward jumps for ExtTSP value")); +static cl::opt ForwardWeightUncond( +    "ext-tsp-forward-weight-uncond", cl::ReallyHidden, cl::init(0.1), +    cl::desc("The weight of unconditional forward jumps for ExtTSP value")); + +static cl::opt BackwardWeightCond( +    "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1), +    cl::desc("The weight of conditional backward jumps for ExtTSP value")); + +static cl::opt BackwardWeightUncond( +    "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1), +    cl::desc("The weight of unconditional backward jumps for ExtTSP value")); + +static cl::opt FallthroughWeightCond( +    "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0), +    cl::desc("The weight of conditional fallthrough jumps for ExtTSP value")); + +static cl::opt FallthroughWeightUncond( +    "ext-tsp-fallthrough-weight-uncond", cl::ReallyHidden, cl::init(1.05), +    cl::desc("The weight of unconditional fallthrough jumps for ExtTSP value")); static cl::opt ForwardDistance( -    "ext-tsp-forward-distance", cl::Hidden, cl::init(1024), +    "ext-tsp-forward-distance", cl::ReallyHidden, cl::init(1024), cl::desc("The maximum distance (in bytes) of a forward jump for ExtTSP")); static cl::opt BackwardDistance( -    "ext-tsp-backward-distance", cl::Hidden, cl::init(640), +    "ext-tsp-backward-distance", cl::ReallyHidden, cl::init(640), cl::desc("The maximum distance (in bytes) of a backward jump for ExtTSP")); // The maximum size of a chain created by the 
algorithm. The size is bounded // so that the algorithm can efficiently process extremely large instance. static cl::opt - MaxChainSize("ext-tsp-max-chain-size", cl::Hidden, cl::init(4096), + MaxChainSize("ext-tsp-max-chain-size", cl::ReallyHidden, cl::init(4096), cl::desc("The maximum size of a chain to create.")); // The maximum size of a chain for splitting. Larger values of the threshold // may yield better quality at the cost of worsen run-time. static cl::opt ChainSplitThreshold( - "ext-tsp-chain-split-threshold", cl::Hidden, cl::init(128), + "ext-tsp-chain-split-threshold", cl::ReallyHidden, cl::init(128), cl::desc("The maximum size of a chain to apply splitting")); // The option enables splitting (large) chains along in-coming and out-going // jumps. This typically results in a better quality. static cl::opt EnableChainSplitAlongJumps( - "ext-tsp-enable-chain-split-along-jumps", cl::Hidden, cl::init(true), + "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true), cl::desc("The maximum size of a chain to apply splitting")); namespace { @@ -95,31 +112,37 @@ // Epsilon for comparison of doubles. constexpr double EPS = 1e-8; +// Compute the Ext-TSP score for a given jump. +double jumpExtTSPScore(uint64_t JumpDist, uint64_t JumpMaxDist, uint64_t Count, + double Weight) { + if (JumpDist > JumpMaxDist) + return 0; + double Prob = 1.0 - static_cast(JumpDist) / JumpMaxDist; + return Weight * Prob * Count; +} + // Compute the Ext-TSP score for a jump between a given pair of blocks, // using their sizes, (estimated) addresses and the jump execution count. double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr, - uint64_t Count) { + uint64_t Count, bool IsConditional) { // Fallthrough if (SrcAddr + SrcSize == DstAddr) { - // Assume that FallthroughWeight = 1.0 after normalization - return static_cast(Count); + return jumpExtTSPScore(0, 1, Count, + IsConditional ? 
FallthroughWeightCond + : FallthroughWeightUncond); } // Forward if (SrcAddr + SrcSize < DstAddr) { - const auto Dist = DstAddr - (SrcAddr + SrcSize); - if (Dist <= ForwardDistance) { - double Prob = 1.0 - static_cast(Dist) / ForwardDistance; - return ForwardWeight * Prob * Count; - } - return 0; + const uint64_t Dist = DstAddr - (SrcAddr + SrcSize); + return jumpExtTSPScore(Dist, ForwardDistance, Count, + IsConditional ? ForwardWeightCond + : ForwardWeightUncond); } // Backward - const auto Dist = SrcAddr + SrcSize - DstAddr; - if (Dist <= BackwardDistance) { - double Prob = 1.0 - static_cast(Dist) / BackwardDistance; - return BackwardWeight * Prob * Count; - } - return 0; + const uint64_t Dist = SrcAddr + SrcSize - DstAddr; + return jumpExtTSPScore(Dist, BackwardDistance, Count, + IsConditional ? BackwardWeightCond + : BackwardWeightUncond); } /// A type of merging two chains, X and Y. The former chain is split into @@ -191,8 +214,8 @@ std::vector InJumps; public: - explicit Block(size_t Index, uint64_t Size_, uint64_t EC) - : Index(Index), Size(Size_), ExecutionCount(EC) {} + explicit Block(size_t Index, uint64_t Size, uint64_t EC) + : Index(Index), Size(Size), ExecutionCount(EC) {} bool isEntry() const { return Index == 0; } }; @@ -210,6 +233,8 @@ Block *Target; // Execution count of the arc in the profile data. uint64_t ExecutionCount{0}; + // Whether the jump corresponds to a conditional branch. 
+ bool IsConditional{false}; public: explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) @@ -231,6 +256,14 @@ bool isEntry() const { return Blocks[0]->Index == 0; } + bool isCold() const { + for (auto Block : Blocks) { + if (Block->ExecutionCount > 0) + return false; + } + return true; + } + double score() const { return Score; } void setScore(double NewScore) { Score = NewScore; } @@ -371,10 +404,10 @@ // Update edges adjacent to chain Other for (auto EdgeIt : Other->Edges) { - const auto DstChain = EdgeIt.first; - const auto DstEdge = EdgeIt.second; - const auto TargetChain = DstChain == Other ? this : DstChain; - auto CurEdge = getEdge(TargetChain); + Chain *DstChain = EdgeIt.first; + ChainEdge *DstEdge = EdgeIt.second; + Chain *TargetChain = DstChain == Other ? this : DstChain; + ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); @@ -436,7 +469,7 @@ /// The implementation of the ExtTSP algorithm. 
class ExtTSPImpl { using EdgeT = std::pair; - using EdgeCountMap = DenseMap; + using EdgeCountMap = std::vector>; public: ExtTSPImpl(size_t NumNodes, const std::vector &NodeSizes, @@ -480,10 +513,12 @@ // Initialize jumps between blocks SuccNodes = std::vector>(NumNodes); PredNodes = std::vector>(NumNodes); + auto OutDegree = std::vector(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); for (auto It : EdgeCounts) { auto Pred = It.first.first; auto Succ = It.first.second; + OutDegree[Pred]++; // Ignore self-edges if (Pred == Succ) continue; @@ -499,11 +534,15 @@ Block.OutJumps.push_back(&AllJumps.back()); } } + for (auto &Jump : AllJumps) { + assert(OutDegree[Jump.Source->Index] > 0); + Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; + } // Initialize chains AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); - for (auto &Block : AllBlocks) { + for (Block &Block : AllBlocks) { AllChains.emplace_back(Block.Index, &Block); Block.CurChain = &AllChains.back(); if (Block.ExecutionCount > 0) { @@ -513,10 +552,10 @@ // Initialize chain edges AllEdges.reserve(AllJumps.size()); - for (auto &Block : AllBlocks) { + for (Block &Block : AllBlocks) { for (auto &Jump : Block.OutJumps) { auto SuccBlock = Jump->Target; - auto CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); + ChainEdge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); // this edge is already present in the graph if (CurEdge != nullptr) { assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); @@ -596,11 +635,11 @@ Chain *BestChainSucc = nullptr; auto BestGain = MergeGainTy(); // Iterate over all pairs of chains - for (auto ChainPred : HotChains) { + for (Chain *ChainPred : HotChains) { // Get candidates for merging with the current chain for (auto EdgeIter : ChainPred->edges()) { - auto ChainSucc = EdgeIter.first; - auto ChainEdge = EdgeIter.second; + Chain *ChainSucc = EdgeIter.first; + class ChainEdge *ChainEdge = EdgeIter.second; // Ignore loop edges if (ChainPred == 
ChainSucc) continue; @@ -610,7 +649,8 @@ continue; // Compute the gain of merging the two chains - auto CurGain = getBestMergeGain(ChainPred, ChainSucc, ChainEdge); + MergeGainTy CurGain = + getBestMergeGain(ChainPred, ChainSucc, ChainEdge); if (CurGain.score() <= EPS) continue; @@ -635,11 +675,13 @@ } } - /// Merge cold blocks to reduce code size. + /// Merge remaining blocks into chains w/o taking jump counts into + /// consideration. This allows us to maintain the original block order in the + /// absence of profile data void mergeColdChains() { for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { - // Iterating over neighbors in the reverse order to make sure original - // fallthrough jumps are merged first + // Iterating in reverse order to make sure original fallthrough jumps are + // merged first; this might be beneficial for code size. size_t NumSuccs = SuccNodes[SrcBB].size(); for (size_t Idx = 0; Idx < NumSuccs; Idx++) { auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; @@ -647,7 +689,8 @@ auto DstChain = AllBlocks[DstBB].CurChain; if (SrcChain != DstChain && !DstChain->isEntry() && SrcChain->blocks().back()->Index == SrcBB && - DstChain->blocks().front()->Index == DstBB) { + DstChain->blocks().front()->Index == DstBB && + SrcChain->isCold() == DstChain->isCold()) { mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y); } } @@ -666,11 +709,12 @@ }); double Score = 0; - for (const auto &Jump : Jumps) { - const auto SrcBlock = Jump->Source; - const auto DstBlock = Jump->Target; + for (auto &Jump : Jumps) { + const Block *SrcBlock = Jump->Source; + const Block *DstBlock = Jump->Target; Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size, - DstBlock->EstimatedAddr, Jump->ExecutionCount); + DstBlock->EstimatedAddr, Jump->ExecutionCount, + Jump->IsConditional); } return Score; } @@ -689,7 +733,7 @@ // Precompute jumps between ChainPred and ChainSucc auto Jumps = Edge->jumps(); - auto EdgePP = ChainPred->getEdge(ChainPred); + ChainEdge *EdgePP = 
ChainPred->getEdge(ChainPred); if (EdgePP != nullptr) { Jumps.insert(Jumps.end(), EdgePP->jumps().begin(), EdgePP->jumps().end()); } @@ -813,14 +857,14 @@ assert(Into != From && "a chain cannot be merged with itself"); // Merge the blocks - auto MergedBlocks = + MergedChain MergedBlocks = mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType); Into->merge(From, MergedBlocks.getBlocks()); Into->mergeEdges(From); From->clear(); // Update cached ext-tsp score for the new chain - auto SelfEdge = Into->getEdge(Into); + ChainEdge *SelfEdge = Into->getEdge(Into); if (SelfEdge != nullptr) { MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end()); Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps())); @@ -858,7 +902,7 @@ // Sorting chains by density in the decreasing order std::stable_sort(SortedChains.begin(), SortedChains.end(), [&](const Chain *C1, const Chain *C2) { - // Makre sure the original entry block is at the + // Make sure the original entry block is at the // beginning of the order if (C1->isEntry() != C2->isEntry()) { return C1->isEntry(); @@ -872,8 +916,8 @@ // Collect the blocks in the order specified by their chains Order.reserve(NumNodes); - for (auto Chain : SortedChains) { - for (auto Block : Chain->blocks()) { + for (Chain *Chain : SortedChains) { + for (Block *Block : Chain->blocks()) { Order.push_back(Block->Index); } } @@ -910,7 +954,7 @@ std::vector llvm::applyExtTspLayout( const std::vector &NodeSizes, const std::vector &NodeCounts, - const DenseMap, uint64_t> &EdgeCounts) { + const std::vector> &EdgeCounts) { size_t NumNodes = NodeSizes.size(); // Verify correctness of the input data. 
@@ -931,12 +975,17 @@ double llvm::calcExtTspScore( const std::vector &Order, const std::vector &NodeSizes, const std::vector &NodeCounts, - const DenseMap, uint64_t> &EdgeCounts) { + const std::vector> &EdgeCounts) { // Estimate addresses of the blocks in memory auto Addr = std::vector(NodeSizes.size(), 0); for (size_t Idx = 1; Idx < Order.size(); Idx++) { Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; } + auto OutDegree = std::vector(NodeSizes.size(), 0); + for (auto It : EdgeCounts) { + auto Pred = It.first.first; + OutDegree[Pred]++; + } // Increase the score for each jump double Score = 0; @@ -944,7 +993,9 @@ auto Pred = It.first.first; auto Succ = It.first.second; uint64_t Count = It.second; - Score += extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count); + bool IsConditional = OutDegree[Pred] > 1; + Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count, + IsConditional); } return Score; } @@ -952,7 +1003,7 @@ double llvm::calcExtTspScore( const std::vector &NodeSizes, const std::vector &NodeCounts, - const DenseMap, uint64_t> &EdgeCounts) { + const std::vector> &EdgeCounts) { auto Order = std::vector(NodeSizes.size()); for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { Order[Idx] = Idx; diff --git a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll --- a/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll +++ b/llvm/test/CodeGen/X86/code_placement_ext_tsp_large.ll @@ -69,7 +69,7 @@ ; ; CHECK-LABEL: Applying ext-tsp layout ; CHECK: original layout score: 9171074274.27 -; CHECK: optimized layout score: 10756755324.57 +; CHECK: optimized layout score: 10844307310.87 ; CHECK: b0 ; CHECK: b2 ; CHECK: b3