Index: lld/ELF/PropellerBBReordering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.h @@ -0,0 +1,355 @@ +//===- PropellerBBReordering.h ----------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM +//Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "PropellerELFCfg.h" + +#include "lld/Common/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +enum MergeOrder { + Begin, + X2X1Y = Begin, + BeginNext, + X1YX2 = BeginNext, + X2YX1, + YX2X1, + End +}; + +// Represents a chain of nodes (basic blocks). +class NodeChain { +public: + // Representative node of the chain, with which it is initially constructed. + const ELFCfgNode *DelegateNode = nullptr; + std::vector Nodes; + + // Total binary size of the chain + uint32_t Size = 0; + + // Total execution frequency of the chain + uint64_t Freq = 0; + + // Extended TSP score of the chain + double Score = 0; + + // Constructor for building a NodeChain from a single Node + NodeChain(const ELFCfgNode *Node) { + DelegateNode = Node; + Nodes.push_back(Node); + Size = Node->ShSize; + Freq = Node->Freq; + } + + double execDensity() const { + return ((double)Freq) / std::max(Size, (uint32_t)1); + } + + const ELFCfgNode *getFirstNode() const { return Nodes.front(); } + + const ELFCfgNode *getLastNode() const { return Nodes.back(); } +}; + +// BB Chain builder based on the ExtTSP metric +class NodeChainBuilder { +private: + class NodeChainAssembly; + class NodeChainSlice; + + // Cfg representing a single function. + const ELFCfg *CFG; + + // Set of built chains, keyed by section index of their Delegate Nodes. + DenseMap> Chains; + + // Map from every node to its containing chain. + // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeToChainMap; + + // Map from every node to its (binary) offset in its containing chain. + // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeOffsetMap; + + // These represent all the edges which are -- based on the profile -- the only + // (executed) outgoing edges from their source node and the only (executed) + // incoming edges to their sink nodes. The algorithm will make sure that these + // edges form fall-throughs in the final order. + DenseMap MutuallyForcedOut; + + // This maps every (ordered) pair of chains (with the first chain in the pair + // potentially splittable) to the highest-gain NodeChainAssembly for those + // chains. + DenseMap, + std::unique_ptr> + NodeChainAssemblies; + + // This map stores the candidate chains for each chain. + // + // For every chain, its candidate chains are the chains which can increase the + // overall ExtTSP score when merged with that chain. This is used to update + // the NodeChainAssemblies map whenever chains merge together. The candidate + // chains of a chain may also be updated as result of a merge. 
+ std::unordered_map> + CandidateChains; + + void sortChainsByExecutionDensity(std::vector &ChainOrder); + + void initializeExtTSP(); + void attachFallThroughs(); + + // This function tries to place two nodes immediately adjacent to + // each other (used for fallthroughs). + // Returns true if this can be done. + bool attachNodes(const ELFCfgNode *src, const ELFCfgNode *sink); + + void mergeChainEdges(NodeChain *splitChain, NodeChain *unSplitChain); + + void mergeChains(NodeChain *leftChain, NodeChain *rightChain); + void mergeChains(std::unique_ptr assembly); + + // Recompute the ExtTSP score of a chain + double computeExtTSPScore(NodeChain *chain) const; + + // Update the related NodeChainAssembly records for two chains, with the + // assumption that UnsplitChain has been merged into SplitChain. + bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain); + + void computeChainOrder(std::vector &chainOrder); + + // Initialize the mutuallyForcedOut map + void initMutuallyForcedEdges(); + + // Initialize basic block chains, with one chain for every node + void initNodeChains(); + + uint32_t getNodeOffset(const ELFCfgNode *node) const { + auto it = NodeOffsetMap.find(node); + assert("Node does not exist in the offset map." && + it != NodeOffsetMap.end()); + return it->second; + } + + NodeChain *getNodeChain(const ELFCfgNode *node) const { + auto it = NodeToChainMap.find(node); + assert("Node does not exist in the chain map." && + it != NodeToChainMap.end()); + return it->second; + } + +public: + NodeChainBuilder(const ELFCfg *_CFG) : CFG(_CFG) { + initNodeChains(); + initMutuallyForcedEdges(); + } + + // This invokes the Extended TSP algorithm, orders the hot and cold basic + // blocks and inserts their associated symbols at the corresponding locations + // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the + // given SymbolList. This function also modifies the alignment of basic blocks + // based on the new order if the feature is requested by + // -propeller-align-basic-blocks. + void doSplitOrder(std::list &SymbolList, + std::list::iterator HotPlaceHolder, + std::list::iterator ColdPlaceHolder); +}; + +class NodeChainBuilder::NodeChainSlice { +private: + // Chain from which this slice comes from + NodeChain *Chain; + + // The endpoints of the slice in the corresponding chain + std::vector::iterator Begin, End; + + // The offsets corresponding to the two endpoints + uint32_t BeginOffset, EndOffset; + + // Constructor for building a chain slice from a given chain and the two + // endpoints of the chain. 
+ NodeChainSlice(NodeChain *c, std::vector::iterator begin, + std::vector::iterator end, + const NodeChainBuilder &chainBuilder) + : Chain(c), Begin(begin), End(end) { + + BeginOffset = chainBuilder.getNodeOffset(*begin); + if (End == Chain->Nodes.end()) + EndOffset = Chain->Size; + else + EndOffset = chainBuilder.getNodeOffset(*end); + } + + // (Binary) size of this slice + uint32_t size() const { return EndOffset - BeginOffset; } + + friend class NodeChainBuilder; + friend class NodeChainAssembly; +}; + +class NodeChainBuilder::NodeChainAssembly { +private: + // Total ExtTSP score of this NodeChainAssembly + double Score = 0; + + // Whether the ExtTSP score has been computed for this NodeChainAssembly + bool ScoreComputed = false; + + // Corresponding NodeChainBuilder of the assembled chains + const NodeChainBuilder *ChainBuilder; + + // The two assembled chains + NodeChain *SplitChain, *UnsplitChain; + + // The slice position of Splitchain + std::vector::iterator SlicePosition; + + // The three chain slices + std::vector Slices; + + NodeChainAssembly(NodeChain *chainX, NodeChain *chainY, + std::vector::iterator slicePosition, + MergeOrder mergeOrder, const NodeChainBuilder *chainBuilder) + : ChainBuilder(chainBuilder), SplitChain(chainX), UnsplitChain(chainY), + SlicePosition(slicePosition) { + NodeChainSlice x1(chainX, chainX->Nodes.begin(), SlicePosition, + *chainBuilder); + NodeChainSlice x2(chainX, SlicePosition, chainX->Nodes.end(), + *chainBuilder); + NodeChainSlice y(chainY, chainY->Nodes.begin(), chainY->Nodes.end(), + *chainBuilder); + + switch (mergeOrder) { + case MergeOrder::X2X1Y: + Slices = {x2, x1, y}; + break; + case MergeOrder::X1YX2: + Slices = {x1, y, x2}; + break; + case MergeOrder::X2YX1: + Slices = {x2, y, x1}; + break; + case MergeOrder::YX2X1: + Slices = {y, x2, x1}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + } + + double extTSPScore() { + if (ScoreComputed) + return Score; + else { + ScoreComputed = true; + return (Score = computeExtTSPScore()); + } + } + + // Return the gain in ExtTSP score achieved by this NodeChainAssembly once it + // is accordingly applied to the two chains. + double extTSPScoreGain() { + return extTSPScore() - SplitChain->Score - UnsplitChain->Score; + } + + // Find the NodeChainSlice in this NodeChainAssembly which contains the given + // node. If the node is not contained in this NodeChainAssembly, then return + // false. Otherwise, set idx equal to the index of the corresponding slice and + // return true. + bool findSliceIndex(const ELFCfgNode *node, uint8_t &idx) const { + // First find the chain containing the given node. + NodeChain *chain = ChainBuilder->getNodeChain(node); + + // Return false if the found chain is not used in this NodeChainAssembly. + if (SplitChain != chain && UnsplitChain != chain) + return false; + + // Find the slice containing the node using the node's offset in the chain + auto offset = ChainBuilder->getNodeOffset(node); + + for (idx = 0; idx < 3; ++idx) { + if (chain != Slices[idx].Chain) + continue; + if (offset < Slices[idx].BeginOffset) + continue; + if (offset > Slices[idx].EndOffset) + continue; + if (offset < Slices[idx].EndOffset && offset > Slices[idx].BeginOffset) + return true; + // A node can have zero size, which means multiple nodes may be associated + // with the same offset. This means that if the node's offset is at the + // beginning or the end of the slice, the node may reside in either slices + // of the chain. 
+      if (offset == Slices[idx].EndOffset) {
+        // If offset is at the end of the slice, iterate backwards over the
+        // slice to find a zero-sized node.
+        for (auto nodeIt = std::prev(Slices[idx].End);
+             nodeIt != std::prev(Slices[idx].Begin); nodeIt--) {
+          if (*nodeIt == node)
+            return true;
+          // Stop iterating if the node's size is non-zero as this would change
+          // the offset.
+          if ((*nodeIt)->ShSize)
+            break;
+        }
+      }
+      if (offset == Slices[idx].BeginOffset) {
+        // If offset is at the beginning of the slice, iterate forwards over
+        // the slice to find the node.
+        for (auto nodeIt = Slices[idx].Begin; nodeIt != Slices[idx].End;
+             nodeIt++) {
+          if (*nodeIt == node)
+            return true;
+          // Stop iterating if the node's size is non-zero as this would change
+          // the offset.
+          if ((*nodeIt)->ShSize)
+            break;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Total Extended TSP score of this NodeChainAssembly once it is assembled
+  // accordingly.
+  double computeExtTSPScore() const;
+
+  // First node in the resulting assembled chain.
+  const ELFCfgNode *getFirstNode() const {
+    for (auto &slice : Slices)
+      if (slice.Begin != slice.End)
+        return *slice.Begin;
+    return nullptr;
+  }
+
+  friend class NodeChainBuilder;
+
+public:
+  // Declaring the move constructor implicitly deletes the copy constructor,
+  // which makes sure a NodeChainAssembly is moved rather than copied.
+  NodeChainAssembly(NodeChainAssembly &&) = default;
+  NodeChainAssembly() = delete;
+};
+
+} // namespace propeller
+} // namespace lld
+#endif
Index: lld/ELF/PropellerBBReordering.cpp
===================================================================
--- /dev/null
+++ lld/ELF/PropellerBBReordering.cpp
@@ -0,0 +1,627 @@
+//===- PropellerBBReordering.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and includes the implementation of the intra-function basic
+// block reordering algorithm based on the Extended TSP metric
+// (https://arxiv.org/abs/1809.04676).
+//
+// The Extended TSP metric (ExtTSP) provides a score for every ordering of
+// basic blocks in a function, by combining the gains from fall-throughs and
+// short jumps.
+//
+// Given an ordering of the basic blocks of a function f, the ExtTSP score is
+// computed as follows.
+//
+// sum_{edges e in f} frequency(e) * weight(e)
+//
+// where frequency(e) is the execution frequency and weight(e) is computed as
+// follows:
+// * 1 if distance(Src[e], Sink[e]) = 0 (i.e. fallthrough)
+// * 0.1 * (1 - distance(Src[e], Sink[e]) / 1024) if Src[e] < Sink[e] and
+//   0 < distance(Src[e], Sink[e]) < 1024 (i.e. short forward jump)
+// * 0.1 * (1 - distance(Src[e], Sink[e]) / 640) if Src[e] > Sink[e] and
+//   0 < distance(Src[e], Sink[e]) < 640 (i.e. short backward jump)
+// * 0 otherwise
+//
+// In short, it computes a weighted sum of frequencies of all edges in the
+// control flow graph. Each edge gets its weight depending on whether the given
+// ordering makes the edge a fallthrough, a short forward jump, or a short
+// backward jump.
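+//
+// As a small worked illustration (with hypothetical block sizes and edge
+// frequencies, using the default parameters above): consider three blocks A, B
+// and C, each 16 bytes, laid out in the order A, B, C, with profiled
+// intra-function edges A->B (frequency 100) and A->C (frequency 10). A->B
+// becomes a fallthrough (distance 0) and A->C becomes a forward jump of
+// distance 16, so the ExtTSP score of this layout is
+//
+//   100 * 1 + 10 * 0.1 * (1 - 16 / 1024) = 100.984375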
+//
+// Although this problem is NP-hard like the regular TSP, an iterative greedy
+// basic-block-chaining algorithm is used to find a close-to-optimal solution.
+// This algorithm is described as follows.
+//
+// Starting with one basic block sequence (BB chain) for every basic block, the
+// algorithm iteratively joins BB chains together in order to maximize the
+// extended TSP score of all the chains.
+//
+// Initially, it finds all mutually-forced edges in the profiled CFG. These are
+// all the edges which are -- based on the profile -- the only (executed)
+// outgoing edge from their source node and the only (executed) incoming edges
+// to their sink nodes. Next, the source and sink of all mutually-forced edges
+// are attached together as fallthrough edges.
+//
+// Then, at every iteration, the algorithm tries to merge the pair of BB chains
+// which leads to the highest gain in the ExtTSP score. The algorithm extends
+// the search space by considering splitting short (less than 128 bytes in
+// binary size) BB chains into two chains and then merging these two chains
+// with the other chain in four ways. After every merge, the new merge gains
+// are updated. The algorithm repeats joining BB chains until no additional
+// gain can be achieved. Finally, it sorts all the existing chains in
+// decreasing order of their execution density, i.e., the total profiled
+// frequency of the chain divided by its binary size.
+//
+// The values used by this algorithm are reconfigurable using lld's propeller
+// flags. Specifically, these parameters are:
+// * propeller-forward-jump-distance: maximum distance of a forward jump
+//   (default-set to 1024 in the above equation).
+// * propeller-backward-jump-distance: maximum distance of a backward jump
+//   (default-set to 640 in the above equation).
+// * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1)
+// * propeller-forward-jump-weight: weight of a forward jump (default-set to
+//   0.1)
+// * propeller-backward-jump-weight: weight of a backward jump (default-set
+//   to 0.1)
+// * propeller-chain-split-threshold: maximum binary size of a BB chain
+//   which the algorithm will consider for splitting (default-set to 128).
+//===----------------------------------------------------------------------===//
+#include "PropellerBBReordering.h"
+
+#include "Config.h"
+#include "llvm/ADT/DenseSet.h"
+
+#include
+
+#include
+#include
+
+using lld::elf::config;
+
+using llvm::DenseSet;
+using llvm::detail::DenseMapPair;
+
+namespace lld {
+namespace propeller {
+
+// Return the Extended TSP score for one edge, given its source to sink
+// direction and distance in the layout.
+double getEdgeExtTSPScore(const ELFCfgEdge *edge, bool isEdgeForward,
+                          uint32_t srcSinkDistance) {
+  if (edge->Weight == 0)
+    return 0;
+
+  if (srcSinkDistance == 0 && (edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC))
+    return edge->Weight * config->propellerFallthroughWeight;
+
+  if (isEdgeForward && srcSinkDistance < config->propellerForwardJumpDistance)
+    return edge->Weight * config->propellerForwardJumpWeight *
+           (1.0 -
+            ((double)srcSinkDistance) / config->propellerForwardJumpDistance);
+
+  if (!isEdgeForward && srcSinkDistance < config->propellerBackwardJumpDistance)
+    return edge->Weight * config->propellerBackwardJumpWeight *
+           (1.0 -
+            ((double)srcSinkDistance) / config->propellerBackwardJumpDistance);
+  return 0;
+}
+
+// Sort BB chains in decreasing order of their execution density.
+// NodeChainBuilder calls this function at the end to ensure that hot BB chains
+// are placed at the beginning of the function.
+void NodeChainBuilder::sortChainsByExecutionDensity(
+    std::vector<NodeChain *> &chainOrder) {
+  for (auto CI = Chains.begin(), CE = Chains.end(); CI != CE; ++CI) {
+    chainOrder.push_back(CI->second.get());
+  }
+
+  std::sort(
+      chainOrder.begin(), chainOrder.end(),
+      [this](const NodeChain *c1, const NodeChain *c2) {
+        if (c1->getFirstNode() == this->CFG->getEntryNode())
+          return true;
+        if (c2->getFirstNode() == this->CFG->getEntryNode())
+          return false;
+        double c1ExecDensity = c1->execDensity();
+        double c2ExecDensity = c2->execDensity();
+        if (c1ExecDensity == c2ExecDensity) {
+          if (c1->DelegateNode->MappedAddr == c2->DelegateNode->MappedAddr)
+            return c1->DelegateNode->Shndx < c2->DelegateNode->Shndx;
+          return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr;
+        }
+        return c1ExecDensity > c2ExecDensity;
+      });
+}
+
+// NodeChainBuilder calls this function after building all the chains to attach
+// as many fall-throughs as possible. Given that the algorithm has already
+// optimized the extended TSP score, this function will only affect the cold
+// basic blocks and thus we do not need to consider the edge weights.
+void NodeChainBuilder::attachFallThroughs() {
+  // First, try to keep the fall-throughs from the original order.
+  for (auto &node : CFG->Nodes) {
+    if (node->FTEdge != nullptr) {
+      attachNodes(node.get(), node->FTEdge->Sink);
+    }
+  }
+
+  // Sometimes, the original fall-throughs cannot be kept. So we try to find new
+  // fall-through opportunities which did not exist in the original order.
+  for (auto &edge : CFG->IntraEdges) {
+    attachNodes(edge->Src, edge->Sink);
+  }
+}
+
+// Merge two chains in the specified order.
+void NodeChainBuilder::mergeChains(NodeChain *leftChain,
+                                   NodeChain *rightChain) {
+  for (const ELFCfgNode *node : rightChain->Nodes) {
+    leftChain->Nodes.push_back(node);
+    NodeToChainMap[node] = leftChain;
+    NodeOffsetMap[node] += leftChain->Size;
+  }
+  leftChain->Size += rightChain->Size;
+  leftChain->Freq += rightChain->Freq;
+  Chains.erase(rightChain->DelegateNode->Shndx);
+}
+
+// This function tries to place two basic blocks immediately adjacent to each
+// other (used for fallthroughs). Returns true if the basic blocks have been
+// attached this way.
+bool NodeChainBuilder::attachNodes(const ELFCfgNode *src,
+                                   const ELFCfgNode *sink) {
+  // An edge cannot fall through to the entry basic block.
+  if (sink == CFG->getEntryNode())
+    return false;
+
+  // Ignore edges between hot and cold basic blocks.
+  if ((src->Freq == 0) ^ (sink->Freq == 0))
+    return false;
+  NodeChain *srcChain = getNodeChain(src);
+  NodeChain *sinkChain = getNodeChain(sink);
+  // Skip this edge if the source and sink are in the same chain.
+  if (srcChain == sinkChain)
+    return false;
+
+  // It's only possible to form a fall-through between src and sink if they are
+  // respectively located at the end and at the beginning of their chains.
+  if (srcChain->getLastNode() != src || sinkChain->getFirstNode() != sink)
+    return false;
+  // Attaching is possible. So we merge the chains in the corresponding order.
+  mergeChains(srcChain, sinkChain);
+  return true;
+}
+
+// Merge two BB sequences according to the given NodeChainAssembly. A
+// NodeChainAssembly is an ordered triple of three slices from two chains.
+void NodeChainBuilder::mergeChains( + std::unique_ptr assembly) { + // Create the new node order according the given assembly + std::vector newNodeOrder; + for (NodeChainSlice &slice : assembly->Slices) { + std::copy(slice.Begin, slice.End, std::back_inserter(newNodeOrder)); + } + // We merge the nodes into the SplitChain + assembly->SplitChain->Nodes = std::move(newNodeOrder); + + // Update nodeOffset and nodeToChainMap for all the nodes in the sequence. + uint32_t runningOffset = 0; + for (const ELFCfgNode *node : assembly->SplitChain->Nodes) { + NodeToChainMap[node] = assembly->SplitChain; + NodeOffsetMap[node] = runningOffset; + runningOffset += node->ShSize; + } + assembly->SplitChain->Size = runningOffset; + + // Update the total frequency and ExtTSP score of the aggregated chain + assembly->SplitChain->Freq += assembly->UnsplitChain->Freq; + // We have already computed the gain in the assembly record. So we can just + // increment the aggregated chain's score by that gain. + assembly->SplitChain->Score += assembly->extTSPScoreGain(); + + // Merge the assembly candidate chains of the two chains into the candidate + // chains of the remaining NodeChain and remove the records for the defunct + // NodeChain. + for (NodeChain *c : CandidateChains[assembly->UnsplitChain]) { + NodeChainAssemblies.erase(std::make_pair(c, assembly->UnsplitChain)); + NodeChainAssemblies.erase(std::make_pair(assembly->UnsplitChain, c)); + CandidateChains[c].erase(assembly->UnsplitChain); + if (c != assembly->SplitChain) + CandidateChains[assembly->SplitChain].insert(c); + } + + // Update the NodeChainAssembly for all candidate chains of the merged + // NodeChain. Remove a NodeChain from the merge chain's candidates if the + // NodeChainAssembly update finds no gain. + auto &splitChainCandidateChains = CandidateChains[assembly->SplitChain]; + + for (auto CI = splitChainCandidateChains.begin(), + CE = splitChainCandidateChains.end(); + CI != CE;) { + NodeChain *otherChain = *CI; + auto &otherChainCandidateChains = CandidateChains[otherChain]; + bool x = updateNodeChainAssembly(otherChain, assembly->SplitChain); + bool y = updateNodeChainAssembly(assembly->SplitChain, otherChain); + if (x || y) { + otherChainCandidateChains.insert(assembly->SplitChain); + CI++; + } else { + otherChainCandidateChains.erase(assembly->SplitChain); + CI = splitChainCandidateChains.erase(CI); + } + } + + // remove all the candidate chain records for the merged-in chain + CandidateChains.erase(assembly->UnsplitChain); + + // Finally, remove the defunct (merged-in) chain record from the chains + Chains.erase(assembly->UnsplitChain->DelegateNode->Shndx); +} + +// Calculate the Extended TSP metric for a BB chain. +// This function goes over all the BBs in the chain and for BB chain and +// aggregates the score of all edges which are contained in the same chain. +double NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const { + double score = 0; + uint32_t srcOffset = 0; + for (const ELFCfgNode *node : chain->Nodes) { + for (const ELFCfgEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + NodeChain *sinkChain = getNodeChain(edge->Sink); + if (sinkChain != chain) + continue; + auto sinkOffset = getNodeOffset(edge->Sink); + bool edgeForward = srcOffset + node->ShSize <= sinkOffset; + // Calculate the distance between src and sink + uint32_t distance = edgeForward ? 
sinkOffset - srcOffset - node->ShSize
+                                   : srcOffset - sinkOffset + node->ShSize;
+      score += getEdgeExtTSPScore(edge, edgeForward, distance);
+    }
+    srcOffset += node->ShSize;
+  }
+  return score;
+}
+
+// Updates the best NodeChainAssembly between two NodeChains. The existing
+// record will be replaced by the new NodeChainAssembly if a positive gain
+// is achieved. Otherwise, it will be removed.
+// Returns true if a NodeChainAssembly record is inserted (kept), and false
+// otherwise.
+bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain,
+                                               NodeChain *unsplitChain) {
+  // Remove the existing assembly record.
+  NodeChainAssemblies.erase(std::make_pair(splitChain, unsplitChain));
+
+  // Only consider splitting the chain if the size of the chain is smaller than
+  // a threshold.
+  bool doSplit = (splitChain->Size <= config->propellerChainSplitThreshold);
+  auto slicePosEnd =
+      doSplit ? splitChain->Nodes.end() : std::next(splitChain->Nodes.begin());
+
+  SmallVector<unique_ptr<NodeChainAssembly>, 128> candidateNCAs;
+
+  for (auto slicePos = splitChain->Nodes.begin(); slicePos != slicePosEnd;
+       ++slicePos) {
+    // Do not split the mutually-forced edges in the chain.
+    if (slicePos != splitChain->Nodes.begin() &&
+        MutuallyForcedOut[*std::prev(slicePos)] == *slicePos)
+      continue;
+
+    // If the split position is at the beginning (no splitting), only consider
+    // one MergeOrder.
+    auto mergeOrderEnd = (slicePos == splitChain->Nodes.begin())
+                             ? MergeOrder::BeginNext
+                             : MergeOrder::End;
+
+    for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) {
+      MergeOrder mOrder = static_cast<MergeOrder>(MI);
+
+      // Create the NodeChainAssembly representing this particular assembly.
+      auto NCA = unique_ptr<NodeChainAssembly>(new NodeChainAssembly(
+          splitChain, unsplitChain, slicePos, mOrder, this));
+
+      ELFCfgNode *entryNode = CFG->getEntryNode();
+      if ((NCA->SplitChain->getFirstNode() == entryNode ||
+           NCA->UnsplitChain->getFirstNode() == entryNode) &&
+          NCA->getFirstNode() != entryNode)
+        continue;
+      candidateNCAs.push_back(std::move(NCA));
+    }
+  }
+
+  // Insert the best assembly of the two chains (only if it has positive gain).
+  auto bestCandidate = std::max_element(
+      candidateNCAs.begin(), candidateNCAs.end(),
+      [](unique_ptr<NodeChainAssembly> &c1, unique_ptr<NodeChainAssembly> &c2) {
+        return c1->extTSPScoreGain() < c2->extTSPScoreGain();
+      });
+
+  if (bestCandidate != candidateNCAs.end() &&
+      (*bestCandidate)->extTSPScoreGain() > 0) {
+    NodeChainAssemblies.try_emplace(std::make_pair(splitChain, unsplitChain),
+                                    std::move(*bestCandidate));
+    return true;
+  }
+  return false;
+}
+
+void NodeChainBuilder::initNodeChains() {
+  for (auto &node : CFG->Nodes) {
+    NodeChain *chain = new NodeChain(node.get());
+    NodeToChainMap[node.get()] = chain;
+    NodeOffsetMap[node.get()] = 0;
+    Chains.try_emplace(node->Shndx, chain);
+  }
+}
+
+// Find all the mutually-forced edges.
+// These are all the edges which are -- based on the profile -- the only +// (executed) outgoing edge from their source node and the only (executed) +// incoming edges to their sink nodes +void NodeChainBuilder::initMutuallyForcedEdges() { + DenseMap> profiledOuts; + DenseMap> profiledIns; + + for (auto &node : CFG->Nodes) { + std::copy_if(node->Outs.begin(), node->Outs.end(), + std::back_inserter(profiledOuts[node.get()]), + [](const ELFCfgEdge *edge) { + return edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + std::copy_if(node->Ins.begin(), node->Ins.end(), + std::back_inserter(profiledIns[node.get()]), + [](const ELFCfgEdge *edge) { + return edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + } + + for (auto &node : CFG->Nodes) { + if (profiledOuts[node.get()].size() == 1) { + ELFCfgEdge *edge = profiledOuts[node.get()].front(); + if (profiledIns[edge->Sink].size() == 1) + MutuallyForcedOut[node.get()] = edge->Sink; + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + DenseMap nodeToPathMap; + SmallVector cycleCutNodes; + unsigned pathCount = 0; + for (auto it = MutuallyForcedOut.begin(); it != MutuallyForcedOut.end(); + ++it) { + // Check to see if the node (and its cycle) have already been visited. + if (nodeToPathMap[it->first]) + continue; + const ELFCfgEdge *victimEdge = nullptr; + auto nodeIt = it; + pathCount++; + while (nodeIt != MutuallyForcedOut.end()) { + const ELFCfgNode *node = nodeIt->first; + unsigned path = nodeToPathMap[node]; + if (path != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (path == pathCount) { + // We have found a cycle: add the victim edge + cycleCutNodes.push_back(victimEdge->Src); + } + break; + } else + nodeToPathMap[node] = pathCount; + const ELFCfgEdge *edge = profiledOuts[node].front(); + if (!victimEdge || + (edge->Sink->MappedAddr < victimEdge->Sink->MappedAddr)) { + victimEdge = edge; + } + nodeIt = MutuallyForcedOut.find(nodeIt->second); + } + } + + // Remove the victim edges to break cycles in the mutually forced edges + for (const ELFCfgNode *node : cycleCutNodes) + MutuallyForcedOut.erase(node); +} + +// This function initializes the ExtTSP algorithm's data structures. This +// the NodeChainAssemblies and the CandidateChains maps. +void NodeChainBuilder::initializeExtTSP() { + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. 
+ DenseSet> visited; + for (auto &c : Chains) { + NodeChain *chain = c.second.get(); + chain->Score = computeExtTSPScore(chain); + for (const ELFCfgNode *node : chain->Nodes) { + for (const ELFCfgEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + NodeChain *otherChain = getNodeChain(edge->Sink); + if (chain == otherChain) + continue; + if (visited.count(std::make_pair(chain, otherChain))) + continue; + bool x = updateNodeChainAssembly(chain, otherChain); + bool y = updateNodeChainAssembly(otherChain, chain); + if (x || y) { + CandidateChains[chain].insert(otherChain); + CandidateChains[otherChain].insert(chain); + } + visited.insert(std::make_pair(chain, otherChain)); + visited.insert(std::make_pair(otherChain, chain)); + } + } + } +} + +void NodeChainBuilder::computeChainOrder( + std::vector &chainOrder) { + // Attach the mutually-foced edges (which will not be split anymore by the + // Extended TSP algorithm). + for (auto &kv : MutuallyForcedOut) + attachNodes(kv.first, kv.second); + + // Initialize the Extended TSP algorithm's data. + initializeExtTSP(); + + // Keep merging the chain assembly record with the highest ExtTSP gain, until + // no more gain is possible. + bool merged = false; + do { + merged = false; + auto bestCandidate = std::max_element( + NodeChainAssemblies.begin(), NodeChainAssemblies.end(), + [](DenseMapPair, + unique_ptr> &c1, + DenseMapPair, + unique_ptr> &c2) { + return c1.second->extTSPScoreGain() < c2.second->extTSPScoreGain(); + }); + + if (bestCandidate != NodeChainAssemblies.end() && + bestCandidate->second->extTSPScoreGain() > 0) { + unique_ptr bestCandidateNCA = + std::move(bestCandidate->second); + NodeChainAssemblies.erase(bestCandidate); + mergeChains(std::move(bestCandidateNCA)); + merged = true; + } + } while (merged); + + // Merge fallthrough basic blocks if we have missed any + attachFallThroughs(); + + sortChainsByExecutionDensity(chainOrder); +} + +// This function computes the ExtTSP score for a chain assembly record. This +// goes the three BB slices in the assembly record and considers all edges +// whose source and sink belongs to the chains in the assembly record. +double NodeChainBuilder::NodeChainAssembly::computeExtTSPScore() const { + double score = 0; + for (uint8_t srcSliceIdx = 0; srcSliceIdx < 3; ++srcSliceIdx) { + const NodeChainSlice &srcSlice = Slices[srcSliceIdx]; + uint32_t srcNodeOffset = srcSlice.BeginOffset; + for (auto nodeIt = srcSlice.Begin; nodeIt != srcSlice.End; + srcNodeOffset += (*nodeIt)->ShSize, ++nodeIt) { + const ELFCfgNode *node = *nodeIt; + for (const ELFCfgEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + + uint8_t sinkSliceIdx; + + if (findSliceIndex(edge->Sink, sinkSliceIdx)) { + auto sinkNodeOffset = ChainBuilder->getNodeOffset(edge->Sink); + bool edgeForward = (srcSliceIdx < sinkSliceIdx) || + (srcSliceIdx == sinkSliceIdx && + (srcNodeOffset + node->ShSize <= sinkNodeOffset)); + + uint32_t distance = 0; + + if (srcSliceIdx == sinkSliceIdx) { + distance = edgeForward + ? sinkNodeOffset - srcNodeOffset - node->ShSize + : srcNodeOffset - sinkNodeOffset + node->ShSize; + } else { + const NodeChainSlice &sinkSlice = Slices[sinkSliceIdx]; + distance = edgeForward + ? srcSlice.EndOffset - srcNodeOffset - node->ShSize + + sinkNodeOffset - sinkSlice.BeginOffset + : srcNodeOffset - srcSlice.BeginOffset + + node->ShSize + sinkSlice.EndOffset - + sinkNodeOffset; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. 
+ if (std::abs(sinkSliceIdx - srcSliceIdx) == 2) + distance += Slices[1].size(); + } + score += getEdgeExtTSPScore(edge, edgeForward, distance); + } + } + } + } + return score; +} + +void NodeChainBuilder::doSplitOrder(list &symbolList, + list::iterator hotPlaceHolder, + list::iterator coldPlaceHolder) { + std::vector chainOrder; + computeChainOrder(chainOrder); + + DenseMap address; + unsigned currentAddress = 0; + for (const NodeChain *c : chainOrder) { + list::iterator insertPos = + c->Freq ? hotPlaceHolder : coldPlaceHolder; + for (const ELFCfgNode *n : c->Nodes) { + symbolList.insert(insertPos, n->ShName); + if (c->Freq) { + address[n] = currentAddress; + currentAddress += n->ShSize; + } + } + } + + if (config->propellerAlignBasicBlocks) { + + enum VisitStatus { NONE = 0, DURING, FINISHED }; + + DenseMap backEdgeFreq; + DenseMap visited; + + std::function visit; + visit = [&address, &visited, &backEdgeFreq, &visit](const ELFCfgNode *n) { + if (visited[n] != NONE) + return; + if (!n->Freq) + return; + visited[n] = DURING; + if (n->FTEdge) + visit(n->FTEdge->Sink); + for (const ELFCfgEdge *e : n->Outs) { + if (e->Sink->Freq && address[e->Sink] > address[n]) + visit(e->Sink); + } + for (const ELFCfgEdge *e : n->Outs) { + if (e->Sink->Freq && address[e->Sink] <= address[n]) { + if (visited[e->Sink] == DURING) { + backEdgeFreq[e->Sink] += e->Weight; + } + } + } + visited[n] = FINISHED; + }; + + for (const NodeChain *c : chainOrder) + if (c->Freq != 0) { + for (const ELFCfgNode *n : c->Nodes) + visit(n); + } + + for (auto &n : CFG->Nodes) { + if (n.get() == CFG->getEntryNode()) + continue; + if (n->Freq && (n->Freq >= 10 * CFG->getEntryNode()->Freq) && + (backEdgeFreq[n.get()] * 5 >= n->Freq * 4)) { + config->symbolAlignmentFile.insert(std::make_pair(n->ShName, 16)); + } else + config->symbolAlignmentFile.insert(std::make_pair(n->ShName, 1)); + } + } +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/PropellerFuncOrdering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.h @@ -0,0 +1,74 @@ +//===- PropellerFuncReordering.h +//--------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_PROPELLER_FUNC_ORDERING_H +#define LLD_ELF_PROPELLER_FUNC_ORDERING_H + +#include "Propeller.h" + +#include +#include +#include + +namespace lld { +namespace propeller { + +class ELFCfg; + +class CallChainClustering { +public: + class Cluster { + public: + Cluster(ELFCfg *cfg, unsigned); + // All cfgs in this cluster + std::vector CFGs; + // Unique id associated with the cluster + unsigned Id; + // Total binary size of this cluster (only the hot part if using + // split-funcs. + uint64_t Size; + // Total byte-level execution frequency of the cluster + uint64_t Weight; + + // Merge the "other" cluster into this cluster. 
+ Cluster &mergeWith(Cluster &other) { + CFGs.insert(CFGs.end(), other.CFGs.begin(), other.CFGs.end()); + this->Weight += other.Weight; + this->Size += other.Size; + return *this; + } + + // Returns the per-byte execution density of this cluster + double getDensity() { return ((double)Weight) / Size; } + }; + + CallChainClustering() {} + + void init(Propeller &propeller); + + unsigned doOrder(std::list &cfgOrder); + +private: + unsigned ClusterCount = 0; + + ELFCfg *getMostLikelyPredecessor(ELFCfg *cfg, Cluster *cluster); + + void mergeClusters(); + void sortClusters(std::vector &); + + std::vector HotCFGs, ColdCFGs; + std::map CFGToClusterMap; + std::map> Clusters; +}; + +} // namespace propeller +} // namespace lld + +#endif Index: lld/ELF/PropellerFuncOrdering.cpp =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.cpp @@ -0,0 +1,193 @@ +//===- PropellerFuncReordering.cpp +//-------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastcture for doing code layout +// optimization and includes the implementation of function reordering based on +// the CallChainClustering algorithm as published in [1]. +// +// This algorithm keeps merging functions together into clusters until the +// cluster sizes reach a limit. The algorithm iterates over functions in +// decreasing order of their execution density (total frequency divided by size) +// and for each function, it first finds the cluster containing the +// most-frequent caller of that function and then places the caller's cluster +// right before the callee's cluster. Finally, all the remaining clusters are +// ordered in decreasing order of their execution density. +// +// [1] G.Ottoni and B.Maher, Optimizing Function Placement for Large-Scale +// Data-Center Applications, CGO 2017. available at +// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +#include "PropellerFuncOrdering.h" + +#include "Config.h" +#include "PropellerELFCfg.h" + +#include +#include + +using lld::elf::config; + +namespace lld { +namespace propeller { + +const unsigned ClusterMergeSizeThreshold = 1 << 21; + +// Initializes the CallChainClustering algorithm with the cfgs from propeller. +// It separates cfgs into hot and cold cfgs and initially orders each collection +// of cfgs based on the address of their corresponding functions in the original +// binary. +void CallChainClustering::init(Propeller &propeller) { + propeller.forEachCfgRef([this](ELFCfg &cfg) { + if (cfg.isHot()) + this->HotCFGs.push_back(&cfg); + else + this->ColdCFGs.push_back(&cfg); + }); + + auto cfgComparator = [](ELFCfg *cfg1, ELFCfg *cfg2) -> bool { + return cfg1->getEntryNode()->MappedAddr < cfg2->getEntryNode()->MappedAddr; + }; + auto sortCFGs = [&cfgComparator](std::vector &cfgs) { + std::sort(cfgs.begin(), cfgs.end(), cfgComparator); + }; + sortCFGs(HotCFGs); + sortCFGs(ColdCFGs); +} + +// Initialize a cluster containing a single cfg an associates it with a unique +// id. +CallChainClustering::Cluster::Cluster(ELFCfg *cfg, unsigned id) + : CFGs(1, cfg), Id(id) {} + +// Returns the most frequent caller of a function. 
This function also takes the
+// cluster containing this function as its second parameter, to save a lookup
+// into the CFGToClusterMap.
+ELFCfg *CallChainClustering::getMostLikelyPredecessor(ELFCfg *cfg,
+                                                      Cluster *cluster) {
+  ELFCfgNode *entry = cfg->getEntryNode();
+  if (!entry)
+    return nullptr;
+  ELFCfgEdge *bestCallIn = nullptr;
+
+  // Iterate over all callers of the entry basic block of the function.
+  for (ELFCfgEdge *callIn : entry->CallIns) {
+    auto *caller = callIn->Src->Cfg;
+    auto *callerCluster = CFGToClusterMap[caller];
+    assert(caller->isHot());
+    // Ignore callers from the same function, or the same cluster.
+    if (caller == cfg || callerCluster == cluster)
+      continue;
+    // Ignore callers with overly large clusters.
+    if (callerCluster->Size > ClusterMergeSizeThreshold)
+      continue;
+    // Ignore calls which are cold relative to the callee.
+    if (callIn->Weight * 10 < entry->Freq)
+      continue;
+    // Do not merge if the caller cluster's density would drop below 1/8 of its
+    // current value after the merge.
+    if (8 * callerCluster->Size * (cluster->Weight + callerCluster->Weight) <
+        callerCluster->Weight * (cluster->Size + callerCluster->Size))
+      continue;
+    // Update the best CallIn edge if needed.
+    if (!bestCallIn || bestCallIn->Weight < callIn->Weight)
+      bestCallIn = callIn;
+  }
+  return bestCallIn ? bestCallIn->Src->Cfg : nullptr;
+}
+
+// Merge clusters together based on the CallChainClustering algorithm.
+void CallChainClustering::mergeClusters() {
+  // Build a map for the execution density of each cfg. This value will depend
+  // on whether function-splitting is used or not.
+  std::map<ELFCfg *, double> cfgWeightMap;
+  for (ELFCfg *cfg : HotCFGs) {
+    uint64_t cfgWeight = 0;
+    uint64_t cfgSize = 0;
+    cfg->forEachNodeRef([&cfgSize, &cfgWeight](ELFCfgNode &n) {
+      cfgWeight += n.Freq * n.ShSize;
+      if (!config->propellerSplitFuncs || n.Freq)
+        cfgSize += n.ShSize;
+    });
+
+    Cluster *c = new Cluster(cfg, ClusterCount++);
+    c->Weight = cfgWeight;
+    c->Size = std::max(cfgSize, (uint64_t)1);
+    cfgWeightMap.emplace(cfg, c->getDensity());
+    Clusters.emplace(c->Id, c);
+    CFGToClusterMap[cfg] = c;
+  }
+
+  // Sort the hot cfgs in decreasing order of their execution density.
+  std::stable_sort(HotCFGs.begin(), HotCFGs.end(),
+                   [&cfgWeightMap](ELFCfg *cfg1, ELFCfg *cfg2) {
+                     return cfgWeightMap[cfg1] > cfgWeightMap[cfg2];
+                   });
+
+  for (ELFCfg *cfg : HotCFGs) {
+    if (cfgWeightMap[cfg] <= 0.005)
+      break;
+    auto *cluster = CFGToClusterMap[cfg];
+    assert(cluster);
+    // Ignore merging if the cluster containing this function is bigger than
+    // 2MB (the size of a large page).
+    if (cluster->Size > ClusterMergeSizeThreshold)
+      continue;
+
+    ELFCfg *predecessorCfg = getMostLikelyPredecessor(cfg, cluster);
+    if (!predecessorCfg)
+      continue;
+    auto *predecessorCluster = CFGToClusterMap[predecessorCfg];
+
+    assert(predecessorCluster != cluster && predecessorCfg != cfg);
+
+    // Join the two clusters into predecessorCluster.
+    predecessorCluster->mergeWith(*cluster);
+
+    // Update the cfg to cluster mapping, because all cfgs that were
+    // previously in cluster are now in predecessorCluster.
+    for (ELFCfg *cfg : cluster->CFGs) {
+      CFGToClusterMap[cfg] = predecessorCluster;
+    }
+
+    // Delete the defunct cluster.
+    Clusters.erase(cluster->Id);
+  }
+}
+
+// This function sorts all remaining clusters in decreasing order of their
+// execution density.
+void CallChainClustering::sortClusters(std::vector &clusterOrder) { + for (auto &p : Clusters) + clusterOrder.push_back(p.second.get()); + std::sort(clusterOrder.begin(), clusterOrder.end(), + [](Cluster *c1, Cluster *c2) { + // Set a deterministic order when execution densities are equal. + if (c1->getDensity() == c2->getDensity()) + return c1->CFGs.front()->getEntryNode()->MappedAddr < + c2->CFGs.front()->getEntryNode()->MappedAddr; + return c1->getDensity() > c2->getDensity(); + }); +} + +// This function performs CallChainClustering on all cfgs and then orders all +// the built clusters based on their execution density. It places all cold +// functions after hot functions and returns the number of hot functions. +unsigned CallChainClustering::doOrder(std::list &cfgOrder) { + mergeClusters(); + std::vector clusterOrder; + sortClusters(clusterOrder); + for (Cluster *c : clusterOrder) { + cfgOrder.insert(cfgOrder.end(), c->CFGs.begin(), c->CFGs.end()); + } + + cfgOrder.insert(cfgOrder.end(), ColdCFGs.begin(), ColdCFGs.end()); + return HotCFGs.size(); +} + +} // namespace propeller +} // namespace lld Index: lld/test/ELF/propeller/propeller-layout-function-ordering.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-ordering.s @@ -0,0 +1,75 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises function reordering on three functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. +## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. 
+# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/propeller-layout-function-with-loop.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-with-loop.s @@ -0,0 +1,130 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 14 Nfoo" > %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 3 1.2" >> %t_prof.propeller +# RUN: echo "4 5 1.3" >> %t_prof.propeller +# RUN: echo "5 4 1.4" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "4 2 90" >> %t_prof.propeller +# RUN: echo "2 4 95" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 10" >> %t_prof.propeller +# RUN: echo "1 2 5" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 foo: +# REORDER-NEXT: nopl (%rax) + +# REORDER: 0000000000201123 a.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne 9 + +# REORDER: 0000000000201128 aaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne -10 + +# REORDER: 000000000020112d aaaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: retq + +# REORDER: 0000000000201131 aa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jmp -14 + +## Disable basic block reordering and expect that the original bb order is retained. 
+# +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out +# RUN: diff %t.propeller.noreorder.out %t.out + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +foo: + nopl (%rax) + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + jmp aaa.BB.foo +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + jne a.BB.foo + jmp aaaa.BB.foo +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits,unique,4 +aaaa.BB.foo: + nopl (%rax) + ret +.Laaaa.BB.foo_end: + .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo Index: lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s @@ -0,0 +1,87 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises optimal basic block reordering one a single function. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. 
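+##
+## As a rough sketch (using the default ExtTSP weights and jump distances, and
+## the block sizes listed above), the two layouts score approximately:
+##   foo, a.BB.foo, aaa.BB.foo, aa.BB.foo:
+##     100 * 1 + 100 * 1 + 150 * 0.1 * (1 - 5/1024)                 ~= 214.9
+##   foo, aaa.BB.foo, a.BB.foo, aa.BB.foo (naive fallthrough choice):
+##     150 * 1 + 100 * 0.1 * (1 - 4/1024) + 100 * 0.1 * (1 - 9/640) ~= 169.8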
+ +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo