Index: lld/ELF/PropellerBBReordering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.h @@ -0,0 +1,344 @@ +//===- PropellerBBReordering.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "PropellerELFCfg.h" + +#include "lld/Common/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +enum MergeOrder { + Begin, + X2X1Y = Begin, + BeginNext, + X1YX2 = BeginNext, + X2YX1, + YX2X1, + End +}; + +// Represents a chain of nodes (basic blocks). +class NodeChain { +public: + // Representative node of the chain, with which it is initially constructed. + const CFGNode *DelegateNode; + std::vector Nodes; + + // Total binary size of the chain + uint32_t Size; + + // Total execution frequency of the chain + uint64_t Freq; + + // Extended TSP score of the chain + double Score; + + // Constructor for building a NodeChain from a single Node + NodeChain(const CFGNode *Node) + : DelegateNode(Node), Nodes(1, Node), Size(Node->ShSize), + Freq(Node->Freq) {} + + double execDensity() const { + return ((double)Freq) / std::max(Size, (uint32_t)1); + } +}; + +// BB Chain builder based on the ExtTSP metric +class NodeChainBuilder { +private: + class NodeChainAssembly; + class NodeChainSlice; + + // Cfg representing a single function. + const ControlFlowGraph *CFG; + + // Set of built chains, keyed by section index of their Delegate Nodes. + DenseMap> Chains; + + // Map from every node to its containing chain. 
+ // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeToChainMap; + + // Map from every node to its (binary) offset in its containing chain. + // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeOffsetMap; + + // These represent all the edges which are -- based on the profile -- the only + // (executed) outgoing edges from their source node and the only (executed) + // incoming edges to their sink nodes. The algorithm will make sure that these + // edges form fall-throughs in the final order. + DenseMap MutuallyForcedOut; + + // This maps every (ordered) pair of chains (with the first chain in the pair + // potentially splittable) to the highest-gain NodeChainAssembly for those + // chains. + DenseMap, + std::unique_ptr> + NodeChainAssemblies; + + // This map stores the candidate chains for each chain. + // + // For every chain, its candidate chains are the chains which can increase the + // overall ExtTSP score when merged with that chain. This is used to update + // the NodeChainAssemblies map whenever chains merge together. The candidate + // chains of a chain may also be updated as result of a merge. + std::unordered_map> + CandidateChains; + + void sortChainsByExecutionDensity(std::vector &ChainOrder); + + void initializeExtTSP(); + void attachFallThroughs(); + + // This function tries to place two nodes immediately adjacent to + // each other (used for fallthroughs). + // Returns true if this can be done. 
+ bool attachNodes(const CFGNode *src, const CFGNode *sink); + + void mergeChainEdges(NodeChain *splitChain, NodeChain *unSplitChain); + + void mergeChains(NodeChain *leftChain, NodeChain *rightChain); + void mergeChains(std::unique_ptr assembly); + + // Recompute the ExtTSP score of a chain + double computeExtTSPScore(NodeChain *chain) const; + + // Update the related NodeChainAssembly records for two chains, with the + // assumption that UnsplitChain has been merged into SplitChain. + bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain); + + void computeChainOrder(std::vector &chainOrder); + + // Initialize the mutuallyForcedOut map + void initMutuallyForcedEdges(); + + // Initialize basic block chains, with one chain for every node + void initNodeChains(); + + uint32_t getNodeOffset(const CFGNode *node) const { + auto it = NodeOffsetMap.find(node); + assert("Node does not exist in the offset map." && + it != NodeOffsetMap.end()); + return it->second; + } + + NodeChain *getNodeChain(const CFGNode *node) const { + auto it = NodeToChainMap.find(node); + assert("Node does not exist in the chain map." && + it != NodeToChainMap.end()); + return it->second; + } + +public: + NodeChainBuilder(const ControlFlowGraph *_CFG) : CFG(_CFG) { + initNodeChains(); + initMutuallyForcedEdges(); + } + + // This invokes the Extended TSP algorithm, orders the hot and cold basic + // blocks and inserts their associated symbols at the corresponding locations + // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the + // given SymbolList. 
+ void doSplitOrder(std::list &SymbolList, + std::list::iterator HotPlaceHolder, + std::list::iterator ColdPlaceHolder); +}; + +class NodeChainBuilder::NodeChainSlice { +private: + // Chain from which this slice comes from + NodeChain *Chain; + + // The endpoints of the slice in the corresponding chain + std::vector::iterator Begin, End; + + // The offsets corresponding to the two endpoints + uint32_t BeginOffset, EndOffset; + + // Constructor for building a chain slice from a given chain and the two + // endpoints of the chain. + NodeChainSlice(NodeChain *c, std::vector::iterator begin, + std::vector::iterator end, + const NodeChainBuilder &chainBuilder) + : Chain(c), Begin(begin), End(end) { + + BeginOffset = chainBuilder.getNodeOffset(*begin); + if (End == Chain->Nodes.end()) + EndOffset = Chain->Size; + else + EndOffset = chainBuilder.getNodeOffset(*end); + } + + // (Binary) size of this slice + uint32_t size() const { return EndOffset - BeginOffset; } + + friend class NodeChainBuilder; + friend class NodeChainAssembly; +}; + +class NodeChainBuilder::NodeChainAssembly { +private: + // Total ExtTSP score of this NodeChainAssembly + double Score = 0; + + // Whether the ExtTSP score has been computed for this NodeChainAssembly + bool ScoreComputed = false; + + // Corresponding NodeChainBuilder of the assembled chains + const NodeChainBuilder *ChainBuilder; + + // The two assembled chains + NodeChain *SplitChain, *UnsplitChain; + + // The slice position of Splitchain + std::vector::iterator SlicePosition; + + // The three chain slices + std::vector Slices; + + NodeChainAssembly(NodeChain *chainX, NodeChain *chainY, + std::vector::iterator slicePosition, + MergeOrder mergeOrder, const NodeChainBuilder *chainBuilder) + : ChainBuilder(chainBuilder), SplitChain(chainX), UnsplitChain(chainY), + SlicePosition(slicePosition) { + NodeChainSlice x1(chainX, chainX->Nodes.begin(), SlicePosition, + *chainBuilder); + NodeChainSlice x2(chainX, SlicePosition, chainX->Nodes.end(), 
+ *chainBuilder); + NodeChainSlice y(chainY, chainY->Nodes.begin(), chainY->Nodes.end(), + *chainBuilder); + + switch (mergeOrder) { + case MergeOrder::X2X1Y: + Slices = {x2, x1, y}; + break; + case MergeOrder::X1YX2: + Slices = {x1, y, x2}; + break; + case MergeOrder::X2YX1: + Slices = {x2, y, x1}; + break; + case MergeOrder::YX2X1: + Slices = {y, x2, x1}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + } + + double extTSPScore() { + if (ScoreComputed) + return Score; + else { + ScoreComputed = true; + return (Score = computeExtTSPScore()); + } + } + + // Return the gain in ExtTSP score achieved by this NodeChainAssembly once it + // is accordingly applied to the two chains. + double extTSPScoreGain() { + return extTSPScore() - SplitChain->Score - UnsplitChain->Score; + } + + // Find the NodeChainSlice in this NodeChainAssembly which contains the given + // node. If the node is not contained in this NodeChainAssembly, then return + // false. Otherwise, set idx equal to the index of the corresponding slice and + // return true. + bool findSliceIndex(const CFGNode *node, uint8_t &idx) const { + // First find the chain containing the given node. + NodeChain *chain = ChainBuilder->getNodeChain(node); + + // Return false if the found chain is not used in this NodeChainAssembly. + if (SplitChain != chain && UnsplitChain != chain) + return false; + + // Find the slice containing the node using the node's offset in the chain + auto offset = ChainBuilder->getNodeOffset(node); + + for (idx = 0; idx < 3; ++idx) { + if (chain != Slices[idx].Chain) + continue; + if (offset < Slices[idx].BeginOffset) + continue; + if (offset > Slices[idx].EndOffset) + continue; + if (offset < Slices[idx].EndOffset && offset > Slices[idx].BeginOffset) + return true; + // A node can have zero size, which means multiple nodes may be associated + // with the same offset. 
This means that if the node's offset is at the + // beginning or the end of the slice, the node may reside in either slices + // of the chain. + if (offset == Slices[idx].EndOffset) { + // If offset is at the end of the slice, iterate backwards over the + // slice to find a zero-sized node. + for (auto nodeIt = std::prev(Slices[idx].End); + nodeIt != std::prev(Slices[idx].Begin); nodeIt--) { + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if (!(*nodeIt)->ShSize) + break; + if (*nodeIt == node) + return true; + } + } + if (offset == Slices[idx].BeginOffset) { + // If offset is at the beginning of the slice, iterate forwards over the + // slice to find the node. + for (auto nodeIt = Slices[idx].Begin; nodeIt != Slices[idx].End; + nodeIt++) { + if (*nodeIt == node) + return true; + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if ((*nodeIt)->ShSize) + break; + } + } + } + return false; + } + + // Total Extended TSP score of this NodeChainAssembly once it is assembled + // accordingly. + double computeExtTSPScore() const; + + // First node in the resulting assembled chain. + const CFGNode *getFirstNode() const { + for (auto &slice : Slices) + if (slice.Begin != slice.End) + return *slice.Begin; + return nullptr; + } + + friend class NodeChainBuilder; + +public: + // We delete the copy constructor to make sure NodeChainAssembly is moved + // rather than copied. 
+ NodeChainAssembly(NodeChainAssembly &&) = default; + // copy constructor is implicitly deleted + // NodeChainAssembly(const NodeChainAssembly&) = delete; + NodeChainAssembly() = delete; +}; + +} // namespace propeller +} // namespace lld +#endif Index: lld/ELF/PropellerBBReordering.cpp =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.cpp @@ -0,0 +1,584 @@ +//===- PropellerBBReordering.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of intra-function basic block +// reordering algorithm based on the Extended TSP metric as described in [1]. +// +// The Extend TSP metric (ExtTSP) provides a score for every ordering of basic +// blocks in a function, by combining the gains from fall-throughs and short +// jumps. +// +// Given an ordering of the basic blocks, for a function f, the ExtTSP score is +// computed as follows. +// +// sum_{edges e in f} frequency(e) * weight(e) +// +// where frequency(e) is the execution frequency and weight(e) is computed as +// follows: +// * 1 if distance(Src[e], Sink[e]) = 0 (i.e. fallthrough) +// +// * 0.1 * (1 - distance(Src[e], Sink[e]) / 1024) if Src[e] < Sink[e] and 0 < +// distance(Src[e], Sink[e]) < 1024 (i.e. short forward jump) +// +// * 0.1 * (1 - distance(Src[e], Sink[e]) / 640) if Src[e] > Sink[e] and 0 < +// distance(Src[e], Sink[e]) < 640 (i.e. short backward jump) +// +// * 0 otherwise +// +// +// In short, it computes a weighted sum of frequencies of all edges in the +// control flow graph. 
Each edge gets its weight depending on whether the given
+// ordering makes the edge a fallthrough, a short forward jump, or a short
+// backward jump.
+//
+// Although this problem is NP-hard like the regular TSP, an iterative greedy
+// basic-block-chaining algorithm is used to find a close to optimal solution.
+// This algorithm is described as follows.
+//
+// Starting with one basic block sequence (BB chain) for every basic block, the
+// algorithm iteratively joins BB chains together in order to maximize the
+// extended TSP score of all the chains.
+//
+// Initially, it finds all mutually-forced edges in the profiled CFG. These are
+// all the edges which are -- based on the profile -- the only (executed)
+// outgoing edge from their source node and the only (executed) incoming edges
+// to their sink nodes. Next, the source and sink of all mutually-forced edges
+// are attached together as fallthrough edges.
+//
+// Then, at every iteration, the algorithm tries to merge a pair of BB chains
+// which leads to the highest gain in the ExtTSP score. The algorithm extends
+// the search space by considering splitting short (less than 128 bytes in
+// binary size) BB chains into two chains and then merging these two chains with
+// the other chain in four ways. After every merge, the new merge gains are
+// updated. The algorithm repeats joining BB chains until no additional gain can
+// be achieved. Eventually, it sorts all the existing chains in decreasing order
+// of their execution density, i.e., the total profiled frequency of the chain
+// divided by its binary size.
+//
+// The values used by this algorithm are reconfigurable using lld's propeller
+// flags. Specifically, these parameters are:
+//
+// * propeller-forward-jump-distance: maximum distance of a forward jump
+// (default-set to 1024 in the above equation).
+//
+// * propeller-backward-jump-distance: maximum distance of a backward jump
+// (default-set to 640 in the above equation).
+// +// * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1) +// +// * propeller-forward-jump-weight: weight of a forward jump (default-set to +// 0.1) +// +// * propeller-backward-jump-weight: weight of a backward jump (default-set to +// 0.1) +// +// * propeller-chain-split-threshold: maximum binary size of a BB chain which +// the algorithm will consider for splitting (default-set to 128). +// +// References: +// * [1] A. Newell and S. Pupyrev, Improved Basic Block Reordering, available +// at https://arxiv.org/abs/1809.04676 +//===----------------------------------------------------------------------===// +#include "PropellerBBReordering.h" + +#include "Config.h" +#include "llvm/ADT/DenseSet.h" + +#include +#include + +using lld::elf::config; + +using llvm::DenseSet; +using llvm::detail::DenseMapPair; + +namespace lld { +namespace propeller { + +// Return the Extended TSP score for one edge, given its source to sink +// direction and distance in the layout. +double getEdgeExtTSPScore(const CFGEdge *edge, bool isEdgeForward, + uint32_t srcSinkDistance) { + if (edge->Weight == 0) + return 0; + + if (srcSinkDistance == 0 && (edge->Type == CFGEdge::EdgeType::INTRA_FUNC)) + return edge->Weight * config->propellerFallthroughWeight; + + if (isEdgeForward && srcSinkDistance < config->propellerForwardJumpDistance) + return edge->Weight * config->propellerForwardJumpWeight * + (1.0 - + ((double)srcSinkDistance) / config->propellerForwardJumpDistance); + + if (!isEdgeForward && srcSinkDistance < config->propellerBackwardJumpDistance) + return edge->Weight * config->propellerBackwardJumpWeight * + (1.0 - + ((double)srcSinkDistance) / config->propellerBackwardJumpDistance); + return 0; +} + +// Sort BB chains in decreasing order of their execution density. +// NodeChainBuilder calls this function at the end to ensure that hot BB chains +// are placed at the beginning of the function. 
+// NOTE(review): template arguments appear to have been stripped from this
+// patch text (e.g. "std::vector &chainOrder" and "DenseMapPair>") -- verify
+// the parameter and loop types against the original change before applying.
+void NodeChainBuilder::sortChainsByExecutionDensity(
+    std::vector &chainOrder) {
+  // Collect every live chain; Chains is keyed by the delegate node's section
+  // index and owns the NodeChain objects.
+  for (DenseMapPair> &elem : Chains)
+    chainOrder.push_back(elem.second.get());
+
+  std::sort(
+      chainOrder.begin(), chainOrder.end(),
+      [this](const NodeChain *c1, const NodeChain *c2) {
+        // The chain beginning with the function's entry block always sorts
+        // first, regardless of its execution density.
+        if (c1->Nodes.front() == this->CFG->getEntryNode())
+          return true;
+        if (c2->Nodes.front() == this->CFG->getEntryNode())
+          return false;
+        double c1ExecDensity = c1->execDensity();
+        double c2ExecDensity = c2->execDensity();
+        // Tie-break equal densities on the delegate node's original mapped
+        // address (then its section index) so the order is deterministic.
+        if (c1ExecDensity == c2ExecDensity) {
+          if (c1->DelegateNode->MappedAddr == c2->DelegateNode->MappedAddr)
+            return c1->DelegateNode->Shndx < c2->DelegateNode->Shndx;
+          return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr;
+        }
+        return c1ExecDensity > c2ExecDensity;
+      });
+}
+
+// NodeChainBuilder calls this function after building all the chains to attach
+// as many fall-throughs as possible. Given that the algorithm already optimizes
+// the extended TSP score, this function will only affect the cold basic blocks
+// and thus we do not need to consider the edge weights.
+void NodeChainBuilder::attachFallThroughs() {
+  // First, try to keep the fall-throughs from the original order.
+  for (auto &node : CFG->Nodes) {
+    if (node->FTEdge != nullptr) {
+      attachNodes(node.get(), node->FTEdge->Sink);
+    }
+  }
+
+  // Sometimes, the original fall-throughs cannot be kept. So we try to find new
+  // fall-through opportunities which did not exist in the original order.
+  for (auto &edge : CFG->IntraEdges) {
+    attachNodes(edge->Src, edge->Sink);
+  }
+}
+
+// Merge two chains in the specified order.
+void NodeChainBuilder::mergeChains(NodeChain *leftChain,
+                                   NodeChain *rightChain) {
+  for (const CFGNode *node : rightChain->Nodes) {
+    leftChain->Nodes.push_back(node);
+    NodeToChainMap[node] = leftChain;
+    // rightChain's nodes are laid out after leftChain's current contents, so
+    // their offsets shift up by leftChain's pre-merge size (leftChain->Size is
+    // only updated after this loop).
+    NodeOffsetMap[node] += leftChain->Size;
+  }
+  leftChain->Size += rightChain->Size;
+  leftChain->Freq += rightChain->Freq;
+  // Drop the defunct (merged-in) chain; Chains owns it, keyed by the section
+  // index of its delegate node.
+  Chains.erase(rightChain->DelegateNode->Shndx);
+}
+
+// This function tries to place two basic blocks immediately adjacent to each
+// other (used for fallthroughs). Returns true if the basic blocks have been
+// attached this way.
+bool NodeChainBuilder::attachNodes(const CFGNode *src, const CFGNode *sink) {
+  // No edge can fall through to the entry basic block.
+  if (sink == CFG->getEntryNode())
+    return false;
+
+  // Ignore edges between hot and cold basic blocks, i.e. where exactly one of
+  // the two endpoints has zero frequency. (Written as an explicit boolean
+  // inequality instead of "src->Freq == 0 ^ sink->Freq == 0", which relies on
+  // subtle operator precedence and triggers -Wparentheses.)
+  if ((src->Freq == 0) != (sink->Freq == 0))
+    return false;
+  NodeChain *srcChain = getNodeChain(src);
+  NodeChain *sinkChain = getNodeChain(sink);
+  // Skip this edge if the source and sink are in the same chain.
+  if (srcChain == sinkChain)
+    return false;
+
+  // It's possible to form a fall-through between src and sink only if
+  // they are respectively located at the end and beginning of their chains.
+  if (srcChain->Nodes.back() != src || sinkChain->Nodes.front() != sink)
+    return false;
+  // Attaching is possible. So we merge the chains in the corresponding order.
+  mergeChains(srcChain, sinkChain);
+  return true;
+}
+
+// Merge two BB sequences according to the given NodeChainAssembly. A
+// NodeChainAssembly is an ordered triple of three slices from two chains.
+void NodeChainBuilder::mergeChains( + std::unique_ptr assembly) { + // Create the new node order according the given assembly + std::vector newNodeOrder; + for (NodeChainSlice &slice : assembly->Slices) { + std::copy(slice.Begin, slice.End, std::back_inserter(newNodeOrder)); + } + // We merge the nodes into the SplitChain + assembly->SplitChain->Nodes = std::move(newNodeOrder); + + // Update nodeOffsetMap and nodeToChainMap for all the nodes in the sequence. + uint32_t runningOffset = 0; + for (const CFGNode *node : assembly->SplitChain->Nodes) { + NodeToChainMap[node] = assembly->SplitChain; + NodeOffsetMap[node] = runningOffset; + runningOffset += node->ShSize; + } + assembly->SplitChain->Size = runningOffset; + + // Update the total frequency and ExtTSP score of the aggregated chain + assembly->SplitChain->Freq += assembly->UnsplitChain->Freq; + // We have already computed the gain in the assembly record. So we can just + // increment the aggregated chain's score by that gain. + assembly->SplitChain->Score += assembly->extTSPScoreGain(); + + // Merge the assembly candidate chains of the two chains into the candidate + // chains of the remaining NodeChain and remove the records for the defunct + // NodeChain. + for (NodeChain *c : CandidateChains[assembly->UnsplitChain]) { + NodeChainAssemblies.erase(std::make_pair(c, assembly->UnsplitChain)); + NodeChainAssemblies.erase(std::make_pair(assembly->UnsplitChain, c)); + CandidateChains[c].erase(assembly->UnsplitChain); + if (c != assembly->SplitChain) + CandidateChains[assembly->SplitChain].insert(c); + } + + // Update the NodeChainAssembly for all candidate chains of the merged + // NodeChain. Remove a NodeChain from the merge chain's candidates if the + // NodeChainAssembly update finds no gain. 
+  auto &splitChainCandidateChains = CandidateChains[assembly->SplitChain];
+
+  for (auto CI = splitChainCandidateChains.begin(),
+            CE = splitChainCandidateChains.end();
+       CI != CE;) {
+    NodeChain *otherChain = *CI;
+    auto &otherChainCandidateChains = CandidateChains[otherChain];
+    // Recompute the best assembly in both directions; keep the candidacy only
+    // if at least one direction still yields a positive gain.
+    bool x = updateNodeChainAssembly(otherChain, assembly->SplitChain);
+    bool y = updateNodeChainAssembly(assembly->SplitChain, otherChain);
+    if (x || y) {
+      otherChainCandidateChains.insert(assembly->SplitChain);
+      CI++;
+    } else {
+      otherChainCandidateChains.erase(assembly->SplitChain);
+      // erase returns the iterator following the removed element, so the loop
+      // advances safely while mutating the set.
+      CI = splitChainCandidateChains.erase(CI);
+    }
+  }
+
+  // remove all the candidate chain records for the merged-in chain
+  CandidateChains.erase(assembly->UnsplitChain);
+
+  // Finally, remove the defunct (merged-in) chain record from the chains
+  Chains.erase(assembly->UnsplitChain->DelegateNode->Shndx);
+}
+
+// Calculate the Extended TSP metric for a BB chain.
+// This function goes over all the BBs in the chain and aggregates the score
+// of all edges whose source and sink are both contained in this chain.
+double NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const {
+  double score = 0;
+  uint32_t srcOffset = 0;
+  for (const CFGNode *node : chain->Nodes) {
+    for (const CFGEdge *edge : node->Outs) {
+      if (!edge->Weight)
+        continue;
+      // Only intra-chain edges contribute to a chain's own score.
+      NodeChain *sinkChain = getNodeChain(edge->Sink);
+      if (sinkChain != chain)
+        continue;
+      auto sinkOffset = getNodeOffset(edge->Sink);
+      // The jump originates at the end of the src node, i.e. at
+      // srcOffset + node->ShSize.
+      bool edgeForward = srcOffset + node->ShSize <= sinkOffset;
+      // Calculate the distance between src and sink
+      uint32_t distance = edgeForward ? sinkOffset - srcOffset - node->ShSize
+                                      : srcOffset - sinkOffset + node->ShSize;
+      score += getEdgeExtTSPScore(edge, edgeForward, distance);
+    }
+    srcOffset += node->ShSize;
+  }
+  return score;
+}
+
+// Updates the best NodeChainAssembly between two NodeChains. The existing
+// record will be replaced by the new NodeChainAssembly if a non-zero gain
+// is achieved.
Otherwise, it will be removed. +// If a nodechain assembly record is (kept) inserted, returns true. Otherwise +// returns false. +bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain, + NodeChain *unsplitChain) { + // Remove the assembly record + NodeChainAssemblies.erase(std::make_pair(splitChain, unsplitChain)); + + // Only consider splitting the chain if the size of the chain is smaller than + // a threshold. + bool doSplit = (splitChain->Size <= config->propellerChainSplitThreshold); + auto slicePosEnd = + doSplit ? splitChain->Nodes.end() : std::next(splitChain->Nodes.begin()); + + SmallVector, 128> candidateNCAs; + + for (auto slicePos = splitChain->Nodes.begin(); slicePos != slicePosEnd; + ++slicePos) { + // Do not split the mutually-forced edges in the chain. + if (slicePos != splitChain->Nodes.begin() && + MutuallyForcedOut[*std::prev(slicePos)] == *slicePos) + continue; + + // If the split position is at the beginning (no splitting), only consider + // one MergeOrder + auto mergeOrderEnd = (slicePos == splitChain->Nodes.begin()) + ? MergeOrder::BeginNext + : MergeOrder::End; + + for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) { + MergeOrder mOrder = static_cast(MI); + + // Create the NodeChainAssembly representing this particular assembly. + auto NCA = std::unique_ptr(new NodeChainAssembly( + splitChain, unsplitChain, slicePos, mOrder, this)); + + CFGNode *entryNode = CFG->getEntryNode(); + if ((NCA->SplitChain->Nodes.front() == entryNode || + NCA->UnsplitChain->Nodes.back() == entryNode) && + NCA->getFirstNode() != entryNode) + continue; + candidateNCAs.push_back(std::move(NCA)); + } + } + + // Insert the best assembly of the two chains (only if it has positive gain). 
+ auto bestCandidate = + std::max_element(candidateNCAs.begin(), candidateNCAs.end(), + [](std::unique_ptr &c1, + std::unique_ptr &c2) { + return c1->extTSPScoreGain() < c2->extTSPScoreGain(); + }); + + if (bestCandidate != candidateNCAs.end() && + (*bestCandidate)->extTSPScoreGain() > 0) { + NodeChainAssemblies.try_emplace(std::make_pair(splitChain, unsplitChain), + std::move(*bestCandidate)); + return true; + } else + return false; +} + +void NodeChainBuilder::initNodeChains() { + for (auto &node : CFG->Nodes) { + NodeChain *chain = new NodeChain(node.get()); + NodeToChainMap[node.get()] = chain; + NodeOffsetMap[node.get()] = 0; + Chains.try_emplace(node->Shndx, chain); + } +} + +// Find all the mutually-forced edges. +// These are all the edges which are -- based on the profile -- the only +// (executed) outgoing edge from their source node and the only (executed) +// incoming edges to their sink nodes +void NodeChainBuilder::initMutuallyForcedEdges() { + DenseMap> profiledOuts; + DenseMap> profiledIns; + + for (auto &node : CFG->Nodes) { + std::copy_if(node->Outs.begin(), node->Outs.end(), + std::back_inserter(profiledOuts[node.get()]), + [](const CFGEdge *edge) { + return edge->Type == CFGEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + std::copy_if(node->Ins.begin(), node->Ins.end(), + std::back_inserter(profiledIns[node.get()]), + [](const CFGEdge *edge) { + return edge->Type == CFGEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + } + + for (auto &node : CFG->Nodes) { + if (profiledOuts[node.get()].size() == 1) { + CFGEdge *edge = profiledOuts[node.get()].front(); + if (profiledIns[edge->Sink].size() == 1) + MutuallyForcedOut[node.get()] = edge->Sink; + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + DenseMap nodeToPathMap; + SmallVector cycleCutNodes; + unsigned pathCount = 0; + for (auto it = MutuallyForcedOut.begin(); it != 
MutuallyForcedOut.end(); + ++it) { + // Check to see if the node (and its cycle) have already been visited. + if (nodeToPathMap[it->first]) + continue; + const CFGEdge *victimEdge = nullptr; + auto nodeIt = it; + pathCount++; + while (nodeIt != MutuallyForcedOut.end()) { + const CFGNode *node = nodeIt->first; + unsigned path = nodeToPathMap[node]; + if (path != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (path == pathCount) { + // We have found a cycle: add the victim edge + cycleCutNodes.push_back(victimEdge->Src); + } + break; + } else + nodeToPathMap[node] = pathCount; + const CFGEdge *edge = profiledOuts[node].front(); + if (!victimEdge || + (edge->Sink->MappedAddr < victimEdge->Sink->MappedAddr)) { + victimEdge = edge; + } + nodeIt = MutuallyForcedOut.find(nodeIt->second); + } + } + + // Remove the victim edges to break cycles in the mutually forced edges + for (const CFGNode *node : cycleCutNodes) + MutuallyForcedOut.erase(node); +} + +// This function initializes the ExtTSP algorithm's data structures. This +// the NodeChainAssemblies and the CandidateChains maps. +void NodeChainBuilder::initializeExtTSP() { + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. 
+ DenseSet> visited; + for (DenseMapPair> &elem : Chains) { + NodeChain *chain = elem.second.get(); + chain->Score = computeExtTSPScore(chain); + for (const CFGNode *node : chain->Nodes) { + for (const CFGEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + NodeChain *otherChain = getNodeChain(edge->Sink); + if (chain == otherChain) + continue; + if (visited.count(std::make_pair(chain, otherChain))) + continue; + bool x = updateNodeChainAssembly(chain, otherChain); + bool y = updateNodeChainAssembly(otherChain, chain); + if (x || y) { + CandidateChains[chain].insert(otherChain); + CandidateChains[otherChain].insert(chain); + } + visited.insert(std::make_pair(chain, otherChain)); + visited.insert(std::make_pair(otherChain, chain)); + } + } + } +} + +void NodeChainBuilder::computeChainOrder( + std::vector &chainOrder) { + // Attach the mutually-foced edges (which will not be split anymore by the + // Extended TSP algorithm). + for (auto &kv : MutuallyForcedOut) + attachNodes(kv.first, kv.second); + + // Initialize the Extended TSP algorithm's data. + initializeExtTSP(); + + // Keep merging the chain assembly record with the highest ExtTSP gain, until + // no more gain is possible. 
+ bool merged = false; + do { + merged = false; + auto bestCandidate = std::max_element( + NodeChainAssemblies.begin(), NodeChainAssemblies.end(), + [](DenseMapPair, + std::unique_ptr> &c1, + DenseMapPair, + std::unique_ptr> &c2) { + return c1.second->extTSPScoreGain() < c2.second->extTSPScoreGain(); + }); + + if (bestCandidate != NodeChainAssemblies.end() && + bestCandidate->second->extTSPScoreGain() > 0) { + std::unique_ptr bestCandidateNCA = + std::move(bestCandidate->second); + NodeChainAssemblies.erase(bestCandidate); + mergeChains(std::move(bestCandidateNCA)); + merged = true; + } + } while (merged); + + // Merge fallthrough basic blocks if we have missed any + attachFallThroughs(); + + sortChainsByExecutionDensity(chainOrder); +} + +// This function computes the ExtTSP score for a chain assembly record. This +// goes the three BB slices in the assembly record and considers all edges +// whose source and sink belongs to the chains in the assembly record. +double NodeChainBuilder::NodeChainAssembly::computeExtTSPScore() const { + double score = 0; + for (uint8_t srcSliceIdx = 0; srcSliceIdx < 3; ++srcSliceIdx) { + const NodeChainSlice &srcSlice = Slices[srcSliceIdx]; + uint32_t srcNodeOffset = srcSlice.BeginOffset; + for (auto nodeIt = srcSlice.Begin; nodeIt != srcSlice.End; + srcNodeOffset += (*nodeIt)->ShSize, ++nodeIt) { + const CFGNode *node = *nodeIt; + for (const CFGEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + + uint8_t sinkSliceIdx; + + if (findSliceIndex(edge->Sink, sinkSliceIdx)) { + auto sinkNodeOffset = ChainBuilder->getNodeOffset(edge->Sink); + bool edgeForward = (srcSliceIdx < sinkSliceIdx) || + (srcSliceIdx == sinkSliceIdx && + (srcNodeOffset + node->ShSize <= sinkNodeOffset)); + + uint32_t distance = 0; + + if (srcSliceIdx == sinkSliceIdx) { + distance = edgeForward + ? 
sinkNodeOffset - srcNodeOffset - node->ShSize + : srcNodeOffset - sinkNodeOffset + node->ShSize; + } else { + const NodeChainSlice &sinkSlice = Slices[sinkSliceIdx]; + distance = edgeForward + ? srcSlice.EndOffset - srcNodeOffset - node->ShSize + + sinkNodeOffset - sinkSlice.BeginOffset + : srcNodeOffset - srcSlice.BeginOffset + + node->ShSize + sinkSlice.EndOffset - + sinkNodeOffset; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. + if (std::abs(sinkSliceIdx - srcSliceIdx) == 2) + distance += Slices[1].size(); + } + score += getEdgeExtTSPScore(edge, edgeForward, distance); + } + } + } + } + return score; +} + +void NodeChainBuilder::doSplitOrder( + std::list &symbolList, + std::list::iterator hotPlaceHolder, + std::list::iterator coldPlaceHolder) { + std::vector chainOrder; + computeChainOrder(chainOrder); + + for (const NodeChain *c : chainOrder) { + std::list::iterator insertPos = + c->Freq ? hotPlaceHolder : coldPlaceHolder; + for (const CFGNode *n : c->Nodes) + symbolList.insert(insertPos, n->ShName); + } +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/PropellerFuncOrdering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.h @@ -0,0 +1,72 @@ +//===- PropellerFuncReordering.h ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_FUNC_ORDERING_H
+#define LLD_ELF_PROPELLER_FUNC_ORDERING_H
+
+#include "Propeller.h"
+
+#include <list>
+#include <map>
+#include <vector>
+
+namespace lld {
+namespace propeller {
+
+class ControlFlowGraph;
+
+class CallChainClustering {
+public:
+  class Cluster {
+  public:
+    Cluster(ControlFlowGraph *cfg, unsigned);
+    // All cfgs in this cluster
+    std::vector<ControlFlowGraph *> CFGs;
+    // Unique id associated with the cluster
+    unsigned Id;
+    // Total binary size of this cluster (only the hot part if using
+    // split-funcs).
+    uint64_t Size;
+    // Total byte-level execution frequency of the cluster
+    uint64_t Weight;
+
+    // Merge the "other" cluster into this cluster.
+    Cluster &mergeWith(Cluster &other) {
+      CFGs.insert(CFGs.end(), other.CFGs.begin(), other.CFGs.end());
+      this->Weight += other.Weight;
+      this->Size += other.Size;
+      return *this;
+    }
+
+    // Returns the per-byte execution density of this cluster
+    double getDensity() { return ((double)Weight) / Size; }
+  };
+
+  CallChainClustering() {}
+
+  void init(Propeller &propeller);
+
+  unsigned doOrder(std::list<ControlFlowGraph *> &cfgOrder);
+
+private:
+  unsigned ClusterCount = 0;
+
+  ControlFlowGraph *getMostLikelyPredecessor(ControlFlowGraph *cfg,
+                                             Cluster *cluster);
+
+  void mergeClusters();
+  void sortClusters(std::vector<Cluster *> &);
+
+  std::vector<ControlFlowGraph *> HotCFGs, ColdCFGs;
+  std::map<ControlFlowGraph *, Cluster *> CFGToClusterMap;
+  std::map<unsigned, std::unique_ptr<Cluster>> Clusters;
+};
+
+} // namespace propeller
+} // namespace lld
+
+#endif
Index: lld/ELF/PropellerFuncOrdering.cpp
===================================================================
--- /dev/null
+++ lld/ELF/PropellerFuncOrdering.cpp
@@ -0,0 +1,198 @@
+//===- PropellerFuncOrdering.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of function reordering based on +// the CallChainClustering algorithm as published in [1]. +// +// This algorithm keeps merging functions together into clusters until the +// cluster sizes reach a limit. The algorithm iterates over functions in +// decreasing order of their execution density (total frequency divided by size) +// and for each function, it first finds the cluster containing the +// most-frequent caller of that function and then places the caller's cluster +// right before the callee's cluster. Finally, all the remaining clusters are +// ordered in decreasing order of their execution density. +// +// References: +// * [1] G.Ottoni and B.Maher, Optimizing Function Placement for Large-Scale +// Data-Center Applications, CGO 2017. available at +// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +//===----------------------------------------------------------------------===// +#include "PropellerFuncOrdering.h" + +#include "Config.h" +#include "Propeller.h" +#include "PropellerELFCfg.h" + +#include +#include + +using lld::elf::config; + +namespace lld { +namespace propeller { + +const unsigned ClusterMergeSizeThreshold = 1 << 21; + +// Initializes the CallChainClustering algorithm with the cfgs from propeller. +// It separates cfgs into hot and cold cfgs and initially orders each collection +// of cfgs based on the address of their corresponding functions in the original +// binary. 
+void CallChainClustering::init(Propeller &propeller) {
+  propeller.forEachCfgRef([this](ControlFlowGraph &cfg) {
+    if (cfg.isHot())
+      this->HotCFGs.push_back(&cfg);
+    else
+      this->ColdCFGs.push_back(&cfg);
+  });
+
+  auto cfgComparator = [](ControlFlowGraph *cfg1,
+                          ControlFlowGraph *cfg2) -> bool {
+    return cfg1->getEntryNode()->MappedAddr < cfg2->getEntryNode()->MappedAddr;
+  };
+  auto sortCFGs = [&cfgComparator](std::vector<ControlFlowGraph *> &cfgs) {
+    std::sort(cfgs.begin(), cfgs.end(), cfgComparator);
+  };
+  sortCFGs(HotCFGs);
+  sortCFGs(ColdCFGs);
+}
+
+// Initializes a cluster containing a single cfg and associates it with a
+// unique id.
+CallChainClustering::Cluster::Cluster(ControlFlowGraph *cfg, unsigned id)
+    : CFGs(1, cfg), Id(id) {}
+
+// Returns the most frequent caller of a function. This function also gets as
+// the second parameter the cluster containing this function to save a lookup
+// into the CFGToClusterMap.
+ControlFlowGraph *
+CallChainClustering::getMostLikelyPredecessor(ControlFlowGraph *cfg,
+                                              Cluster *cluster) {
+  CFGNode *entry = cfg->getEntryNode();
+  if (!entry)
+    return nullptr;
+  CFGEdge *bestCallIn = nullptr;
+
+  // Iterate over all callers of the entry basic block of the function.
+  for (CFGEdge *callIn : entry->CallIns) {
+    auto *caller = callIn->Src->CFG;
+    auto *callerCluster = CFGToClusterMap[caller];
+    assert(caller->isHot());
+    // Ignore callers from the same function, or the same cluster
+    if (caller == cfg || callerCluster == cluster)
+      continue;
+    // Ignore callers with overly large clusters
+    if (callerCluster->Size > ClusterMergeSizeThreshold)
+      continue;
+    // Ignore calls which are cold relative to the callee
+    if (callIn->Weight * 10 < entry->Freq)
+      continue;
+    // Do not merge if the caller cluster's density would degrade by more than
+    // 1/8 if merged.
+    // Skip when mergedDensity = (W1+W2)/(S1+S2) < callerDensity/8 = W2/(8*S2),
+    // i.e. 8*S2*(W1+W2) < W2*(S1+S2). The weights must be ADDED here; the
+    // original "cluster->Weight * callerCluster->Weight" compared weight^2
+    // against weight and made the guard meaningless.
+    if (8 * callerCluster->Size * (cluster->Weight + callerCluster->Weight) <
+        callerCluster->Weight * (cluster->Size + callerCluster->Size))
+      continue;
+    // Update the best CallIn edge if needed
+    if (!bestCallIn || bestCallIn->Weight < callIn->Weight)
+      bestCallIn = callIn;
+  }
+  return bestCallIn ? bestCallIn->Src->CFG : nullptr;
+}
+
+// Merge clusters together based on the CallChainClustering algorithm.
+void CallChainClustering::mergeClusters() {
+  // Build a map for the execution density of each cfg. This value will depend
+  // on whether function-splitting is used or not.
+  std::map<ControlFlowGraph *, double> cfgWeightMap;
+  for (ControlFlowGraph *cfg : HotCFGs) {
+    uint64_t cfgWeight = 0;
+    uint64_t cfgSize = 0;
+    cfg->forEachNodeRef([&cfgSize, &cfgWeight](CFGNode &n) {
+      cfgWeight += n.Freq * n.ShSize;
+      if (!config->propellerSplitFuncs || n.Freq)
+        cfgSize += n.ShSize;
+    });
+
+    Cluster *c = new Cluster(cfg, ClusterCount++);
+    c->Weight = cfgWeight;
+    c->Size = std::max(cfgSize, (uint64_t)1);
+    cfgWeightMap.emplace(cfg, c->getDensity());
+    Clusters.emplace(c->Id, c);
+    CFGToClusterMap[cfg] = c;
+  }
+
+  // Sort the hot cfgs in decreasing order of their execution density.
+  std::stable_sort(
+      HotCFGs.begin(), HotCFGs.end(),
+      [&cfgWeightMap](ControlFlowGraph *cfg1, ControlFlowGraph *cfg2) {
+        return cfgWeightMap[cfg1] > cfgWeightMap[cfg2];
+      });
+
+  for (ControlFlowGraph *cfg : HotCFGs) {
+    if (cfgWeightMap[cfg] <= 0.005)
+      break;
+    auto *cluster = CFGToClusterMap[cfg];
+    // Check the pointer before it is dereferenced below.
+    assert(cluster);
+    // Ignore merging if the cluster containing this function is bigger than
+    // 2MBs (size of a large page).
+    if (cluster->Size > ClusterMergeSizeThreshold)
+      continue;
+
+    ControlFlowGraph *predecessorCfg = getMostLikelyPredecessor(cfg, cluster);
+    if (!predecessorCfg)
+      continue;
+    auto *predecessorCluster = CFGToClusterMap[predecessorCfg];
+
+    assert(predecessorCluster != cluster && predecessorCfg != cfg);
+
+    // Join the two clusters into predecessorCluster.
+    predecessorCluster->mergeWith(*cluster);
+
+    // Update cfg to cluster mapping, because all cfgs that were
+    // previously in cluster are now in predecessorCluster.
+    for (ControlFlowGraph *cfg : cluster->CFGs) {
+      CFGToClusterMap[cfg] = predecessorCluster;
+    }
+
+    // Delete the defunct cluster
+    Clusters.erase(cluster->Id);
+  }
+}
+
+// This function sorts all remaining clusters in decreasing order of their
+// execution density.
+void CallChainClustering::sortClusters(std::vector<Cluster *> &clusterOrder) {
+  for (auto &p : Clusters)
+    clusterOrder.push_back(p.second.get());
+  std::sort(clusterOrder.begin(), clusterOrder.end(),
+            [](Cluster *c1, Cluster *c2) {
+              // Set a deterministic order when execution densities are equal.
+              if (c1->getDensity() == c2->getDensity())
+                return c1->CFGs.front()->getEntryNode()->MappedAddr <
+                       c2->CFGs.front()->getEntryNode()->MappedAddr;
+              return c1->getDensity() > c2->getDensity();
+            });
+}
+
+// This function performs CallChainClustering on all cfgs and then orders all
+// the built clusters based on their execution density. It places all cold
+// functions after hot functions and returns the number of hot functions.
+unsigned CallChainClustering::doOrder(std::list<ControlFlowGraph *> &cfgOrder) {
+  mergeClusters();
+  std::vector<Cluster *> clusterOrder;
+  sortClusters(clusterOrder);
+  for (Cluster *c : clusterOrder) {
+    cfgOrder.insert(cfgOrder.end(), c->CFGs.begin(), c->CFGs.end());
+  }
+
+  cfgOrder.insert(cfgOrder.end(), ColdCFGs.begin(), ColdCFGs.end());
+  return HotCFGs.size();
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/test/ELF/propeller/propeller-layout-function-ordering.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/propeller-layout-function-ordering.s
@@ -0,0 +1,75 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises function reordering on three functions.
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. +## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. 
+# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/propeller-layout-function-with-loop.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-with-loop.s @@ -0,0 +1,130 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 14 Nfoo" > %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 3 1.2" >> %t_prof.propeller +# RUN: echo "4 5 1.3" >> %t_prof.propeller +# RUN: echo "5 4 1.4" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "4 2 90" >> %t_prof.propeller +# RUN: echo "2 4 95" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 10" >> %t_prof.propeller +# RUN: echo "1 2 5" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 foo: +# REORDER-NEXT: nopl (%rax) + +# REORDER: 0000000000201123 a.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne 9 + +# REORDER: 0000000000201128 aaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne -10 + +# 
REORDER: 000000000020112d aaaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: retq + +# REORDER: 0000000000201131 aa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jmp -14 + +## Disable basic block reordering and expect that the original bb order is retained. +# +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out +# RUN: diff %t.propeller.noreorder.out %t.out + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +foo: + nopl (%rax) + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + jmp aaa.BB.foo +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + jne a.BB.foo + jmp aaaa.BB.foo +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits,unique,4 +aaaa.BB.foo: + nopl (%rax) + ret +.Laaaa.BB.foo_end: + .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo Index: lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s @@ -0,0 +1,87 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises optimal basic block reordering one a single function. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + 
jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo Index: lld/test/ELF/propeller/propeller-opt-all-combinations.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-opt-all-combinations.s @@ -0,0 +1,295 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises all combinations of propeller options (reorder-funcs, reorder-blocks, +## and split-funcs) on four functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: Disassembly of section .text: +# BEFORE-EMPTY: +# BEFORE-NEXT: foo: +# BEFORE-NEXT: xorb %al, 0 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: bar: +# BEFORE-NEXT: xorb %al, 1 +# BEFORE-NEXT: je 9 +# BEFORE-EMPTY: +# BEFORE-NEXT: a.BB.bar: +# BEFORE-NEXT: xorb %al, 2 +# BEFORE-NEXT: jmp 7 +# BEFORE-EMPTY: +# BEFORE-NEXT: aa.BB.bar: +# BEFORE-NEXT: xorb %al, 3 +# BEFORE-EMPTY: +# BEFORE-NEXT: aaa.BB.bar: +# BEFORE-NEXT: xorb %al, 4 +# BEFORE-EMPTY: +# BEFORE-NEXT: baz: +# BEFORE-NEXT: xorb %al, 5 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: qux: +# BEFORE-NEXT: xorb %al, 6 +# + +## Create a propeller profile for four functions foo, bar, baz, and qux based on the cfg below: +## +## +## bar +## / \ +## / \100 +## / \ +## 100 v v +## foo <----- a.BB.bar aa.BB.bar baz qux ---+ +## \ / ^ | +## \ /100 | 1 | +## \ / +-----+ +## v v +## aaa.BB.bar +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> 
%t_prof.propeller +# RUN: echo "2 20 Nbar" >> %t_prof.propeller +# RUN: echo "3 9 2.1" >> %t_prof.propeller +# RUN: echo "4 7 2.2" >> %t_prof.propeller +# RUN: echo "5 7 2.3" >> %t_prof.propeller +# RUN: echo "6 8 Nbaz" >> %t_prof.propeller +# RUN: echo "7 7 Nqux" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "4 1 100 C" >> %t_prof.propeller +# RUN: echo "7 7 10 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller --verbose -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: Disassembly of section .text: +# REORDER-EMPTY: +# REORDER-NEXT: bar: +# REORDER-NEXT: xorb %al, 1 +# REORDER-NEXT: jne 30 +# REORDER-EMPTY: +# REORDER-NEXT: aa.BB.bar: +# REORDER-NEXT: xorb %al, 3 +# REORDER-EMPTY: +# REORDER-NEXT: aaa.BB.bar: +# REORDER-NEXT: xorb %al, 4 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: foo: +# REORDER-NEXT: xorb %al, 0 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: qux: +# REORDER-NEXT: xorb %al, 6 +# REORDER-EMPTY: +# REORDER-NEXT: a.BB.bar: +# REORDER-NEXT: xorb %al, 2 +# REORDER-NEXT: jmp -32 +# REORDER-EMPTY: +# REORDER-NEXT: baz: +# REORDER-NEXT: xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorderblocks.out 2>&1 | FileCheck %s --check-prefixes=WARN,IMPLICITNOSPLIT +# WARN-NOT: warning: +# IMPLICITNOSPLIT: warning: propeller: no-reorder-blocks implicitly sets no-split-funcs. 
+# +# RUN: llvm-objdump -d %t.propeller.noreorderblocks.out| FileCheck %s --check-prefix=NO_REORDER_BB +# +# NO_REORDER_BB: Disassembly of section .text: +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: bar: +# NO_REORDER_BB-NEXT: xorb %al, 1 +# NO_REORDER_BB-NEXT: je 9 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: a.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 2 +# NO_REORDER_BB-NEXT: jmp 7 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aaa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 4 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: foo: +# NO_REORDER_BB-NEXT: xorb %al, 0 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: qux: +# NO_REORDER_BB-NEXT: xorb %al, 6 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: baz: +# NO_REORDER_BB-NEXT: xorb %al, 5 + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.noreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.noreorderfuncs.out| FileCheck %s --check-prefix=NO_REORDER_FUNC +# +# NO_REORDER_FUNC: Disassembly of section .text: +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: foo: +# NO_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_REORDER_FUNC-NEXT: jne 22 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: qux: +# NO_REORDER_FUNC-NEXT: xorb %al, 6 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: a.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_REORDER_FUNC-NEXT: jmp -24 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: baz: +# NO_REORDER_FUNC-NEXT: 
xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_FUNC +# +# NO_SPLIT_FUNC: Disassembly of section .text: +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_FUNC-NEXT: jne 14 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_FUNC-NEXT: jmp -16 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: foo: +# NO_SPLIT_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: qux: +# NO_SPLIT_FUNC-NEXT: xorb %al, 6 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: baz: +# NO_SPLIT_FUNC-NEXT: xorb %al, 5 +# + +## Check that the combination of no-reorder-blocks and split-funcs makes lld fail. +# RUN: not ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-opt=split-funcs 2>&1 -o /dev/null | FileCheck %s --check-prefix=FAIL +# FAIL: propeller: Inconsistent combination of propeller optimizations 'split-funcs' and 'no-reorder-blocks'. 
+ + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitreorderfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_REORDER_FUNC +# +# NO_SPLIT_REORDER_FUNC: Disassembly of section .text: +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: foo: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_REORDER_FUNC-NEXT: jne 14 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_REORDER_FUNC-NEXT: jmp -16 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: baz: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 5 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: qux: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 6 + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +.align 4 +foo: + xor %al,0 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,1 +# -- Begin function bar +.type bar,@function +.align 4 +bar: + xor %al,1 + je aa.BB.bar + jmp a.BB.bar + +.section .text,"ax",@progbits,unique,2 +a.BB.bar: + xor %al,2 + jmp aaa.BB.bar +.La.BB.bar_end: + .size a.BB.bar, .La.BB.bar_end-a.BB.bar + +.section .text,"ax",@progbits,unique,3 +aa.BB.bar: + xor %al,3 + jmp aaa.BB.bar +.Laa.BB.bar_end: + .size aa.BB.bar, .Laa.BB.bar_end-aa.BB.bar + +.section .text,"ax",@progbits,unique,4 +aaa.BB.bar: + xor %al,4 
+.Laaa.BB.bar_end: + .size aaa.BB.bar, .Laaa.BB.bar_end-aaa.BB.bar + +.section .text,"ax",@progbits,unique,1 +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,5 +# -- Begin function baz +.type baz,@function +.align 4 +baz: + xor %al,5 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz + +.section .text,"ax",@progbits,unique,6 +# -- Begin function qux +.type qux,@function +.align 4 +qux: + xor %al,6 + +.Lqux_end: + .size qux, .Lqux_end-qux +# -- End function qux