Index: lld/ELF/Propeller/CodeLayout/CodeLayout.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/CodeLayout.h @@ -0,0 +1,42 @@ +//===- PropellerBBReordering.h -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "NodeChainClustering.h" +#include "PropellerCFG.h" +#include "PropellerConfig.h" + +#include +#include + +namespace lld { +namespace propeller { + +class CodeLayout { +private: + // CFGs that are processed by the reordering algorithm. These are separated + // into hot and cold cfgs. + std::vector HotCFGs, ColdCFGs; + // The final hot and cold order containing all cfg nodes. + std::vector HotOrder, ColdOrder; + // Handle of the clustering algorithm used to further reorder the computed + // chains. + std::unique_ptr CC; + +public: + void doSplitOrder(std::list &symbolList, + std::list::iterator hotPlaceHolder, + std::list::iterator coldPlaceHolder); + + void printStats(); +}; + +} // namespace propeller +} // namespace lld +#endif Index: lld/ELF/Propeller/CodeLayout/CodeLayout.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/CodeLayout.cpp @@ -0,0 +1,181 @@ +//===- CodeLayout.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and implements the entry point of code layout optimization +// (doSplitOrder). +//===----------------------------------------------------------------------===// +#include "CodeLayout.h" + +#include "NodeChain.h" +#include "NodeChainBuilder.h" +#include "NodeChainClustering.h" +#include "Propeller.h" +#include "PropellerCFG.h" +#include "PropellerConfig.h" + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" + +#include + +using llvm::DenseMap; +using llvm::DenseSet; + +namespace lld { +namespace propeller { +extern double getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward, + uint64_t); + +// This function iterates over the CFGs included in the Propeller profile and +// adds them to cold and hot cfg lists. Then it appropriately performs basic +// block reordering by calling NodeChainBuilder.doOrder() either on all CFGs (if +// -propeller-opt=reorder-ip) or individually on every CFG. After creating all +// the node chains, it hands the basic block chains to a ChainClustering +// instance for further rerodering. +void CodeLayout::doSplitOrder(std::list &symbolList, + std::list::iterator hotPlaceHolder, + std::list::iterator coldPlaceHolder) { + std::chrono::steady_clock::time_point start = + std::chrono::steady_clock::now(); + + // Populate the hot and cold cfg lists by iterating over the CFGs in the + // propeller profile. + prop->forEachCfgRef([this](ControlFlowGraph &cfg) { + if (cfg.isHot()) { + HotCFGs.push_back(&cfg); + if (propellerConfig.optPrintStats) { + // Dump the number of basic blocks and hot basic blocks for every + // function + unsigned hotBBs = 0; + unsigned allBBs = 0; + cfg.forEachNodeRef([&hotBBs, &allBBs](CFGNode &node) { + if (node.Freq) + hotBBs++; + allBBs++; + }); + fprintf(stderr, "HISTOGRAM: %s,%u,%u\n", cfg.Name.str().c_str(), allBBs, + hotBBs); + } + } else + ColdCFGs.push_back(&cfg); + }); + + if (propellerConfig.optReorderIP || propellerConfig.optReorderFuncs) + CC.reset(new CallChainClustering()); + else { + // If function ordering is disabled, we want to conform the the initial + // ordering of functions in both the hot and the cold layout. + CC.reset(new NoOrdering()); + } + + if (propellerConfig.optReorderIP) { + // If -propeller-opt=reorder-ip we want to run basic block reordering on all + // the basic blocks of the hot CFGs. + NodeChainBuilder(HotCFGs).doOrder(CC); + } else if (propellerConfig.optReorderBlocks) { + // Otherwise we apply reordering on every CFG separately + for (ControlFlowGraph *cfg : HotCFGs) + NodeChainBuilder(cfg).doOrder(CC); + } else { + // If reordering is not desired, we create changes according to the initial + // order in the CFG. + for (ControlFlowGraph *cfg : HotCFGs) + CC->addChain(std::unique_ptr(new NodeChain(cfg))); + } + + // The order for cold cfgs remains unchanged. + for (ControlFlowGraph *cfg : ColdCFGs) + CC->addChain(std::unique_ptr(new NodeChain(cfg))); + + // After building all the chains, let the chain clustering algorithm perform + // the final reordering and populate the hot and cold cfg node orders. + CC->doOrder(HotOrder, ColdOrder); + + // Transfter the order to the symbol list. + for (CFGNode *n : HotOrder) + symbolList.insert(hotPlaceHolder, n->ShName); + + for (CFGNode *n : ColdOrder) + symbolList.insert(coldPlaceHolder, n->ShName); + + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + warn("[Propeller]: BB reordering took: " + + Twine(std::chrono::duration_cast(end - start) + .count())); + + if (propellerConfig.optPrintStats) + printStats(); +} + +// Prints statistics of the computed layout, including the number of partitions +// for each function across the code layout, the edge count for each distance +// level, and the ExtTSP score achieved for each function. +void CodeLayout::printStats() { + + DenseMap nodeAddressMap; + llvm::StringMap functionPartitions; + uint64_t currentAddress = 0; + ControlFlowGraph *currentCFG = nullptr; + for (CFGNode *n : HotOrder) { + if (currentCFG != n->CFG) { + currentCFG = n->CFG; + functionPartitions[currentCFG->Name]++; + } + nodeAddressMap[n] = currentAddress; + currentAddress += n->ShSize; + } + + for (auto &elem : functionPartitions) + fprintf(stderr, "FUNCTION PARTITIONS: %s,%u\n", elem.first().str().c_str(), + elem.second); + + std::vector distances({0, 128, 640, 1028, 4096, 65536, 2 << 20, + std::numeric_limits::max()}); + std::map histogram; + llvm::StringMap extTSPScoreMap; + for (CFGNode *n : HotOrder) { + auto scoreEntry = extTSPScoreMap.try_emplace(n->CFG->Name, 0).first; + n->forEachOutEdgeRef([&nodeAddressMap, &distances, &histogram, + &scoreEntry](CFGEdge &edge) { + if (!edge.Weight || edge.isReturn()) + return; + if (nodeAddressMap.find(edge.Src) == nodeAddressMap.end() || + nodeAddressMap.find(edge.Sink) == nodeAddressMap.end()) { + warn("Found a hot edge whose source and sink do not show up in the " + "layout!"); + return; + } + uint64_t srcOffset = nodeAddressMap[edge.Src]; + uint64_t sinkOffset = nodeAddressMap[edge.Sink]; + bool edgeForward = srcOffset + edge.Src->ShSize <= sinkOffset; + uint64_t srcSinkDistance = + edgeForward ? sinkOffset - srcOffset - edge.Src->ShSize + : srcOffset - sinkOffset + edge.Src->ShSize; + + if (edge.Type == CFGEdge::EdgeType::INTRA_FUNC || + edge.Type == CFGEdge::EdgeType::INTRA_DYNA) + scoreEntry->second += + getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance); + + auto res = + std::lower_bound(distances.begin(), distances.end(), srcSinkDistance); + histogram[*res] += edge.Weight; + }); + } + + for (auto &elem : extTSPScoreMap) + fprintf(stderr, "Ext TSP Score: %s %.6f\n", elem.first().str().c_str(), + elem.second); + fprintf(stderr, "DISTANCE HISTOGRAM: "); + for (auto elem : histogram) + fprintf(stderr, "\t[%lu -> %lu]", elem.first, elem.second); + fprintf(stderr, "\n"); +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h @@ -0,0 +1,387 @@ +//===- ModifiablePriorityQueue.h ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a priority-queue data structure for storing key-value +// pairs, with compare operations for both the key and the value type given. +// It stores these pairs in a max-heap data structure which allows for fast O(lg +// n)-time retrieval of the maximum element. +// To compare two key-value pairs, the values parts are compared first. The keys +// are compared in case of a tie. No two key-value pairs could have the same +// key. +// +// The key-value pairs are also accessible using a map data structure. +// +// In contrast to C++ STL's priority queue, this data structure allows for fast +// O(lg n)-time value update, or removal of any element while keep the max-heap +// data structure sound. +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H +#define LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H + +#include "llvm/ADT/DenseMap.h" +#include +#include +#include +#include +#include +#include + +// This class defines a node in the ModifiablePriorityQueue data structure +// containing a key, a value, pointers to its parent, and its left and right +// children. This class must be instantiated with the key-type, the value type, +// and their comparators. +template class HeapNode { +public: + K key; + V value; + // Pointer to the parent (this is nullptr for root). + HeapNode *parent; + // Pointers to the two children + HeapNode *children[2]; + + HeapNode(K k, V v) + : key(k), value(std::move(v)), parent(nullptr), children() {} + + // Get pointer to the left child + HeapNode *leftChild() { return children[0]; } + + // Get pointer to the right child + HeapNode *rightChild() { return children[1]; } + + // Whether this is the left child of its parent, returns false if this is the + // root. + bool isLeftChild() { + if (parent) + return parent->leftChild() == this; + return false; + } + + // Whether this is the right child of its parent, returns false if this is the + // root. + bool isRightChild() { + if (parent) + return parent->rightChild() == this; + return false; + } + + // Set the left child for this node. + void adoptLeftChild(HeapNode *c) { + children[0] = c; + if (c) + c->parent = this; + } + + // Set the right child for this node. + void adoptRightChild(HeapNode *c) { + children[1] = c; + if (c) + c->parent = this; + } + + // Set both the left and right children for this node. + void adoptChildren(HeapNode *(&_children)[2]) { + adoptLeftChild(_children[0]); + adoptRightChild(_children[1]); + } + + // Return a string representation for this node given an indentation level + // which is used for pretty-printing of the heap tree. + // This assumes that + // std::to_string has been specialized for the key and value types. + std::string toString(unsigned level) { + std::string str = std::string(level, ' '); + str += "NODE: " + std::to_string(key) + " -> " + std::to_string(value); + for (auto *c : children) { + if (c) { + str += "\n"; + str += c->toString(level + 1); + } + } + return str; + } +}; + +// Comparator class for nodes given comparators for the key and value types. It +// compares the values for the two pairs first and in the case of a tie, it +// compares the keys. +template struct CompareHeapNode { + CmpK KeyComparator; + CmpV ValueComparator; + bool operator()(const HeapNode &n1, + const HeapNode &n2) const { + if (ValueComparator(n1.value, n2.value)) + return true; + if (ValueComparator(n2.value, n1.value)) + return false; + return KeyComparator(n1.key, n2.key); + } +}; + +// This class implements a pointer-based heap-like data structure for storing +// key-value pairs with efficient O(lg n) retrieval, removal, update, and +// insertion of nodes. +// +// Any insertion happens at the lowest-right-most empty leaf position. This +// position is accessible via the root pointer of the tree using the binary +// representation of [size of the tree + 1]. Starting with the left-most bit, we +// take the left child if the bit is zero and the right child if the bit is one. +// +// Deletion of a node happens by replacing the last node (lowest-right-most +// occupied leaf), with the given node, removing the node from the tree, and +// finally heapifying the replaced node up and down to keep the +// soundness of the heap. +// +// Value-update is the simplest, which requires updating the node's value +// followed by heapifying up the node up and down. +template +class ModifiablePriorityQueue { +private: + // Comparator instance for comparing HeapNodes. + CompareHeapNode HeapNodeComparator; + + // Pointers to the nodes in this heap are stored in a map to allow for fast + // lookup of nodes given their keys. + llvm::DenseMap>> nodes; + + // The tree-root for this priority queue, which holds the max key-value. + HeapNode *root = nullptr; + + // Total number of nodes in the tree. + unsigned Size = 0; + + // Set the root for this tree and set it's parent to null. + void assignRoot(HeapNode *node) { + root = node; + if (root) + root->parent = nullptr; + } + + // Helper for getting a pointer to a node using a handle using its bits. + HeapNode *getNodeWithHandle(unsigned handle) { + return getNodeWithHandleHelper(root, handle); + } + + // This function traverses the tree from the given node, using an integer + // handle for direction. Going from left to right in the binary representation + // of the handle, take right if the bit is one, and left if the bit is zero. + HeapNode * + getNodeWithHandleHelper(HeapNode *node, unsigned handle) { + if (handle == 1) + return node; + auto *p = getNodeWithHandleHelper(node, handle >> 1); + return (handle & 1) ? p->rightChild() : p->leftChild(); + } + + // Insert a node into the heap tree. + void insert(HeapNode *node) { + if (!root) { + // If the tree is empty, we only need to set the root. + assignRoot(node); + } else { + // Otherwise, insert at the lowest-right-most leaf empty-position, which + // is encoded by [Size + 1] + unsigned handle = Size + 1; + // We need to find the parent of the target position. + auto *p = getNodeWithHandleHelper(root, handle >> 1); + // Attach this to the left or right of the parent node, depending on the + // last bit in handle. + if (handle & 1) + p->adoptRightChild(node); + else + p->adoptLeftChild(node); + // We only need to heapify up as this node has no children. + heapifyUp(node); + } + // Increment the size of the tree. + Size++; + } + + // This function removes a node from the heap and returns its value. + // Instead of removing the node directly, it moves the lowest-right-most node + // to the position of the given node and heapifies it up and down. + // + // After this function, the removed node's HeapNode pointer is invalid. + V remove(HeapNode *node) { + // Find the lowest-right-most node + auto *last = getNodeWithHandleHelper(root, Size); + assert(last->leftChild() == nullptr && last->rightChild() == nullptr); + + // Remove the node from the children of its parents. + if (last->parent) { + if (last->isLeftChild()) + last->parent->adoptLeftChild(nullptr); + else + last->parent->adoptRightChild(nullptr); + } + + if (node == last) { + // If we are removing the last node, not much needs to be done. + if (node->parent == nullptr) { + assert(node == root); + assignRoot(nullptr); + } + } else { // node != last + if (node->parent) { + // Move the last node in place of the to-be-removed node. + if (node->isLeftChild()) + node->parent->adoptLeftChild(last); + else + node->parent->adoptRightChild(last); + } else { // node->parent == nullptr, so node is root + assert(node == root); + assignRoot(last); + } + // Let the moved-node adopt the children of the to-be-removed node + last->adoptChildren(node->children); + // Finally, heapify the moved node up and down. + if (!heapifyUp(last)) + heapifyDown(last); + } + + // Decrement the number of nodes. + Size--; + // Save the node's value to be returned. + auto nodeElement = std::move(node->value); + // Remove the node from the key->node mapping, effectively deallocating the + // HeapNode. + nodes.erase(node->key); + return nodeElement; + } + + // Recursively rectify the max-heap upwards from a node. + // This takes O(lg n) time. + // Returns true if any modifications occurs. + bool heapifyUp(HeapNode *node) { + // Heapify upwards until no more inconsistency is found. + if (node->parent && HeapNodeComparator(*node->parent, *node)) { + swapWithParent(node); + heapifyUp(node); + return true; + } + return false; + } + + // Recursively rectify the max-heap downwards from a node. + // This takes O(lg n) time. + // Returns true if any modifications occurs. + bool heapifyDown(HeapNode *node) { + auto maxChild = std::max_element( + node->children, node->children + 2, + [this](const HeapNode *c1, + const HeapNode *c2) { + // This compares two HeapNode pointers, relying on + // HeapNodeComparator for non-null pointers. + return c1 == nullptr + ? true + : (c2 == nullptr ? false : HeapNodeComparator(*c1, *c2)); + }); + // Move the maximum child to the parent if required and further heapify + // down. + if (*maxChild != nullptr && HeapNodeComparator(*node, **maxChild)) { + swapWithParent(*maxChild); + heapifyDown(node); + return true; + } + return false; + } + + // Helper function for swapping a node with its parent. + // Can only be called if the given node has a parent. Also, the given node's + // value should compare bigger than both its parent and its sibling. + // For example swapWithParent(C1) results in the following. + // G G + // / / + // P -> C1 + // / \ / \ + // C1 C2 P C2 + // / \ / \ + // A B A B + // This function merely properlly corrects the parent-child relationships. + void swapWithParent(HeapNode *node) { + auto *par = node->parent; + assert(par); + auto *gpar = node->parent->parent; + if (!gpar) { + assert(this->root == par); + this->assignRoot(node); + } else { + if (par->isLeftChild()) + gpar->adoptLeftChild(node); + else + gpar->adoptRightChild(node); + } + auto *par_old_left = par->leftChild(); + auto *par_old_right = par->rightChild(); + + par->adoptChildren(node->children); + if (par_old_left == node) { + node->adoptLeftChild(par); + node->adoptRightChild(par_old_right); + } else { + node->adoptLeftChild(par_old_left); + node->adoptRightChild(par); + } + } + +public: + // Insert a key-value pair into the heap. + // If the same key exists in the heap, it updates the value of the existing + // HeapNode and heapifies the node up or down. + void insert(K key, V value) { + auto cur = nodes.find(key); + if (cur != nodes.end()) { + cur->second->value = std::move(value); + if (!heapifyUp(cur->second.get())) + heapifyDown(cur->second.get()); + } else { + // If the key is not found, create a new HeapNode and insert it into the + // heap. + auto *node = new HeapNode(key, std::move(value)); + insert(node); + nodes.try_emplace(key, node); + } + } + + // Removes the node associated with a key from the heap if one exists. + void erase(K k) { + auto cur = nodes.find(k); + if (cur != nodes.end()) + remove(cur->second.get()); + } + + // Retrieves the HeapNode corresponding to a given key (returns nullptr if no + // such node exists). + HeapNode *get(K k) { + auto it = nodes.find(k); + return it == nodes.end() ? nullptr : it->second.get(); + } + + // Removes the root (max-element) from the heap. + void pop() { + assert(!empty()); + remove(root); + } + + // Returns the value of the max-element node. + V top() { return std::move(root->value); } + + bool empty() { return Size == 0; } + + unsigned size() { return Size; }; + + // Returns a string representation of the heap tree. + std::string toString() { + std::string str; + str += "HEAP with "; + str += std::to_string(Size); + str += " nodes\n"; + if (root) + str += root->toString(0); + return str; + } +}; +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChain.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChain.h @@ -0,0 +1,124 @@ +//===- PropellerNodeChain.h ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_NODE_CHAIN_H +#define LLD_ELF_PROPELLER_NODE_CHAIN_H + +#include "PropellerCFG.h" +#include "llvm/ADT/DenseSet.h" + +#include +#include +#include + +using llvm::DenseSet; + +namespace lld { +namespace propeller { + +// Represents a chain of nodes (basic blocks). +class NodeChain { +public: + // Representative node of the chain, with which it is initially constructed. + CFGNode *DelegateNode; + + // CFG of the nodes in this chain (this will be null if the nodes come from + // more than one cfg. + ControlFlowGraph *CFG; + + // Ordered list of the nodes in this chain. + std::list Nodes; + + // Iterators to the positions in the chain where the chain transitions from + // one function to another. + std::list::iterator> FunctionTransitions; + + // Out edges for this chain, to its own nodes and nodes of other chains. + std::unordered_map> OutEdges; + + // Chains which have outgoing edges to this chain. + DenseSet InEdges; + + // Total binary size of the chain. + uint64_t Size; + + // Total execution frequency of the chain. + uint64_t Freq; + + // Extended TSP score of the chain. + double Score = 0; + + // Whether to print out information about how this chain joins with others. + bool DebugChain; + + // Constructor for building a NodeChain from a single Node + NodeChain(CFGNode *node) + : DelegateNode(node), CFG(node->CFG), Nodes(1, node), Size(node->ShSize), + Freq(node->Freq), DebugChain(node->CFG->DebugCFG) {} + + // Constructor for building a NodeChain from all nodes in the CFG according to + // the initial order. + NodeChain(ControlFlowGraph *cfg) + : DelegateNode(cfg->getEntryNode()), CFG(cfg), Size(cfg->Size), Freq(0), + DebugChain(cfg->DebugCFG) { + cfg->forEachNodeRef([this](CFGNode &node) { + Nodes.push_back(&node); + Freq += node.Freq; + }); + } + + // Helper function to iterate over the outgoing edges of this chain to a + // specific chain, while applying a given function on each edge. + template + void forEachOutEdgeToChain(NodeChain *chain, Visitor V) { + auto it = OutEdges.find(chain); + if (it == OutEdges.end()) + return; + for (CFGEdge *E : it->second) + V(*E, this, chain); + } + + // This returns the execution density of the chain. + double execDensity() const { + return ((double)Freq) / std::max(Size, (uint64_t)1); + } +}; + +// This returns a string representation of the chain +std::string toString(const NodeChain &c); + +} // namespace propeller +} // namespace lld + +namespace std { +// Specialization of std::less for NodeChain, which allows for consistent +// tie-breaking in our Map data structures. +template <> struct less { + bool operator()(const lld::propeller::NodeChain *c1, + const lld::propeller::NodeChain *c2) const { + return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr; + } +}; + +// Specialization of std::less for pair, which allows for +// consistent tie-breaking in our Map data structures. +template <> +struct less> { + bool operator()( + const pair p1, + const pair p2) + const { + if (less()(p1.first, p2.first)) + return true; + if (less()(p2.first, p1.first)) + return false; + return less()(p1.second, p2.second); + } +}; +} // namespace std + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChain.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChain.cpp @@ -0,0 +1,32 @@ +//===- NodeChain.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "NodeChain.h" + +namespace lld { +namespace propeller { + +std::string toString(const NodeChain &c) { + std::string str; + size_t cfgNameLength = c.DelegateNode->CFG->Name.size(); + str += c.DelegateNode->CFG->Name.str() + " [ "; + for (auto *n : c.Nodes) { + str += n->CFG->getEntryNode() == n + ? "Entry" + : std::to_string(n->ShName.size() - cfgNameLength - 4); + str += " (size=" + std::to_string(n->ShSize) + + ", freq=" + std::to_string(n->Freq) + ")"; + if (n != c.Nodes.back()) + str += " -> "; + } + str += " ]"; + str += " score: " + std::to_string(c.Score); + return str; +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h @@ -0,0 +1,184 @@ +//===- PropellerNodeChainAssembly.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file includes the declaration of the NodeChainAssembly class. Each +// instance of this class gives a recipe for merging two NodeChains together in +// addition to the ExtTSP score gain that will be achieved by that merge. Each +// NodeChainAssembly consists of three NodeChainSlices from two node chains: the +// (potentially) splitted chain, and the unsplit chain. A NodeChainSlice has +// represents a slice of a NodeChain by storing iterators to the beginning and +// end of that slice in the node chain, plus the binary offsets at which these +// slices begin and end. The offsets allow effient computation of the gain in +// ExtTSP score. +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_NODE_CHAIN_ASSEMBLY_H +#define LLD_ELF_PROPELLER_NODE_CHAIN_ASSEMBLY_H + +#include "NodeChain.h" +#include "PropellerCFG.h" + +#include +#include + +namespace lld { +namespace propeller { + +double getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward, + uint64_t srcSinkDistance); + +class NodeChainSlice { +public: + // Chain from which this slice comes from + NodeChain *Chain; + + // The endpoints of the slice in the corresponding chain + std::list::iterator Begin, End; + + // The offsets corresponding to the two endpoints + uint64_t BeginOffset, EndOffset; + + // Constructor for building a chain slice from a given chain and the two + // endpoints of the chain. + NodeChainSlice(NodeChain *c, std::list::iterator begin, + std::list::iterator end) + : Chain(c), Begin(begin), End(end) { + + BeginOffset = (*begin)->ChainOffset; + if (End == Chain->Nodes.end()) + EndOffset = Chain->Size; + else + EndOffset = (*end)->ChainOffset; + } + + // (Binary) size of this slice + uint64_t size() const { return EndOffset - BeginOffset; } +}; + +// This enum represents the order in which three slices (X1, X2, and Y) are +// merged together. +enum MergeOrder { + Begin, + X2X1Y = Begin, + BeginNext, + X1YX2 = BeginNext, + X2YX1, + YX2X1, + End +}; + +class NodeChainAssembly { +public: + // The gain in ExtTSP score achieved by this NodeChainAssembly once it + // is accordingly applied to the two chains. + // This is effectively equal to "Score - splitChain->Score - + // unsplitChain->Score". + double ScoreGain = 0; + + // The two chains, the first being the splitChain and the second being the + // unsplitChain. + std::pair ChainPair; + + // The slice position of splitchain + std::list::iterator SlicePosition; + + // The three chain slices + std::vector Slices; + + // The merge order of the slices + MergeOrder MOrder; + + // The constructor for creating a NodeChainAssembly. slicePosition must be an + // iterator into chainX. + NodeChainAssembly(NodeChain *chainX, NodeChain *chainY, + std::list::iterator slicePosition, + MergeOrder mOrder) + : ChainPair(chainX, chainY), SlicePosition(slicePosition), + MOrder(mOrder) { + NodeChainSlice x1(chainX, chainX->Nodes.begin(), SlicePosition); + NodeChainSlice x2(chainX, SlicePosition, chainX->Nodes.end()); + NodeChainSlice y(chainY, chainY->Nodes.begin(), chainY->Nodes.end()); + + switch (MOrder) { + case MergeOrder::X2X1Y: + Slices = {std::move(x2), std::move(x1), std::move(y)}; + break; + case MergeOrder::X1YX2: + Slices = {std::move(x1), std::move(y), std::move(x2)}; + break; + case MergeOrder::X2YX1: + Slices = {std::move(x2), std::move(y), std::move(x1)}; + break; + case MergeOrder::YX2X1: + Slices = {std::move(y), std::move(x2), std::move(x1)}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + + // Set the ExtTSP Score gain as the difference between the new score after + // merging these chains and the current scores of the two chains. + ScoreGain = + computeExtTSPScore() - splitChain()->Score - unsplitChain()->Score; + } + + bool isValid() { return ScoreGain > 0.0001; } + + // Find the NodeChainSlice in this NodeChainAssembly which contains the given + // node. If the node is not contained in this NodeChainAssembly, then return + // false. Otherwise, set idx equal to the index of the corresponding slice and + // return true. + bool findSliceIndex(CFGNode *node, NodeChain *chain, uint64_t offset, + uint8_t &idx) const; + + // This function computes the ExtTSP score for a chain assembly record. This + // goes the three BB slices in the assembly record and considers all edges + // whose source and sink belongs to the chains in the assembly record. + double computeExtTSPScore() const; + + // First node in the resulting assembled chain. + CFGNode *getFirstNode() const { + for (auto &slice : Slices) + if (slice.Begin != slice.End) + return *slice.Begin; + return nullptr; + } + + struct CompareNodeChainAssembly { + bool operator()(const std::unique_ptr &a1, + const std::unique_ptr &a2) const; + }; + + friend class NodeChainBuilder; + + // We delete the copy constructor to make sure NodeChainAssembly is moved + // rather than copied. + NodeChainAssembly(NodeChainAssembly &&) = default; + // copy constructor is implicitly deleted + // NodeChainAssembly(NodeChainAssembly&) = delete; + NodeChainAssembly() = delete; + + std::pair assemblyStrategy() const { + return std::make_pair(MOrder, (*SlicePosition)->MappedAddr); + } + + inline bool split() const { + return SlicePosition != splitChain()->Nodes.begin(); + } + + inline NodeChain *splitChain() const { return ChainPair.first; } + + inline NodeChain *unsplitChain() const { return ChainPair.second; } + + inline MergeOrder mergeOrder() const { return MOrder; } +}; + +std::string toString(NodeChainAssembly &assembly); + +} // namespace propeller +} // namespace lld + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp @@ -0,0 +1,204 @@ +//===- NodeChainAssembly.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "NodeChainAssembly.h" +#include "NodeChain.h" + +namespace lld { +namespace propeller { + +// Return the Extended TSP score for one edge, given its source to sink +// direction and distance in the layout. +double getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward, + uint64_t srcSinkDistance) { + // Approximate callsites to be in the middle of the source basic block. + if (edge.isCall()) { + if (isEdgeForward) + srcSinkDistance += edge.Src->ShSize / 2; + else + srcSinkDistance -= edge.Src->ShSize / 2; + } + + if (edge.isReturn()) { + if (isEdgeForward) + srcSinkDistance += edge.Sink->ShSize / 2; + else + srcSinkDistance -= edge.Sink->ShSize / 2; + } + + if (srcSinkDistance == 0 && (edge.Type == CFGEdge::EdgeType::INTRA_FUNC || + edge.Type == CFGEdge::EdgeType::INTRA_DYNA)) + return edge.Weight * propellerConfig.optFallthroughWeight; + + if (isEdgeForward && srcSinkDistance < propellerConfig.optForwardJumpDistance) + return edge.Weight * propellerConfig.optForwardJumpWeight * + (1.0 - + ((double)srcSinkDistance) / propellerConfig.optForwardJumpDistance); + + if (!isEdgeForward && + srcSinkDistance < propellerConfig.optBackwardJumpDistance) + return edge.Weight * propellerConfig.optBackwardJumpWeight * + (1.0 - ((double)srcSinkDistance) / + propellerConfig.optBackwardJumpDistance); + return 0; +} + +bool NodeChainAssembly::findSliceIndex(CFGNode *node, NodeChain *chain, + uint64_t offset, uint8_t &idx) const { + for (idx = 0; idx < 3; ++idx) { + if (chain != Slices[idx].Chain) + continue; + // We find if the node's offset lies with the begin and end offset of this + // slice. + if (offset < Slices[idx].BeginOffset || offset > Slices[idx].EndOffset) + continue; + if (offset < Slices[idx].EndOffset && offset > Slices[idx].BeginOffset) + return true; + // A node can have zero size, which means multiple nodes may be associated + // with the same offset. This means that if the node's offset is at the + // beginning or the end of the slice, the node may reside in either slices + // of the chain. + if (offset == Slices[idx].EndOffset) { + // If offset is at the end of the slice, iterate backwards over the + // slice to find a zero-sized node. + for (auto nodeIt = std::prev(Slices[idx].End); + nodeIt != std::prev(Slices[idx].Begin); nodeIt--) { + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if ((*nodeIt)->ShSize) + break; + // Have we found the node? + if (*nodeIt == node) + return true; + } + } + if (offset == Slices[idx].BeginOffset) { + // If offset is at the beginning of the slice, iterate forwards over the + // slice to find the node. + for (auto nodeIt = Slices[idx].Begin; nodeIt != Slices[idx].End; + nodeIt++) { + if (*nodeIt == node) + return true; + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if ((*nodeIt)->ShSize) + break; + } + } + } + return false; +} + +// This function computes the ExtTSP score for a chain assembly record. This +// goes the three BB slices in the assembly record and considers all edges +// whose source and sink belongs to the chains in the assembly record. +double NodeChainAssembly::computeExtTSPScore() const { + // Zero-initialize the score. + double score = 0; + + auto addEdgeScore = [this, &score](CFGEdge &edge, NodeChain *srcChain, + NodeChain *sinkChain) { + uint8_t srcSliceIdx, sinkSliceIdx; + auto srcNodeOffset = edge.Src->ChainOffset; + auto sinkNodeOffset = edge.Sink->ChainOffset; + if (!findSliceIndex(edge.Src, srcChain, srcNodeOffset, srcSliceIdx)) + return; + + if (!findSliceIndex(edge.Sink, sinkChain, sinkNodeOffset, sinkSliceIdx)) + return; + + bool edgeForward = (srcSliceIdx < sinkSliceIdx) || + (srcSliceIdx == sinkSliceIdx && + (srcNodeOffset + edge.Src->ShSize <= sinkNodeOffset)); + + uint64_t srcSinkDistance = 0; + + if (srcSliceIdx == sinkSliceIdx) { + srcSinkDistance = edgeForward + ? sinkNodeOffset - srcNodeOffset - edge.Src->ShSize + : srcNodeOffset - sinkNodeOffset + edge.Src->ShSize; + } else { + const NodeChainSlice &srcSlice = Slices[srcSliceIdx]; + const NodeChainSlice &sinkSlice = Slices[sinkSliceIdx]; + srcSinkDistance = + edgeForward + ? srcSlice.EndOffset - srcNodeOffset - edge.Src->ShSize + + sinkNodeOffset - sinkSlice.BeginOffset + : srcNodeOffset - srcSlice.BeginOffset + edge.Src->ShSize + + sinkSlice.EndOffset - sinkNodeOffset; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. + if (std::abs(sinkSliceIdx - srcSliceIdx) == 2) + srcSinkDistance += Slices[1].size(); + } + + score += getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance); + }; + + // No changes will be made to the score that is contributed by the unsplit + // chain and we can simply increment by the chain's stored score. + score += unsplitChain()->Score; + + // We need to recompute the score induced by the split chain (if it has really + // been split) as the offsets of the nodes have changed. + if (split()) + splitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore); + else + score += splitChain()->Score; + + // Consider the contribution to score for inter-chain edges. + splitChain()->forEachOutEdgeToChain(unsplitChain(), addEdgeScore); + unsplitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore); + + return score; +} + +bool NodeChainAssembly::CompareNodeChainAssembly::operator()( + const std::unique_ptr &a1, + const std::unique_ptr &a2) const { + + if (a1->ScoreGain == a2->ScoreGain) { + if (std::less>()(a1->ChainPair, + a2->ChainPair)) + return true; + if (std::less>()(a2->ChainPair, + a1->ChainPair)) + return false; + return a1->assemblyStrategy() < a2->assemblyStrategy(); + } + return a1->ScoreGain < a2->ScoreGain; +} + +static std::string toString(MergeOrder mOrder) { + switch (mOrder) { + case MergeOrder::X2X1Y: + return "X2X1Y"; + case MergeOrder::X1YX2: + return "X1YX2"; + case MergeOrder::X2YX1: + return "X2YX1"; + case MergeOrder::YX2X1: + return "YX2X1"; + default: + assert("Invalid MergeOrder!" && false); + return ""; + } +} + +std::string toString(NodeChainAssembly &assembly) { + std::string str("assembly record between:\n"); + str += lld::propeller::toString(*assembly.splitChain()) + " as X\n"; + str += lld::propeller::toString(*assembly.unsplitChain()) + " as Y\n"; + // str += "split position (X):, " + std::to_string(assembly.SlicePosition - + // assembly.splitChain()->Nodes.begin()) + "\n"; + str += "merge order: " + lld::propeller::toString(assembly.MOrder) + "\n"; + str += "ScoreGain: " + std::to_string(assembly.ScoreGain); + return str; +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h @@ -0,0 +1,126 @@ +//===- PropellerNodeChainBuilder.h ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_NODE_CHAIN_BUILDER_H +#define LLD_ELF_PROPELLER_NODE_CHAIN_BUILDER_H + +#include "ModifiablePriorityQueue.h" +#include "NodeChain.h" +#include "NodeChainAssembly.h" +#include "NodeChainClustering.h" +#include "PropellerCFG.h" + +#include "llvm/ADT/DenseMap.h" + +#include +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +// BB Chain builder based on the ExtTSP metric +class NodeChainBuilder { +private: + NodeChainAssembly::CompareNodeChainAssembly NodeChainAssemblyComparator; + // Cfgs repreresenting the functions that are reordered + std::vector CFGs; + + // Set of built chains, keyed by section index of their Delegate Nodes. + // Chains are removed from this Map once they are merged into other chains. + DenseMap> Chains; + + // All the initial chains, seperated into connected components + std::vector> Components; + + // NodeChainBuilder performs BB ordering component by component. + // This is the component number that the chain builder is currently working + // on. + unsigned CurrentComponent; + + // These represent all the edges which are -- based on the profile -- the only + // (executed) outgoing edges from their source node and the only (executed) + // incoming edges to their sink nodes. The algorithm will make sure that these + // edges form fall-throughs in the final order. + DenseMap MutuallyForcedOut; + + // This maps every (ordered) pair of chains (with the first chain in the pair + // potentially splittable) to the highest-gain NodeChainAssembly for those + // chains. The Heap data structure allows fast retrieval of the maximum gain + // NodeChainAssembly, along with fast update. + ModifiablePriorityQueue, + std::unique_ptr, + std::less>, + NodeChainAssembly::CompareNodeChainAssembly> + NodeChainAssemblies; + + // This map stores the candidate chains for each chain. + // + // For every chain, its candidate chains are the chains which can increase the + // overall ExtTSP score when merged with that chain. This is used to update + // the NodeChainAssemblies map whenever chains merge together. The candidate + // chains of a chain may also be updated as result of a merge. + std::unordered_map> + CandidateChains; + + void coalesceChains(); + void initializeExtTSP(); + + // This function separates builds the connected components for the chains so + // it can later merge each connected component separately. + // Chains which are not connected to each other via profile edges will never + // be merged together by NodeChainBuilder. However, the composed chains may + // later be interleaved by ChainClustering. + void initializeChainComponents(); + void attachFallThroughs(); + + // This function tries to place two nodes immediately adjacent to + // each other (used for fallthroughs). + // Returns true if this can be done. + bool attachNodes(CFGNode *src, CFGNode *sink); + + void mergeChainEdges(NodeChain *splitChain, NodeChain *unsplitChain); + + void mergeInOutEdges(NodeChain *mergerChain, NodeChain *mergeeChain); + void mergeChains(NodeChain *leftChain, NodeChain *rightChain); + void mergeChains(std::unique_ptr assembly); + + // Recompute the ExtTSP score of a chain + double computeExtTSPScore(NodeChain *chain) const; + + // Update the related NodeChainAssembly records for two chains, with the + // assumption that unsplitChain has been merged into splitChain. + bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain); + + void mergeAllChains(); + + void init(); + + // Initialize the mutuallyForcedOut map + void initMutuallyForcedEdges(ControlFlowGraph &cfg); + + // Initialize basic block chains, with one chain for every node + void initNodeChains(ControlFlowGraph &cfg); + +public: + // This invokes the Extended TSP algorithm, orders the hot and cold basic + // blocks and inserts their associated symbols at the corresponding locations + // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the + // given SymbolList. + void doOrder(std::unique_ptr &CC); + + NodeChainBuilder(std::vector &cfgs) : CFGs(cfgs) {} + + NodeChainBuilder(ControlFlowGraph *cfg) : CFGs(1, cfg) {} +}; + +} // namespace propeller +} // namespace lld + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp @@ -0,0 +1,809 @@ +//===- NodeChainBuilder.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of intra-function basic block +// reordering algorithm based on the Extended TSP metric as described in [1]. +// +// The Extend TSP metric (ExtTSP) provides a score for every ordering of basic +// blocks in a function, by combining the gains from fall-throughs and short +// jumps. +// +// Given an ordering of the basic blocks, for a function f, the ExtTSP score is +// computed as follows. +// +// sum_{edges e in f} frequency(e) * weight(e) +// +// where frequency(e) is the execution frequency and weight(e) is computed as +// follows: +// * 1 if distance(Src[e], Sink[e]) = 0 (i.e. fallthrough) +// +// * 0.1 * (1 - distance(Src[e], Sink[e]) / 1024) if Src[e] < Sink[e] and 0 < +// distance(Src[e], Sink[e]) < 1024 (i.e. short forward jump) +// +// * 0.1 * (1 - distance(Src[e], Sink[e]) / 640) if Src[e] > Sink[e] and 0 < +// distance(Src[e], Sink[e]) < 640 (i.e. short backward jump) +// +// * 0 otherwise +// +// In short, it computes a weighted sum of frequencies of all edges in the +// control flow graph. Each edge gets its weight depending on whether the given +// ordering makes the edge a fallthrough, a short forward jump, or a short +// backward jump. +// +// Although this problem is NP-hard like the regular TSP, an iterative greedy +// basic-block-chaining algorithm is used to find a close to optimal solution. +// This algorithm is described as follows. +// +// Starting with one basic block sequence (BB chain) for every basic block, the +// algorithm iteratively joins BB chains together in order to maximize the +// extended TSP score of all the chains. +// +// Initially, it finds all mutually-forced edges in the profiled CFG. These are +// all the edges which are -- based on the profile -- the only (executed) +// outgoing edge from their source node and the only (executed) incoming edges +// to their sink nodes. Next, the source and sink of all mutually-forced edges +// are attached together as fallthrough edges. +// +// Then, at every iteration, the algorithm tries to merge a pair of BB chains +// which leads to the highest gain in the ExtTSP score. The algorithm extends +// the search space by considering splitting short (less than 128 bytes in +// binary size) BB chains into two chains and then merging these two chains with +// the other chain in four ways. After every merge, the new merge gains are +// updated. The algorithm repeats joining BB chains until no additional can be +// achieved. Eventually, it sorts all the existing chains in decreasing order +// of their execution density, i.e., the total profiled frequency of the chain +// divided by its binary size. +// +// The values used by this algorithm are reconfiguriable using lld's propeller +// flags. Specifically, these parameters are: +// +// * propeller-forward-jump-distance: maximum distance of a forward jump +// default-set to 1024 in the above equation). +// +// * propeller-backward-jump-distance: maximum distance of a backward jump +// (default-set to 640 in the above equation). +// +// * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1) +// +// * propeller-forward-jump-weight: weight of a forward jump (default-set to +// 0.1) +// +// * propeller-backward-jump-weight: weight of a backward jump (default-set to +// 0.1) +// +// * propeller-chain-split-threshold: maximum binary size of a BB chain which +// the algorithm will consider for splitting (default-set to 128). +// +// References: +// * [1] A. Newell and S. Pupyrev, Improved Basic Block Reordering, available +// at https://arxiv.org/abs/1809.04676 +//===----------------------------------------------------------------------===// + +#include "NodeChainBuilder.h" + +using llvm::detail::DenseMapPair; + +namespace lld { +namespace propeller { + +// Initializes the node chains for each cfg and finds all the mutually-forced +// edges. +void NodeChainBuilder::init() { + for (ControlFlowGraph *cfg : CFGs) { + initNodeChains(*cfg); + initMutuallyForcedEdges(*cfg); + } +} + +// NodeChainBuilder calls this function after building all the chains to attach +// as many fall-throughs as possible. Given that the algorithm already optimizes +// the extend TSP score, this function will only affect the cold basic blocks +// and thus we do not need to consider the edge weights. +void NodeChainBuilder::attachFallThroughs() { + for (ControlFlowGraph *Cfg : CFGs) { + // First, try to keep the fall-throughs from the original order. + for (auto &Node : Cfg->Nodes) { + if (Node->FTEdge != nullptr) { + attachNodes(Node.get(), Node->FTEdge->Sink); + } + } + + // Sometimes, the original fall-throughs cannot be kept. So we try to find + // new fall-through opportunities which did not exist in the original order. + for (auto &Edge : Cfg->IntraEdges) { + if (Edge->Type == CFGEdge::EdgeType::INTRA_FUNC || + Edge->Type == CFGEdge::EdgeType::INTRA_DYNA) + attachNodes(Edge->Src, Edge->Sink); + } + } +} + +// This function sorts BB chains in decreasing order of their execution density. +// NodeChainBuilder calls this function at the end to ensure that hot BB chains +// are placed at the beginning of the function. +void NodeChainBuilder::coalesceChains() { + std::vector chainOrder; + for (DenseMapPair> &elem : Chains) + chainOrder.push_back(elem.second.get()); + + std::sort( + chainOrder.begin(), chainOrder.end(), [](NodeChain *c1, NodeChain *c2) { + if (!c1->CFG || !c2->CFG || c1->CFG != c2->CFG) + error("Attempting to coalesce chains belonging to different " + "functions."); + if (c1->Freq == 0 ^ c2->Freq == 0) + return c1->Freq != 0; + auto *entryNode = c1->CFG->getEntryNode(); + if (entryNode->Chain == c1) + return true; + if (entryNode->Chain == c2) + return false; + double c1ExecDensity = c1->execDensity(); + double c2ExecDensity = c2->execDensity(); + if (c1ExecDensity == c2ExecDensity) + return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr; + return c1ExecDensity > c2ExecDensity; + }); + + NodeChain *mergerChain = nullptr; + + for (NodeChain *c : chainOrder) { + if (!mergerChain) { + mergerChain = c; + continue; + } + // Create a cold partition when -propeller-split-funcs is set. + if (propellerConfig.optSplitFuncs && + (mergerChain->Freq == 0 ^ c->Freq == 0)) { + mergerChain = c; + continue; + } + mergeChains(mergerChain, c); + } +} + +// Merge two chains in the specified order. +void NodeChainBuilder::mergeChains(NodeChain *leftChain, + NodeChain *rightChain) { + if (leftChain->Freq == 0 ^ rightChain->Freq == 0) + error("Attempting to merge hot and cold chains: \n" + toString(*leftChain) + + "\nAND\n" + toString(*rightChain)); + + mergeInOutEdges(leftChain, rightChain); + + for (CFGNode *node : rightChain->Nodes) { + leftChain->Nodes.push_back(node); + node->Chain = leftChain; + node->ChainOffset += leftChain->Size; + } + leftChain->Size += rightChain->Size; + leftChain->Freq += rightChain->Freq; + leftChain->DebugChain |= rightChain->DebugChain; + if (leftChain->CFG != rightChain->CFG) + leftChain->CFG = nullptr; + + Chains.erase(rightChain->DelegateNode->MappedAddr); +} + +// This function tries to place two basic blocks immediately adjacent to each +// other (used for fallthroughs). Returns true if the basic blocks have been +// attached this way. +bool NodeChainBuilder::attachNodes(CFGNode *src, CFGNode *sink) { + if (sink->isEntryNode()) + return false; + + // Ignore edges between hot and cold basic blocks. + if (src->Freq == 0 ^ sink->Freq == 0) + return false; + NodeChain *srcChain = src->Chain; + NodeChain *sinkChain = sink->Chain; + // Skip this edge if the source and sink are in the same chain + if (srcChain == sinkChain) + return false; + + // It's possible to form a fall-through between src and sink only if + // they are respectively located at the end and beginning of their chains. + if (srcChain->Nodes.back() != src || sinkChain->Nodes.front() != sink) + return false; + // Attaching is possible. So we merge the chains in the corresponding order. + mergeChains(srcChain, sinkChain); + return true; +} + +void NodeChainBuilder::mergeInOutEdges(NodeChain *mergerChain, + NodeChain *mergeeChain) { + for (auto &elem : mergeeChain->OutEdges) { + NodeChain *c = (elem.first == mergeeChain) ? mergerChain : elem.first; + auto res = mergerChain->OutEdges.emplace(c, elem.second); + if (!res.second) + res.first->second.insert(res.first->second.end(), elem.second.begin(), + elem.second.end()); + else + c->InEdges.insert(mergerChain); + + c->InEdges.erase(mergeeChain); + } + + for (auto *c : mergeeChain->InEdges) { + if (c == mergeeChain) + continue; + auto &mergeeChainEdges = c->OutEdges[mergeeChain]; + auto &mergerChainEdges = c->OutEdges[mergerChain]; + + mergerChainEdges.insert(mergerChainEdges.end(), mergeeChainEdges.begin(), + mergeeChainEdges.end()); + mergerChain->InEdges.insert(c); + c->OutEdges.erase(mergeeChain); + } +} + +// Merge two BB sequences according to the given NodeChainAssembly. A +// NodeChainAssembly is an ordered triple of three slices from two chains. +void NodeChainBuilder::mergeChains( + std::unique_ptr assembly) { + if (assembly->splitChain()->Freq == 0 ^ assembly->unsplitChain()->Freq == 0) + error("Attempting to merge hot and cold chains: \n" + + toString(*assembly.get())); + + if (assembly->splitChain()->Freq == 0 ^ assembly->unsplitChain()->Freq == 0) + error("Attempting to merge hot and cold chains: \n" + + toString(*assembly.get())); + + // Decide which chain gets merged into the other chain, to make the reordering + // more efficient. + NodeChain *mergerChain = (assembly->MOrder == YX2X1) + ? assembly->unsplitChain() + : assembly->splitChain(); + NodeChain *mergeeChain = (assembly->MOrder == YX2X1) + ? assembly->splitChain() + : assembly->unsplitChain(); + + // Merge in and out edges of the two chains + mergeInOutEdges(mergerChain, mergeeChain); + + // Create the new node order according the given assembly + auto X1Begin = assembly->splitChain()->Nodes.begin(); + auto X2Begin = assembly->SlicePosition; + // Does X2 mark a function transition ? + bool X2FuncTransition = X2Begin != assembly->splitChain()->Nodes.begin() && + (*std::prev(X2Begin))->CFG != (*X2Begin)->CFG; + auto YBegin = assembly->unsplitChain()->Nodes.begin(); + + // If splitChain is truly splitted, reorder the splitted parts from X1X2 to + // X2X1 if needed. We can do this using splice because we are splicing the + // same list. + if (assembly->split() && + (assembly->MOrder == X2X1Y || assembly->MOrder == X2YX1 || + assembly->MOrder == YX2X1)) + assembly->splitChain()->Nodes.splice(X1Begin, assembly->splitChain()->Nodes, + X2Begin, + assembly->splitChain()->Nodes.end()); + + switch (assembly->MOrder) { + case X2X1Y: + // Splice Y at the end of X. + assembly->splitChain()->Nodes.splice(assembly->splitChain()->Nodes.end(), + assembly->unsplitChain()->Nodes); + break; + case X1YX2: + // Splice Y in the middle + assembly->splitChain()->Nodes.splice(X2Begin, + assembly->unsplitChain()->Nodes); + break; + case X2YX1: + // Splice Y in the middle + assembly->splitChain()->Nodes.splice(X1Begin, + assembly->unsplitChain()->Nodes); + break; + case YX2X1: + // Splice X at the end of Y + assembly->unsplitChain()->Nodes.splice( + assembly->unsplitChain()->Nodes.end(), assembly->splitChain()->Nodes); + break; + default: + break; + } + + if (propellerConfig.optReorderIP) { + // Merge function transitions of mergerChain with those of mergeeChain. + mergerChain->FunctionTransitions.splice( + mergerChain->FunctionTransitions.end(), + mergeeChain->FunctionTransitions); + + // Add the beginnings of slices if they now mark a function transition. + std::vector::iterator> allSlicesBegin; + // YBegin should always be examined. + allSlicesBegin.push_back(YBegin); + // If X2Begin was a function transition point before, we don't examine it + // again as it will remain in the transition points. + if (!X2FuncTransition) + allSlicesBegin.push_back(X2Begin); + // X1Begin will only be examined if this is a splitting assembly. Otherwise, + // X1 is empty. + if (assembly->split()) + allSlicesBegin.push_back(X1Begin); + + // Check if any of the slices mark a function transition position and add + // those to the function transition list. + for (auto it : allSlicesBegin) + if (it != mergerChain->Nodes.begin() && + (*std::prev(it))->CFG != (*it)->CFG) + mergerChain->FunctionTransitions.push_back(it); + } + + // Set the starting and ending point for updating the nodes's chain and offset + // in the new chain. + auto chainBegin = mergerChain->Nodes.begin(); + uint64_t chainBeginOffset = 0; + + if (assembly->MOrder == X1YX2) { + chainBegin = YBegin; + chainBeginOffset = assembly->Slices[0].size(); + } + + if (assembly->MOrder == YX2X1) { + chainBegin = X2Begin; + chainBeginOffset = assembly->Slices[0].size(); + } + + if (!assembly->split()) { + chainBegin = YBegin; + chainBeginOffset = assembly->splitChain()->Size; + } + + uint64_t runningOffset = chainBeginOffset; + + // Update nodeOffsetMap and nodeToChainMap for all the nodes in the sequence. + for (auto it = chainBegin; it != mergerChain->Nodes.end(); ++it) { + CFGNode *node = *it; + node->Chain = mergerChain; + node->ChainOffset = runningOffset; + runningOffset += node->ShSize; + } + + mergerChain->Size = runningOffset; + + // Update the total frequency and ExtTSP score of the aggregated chain + mergerChain->Freq += mergerChain->Freq; + + // We have already computed the new score in the assembly record. So we can + // update the score based on that and the other chain's score. + mergerChain->Score += mergeeChain->Score + assembly->ScoreGain; + + mergerChain->DebugChain |= mergeeChain->DebugChain; + + if (mergerChain->CFG != mergeeChain->CFG) + mergerChain->CFG = nullptr; + + // Merge the assembly candidate chains of the two chains into the candidate + // chains of the remaining NodeChain and remove the records for the defunct + // NodeChain. + for (NodeChain *c : CandidateChains[mergeeChain]) { + NodeChainAssemblies.erase(std::make_pair(c, mergeeChain)); + NodeChainAssemblies.erase(std::make_pair(mergeeChain, c)); + CandidateChains[c].erase(mergeeChain); + if (c != mergerChain) + CandidateChains[mergerChain].insert(c); + } + + // Update the NodeChainAssembly for all candidate chains of the merged + // NodeChain. Remove a NodeChain from the merge chain's candidates if the + // NodeChainAssembly update finds no gain. + auto &mergerChainCandidateChains = CandidateChains[mergerChain]; + + for (auto CI = mergerChainCandidateChains.begin(), + CE = mergerChainCandidateChains.end(); + CI != CE;) { + NodeChain *otherChain = *CI; + auto &otherChainCandidateChains = CandidateChains[otherChain]; + + bool x = updateNodeChainAssembly(otherChain, mergerChain); + + if (!x) + NodeChainAssemblies.erase(std::make_pair(otherChain, mergerChain)); + + bool y = updateNodeChainAssembly(mergerChain, otherChain); + + if (!y) + NodeChainAssemblies.erase(std::make_pair(mergerChain, otherChain)); + + if (x || y) { + otherChainCandidateChains.insert(mergerChain); + CI++; + } else { + otherChainCandidateChains.erase(mergerChain); + CI = mergerChainCandidateChains.erase(CI); + } + } + + // remove all the candidate chain records for the merged-in chain + CandidateChains.erase(mergeeChain); + + // Finally, remove the defunct (merged-in) chain record from the chains + Chains.erase(mergeeChain->DelegateNode->MappedAddr); +} + +// Calculate the Extended TSP metric for a BB chain. +// This function goes over all the BBs in the chain and for BB chain and +// aggregates the score of all edges which are contained in the same chain. +double NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const { + double score = 0; + + auto visit = [&score](CFGEdge &edge, NodeChain *srcChain, + NodeChain *sinkChain) { + assert(srcChain == sinkChain); + auto srcOffset = edge.Src->ChainOffset; + auto sinkOffset = edge.Sink->ChainOffset; + bool edgeForward = srcOffset + edge.Src->ShSize <= sinkOffset; + // Calculate the distance between src and sink + uint64_t distance = edgeForward ? sinkOffset - srcOffset - edge.Src->ShSize + : srcOffset - sinkOffset + edge.Src->ShSize; + score += getEdgeExtTSPScore(edge, edgeForward, distance); + }; + + chain->forEachOutEdgeToChain(chain, visit); + + return score; +} + +// Updates the best NodeChainAssembly between two NodeChains. The existing +// record will be replaced by the new NodeChainAssembly if a non-zero gain +// is achieved. Otherwise, it will be removed. +// If a nodechain assembly record is (kept) inserted, returns true. Otherwise +// returns false. +bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain, + NodeChain *unsplitChain) { + // Only consider splitting the chain if the size of the chain is smaller than + // a threshold. + bool doSplit = (splitChain->Size <= propellerConfig.optChainSplitThreshold); + auto slicePosEnd = + doSplit ? splitChain->Nodes.end() : std::next(splitChain->Nodes.begin()); + + std::unique_ptr bestAssembly(nullptr); + + for (auto slicePos = splitChain->Nodes.begin(); slicePos != slicePosEnd; + ++slicePos) { + // Do not split the mutually-forced edges in the chain. + if (slicePos != splitChain->Nodes.begin() && + MutuallyForcedOut.count(*std::prev(slicePos))) + continue; + + // If the split position is at the beginning (no splitting), only consider + // one MergeOrder + auto mergeOrderEnd = (slicePos == splitChain->Nodes.begin()) + ? MergeOrder::BeginNext + : MergeOrder::End; + + for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) { + MergeOrder mOrder = static_cast(MI); + + // Create the NodeChainAssembly representing this particular assembly. + auto NCA = std::unique_ptr( + new NodeChainAssembly(splitChain, unsplitChain, slicePos, mOrder)); + + if (!NCA->isValid()) + continue; + + if (!bestAssembly || NodeChainAssemblyComparator(bestAssembly, NCA)) + bestAssembly = std::move(NCA); + } + } + + if (propellerConfig.optReorderIP && !doSplit) { + // For inter-procedural layout, we always try splitting at the function + // entries, regardless of the split-chain's size. + for (auto slicePos : splitChain->FunctionTransitions) { + for (uint8_t MI = MergeOrder::Begin; MI != MergeOrder::End; MI++) { + MergeOrder mOrder = static_cast(MI); + + // Create the NodeChainAssembly representing this particular assembly. + auto NCA = std::unique_ptr( + new NodeChainAssembly(splitChain, unsplitChain, slicePos, mOrder)); + + if (!NCA->isValid()) + continue; + + // Update bestAssembly if needed + if (!bestAssembly || NodeChainAssemblyComparator(bestAssembly, NCA)) + bestAssembly = std::move(NCA); + } + } + } + + // Insert the best assembly of the two chains (only if it has positive gain). + if (bestAssembly) { + if (splitChain->DebugChain || unsplitChain->DebugChain) + fprintf(stderr, "INSERTING ASSEMBLY: %s\n", + toString(*bestAssembly).c_str()); + + NodeChainAssemblies.insert(bestAssembly->ChainPair, + std::move(bestAssembly)); + return true; + } else + return false; +} + +void NodeChainBuilder::initNodeChains(ControlFlowGraph &cfg) { + for (auto &node : cfg.Nodes) { + NodeChain *chain = new NodeChain(node.get()); + node->Chain = chain; + node->ChainOffset = 0; + Chains.try_emplace(node->MappedAddr, chain); + } +} + +// Find all the mutually-forced edges. +// These are all the edges which are -- based on the profile -- the only +// (executed) outgoing edge from their source node and the only (executed) +// incoming edges to their sink nodes +void NodeChainBuilder::initMutuallyForcedEdges(ControlFlowGraph &cfg) { + DenseMap mutuallyForcedOut; + DenseSet mutuallyForcedIn; + + auto l = prop->BBLayouts.find(cfg.Name); + if (l != prop->BBLayouts.end()) { + CFGNode *lastNode = nullptr; + for (auto ordinal : l->second) { + auto r = Chains.find(ordinal); + if (r == Chains.end()) { + lastNode = nullptr; + continue; + } + CFGNode *thisNode = r->second->DelegateNode; + if (lastNode) { + mutuallyForcedOut.try_emplace(lastNode, thisNode); + mutuallyForcedIn.insert(thisNode); + } + lastNode = thisNode; + } + } + + DenseMap> profiledOuts; + DenseMap> profiledIns; + + for (auto &node : cfg.Nodes) { + if (!mutuallyForcedOut.count(node.get())) { + std::copy_if(node->Outs.begin(), node->Outs.end(), + std::back_inserter(profiledOuts[node.get()]), + [&mutuallyForcedIn](CFGEdge *edge) { + return !mutuallyForcedIn.count(edge->Sink) && + (edge->Type == CFGEdge::EdgeType::INTRA_FUNC || + edge->Type == CFGEdge::EdgeType::INTRA_DYNA) && + edge->Weight != 0; + }); + } + if (!mutuallyForcedIn.count(node.get())) { + std::copy_if(node->Ins.begin(), node->Ins.end(), + std::back_inserter(profiledIns[node.get()]), + [&mutuallyForcedOut](CFGEdge *edge) { + return !mutuallyForcedOut.count(edge->Src) && + (edge->Type == CFGEdge::EdgeType::INTRA_FUNC || + edge->Type == CFGEdge::EdgeType::INTRA_DYNA) && + edge->Weight != 0; + }); + } + } + + for (auto &node : cfg.Nodes) { + if (profiledOuts[node.get()].size() == 1) { + CFGEdge *edge = profiledOuts[node.get()].front(); + if (edge->Type != CFGEdge::EdgeType::INTRA_FUNC && + edge->Type != CFGEdge::EdgeType::INTRA_DYNA) + continue; + if (profiledIns[edge->Sink].size() == 1) + mutuallyForcedOut.try_emplace(node.get(), edge->Sink); + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + DenseMap nodeToPathMap; + SmallVector cycleCutNodes; + unsigned pathCount = 0; + for (auto it = mutuallyForcedOut.begin(); it != mutuallyForcedOut.end(); + ++it) { + // Check to see if the node (and its cycle) have already been visited. + if (nodeToPathMap[it->first]) + continue; + CFGEdge *victimEdge = nullptr; + auto nodeIt = it; + pathCount++; + while (nodeIt != mutuallyForcedOut.end()) { + CFGNode *node = nodeIt->first; + unsigned path = nodeToPathMap[node]; + if (path != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (path == pathCount) { + // We have found a cycle: add the victim edge + cycleCutNodes.push_back(victimEdge->Src); + } + break; + } else + nodeToPathMap[node] = pathCount; + if (!profiledOuts[node].empty()) { + CFGEdge *edge = profiledOuts[node].front(); + if (!victimEdge || + (edge->Sink->MappedAddr < victimEdge->Sink->MappedAddr)) { + victimEdge = edge; + } + } + nodeIt = mutuallyForcedOut.find(nodeIt->second); + } + } + + // Remove the victim edges to break cycles in the mutually forced edges + for (CFGNode *node : cycleCutNodes) + mutuallyForcedOut.erase(node); + + for (auto &elem : mutuallyForcedOut) + MutuallyForcedOut.insert(elem); +} + +// This function initializes the ExtTSP algorithm's data structures. This +// the NodeChainAssemblies and the CandidateChains maps. +void NodeChainBuilder::initializeExtTSP() { + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. + CandidateChains.clear(); + for (NodeChain *chain : Components[CurrentComponent]) + chain->Score = chain->Freq ? computeExtTSPScore(chain) : 0; + + DenseSet> visited; + + for (NodeChain *chain : Components[CurrentComponent]) { + for (auto &chainEdge : chain->OutEdges) { + NodeChain *otherChain = chainEdge.first; + if (chain == otherChain) + continue; + if (visited.count(std::make_pair(chain, otherChain))) + continue; + + bool x = updateNodeChainAssembly(chain, otherChain); + bool y = updateNodeChainAssembly(otherChain, chain); + + if (x || y) { + CandidateChains[chain].insert(otherChain); + CandidateChains[otherChain].insert(chain); + } + visited.insert(std::make_pair(chain, otherChain)); + visited.insert(std::make_pair(otherChain, chain)); + } + } +} + +void NodeChainBuilder::initializeChainComponents() { + + DenseMap chainToComponentMap; + unsigned componentId = 0; + for (DenseMapPair> &elem : Chains) { + NodeChain *chain = elem.second.get(); + // Cold chains will not be placed in any component. + if (!chain->Freq) + continue; + // Skip if this chain has already been assigned to a component. + if (chainToComponentMap.count(chain)) + continue; + // Start creating the component containing this chain and its connected + // chains. + chainToComponentMap[chain] = componentId; + std::vector toVisit(1, chain); + unsigned index = 0; + + // Find the connected component using BFS. + while (index != toVisit.size()) { + auto *tchain = toVisit[index++]; + for (auto *c : tchain->InEdges) { + if (!chainToComponentMap.count(c)) { + chainToComponentMap[c] = componentId; + toVisit.push_back(c); + } + } + for (auto &e : tchain->OutEdges) { + if (!chainToComponentMap.count(e.first)) { + chainToComponentMap[e.first] = componentId; + toVisit.push_back(e.first); + } + } + } + Components.push_back(std::move(toVisit)); + componentId++; + } +} + +void NodeChainBuilder::mergeAllChains() { + // Attach the mutually-foced edges (which will not be split anymore by the + // Extended TSP algorithm). + for (auto &elem : MutuallyForcedOut) + attachNodes(elem.first, elem.second); + + // Set up the outgoing edges for every chain + for (auto &elem : Chains) { + auto *chain = elem.second.get(); + // Ignore cold chains as they cannot have any hot edges to other nodes + if (!chain->Freq) + continue; + auto addEdge = [chain](CFGEdge &edge) { + // Ignore returns and zero-frequency edges as these edges will not be used + // by the ExtTSP score algorithm. + if (!edge.Weight || edge.isReturn()) + return; + auto *sinkNodeChain = edge.Sink->Chain; + chain->OutEdges[sinkNodeChain].push_back(&edge); + sinkNodeChain->InEdges.insert(chain); + }; + + if (propellerConfig.optReorderIP) { + for (CFGNode *node : chain->Nodes) + node->forEachOutEdgeRef(addEdge); + } else { + for (CFGNode *node : chain->Nodes) + node->forEachIntraOutEdgeRef(addEdge); + } + } + + initializeChainComponents(); + + for (CurrentComponent = 0; CurrentComponent < Components.size(); + ++CurrentComponent) { + fprintf(stderr, "COMPONENT: %u -> SIZE: %zu\n", CurrentComponent, + Components[CurrentComponent].size()); + // Initialize the Extended TSP algorithm's data. + initializeExtTSP(); + + // Keep merging the chain assembly record with the highest ExtTSP gain, + // until no more gain is possible. + while (!NodeChainAssemblies.empty()) { + auto bestAssembly = NodeChainAssemblies.top(); + NodeChainAssemblies.pop(); + if (bestAssembly->splitChain()->DebugChain || + bestAssembly->unsplitChain()->DebugChain) + fprintf(stderr, "MERGING for %s\n", + toString(*bestAssembly.get()).c_str()); + mergeChains(std::move(bestAssembly)); + } + } +} + +void NodeChainBuilder::doOrder(std::unique_ptr &CC) { + init(); + + mergeAllChains(); + + // Merge fallthrough basic blocks if we have missed any + attachFallThroughs(); + + if (!propellerConfig.optReorderIP) { + // If this is not inter-procedural, coalesce all hot chains into a single + // hot chain and all cold chains into a single cold chain. + coalesceChains(); + assert(CFGs.size() == 1 && Chains.size() <= 2); + +#ifdef PROPELLER_PROTOBUF + ControlFlowGraph *cfg = CFGs.back(); + if (prop->protobufPrinter) { + std::list nodeOrder; + for (auto &elem : Chains) { + auto *chain = elem.second.get(); + nodeOrder.insert(chain->Freq ? nodeOrder.begin() : nodeOrder.end(), + chain->Nodes.begin(), chain->Nodes.end()); + } + prop->protobufPrinter->addCFG(*cfg, &nodeOrder); + } +#endif + } + + // Hand in the built chains to the chain clustering algorithm + for (auto &elem : Chains) + CC->addChain(std::move(elem.second)); +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.h @@ -0,0 +1,143 @@ +//===- PropellerChainClustering.h ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H +#define LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H + +#include "NodeChain.h" + +#include "llvm/ADT/DenseMap.h" + +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +// This is the base class for clustering chains. It provides the basic data +// structures and functions. +class ChainClustering { +public: + class Cluster { + public: + // Constructor for building a cluster from a single chain. + Cluster(NodeChain *); + + // The chains in this cluster in the merged order. + std::vector Chains; + + // The representative chain for this cluster. + NodeChain *DelegateChain; + + // Total size of the cluster. + uint64_t Size; + + // Total frequency of the cluster. + uint64_t Freq; + + // This function merges this cluster with another cluster + Cluster &mergeWith(Cluster &other) { + Chains.insert(Chains.end(), other.Chains.begin(), other.Chains.end()); + this->Freq += other.Freq; + this->Size += other.Size; + return *this; + } + + // Returns the execution density of this cluster + double getDensity() { return ((double)Freq / Size); } + }; + + // This function merges two clusters in the given order, and updates + // ChainToClusterMap for the chains in the mergee cluster and also removes the + // mergee cluster from the Clusters. + void mergeTwoClusters(Cluster *predecessorCluster, Cluster *cluster) { + // Join the two clusters into predecessorCluster. + predecessorCluster->mergeWith(*cluster); + + // Update chain to cluster mapping, because all chains that were + // previsously in cluster are now in predecessorCluster. + for (NodeChain *c : cluster->Chains) { + ChainToClusterMap[c] = predecessorCluster; + } + + // Delete the defunct cluster + Clusters.erase(cluster->DelegateChain->DelegateNode->MappedAddr); + } + + // Adds a chain to the input of the clustering process + void addChain(std::unique_ptr &&chain_ptr); + + // Any subclass should override this function. + virtual void doOrder(std::vector &hotOrder, + std::vector &coldOrder); + + virtual ~ChainClustering() = default; + +protected: + // Optional function for merging clusters, which could be overriden by + // subclasses. + virtual void mergeClusters(){}; + + // This function sorts the final clusters in decreasing order of their + // execution density. + void sortClusters(std::vector &); + + // This function initializes clusters with each cluster including a single + // chain. + void initClusters() { + for (auto &c_ptr : HotChains) { + NodeChain *chain = c_ptr.get(); + Cluster *cl = new Cluster(chain); + cl->Freq = chain->Freq; + cl->Size = std::max(chain->Size, (uint64_t)1); + ChainToClusterMap[chain] = cl; + Clusters.try_emplace(cl->DelegateChain->DelegateNode->MappedAddr, cl); + } + } + + // Chains processed by the clustering algorithm separated into hot and cold + // ones based on their frequency. + std::vector> HotChains, ColdChains; + + // All clusters currently in process. + DenseMap> Clusters; + + // This maps every chain to its containing cluster. + DenseMap ChainToClusterMap; +}; + +// This class computes an ordering for the input chains which conforms to the +// initial ordering of nodes in the binary. +class NoOrdering : public ChainClustering { +public: + void doOrder(std::vector &hotOrder, + std::vector &coldOrder); +}; + +// This class implements the call-chain-clustering algorithm. +class CallChainClustering : public ChainClustering { +private: + Cluster *getMostLikelyPredecessor(NodeChain *chain, Cluster *cluster); + void mergeClusters(); +}; + +} // namespace propeller +} // namespace lld + +namespace std { +template <> struct less { + bool operator()(const lld::propeller::ChainClustering::Cluster *c1, + const lld::propeller::ChainClustering::Cluster *c2) const { + return less()(c1->DelegateChain, + c2->DelegateChain); + } +}; + +} // namespace std + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp @@ -0,0 +1,230 @@ +//===- PropellerChainClustering.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of the Call-Chain-Clustering +// algorithm as described in [1]. +// +// The algorithm iteratively merges chains into clusters. To do so, it processes +// the basic block chains in decreasing order of +// their execution density and for each chain, merges its cluster with its +// most-frequent predecessor cluster. The merging of a cluster stops when it +// reaches the -propeller-cluster-merge-size-threshold (set to 2MB by default). +// +// [1] Guilherme Ottoni and Bertrand Maher. Optimizing Function Placement for +// Large-Scale Data-Center Applications. Available at +// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +//===----------------------------------------------------------------------===// +#include "NodeChainClustering.h" +#include "PropellerConfig.h" + +using llvm::detail::DenseMapPair; + +namespace lld { +namespace propeller { + +void ChainClustering::addChain(std::unique_ptr &&chain_ptr) { + for (CFGNode *n : chain_ptr->Nodes) + n->Chain = chain_ptr.get(); + auto &chainList = + ((propellerConfig.optReorderIP || propellerConfig.optSplitFuncs || + propellerConfig.optReorderFuncs) && + chain_ptr->Freq == 0) + ? ColdChains + : HotChains; + chainList.push_back(std::move(chain_ptr)); +} + +// Initializes a cluster containing a single chain an associates it with a +// unique id. +ChainClustering::Cluster::Cluster(NodeChain *chain) + : Chains(1, chain), DelegateChain(chain) {} + +// Returns the most frequent predecessor of a chain. This function also gets as +// the second parameter the cluster containing the chain to save a lookup +// into ChainToClusterMap. +ChainClustering::Cluster * +CallChainClustering::getMostLikelyPredecessor(NodeChain *chain, + Cluster *cluster) { + // This map stores the total weight of the incoming edges to the given cluster + // from every other cluster. + DenseMap clusterEdge; + + for (CFGNode *n : chain->Nodes) { + // For non-inter-procedural, we only consider the function entries. + if (!propellerConfig.optReorderIP && !n->isEntryNode()) + continue; + auto visit = [&clusterEdge, n, chain, cluster, this](CFGEdge &edge) { + if (!edge.Weight || edge.isReturn()) + return; + if (!propellerConfig.optReorderIP && !edge.isCall()) + return; + auto *callerChain = edge.Src->Chain; + if (!callerChain) { + warn("Caller for node: " + n->ShName + " does not have a chain!"); + return; + } + auto *callerCluster = ChainToClusterMap[callerChain]; + if (callerChain == chain || callerCluster == cluster) + return; + // Ignore clusters which are too big + if (callerCluster->Size > propellerConfig.optClusterMergeSizeThreshold) + return; + // Ignore edges which are cold relative to the sink node + if (edge.Weight * 10 < n->Freq) + return; + // Do not merge if the caller cluster's density would degrade by more than + // 1/8 by the merge. + if (8 * callerCluster->Size * (cluster->Freq * callerCluster->Freq) < + callerCluster->Freq * (cluster->Size + callerCluster->Size)) + return; + clusterEdge[callerCluster] += edge.Weight; + }; + n->forEachInEdgeRef(visit); + } + + // Get the highest-frequency caller + auto bestCaller = + std::max_element(clusterEdge.begin(), clusterEdge.end(), + [](const DenseMapPair &p1, + const DenseMapPair &p2) { + if (p1.second == p2.second) // Consistently break ties + return std::less()(p1.first, p2.first); + return p1.second < p2.second; + }); + + if (bestCaller == clusterEdge.end()) + return nullptr; + return bestCaller->first; +} + +void ChainClustering::sortClusters(std::vector &clusterOrder) { + for (auto &p : Clusters) + clusterOrder.push_back(p.second.get()); + + auto clusterComparator = [](Cluster *c1, Cluster *c2) -> bool { + // Set a deterministic order when execution densities are equal. + if (c1->getDensity() == c2->getDensity()) + return c1->DelegateChain->DelegateNode->MappedAddr < + c2->DelegateChain->DelegateNode->MappedAddr; + return c1->getDensity() > c2->getDensity(); + }; + + std::sort(clusterOrder.begin(), clusterOrder.end(), clusterComparator); +} + +void NoOrdering::doOrder(std::vector &hotOrder, + std::vector &coldOrder) { + auto chainComparator = [](const std::unique_ptr &c_ptr1, + const std::unique_ptr &c_ptr2) -> bool { + return c_ptr1->DelegateNode->MappedAddr < c_ptr2->DelegateNode->MappedAddr; + }; + + std::sort(HotChains.begin(), HotChains.end(), chainComparator); + std::sort(ColdChains.begin(), ColdChains.end(), chainComparator); + + for (auto &c_ptr : HotChains) + for (CFGNode *n : c_ptr->Nodes) + hotOrder.push_back(n); + + for (auto &c_ptr : ColdChains) + for (CFGNode *n : c_ptr->Nodes) + coldOrder.push_back(n); +} + +// Merge clusters together based on the CallChainClustering algorithm. +void CallChainClustering::mergeClusters() { + // Build a map for the execution density of each chain. + DenseMap chainWeightMap; + + for (auto &c_ptr : HotChains) { + NodeChain *chain = c_ptr.get(); + chainWeightMap.try_emplace(chain, chain->execDensity()); + } + + // Sort the hot chains in decreasing order of their execution density. + std::sort(HotChains.begin(), HotChains.end(), + [&chainWeightMap](const std::unique_ptr &c_ptr1, + const std::unique_ptr &c_ptr2) { + auto chain1Weight = chainWeightMap[c_ptr1.get()]; + auto chain2Weight = chainWeightMap[c_ptr2.get()]; + if (chain1Weight == chain2Weight) + return c_ptr1->DelegateNode->MappedAddr < + c_ptr2->DelegateNode->MappedAddr; + return chain1Weight > chain2Weight; + }); + + for (auto &c_ptr : HotChains) { + NodeChain *chain = c_ptr.get(); + if (chainWeightMap[chain] <= 0.005) + break; + auto *cluster = ChainToClusterMap[chain]; + // Ignore merging if the cluster containing this function is bigger than + // 2MBs (size of a large page). + if (cluster->Size > propellerConfig.optClusterMergeSizeThreshold) + continue; + assert(cluster); + + Cluster *predecessorCluster = getMostLikelyPredecessor(chain, cluster); + if (!predecessorCluster) + continue; + + mergeTwoClusters(predecessorCluster, cluster); + } +} + +// This function orders the hot chains based on mergeClusters and sortClusters +// and uses a consistent ordering for the cold part. +void ChainClustering::doOrder(std::vector &hotOrder, + std::vector &coldOrder) { + initClusters(); + mergeClusters(); + std::vector clusterOrder; + sortClusters(clusterOrder); + // This maps every CFG to the position of its first hot node in the layout. + DenseMap hotCFGOrder; + + for (Cluster *cl : clusterOrder) + for (NodeChain *c : cl->Chains) + for (CFGNode *n : c->Nodes) { + hotCFGOrder.try_emplace(n->CFG, hotOrder.size()); + hotOrder.push_back(n); + } + + auto coldChainComparator = + [&hotCFGOrder](const std::unique_ptr &c_ptr1, + const std::unique_ptr &c_ptr2) -> bool { + // If each of the chains comes from a single CFG, we can order the chains + // consistently based on the hot layout + if (c_ptr1->CFG && c_ptr2->CFG) { + // If one CFG is cold and the other is hot, make sure the hot CFG's chain + // comes earlier in the order. + if (c_ptr1->CFG->isHot() != c_ptr2->CFG->isHot()) + return c_ptr1->CFG->isHot(); + // If both CFGs are hot, make the order for cold chains consistent with + // the order for hot ones. + if (c_ptr1->CFG->isHot() && c_ptr2->CFG->isHot() && + (c_ptr1->CFG != c_ptr2->CFG)) + return hotCFGOrder[c_ptr1->CFG] < hotCFGOrder[c_ptr2->CFG]; + // Otherwise, use a consistent order based on the representative nodes. + return c_ptr1->DelegateNode->MappedAddr < + c_ptr2->DelegateNode->MappedAddr; + } + // Use an order consistent with the initial ordering of the representative + // nodes. + return c_ptr1->DelegateNode->MappedAddr < c_ptr2->DelegateNode->MappedAddr; + }; + + std::sort(ColdChains.begin(), ColdChains.end(), coldChainComparator); + + for (auto &c_ptr : ColdChains) + for (CFGNode *n : c_ptr->Nodes) + coldOrder.push_back(n); +} + +} // namespace propeller +} // namespace lld Index: lld/test/ELF/propeller/codelayout/function-ordering.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/function-ordering.s @@ -0,0 +1,75 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises function reordering on three functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. +## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/codelayout/function-with-loop.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/function-with-loop.s @@ -0,0 +1,130 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 14 Nfoo" > %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 3 1.2" >> %t_prof.propeller +# RUN: echo "4 5 1.3" >> %t_prof.propeller +# RUN: echo "5 4 1.4" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "4 2 90" >> %t_prof.propeller +# RUN: echo "2 4 95" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 10" >> %t_prof.propeller +# RUN: echo "1 2 5" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 foo: +# REORDER-NEXT: nopl (%rax) + +# REORDER: 0000000000201123 a.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne 9 + +# REORDER: 0000000000201128 aaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne -10 + +# REORDER: 000000000020112d aaaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: retq + +# REORDER: 0000000000201131 aa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jmp -14 + +## Disable basic block reordering and expect that the original bb order is retained. +# +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out +# RUN: diff %t.propeller.noreorder.out %t.out + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +foo: + nopl (%rax) + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + jmp aaa.BB.foo +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + jne a.BB.foo + jmp aaaa.BB.foo +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits,unique,4 +aaaa.BB.foo: + nopl (%rax) + ret +.Laaaa.BB.foo_end: + .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo Index: lld/test/ELF/propeller/codelayout/opt-all-combinations.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/opt-all-combinations.s @@ -0,0 +1,295 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises all combinations of propeller options (reorder-funcs, reorder-blocks, +## and split-funcs) on four functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: Disassembly of section .text: +# BEFORE-EMPTY: +# BEFORE-NEXT: foo: +# BEFORE-NEXT: xorb %al, 0 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: bar: +# BEFORE-NEXT: xorb %al, 1 +# BEFORE-NEXT: je 9 +# BEFORE-EMPTY: +# BEFORE-NEXT: a.BB.bar: +# BEFORE-NEXT: xorb %al, 2 +# BEFORE-NEXT: jmp 7 +# BEFORE-EMPTY: +# BEFORE-NEXT: aa.BB.bar: +# BEFORE-NEXT: xorb %al, 3 +# BEFORE-EMPTY: +# BEFORE-NEXT: aaa.BB.bar: +# BEFORE-NEXT: xorb %al, 4 +# BEFORE-EMPTY: +# BEFORE-NEXT: baz: +# BEFORE-NEXT: xorb %al, 5 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: qux: +# BEFORE-NEXT: xorb %al, 6 +# + +## Create a propeller profile for four functions foo, bar, baz, and qux based on the cfg below: +## +## +## bar +## / \ +## / \100 +## / \ +## 100 v v +## foo <----- a.BB.bar aa.BB.bar baz qux ---+ +## \ / ^ | +## \ /100 | 1 | +## \ / +-----+ +## v v +## aaa.BB.bar +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 20 Nbar" >> %t_prof.propeller +# RUN: echo "3 9 2.1" >> %t_prof.propeller +# RUN: echo "4 7 2.2" >> %t_prof.propeller +# RUN: echo "5 7 2.3" >> %t_prof.propeller +# RUN: echo "6 8 Nbaz" >> %t_prof.propeller +# RUN: echo "7 7 Nqux" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "4 1 100 C" >> %t_prof.propeller +# RUN: echo "7 7 10 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller --verbose -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: Disassembly of section .text: +# REORDER-EMPTY: +# REORDER-NEXT: bar: +# REORDER-NEXT: xorb %al, 1 +# REORDER-NEXT: jne 30 +# REORDER-EMPTY: +# REORDER-NEXT: aa.BB.bar: +# REORDER-NEXT: xorb %al, 3 +# REORDER-EMPTY: +# REORDER-NEXT: aaa.BB.bar: +# REORDER-NEXT: xorb %al, 4 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: foo: +# REORDER-NEXT: xorb %al, 0 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: qux: +# REORDER-NEXT: xorb %al, 6 +# REORDER-EMPTY: +# REORDER-NEXT: a.BB.bar: +# REORDER-NEXT: xorb %al, 2 +# REORDER-NEXT: jmp -32 +# REORDER-EMPTY: +# REORDER-NEXT: baz: +# REORDER-NEXT: xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorderblocks.out 2>&1 | FileCheck %s --check-prefixes=WARN,IMPLICITNOSPLIT +# WARN-NOT: warning: +# IMPLICITNOSPLIT: warning: propeller: no-reorder-blocks implicitly sets no-split-funcs. +# +# RUN: llvm-objdump -d %t.propeller.noreorderblocks.out| FileCheck %s --check-prefix=NO_REORDER_BB +# +# NO_REORDER_BB: Disassembly of section .text: +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: bar: +# NO_REORDER_BB-NEXT: xorb %al, 1 +# NO_REORDER_BB-NEXT: je 9 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: a.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 2 +# NO_REORDER_BB-NEXT: jmp 7 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aaa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 4 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: foo: +# NO_REORDER_BB-NEXT: xorb %al, 0 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: qux: +# NO_REORDER_BB-NEXT: xorb %al, 6 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: baz: +# NO_REORDER_BB-NEXT: xorb %al, 5 + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.noreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.noreorderfuncs.out| FileCheck %s --check-prefix=NO_REORDER_FUNC +# +# NO_REORDER_FUNC: Disassembly of section .text: +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: foo: +# NO_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_REORDER_FUNC-NEXT: jne 22 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: qux: +# NO_REORDER_FUNC-NEXT: xorb %al, 6 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: a.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_REORDER_FUNC-NEXT: jmp -24 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: baz: +# NO_REORDER_FUNC-NEXT: xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_FUNC +# +# NO_SPLIT_FUNC: Disassembly of section .text: +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_FUNC-NEXT: jne 14 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_FUNC-NEXT: jmp -16 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: foo: +# NO_SPLIT_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: qux: +# NO_SPLIT_FUNC-NEXT: xorb %al, 6 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: baz: +# NO_SPLIT_FUNC-NEXT: xorb %al, 5 +# + +## Check that the combination of no-reorder-blocks and split-funcs makes lld fail. +# RUN: not ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-opt=split-funcs 2>&1 -o /dev/null | FileCheck %s --check-prefix=FAIL +# FAIL: propeller: Inconsistent combination of propeller optimizations 'split-funcs' and 'no-reorder-blocks'. + + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitreorderfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_REORDER_FUNC +# +# NO_SPLIT_REORDER_FUNC: Disassembly of section .text: +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: foo: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_REORDER_FUNC-NEXT: jne 14 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_REORDER_FUNC-NEXT: jmp -16 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: baz: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 5 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: qux: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 6 + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +.align 4 +foo: + xor %al,0 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,1 +# -- Begin function bar +.type bar,@function +.align 4 +bar: + xor %al,1 + je aa.BB.bar + jmp a.BB.bar + +.section .text,"ax",@progbits,unique,2 +a.BB.bar: + xor %al,2 + jmp aaa.BB.bar +.La.BB.bar_end: + .size a.BB.bar, .La.BB.bar_end-a.BB.bar + +.section .text,"ax",@progbits,unique,3 +aa.BB.bar: + xor %al,3 + jmp aaa.BB.bar +.Laa.BB.bar_end: + .size aa.BB.bar, .Laa.BB.bar_end-aa.BB.bar + +.section .text,"ax",@progbits,unique,4 +aaa.BB.bar: + xor %al,4 +.Laaa.BB.bar_end: + .size aaa.BB.bar, .Laaa.BB.bar_end-aaa.BB.bar + +.section .text,"ax",@progbits,unique,1 +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,5 +# -- Begin function baz +.type baz,@function +.align 4 +baz: + xor %al,5 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz + +.section .text,"ax",@progbits,unique,6 +# -- Begin function qux +.type qux,@function +.align 4 +qux: + xor %al,6 + +.Lqux_end: + .size qux, .Lqux_end-qux +# -- End function qux Index: lld/test/ELF/propeller/codelayout/optimal-fallthrough.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/optimal-fallthrough.s @@ -0,0 +1,87 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises optimal basic block reordering one a single function. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -propeller-debug-symbol=foo -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo Index: lld/test/ELF/symbol-alignment-file.s =================================================================== --- /dev/null +++ lld/test/ELF/symbol-alignment-file.s @@ -0,0 +1,29 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.out +# RUN: llvm-objdump -s %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: Contents of section .foo: +# BEFORE-NEXT: 201120 11cccccc 2233 + +# RUN: echo "_foo2 1" > %t_alignment.txt +# RUN: echo "_foo3 4" >> %t_alignment.txt + +# RUN: ld.lld --symbol-alignment-file %t_alignment.txt %t.o -o %t2.out +# RUN: llvm-objdump -s %t2.out| FileCheck %s --check-prefix=AFTER + +# AFTER: Contents of section .foo: +# AFTER-NEXT: 201120 1122cccc 33 + +.section .foo,"ax",@progbits,unique,1 +_foo1: + .byte 0x11 + +.section .foo,"ax",@progbits,unique,2 +.align 4 +_foo2: + .byte 0x22 + +.section .foo,"ax",@progbits,unique,3 +_foo3: + .byte 0x33