Index: lld/ELF/Propeller/CodeLayout/CodeLayout.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/CodeLayout.h @@ -0,0 +1,41 @@ +//===- PropellerBBReordering.h -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "NodeChainClustering.h" +#include "PropellerCFG.h" + +#include +#include + +namespace lld { +namespace propeller { + +class CodeLayout { +private: + // cfgs that are processed by the reordering algorithm. These are separated + // into hot and cold cfgs. + std::vector hotCFGs, coldCFGs; + // The final hot and cold order containing all cfg nodes. + std::vector HotOrder, ColdOrder; + // Handle of the clustering algorithm used to further reorder the computed + // chains. + std::unique_ptr clustering; + +public: + void doSplitOrder(std::list &symbolList, + std::list::iterator hotPlaceHolder, + std::list::iterator coldPlaceHolder); + + void printStats(); +}; + +} // namespace propeller +} // namespace lld +#endif Index: lld/ELF/Propeller/CodeLayout/CodeLayout.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/CodeLayout.cpp @@ -0,0 +1,185 @@ +//===- CodeLayout.cpp ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and implements the entry point of code layout optimization +// (doSplitOrder). +//===----------------------------------------------------------------------===// +#include "CodeLayout.h" + +#include "NodeChain.h" +#include "NodeChainBuilder.h" +#include "NodeChainClustering.h" +#include "Propeller.h" +#include "PropellerCFG.h" +#include "PropellerConfig.h" + +#include "llvm/ADT/DenseMap.h" + +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { +extern uint64_t getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward, + uint64_t srcSinkDistance); + +// This function iterates over the cfgs included in the Propeller profile and +// adds them to cold and hot cfg lists. Then it appropriately performs basic +// block reordering by calling NodeChainBuilder.doOrder() either on all cfgs (if +// -propeller-opt=reorder-ip) or individually on every controlFlowGraph. After +// creating all the node chains, it hands the basic block chains to a +// ChainClustering instance for further rerodering. +void CodeLayout::doSplitOrder(std::list &symbolList, + std::list::iterator hotPlaceHolder, + std::list::iterator coldPlaceHolder) { + std::chrono::steady_clock::time_point start = + std::chrono::steady_clock::now(); + + // Populate the hot and cold cfg lists by iterating over the cfgs in the + // propeller profile. + prop->forEachCfgRef([this](ControlFlowGraph &cfg) { + if (cfg.isHot()) { + hotCFGs.push_back(&cfg); + if (propConfig.optPrintStats) { + // Dump the number of basic blocks and hot basic blocks for every + // function + unsigned hotBBs = 0; + unsigned allBBs = 0; + cfg.forEachNodeRef([&hotBBs, &allBBs](CFGNode &node) { + if (node.freq) + hotBBs++; + allBBs++; + }); + fprintf(stderr, "HISTOGRAM: %s,%u,%u\n", cfg.name.str().c_str(), allBBs, + hotBBs); + } + } else + coldCFGs.push_back(&cfg); + }); + + if (propConfig.optReorderIP || propConfig.optReorderFuncs) + clustering.reset(new CallChainClustering()); + else { + // If function ordering is disabled, we want to conform the the initial + // ordering of functions in both the hot and the cold layout. + clustering.reset(new NoOrdering()); + } + + if (propConfig.optReorderIP) { + // If -propeller-opt=reorder-ip we want to run basic block reordering on all + // the basic blocks of the hot cfgs. + NodeChainBuilder(hotCFGs).doOrder(clustering); + } else if (propConfig.optReorderBlocks) { + // Otherwise we apply reordering on every controlFlowGraph separately + for (ControlFlowGraph *cfg : hotCFGs) + NodeChainBuilder(cfg).doOrder(clustering); + } else { + // If reordering is not desired, we create changes according to the initial + // order in the controlFlowGraph. + for (ControlFlowGraph *cfg : hotCFGs) + clustering->addChain(std::unique_ptr(new NodeChain(cfg))); + } + + // The order for cold cfgs remains unchanged. + for (ControlFlowGraph *cfg : coldCFGs) + clustering->addChain(std::unique_ptr(new NodeChain(cfg))); + + // After building all the chains, let the chain clustering algorithm perform + // the final reordering and populate the hot and cold cfg node orders. + clustering->doOrder(HotOrder, ColdOrder); + + // Transfter the order to the symbol list. + for (CFGNode *n : HotOrder) + symbolList.insert(hotPlaceHolder, n->shName); + + for (CFGNode *n : ColdOrder) + symbolList.insert(coldPlaceHolder, n->shName); + + std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now(); + warn("[Propeller]: bb reordering took: " + + Twine(std::chrono::duration_cast(end - start) + .count())); + + if (propConfig.optPrintStats) + printStats(); +} + +// Prints statistics of the computed layout, including the number of partitions +// for each function across the code layout, the edge count for each distance +// level, and the ExtTSP score achieved for each function. +void CodeLayout::printStats() { + + DenseMap nodeAddressMap; + llvm::StringMap functionPartitions; + uint64_t currentAddress = 0; + ControlFlowGraph *currentCFG = nullptr; + for (CFGNode *n : HotOrder) { + if (currentCFG != n->controlFlowGraph) { + currentCFG = n->controlFlowGraph; + functionPartitions[currentCFG->name]++; + } + nodeAddressMap[n] = currentAddress; + currentAddress += n->shSize; + } + + for (auto &elem : functionPartitions) + fprintf(stderr, "FUNCTION PARTITIONS: %s,%u\n", elem.first().str().c_str(), + elem.second); + + std::vector distances({0, 128, 640, 1028, 4096, 65536, 2 << 20, + std::numeric_limits::max()}); + std::map histogram; + llvm::StringMap extTSPScoreMap; + for (CFGNode *n : HotOrder) { + auto scoreEntry = + extTSPScoreMap.try_emplace(n->controlFlowGraph->name, 0).first; + n->forEachOutEdgeRef([&nodeAddressMap, &distances, &histogram, + &scoreEntry](CFGEdge &edge) { + if (!edge.weight || edge.isReturn()) + return; + if (nodeAddressMap.find(edge.src) == nodeAddressMap.end() || + nodeAddressMap.find(edge.sink) == nodeAddressMap.end()) { + warn("Found a hot edge whose source and sink do not show up in the " + "layout!"); + return; + } + uint64_t srcOffset = nodeAddressMap[edge.src]; + uint64_t sinkOffset = nodeAddressMap[edge.sink]; + bool edgeForward = srcOffset + edge.src->shSize <= sinkOffset; + uint64_t srcSinkDistance = + edgeForward ? sinkOffset - srcOffset - edge.src->shSize + : srcOffset - sinkOffset + edge.src->shSize; + + if (edge.type == CFGEdge::EdgeType::INTRA_FUNC || + edge.type == CFGEdge::EdgeType::INTRA_DYNA) + scoreEntry->second += + getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance); + + auto res = + std::lower_bound(distances.begin(), distances.end(), srcSinkDistance); + histogram[*res] += edge.weight; + }); + } + + for (auto &elem : extTSPScoreMap) + fprintf(stderr, "Ext TSP score: %s %lu\n", elem.first().str().c_str(), + elem.second); + fprintf(stderr, "DISTANCE HISTOGRAM: "); + uint64_t sumEdgeWeights = 0; + for (auto elem : histogram) + sumEdgeWeights += elem.second; + for (auto elem : histogram) + fprintf(stderr, "\t[%lu -> %lu (%.2f%%)]", elem.first, elem.second, + (double)elem.second * 100 / sumEdgeWeights); + fprintf(stderr, "\n"); +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h @@ -0,0 +1,387 @@ +//===- ModifiablePriorityQueue.h ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file implements a priority-queue data structure for storing key-value +// pairs, with compare operations for both the key and the value type given. +// It stores these pairs in a max-heap data structure which allows for fast O(lg +// n)-time retrieval of the maximum element. +// To compare two key-value pairs, the values parts are compared first. The keys +// are compared in case of a tie. No two key-value pairs could have the same +// key. +// +// The key-value pairs are also accessible using a map data structure. +// +// In contrast to C++ STL's priority queue, this data structure allows for fast +// O(lg n)-time value update, or removal of any element while keep the max-heap +// data structure sound. +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H +#define LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H + +#include "llvm/ADT/DenseMap.h" +#include +#include +#include +#include +#include +#include + +// This class defines a node in the ModifiablePriorityQueue data structure +// containing a key, a value, pointers to its parent, and its left and right +// children. This class must be instantiated with the key-type, the value type, +// and their comparators. +template class HeapNode { +public: + K key; + V value; + // Pointer to the parent (this is nullptr for root). + HeapNode *parent; + // Pointers to the two children + HeapNode *children[2]; + + HeapNode(K k, V v) + : key(k), value(std::move(v)), parent(nullptr), children() {} + + // Get pointer to the left child + HeapNode *leftChild() { return children[0]; } + + // Get pointer to the right child + HeapNode *rightChild() { return children[1]; } + + // Whether this is the left child of its parent, returns false if this is the + // root. + bool isLeftChild() { + if (parent) + return parent->leftChild() == this; + return false; + } + + // Whether this is the right child of its parent, returns false if this is the + // root. + bool isRightChild() { + if (parent) + return parent->rightChild() == this; + return false; + } + + // Set the left child for this node. + void adoptLeftChild(HeapNode *c) { + children[0] = c; + if (c) + c->parent = this; + } + + // Set the right child for this node. + void adoptRightChild(HeapNode *c) { + children[1] = c; + if (c) + c->parent = this; + } + + // Set both the left and right children for this node. + void adoptChildren(HeapNode *(&_children)[2]) { + adoptLeftChild(_children[0]); + adoptRightChild(_children[1]); + } + + // Return a string representation for this node given an indentation level + // which is used for pretty-printing of the heap tree. + // This assumes that + // std::to_string has been specialized for the key and value types. + std::string toString(unsigned level) { + std::string str = std::string(level, ' '); + str += "NODE: " + std::to_string(key) + " -> " + std::to_string(value); + for (auto *c : children) { + if (c) { + str += "\n"; + str += c->toString(level + 1); + } + } + return str; + } +}; + +// Comparator class for nodes given comparators for the key and value types. It +// compares the values for the two pairs first and in the case of a tie, it +// compares the keys. +template struct CompareHeapNode { + CmpK KeyComparator; + CmpV ValueComparator; + bool operator()(const HeapNode &n1, + const HeapNode &n2) const { + if (ValueComparator(n1.value, n2.value)) + return true; + if (ValueComparator(n2.value, n1.value)) + return false; + return KeyComparator(n1.key, n2.key); + } +}; + +// This class implements a pointer-based heap-like data structure for storing +// key-value pairs with efficient O(lg n) retrieval, removal, update, and +// insertion of nodes. +// +// Any insertion happens at the lowest-right-most empty leaf position. This +// position is accessible via the root pointer of the tree using the binary +// representation of [size of the tree + 1]. Starting with the left-most bit, we +// take the left child if the bit is zero and the right child if the bit is one. +// +// Deletion of a node happens by replacing the last node (lowest-right-most +// occupied leaf), with the given node, removing the node from the tree, and +// finally heapifying the replaced node up and down to keep the +// soundness of the heap. +// +// Value-update is the simplest, which requires updating the node's value +// followed by heapifying up the node up and down. +template +class ModifiablePriorityQueue { +private: + // Comparator instance for comparing HeapNodes. + CompareHeapNode HeapNodeComparator; + + // Pointers to the nodes in this heap are stored in a map to allow for fast + // lookup of nodes given their keys. + llvm::DenseMap>> nodes; + + // The tree-root for this priority queue, which holds the max key-value. + HeapNode *root = nullptr; + + // Total number of nodes in the tree. + unsigned Size = 0; + + // Set the root for this tree and set it's parent to null. + void assignRoot(HeapNode *node) { + root = node; + if (root) + root->parent = nullptr; + } + + // Helper for getting a pointer to a node using a handle using its bits. + HeapNode *getNodeWithHandle(unsigned handle) { + return getNodeWithHandleHelper(root, handle); + } + + // This function traverses the tree from the given node, using an integer + // handle for direction. Going from left to right in the binary representation + // of the handle, take right if the bit is one, and left if the bit is zero. + HeapNode * + getNodeWithHandleHelper(HeapNode *node, unsigned handle) { + if (handle == 1) + return node; + auto *p = getNodeWithHandleHelper(node, handle >> 1); + return (handle & 1) ? p->rightChild() : p->leftChild(); + } + + // Insert a node into the heap tree. + void insert(HeapNode *node) { + if (!root) { + // If the tree is empty, we only need to set the root. + assignRoot(node); + } else { + // Otherwise, insert at the lowest-right-most leaf empty-position, which + // is encoded by [Size + 1] + unsigned handle = Size + 1; + // We need to find the parent of the target position. + auto *p = getNodeWithHandleHelper(root, handle >> 1); + // Attach this to the left or right of the parent node, depending on the + // last bit in handle. + if (handle & 1) + p->adoptRightChild(node); + else + p->adoptLeftChild(node); + // We only need to heapify up as this node has no children. + heapifyUp(node); + } + // Increment the size of the tree. + Size++; + } + + // This function removes a node from the heap and returns its value. + // Instead of removing the node directly, it moves the lowest-right-most node + // to the position of the given node and heapifies it up and down. + // + // After this function, the removed node's HeapNode pointer is invalid. + V remove(HeapNode *node) { + // Find the lowest-right-most node + auto *last = getNodeWithHandleHelper(root, Size); + assert(last->leftChild() == nullptr && last->rightChild() == nullptr); + + // Remove the node from the children of its parents. + if (last->parent) { + if (last->isLeftChild()) + last->parent->adoptLeftChild(nullptr); + else + last->parent->adoptRightChild(nullptr); + } + + if (node == last) { + // If we are removing the last node, not much needs to be done. + if (node->parent == nullptr) { + assert(node == root); + assignRoot(nullptr); + } + } else { // node != last + if (node->parent) { + // Move the last node in place of the to-be-removed node. + if (node->isLeftChild()) + node->parent->adoptLeftChild(last); + else + node->parent->adoptRightChild(last); + } else { // node->parent == nullptr, so node is root + assert(node == root); + assignRoot(last); + } + // Let the moved-node adopt the children of the to-be-removed node + last->adoptChildren(node->children); + // Finally, heapify the moved node up and down. + if (!heapifyUp(last)) + heapifyDown(last); + } + + // Decrement the number of nodes. + Size--; + // Save the node's value to be returned. + auto nodeElement = std::move(node->value); + // Remove the node from the key->node mapping, effectively deallocating the + // HeapNode. + nodes.erase(node->key); + return nodeElement; + } + + // Recursively rectify the max-heap upwards from a node. + // This takes O(lg n) time. + // Returns true if any modifications occurs. + bool heapifyUp(HeapNode *node) { + // Heapify upwards until no more inconsistency is found. + if (node->parent && HeapNodeComparator(*node->parent, *node)) { + swapWithParent(node); + heapifyUp(node); + return true; + } + return false; + } + + // Recursively rectify the max-heap downwards from a node. + // This takes O(lg n) time. + // Returns true if any modifications occurs. + bool heapifyDown(HeapNode *node) { + auto maxChild = std::max_element( + node->children, node->children + 2, + [this](const HeapNode *c1, + const HeapNode *c2) { + // This compares two HeapNode pointers, relying on + // HeapNodeComparator for non-null pointers. + return c1 == nullptr + ? true + : (c2 == nullptr ? false : HeapNodeComparator(*c1, *c2)); + }); + // Move the maximum child to the parent if required and further heapify + // down. + if (*maxChild != nullptr && HeapNodeComparator(*node, **maxChild)) { + swapWithParent(*maxChild); + heapifyDown(node); + return true; + } + return false; + } + + // Helper function for swapping a node with its parent. + // Can only be called if the given node has a parent. Also, the given node's + // value should compare bigger than both its parent and its sibling. + // For example swapWithParent(C1) results in the following. + // G G + // / / + // P -> C1 + // / \ / \ + // C1 C2 P C2 + // / \ / \ + // A B A B + // This function merely properlly corrects the parent-child relationships. + void swapWithParent(HeapNode *node) { + auto *par = node->parent; + assert(par); + auto *gpar = node->parent->parent; + if (!gpar) { + assert(this->root == par); + this->assignRoot(node); + } else { + if (par->isLeftChild()) + gpar->adoptLeftChild(node); + else + gpar->adoptRightChild(node); + } + auto *par_old_left = par->leftChild(); + auto *par_old_right = par->rightChild(); + + par->adoptChildren(node->children); + if (par_old_left == node) { + node->adoptLeftChild(par); + node->adoptRightChild(par_old_right); + } else { + node->adoptLeftChild(par_old_left); + node->adoptRightChild(par); + } + } + +public: + // Insert a key-value pair into the heap. + // If the same key exists in the heap, it updates the value of the existing + // HeapNode and heapifies the node up or down. + void insert(K key, V value) { + auto cur = nodes.find(key); + if (cur != nodes.end()) { + cur->second->value = std::move(value); + if (!heapifyUp(cur->second.get())) + heapifyDown(cur->second.get()); + } else { + // If the key is not found, create a new HeapNode and insert it into the + // heap. + auto *node = new HeapNode(key, std::move(value)); + insert(node); + nodes.try_emplace(key, node); + } + } + + // Removes the node associated with a key from the heap if one exists. + void erase(K k) { + auto cur = nodes.find(k); + if (cur != nodes.end()) + remove(cur->second.get()); + } + + // Retrieves the HeapNode corresponding to a given key (returns nullptr if no + // such node exists). + HeapNode *get(K k) { + auto it = nodes.find(k); + return it == nodes.end() ? nullptr : it->second.get(); + } + + // Removes the root (max-element) from the heap. + void pop() { + assert(!empty()); + remove(root); + } + + // Returns the value of the max-element node. + V top() { return std::move(root->value); } + + bool empty() { return Size == 0; } + + unsigned size() { return Size; }; + + // Returns a string representation of the heap tree. + std::string toString() { + std::string str; + str += "HEAP with "; + str += std::to_string(Size); + str += " nodes\n"; + if (root) + str += root->toString(0); + return str; + } +}; +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChain.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChain.h @@ -0,0 +1,131 @@ +//===- PropellerNodeChain.h ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_H +#define LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_H + +#include "PropellerCFG.h" +#include "llvm/ADT/DenseSet.h" + +#include +#include +#include + +using llvm::DenseSet; + +namespace lld { +namespace propeller { + +// Represents a chain of nodes (basic blocks). +class NodeChain { +public: + // Representative node of the chain, with which it is initially constructed. + CFGNode *delegateNode; + + // controlFlowGraph of the nodes in this chain (this will be null if the nodes + // come from more than one cfg). + ControlFlowGraph *controlFlowGraph; + + // Ordered list of the nodes in this chain. + std::list nodes; + + // Iterators to the positions in the chain where the chain transitions from + // one function to another. + std::list::iterator> functionTransitions; + + // Out edges for this chain, to its own nodes and nodes of other chains. + std::unordered_map> outEdges; + + // chains which have outgoing edges to this chain. + DenseSet inEdges; + + // Total binary size of the chain. + uint64_t size; + + // Total execution frequency of the chain. + uint64_t freq; + + // Extended TSP score of the chain. + uint64_t score = 0; + + // Whether to print out information about how this chain joins with others. + bool debugChain; + + // Constructor for building a NodeChain from a single Node + NodeChain(CFGNode *node) + : delegateNode(node), controlFlowGraph(node->controlFlowGraph), + nodes(1, node), size(node->shSize), freq(node->freq), + debugChain(node->controlFlowGraph->debugCFG) {} + + // Constructor for building a NodeChain from all nodes in the controlFlowGraph + // according to the initial order. + NodeChain(ControlFlowGraph *cfg) + : delegateNode(cfg->getEntryNode()), controlFlowGraph(cfg), + size(cfg->size), freq(0), debugChain(cfg->debugCFG) { + cfg->forEachNodeRef([this](CFGNode &node) { + nodes.push_back(&node); + freq += node.freq; + }); + } + + // Helper function to iterate over the outgoing edges of this chain to a + // specific chain, while applying a given function on each edge. + template + void forEachOutEdgeToChain(NodeChain *chain, Visitor V) { + auto it = outEdges.find(chain); + if (it == outEdges.end()) + return; + for (CFGEdge *E : it->second) + V(*E, this, chain); + } + + // This returns the execution density of the chain. + double execDensity() const { + return ((double)freq) / std::max(size, (uint64_t)1); + } + + bool isSameCFG(const NodeChain &c) { + return controlFlowGraph && controlFlowGraph == c.controlFlowGraph; + } +}; + +// This returns a string representation of the chain +std::string toString(const NodeChain &c, + std::list::const_iterator slicePos); +std::string toString(const NodeChain &c); + +} // namespace propeller +} // namespace lld + +namespace std { +// Specialization of std::less for NodeChain, which allows for consistent +// tie-breaking in our Map data structures. +template <> struct less { + bool operator()(const lld::propeller::NodeChain *c1, + const lld::propeller::NodeChain *c2) const { + return c1->delegateNode->mappedAddr < c2->delegateNode->mappedAddr; + } +}; + +// Specialization of std::less for pair, which allows for +// consistent tie-breaking in our Map data structures. +template <> +struct less> { + bool operator()( + const pair p1, + const pair p2) + const { + if (less()(p1.first, p2.first)) + return true; + if (less()(p2.first, p1.first)) + return false; + return less()(p1.second, p2.second); + } +}; +} // namespace std + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChain.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChain.cpp @@ -0,0 +1,43 @@ +//===- NodeChain.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "NodeChain.h" + +namespace lld { +namespace propeller { + +std::string toString(const NodeChain &c, + std::list::const_iterator slicePos) { + std::string str; + if (c.controlFlowGraph) + str += c.controlFlowGraph->name.str(); + str += " [ "; + for (auto it = c.nodes.begin(); it != c.nodes.end(); ++it) { + auto *n = *it; + if (it == slicePos) + str += "\n....SLICE POSITION....\n"; + if (!c.controlFlowGraph) + str += + std::to_string(n->controlFlowGraph->getEntryNode()->mappedAddr) + ":"; + str += n->controlFlowGraph->getEntryNode() == n + ? "Entry" + : std::to_string(n->shName.size() - + n->controlFlowGraph->name.size() - 4); + str += " (size=" + std::to_string(n->shSize) + + ", freq=" + std::to_string(n->freq) + ")"; + if (n != c.nodes.back()) + str += " -> "; + } + str += " ]"; + str += " score: " + std::to_string(c.score); + return str; +} + +std::string toString(const NodeChain &c) { return toString(c, c.nodes.end()); } + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h @@ -0,0 +1,210 @@ +//===- PropellerNodeChainAssembly.h --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file includes the declaration of the NodeChainAssembly class. Each +// instance of this class gives a recipe for merging two NodeChains together in +// addition to the ExtTSP score gain that will be achieved by that merge. Each +// NodeChainAssembly consists of three NodeChainSlices from two node chains: the +// (potentially) splitted chain, and the unsplit chain. A NodeChainSlice has +// represents a slice of a NodeChain by storing iterators to the beginning and +// end of that slice in the node chain, plus the binary offsets at which these +// slices begin and end. The offsets allow effient computation of the gain in +// ExtTSP score. +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_ASSEMBLY_H +#define LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_ASSEMBLY_H + +#include "NodeChain.h" +#include "PropellerCFG.h" + +#include +#include + +namespace lld { +namespace propeller { + +uint64_t getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward, + uint64_t srcSinkDistance); + +// This class defines a slices of a node chain, specified by iterators to the +// beginning and end of the slice. +class NodeChainSlice { +public: + // chain from which this slice comes from + NodeChain *chain; + + // The endpoints of the slice in the corresponding chain + std::list::iterator beginPosition, endPosition; + + // The offsets corresponding to the two endpoints + uint64_t beginOffset, endOffset; + + // Constructor for building a chain slice from a given chain and the two + // endpoints of the chain. + NodeChainSlice(NodeChain *c, std::list::iterator begin, + std::list::iterator end) + : chain(c), beginPosition(begin), endPosition(end) { + + beginOffset = (*begin)->chainOffset; + if (endPosition == chain->nodes.end()) + endOffset = chain->size; + else + endOffset = (*end)->chainOffset; + } + + // Constructor for building a chain slice from a node chain containing all of + // its nodes. + NodeChainSlice(NodeChain *c) + : chain(c), beginPosition(c->nodes.begin()), endPosition(c->nodes.end()), + beginOffset(0), endOffset(c->size) {} + + // (Binary) size of this slice + uint64_t size() const { return endOffset - beginOffset; } + + bool isEmpty() const { return beginPosition == endPosition; } +}; + +// This enum represents the order in which three slices (S1, S2, and U) are +// merged together. We exclude S1S2U and US1S2 since they are respectively +// equivalent to S2S1U (with S2 being empty) and U2U1S (with U2 being empty). +enum MergeOrder { + Begin, + S2S1U = Begin, + BeginNext, + S1US2 = BeginNext, + S2US1, + US2S1, + End +}; + +// This class defines a strategy for assembling two chains together with one of +// the chains potentially split into two chains. +class NodeChainAssembly { +public: + // The gain in ExtTSP score achieved by this NodeChainAssembly once it + // is accordingly applied to the two chains. + // This is effectively equal to "score - splitChain->score - + // unsplitChain->score". + uint64_t scoreGain = 0; + + // The two chains, the first being the splitChain and the second being the + // unsplitChain. + std::pair chainPair; + + // The splitting position in splitchain + std::list::iterator slicePosition; + + // The three chain slices + std::vector slices; + + // The merge order of the slices + MergeOrder mergeOrder; + + // The constructor for creating a NodeChainAssembly. slicePosition must be an + // iterator into splitChain->nodes. + NodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain, + std::list::iterator slicePosition, + MergeOrder mOrder) + : chainPair(splitChain, unsplitChain), slicePosition(slicePosition), + mergeOrder(mOrder) { + // Construct the slices. + NodeChainSlice s1(splitChain, splitChain->nodes.begin(), slicePosition); + NodeChainSlice s2(splitChain, slicePosition, splitChain->nodes.end()); + NodeChainSlice u(unsplitChain); + + switch (mergeOrder) { + case MergeOrder::S2S1U: + slices = {std::move(s2), std::move(s1), std::move(u)}; + break; + case MergeOrder::S1US2: + slices = {std::move(s1), std::move(u), std::move(s2)}; + break; + case MergeOrder::S2US1: + slices = {std::move(s2), std::move(u), std::move(s1)}; + break; + case MergeOrder::US2S1: + slices = {std::move(u), std::move(s2), std::move(s1)}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + + // Set the ExtTSP score gain as the difference between the new score after + // merging these chains and the current scores of the two chains. + auto assemblyScore = computeExtTSPScore(); + auto chainsScore = splitChain->score + unsplitChain->score; + scoreGain = assemblyScore > chainsScore ? assemblyScore - chainsScore : 0; + } + + bool isValid() { return scoreGain; } + + // Find the NodeChainSlice in this NodeChainAssembly which contains the given + // node. If the node is not contained in this NodeChainAssembly, then return + // false. Otherwise, set idx equal to the index of the corresponding slice and + // return true. + bool findSliceIndex(CFGNode *node, NodeChain *chain, uint64_t offset, + uint8_t &idx) const; + + // This function computes the ExtTSP score for a chain assembly record. This + // goes over the three bb slices in the assembly record and considers all + // edges whose source and sink belong to the chains in the assembly record. + uint64_t computeExtTSPScore() const; + + // First node in the resulting assembled chain. + CFGNode *getFirstNode() const { + for (auto &slice : slices) + if (!slice.isEmpty()) + return *slice.beginPosition; + return nullptr; + } + + // Comparator for two nodeChainAssemblies. It compare scoreGain and break ties + // consistently. + struct CompareNodeChainAssembly { + bool operator()(const std::unique_ptr &a1, + const std::unique_ptr &a2) const; + }; + + // We delete the copy constructor to make sure NodeChainAssembly is moved + // rather than copied. + NodeChainAssembly(NodeChainAssembly &&) = default; + // copy constructor is implicitly deleted + // NodeChainAssembly(NodeChainAssembly&) = delete; + NodeChainAssembly() = delete; + + // This returns a unique value for every different assembly record between two + // chains. When chainPair is equal, this helps differentiate and compare the + // two assembly records. + std::pair assemblyStrategy() const { + return std::make_pair(mergeOrder, (*slicePosition)->mappedAddr); + } + + inline bool splits() const { + return slicePosition != splitChain()->nodes.begin(); + } + + inline bool splitsAtFunctionTransition() const { + return splits() && ((*std::prev(slicePosition))->controlFlowGraph != + (*slicePosition)->controlFlowGraph); + } + + inline bool needsSplitChainRotation() { + return (mergeOrder == S2S1U && splits()) || mergeOrder == S2US1 || + mergeOrder == US2S1; + } + + inline NodeChain *splitChain() const { return chainPair.first; } + + inline NodeChain *unsplitChain() const { return chainPair.second; } +}; + +std::string toString(NodeChainAssembly &assembly); + +} // namespace propeller +} // namespace lld + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp @@ -0,0 +1,204 @@ +//===- NodeChainAssembly.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "NodeChainAssembly.h" +#include "NodeChain.h" + +namespace lld { +namespace propeller { + +// Return the Extended TSP score for one edge, given its source to sink +// direction and distance in the layout. +uint64_t getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward, + uint64_t srcSinkDistance) { + + // Approximate callsites to be in the middle of the source basic block. + if (edge.isCall()) { + if (isEdgeForward) + srcSinkDistance += edge.src->shSize / 2; + else + srcSinkDistance -= edge.src->shSize / 2; + } + + if (edge.isReturn()) { + if (isEdgeForward) + srcSinkDistance += edge.sink->shSize / 2; + else + srcSinkDistance -= edge.sink->shSize / 2; + } + + if (srcSinkDistance == 0 && (edge.type == CFGEdge::EdgeType::INTRA_FUNC || + edge.type == CFGEdge::EdgeType::INTRA_DYNA)) + return edge.weight * propConfig.optFallthroughWeight; + + if (isEdgeForward && srcSinkDistance < propConfig.optForwardJumpDistance) + return edge.weight * propConfig.optForwardJumpWeight * + (propConfig.optForwardJumpDistance - srcSinkDistance); + + if (!isEdgeForward && srcSinkDistance < propConfig.optBackwardJumpDistance) + return edge.weight * propConfig.optBackwardJumpWeight * + (propConfig.optBackwardJumpDistance - srcSinkDistance); + return 0; +} + +bool NodeChainAssembly::findSliceIndex(CFGNode *node, NodeChain *chain, + uint64_t offset, uint8_t &idx) const { + for (idx = 0; idx < 3; ++idx) { + if (chain != slices[idx].chain) + continue; + // We find if the node's offset lies within the begin and end offset of this + // slice. + if (offset < slices[idx].beginOffset || offset > slices[idx].endOffset) + continue; + if (offset < slices[idx].endOffset && offset > slices[idx].beginOffset) + return true; + // A node can have zero size, which means multiple nodes may be associated + // with the same offset. This means that if the node's offset is at the + // beginning or the end of the slice, the node may reside in either slices + // of the chain. + if (offset == slices[idx].endOffset) { + // If offset is at the end of the slice, iterate backwards over the + // slice to find a zero-sized node. + for (auto nodeIt = std::prev(slices[idx].endPosition); + nodeIt != std::prev(slices[idx].beginPosition); nodeIt--) { + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if ((*nodeIt)->shSize) + break; + // Have we found the node? + if (*nodeIt == node) + return true; + } + } + if (offset == slices[idx].beginOffset) { + // If offset is at the beginning of the slice, iterate forwards over the + // slice to find the node. + for (auto nodeIt = slices[idx].beginPosition; + nodeIt != slices[idx].endPosition; nodeIt++) { + if (*nodeIt == node) + return true; + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if ((*nodeIt)->shSize) + break; + } + } + } + return false; +} + +// This function computes the ExtTSP score for a chain assembly record. This +// goes the three bb slices in the assembly record and considers all edges +// whose source and sink belongs to the chains in the assembly record. +uint64_t NodeChainAssembly::computeExtTSPScore() const { + // Zero-initialize the score. + uint64_t score = 0; + + auto addEdgeScore = [this, &score](CFGEdge &edge, NodeChain *srcChain, + NodeChain *sinkChain) { + uint8_t srcSliceIdx, sinkSliceIdx; + auto srcNodeOffset = edge.src->chainOffset; + auto sinkNodeOffset = edge.sink->chainOffset; + if (!findSliceIndex(edge.src, srcChain, srcNodeOffset, srcSliceIdx)) + return; + + if (!findSliceIndex(edge.sink, sinkChain, sinkNodeOffset, sinkSliceIdx)) + return; + + bool edgeForward = (srcSliceIdx < sinkSliceIdx) || + (srcSliceIdx == sinkSliceIdx && + (srcNodeOffset + edge.src->shSize <= sinkNodeOffset)); + + uint64_t srcSinkDistance = 0; + + if (srcSliceIdx == sinkSliceIdx) { + srcSinkDistance = edgeForward + ? sinkNodeOffset - srcNodeOffset - edge.src->shSize + : srcNodeOffset - sinkNodeOffset + edge.src->shSize; + } else { + const NodeChainSlice &srcSlice = slices[srcSliceIdx]; + const NodeChainSlice &sinkSlice = slices[sinkSliceIdx]; + srcSinkDistance = + edgeForward + ? srcSlice.endOffset - srcNodeOffset - edge.src->shSize + + sinkNodeOffset - sinkSlice.beginOffset + : srcNodeOffset - srcSlice.beginOffset + edge.src->shSize + + sinkSlice.endOffset - sinkNodeOffset; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. + if (std::abs(((int16_t)sinkSliceIdx) - ((int16_t)srcSliceIdx)) == 2) + srcSinkDistance += slices[1].size(); + } + + score += getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance); + }; + + // No changes will be made to the score that is contributed by the unsplit + // chain and we can simply increment by the chain's stored score. + score += unsplitChain()->score; + + // We need to recompute the score induced by the split chain (if it has really + // been split) as the offsets of the nodes have changed. + if (splits()) + splitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore); + else + score += splitChain()->score; + + // Consider the contribution to score for inter-chain edges. + splitChain()->forEachOutEdgeToChain(unsplitChain(), addEdgeScore); + unsplitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore); + + return score; +} + +bool NodeChainAssembly::CompareNodeChainAssembly::operator()( + const std::unique_ptr &a1, + const std::unique_ptr &a2) const { + + if (a1->scoreGain == a2->scoreGain) { + // If score gains are equal, we pick a consistent order based on the chains + // in the assembly records + if (std::less>()(a1->chainPair, + a2->chainPair)) + return true; + if (std::less>()(a2->chainPair, + a1->chainPair)) + return false; + // When even the chain pairs are the same, we resort to the assembly + // strategy to pick a consistent order. + return a1->assemblyStrategy() < a2->assemblyStrategy(); + } + return a1->scoreGain < a2->scoreGain; +} + +static std::string toString(MergeOrder mOrder) { + switch (mOrder) { + case MergeOrder::S2S1U: + return "S2S1U"; + case MergeOrder::S1US2: + return "S1US2"; + case MergeOrder::S2US1: + return "S2US1"; + case MergeOrder::US2S1: + return "US2S1"; + default: + assert("Invalid MergeOrder!" && false); + return ""; + } +} + +std::string toString(NodeChainAssembly &assembly) { + std::string str("assembly record between:\n"); + str += toString(*assembly.splitChain(), assembly.slicePosition) + " as S\n"; + str += toString(*assembly.unsplitChain()) + " as U\n"; + str += "merge order: " + toString(assembly.mergeOrder) + "\n"; + str += "scoreGain: " + std::to_string(assembly.scoreGain); + return str; +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h @@ -0,0 +1,126 @@ +//===- PropellerNodeChainBuilder.h ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_BUILDER_H +#define LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_BUILDER_H + +#include "ModifiablePriorityQueue.h" +#include "NodeChain.h" +#include "NodeChainAssembly.h" +#include "NodeChainClustering.h" +#include "PropellerCFG.h" + +#include "llvm/ADT/DenseMap.h" + +#include +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +// bb chain builder based on the ExtTSP metric +class NodeChainBuilder { +private: + NodeChainAssembly::CompareNodeChainAssembly nodeChainAssemblyComparator; + // Cfgs repreresenting the functions that are reordered + std::vector cfgs; + + // Set of built chains, keyed by section index of their Delegate nodes. + // chains are removed from this Map once they are merged into other chains. + DenseMap> chains; + + // All the initial chains, seperated into connected components + std::vector> components; + + // NodeChainBuilder performs bb ordering component by component. + // This is the component number that the chain builder is currently working + // on. + unsigned currentComponent; + + // These represent all the edges which are -- based on the profile -- the only + // (executed) outgoing edges from their source node and the only (executed) + // incoming edges to their sink nodes. The algorithm will make sure that these + // edges form fall-throughs in the final order. + DenseMap mutuallyForcedOut; + + // This maps every (ordered) pair of chains (with the first chain in the pair + // potentially splittable) to the highest-gain NodeChainAssembly for those + // chains. The Heap data structure allows fast retrieval of the maximum gain + // NodeChainAssembly, along with fast update. + ModifiablePriorityQueue, + std::unique_ptr, + std::less>, + NodeChainAssembly::CompareNodeChainAssembly> + nodeChainAssemblies; + + // This map stores the candidate chains for each chain. + // + // For every chain, its candidate chains are the chains which can increase the + // overall ExtTSP score when merged with that chain. This is used to update + // the nodeChainAssemblies map whenever chains merge together. The candidate + // chains of a chain may also be updated as result of a merge. + std::unordered_map> + candidateChains; + + void coalesceChains(); + void initializeExtTSP(); + + // This function separates builds the connected components for the chains so + // it can later merge each connected component separately. + // chains which are not connected to each other via profile edges will never + // be merged together by NodeChainBuilder. However, the composed chains may + // later be interleaved by ChainClustering. + void initializeChainComponents(); + void attachFallThroughs(); + + // This function tries to place two nodes immediately adjacent to + // each other (used for fallthroughs). + // Returns true if this can be done. + bool attachNodes(CFGNode *src, CFGNode *sink); + + void mergeChainEdges(NodeChain *splitChain, NodeChain *unsplitChain); + + void mergeInOutEdges(NodeChain *mergerChain, NodeChain *mergeeChain); + void mergeChains(NodeChain *leftChain, NodeChain *rightChain); + void mergeChains(std::unique_ptr assembly); + + // Recompute the ExtTSP score of a chain + uint64_t computeExtTSPScore(NodeChain *chain) const; + + // Update the related NodeChainAssembly records for two chains, with the + // assumption that unsplitChain has been merged into splitChain. + bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain); + + void mergeAllChains(); + + void init(); + + // Initialize the mutuallyForcedOut map + void initMutuallyForcedEdges(ControlFlowGraph &cfg); + + // Initialize basic block chains, with one chain for every node + void initNodeChains(ControlFlowGraph &cfg); + +public: + // This invokes the Extended TSP algorithm, orders the hot and cold basic + // blocks and inserts their associated symbols at the corresponding locations + // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the + // given SymbolList. + void doOrder(std::unique_ptr &clustering); + + NodeChainBuilder(std::vector &cfgs) : cfgs(cfgs) {} + + NodeChainBuilder(ControlFlowGraph *cfg) : cfgs(1, cfg) {} +}; + +} // namespace propeller +} // namespace lld + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp @@ -0,0 +1,816 @@ +//===- NodeChainBuilder.cpp ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of intra-function basic block +// reordering algorithm based on the Extended TSP metric as described in [1]. +// +// The Extend TSP metric (ExtTSP) provides a score for every ordering of basic +// blocks in a function, by combining the gains from fall-throughs and short +// jumps. +// +// Given an ordering of the basic blocks, for a function f, the ExtTSP score is +// computed as follows. +// +// sum_{edges e in f} frequency(e) * weight(e) +// +// where frequency(e) is the execution frequency and weight(e) is computed as +// follows: +// * 1 if distance(src[e], sink[e]) = 0 (i.e. fallthrough) +// +// * 0.1 * (1 - distance(src[e], sink[e]) / 1024) if src[e] < sink[e] and 0 < +// distance(src[e], sink[e]) < 1024 (i.e. short forward jump) +// +// * 0.1 * (1 - distance(src[e], sink[e]) / 640) if src[e] > sink[e] and 0 < +// distance(src[e], sink[e]) < 640 (i.e. short backward jump) +// +// * 0 otherwise +// +// In short, it computes a weighted sum of frequencies of all edges in the +// control flow graph. Each edge gets its weight depending on whether the given +// ordering makes the edge a fallthrough, a short forward jump, or a short +// backward jump. +// +// Although this problem is NP-hard like the regular TSP, an iterative greedy +// basic-block-chaining algorithm is used to find a close to optimal solution. +// This algorithm is described as follows. +// +// Starting with one basic block sequence (bb chain) for every basic block, the +// algorithm iteratively joins bb chains together in order to maximize the +// extended TSP score of all the chains. +// +// Initially, it finds all mutually-forced edges in the profiled +// controlFlowGraph. These are all the edges which are -- based on the profile +// -- the only (executed) outgoing edge from their source node and the only +// (executed) incoming edges to their sink nodes. Next, the source and sink of +// all mutually-forced edges are attached together as fallthrough edges. +// +// Then, at every iteration, the algorithm tries to merge a pair of bb chains +// which leads to the highest gain in the ExtTSP score. The algorithm extends +// the search space by considering splitting short (less than 1KBs in +// binary size) bb chains into two chains and then merging these two chains with +// the other chain in four ways. After every merge, the new merge gains are +// updated. The algorithm repeats joining bb chains until no additional can be +// achieved. Eventually, it sorts all the existing chains in decreasing order +// of their execution density, i.e., the total profiled frequency of the chain +// divided by its binary size. +// +// The values used by this algorithm are reconfiguriable using lld's propeller +// flags. Specifically, these parameters are: +// +// * propeller-forward-jump-distance: maximum distance of a forward jump +// default-set to 1024 in the above equation). +// +// * propeller-backward-jump-distance: maximum distance of a backward jump +// (default-set to 640 in the above equation). +// +// * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1) +// +// * propeller-forward-jump-weight: weight of a forward jump (default-set to +// 0.1) +// +// * propeller-backward-jump-weight: weight of a backward jump (default-set to +// 0.1) +// +// * propeller-chain-split-threshold: maximum binary size of a bb chain which +// the algorithm will consider for splitting (default-set to 1KB). +// +// References: +// * [1] A. Newell and S. Pupyrev, Improved Basic Block Reordering, available +// at https://arxiv.org/abs/1809.04676 +//===----------------------------------------------------------------------===// + +#include "NodeChainBuilder.h" + +#include "llvm/ADT/DenseSet.h" + +using llvm::DenseSet; + +using llvm::detail::DenseMapPair; + +namespace lld { +namespace propeller { + +// Initializes the node chains for each cfg and finds all the mutually-forced +// edges. +void NodeChainBuilder::init() { + for (ControlFlowGraph *cfg : cfgs) { + initNodeChains(*cfg); + initMutuallyForcedEdges(*cfg); + } +} + +// NodeChainBuilder calls this function after building all the chains to attach +// as many fall-throughs as possible. Given that the algorithm already optimizes +// the extend TSP score, this function will only affect the cold basic blocks +// and thus we do not need to consider the edge weights. +void NodeChainBuilder::attachFallThroughs() { + for (ControlFlowGraph *Cfg : cfgs) { + // First, try to keep the fall-throughs from the original order. + for (auto &Node : Cfg->nodes) { + if (Node->ftEdge != nullptr) { + attachNodes(Node.get(), Node->ftEdge->sink); + } + } + + // Sometimes, the original fall-throughs cannot be kept. So we try to find + // new fall-through opportunities which did not exist in the original order. + for (auto &Edge : Cfg->intraEdges) { + if (Edge->type == CFGEdge::EdgeType::INTRA_FUNC || + Edge->type == CFGEdge::EdgeType::INTRA_DYNA) + attachNodes(Edge->src, Edge->sink); + } + } +} + +// This function sorts bb chains in decreasing order of their execution density. +// NodeChainBuilder calls this function at the end to ensure that hot bb chains +// are placed at the beginning of the function. +void NodeChainBuilder::coalesceChains() { + std::vector chainOrder; + for (DenseMapPair> &elem : chains) + chainOrder.push_back(elem.second.get()); + + std::sort( + chainOrder.begin(), chainOrder.end(), [](NodeChain *c1, NodeChain *c2) { + if (!c1->isSameCFG(*c2)) + error("Attempting to coalesce chains belonging to different " + "functions."); + // Always place the hot chains before cold ones + if (c1->freq == 0 ^ c2->freq == 0) + return c1->freq != 0; + + // Place the entry node's chain before other chains + auto *entryNode = c1->controlFlowGraph->getEntryNode(); + if (entryNode->chain == c1) + return true; + if (entryNode->chain == c2) + return false; + + // Sort chains in decreasing order of execution density, and break ties + // according to the initial ordering. + double c1ExecDensity = c1->execDensity(); + double c2ExecDensity = c2->execDensity(); + if (c1ExecDensity == c2ExecDensity) + return c1->delegateNode->mappedAddr < c2->delegateNode->mappedAddr; + return c1ExecDensity > c2ExecDensity; + }); + + // We will merge all chains into at most two chains; a hot and cold one. + NodeChain *mergerChain = nullptr; + + for (NodeChain *c : chainOrder) { + if (!mergerChain) { + mergerChain = c; + continue; + } + // Create a cold partition when -propeller-split-funcs is set. + if (propConfig.optSplitFuncs && (mergerChain->freq != 0 && c->freq == 0)) { + mergerChain = c; + continue; + } + // Merge this chain into the merger chain + mergeChains(mergerChain, c); + } +} + +// Merge two chains in the specified order. +void NodeChainBuilder::mergeChains(NodeChain *leftChain, + NodeChain *rightChain) { + if ((propConfig.optReorderIP || propConfig.optSplitFuncs) && + (leftChain->freq == 0 ^ rightChain->freq == 0)) + error("Attempting to merge hot and cold chains: \n" + toString(*leftChain) + + "\nAND\n" + toString(*rightChain)); + + if (leftChain->debugChain || rightChain->debugChain) + fprintf(stderr, "MERGING chains:\n%s\nAND%s\n", + toString(*leftChain).c_str(), toString(*rightChain).c_str()); + + mergeInOutEdges(leftChain, rightChain); + + auto rightChainBegin = rightChain->nodes.begin(); + + leftChain->nodes.splice(leftChain->nodes.end(), rightChain->nodes); + + for (auto it = rightChainBegin; it != leftChain->nodes.end(); ++it) { + (*it)->chain = leftChain; + (*it)->chainOffset += leftChain->size; + } + + leftChain->size += rightChain->size; + leftChain->freq += rightChain->freq; + + leftChain->debugChain |= rightChain->debugChain; + if (leftChain->controlFlowGraph && + leftChain->controlFlowGraph != rightChain->controlFlowGraph) + leftChain->controlFlowGraph = nullptr; + + chains.erase(rightChain->delegateNode->mappedAddr); +} + +// This function tries to place two basic blocks immediately adjacent to each +// other (used for fallthroughs). Returns true if the basic blocks have been +// attached this way. +bool NodeChainBuilder::attachNodes(CFGNode *src, CFGNode *sink) { + if (sink->isEntryNode()) + return false; + + // Ignore edges between hot and cold basic blocks. + if (src->freq == 0 ^ sink->freq == 0) + return false; + NodeChain *srcChain = src->chain; + NodeChain *sinkChain = sink->chain; + // Skip this edge if the source and sink are in the same chain + if (srcChain == sinkChain) + return false; + + // It's possible to form a fall-through between src and sink only if + // they are respectively located at the end and beginning of their chains. + if (srcChain->nodes.back() != src || sinkChain->nodes.front() != sink) + return false; + // Attaching is possible. So we merge the chains in the corresponding order. + mergeChains(srcChain, sinkChain); + return true; +} + +// This function merges the in-and-out chain-edges of one chain (mergeeChain) +// into those of another (mergerChain). +void NodeChainBuilder::mergeInOutEdges(NodeChain *mergerChain, + NodeChain *mergeeChain) { + // Add out-edges of mergeeChain to the out-edges of mergerChain + for (auto &elem : mergeeChain->outEdges) { + NodeChain *c = (elem.first == mergeeChain) ? mergerChain : elem.first; + auto res = mergerChain->outEdges.emplace(c, elem.second); + // If the chain-edge is already present, just add the controlFlowGraph + // edges, otherwise, add mergerChain to in-edges of the chain. + if (!res.second) + res.first->second.insert(res.first->second.end(), elem.second.begin(), + elem.second.end()); + else + c->inEdges.insert(mergerChain); + + // Remove mergeeChain from in-edges + c->inEdges.erase(mergeeChain); + } + + // Add in-edges of mergeeChain to in-edges of mergerChain + for (auto *c : mergeeChain->inEdges) { + // Self edges were already handled above. + if (c == mergeeChain) + continue; + // Move all controlFlowGraph edges from being mapped to mergeeChain to being + // mapped to mergerChain. + auto &mergeeChainEdges = c->outEdges[mergeeChain]; + auto &mergerChainEdges = c->outEdges[mergerChain]; + + mergerChainEdges.insert(mergerChainEdges.end(), mergeeChainEdges.begin(), + mergeeChainEdges.end()); + mergerChain->inEdges.insert(c); + // Remove the defunct chain from out edges + c->outEdges.erase(mergeeChain); + } +} + +// Merge two bb sequences according to the given NodeChainAssembly. A +// NodeChainAssembly is an ordered triple of three slices from two chains. +void NodeChainBuilder::mergeChains( + std::unique_ptr assembly) { + if (assembly->splitChain()->freq == 0 ^ assembly->unsplitChain()->freq == 0) + error("Attempting to merge hot and cold chains: \n" + + toString(*assembly.get())); + + if (assembly->splitChain()->freq == 0 ^ assembly->unsplitChain()->freq == 0) + error("Attempting to merge hot and cold chains: \n" + + toString(*assembly.get())); + + // Decide which chain gets merged into the other chain, to make the reordering + // more efficient. + NodeChain *mergerChain = (assembly->mergeOrder == US2S1) + ? assembly->unsplitChain() + : assembly->splitChain(); + NodeChain *mergeeChain = (assembly->mergeOrder == US2S1) + ? assembly->splitChain() + : assembly->unsplitChain(); + + // Merge in and out edges of the two chains + mergeInOutEdges(mergerChain, mergeeChain); + + // Does S2 mark a function transition? + bool S2FuncTransition = assembly->splitsAtFunctionTransition(); + + // Create the new node order according the given assembly + auto S1Begin = assembly->splitChain()->nodes.begin(); + auto S2Begin = assembly->slicePosition; + auto UBegin = assembly->unsplitChain()->nodes.begin(); + + // Reorder S1S2 to S2S1 if needed. This operation takes O(1) because we're + // splicing from and into the same list. + if (assembly->needsSplitChainRotation()) + assembly->splitChain()->nodes.splice(S1Begin, assembly->splitChain()->nodes, + S2Begin, + assembly->splitChain()->nodes.end()); + + switch (assembly->mergeOrder) { + case S2S1U: + // Splice U at the end of S. + assembly->splitChain()->nodes.splice(assembly->splitChain()->nodes.end(), + assembly->unsplitChain()->nodes); + break; + case S1US2: + // Splice U in the middle + assembly->splitChain()->nodes.splice(S2Begin, + assembly->unsplitChain()->nodes); + break; + case S2US1: + // Splice U in the middle + assembly->splitChain()->nodes.splice(S1Begin, + assembly->unsplitChain()->nodes); + break; + case US2S1: + // Splice S at the end of U + assembly->unsplitChain()->nodes.splice( + assembly->unsplitChain()->nodes.end(), assembly->splitChain()->nodes); + break; + default: + break; + } + + if (propConfig.optReorderIP && !mergerChain->isSameCFG(*mergeeChain)) { + // Merge function transitions of mergerChain with those of mergeeChain. + mergerChain->functionTransitions.splice( + mergerChain->functionTransitions.end(), + mergeeChain->functionTransitions); + + // Add the beginnings of slices if they now mark a function transition. + std::vector::iterator> funcTransitionCandids; + // UBegin should always be examined. + funcTransitionCandids.push_back(UBegin); + // If S2Begin was a function transition point before, we don't examine it + // again as it will remain in the transition points (We never remove entries + // from functionTransitions). + if (!S2FuncTransition) + funcTransitionCandids.push_back(S2Begin); + // S1Begin will only be examined if this is a splitting assembly. Otherwise, + // S1 is empty. + if (assembly->splits()) + funcTransitionCandids.push_back(S1Begin); + + // Check if any of the candidates mark a function transition position and + // add those to the function transition list of the mergerChain. + for (auto it : funcTransitionCandids) + if (it != mergerChain->nodes.begin() && + (*std::prev(it))->controlFlowGraph != (*it)->controlFlowGraph) + mergerChain->functionTransitions.push_back(it); + } + + // Set the starting and ending point for updating the nodes's chain and offset + // in the new chain. This helps to reduce the computation time. + auto chainBegin = mergerChain->nodes.begin(); + uint64_t chainBeginOffset = 0; + + if (assembly->mergeOrder == S1US2 || assembly->mergeOrder == US2S1) { + // We can skip the first slice (slices[0]) in these cases. + chainBegin = assembly->slices[1].beginPosition; + chainBeginOffset = assembly->slices[0].size(); + } + + if (!assembly->splits()) { + // We can skip the S chain in this case. + chainBegin = UBegin; + chainBeginOffset = assembly->splitChain()->size; + } + + uint64_t runningOffset = chainBeginOffset; + + // Update chainOffset and the chain pointer for all the nodes in the sequence. + for (auto it = chainBegin; it != mergerChain->nodes.end(); ++it) { + CFGNode *node = *it; + node->chain = mergerChain; + node->chainOffset = runningOffset; + runningOffset += node->shSize; + } + + mergerChain->size += mergeeChain->size; + assert(mergerChain->size == runningOffset && + "Mismatch of merger chain's size and running offset!"); + + // Update the total frequency the aggregated chain + mergerChain->freq += mergeeChain->freq; + + // We have already computed the new score in the assembly record. So we can + // update the score based on that and the other chain's score. + mergerChain->score += mergeeChain->score + assembly->scoreGain; + + mergerChain->debugChain |= mergeeChain->debugChain; + + if (mergerChain->controlFlowGraph && + mergerChain->controlFlowGraph != mergeeChain->controlFlowGraph) + mergerChain->controlFlowGraph = nullptr; + + // Merge the assembly candidate chains of the two chains into the candidate + // chains of the remaining NodeChain and remove the records for the defunct + // NodeChain. + for (NodeChain *c : candidateChains[mergeeChain]) { + nodeChainAssemblies.erase(std::make_pair(c, mergeeChain)); + nodeChainAssemblies.erase(std::make_pair(mergeeChain, c)); + candidateChains[c].erase(mergeeChain); + if (c != mergerChain) + candidateChains[mergerChain].insert(c); + } + + // Update the NodeChainAssembly for all candidate chains of the merged + // NodeChain. Remove a NodeChain from the merge chain's candidates if the + // NodeChainAssembly update finds no gain. + auto &mergerChainCandidateChains = candidateChains[mergerChain]; + + for (auto CI = mergerChainCandidateChains.begin(), + CE = mergerChainCandidateChains.end(); + CI != CE;) { + NodeChain *otherChain = *CI; + auto &otherChainCandidateChains = candidateChains[otherChain]; + + bool x = updateNodeChainAssembly(otherChain, mergerChain); + + if (!x) + nodeChainAssemblies.erase(std::make_pair(otherChain, mergerChain)); + + bool y = updateNodeChainAssembly(mergerChain, otherChain); + + if (!y) + nodeChainAssemblies.erase(std::make_pair(mergerChain, otherChain)); + + if (x || y) { + otherChainCandidateChains.insert(mergerChain); + CI++; + } else { + otherChainCandidateChains.erase(mergerChain); + CI = mergerChainCandidateChains.erase(CI); + } + } + + // remove all the candidate chain records for the merged-in chain + candidateChains.erase(mergeeChain); + + // Finally, remove the defunct (merged-in) chain record from the chains + chains.erase(mergeeChain->delegateNode->mappedAddr); +} + +// Calculate the Extended TSP metric for a bb chain. +// This function goes over all the BBs in the chain and for bb chain and +// aggregates the score of all edges which are contained in the same chain. +uint64_t NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const { + uint64_t score = 0; + + auto visit = [&score](CFGEdge &edge, NodeChain *srcChain, + NodeChain *sinkChain) { + assert(srcChain == sinkChain); + auto srcOffset = edge.src->chainOffset; + auto sinkOffset = edge.sink->chainOffset; + bool edgeForward = srcOffset + edge.src->shSize <= sinkOffset; + // Calculate the distance between src and sink + uint64_t distance = edgeForward ? sinkOffset - srcOffset - edge.src->shSize + : srcOffset - sinkOffset + edge.src->shSize; + score += getEdgeExtTSPScore(edge, edgeForward, distance); + }; + + chain->forEachOutEdgeToChain(chain, visit); + + return score; +} + +// Updates the best NodeChainAssembly between two NodeChains. The existing +// record will be replaced by the new NodeChainAssembly if a non-zero gain +// is achieved. Otherwise, it will be removed. +// If a nodechain assembly record is (kept) inserted, returns true. Otherwise +// returns false. +bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain, + NodeChain *unsplitChain) { + // Only consider splitting the chain if the size of the chain is smaller than + // a threshold. + bool doSplit = (splitChain->size <= propConfig.optChainSplitThreshold); + // If we are not splitting, we only consider the slice position at the + // beginning of the chain (effectively no splitting). + auto slicePosEnd = + doSplit ? splitChain->nodes.end() : std::next(splitChain->nodes.begin()); + + std::unique_ptr bestAssembly(nullptr); + + // This function creates the different nodeChainAssemblies for splitChain and + // unsplitChain, given a slice position in splitChain, and updates the best + // assembly if required. + auto updateBestAssembly = [this, splitChain, unsplitChain, &bestAssembly]( + std::list::iterator slicePos) { + // If the split position is at the beginning (no splitting), only consider + // one MergeOrder + auto mergeOrderEnd = (slicePos == splitChain->nodes.begin()) + ? MergeOrder::BeginNext + : MergeOrder::End; + for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) { + MergeOrder mOrder = static_cast(MI); + + // Create the NodeChainAssembly representing this particular assembly. + auto NCA = std::unique_ptr( + new NodeChainAssembly(splitChain, unsplitChain, slicePos, mOrder)); + + // Update bestAssembly if needed + if (NCA->isValid() && + (!bestAssembly || nodeChainAssemblyComparator(bestAssembly, NCA))) + bestAssembly = std::move(NCA); + } + }; + + for (auto slicePos = splitChain->nodes.begin(); slicePos != slicePosEnd; + ++slicePos) { + // Do not split the mutually-forced edges in the chain. + if (slicePos != splitChain->nodes.begin() && + mutuallyForcedOut.count(*std::prev(slicePos))) + continue; + updateBestAssembly(slicePos); + } + + if (propConfig.optReorderIP && !doSplit) { + // For inter-procedural layout, we always try splitting at the function + // transition positions regardless of the splitChain's size. + for (auto transit = splitChain->functionTransitions.begin(); + transit != splitChain->functionTransitions.end();) { + auto slicePos = *transit; + if (slicePos == splitChain->nodes.begin() || + (*std::prev(slicePos))->controlFlowGraph == + (*slicePos)->controlFlowGraph) { + transit = splitChain->functionTransitions.erase(transit); + continue; + } + updateBestAssembly(slicePos); + transit++; + } + } + + // Insert the best assembly of the two chains. + if (bestAssembly) { + if (splitChain->debugChain || unsplitChain->debugChain) + fprintf(stderr, "INSERTING ASSEMBLY: %s\n", + toString(*bestAssembly).c_str()); + + nodeChainAssemblies.insert(bestAssembly->chainPair, + std::move(bestAssembly)); + return true; + } else + return false; +} + +void NodeChainBuilder::initNodeChains(ControlFlowGraph &cfg) { + for (auto &node : cfg.nodes) { + NodeChain *chain = new NodeChain(node.get()); + node->chain = chain; + node->chainOffset = 0; + chains.try_emplace(node->mappedAddr, chain); + } +} + +// Find all the mutually-forced edges. +// These are all the edges which are -- based on the profile -- the only +// (executed) outgoing edge from their source node and the only (executed) +// incoming edges to their sink nodes +void NodeChainBuilder::initMutuallyForcedEdges(ControlFlowGraph &cfg) { + DenseMap mutuallyForcedOut; + + DenseMap> profiledOuts; + DenseMap> profiledIns; + + for (auto &node : cfg.nodes) { + std::copy_if(node->outs.begin(), node->outs.end(), + std::back_inserter(profiledOuts[node.get()]), + [](CFGEdge *edge) { + return (edge->type == CFGEdge::EdgeType::INTRA_FUNC || + edge->type == CFGEdge::EdgeType::INTRA_DYNA) && + edge->weight != 0; + }); + std::copy_if(node->ins.begin(), node->ins.end(), + std::back_inserter(profiledIns[node.get()]), + [](CFGEdge *edge) { + return (edge->type == CFGEdge::EdgeType::INTRA_FUNC || + edge->type == CFGEdge::EdgeType::INTRA_DYNA) && + edge->weight != 0; + }); + } + + for (auto &node : cfg.nodes) { + if (profiledOuts[node.get()].size() == 1) { + CFGEdge *edge = profiledOuts[node.get()].front(); + if (profiledIns[edge->sink].size() == 1) + mutuallyForcedOut.try_emplace(node.get(), edge->sink); + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + DenseMap nodeToPathMap; + SmallVector cycleCutNodes; + unsigned pathCount = 0; + for (auto it = mutuallyForcedOut.begin(); it != mutuallyForcedOut.end(); + ++it) { + // Check to see if the node (and its cycle) have already been visited. + if (nodeToPathMap[it->first]) + continue; + CFGEdge *victimEdge = nullptr; + auto nodeIt = it; + pathCount++; + while (nodeIt != mutuallyForcedOut.end()) { + CFGNode *node = nodeIt->first; + unsigned path = nodeToPathMap[node]; + if (path != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (path == pathCount) { + // We have found a cycle: add the victim edge + cycleCutNodes.push_back(victimEdge->src); + } + break; + } else + nodeToPathMap[node] = pathCount; + if (!profiledOuts[node].empty()) { + CFGEdge *edge = profiledOuts[node].front(); + if (!victimEdge || + (edge->sink->mappedAddr < victimEdge->sink->mappedAddr)) { + victimEdge = edge; + } + } + nodeIt = mutuallyForcedOut.find(nodeIt->second); + } + } + + // Remove the victim edges to break cycles in the mutually forced edges + for (CFGNode *node : cycleCutNodes) + mutuallyForcedOut.erase(node); + + for (auto &elem : mutuallyForcedOut) + mutuallyForcedOut.insert(elem); +} + +// This function initializes the ExtTSP algorithm's data structures; +// the nodeChainAssemblies and the candidateChains maps. +void NodeChainBuilder::initializeExtTSP() { + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. + candidateChains.clear(); + for (NodeChain *chain : components[currentComponent]) + chain->score = chain->freq ? computeExtTSPScore(chain) : 0; + + DenseSet> visited; + + for (NodeChain *chain : components[currentComponent]) { + auto &thisCandidateChains = candidateChains[chain]; + for (auto &chainEdge : chain->outEdges) { + NodeChain *otherChain = chainEdge.first; + if (chain == otherChain) + continue; + std::pair chainPair1(chain, otherChain), + chainPair2(otherChain, chain); + auto p = std::less>()(chainPair1, + chainPair2) + ? chainPair1 + : chainPair2; + if (!visited.insert(p).second) + continue; + + bool x = updateNodeChainAssembly(chain, otherChain); + bool y = updateNodeChainAssembly(otherChain, chain); + + if (x || y) { + thisCandidateChains.insert(otherChain); + candidateChains[otherChain].insert(chain); + } + } + } +} + +void NodeChainBuilder::initializeChainComponents() { + + DenseMap chainToComponentMap; + unsigned componentId = 0; + for (DenseMapPair> &elem : chains) { + NodeChain *chain = elem.second.get(); + // Cold chains will not be placed in any component. + if (!chain->freq) + continue; + // Skip if this chain has already been assigned to a component. + if (!chainToComponentMap.try_emplace(chain, componentId).second) + continue; + // Start creating the component containing this chain and its connected + // chains. + std::vector toVisit(1, chain); + unsigned index = 0; + + // Find the connected component using BFS. + while (index != toVisit.size()) { + auto *tchain = toVisit[index++]; + for (auto *c : tchain->inEdges) { + if (chainToComponentMap.try_emplace(c, componentId).second) + toVisit.push_back(c); + } + for (auto &e : tchain->outEdges) { + if (chainToComponentMap.try_emplace(e.first, componentId).second) + toVisit.push_back(e.first); + } + } + components.push_back(std::move(toVisit)); + componentId++; + } +} + +void NodeChainBuilder::mergeAllChains() { + // Attach the mutually-foced edges (which will not be split anymore by the + // Extended TSP algorithm). + for (auto &elem : mutuallyForcedOut) + attachNodes(elem.first, elem.second); + + // Set up the outgoing edges for every chain + for (auto &elem : chains) { + auto *chain = elem.second.get(); + // Ignore cold chains as they cannot have any hot edges to other nodes + if (!chain->freq) + continue; + auto addEdge = [chain](CFGEdge &edge) { + // Ignore returns and zero-frequency edges as these edges will not be used + // by the ExtTSP score algorithm. + if (!edge.weight || edge.isReturn()) + return; + auto *sinkNodeChain = edge.sink->chain; + chain->outEdges[sinkNodeChain].push_back(&edge); + sinkNodeChain->inEdges.insert(chain); + }; + + if (propConfig.optReorderIP) { + for (CFGNode *node : chain->nodes) + node->forEachOutEdgeRef(addEdge); + } else { + for (CFGNode *node : chain->nodes) + node->forEachIntraOutEdgeRef(addEdge); + } + } + + initializeChainComponents(); + + for (currentComponent = 0; currentComponent < components.size(); + ++currentComponent) { + if (propConfig.optPrintStats) + fprintf(stderr, "COMPONENT: %u -> SIZE: %zu\n", currentComponent, + components[currentComponent].size()); + // Initialize the Extended TSP algorithm's data. + initializeExtTSP(); + + // Keep merging the chain assembly record with the highest ExtTSP gain, + // until no more gain is possible. + while (!nodeChainAssemblies.empty()) { + auto bestAssembly = nodeChainAssemblies.top(); + nodeChainAssemblies.pop(); + if (bestAssembly->splitChain()->debugChain || + bestAssembly->unsplitChain()->debugChain) + fprintf(stderr, "MERGING for %s\n", + toString(*bestAssembly.get()).c_str()); + mergeChains(std::move(bestAssembly)); + } + } +} + +void NodeChainBuilder::doOrder(std::unique_ptr &clustering) { + init(); + + mergeAllChains(); + + // Merge fallthrough basic blocks if we have missed any + attachFallThroughs(); + + if (!propConfig.optReorderIP) { + // If this is not inter-procedural, coalesce all hot chains into a single + // hot chain and all cold chains into a single cold chain. + coalesceChains(); + assert(cfgs.size() == 1 && chains.size() <= 2); + +#ifdef PROPELLER_PROTOBUF + ControlFlowGraph *cfg = cfgs.back(); + if (prop->protobufPrinter) { + std::list nodeOrder; + for (auto &elem : chains) { + auto *chain = elem.second.get(); + nodeOrder.insert(chain->freq ? nodeOrder.begin() : nodeOrder.end(), + chain->nodes.begin(), chain->nodes.end()); + } + prop->protobufPrinter->addCFG(*cfg, &nodeOrder); + } +#endif + } + + // Hand in the built chains to the chain clustering algorithm + for (auto &elem : chains) + clustering->addChain(std::move(elem.second)); +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.h =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.h @@ -0,0 +1,143 @@ +//===- PropellerChainClustering.h ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H +#define LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H + +#include "NodeChain.h" + +#include "llvm/ADT/DenseMap.h" + +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +// This is the base class for clustering chains. It provides the basic data +// structures and functions. +class ChainClustering { +public: + class Cluster { + public: + // Constructor for building a cluster from a single chain. + Cluster(NodeChain *); + + // The chains in this cluster in the merged order. + std::vector chains; + + // The representative chain for this cluster. + NodeChain *delegateChain; + + // Total size of the cluster. + uint64_t size; + + // Total frequency of the cluster. + uint64_t freq; + + // This function merges this cluster with another cluster + Cluster &mergeWith(Cluster &other) { + chains.insert(chains.end(), other.chains.begin(), other.chains.end()); + this->freq += other.freq; + this->size += other.size; + return *this; + } + + // Returns the execution density of this cluster + double getDensity() { return ((double)freq / size); } + }; + + // This function merges two clusters in the given order, and updates + // chainToClusterMap for the chains in the mergee cluster and also removes the + // mergee cluster from the clusters. + void mergeTwoClusters(Cluster *predecessorCluster, Cluster *cluster) { + // Join the two clusters into predecessorCluster. + predecessorCluster->mergeWith(*cluster); + + // Update chain to cluster mapping, because all chains that were + // previsously in cluster are now in predecessorCluster. + for (NodeChain *c : cluster->chains) { + chainToClusterMap[c] = predecessorCluster; + } + + // Delete the defunct cluster + clusters.erase(cluster->delegateChain->delegateNode->mappedAddr); + } + + // Adds a chain to the input of the clustering process + void addChain(std::unique_ptr &&chain_ptr); + + // Any subclass should override this function. + virtual void doOrder(std::vector &hotOrder, + std::vector &coldOrder); + + virtual ~ChainClustering() = default; + +protected: + // Optional function for merging clusters, which could be overriden by + // subclasses. + virtual void mergeClusters(){}; + + // This function sorts the final clusters in decreasing order of their + // execution density. + void sortClusters(std::vector &); + + // This function initializes clusters with each cluster including a single + // chain. + void initClusters() { + for (auto &c_ptr : hotChains) { + NodeChain *chain = c_ptr.get(); + Cluster *cl = new Cluster(chain); + cl->freq = chain->freq; + cl->size = std::max(chain->size, (uint64_t)1); + chainToClusterMap[chain] = cl; + clusters.try_emplace(cl->delegateChain->delegateNode->mappedAddr, cl); + } + } + + // chains processed by the clustering algorithm separated into hot and cold + // ones based on their frequency. + std::vector> hotChains, coldChains; + + // All clusters currently in process. + DenseMap> clusters; + + // This maps every chain to its containing cluster. + DenseMap chainToClusterMap; +}; + +// This class computes an ordering for the input chains which conforms to the +// initial ordering of nodes in the binary. +class NoOrdering : public ChainClustering { +public: + void doOrder(std::vector &hotOrder, + std::vector &coldOrder); +}; + +// This class implements the call-chain-clustering algorithm. +class CallChainClustering : public ChainClustering { +private: + Cluster *getMostLikelyPredecessor(NodeChain *chain, Cluster *cluster); + void mergeClusters(); +}; + +} // namespace propeller +} // namespace lld + +namespace std { +template <> struct less { + bool operator()(const lld::propeller::ChainClustering::Cluster *c1, + const lld::propeller::ChainClustering::Cluster *c2) const { + return less()(c1->delegateChain, + c2->delegateChain); + } +}; + +} // namespace std + +#endif Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp =================================================================== --- /dev/null +++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp @@ -0,0 +1,233 @@ +//===- PropellerChainClustering.cpp --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of the Call-chain-Clustering +// algorithm as described in [1]. +// +// The algorithm iteratively merges chains into clusters. To do so, it processes +// the basic block chains in decreasing order of +// their execution density and for each chain, merges its cluster with its +// most-frequent predecessor cluster. The merging of a cluster stops when it +// reaches the -propeller-cluster-merge-size-threshold (set to 2MB by default). +// +// [1] Guilherme Ottoni and Bertrand Maher. Optimizing Function Placement for +// Large-Scale Data-Center Applications. Available at +// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +//===----------------------------------------------------------------------===// +#include "NodeChainClustering.h" +#include "PropellerConfig.h" + +using llvm::detail::DenseMapPair; + +namespace lld { +namespace propeller { + +void ChainClustering::addChain(std::unique_ptr &&chain_ptr) { + for (CFGNode *n : chain_ptr->nodes) + n->chain = chain_ptr.get(); + auto &chainList = ((propConfig.optReorderIP || propConfig.optSplitFuncs || + propConfig.optReorderFuncs) && + chain_ptr->freq == 0) + ? coldChains + : hotChains; + chainList.push_back(std::move(chain_ptr)); +} + +// Initializes a cluster containing a single chain an associates it with a +// unique id. +ChainClustering::Cluster::Cluster(NodeChain *chain) + : chains(1, chain), delegateChain(chain) {} + +// Returns the most frequent predecessor of a chain. This function also gets as +// the second parameter the cluster containing the chain to save a lookup +// into chainToClusterMap. +ChainClustering::Cluster * +CallChainClustering::getMostLikelyPredecessor(NodeChain *chain, + Cluster *cluster) { + // This map stores the total weight of the incoming edges to the given cluster + // from every other cluster. + DenseMap clusterEdge; + + for (CFGNode *n : chain->nodes) { + // For non-inter-procedural, we only consider the function entries. + if (!propConfig.optReorderIP && !n->isEntryNode()) + continue; + auto visit = [&clusterEdge, n, chain, cluster, this](CFGEdge &edge) { + if (!edge.weight || edge.isReturn()) + return; + if (!propConfig.optReorderIP && !edge.isCall()) + return; + auto *callerChain = edge.src->chain; + if (!callerChain) { + warn("Caller for node: " + n->shName + " does not have a chain!"); + return; + } + auto *callerCluster = chainToClusterMap[callerChain]; + if (callerChain == chain || callerCluster == cluster) + return; + // Ignore clusters which are too big + if (callerCluster->size > propConfig.optClusterMergeSizeThreshold) + return; + // Ignore edges which are cold relative to the sink node + if (edge.weight * 10 < n->freq) + return; + // Do not merge if the caller cluster's density would degrade by more than + // 1/8 by the merge. + if (8 * callerCluster->size * (cluster->freq * callerCluster->freq) < + callerCluster->freq * (cluster->size + callerCluster->size)) + return; + clusterEdge[callerCluster] += edge.weight; + }; + n->forEachInEdgeRef(visit); + } + + // Get the highest-frequency caller + auto bestCaller = + std::max_element(clusterEdge.begin(), clusterEdge.end(), + [](const DenseMapPair &p1, + const DenseMapPair &p2) { + if (p1.second == p2.second) // Consistently break ties + return std::less()(p1.first, p2.first); + return p1.second < p2.second; + }); + + if (bestCaller == clusterEdge.end()) + return nullptr; + return bestCaller->first; +} + +void ChainClustering::sortClusters(std::vector &clusterOrder) { + for (auto &p : clusters) + clusterOrder.push_back(p.second.get()); + + auto clusterComparator = [](Cluster *c1, Cluster *c2) -> bool { + // Set a deterministic order when execution densities are equal. + if (c1->getDensity() == c2->getDensity()) + return c1->delegateChain->delegateNode->mappedAddr < + c2->delegateChain->delegateNode->mappedAddr; + return c1->getDensity() > c2->getDensity(); + }; + + std::sort(clusterOrder.begin(), clusterOrder.end(), clusterComparator); +} + +void NoOrdering::doOrder(std::vector &hotOrder, + std::vector &coldOrder) { + auto chainComparator = [](const std::unique_ptr &c_ptr1, + const std::unique_ptr &c_ptr2) -> bool { + return c_ptr1->delegateNode->mappedAddr < c_ptr2->delegateNode->mappedAddr; + }; + + std::sort(hotChains.begin(), hotChains.end(), chainComparator); + std::sort(coldChains.begin(), coldChains.end(), chainComparator); + + for (auto &c_ptr : hotChains) + for (CFGNode *n : c_ptr->nodes) + hotOrder.push_back(n); + + for (auto &c_ptr : coldChains) + for (CFGNode *n : c_ptr->nodes) + coldOrder.push_back(n); +} + +// Merge clusters together based on the CallChainClustering algorithm. +void CallChainClustering::mergeClusters() { + // Build a map for the execution density of each chain. + DenseMap chainWeightMap; + + for (auto &c_ptr : hotChains) { + NodeChain *chain = c_ptr.get(); + chainWeightMap.try_emplace(chain, chain->execDensity()); + } + + // Sort the hot chains in decreasing order of their execution density. + std::sort(hotChains.begin(), hotChains.end(), + [&chainWeightMap](const std::unique_ptr &c_ptr1, + const std::unique_ptr &c_ptr2) { + auto chain1Weight = chainWeightMap[c_ptr1.get()]; + auto chain2Weight = chainWeightMap[c_ptr2.get()]; + if (chain1Weight == chain2Weight) + return c_ptr1->delegateNode->mappedAddr < + c_ptr2->delegateNode->mappedAddr; + return chain1Weight > chain2Weight; + }); + + for (auto &c_ptr : hotChains) { + NodeChain *chain = c_ptr.get(); + if (chainWeightMap[chain] <= 0.005) + break; + auto *cluster = chainToClusterMap[chain]; + // Ignore merging if the cluster containing this function is bigger than + // 2MBs (size of a large page). + if (cluster->size > propConfig.optClusterMergeSizeThreshold) + continue; + assert(cluster); + + Cluster *predecessorCluster = getMostLikelyPredecessor(chain, cluster); + if (!predecessorCluster) + continue; + + mergeTwoClusters(predecessorCluster, cluster); + } +} + +// This function orders the hot chains based on mergeClusters and sortClusters +// and uses a consistent ordering for the cold part. +void ChainClustering::doOrder(std::vector &hotOrder, + std::vector &coldOrder) { + initClusters(); + mergeClusters(); + std::vector clusterOrder; + sortClusters(clusterOrder); + // This maps every controlFlowGraph to the position of its first hot node in + // the layout. + DenseMap hotCFGOrder; + + for (Cluster *cl : clusterOrder) + for (NodeChain *c : cl->chains) + for (CFGNode *n : c->nodes) { + hotCFGOrder.try_emplace(n->controlFlowGraph, hotOrder.size()); + hotOrder.push_back(n); + } + + auto coldChainComparator = + [&hotCFGOrder](const std::unique_ptr &c_ptr1, + const std::unique_ptr &c_ptr2) -> bool { + // If each of the chains comes from a single controlFlowGraph, we can order + // the chains consistently based on the hot layout + if (c_ptr1->controlFlowGraph && c_ptr2->controlFlowGraph) { + // If one controlFlowGraph is cold and the other is hot, make sure the hot + // controlFlowGraph's chain comes earlier in the order. + if (c_ptr1->controlFlowGraph->isHot() != + c_ptr2->controlFlowGraph->isHot()) + return c_ptr1->controlFlowGraph->isHot(); + // If both cfgs are hot, make the order for cold chains consistent with + // the order for hot ones. + if (c_ptr1->controlFlowGraph->isHot() && + c_ptr2->controlFlowGraph->isHot() && + (c_ptr1->controlFlowGraph != c_ptr2->controlFlowGraph)) + return hotCFGOrder[c_ptr1->controlFlowGraph] < + hotCFGOrder[c_ptr2->controlFlowGraph]; + // Otherwise, use a consistent order based on the representative nodes. + return c_ptr1->delegateNode->mappedAddr < + c_ptr2->delegateNode->mappedAddr; + } + // Use an order consistent with the initial ordering of the representative + // nodes. + return c_ptr1->delegateNode->mappedAddr < c_ptr2->delegateNode->mappedAddr; + }; + + std::sort(coldChains.begin(), coldChains.end(), coldChainComparator); + + for (auto &c_ptr : coldChains) + for (CFGNode *n : c_ptr->nodes) + coldOrder.push_back(n); +} + +} // namespace propeller +} // namespace lld Index: lld/test/ELF/propeller/codelayout/function-ordering.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/function-ordering.s @@ -0,0 +1,78 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises function reordering on three functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. +## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "!foo" > %t_prof.propeller +# RUN: echo "!bar" >> %t_prof.propeller +# RUN: echo "!baz" >> %t_prof.propeller +# RUN: echo "Symbols" >> %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/codelayout/function-with-loop.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/function-with-loop.s @@ -0,0 +1,134 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. + +# RUN: echo "!foo" > %t_prof.propeller +# RUN: echo "!!1" >> %t_prof.propeller +# RUN: echo "!!3" >> %t_prof.propeller +# RUN: echo "!!4" >> %t_prof.propeller +# RUN: echo "Symbols" >> %t_prof.propeller +# RUN: echo "1 14 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 3 1.2" >> %t_prof.propeller +# RUN: echo "4 5 1.3" >> %t_prof.propeller +# RUN: echo "5 4 1.4" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "4 2 90" >> %t_prof.propeller +# RUN: echo "2 4 95" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 10" >> %t_prof.propeller +# RUN: echo "1 2 5" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 foo: +# REORDER-NEXT: nopl (%rax) + +# REORDER: 0000000000201123 a.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne 9 + +# REORDER: 0000000000201128 aaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne -10 + +# REORDER: 000000000020112d aaaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: retq + +# REORDER: 0000000000201131 aa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jmp -14 + +## Disable basic block reordering and expect that the original bb order is retained. +# +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out +# RUN: diff %t.propeller.noreorder.out %t.out + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +foo: + nopl (%rax) + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + jmp aaa.BB.foo +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + jne a.BB.foo + jmp aaaa.BB.foo +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits,unique,4 +aaaa.BB.foo: + nopl (%rax) + ret +.Laaaa.BB.foo_end: + .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo Index: lld/test/ELF/propeller/codelayout/opt-all-combinations.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/opt-all-combinations.s @@ -0,0 +1,301 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises all combinations of propeller options (reorder-funcs, reorder-blocks, +## and split-funcs) on four functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: Disassembly of section .text: +# BEFORE-EMPTY: +# BEFORE-NEXT: foo: +# BEFORE-NEXT: xorb %al, 0 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: bar: +# BEFORE-NEXT: xorb %al, 1 +# BEFORE-NEXT: je 9 +# BEFORE-EMPTY: +# BEFORE-NEXT: a.BB.bar: +# BEFORE-NEXT: xorb %al, 2 +# BEFORE-NEXT: jmp 7 +# BEFORE-EMPTY: +# BEFORE-NEXT: aa.BB.bar: +# BEFORE-NEXT: xorb %al, 3 +# BEFORE-EMPTY: +# BEFORE-NEXT: aaa.BB.bar: +# BEFORE-NEXT: xorb %al, 4 +# BEFORE-EMPTY: +# BEFORE-NEXT: baz: +# BEFORE-NEXT: xorb %al, 5 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: qux: +# BEFORE-NEXT: xorb %al, 6 +# + +## Create a propeller profile for four functions foo, bar, baz, and qux based on the cfg below: +## +## +## bar +## / \ +## / \100 +## / \ +## 100 v v +## foo <----- a.BB.bar aa.BB.bar baz qux ---+ +## \ / ^ | +## \ /100 | 1 | +## \ / +-----+ +## v v +## aaa.BB.bar +## + +# RUN: echo "!foo" > %t_prof.propeller +# RUN: echo "!bar" >> %t_prof.propeller +# RUN: echo "!!1" >> %t_prof.propeller +# RUN: echo "!!2" >> %t_prof.propeller +# RUN: echo "!!3" >> %t_prof.propeller +# RUN: echo "!qux" >> %t_prof.propeller +# RUN: echo "Symbols" >> %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 20 Nbar" >> %t_prof.propeller +# RUN: echo "3 9 2.1" >> %t_prof.propeller +# RUN: echo "4 7 2.2" >> %t_prof.propeller +# RUN: echo "5 7 2.3" >> %t_prof.propeller +# RUN: echo "6 8 Nbaz" >> %t_prof.propeller +# RUN: echo "7 7 Nqux" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "4 1 100 C" >> %t_prof.propeller +# RUN: echo "7 7 10 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller --verbose -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: Disassembly of section .text: +# REORDER-EMPTY: +# REORDER-NEXT: bar: +# REORDER-NEXT: xorb %al, 1 +# REORDER-NEXT: jne 30 +# REORDER-EMPTY: +# REORDER-NEXT: aa.BB.bar: +# REORDER-NEXT: xorb %al, 3 +# REORDER-EMPTY: +# REORDER-NEXT: aaa.BB.bar: +# REORDER-NEXT: xorb %al, 4 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: foo: +# REORDER-NEXT: xorb %al, 0 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: qux: +# REORDER-NEXT: xorb %al, 6 +# REORDER-EMPTY: +# REORDER-NEXT: a.BB.bar: +# REORDER-NEXT: xorb %al, 2 +# REORDER-NEXT: jmp -32 +# REORDER-EMPTY: +# REORDER-NEXT: baz: +# REORDER-NEXT: xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorderblocks.out 2>&1 | FileCheck %s --check-prefixes=WARN,IMPLICITNOSPLIT +# WARN-NOT: warning: +# IMPLICITNOSPLIT: warning: propeller: no-reorder-blocks implicitly sets no-split-funcs. +# +# RUN: llvm-objdump -d %t.propeller.noreorderblocks.out| FileCheck %s --check-prefix=NO_REORDER_BB +# +# NO_REORDER_BB: Disassembly of section .text: +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: bar: +# NO_REORDER_BB-NEXT: xorb %al, 1 +# NO_REORDER_BB-NEXT: je 9 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: a.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 2 +# NO_REORDER_BB-NEXT: jmp 7 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aaa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 4 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: foo: +# NO_REORDER_BB-NEXT: xorb %al, 0 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: qux: +# NO_REORDER_BB-NEXT: xorb %al, 6 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: baz: +# NO_REORDER_BB-NEXT: xorb %al, 5 + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.noreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.noreorderfuncs.out| FileCheck %s --check-prefix=NO_REORDER_FUNC +# +# NO_REORDER_FUNC: Disassembly of section .text: +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: foo: +# NO_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_REORDER_FUNC-NEXT: jne 22 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: qux: +# NO_REORDER_FUNC-NEXT: xorb %al, 6 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: a.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_REORDER_FUNC-NEXT: jmp -24 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: baz: +# NO_REORDER_FUNC-NEXT: xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_FUNC +# +# NO_SPLIT_FUNC: Disassembly of section .text: +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_FUNC-NEXT: jne 14 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_FUNC-NEXT: jmp -16 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: foo: +# NO_SPLIT_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: qux: +# NO_SPLIT_FUNC-NEXT: xorb %al, 6 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: baz: +# NO_SPLIT_FUNC-NEXT: xorb %al, 5 +# + +## Check that the combination of no-reorder-blocks and split-funcs makes lld fail. +# RUN: not ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-opt=split-funcs 2>&1 -o /dev/null | FileCheck %s --check-prefix=FAIL +# FAIL: propeller: Inconsistent combination of propeller optimizations 'split-funcs' and 'no-reorder-blocks'. + + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitreorderfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_REORDER_FUNC +# +# NO_SPLIT_REORDER_FUNC: Disassembly of section .text: +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: foo: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_REORDER_FUNC-NEXT: jne 14 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_REORDER_FUNC-NEXT: jmp -16 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: baz: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 5 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: qux: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 6 + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +.align 4 +foo: + xor %al,0 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,1 +# -- Begin function bar +.type bar,@function +.align 4 +bar: + xor %al,1 + je aa.BB.bar + jmp a.BB.bar + +.section .text,"ax",@progbits,unique,2 +a.BB.bar: + xor %al,2 + jmp aaa.BB.bar +.La.BB.bar_end: + .size a.BB.bar, .La.BB.bar_end-a.BB.bar + +.section .text,"ax",@progbits,unique,3 +aa.BB.bar: + xor %al,3 + jmp aaa.BB.bar +.Laa.BB.bar_end: + .size aa.BB.bar, .Laa.BB.bar_end-aa.BB.bar + +.section .text,"ax",@progbits,unique,4 +aaa.BB.bar: + xor %al,4 +.Laaa.BB.bar_end: + .size aaa.BB.bar, .Laaa.BB.bar_end-aaa.BB.bar + +.section .text,"ax",@progbits,unique,1 +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,5 +# -- Begin function baz +.type baz,@function +.align 4 +baz: + xor %al,5 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz + +.section .text,"ax",@progbits,unique,6 +# -- Begin function qux +.type qux,@function +.align 4 +qux: + xor %al,6 + +.Lqux_end: + .size qux, .Lqux_end-qux +# -- End function qux Index: lld/test/ELF/propeller/codelayout/optimal-fallthrough.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/codelayout/optimal-fallthrough.s @@ -0,0 +1,90 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises optimal basic block reordering one a single function. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. + +# RUN: echo "!foo" > %t_prof.propeller +# RUN: echo "!!1" >> %t_prof.propeller +# RUN: echo "!!3" >> %t_prof.propeller +# RUN: echo "Symbols" >> %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -propeller-debug-symbol=foo -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo