Index: lld/ELF/Propeller/CodeLayout/CodeLayout.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/CodeLayout.h
@@ -0,0 +1,42 @@
+//===- PropellerBBReordering.h  -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H
+#define LLD_ELF_PROPELLER_BB_REORDERING_H
+
+#include "NodeChainClustering.h"
+#include "PropellerCFG.h"
+#include "PropellerConfig.h"
+
+#include <list>
+#include <vector>
+
+namespace lld {
+namespace propeller {
+
+class CodeLayout {
+private:
+  // CFGs that are processed by the reordering algorithm. These are separated
+  // into hot and cold cfgs.
+  std::vector<ControlFlowGraph *> HotCFGs, ColdCFGs;
+  // The final hot and cold order containing all cfg nodes.
+  std::vector<CFGNode *> HotOrder, ColdOrder;
+  // Handle of the clustering algorithm used to further reorder the computed
+  // chains.
+  std::unique_ptr<ChainClustering> CC;
+
+public:
+  void doSplitOrder(std::list<StringRef> &symbolList,
+                    std::list<StringRef>::iterator hotPlaceHolder,
+                    std::list<StringRef>::iterator coldPlaceHolder);
+
+  void printStats();
+};
+
+} // namespace propeller
+} // namespace lld
+#endif
Index: lld/ELF/Propeller/CodeLayout/CodeLayout.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/CodeLayout.cpp
@@ -0,0 +1,181 @@
+//===- CodeLayout.cpp  ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and implements the entry point of code layout optimization
+// (doSplitOrder).
+//===----------------------------------------------------------------------===//
+#include "CodeLayout.h"
+
+#include "NodeChain.h"
+#include "NodeChainBuilder.h"
+#include "NodeChainClustering.h"
+#include "Propeller.h"
+#include "PropellerCFG.h"
+#include "PropellerConfig.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+
+#include <vector>
+
+using llvm::DenseMap;
+using llvm::DenseSet;
+
+namespace lld {
+namespace propeller {
+extern double getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward,
+                                 uint64_t);
+
+// This function iterates over the CFGs included in the Propeller profile and
+// adds them to cold and hot cfg lists. Then it appropriately performs basic
+// block reordering by calling NodeChainBuilder.doOrder() either on all CFGs (if
+// -propeller-opt=reorder-ip) or individually on every CFG. After creating all
+// the node chains, it hands the basic block chains to a ChainClustering
+// instance for further rerodering.
+void CodeLayout::doSplitOrder(std::list<StringRef> &symbolList,
+                              std::list<StringRef>::iterator hotPlaceHolder,
+                              std::list<StringRef>::iterator coldPlaceHolder) {
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
+
+  // Populate the hot and cold cfg lists by iterating over the CFGs in the
+  // propeller profile.
+  prop->forEachCfgRef([this](ControlFlowGraph &cfg) {
+    if (cfg.isHot()) {
+      HotCFGs.push_back(&cfg);
+      if (propellerConfig.optPrintStats) {
+        // Dump the number of basic blocks and hot basic blocks for every
+        // function
+        unsigned hotBBs = 0;
+        unsigned allBBs = 0;
+        cfg.forEachNodeRef([&hotBBs, &allBBs](CFGNode &node) {
+          if (node.Freq)
+            hotBBs++;
+          allBBs++;
+        });
+        fprintf(stderr, "HISTOGRAM: %s,%u,%u\n", cfg.Name.str().c_str(), allBBs,
+                hotBBs);
+      }
+    } else
+      ColdCFGs.push_back(&cfg);
+  });
+
+  if (propellerConfig.optReorderIP || propellerConfig.optReorderFuncs)
+    CC.reset(new CallChainClustering());
+  else {
+    // If function ordering is disabled, we want to conform the the initial
+    // ordering of functions in both the hot and the cold layout.
+    CC.reset(new NoOrdering());
+  }
+
+  if (propellerConfig.optReorderIP) {
+    // If -propeller-opt=reorder-ip we want to run basic block reordering on all
+    // the basic blocks of the hot CFGs.
+    NodeChainBuilder(HotCFGs).doOrder(CC);
+  } else if (propellerConfig.optReorderBlocks) {
+    // Otherwise we apply reordering on every CFG separately
+    for (ControlFlowGraph *cfg : HotCFGs)
+      NodeChainBuilder(cfg).doOrder(CC);
+  } else {
+    // If reordering is not desired, we create changes according to the initial
+    // order in the CFG.
+    for (ControlFlowGraph *cfg : HotCFGs)
+      CC->addChain(std::unique_ptr<NodeChain>(new NodeChain(cfg)));
+  }
+
+  // The order for cold cfgs remains unchanged.
+  for (ControlFlowGraph *cfg : ColdCFGs)
+    CC->addChain(std::unique_ptr<NodeChain>(new NodeChain(cfg)));
+
+  // After building all the chains, let the chain clustering algorithm perform
+  // the final reordering and populate the hot and cold cfg node orders.
+  CC->doOrder(HotOrder, ColdOrder);
+
+  // Transfter the order to the symbol list.
+  for (CFGNode *n : HotOrder)
+    symbolList.insert(hotPlaceHolder, n->ShName);
+
+  for (CFGNode *n : ColdOrder)
+    symbolList.insert(coldPlaceHolder, n->ShName);
+
+  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+  warn("[Propeller]: BB reordering took: " +
+       Twine(std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
+                 .count()));
+
+  if (propellerConfig.optPrintStats)
+    printStats();
+}
+
+// Prints statistics of the computed layout, including the number of partitions
+// for each function across the code layout, the edge count for each distance
+// level, and the ExtTSP score achieved for each function.
+void CodeLayout::printStats() {
+
+  DenseMap<CFGNode *, uint64_t> nodeAddressMap;
+  llvm::StringMap<unsigned> functionPartitions;
+  uint64_t currentAddress = 0;
+  ControlFlowGraph *currentCFG = nullptr;
+  for (CFGNode *n : HotOrder) {
+    if (currentCFG != n->CFG) {
+      currentCFG = n->CFG;
+      functionPartitions[currentCFG->Name]++;
+    }
+    nodeAddressMap[n] = currentAddress;
+    currentAddress += n->ShSize;
+  }
+
+  for (auto &elem : functionPartitions)
+    fprintf(stderr, "FUNCTION PARTITIONS: %s,%u\n", elem.first().str().c_str(),
+            elem.second);
+
+  std::vector<uint64_t> distances({0, 128, 640, 1028, 4096, 65536, 2 << 20,
+                                   std::numeric_limits<uint64_t>::max()});
+  std::map<uint64_t, uint64_t> histogram;
+  llvm::StringMap<double> extTSPScoreMap;
+  for (CFGNode *n : HotOrder) {
+    auto scoreEntry = extTSPScoreMap.try_emplace(n->CFG->Name, 0).first;
+    n->forEachOutEdgeRef([&nodeAddressMap, &distances, &histogram,
+                          &scoreEntry](CFGEdge &edge) {
+      if (!edge.Weight || edge.isReturn())
+        return;
+      if (nodeAddressMap.find(edge.Src) == nodeAddressMap.end() ||
+          nodeAddressMap.find(edge.Sink) == nodeAddressMap.end()) {
+        warn("Found a hot edge whose source and sink do not show up in the "
+             "layout!");
+        return;
+      }
+      uint64_t srcOffset = nodeAddressMap[edge.Src];
+      uint64_t sinkOffset = nodeAddressMap[edge.Sink];
+      bool edgeForward = srcOffset + edge.Src->ShSize <= sinkOffset;
+      uint64_t srcSinkDistance =
+          edgeForward ? sinkOffset - srcOffset - edge.Src->ShSize
+                      : srcOffset - sinkOffset + edge.Src->ShSize;
+
+      if (edge.Type == CFGEdge::EdgeType::INTRA_FUNC ||
+          edge.Type == CFGEdge::EdgeType::INTRA_DYNA)
+        scoreEntry->second +=
+            getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance);
+
+      auto res =
+          std::lower_bound(distances.begin(), distances.end(), srcSinkDistance);
+      histogram[*res] += edge.Weight;
+    });
+  }
+
+  for (auto &elem : extTSPScoreMap)
+    fprintf(stderr, "Ext TSP Score: %s %.6f\n", elem.first().str().c_str(),
+            elem.second);
+  fprintf(stderr, "DISTANCE HISTOGRAM: ");
+  for (auto elem : histogram)
+    fprintf(stderr, "\t[%lu -> %lu]", elem.first, elem.second);
+  fprintf(stderr, "\n");
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h
@@ -0,0 +1,387 @@
+//===- ModifiablePriorityQueue.h  ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a priority-queue data structure for storing key-value
+// pairs, with compare operations for both the key and the value type given.
+// It stores these pairs in a max-heap data structure which allows for fast O(lg
+// n)-time retrieval of the maximum element.
+// To compare two key-value pairs, the values parts are compared first. The keys
+// are compared in case of a tie. No two key-value pairs could have the same
+// key.
+//
+// The key-value pairs are also accessible using a map data structure.
+//
+// In contrast to C++ STL's priority queue, this data structure allows for fast
+// O(lg n)-time value update, or removal of any element while keep the max-heap
+// data structure sound.
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H
+#define LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <memory>
+#include <string.h>
+#include <vector>
+
+// This class defines a node in the ModifiablePriorityQueue data structure
+// containing a key, a value, pointers to its parent, and its left and right
+// children. This class must be instantiated with the key-type, the value type,
+// and their comparators.
+template <class K, class V, class CmpK, class CmpV> class HeapNode {
+public:
+  K key;
+  V value;
+  // Pointer to the parent (this is nullptr for root).
+  HeapNode<K, V, CmpK, CmpV> *parent;
+  // Pointers to the two children
+  HeapNode<K, V, CmpK, CmpV> *children[2];
+
+  HeapNode(K k, V v)
+      : key(k), value(std::move(v)), parent(nullptr), children() {}
+
+  // Get pointer to the left child
+  HeapNode<K, V, CmpK, CmpV> *leftChild() { return children[0]; }
+
+  // Get pointer to the right child
+  HeapNode<K, V, CmpK, CmpV> *rightChild() { return children[1]; }
+
+  // Whether this is the left child of its parent, returns false if this is the
+  // root.
+  bool isLeftChild() {
+    if (parent)
+      return parent->leftChild() == this;
+    return false;
+  }
+
+  // Whether this is the right child of its parent, returns false if this is the
+  // root.
+  bool isRightChild() {
+    if (parent)
+      return parent->rightChild() == this;
+    return false;
+  }
+
+  // Set the left child for this node.
+  void adoptLeftChild(HeapNode<K, V, CmpK, CmpV> *c) {
+    children[0] = c;
+    if (c)
+      c->parent = this;
+  }
+
+  // Set the right child for this node.
+  void adoptRightChild(HeapNode<K, V, CmpK, CmpV> *c) {
+    children[1] = c;
+    if (c)
+      c->parent = this;
+  }
+
+  // Set both the left and right children for this node.
+  void adoptChildren(HeapNode<K, V, CmpK, CmpV> *(&_children)[2]) {
+    adoptLeftChild(_children[0]);
+    adoptRightChild(_children[1]);
+  }
+
+  // Return a string representation for this node given an indentation level
+  // which is used for pretty-printing of the heap tree.
+  // This assumes that
+  // std::to_string has been specialized for the key and value types.
+  std::string toString(unsigned level) {
+    std::string str = std::string(level, ' ');
+    str += "NODE: " + std::to_string(key) + " -> " + std::to_string(value);
+    for (auto *c : children) {
+      if (c) {
+        str += "\n";
+        str += c->toString(level + 1);
+      }
+    }
+    return str;
+  }
+};
+
+// Comparator class for nodes given comparators for the key and value types. It
+// compares the values for the two pairs first and in the case of a tie, it
+// compares the keys.
+template <class K, class V, class CmpK, class CmpV> struct CompareHeapNode {
+  CmpK KeyComparator;
+  CmpV ValueComparator;
+  bool operator()(const HeapNode<K, V, CmpK, CmpV> &n1,
+                  const HeapNode<K, V, CmpK, CmpV> &n2) const {
+    if (ValueComparator(n1.value, n2.value))
+      return true;
+    if (ValueComparator(n2.value, n1.value))
+      return false;
+    return KeyComparator(n1.key, n2.key);
+  }
+};
+
+// This class implements a pointer-based heap-like data structure for storing
+// key-value pairs with efficient O(lg n) retrieval, removal, update, and
+// insertion of nodes.
+//
+// Any insertion happens at the lowest-right-most empty leaf position. This
+// position is accessible via the root pointer of the tree using the binary
+// representation of [size of the tree + 1]. Starting with the left-most bit, we
+// take the left child if the bit is zero and the right child if the bit is one.
+//
+// Deletion of a node happens by replacing the last node (lowest-right-most
+// occupied leaf), with the given node, removing the node from the tree, and
+// finally heapifying the replaced node up and down to keep the
+// soundness of the heap.
+//
+// Value-update is the simplest, which requires updating the node's value
+// followed by heapifying up the node up and down.
+template <class K, class V, class CmpK, class CmpV>
+class ModifiablePriorityQueue {
+private:
+  // Comparator instance for comparing HeapNodes.
+  CompareHeapNode<K, V, CmpK, CmpV> HeapNodeComparator;
+
+  // Pointers to the nodes in this heap are stored in a map to allow for fast
+  // lookup of nodes given their keys.
+  llvm::DenseMap<K, std::unique_ptr<HeapNode<K, V, CmpK, CmpV>>> nodes;
+
+  // The tree-root for this priority queue, which holds the max key-value.
+  HeapNode<K, V, CmpK, CmpV> *root = nullptr;
+
+  // Total number of nodes in the tree.
+  unsigned Size = 0;
+
+  // Set the root for this tree and set it's parent to null.
+  void assignRoot(HeapNode<K, V, CmpK, CmpV> *node) {
+    root = node;
+    if (root)
+      root->parent = nullptr;
+  }
+
+  // Helper for getting a pointer to a node using a handle using its bits.
+  HeapNode<K, V, CmpK, CmpV> *getNodeWithHandle(unsigned handle) {
+    return getNodeWithHandleHelper(root, handle);
+  }
+
+  // This function traverses the tree from the given node, using an integer
+  // handle for direction. Going from left to right in the binary representation
+  // of the handle, take right if the bit is one, and left if the bit is zero.
+  HeapNode<K, V, CmpK, CmpV> *
+  getNodeWithHandleHelper(HeapNode<K, V, CmpK, CmpV> *node, unsigned handle) {
+    if (handle == 1)
+      return node;
+    auto *p = getNodeWithHandleHelper(node, handle >> 1);
+    return (handle & 1) ? p->rightChild() : p->leftChild();
+  }
+
+  // Insert a node into the heap tree.
+  void insert(HeapNode<K, V, CmpK, CmpV> *node) {
+    if (!root) {
+      // If the tree is empty, we only need to set the root.
+      assignRoot(node);
+    } else {
+      // Otherwise, insert at the lowest-right-most leaf empty-position, which
+      // is encoded by [Size + 1]
+      unsigned handle = Size + 1;
+      // We need to find the parent of the target position.
+      auto *p = getNodeWithHandleHelper(root, handle >> 1);
+      // Attach this to the left or right of the parent node, depending on the
+      // last bit in handle.
+      if (handle & 1)
+        p->adoptRightChild(node);
+      else
+        p->adoptLeftChild(node);
+      // We only need to heapify up as this node has no children.
+      heapifyUp(node);
+    }
+    // Increment the size of the tree.
+    Size++;
+  }
+
+  // This function removes a node from the heap and returns its value.
+  // Instead of removing the node directly, it moves the lowest-right-most node
+  // to the position of the given node and heapifies it up and down.
+  //
+  // After this function, the removed node's HeapNode pointer is invalid.
+  V remove(HeapNode<K, V, CmpK, CmpV> *node) {
+    // Find the lowest-right-most node
+    auto *last = getNodeWithHandleHelper(root, Size);
+    assert(last->leftChild() == nullptr && last->rightChild() == nullptr);
+
+    // Remove the node from the children of its parents.
+    if (last->parent) {
+      if (last->isLeftChild())
+        last->parent->adoptLeftChild(nullptr);
+      else
+        last->parent->adoptRightChild(nullptr);
+    }
+
+    if (node == last) {
+      // If we are removing the last node, not much needs to be done.
+      if (node->parent == nullptr) {
+        assert(node == root);
+        assignRoot(nullptr);
+      }
+    } else { // node != last
+      if (node->parent) {
+        // Move the last node in place of the to-be-removed node.
+        if (node->isLeftChild())
+          node->parent->adoptLeftChild(last);
+        else
+          node->parent->adoptRightChild(last);
+      } else { // node->parent == nullptr, so node is root
+        assert(node == root);
+        assignRoot(last);
+      }
+      // Let the moved-node adopt the children of the to-be-removed node
+      last->adoptChildren(node->children);
+      // Finally, heapify the moved node up and down.
+      if (!heapifyUp(last))
+        heapifyDown(last);
+    }
+
+    // Decrement the number of nodes.
+    Size--;
+    // Save the node's value to be returned.
+    auto nodeElement = std::move(node->value);
+    // Remove the node from the key->node mapping, effectively deallocating the
+    // HeapNode.
+    nodes.erase(node->key);
+    return nodeElement;
+  }
+
+  // Recursively rectify the max-heap upwards from a node.
+  // This takes O(lg n) time.
+  // Returns true if any modifications occurs.
+  bool heapifyUp(HeapNode<K, V, CmpK, CmpV> *node) {
+    // Heapify upwards until no more inconsistency is found.
+    if (node->parent && HeapNodeComparator(*node->parent, *node)) {
+      swapWithParent(node);
+      heapifyUp(node);
+      return true;
+    }
+    return false;
+  }
+
+  // Recursively rectify the max-heap downwards from a node.
+  // This takes O(lg n) time.
+  // Returns true if any modifications occurs.
+  bool heapifyDown(HeapNode<K, V, CmpK, CmpV> *node) {
+    auto maxChild = std::max_element(
+        node->children, node->children + 2,
+        [this](const HeapNode<K, V, CmpK, CmpV> *c1,
+               const HeapNode<K, V, CmpK, CmpV> *c2) {
+          // This compares two HeapNode pointers, relying on
+          // HeapNodeComparator for non-null pointers.
+          return c1 == nullptr
+                     ? true
+                     : (c2 == nullptr ? false : HeapNodeComparator(*c1, *c2));
+        });
+    // Move the maximum child to the parent if required and further heapify
+    // down.
+    if (*maxChild != nullptr && HeapNodeComparator(*node, **maxChild)) {
+      swapWithParent(*maxChild);
+      heapifyDown(node);
+      return true;
+    }
+    return false;
+  }
+
+  // Helper function for swapping a node with its parent.
+  // Can only be called if the given node has a parent. Also, the given node's
+  // value should compare bigger than both its parent and its sibling.
+  // For example swapWithParent(C1) results in the following.
+  //            G                    G
+  //           /                    /
+  //          P        ->          C1
+  //         / \                  / \
+  //        C1 C2                P  C2
+  //       / \                  / \
+  //      A   B                A   B
+  // This function merely properlly corrects the parent-child relationships.
+  void swapWithParent(HeapNode<K, V, CmpK, CmpV> *node) {
+    auto *par = node->parent;
+    assert(par);
+    auto *gpar = node->parent->parent;
+    if (!gpar) {
+      assert(this->root == par);
+      this->assignRoot(node);
+    } else {
+      if (par->isLeftChild())
+        gpar->adoptLeftChild(node);
+      else
+        gpar->adoptRightChild(node);
+    }
+    auto *par_old_left = par->leftChild();
+    auto *par_old_right = par->rightChild();
+
+    par->adoptChildren(node->children);
+    if (par_old_left == node) {
+      node->adoptLeftChild(par);
+      node->adoptRightChild(par_old_right);
+    } else {
+      node->adoptLeftChild(par_old_left);
+      node->adoptRightChild(par);
+    }
+  }
+
+public:
+  // Insert a key-value pair into the heap.
+  // If the same key exists in the heap, it updates the value of the existing
+  // HeapNode and heapifies the node up or down.
+  void insert(K key, V value) {
+    auto cur = nodes.find(key);
+    if (cur != nodes.end()) {
+      cur->second->value = std::move(value);
+      if (!heapifyUp(cur->second.get()))
+        heapifyDown(cur->second.get());
+    } else {
+      // If the key is not found, create a new HeapNode and insert it into the
+      // heap.
+      auto *node = new HeapNode<K, V, CmpK, CmpV>(key, std::move(value));
+      insert(node);
+      nodes.try_emplace(key, node);
+    }
+  }
+
+  // Removes the node associated with a key from the heap if one exists.
+  void erase(K k) {
+    auto cur = nodes.find(k);
+    if (cur != nodes.end())
+      remove(cur->second.get());
+  }
+
+  // Retrieves the HeapNode corresponding to a given key (returns nullptr if no
+  // such node exists).
+  HeapNode<K, V, CmpK, CmpV> *get(K k) {
+    auto it = nodes.find(k);
+    return it == nodes.end() ? nullptr : it->second.get();
+  }
+
+  // Removes the root (max-element) from the heap.
+  void pop() {
+    assert(!empty());
+    remove(root);
+  }
+
+  // Returns the value of the max-element node.
+  V top() { return std::move(root->value); }
+
+  bool empty() { return Size == 0; }
+
+  unsigned size() { return Size; };
+
+  // Returns a string representation of the heap tree.
+  std::string toString() {
+    std::string str;
+    str += "HEAP with ";
+    str += std::to_string(Size);
+    str += " nodes\n";
+    if (root)
+      str += root->toString(0);
+    return str;
+  }
+};
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChain.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChain.h
@@ -0,0 +1,124 @@
+//===- PropellerNodeChain.h  ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_NODE_CHAIN_H
+#define LLD_ELF_PROPELLER_NODE_CHAIN_H
+
+#include "PropellerCFG.h"
+#include "llvm/ADT/DenseSet.h"
+
+#include <list>
+#include <unordered_map>
+#include <vector>
+
+using llvm::DenseSet;
+
+namespace lld {
+namespace propeller {
+
+// Represents a chain of nodes (basic blocks).
+class NodeChain {
+public:
+  // Representative node of the chain, with which it is initially constructed.
+  CFGNode *DelegateNode;
+
+  // CFG of the nodes in this chain (this will be null if the nodes come from
+  // more than one cfg.
+  ControlFlowGraph *CFG;
+
+  // Ordered list of the nodes in this chain.
+  std::list<CFGNode *> Nodes;
+
+  // Iterators to the positions in the chain where the chain transitions from
+  // one function to another.
+  std::list<std::list<CFGNode *>::iterator> FunctionTransitions;
+
+  // Out edges for this chain, to its own nodes and nodes of other chains.
+  std::unordered_map<NodeChain *, std::vector<CFGEdge *>> OutEdges;
+
+  // Chains which have outgoing edges to this chain.
+  DenseSet<NodeChain *> InEdges;
+
+  // Total binary size of the chain.
+  uint64_t Size;
+
+  // Total execution frequency of the chain.
+  uint64_t Freq;
+
+  // Extended TSP score of the chain.
+  double Score = 0;
+
+  // Whether to print out information about how this chain joins with others.
+  bool DebugChain;
+
+  // Constructor for building a NodeChain from a single Node
+  NodeChain(CFGNode *node)
+      : DelegateNode(node), CFG(node->CFG), Nodes(1, node), Size(node->ShSize),
+        Freq(node->Freq), DebugChain(node->CFG->DebugCFG) {}
+
+  // Constructor for building a NodeChain from all nodes in the CFG according to
+  // the initial order.
+  NodeChain(ControlFlowGraph *cfg)
+      : DelegateNode(cfg->getEntryNode()), CFG(cfg), Size(cfg->Size), Freq(0),
+        DebugChain(cfg->DebugCFG) {
+    cfg->forEachNodeRef([this](CFGNode &node) {
+      Nodes.push_back(&node);
+      Freq += node.Freq;
+    });
+  }
+
+  // Helper function to iterate over the outgoing edges of this chain to a
+  // specific chain, while applying a given function on each edge.
+  template <class Visitor>
+  void forEachOutEdgeToChain(NodeChain *chain, Visitor V) {
+    auto it = OutEdges.find(chain);
+    if (it == OutEdges.end())
+      return;
+    for (CFGEdge *E : it->second)
+      V(*E, this, chain);
+  }
+
+  // This returns the execution density of the chain.
+  double execDensity() const {
+    return ((double)Freq) / std::max(Size, (uint64_t)1);
+  }
+};
+
+// This returns a string representation of the chain
+std::string toString(const NodeChain &c);
+
+} // namespace propeller
+} // namespace lld
+
+namespace std {
+// Specialization of std::less for NodeChain, which allows for consistent
+// tie-breaking in our Map data structures.
+template <> struct less<lld::propeller::NodeChain *> {
+  bool operator()(const lld::propeller::NodeChain *c1,
+                  const lld::propeller::NodeChain *c2) const {
+    return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr;
+  }
+};
+
+// Specialization of std::less for pair<NodeChain,NodeChain>, which allows for
+// consistent tie-breaking in our Map data structures.
+template <>
+struct less<pair<lld::propeller::NodeChain *, lld::propeller::NodeChain *>> {
+  bool operator()(
+      const pair<lld::propeller::NodeChain *, lld::propeller::NodeChain *> p1,
+      const pair<lld::propeller::NodeChain *, lld::propeller::NodeChain *> p2)
+      const {
+    if (less<lld::propeller::NodeChain *>()(p1.first, p2.first))
+      return true;
+    if (less<lld::propeller::NodeChain *>()(p2.first, p1.first))
+      return false;
+    return less<lld::propeller::NodeChain *>()(p1.second, p2.second);
+  }
+};
+} // namespace std
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChain.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChain.cpp
@@ -0,0 +1,32 @@
+//===- NodeChain.cpp  -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "NodeChain.h"
+
+namespace lld {
+namespace propeller {
+
+std::string toString(const NodeChain &c) {
+  std::string str;
+  size_t cfgNameLength = c.DelegateNode->CFG->Name.size();
+  str += c.DelegateNode->CFG->Name.str() + " [ ";
+  for (auto *n : c.Nodes) {
+    str += n->CFG->getEntryNode() == n
+               ? "Entry"
+               : std::to_string(n->ShName.size() - cfgNameLength - 4);
+    str += " (size=" + std::to_string(n->ShSize) +
+           ", freq=" + std::to_string(n->Freq) + ")";
+    if (n != c.Nodes.back())
+      str += " -> ";
+  }
+  str += " ]";
+  str += " score: " + std::to_string(c.Score);
+  return str;
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h
@@ -0,0 +1,184 @@
+//===- PropellerNodeChainAssembly.h  --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file includes the declaration of the NodeChainAssembly class. Each
+// instance of this class gives a recipe for merging two NodeChains together in
+// addition to the ExtTSP score gain that will be achieved by that merge. Each
+// NodeChainAssembly consists of three NodeChainSlices from two node chains: the
+// (potentially) splitted chain, and the unsplit chain. A NodeChainSlice has
+// represents a slice of a NodeChain by storing iterators to the beginning and
+// end of that slice in the node chain, plus the binary offsets at which these
+// slices begin and end. The offsets allow effient computation of the gain in
+// ExtTSP score.
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_NODE_CHAIN_ASSEMBLY_H
+#define LLD_ELF_PROPELLER_NODE_CHAIN_ASSEMBLY_H
+
+#include "NodeChain.h"
+#include "PropellerCFG.h"
+
+#include <list>
+#include <vector>
+
+namespace lld {
+namespace propeller {
+
+double getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward,
+                          uint64_t srcSinkDistance);
+
+class NodeChainSlice {
+public:
+  // Chain from which this slice comes from
+  NodeChain *Chain;
+
+  // The endpoints of the slice in the corresponding chain
+  std::list<CFGNode *>::iterator Begin, End;
+
+  // The offsets corresponding to the two endpoints
+  uint64_t BeginOffset, EndOffset;
+
+  // Constructor for building a chain slice from a given chain and the two
+  // endpoints of the chain.
+  NodeChainSlice(NodeChain *c, std::list<CFGNode *>::iterator begin,
+                 std::list<CFGNode *>::iterator end)
+      : Chain(c), Begin(begin), End(end) {
+
+    BeginOffset = (*begin)->ChainOffset;
+    if (End == Chain->Nodes.end())
+      EndOffset = Chain->Size;
+    else
+      EndOffset = (*end)->ChainOffset;
+  }
+
+  // (Binary) size of this slice
+  uint64_t size() const { return EndOffset - BeginOffset; }
+};
+
+// This enum represents the order in which three slices (X1, X2, and Y) are
+// merged together.
+enum MergeOrder {
+  Begin,
+  X2X1Y = Begin,
+  BeginNext,
+  X1YX2 = BeginNext,
+  X2YX1,
+  YX2X1,
+  End
+};
+
+class NodeChainAssembly {
+public:
+  // The gain in ExtTSP score achieved by this NodeChainAssembly once it
+  // is accordingly applied to the two chains.
+  // This is effectively equal to "Score - splitChain->Score -
+  // unsplitChain->Score".
+  double ScoreGain = 0;
+
+  // The two chains, the first being the splitChain and the second being the
+  // unsplitChain.
+  std::pair<NodeChain *, NodeChain *> ChainPair;
+
+  // The slice position of splitchain
+  std::list<CFGNode *>::iterator SlicePosition;
+
+  // The three chain slices
+  std::vector<NodeChainSlice> Slices;
+
+  // The merge order of the slices
+  MergeOrder MOrder;
+
+  // The constructor for creating a NodeChainAssembly. slicePosition must be an
+  // iterator into chainX.
+  NodeChainAssembly(NodeChain *chainX, NodeChain *chainY,
+                    std::list<CFGNode *>::iterator slicePosition,
+                    MergeOrder mOrder)
+      : ChainPair(chainX, chainY), SlicePosition(slicePosition),
+        MOrder(mOrder) {
+    NodeChainSlice x1(chainX, chainX->Nodes.begin(), SlicePosition);
+    NodeChainSlice x2(chainX, SlicePosition, chainX->Nodes.end());
+    NodeChainSlice y(chainY, chainY->Nodes.begin(), chainY->Nodes.end());
+
+    switch (MOrder) {
+    case MergeOrder::X2X1Y:
+      Slices = {std::move(x2), std::move(x1), std::move(y)};
+      break;
+    case MergeOrder::X1YX2:
+      Slices = {std::move(x1), std::move(y), std::move(x2)};
+      break;
+    case MergeOrder::X2YX1:
+      Slices = {std::move(x2), std::move(y), std::move(x1)};
+      break;
+    case MergeOrder::YX2X1:
+      Slices = {std::move(y), std::move(x2), std::move(x1)};
+      break;
+    default:
+      assert("Invalid MergeOrder!" && false);
+    }
+
+    // Set the ExtTSP Score gain as the difference between the new score after
+    // merging these chains and the current scores of the two chains.
+    ScoreGain =
+        computeExtTSPScore() - splitChain()->Score - unsplitChain()->Score;
+  }
+
+  bool isValid() { return ScoreGain > 0.0001; }
+
+  // Find the NodeChainSlice in this NodeChainAssembly which contains the given
+  // node. If the node is not contained in this NodeChainAssembly, then return
+  // false. Otherwise, set idx equal to the index of the corresponding slice and
+  // return true.
+  bool findSliceIndex(CFGNode *node, NodeChain *chain, uint64_t offset,
+                      uint8_t &idx) const;
+
+  // This function computes the ExtTSP score for a chain assembly record. This
+  // goes the three BB slices in the assembly record and considers all edges
+  // whose source and sink belongs to the chains in the assembly record.
+  double computeExtTSPScore() const;
+
+  // First node in the resulting assembled chain.
+  CFGNode *getFirstNode() const {
+    for (auto &slice : Slices)
+      if (slice.Begin != slice.End)
+        return *slice.Begin;
+    return nullptr;
+  }
+
+  struct CompareNodeChainAssembly {
+    bool operator()(const std::unique_ptr<NodeChainAssembly> &a1,
+                    const std::unique_ptr<NodeChainAssembly> &a2) const;
+  };
+
+  friend class NodeChainBuilder;
+
+  // We delete the copy constructor to make sure NodeChainAssembly is moved
+  // rather than copied.
+  NodeChainAssembly(NodeChainAssembly &&) = default;
+  // copy constructor is implicitly deleted
+  // NodeChainAssembly(NodeChainAssembly&) = delete;
+  NodeChainAssembly() = delete;
+
+  std::pair<uint8_t, size_t> assemblyStrategy() const {
+    return std::make_pair(MOrder, (*SlicePosition)->MappedAddr);
+  }
+
+  inline bool split() const {
+    return SlicePosition != splitChain()->Nodes.begin();
+  }
+
+  inline NodeChain *splitChain() const { return ChainPair.first; }
+
+  inline NodeChain *unsplitChain() const { return ChainPair.second; }
+
+  inline MergeOrder mergeOrder() const { return MOrder; }
+};
+
+std::string toString(NodeChainAssembly &assembly);
+
+} // namespace propeller
+} // namespace lld
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp
@@ -0,0 +1,204 @@
+//===- NodeChainAssembly.cpp  ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "NodeChainAssembly.h"
+#include "NodeChain.h"
+
+namespace lld {
+namespace propeller {
+
+// Return the Extended TSP score for one edge, given its source to sink
+// direction and distance in the layout.
+double getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward,
+                          uint64_t srcSinkDistance) {
+  // Approximate callsites to be in the middle of the source basic block.
+  if (edge.isCall()) {
+    if (isEdgeForward)
+      srcSinkDistance += edge.Src->ShSize / 2;
+    else
+      srcSinkDistance -= edge.Src->ShSize / 2;
+  }
+
+  if (edge.isReturn()) {
+    if (isEdgeForward)
+      srcSinkDistance += edge.Sink->ShSize / 2;
+    else
+      srcSinkDistance -= edge.Sink->ShSize / 2;
+  }
+
+  if (srcSinkDistance == 0 && (edge.Type == CFGEdge::EdgeType::INTRA_FUNC ||
+                               edge.Type == CFGEdge::EdgeType::INTRA_DYNA))
+    return edge.Weight * propellerConfig.optFallthroughWeight;
+
+  if (isEdgeForward && srcSinkDistance < propellerConfig.optForwardJumpDistance)
+    return edge.Weight * propellerConfig.optForwardJumpWeight *
+           (1.0 -
+            ((double)srcSinkDistance) / propellerConfig.optForwardJumpDistance);
+
+  if (!isEdgeForward &&
+      srcSinkDistance < propellerConfig.optBackwardJumpDistance)
+    return edge.Weight * propellerConfig.optBackwardJumpWeight *
+           (1.0 - ((double)srcSinkDistance) /
+                      propellerConfig.optBackwardJumpDistance);
+  return 0;
+}
+
+bool NodeChainAssembly::findSliceIndex(CFGNode *node, NodeChain *chain,
+                                       uint64_t offset, uint8_t &idx) const {
+  for (idx = 0; idx < 3; ++idx) {
+    if (chain != Slices[idx].Chain)
+      continue;
+    // We find if the node's offset lies with the begin and end offset of this
+    // slice.
+    if (offset < Slices[idx].BeginOffset || offset > Slices[idx].EndOffset)
+      continue;
+    if (offset < Slices[idx].EndOffset && offset > Slices[idx].BeginOffset)
+      return true;
+    // A node can have zero size, which means multiple nodes may be associated
+    // with the same offset. This means that if the node's offset is at the
+    // beginning or the end of the slice, the node may reside in either slices
+    // of the chain.
+    if (offset == Slices[idx].EndOffset) {
+      // If offset is at the end of the slice, iterate backwards over the
+      // slice to find a zero-sized node.
+      for (auto nodeIt = std::prev(Slices[idx].End);
+           nodeIt != std::prev(Slices[idx].Begin); nodeIt--) {
+        // Stop iterating if the node's size is non-zero as this would change
+        // the offset.
+        if ((*nodeIt)->ShSize)
+          break;
+        // Have we found the node?
+        if (*nodeIt == node)
+          return true;
+      }
+    }
+    if (offset == Slices[idx].BeginOffset) {
+      // If offset is at the beginning of the slice, iterate forwards over the
+      // slice to find the node.
+      for (auto nodeIt = Slices[idx].Begin; nodeIt != Slices[idx].End;
+           nodeIt++) {
+        if (*nodeIt == node)
+          return true;
+        // Stop iterating if the node's size is non-zero as this would change
+        // the offset.
+        if ((*nodeIt)->ShSize)
+          break;
+      }
+    }
+  }
+  return false;
+}
+
+// This function computes the ExtTSP score for a chain assembly record. This
+// goes the three BB slices in the assembly record and considers all edges
+// whose source and sink belongs to the chains in the assembly record.
+double NodeChainAssembly::computeExtTSPScore() const {
+  // Zero-initialize the score.
+  double score = 0;
+
+  auto addEdgeScore = [this, &score](CFGEdge &edge, NodeChain *srcChain,
+                                     NodeChain *sinkChain) {
+    uint8_t srcSliceIdx, sinkSliceIdx;
+    auto srcNodeOffset = edge.Src->ChainOffset;
+    auto sinkNodeOffset = edge.Sink->ChainOffset;
+    if (!findSliceIndex(edge.Src, srcChain, srcNodeOffset, srcSliceIdx))
+      return;
+
+    if (!findSliceIndex(edge.Sink, sinkChain, sinkNodeOffset, sinkSliceIdx))
+      return;
+
+    bool edgeForward = (srcSliceIdx < sinkSliceIdx) ||
+                       (srcSliceIdx == sinkSliceIdx &&
+                        (srcNodeOffset + edge.Src->ShSize <= sinkNodeOffset));
+
+    uint64_t srcSinkDistance = 0;
+
+    if (srcSliceIdx == sinkSliceIdx) {
+      srcSinkDistance = edgeForward
+                            ? sinkNodeOffset - srcNodeOffset - edge.Src->ShSize
+                            : srcNodeOffset - sinkNodeOffset + edge.Src->ShSize;
+    } else {
+      const NodeChainSlice &srcSlice = Slices[srcSliceIdx];
+      const NodeChainSlice &sinkSlice = Slices[sinkSliceIdx];
+      srcSinkDistance =
+          edgeForward
+              ? srcSlice.EndOffset - srcNodeOffset - edge.Src->ShSize +
+                    sinkNodeOffset - sinkSlice.BeginOffset
+              : srcNodeOffset - srcSlice.BeginOffset + edge.Src->ShSize +
+                    sinkSlice.EndOffset - sinkNodeOffset;
+      // Increment the distance by the size of the middle slice if the src
+      // and sink are from the two ends.
+      if (std::abs(sinkSliceIdx - srcSliceIdx) == 2)
+        srcSinkDistance += Slices[1].size();
+    }
+
+    score += getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance);
+  };
+
+  // No changes will be made to the score that is contributed by the unsplit
+  // chain and we can simply increment by the chain's stored score.
+  score += unsplitChain()->Score;
+
+  // We need to recompute the score induced by the split chain (if it has really
+  // been split) as the offsets of the nodes have changed.
+  if (split())
+    splitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore);
+  else
+    score += splitChain()->Score;
+
+  // Consider the contribution to score for inter-chain edges.
+  splitChain()->forEachOutEdgeToChain(unsplitChain(), addEdgeScore);
+  unsplitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore);
+
+  return score;
+}
+
+bool NodeChainAssembly::CompareNodeChainAssembly::operator()(
+    const std::unique_ptr<NodeChainAssembly> &a1,
+    const std::unique_ptr<NodeChainAssembly> &a2) const {
+
+  if (a1->ScoreGain == a2->ScoreGain) {
+    if (std::less<std::pair<NodeChain *, NodeChain *>>()(a1->ChainPair,
+                                                         a2->ChainPair))
+      return true;
+    if (std::less<std::pair<NodeChain *, NodeChain *>>()(a2->ChainPair,
+                                                         a1->ChainPair))
+      return false;
+    return a1->assemblyStrategy() < a2->assemblyStrategy();
+  }
+  return a1->ScoreGain < a2->ScoreGain;
+}
+
+static std::string toString(MergeOrder mOrder) {
+  switch (mOrder) {
+  case MergeOrder::X2X1Y:
+    return "X2X1Y";
+  case MergeOrder::X1YX2:
+    return "X1YX2";
+  case MergeOrder::X2YX1:
+    return "X2YX1";
+  case MergeOrder::YX2X1:
+    return "YX2X1";
+  default:
+    assert("Invalid MergeOrder!" && false);
+    return "";
+  }
+}
+
+std::string toString(NodeChainAssembly &assembly) {
+  std::string str("assembly record between:\n");
+  str += lld::propeller::toString(*assembly.splitChain()) + " as X\n";
+  str += lld::propeller::toString(*assembly.unsplitChain()) + " as Y\n";
+  // str += "split position (X):, " + std::to_string(assembly.SlicePosition -
+  // assembly.splitChain()->Nodes.begin()) + "\n";
+  str += "merge order: " + lld::propeller::toString(assembly.MOrder) + "\n";
+  str += "ScoreGain: " + std::to_string(assembly.ScoreGain);
+  return str;
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h
@@ -0,0 +1,126 @@
+//===- PropellerNodeChainBuilder.h  ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_NODE_CHAIN_BUILDER_H
+#define LLD_ELF_PROPELLER_NODE_CHAIN_BUILDER_H
+
+#include "ModifiablePriorityQueue.h"
+#include "NodeChain.h"
+#include "NodeChainAssembly.h"
+#include "NodeChainClustering.h"
+#include "PropellerCFG.h"
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using llvm::DenseMap;
+
+namespace lld {
+namespace propeller {
+
+// BB Chain builder based on the ExtTSP metric
+class NodeChainBuilder {
+private:
+  NodeChainAssembly::CompareNodeChainAssembly NodeChainAssemblyComparator;
+  // Cfgs repreresenting the functions that are reordered
+  std::vector<ControlFlowGraph *> CFGs;
+
+  // Set of built chains, keyed by section index of their Delegate Nodes.
+  // Chains are removed from this Map once they are merged into other chains.
+  DenseMap<uint64_t, std::unique_ptr<NodeChain>> Chains;
+
+  // All the initial chains, seperated into connected components
+  std::vector<std::vector<NodeChain *>> Components;
+
+  // NodeChainBuilder performs BB ordering component by component.
+  // This is the component number that the chain builder is currently working
+  // on.
+  unsigned CurrentComponent;
+
+  // These represent all the edges which are -- based on the profile -- the only
+  // (executed) outgoing edges from their source node and the only (executed)
+  // incoming edges to their sink nodes. The algorithm will make sure that these
+  // edges form fall-throughs in the final order.
+  DenseMap<CFGNode *, CFGNode *> MutuallyForcedOut;
+
+  // This maps every (ordered) pair of chains (with the first chain in the pair
+  // potentially splittable) to the highest-gain NodeChainAssembly for those
+  // chains. The Heap data structure allows fast retrieval of the maximum gain
+  // NodeChainAssembly, along with fast update.
+  ModifiablePriorityQueue<std::pair<NodeChain *, NodeChain *>,
+                          std::unique_ptr<NodeChainAssembly>,
+                          std::less<std::pair<NodeChain *, NodeChain *>>,
+                          NodeChainAssembly::CompareNodeChainAssembly>
+      NodeChainAssemblies;
+
+  // This map stores the candidate chains for each chain.
+  //
+  // For every chain, its candidate chains are the chains which can increase the
+  // overall ExtTSP score when merged with that chain. This is used to update
+  // the NodeChainAssemblies map whenever chains merge together. The candidate
+  // chains of a chain may also be updated as result of a merge.
+  std::unordered_map<NodeChain *, std::unordered_set<NodeChain *>>
+      CandidateChains;
+
+  void coalesceChains();
+  void initializeExtTSP();
+
+  // This function separates builds the connected components for the chains so
+  // it can later merge each connected component separately.
+  // Chains which are not connected to each other via profile edges will never
+  // be merged together by NodeChainBuilder. However, the composed chains may
+  // later be interleaved by ChainClustering.
+  void initializeChainComponents();
+  void attachFallThroughs();
+
+  // This function tries to place two nodes immediately adjacent to
+  // each other (used for fallthroughs).
+  // Returns true if this can be done.
+  bool attachNodes(CFGNode *src, CFGNode *sink);
+
+  void mergeChainEdges(NodeChain *splitChain, NodeChain *unsplitChain);
+
+  void mergeInOutEdges(NodeChain *mergerChain, NodeChain *mergeeChain);
+  void mergeChains(NodeChain *leftChain, NodeChain *rightChain);
+  void mergeChains(std::unique_ptr<NodeChainAssembly> assembly);
+
+  // Recompute the ExtTSP score of a chain
+  double computeExtTSPScore(NodeChain *chain) const;
+
+  // Update the related NodeChainAssembly records for two chains, with the
+  // assumption that unsplitChain has been merged into splitChain.
+  bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain);
+
+  void mergeAllChains();
+
+  void init();
+
+  // Initialize the mutuallyForcedOut map
+  void initMutuallyForcedEdges(ControlFlowGraph &cfg);
+
+  // Initialize basic block chains, with one chain for every node
+  void initNodeChains(ControlFlowGraph &cfg);
+
+public:
+  // This invokes the Extended TSP algorithm, orders the hot and cold basic
+  // blocks and inserts their associated symbols at the corresponding locations
+  // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the
+  // given SymbolList.
+  void doOrder(std::unique_ptr<ChainClustering> &CC);
+
+  NodeChainBuilder(std::vector<ControlFlowGraph *> &cfgs) : CFGs(cfgs) {}
+
+  NodeChainBuilder(ControlFlowGraph *cfg) : CFGs(1, cfg) {}
+};
+
+} // namespace propeller
+} // namespace lld
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp
@@ -0,0 +1,809 @@
+//===- NodeChainBuilder.cpp  ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and includes the implementation of intra-function basic block
+// reordering algorithm based on the Extended TSP metric as described in [1].
+//
+// The Extend TSP metric (ExtTSP) provides a score for every ordering of basic
+// blocks in a function, by combining the gains from fall-throughs and short
+// jumps.
+//
+// Given an ordering of the basic blocks, for a function f, the ExtTSP score is
+// computed as follows.
+//
+// sum_{edges e in f} frequency(e) * weight(e)
+//
+// where frequency(e) is the execution frequency and weight(e) is computed as
+// follows:
+//  * 1 if distance(Src[e], Sink[e]) = 0 (i.e. fallthrough)
+//
+//  * 0.1 * (1 - distance(Src[e], Sink[e]) / 1024) if Src[e] < Sink[e] and 0 <
+//    distance(Src[e], Sink[e]) < 1024 (i.e. short forward jump)
+//
+//  * 0.1 * (1 - distance(Src[e], Sink[e]) / 640) if Src[e] > Sink[e] and 0 <
+//    distance(Src[e], Sink[e]) < 640 (i.e. short backward jump)
+//
+//  * 0 otherwise
+//
+// In short, it computes a weighted sum of frequencies of all edges in the
+// control flow graph. Each edge gets its weight depending on whether the given
+// ordering makes the edge a fallthrough, a short forward jump, or a short
+// backward jump.
+//
+// Although this problem is NP-hard like the regular TSP, an iterative greedy
+// basic-block-chaining algorithm is used to find a close to optimal solution.
+// This algorithm is described as follows.
+//
+// Starting with one basic block sequence (BB chain) for every basic block, the
+// algorithm iteratively joins BB chains together in order to maximize the
+// extended TSP score of all the chains.
+//
+// Initially, it finds all mutually-forced edges in the profiled CFG. These are
+// all the edges which are -- based on the profile -- the only (executed)
+// outgoing edge from their source node and the only (executed) incoming edges
+// to their sink nodes. Next, the source and sink of all mutually-forced edges
+// are attached together as fallthrough edges.
+//
+// Then, at every iteration, the algorithm tries to merge a pair of BB chains
+// which leads to the highest gain in the ExtTSP score. The algorithm extends
+// the search space by considering splitting short (less than 128 bytes in
+// binary size) BB chains into two chains and then merging these two chains with
+// the other chain in four ways. After every merge, the new merge gains are
+// updated. The algorithm repeats joining BB chains until no additional can be
+// achieved. Eventually, it sorts all the existing chains in decreasing order
+// of their execution density, i.e., the total profiled frequency of the chain
+// divided by its binary size.
+//
+// The values used by this algorithm are reconfiguriable using lld's propeller
+// flags. Specifically, these parameters are:
+//
+//   * propeller-forward-jump-distance: maximum distance of a forward jump
+//     default-set to 1024 in the above equation).
+//
+//   * propeller-backward-jump-distance: maximum distance of a backward jump
+//     (default-set to 640 in the above equation).
+//
+//   * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1)
+//
+//   * propeller-forward-jump-weight: weight of a forward jump (default-set to
+//     0.1)
+//
+//   * propeller-backward-jump-weight: weight of a backward jump (default-set to
+//     0.1)
+//
+//   * propeller-chain-split-threshold: maximum binary size of a BB chain which
+//     the algorithm will consider for splitting (default-set to 128).
+//
+// References:
+//   * [1] A. Newell and S. Pupyrev, Improved Basic Block Reordering, available
+//         at https://arxiv.org/abs/1809.04676
+//===----------------------------------------------------------------------===//
+
+#include "NodeChainBuilder.h"
+
+using llvm::detail::DenseMapPair;
+
+namespace lld {
+namespace propeller {
+
+// Initializes the node chains for each cfg and finds all the mutually-forced
+// edges.
+void NodeChainBuilder::init() {
+  for (ControlFlowGraph *cfg : CFGs) {
+    initNodeChains(*cfg);
+    initMutuallyForcedEdges(*cfg);
+  }
+}
+
+// NodeChainBuilder calls this function after building all the chains to attach
+// as many fall-throughs as possible. Given that the algorithm already optimizes
+// the extend TSP score, this function will only affect the cold basic blocks
+// and thus we do not need to consider the edge weights.
+void NodeChainBuilder::attachFallThroughs() {
+  for (ControlFlowGraph *Cfg : CFGs) {
+    // First, try to keep the fall-throughs from the original order.
+    for (auto &Node : Cfg->Nodes) {
+      if (Node->FTEdge != nullptr) {
+        attachNodes(Node.get(), Node->FTEdge->Sink);
+      }
+    }
+
+    // Sometimes, the original fall-throughs cannot be kept. So we try to find
+    // new fall-through opportunities which did not exist in the original order.
+    for (auto &Edge : Cfg->IntraEdges) {
+      if (Edge->Type == CFGEdge::EdgeType::INTRA_FUNC ||
+          Edge->Type == CFGEdge::EdgeType::INTRA_DYNA)
+        attachNodes(Edge->Src, Edge->Sink);
+    }
+  }
+}
+
+// This function sorts BB chains in decreasing order of their execution density.
+// NodeChainBuilder calls this function at the end to ensure that hot BB chains
+// are placed at the beginning of the function.
+void NodeChainBuilder::coalesceChains() {
+  std::vector<NodeChain *> chainOrder;
+  for (DenseMapPair<uint64_t, std::unique_ptr<NodeChain>> &elem : Chains)
+    chainOrder.push_back(elem.second.get());
+
+  std::sort(
+      chainOrder.begin(), chainOrder.end(), [](NodeChain *c1, NodeChain *c2) {
+        if (!c1->CFG || !c2->CFG || c1->CFG != c2->CFG)
+          error("Attempting to coalesce chains belonging to different "
+                "functions.");
+        if (c1->Freq == 0 ^ c2->Freq == 0)
+          return c1->Freq != 0;
+        auto *entryNode = c1->CFG->getEntryNode();
+        if (entryNode->Chain == c1)
+          return true;
+        if (entryNode->Chain == c2)
+          return false;
+        double c1ExecDensity = c1->execDensity();
+        double c2ExecDensity = c2->execDensity();
+        if (c1ExecDensity == c2ExecDensity)
+          return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr;
+        return c1ExecDensity > c2ExecDensity;
+      });
+
+  NodeChain *mergerChain = nullptr;
+
+  for (NodeChain *c : chainOrder) {
+    if (!mergerChain) {
+      mergerChain = c;
+      continue;
+    }
+    // Create a cold partition when -propeller-split-funcs is set.
+    if (propellerConfig.optSplitFuncs &&
+        (mergerChain->Freq == 0 ^ c->Freq == 0)) {
+      mergerChain = c;
+      continue;
+    }
+    mergeChains(mergerChain, c);
+  }
+}
+
+// Merge two chains in the specified order.
+void NodeChainBuilder::mergeChains(NodeChain *leftChain,
+                                   NodeChain *rightChain) {
+  if (leftChain->Freq == 0 ^ rightChain->Freq == 0)
+    error("Attempting to merge hot and cold chains: \n" + toString(*leftChain) +
+          "\nAND\n" + toString(*rightChain));
+
+  mergeInOutEdges(leftChain, rightChain);
+
+  for (CFGNode *node : rightChain->Nodes) {
+    leftChain->Nodes.push_back(node);
+    node->Chain = leftChain;
+    node->ChainOffset += leftChain->Size;
+  }
+  leftChain->Size += rightChain->Size;
+  leftChain->Freq += rightChain->Freq;
+  leftChain->DebugChain |= rightChain->DebugChain;
+  if (leftChain->CFG != rightChain->CFG)
+    leftChain->CFG = nullptr;
+
+  Chains.erase(rightChain->DelegateNode->MappedAddr);
+}
+
+// This function tries to place two basic blocks immediately adjacent to each
+// other (used for fallthroughs). Returns true if the basic blocks have been
+// attached this way.
+bool NodeChainBuilder::attachNodes(CFGNode *src, CFGNode *sink) {
+  if (sink->isEntryNode())
+    return false;
+
+  // Ignore edges between hot and cold basic blocks.
+  if (src->Freq == 0 ^ sink->Freq == 0)
+    return false;
+  NodeChain *srcChain = src->Chain;
+  NodeChain *sinkChain = sink->Chain;
+  // Skip this edge if the source and sink are in the same chain
+  if (srcChain == sinkChain)
+    return false;
+
+  // It's possible to form a fall-through between src and sink only if
+  // they are respectively located at the end and beginning of their chains.
+  if (srcChain->Nodes.back() != src || sinkChain->Nodes.front() != sink)
+    return false;
+  // Attaching is possible. So we merge the chains in the corresponding order.
+  mergeChains(srcChain, sinkChain);
+  return true;
+}
+
+void NodeChainBuilder::mergeInOutEdges(NodeChain *mergerChain,
+                                       NodeChain *mergeeChain) {
+  for (auto &elem : mergeeChain->OutEdges) {
+    NodeChain *c = (elem.first == mergeeChain) ? mergerChain : elem.first;
+    auto res = mergerChain->OutEdges.emplace(c, elem.second);
+    if (!res.second)
+      res.first->second.insert(res.first->second.end(), elem.second.begin(),
+                               elem.second.end());
+    else
+      c->InEdges.insert(mergerChain);
+
+    c->InEdges.erase(mergeeChain);
+  }
+
+  for (auto *c : mergeeChain->InEdges) {
+    if (c == mergeeChain)
+      continue;
+    auto &mergeeChainEdges = c->OutEdges[mergeeChain];
+    auto &mergerChainEdges = c->OutEdges[mergerChain];
+
+    mergerChainEdges.insert(mergerChainEdges.end(), mergeeChainEdges.begin(),
+                            mergeeChainEdges.end());
+    mergerChain->InEdges.insert(c);
+    c->OutEdges.erase(mergeeChain);
+  }
+}
+
+// Merge two BB sequences according to the given NodeChainAssembly. A
+// NodeChainAssembly is an ordered triple of three slices from two chains.
+void NodeChainBuilder::mergeChains(
+    std::unique_ptr<NodeChainAssembly> assembly) {
+  if (assembly->splitChain()->Freq == 0 ^ assembly->unsplitChain()->Freq == 0)
+    error("Attempting to merge hot and cold chains: \n" +
+          toString(*assembly.get()));
+
+  if (assembly->splitChain()->Freq == 0 ^ assembly->unsplitChain()->Freq == 0)
+    error("Attempting to merge hot and cold chains: \n" +
+          toString(*assembly.get()));
+
+  // Decide which chain gets merged into the other chain, to make the reordering
+  // more efficient.
+  NodeChain *mergerChain = (assembly->MOrder == YX2X1)
+                               ? assembly->unsplitChain()
+                               : assembly->splitChain();
+  NodeChain *mergeeChain = (assembly->MOrder == YX2X1)
+                               ? assembly->splitChain()
+                               : assembly->unsplitChain();
+
+  // Merge in and out edges of the two chains
+  mergeInOutEdges(mergerChain, mergeeChain);
+
+  // Create the new node order according the given assembly
+  auto X1Begin = assembly->splitChain()->Nodes.begin();
+  auto X2Begin = assembly->SlicePosition;
+  // Does X2 mark a function transition ?
+  bool X2FuncTransition = X2Begin != assembly->splitChain()->Nodes.begin() &&
+                          (*std::prev(X2Begin))->CFG != (*X2Begin)->CFG;
+  auto YBegin = assembly->unsplitChain()->Nodes.begin();
+
+  // If splitChain is truly splitted, reorder the splitted parts from X1X2 to
+  // X2X1 if needed. We can do this using splice because we are splicing the
+  // same list.
+  if (assembly->split() &&
+      (assembly->MOrder == X2X1Y || assembly->MOrder == X2YX1 ||
+       assembly->MOrder == YX2X1))
+    assembly->splitChain()->Nodes.splice(X1Begin, assembly->splitChain()->Nodes,
+                                         X2Begin,
+                                         assembly->splitChain()->Nodes.end());
+
+  switch (assembly->MOrder) {
+  case X2X1Y:
+    // Splice Y at the end of X.
+    assembly->splitChain()->Nodes.splice(assembly->splitChain()->Nodes.end(),
+                                         assembly->unsplitChain()->Nodes);
+    break;
+  case X1YX2:
+    // Splice Y in the middle
+    assembly->splitChain()->Nodes.splice(X2Begin,
+                                         assembly->unsplitChain()->Nodes);
+    break;
+  case X2YX1:
+    // Splice Y in the middle
+    assembly->splitChain()->Nodes.splice(X1Begin,
+                                         assembly->unsplitChain()->Nodes);
+    break;
+  case YX2X1:
+    // Splice X at the end of Y
+    assembly->unsplitChain()->Nodes.splice(
+        assembly->unsplitChain()->Nodes.end(), assembly->splitChain()->Nodes);
+    break;
+  default:
+    break;
+  }
+
+  if (propellerConfig.optReorderIP) {
+    // Merge function transitions of mergerChain with those of mergeeChain.
+    mergerChain->FunctionTransitions.splice(
+        mergerChain->FunctionTransitions.end(),
+        mergeeChain->FunctionTransitions);
+
+    // Add the beginnings of slices if they now mark a function transition.
+    std::vector<std::list<CFGNode *>::iterator> allSlicesBegin;
+    // YBegin should always be examined.
+    allSlicesBegin.push_back(YBegin);
+    // If X2Begin was a function transition point before, we don't examine it
+    // again as it will remain in the transition points.
+    if (!X2FuncTransition)
+      allSlicesBegin.push_back(X2Begin);
+    // X1Begin will only be examined if this is a splitting assembly. Otherwise,
+    // X1 is empty.
+    if (assembly->split())
+      allSlicesBegin.push_back(X1Begin);
+
+    // Check if any of the slices mark a function transition position and add
+    // those to the function transition list.
+    for (auto it : allSlicesBegin)
+      if (it != mergerChain->Nodes.begin() &&
+          (*std::prev(it))->CFG != (*it)->CFG)
+        mergerChain->FunctionTransitions.push_back(it);
+  }
+
+  // Set the starting and ending point for updating the nodes's chain and offset
+  // in the new chain.
+  auto chainBegin = mergerChain->Nodes.begin();
+  uint64_t chainBeginOffset = 0;
+
+  if (assembly->MOrder == X1YX2) {
+    chainBegin = YBegin;
+    chainBeginOffset = assembly->Slices[0].size();
+  }
+
+  if (assembly->MOrder == YX2X1) {
+    chainBegin = X2Begin;
+    chainBeginOffset = assembly->Slices[0].size();
+  }
+
+  if (!assembly->split()) {
+    chainBegin = YBegin;
+    chainBeginOffset = assembly->splitChain()->Size;
+  }
+
+  uint64_t runningOffset = chainBeginOffset;
+
+  // Update nodeOffsetMap and nodeToChainMap for all the nodes in the sequence.
+  for (auto it = chainBegin; it != mergerChain->Nodes.end(); ++it) {
+    CFGNode *node = *it;
+    node->Chain = mergerChain;
+    node->ChainOffset = runningOffset;
+    runningOffset += node->ShSize;
+  }
+
+  mergerChain->Size = runningOffset;
+
+  // Update the total frequency and ExtTSP score of the aggregated chain
+  mergerChain->Freq += mergerChain->Freq;
+
+  // We have already computed the new score in the assembly record. So we can
+  // update the score based on that and the other chain's score.
+  mergerChain->Score += mergeeChain->Score + assembly->ScoreGain;
+
+  mergerChain->DebugChain |= mergeeChain->DebugChain;
+
+  if (mergerChain->CFG != mergeeChain->CFG)
+    mergerChain->CFG = nullptr;
+
+  // Merge the assembly candidate chains of the two chains into the candidate
+  // chains of the remaining NodeChain and remove the records for the defunct
+  // NodeChain.
+  for (NodeChain *c : CandidateChains[mergeeChain]) {
+    NodeChainAssemblies.erase(std::make_pair(c, mergeeChain));
+    NodeChainAssemblies.erase(std::make_pair(mergeeChain, c));
+    CandidateChains[c].erase(mergeeChain);
+    if (c != mergerChain)
+      CandidateChains[mergerChain].insert(c);
+  }
+
+  // Update the NodeChainAssembly for all candidate chains of the merged
+  // NodeChain. Remove a NodeChain from the merge chain's candidates if the
+  // NodeChainAssembly update finds no gain.
+  auto &mergerChainCandidateChains = CandidateChains[mergerChain];
+
+  for (auto CI = mergerChainCandidateChains.begin(),
+            CE = mergerChainCandidateChains.end();
+       CI != CE;) {
+    NodeChain *otherChain = *CI;
+    auto &otherChainCandidateChains = CandidateChains[otherChain];
+
+    bool x = updateNodeChainAssembly(otherChain, mergerChain);
+
+    if (!x)
+      NodeChainAssemblies.erase(std::make_pair(otherChain, mergerChain));
+
+    bool y = updateNodeChainAssembly(mergerChain, otherChain);
+
+    if (!y)
+      NodeChainAssemblies.erase(std::make_pair(mergerChain, otherChain));
+
+    if (x || y) {
+      otherChainCandidateChains.insert(mergerChain);
+      CI++;
+    } else {
+      otherChainCandidateChains.erase(mergerChain);
+      CI = mergerChainCandidateChains.erase(CI);
+    }
+  }
+
+  // remove all the candidate chain records for the merged-in chain
+  CandidateChains.erase(mergeeChain);
+
+  // Finally, remove the defunct (merged-in) chain record from the chains
+  Chains.erase(mergeeChain->DelegateNode->MappedAddr);
+}
+
+// Calculate the Extended TSP metric for a BB chain.
+// This function goes over all the BBs in the chain and for BB chain and
+// aggregates the score of all edges which are contained in the same chain.
+double NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const {
+  double score = 0;
+
+  auto visit = [&score](CFGEdge &edge, NodeChain *srcChain,
+                        NodeChain *sinkChain) {
+    assert(srcChain == sinkChain);
+    auto srcOffset = edge.Src->ChainOffset;
+    auto sinkOffset = edge.Sink->ChainOffset;
+    bool edgeForward = srcOffset + edge.Src->ShSize <= sinkOffset;
+    // Calculate the distance between src and sink
+    uint64_t distance = edgeForward ? sinkOffset - srcOffset - edge.Src->ShSize
+                                    : srcOffset - sinkOffset + edge.Src->ShSize;
+    score += getEdgeExtTSPScore(edge, edgeForward, distance);
+  };
+
+  chain->forEachOutEdgeToChain(chain, visit);
+
+  return score;
+}
+
+// Updates the best NodeChainAssembly between two NodeChains. The existing
+// record will be replaced by the new NodeChainAssembly if a non-zero gain
+// is achieved. Otherwise, it will be removed.
+// If a nodechain assembly record is (kept) inserted, returns true. Otherwise
+// returns false.
+bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain,
+                                               NodeChain *unsplitChain) {
+  // Only consider splitting the chain if the size of the chain is smaller than
+  // a threshold.
+  bool doSplit = (splitChain->Size <= propellerConfig.optChainSplitThreshold);
+  auto slicePosEnd =
+      doSplit ? splitChain->Nodes.end() : std::next(splitChain->Nodes.begin());
+
+  std::unique_ptr<NodeChainAssembly> bestAssembly(nullptr);
+
+  for (auto slicePos = splitChain->Nodes.begin(); slicePos != slicePosEnd;
+       ++slicePos) {
+    // Do not split the mutually-forced edges in the chain.
+    if (slicePos != splitChain->Nodes.begin() &&
+        MutuallyForcedOut.count(*std::prev(slicePos)))
+      continue;
+
+    // If the split position is at the beginning (no splitting), only consider
+    // one MergeOrder
+    auto mergeOrderEnd = (slicePos == splitChain->Nodes.begin())
+                             ? MergeOrder::BeginNext
+                             : MergeOrder::End;
+
+    for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) {
+      MergeOrder mOrder = static_cast<MergeOrder>(MI);
+
+      // Create the NodeChainAssembly representing this particular assembly.
+      auto NCA = std::unique_ptr<NodeChainAssembly>(
+          new NodeChainAssembly(splitChain, unsplitChain, slicePos, mOrder));
+
+      if (!NCA->isValid())
+        continue;
+
+      if (!bestAssembly || NodeChainAssemblyComparator(bestAssembly, NCA))
+        bestAssembly = std::move(NCA);
+    }
+  }
+
+  if (propellerConfig.optReorderIP && !doSplit) {
+    // For inter-procedural layout, we always try splitting at the function
+    // entries, regardless of the split-chain's size.
+    for (auto slicePos : splitChain->FunctionTransitions) {
+      for (uint8_t MI = MergeOrder::Begin; MI != MergeOrder::End; MI++) {
+        MergeOrder mOrder = static_cast<MergeOrder>(MI);
+
+        // Create the NodeChainAssembly representing this particular assembly.
+        auto NCA = std::unique_ptr<NodeChainAssembly>(
+            new NodeChainAssembly(splitChain, unsplitChain, slicePos, mOrder));
+
+        if (!NCA->isValid())
+          continue;
+
+        // Update bestAssembly if needed
+        if (!bestAssembly || NodeChainAssemblyComparator(bestAssembly, NCA))
+          bestAssembly = std::move(NCA);
+      }
+    }
+  }
+
+  // Insert the best assembly of the two chains (only if it has positive gain).
+  if (bestAssembly) {
+    if (splitChain->DebugChain || unsplitChain->DebugChain)
+      fprintf(stderr, "INSERTING ASSEMBLY: %s\n",
+              toString(*bestAssembly).c_str());
+
+    NodeChainAssemblies.insert(bestAssembly->ChainPair,
+                               std::move(bestAssembly));
+    return true;
+  } else
+    return false;
+}
+
+void NodeChainBuilder::initNodeChains(ControlFlowGraph &cfg) {
+  for (auto &node : cfg.Nodes) {
+    NodeChain *chain = new NodeChain(node.get());
+    node->Chain = chain;
+    node->ChainOffset = 0;
+    Chains.try_emplace(node->MappedAddr, chain);
+  }
+}
+
+// Find all the mutually-forced edges.
+// These are all the edges which are -- based on the profile -- the only
+// (executed) outgoing edge from their source node and the only (executed)
+// incoming edges to their sink nodes
+void NodeChainBuilder::initMutuallyForcedEdges(ControlFlowGraph &cfg) {
+  DenseMap<CFGNode *, CFGNode *> mutuallyForcedOut;
+  DenseSet<CFGNode *> mutuallyForcedIn;
+
+  auto l = prop->BBLayouts.find(cfg.Name);
+  if (l != prop->BBLayouts.end()) {
+    CFGNode *lastNode = nullptr;
+    for (auto ordinal : l->second) {
+      auto r = Chains.find(ordinal);
+      if (r == Chains.end()) {
+        lastNode = nullptr;
+        continue;
+      }
+      CFGNode *thisNode = r->second->DelegateNode;
+      if (lastNode) {
+        mutuallyForcedOut.try_emplace(lastNode, thisNode);
+        mutuallyForcedIn.insert(thisNode);
+      }
+      lastNode = thisNode;
+    }
+  }
+
+  DenseMap<CFGNode *, std::vector<CFGEdge *>> profiledOuts;
+  DenseMap<CFGNode *, std::vector<CFGEdge *>> profiledIns;
+
+  for (auto &node : cfg.Nodes) {
+    if (!mutuallyForcedOut.count(node.get())) {
+      std::copy_if(node->Outs.begin(), node->Outs.end(),
+                   std::back_inserter(profiledOuts[node.get()]),
+                   [&mutuallyForcedIn](CFGEdge *edge) {
+                     return !mutuallyForcedIn.count(edge->Sink) &&
+                            (edge->Type == CFGEdge::EdgeType::INTRA_FUNC ||
+                             edge->Type == CFGEdge::EdgeType::INTRA_DYNA) &&
+                            edge->Weight != 0;
+                   });
+    }
+    if (!mutuallyForcedIn.count(node.get())) {
+      std::copy_if(node->Ins.begin(), node->Ins.end(),
+                   std::back_inserter(profiledIns[node.get()]),
+                   [&mutuallyForcedOut](CFGEdge *edge) {
+                     return !mutuallyForcedOut.count(edge->Src) &&
+                            (edge->Type == CFGEdge::EdgeType::INTRA_FUNC ||
+                             edge->Type == CFGEdge::EdgeType::INTRA_DYNA) &&
+                            edge->Weight != 0;
+                   });
+    }
+  }
+
+  for (auto &node : cfg.Nodes) {
+    if (profiledOuts[node.get()].size() == 1) {
+      CFGEdge *edge = profiledOuts[node.get()].front();
+      if (edge->Type != CFGEdge::EdgeType::INTRA_FUNC &&
+          edge->Type != CFGEdge::EdgeType::INTRA_DYNA)
+        continue;
+      if (profiledIns[edge->Sink].size() == 1)
+        mutuallyForcedOut.try_emplace(node.get(), edge->Sink);
+    }
+  }
+
+  // Break cycles in the mutually forced edges by cutting the edge sinking to
+  // the smallest address in every cycle (hopefully a loop backedge)
+  DenseMap<CFGNode *, unsigned> nodeToPathMap;
+  SmallVector<CFGNode *, 16> cycleCutNodes;
+  unsigned pathCount = 0;
+  for (auto it = mutuallyForcedOut.begin(); it != mutuallyForcedOut.end();
+       ++it) {
+    // Check to see if the node (and its cycle) have already been visited.
+    if (nodeToPathMap[it->first])
+      continue;
+    CFGEdge *victimEdge = nullptr;
+    auto nodeIt = it;
+    pathCount++;
+    while (nodeIt != mutuallyForcedOut.end()) {
+      CFGNode *node = nodeIt->first;
+      unsigned path = nodeToPathMap[node];
+      if (path != 0) {
+        // If this node is marked with a number, either it is the same number,
+        // in which case we have found a cycle. Or it is a different number,
+        // which means we have found a path to a previously visited path
+        // (non-cycle).
+        if (path == pathCount) {
+          // We have found a cycle: add the victim edge
+          cycleCutNodes.push_back(victimEdge->Src);
+        }
+        break;
+      } else
+        nodeToPathMap[node] = pathCount;
+      if (!profiledOuts[node].empty()) {
+        CFGEdge *edge = profiledOuts[node].front();
+        if (!victimEdge ||
+            (edge->Sink->MappedAddr < victimEdge->Sink->MappedAddr)) {
+          victimEdge = edge;
+        }
+      }
+      nodeIt = mutuallyForcedOut.find(nodeIt->second);
+    }
+  }
+
+  // Remove the victim edges to break cycles in the mutually forced edges
+  for (CFGNode *node : cycleCutNodes)
+    mutuallyForcedOut.erase(node);
+
+  for (auto &elem : mutuallyForcedOut)
+    MutuallyForcedOut.insert(elem);
+}
+
+// This function initializes the ExtTSP algorithm's data structures. This
+// the NodeChainAssemblies and the CandidateChains maps.
+void NodeChainBuilder::initializeExtTSP() {
+  // For each chain, compute its ExtTSP score, add its chain assembly records
+  // and its merge candidate chain.
+  CandidateChains.clear();
+  for (NodeChain *chain : Components[CurrentComponent])
+    chain->Score = chain->Freq ? computeExtTSPScore(chain) : 0;
+
+  DenseSet<std::pair<NodeChain *, NodeChain *>> visited;
+
+  for (NodeChain *chain : Components[CurrentComponent]) {
+    for (auto &chainEdge : chain->OutEdges) {
+      NodeChain *otherChain = chainEdge.first;
+      if (chain == otherChain)
+        continue;
+      if (visited.count(std::make_pair(chain, otherChain)))
+        continue;
+
+      bool x = updateNodeChainAssembly(chain, otherChain);
+      bool y = updateNodeChainAssembly(otherChain, chain);
+
+      if (x || y) {
+        CandidateChains[chain].insert(otherChain);
+        CandidateChains[otherChain].insert(chain);
+      }
+      visited.insert(std::make_pair(chain, otherChain));
+      visited.insert(std::make_pair(otherChain, chain));
+    }
+  }
+}
+
+void NodeChainBuilder::initializeChainComponents() {
+
+  DenseMap<NodeChain *, unsigned> chainToComponentMap;
+  unsigned componentId = 0;
+  for (DenseMapPair<uint64_t, std::unique_ptr<NodeChain>> &elem : Chains) {
+    NodeChain *chain = elem.second.get();
+    // Cold chains will not be placed in any component.
+    if (!chain->Freq)
+      continue;
+    // Skip if this chain has already been assigned to a component.
+    if (chainToComponentMap.count(chain))
+      continue;
+    // Start creating the component containing this chain and its connected
+    // chains.
+    chainToComponentMap[chain] = componentId;
+    std::vector<NodeChain *> toVisit(1, chain);
+    unsigned index = 0;
+
+    // Find the connected component using BFS.
+    while (index != toVisit.size()) {
+      auto *tchain = toVisit[index++];
+      for (auto *c : tchain->InEdges) {
+        if (!chainToComponentMap.count(c)) {
+          chainToComponentMap[c] = componentId;
+          toVisit.push_back(c);
+        }
+      }
+      for (auto &e : tchain->OutEdges) {
+        if (!chainToComponentMap.count(e.first)) {
+          chainToComponentMap[e.first] = componentId;
+          toVisit.push_back(e.first);
+        }
+      }
+    }
+    Components.push_back(std::move(toVisit));
+    componentId++;
+  }
+}
+
+void NodeChainBuilder::mergeAllChains() {
+  // Attach the mutually-foced edges (which will not be split anymore by the
+  // Extended TSP algorithm).
+  for (auto &elem : MutuallyForcedOut)
+    attachNodes(elem.first, elem.second);
+
+  // Set up the outgoing edges for every chain
+  for (auto &elem : Chains) {
+    auto *chain = elem.second.get();
+    // Ignore cold chains as they cannot have any hot edges to other nodes
+    if (!chain->Freq)
+      continue;
+    auto addEdge = [chain](CFGEdge &edge) {
+      // Ignore returns and zero-frequency edges as these edges will not be used
+      // by the ExtTSP score algorithm.
+      if (!edge.Weight || edge.isReturn())
+        return;
+      auto *sinkNodeChain = edge.Sink->Chain;
+      chain->OutEdges[sinkNodeChain].push_back(&edge);
+      sinkNodeChain->InEdges.insert(chain);
+    };
+
+    if (propellerConfig.optReorderIP) {
+      for (CFGNode *node : chain->Nodes)
+        node->forEachOutEdgeRef(addEdge);
+    } else {
+      for (CFGNode *node : chain->Nodes)
+        node->forEachIntraOutEdgeRef(addEdge);
+    }
+  }
+
+  initializeChainComponents();
+
+  for (CurrentComponent = 0; CurrentComponent < Components.size();
+       ++CurrentComponent) {
+    fprintf(stderr, "COMPONENT: %u -> SIZE: %zu\n", CurrentComponent,
+            Components[CurrentComponent].size());
+    // Initialize the Extended TSP algorithm's data.
+    initializeExtTSP();
+
+    // Keep merging the chain assembly record with the highest ExtTSP gain,
+    // until no more gain is possible.
+    while (!NodeChainAssemblies.empty()) {
+      auto bestAssembly = NodeChainAssemblies.top();
+      NodeChainAssemblies.pop();
+      if (bestAssembly->splitChain()->DebugChain ||
+          bestAssembly->unsplitChain()->DebugChain)
+        fprintf(stderr, "MERGING for %s\n",
+                toString(*bestAssembly.get()).c_str());
+      mergeChains(std::move(bestAssembly));
+    }
+  }
+}
+
+void NodeChainBuilder::doOrder(std::unique_ptr<ChainClustering> &CC) {
+  init();
+
+  mergeAllChains();
+
+  // Merge fallthrough basic blocks if we have missed any
+  attachFallThroughs();
+
+  if (!propellerConfig.optReorderIP) {
+    // If this is not inter-procedural, coalesce all hot chains into a single
+    // hot chain and all cold chains into a single cold chain.
+    coalesceChains();
+    assert(CFGs.size() == 1 && Chains.size() <= 2);
+
+#ifdef PROPELLER_PROTOBUF
+    ControlFlowGraph *cfg = CFGs.back();
+    if (prop->protobufPrinter) {
+      std::list<CFGNode *> nodeOrder;
+      for (auto &elem : Chains) {
+        auto *chain = elem.second.get();
+        nodeOrder.insert(chain->Freq ? nodeOrder.begin() : nodeOrder.end(),
+                         chain->Nodes.begin(), chain->Nodes.end());
+      }
+      prop->protobufPrinter->addCFG(*cfg, &nodeOrder);
+    }
+#endif
+  }
+
+  // Hand in the built chains to the chain clustering algorithm
+  for (auto &elem : Chains)
+    CC->addChain(std::move(elem.second));
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.h
@@ -0,0 +1,143 @@
+//===- PropellerChainClustering.h  ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H
+#define LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H
+
+#include "NodeChain.h"
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <vector>
+
+using llvm::DenseMap;
+
+namespace lld {
+namespace propeller {
+
+// This is the base class for clustering chains. It provides the basic data
+// structures and functions.
+class ChainClustering {
+public:
+  class Cluster {
+  public:
+    // Constructor for building a cluster from a single chain.
+    Cluster(NodeChain *);
+
+    // The chains in this cluster in the merged order.
+    std::vector<NodeChain *> Chains;
+
+    // The representative chain for this cluster.
+    NodeChain *DelegateChain;
+
+    // Total size of the cluster.
+    uint64_t Size;
+
+    // Total frequency of the cluster.
+    uint64_t Freq;
+
+    // This function merges this cluster with another cluster
+    Cluster &mergeWith(Cluster &other) {
+      Chains.insert(Chains.end(), other.Chains.begin(), other.Chains.end());
+      this->Freq += other.Freq;
+      this->Size += other.Size;
+      return *this;
+    }
+
+    // Returns the execution density of this cluster
+    double getDensity() { return ((double)Freq / Size); }
+  };
+
+  // This function merges two clusters in the given order, and updates
+  // ChainToClusterMap for the chains in the mergee cluster and also removes the
+  // mergee cluster from the Clusters.
+  void mergeTwoClusters(Cluster *predecessorCluster, Cluster *cluster) {
+    // Join the two clusters into predecessorCluster.
+    predecessorCluster->mergeWith(*cluster);
+
+    // Update chain to cluster mapping, because all chains that were
+    // previsously in cluster are now in predecessorCluster.
+    for (NodeChain *c : cluster->Chains) {
+      ChainToClusterMap[c] = predecessorCluster;
+    }
+
+    // Delete the defunct cluster
+    Clusters.erase(cluster->DelegateChain->DelegateNode->MappedAddr);
+  }
+
+  // Adds a chain to the input of the clustering process
+  void addChain(std::unique_ptr<NodeChain> &&chain_ptr);
+
+  // Any subclass should override this function.
+  virtual void doOrder(std::vector<CFGNode *> &hotOrder,
+                       std::vector<CFGNode *> &coldOrder);
+
+  virtual ~ChainClustering() = default;
+
+protected:
+  // Optional function for merging clusters, which could be overriden by
+  // subclasses.
+  virtual void mergeClusters(){};
+
+  // This function sorts the final clusters in decreasing order of their
+  // execution density.
+  void sortClusters(std::vector<Cluster *> &);
+
+  // This function initializes clusters with each cluster including a single
+  // chain.
+  void initClusters() {
+    for (auto &c_ptr : HotChains) {
+      NodeChain *chain = c_ptr.get();
+      Cluster *cl = new Cluster(chain);
+      cl->Freq = chain->Freq;
+      cl->Size = std::max(chain->Size, (uint64_t)1);
+      ChainToClusterMap[chain] = cl;
+      Clusters.try_emplace(cl->DelegateChain->DelegateNode->MappedAddr, cl);
+    }
+  }
+
+  // Chains processed by the clustering algorithm separated into hot and cold
+  // ones based on their frequency.
+  std::vector<std::unique_ptr<NodeChain>> HotChains, ColdChains;
+
+  // All clusters currently in process.
+  DenseMap<uint64_t, std::unique_ptr<Cluster>> Clusters;
+
+  // This maps every chain to its containing cluster.
+  DenseMap<NodeChain *, Cluster *> ChainToClusterMap;
+};
+
+// This class computes an ordering for the input chains which conforms to the
+// initial ordering of nodes in the binary.
+class NoOrdering : public ChainClustering {
+public:
+  void doOrder(std::vector<CFGNode *> &hotOrder,
+               std::vector<CFGNode *> &coldOrder);
+};
+
+// This class implements the call-chain-clustering algorithm.
+class CallChainClustering : public ChainClustering {
+private:
+  Cluster *getMostLikelyPredecessor(NodeChain *chain, Cluster *cluster);
+  void mergeClusters();
+};
+
+} // namespace propeller
+} // namespace lld
+
+namespace std {
+template <> struct less<lld::propeller::ChainClustering::Cluster *> {
+  bool operator()(const lld::propeller::ChainClustering::Cluster *c1,
+                  const lld::propeller::ChainClustering::Cluster *c2) const {
+    return less<lld::propeller::NodeChain *>()(c1->DelegateChain,
+                                               c2->DelegateChain);
+  }
+};
+
+} // namespace std
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp
@@ -0,0 +1,230 @@
+//===- PropellerChainClustering.cpp  --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and includes the implementation of the Call-Chain-Clustering
+// algorithm as described in [1].
+//
+// The algorithm iteratively merges chains into clusters. To do so, it processes
+// the basic block chains in decreasing order of
+// their execution density and for each chain, merges its cluster with its
+// most-frequent predecessor cluster. The merging of a cluster stops when it
+// reaches the -propeller-cluster-merge-size-threshold (set to 2MB by default).
+//
+//   [1] Guilherme Ottoni and Bertrand Maher. Optimizing Function Placement for
+//   Large-Scale Data-Center Applications. Available at
+//   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+//===----------------------------------------------------------------------===//
+#include "NodeChainClustering.h"
+#include "PropellerConfig.h"
+
+using llvm::detail::DenseMapPair;
+
+namespace lld {
+namespace propeller {
+
+void ChainClustering::addChain(std::unique_ptr<NodeChain> &&chain_ptr) {
+  for (CFGNode *n : chain_ptr->Nodes)
+    n->Chain = chain_ptr.get();
+  auto &chainList =
+      ((propellerConfig.optReorderIP || propellerConfig.optSplitFuncs ||
+        propellerConfig.optReorderFuncs) &&
+       chain_ptr->Freq == 0)
+          ? ColdChains
+          : HotChains;
+  chainList.push_back(std::move(chain_ptr));
+}
+
+// Initializes a cluster containing a single chain an associates it with a
+// unique id.
+ChainClustering::Cluster::Cluster(NodeChain *chain)
+    : Chains(1, chain), DelegateChain(chain) {}
+
+// Returns the most frequent predecessor of a chain. This function also gets as
+// the second parameter the cluster containing the chain to save a lookup
+// into ChainToClusterMap.
+ChainClustering::Cluster *
+CallChainClustering::getMostLikelyPredecessor(NodeChain *chain,
+                                              Cluster *cluster) {
+  // This map stores the total weight of the incoming edges to the given cluster
+  // from every other cluster.
+  DenseMap<Cluster *, uint64_t> clusterEdge;
+
+  for (CFGNode *n : chain->Nodes) {
+    // For non-inter-procedural, we only consider the function entries.
+    if (!propellerConfig.optReorderIP && !n->isEntryNode())
+      continue;
+    auto visit = [&clusterEdge, n, chain, cluster, this](CFGEdge &edge) {
+      if (!edge.Weight || edge.isReturn())
+        return;
+      if (!propellerConfig.optReorderIP && !edge.isCall())
+        return;
+      auto *callerChain = edge.Src->Chain;
+      if (!callerChain) {
+        warn("Caller for node: " + n->ShName + " does not have a chain!");
+        return;
+      }
+      auto *callerCluster = ChainToClusterMap[callerChain];
+      if (callerChain == chain || callerCluster == cluster)
+        return;
+      // Ignore clusters which are too big
+      if (callerCluster->Size > propellerConfig.optClusterMergeSizeThreshold)
+        return;
+      // Ignore edges which are cold relative to the sink node
+      if (edge.Weight * 10 < n->Freq)
+        return;
+      // Do not merge if the caller cluster's density would degrade by more than
+      // 1/8 by the merge.
+      if (8 * callerCluster->Size * (cluster->Freq * callerCluster->Freq) <
+          callerCluster->Freq * (cluster->Size + callerCluster->Size))
+        return;
+      clusterEdge[callerCluster] += edge.Weight;
+    };
+    n->forEachInEdgeRef(visit);
+  }
+
+  // Get the highest-frequency caller
+  auto bestCaller =
+      std::max_element(clusterEdge.begin(), clusterEdge.end(),
+                       [](const DenseMapPair<Cluster *, uint64_t> &p1,
+                          const DenseMapPair<Cluster *, uint64_t> &p2) {
+                         if (p1.second == p2.second) // Consistently break ties
+                           return std::less<Cluster *>()(p1.first, p2.first);
+                         return p1.second < p2.second;
+                       });
+
+  if (bestCaller == clusterEdge.end())
+    return nullptr;
+  return bestCaller->first;
+}
+
+void ChainClustering::sortClusters(std::vector<Cluster *> &clusterOrder) {
+  for (auto &p : Clusters)
+    clusterOrder.push_back(p.second.get());
+
+  auto clusterComparator = [](Cluster *c1, Cluster *c2) -> bool {
+    // Set a deterministic order when execution densities are equal.
+    if (c1->getDensity() == c2->getDensity())
+      return c1->DelegateChain->DelegateNode->MappedAddr <
+             c2->DelegateChain->DelegateNode->MappedAddr;
+    return c1->getDensity() > c2->getDensity();
+  };
+
+  std::sort(clusterOrder.begin(), clusterOrder.end(), clusterComparator);
+}
+
+void NoOrdering::doOrder(std::vector<CFGNode *> &hotOrder,
+                         std::vector<CFGNode *> &coldOrder) {
+  auto chainComparator = [](const std::unique_ptr<NodeChain> &c_ptr1,
+                            const std::unique_ptr<NodeChain> &c_ptr2) -> bool {
+    return c_ptr1->DelegateNode->MappedAddr < c_ptr2->DelegateNode->MappedAddr;
+  };
+
+  std::sort(HotChains.begin(), HotChains.end(), chainComparator);
+  std::sort(ColdChains.begin(), ColdChains.end(), chainComparator);
+
+  for (auto &c_ptr : HotChains)
+    for (CFGNode *n : c_ptr->Nodes)
+      hotOrder.push_back(n);
+
+  for (auto &c_ptr : ColdChains)
+    for (CFGNode *n : c_ptr->Nodes)
+      coldOrder.push_back(n);
+}
+
+// Merge clusters together based on the CallChainClustering algorithm.
+void CallChainClustering::mergeClusters() {
+  // Build a map for the execution density of each chain.
+  DenseMap<NodeChain *, double> chainWeightMap;
+
+  for (auto &c_ptr : HotChains) {
+    NodeChain *chain = c_ptr.get();
+    chainWeightMap.try_emplace(chain, chain->execDensity());
+  }
+
+  // Sort the hot chains in decreasing order of their execution density.
+  std::sort(HotChains.begin(), HotChains.end(),
+            [&chainWeightMap](const std::unique_ptr<NodeChain> &c_ptr1,
+                              const std::unique_ptr<NodeChain> &c_ptr2) {
+              auto chain1Weight = chainWeightMap[c_ptr1.get()];
+              auto chain2Weight = chainWeightMap[c_ptr2.get()];
+              if (chain1Weight == chain2Weight)
+                return c_ptr1->DelegateNode->MappedAddr <
+                       c_ptr2->DelegateNode->MappedAddr;
+              return chain1Weight > chain2Weight;
+            });
+
+  for (auto &c_ptr : HotChains) {
+    NodeChain *chain = c_ptr.get();
+    if (chainWeightMap[chain] <= 0.005)
+      break;
+    auto *cluster = ChainToClusterMap[chain];
+    // Ignore merging if the cluster containing this function is bigger than
+    // 2MBs (size of a large page).
+    if (cluster->Size > propellerConfig.optClusterMergeSizeThreshold)
+      continue;
+    assert(cluster);
+
+    Cluster *predecessorCluster = getMostLikelyPredecessor(chain, cluster);
+    if (!predecessorCluster)
+      continue;
+
+    mergeTwoClusters(predecessorCluster, cluster);
+  }
+}
+
+// This function orders the hot chains based on mergeClusters and sortClusters
+// and uses a consistent ordering for the cold part.
+void ChainClustering::doOrder(std::vector<CFGNode *> &hotOrder,
+                              std::vector<CFGNode *> &coldOrder) {
+  initClusters();
+  mergeClusters();
+  std::vector<Cluster *> clusterOrder;
+  sortClusters(clusterOrder);
+  // This maps every CFG to the position of its first hot node in the layout.
+  DenseMap<ControlFlowGraph *, size_t> hotCFGOrder;
+
+  for (Cluster *cl : clusterOrder)
+    for (NodeChain *c : cl->Chains)
+      for (CFGNode *n : c->Nodes) {
+        hotCFGOrder.try_emplace(n->CFG, hotOrder.size());
+        hotOrder.push_back(n);
+      }
+
+  auto coldChainComparator =
+      [&hotCFGOrder](const std::unique_ptr<NodeChain> &c_ptr1,
+                     const std::unique_ptr<NodeChain> &c_ptr2) -> bool {
+    // If each of the chains comes from a single CFG, we can order the chains
+    // consistently based on the hot layout
+    if (c_ptr1->CFG && c_ptr2->CFG) {
+      // If one CFG is cold and the other is hot, make sure the hot CFG's chain
+      // comes earlier in the order.
+      if (c_ptr1->CFG->isHot() != c_ptr2->CFG->isHot())
+        return c_ptr1->CFG->isHot();
+      // If both CFGs are hot, make the order for cold chains consistent with
+      // the order for hot ones.
+      if (c_ptr1->CFG->isHot() && c_ptr2->CFG->isHot() &&
+          (c_ptr1->CFG != c_ptr2->CFG))
+        return hotCFGOrder[c_ptr1->CFG] < hotCFGOrder[c_ptr2->CFG];
+      // Otherwise, use a consistent order based on the representative nodes.
+      return c_ptr1->DelegateNode->MappedAddr <
+             c_ptr2->DelegateNode->MappedAddr;
+    }
+    // Use an order consistent with the initial ordering of the representative
+    // nodes.
+    return c_ptr1->DelegateNode->MappedAddr < c_ptr2->DelegateNode->MappedAddr;
+  };
+
+  std::sort(ColdChains.begin(), ColdChains.end(), coldChainComparator);
+
+  for (auto &c_ptr : ColdChains)
+    for (CFGNode *n : c_ptr->Nodes)
+      coldOrder.push_back(n);
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/test/ELF/propeller/codelayout/function-ordering.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/function-ordering.s
@@ -0,0 +1,75 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises function reordering on three functions.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld  %t.o -o %t.out
+
+# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	0000000000201120 0000000000000008 t foo
+# BEFORE-NEXT:	0000000000201128 0000000000000008 t bar
+# BEFORE-NEXT:	0000000000201130 0000000000000008 t baz
+
+## Create a propeller profile based on the following inter-procedural calls.
+##
+##       100        100
+##  baz -----> bar -----> foo
+##
+
+# RUN: echo "Symbols" > %t_prof.propeller
+# RUN: echo "1 8 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 8 Nbar" >> %t_prof.propeller
+# RUN: echo "3 8 Nbaz" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "3 2 100 C" >> %t_prof.propeller
+# RUN: echo "2 1 100 C" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+
+## Link with the propeller profile and expect the functions to be reordered.
+#
+# RUN: ld.lld  %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out
+# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER:	0000000000201120 0000000000000008 t baz
+# REORDER-NEXT:	0000000000201128 0000000000000008 t bar
+# REORDER-NEXT:	0000000000201130 0000000000000008 t foo
+
+## Disable function reordering and expect that the original function order is retained.
+#
+# RUN: ld.lld  %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out
+# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER
+
+# NOREORDER:		0000000000201120 0000000000000008 t foo
+# NOREORDER-NEXT:	0000000000201128 0000000000000008 t bar
+# NOREORDER-NEXT:	0000000000201130 0000000000000008 t baz
+
+.section	.text,"ax",@progbits,unique,1
+# -- Begin function foo
+.type	foo,@function
+foo:
+ .quad 0x11
+
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
+# -- End function foo
+
+.section	.text,"ax",@progbits,unique,2
+# -- Begin function bar
+.type	bar,@function
+bar:
+ .quad 0x22
+
+.Lbar_end:
+ .size	bar, .Lbar_end-bar
+# -- End function bar
+
+.section	.text,"ax",@progbits,unique,3
+# -- Begin function baz
+.type	baz,@function
+baz:
+ .quad 0x33
+
+.Lbaz_end:
+ .size	baz, .Lbaz_end-baz
+# -- End function baz
Index: lld/test/ELF/propeller/codelayout/function-with-loop.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/function-with-loop.s
@@ -0,0 +1,130 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises basic block reordering on a single function with a simple loop.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld  %t.o -optimize-bb-jumps -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	0000000000201120 foo:
+# BEFORE-NEXT:	nopl    (%rax)
+
+# BEFORE:	0000000000201123 a.BB.foo:
+# BEFORE-NEXT:	nopl	(%rax)
+# BEFORE-NEXT:	je      3 <aaa.BB.foo>
+
+# BEFORE:	0000000000201128 aa.BB.foo:
+# BEFORE-NEXT:	nopl    (%rax)
+
+# BEFORE:	000000000020112b aaa.BB.foo:
+# BEFORE-NEXT:	nopl    (%rax)
+# BEFORE-NEXT:	jne     -13 <a.BB.foo>
+
+# BEFORE:	0000000000201130 aaaa.BB.foo:
+# BEFORE-NEXT:	nopl    (%rax)
+# BEFORE-NEXT:	retq
+
+
+## Create a propeller profile for foo, based on the cfg below:
+##
+##             foo
+##              |
+##              |5
+##              V
+##      +--> a.BB.foo <--+
+##      |      / \       |
+##     0|    0/   \95    |90
+##      |    /     \     |
+##      |   v       \    |
+##  aa.BB.foo        v   |
+##      \        aaa.BB.foo
+##       \           /
+##        \         /
+##        0\       /10
+##          \     /
+##           v   v
+##         aaaa.BB.foo
+##
+## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo
+## as a fallthrough chain in the layout.
+
+# RUN: echo "Symbols" > %t_prof.propeller
+# RUN: echo "1 14 Nfoo" > %t_prof.propeller
+# RUN: echo "2 5 1.1" >> %t_prof.propeller
+# RUN: echo "3 3 1.2" >> %t_prof.propeller
+# RUN: echo "4 5 1.3" >> %t_prof.propeller
+# RUN: echo "5 4 1.4" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "4 2 90" >> %t_prof.propeller
+# RUN: echo "2 4 95" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "4 5 10" >> %t_prof.propeller
+# RUN: echo "1 2 5" >> %t_prof.propeller
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out
+# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER:	0000000000201120 foo:
+# REORDER-NEXT:	nopl	(%rax)
+
+# REORDER:	0000000000201123 a.BB.foo:
+# REORDER-NEXT:	nopl	(%rax)
+# REORDER-NEXT:	jne      9 <aa.BB.foo>
+
+# REORDER:	0000000000201128 aaa.BB.foo:
+# REORDER-NEXT:	nopl    (%rax)
+# REORDER-NEXT:	jne     -10 <a.BB.foo>
+
+# REORDER:	000000000020112d aaaa.BB.foo:
+# REORDER-NEXT:	nopl    (%rax)
+# REORDER-NEXT:	retq
+
+# REORDER:	0000000000201131 aa.BB.foo:
+# REORDER-NEXT:	nopl    (%rax)
+# REORDER-NEXT:	jmp	-14 <aaa.BB.foo>
+
+## Disable basic block reordering and expect that the original bb order is retained.
+#
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out
+# RUN: diff %t.propeller.noreorder.out %t.out
+
+.section	.text,"ax",@progbits
+# -- Begin function foo
+.type	foo,@function
+foo:
+ nopl (%rax)
+ jmp	a.BB.foo
+
+.section	.text,"ax",@progbits,unique,1
+a.BB.foo:
+ nopl (%rax)
+ je	aaa.BB.foo
+ jmp	aa.BB.foo
+.La.BB.foo_end:
+ .size	a.BB.foo, .La.BB.foo_end-a.BB.foo
+
+.section	.text,"ax",@progbits,unique,2
+aa.BB.foo:
+ nopl (%rax)
+ jmp	aaa.BB.foo
+.Laa.BB.foo_end:
+ .size	aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo
+
+.section	.text,"ax",@progbits,unique,3
+aaa.BB.foo:
+ nopl (%rax)
+ jne	a.BB.foo
+ jmp	aaaa.BB.foo
+.Laaa.BB.foo_end:
+ .size	aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo
+
+.section	.text,"ax",@progbits,unique,4
+aaaa.BB.foo:
+ nopl (%rax)
+ ret
+.Laaaa.BB.foo_end:
+ .size	aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo
+
+.section	.text,"ax",@progbits
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
Index: lld/test/ELF/propeller/codelayout/opt-all-combinations.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/opt-all-combinations.s
@@ -0,0 +1,295 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises all combinations of propeller options (reorder-funcs, reorder-blocks,
+## and split-funcs) on four functions.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	Disassembly of section .text:
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	foo:
+# BEFORE-NEXT:	xorb	%al, 0
+# BEFORE-NEXT:	int3
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	bar:
+# BEFORE-NEXT:	xorb	%al, 1
+# BEFORE-NEXT:	je	9 <aa.BB.bar>
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	a.BB.bar:
+# BEFORE-NEXT:	xorb	%al, 2
+# BEFORE-NEXT:	jmp      7 <aaa.BB.bar>
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	aa.BB.bar:
+# BEFORE-NEXT:	xorb	%al, 3
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	aaa.BB.bar:
+# BEFORE-NEXT:	xorb	%al, 4
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	baz:
+# BEFORE-NEXT:	xorb	%al, 5
+# BEFORE-NEXT:	int3
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	qux:
+# BEFORE-NEXT:	xorb	%al, 6
+#
+
+## Create a propeller profile for four functions foo, bar, baz, and qux based on the cfg below:
+##
+##
+##                   bar
+##                  /   \
+##                 /     \100
+##                /       \
+##       100     v         v
+## foo <----- a.BB.bar  aa.BB.bar     baz     qux ---+
+##               \        /                    ^     |
+##                \      /100                  |   1 |
+##                 \    /                      +-----+
+##                  v  v
+##                aaa.BB.bar
+##
+
+# RUN: echo "Symbols" > %t_prof.propeller
+# RUN: echo "1 8 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 20 Nbar" >> %t_prof.propeller
+# RUN: echo "3 9 2.1" >> %t_prof.propeller
+# RUN: echo "4 7 2.2" >> %t_prof.propeller
+# RUN: echo "5 7 2.3" >> %t_prof.propeller
+# RUN: echo "6 8 Nbaz" >> %t_prof.propeller
+# RUN: echo "7 7 Nqux" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "2 4 100" >> %t_prof.propeller
+# RUN: echo "4 1 100 C" >> %t_prof.propeller
+# RUN: echo "7 7 10 C" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "4 5 100" >> %t_prof.propeller
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller --verbose -propeller-keep-named-symbols -o %t.propeller.reorder.out
+# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER:	Disassembly of section .text:
+# REORDER-EMPTY:
+# REORDER-NEXT:	bar:
+# REORDER-NEXT: xorb	%al, 1
+# REORDER-NEXT:	jne	30 <a.BB.bar>
+# REORDER-EMPTY:
+# REORDER-NEXT:	aa.BB.bar:
+# REORDER-NEXT:	xorb	%al, 3
+# REORDER-EMPTY:
+# REORDER-NEXT:	aaa.BB.bar:
+# REORDER-NEXT:	xorb	%al, 4
+# REORDER-NEXT:	int3
+# REORDER-EMPTY:
+# REORDER-NEXT:	foo:
+# REORDER-NEXT:	xorb	%al, 0
+# REORDER-NEXT:	int3
+# REORDER-EMPTY:
+# REORDER-NEXT:	qux:
+# REORDER-NEXT:	xorb	%al, 6
+# REORDER-EMPTY:
+# REORDER-NEXT:	a.BB.bar:
+# REORDER-NEXT:	xorb	%al, 2
+# REORDER-NEXT:	jmp	-32 <aaa.BB.bar>
+# REORDER-EMPTY:
+# REORDER-NEXT:	baz:
+# REORDER-NEXT:	xorb	%al, 5
+#
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorderblocks.out  2>&1 | FileCheck %s --check-prefixes=WARN,IMPLICITNOSPLIT
+# WARN-NOT:		warning:
+# IMPLICITNOSPLIT:	warning: propeller: no-reorder-blocks implicitly sets no-split-funcs.
+#
+# RUN: llvm-objdump -d %t.propeller.noreorderblocks.out| FileCheck %s --check-prefix=NO_REORDER_BB
+#
+# NO_REORDER_BB:		Disassembly of section .text:
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 1
+# NO_REORDER_BB-NEXT:	je	9 <aa.BB.bar>
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	a.BB.bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 2
+# NO_REORDER_BB-NEXT:	jmp	7 <aaa.BB.bar>
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	aa.BB.bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 3
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	aaa.BB.bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 4
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	foo:
+# NO_REORDER_BB-NEXT:	xorb	%al, 0
+# NO_REORDER_BB-NEXT:	int3
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	qux:
+# NO_REORDER_BB-NEXT:	xorb	%al, 6
+# NO_REORDER_BB-NEXT:	int3
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	baz:
+# NO_REORDER_BB-NEXT:	xorb	%al, 5
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.noreorderfuncs.out
+# RUN: llvm-objdump -d %t.propeller.noreorderfuncs.out| FileCheck %s --check-prefix=NO_REORDER_FUNC
+#
+# NO_REORDER_FUNC:	Disassembly of section .text:
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	foo:
+# NO_REORDER_FUNC-NEXT:	xorb	%al, 0
+# NO_REORDER_FUNC-NEXT:	int3
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 1
+# NO_REORDER_FUNC-NEXT:	jne	22 <a.BB.bar>
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	aa.BB.bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 3
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	aaa.BB.bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 4
+# NO_REORDER_FUNC-NEXT:	int3
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	qux:
+# NO_REORDER_FUNC-NEXT:	xorb	%al, 6
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	a.BB.bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 2
+# NO_REORDER_FUNC-NEXT:	jmp	-24 <aaa.BB.bar>
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	baz:
+# NO_REORDER_FUNC-NEXT:	xorb	%al, 5
+#
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitfuncs.out
+# RUN: llvm-objdump -d %t.propeller.nosplitfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_FUNC
+#
+# NO_SPLIT_FUNC:	Disassembly of section .text:
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 1
+# NO_SPLIT_FUNC-NEXT:	jne	14 <a.BB.bar>
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	aa.BB.bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 3
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	aaa.BB.bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 4
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	a.BB.bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 2
+# NO_SPLIT_FUNC-NEXT:	jmp	-16 <aaa.BB.bar>
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	foo:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 0
+# NO_SPLIT_FUNC-NEXT:	int3
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	qux:
+# NO_SPLIT_FUNC-NEXT:	xorb	%al, 6
+# NO_SPLIT_FUNC-NEXT:	int3
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	baz:
+# NO_SPLIT_FUNC-NEXT:	xorb	%al, 5
+#
+
+## Check that the combination of no-reorder-blocks and split-funcs makes lld fail.
+# RUN: not ld.lld  %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-opt=split-funcs 2>&1 -o /dev/null | FileCheck %s --check-prefix=FAIL
+# FAIL:		propeller: Inconsistent combination of propeller optimizations 'split-funcs' and 'no-reorder-blocks'.
+
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitreorderfuncs.out
+# RUN: llvm-objdump -d %t.propeller.nosplitreorderfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_REORDER_FUNC
+#
+# NO_SPLIT_REORDER_FUNC:	Disassembly of section .text:
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	foo:
+# NO_SPLIT_REORDER_FUNC-NEXT:	xorb	%al, 0
+# NO_SPLIT_REORDER_FUNC-NEXT:	int3
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 1
+# NO_SPLIT_REORDER_FUNC-NEXT:	jne	14 <a.BB.bar>
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	aa.BB.bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 3
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	aaa.BB.bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 4
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	a.BB.bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 2
+# NO_SPLIT_REORDER_FUNC-NEXT:	jmp	-16 <aaa.BB.bar>
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	baz:
+# NO_SPLIT_REORDER_FUNC-NEXT:	xorb	%al, 5
+# NO_SPLIT_REORDER_FUNC-NEXT:	int3
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	qux:
+# NO_SPLIT_REORDER_FUNC-NEXT:	xorb	%al, 6
+
+.section	.text,"ax",@progbits
+# -- Begin function foo
+.type	foo,@function
+.align	4
+foo:
+ xor %al,0
+
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
+# -- End function foo
+
+.section	.text,"ax",@progbits,unique,1
+# -- Begin function bar
+.type	bar,@function
+.align	4
+bar:
+ xor %al,1
+ je	aa.BB.bar
+ jmp	a.BB.bar
+
+.section	.text,"ax",@progbits,unique,2
+a.BB.bar:
+ xor %al,2
+ jmp	aaa.BB.bar
+.La.BB.bar_end:
+ .size	a.BB.bar, .La.BB.bar_end-a.BB.bar
+
+.section	.text,"ax",@progbits,unique,3
+aa.BB.bar:
+ xor %al,3
+ jmp	aaa.BB.bar
+.Laa.BB.bar_end:
+ .size	aa.BB.bar, .Laa.BB.bar_end-aa.BB.bar
+
+.section	.text,"ax",@progbits,unique,4
+aaa.BB.bar:
+ xor %al,4
+.Laaa.BB.bar_end:
+ .size	aaa.BB.bar, .Laaa.BB.bar_end-aaa.BB.bar
+
+.section	.text,"ax",@progbits,unique,1
+.Lbar_end:
+ .size	bar, .Lbar_end-bar
+# -- End function bar
+
+.section	.text,"ax",@progbits,unique,5
+# -- Begin function baz
+.type	baz,@function
+.align	4
+baz:
+ xor %al,5
+
+.Lbaz_end:
+ .size	baz, .Lbaz_end-baz
+# -- End function baz
+
+.section	.text,"ax",@progbits,unique,6
+# -- Begin function qux
+.type	qux,@function
+.align	4
+qux:
+ xor %al,6
+
+.Lqux_end:
+ .size	qux, .Lqux_end-qux
+# -- End function qux
Index: lld/test/ELF/propeller/codelayout/optimal-fallthrough.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/optimal-fallthrough.s
@@ -0,0 +1,87 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises optimal basic block reordering one a single function.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld  %t.o -optimize-bb-jumps -o %t.out
+# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	0000000000201120 0000000000000005 t foo
+# BEFORE-NEXT:	0000000000201125 0000000000000005 t a.BB.foo
+# BEFORE-NEXT:	000000000020112a 0000000000000004 t aa.BB.foo
+# BEFORE-NEXT:	000000000020112e 0000000000000004 t aaa.BB.foo
+
+## Create a propeller profile for foo, based on the cfg below:
+##
+##
+##     foo
+##      |\
+##      | \100
+##      |  \
+##      |   v
+##   150|  a.BB.foo
+##      |   /  \
+##      |  /100 \0
+##      | /      \
+##      vv        v
+##   aaa.BB.foo  aa.BB.foo
+##
+## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and
+## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo.
+## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf.
+
+# RUN: echo "Symbols" > %t_prof.propeller
+# RUN: echo "1 12 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 5 1.1" >> %t_prof.propeller
+# RUN: echo "3 4 1.2" >> %t_prof.propeller
+# RUN: echo "4 4 1.3" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "1 4 150" >> %t_prof.propeller
+# RUN: echo "2 4 100" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "1 2 100" >> %t_prof.propeller
+
+# RUN: ld.lld %t.o -propeller-debug-symbol=foo -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out
+# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER
+
+# AFTER:	0000000000201120 0000000000000005 t foo
+# AFTER-NEXT:	0000000000201125 0000000000000005 t a.BB.foo
+# AFTER-NEXT:	000000000020112a 0000000000000004 t aaa.BB.foo
+# AFTER-NEXT:	000000000020112e 0000000000000004 t aa.BB.foo
+
+#.global _start
+#_start:
+# -- Begin function foo
+.section	.text,"ax",@progbits
+.type	foo,@function
+foo:
+ nopl (%rax)
+ je	aaa.BB.foo
+ jmp	a.BB.foo
+
+.section	.text,"ax",@progbits,unique,1
+a.BB.foo:
+ nopl (%rax)
+ je	aaa.BB.foo
+ jmp	aa.BB.foo
+.La.BB.foo_end:
+ .size	a.BB.foo, .La.BB.foo_end-a.BB.foo
+
+.section	.text,"ax",@progbits,unique,2
+aa.BB.foo:
+ nopl (%rax)
+ ret
+.Laa.BB.foo_end:
+ .size	aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo
+
+.section	.text,"ax",@progbits,unique,3
+aaa.BB.foo:
+ nopl (%rax)
+ ret
+.Laaa.BB.foo_end:
+ .size	aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo
+
+.section	.text,"ax",@progbits
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
+# -- End function foo
Index: lld/test/ELF/symbol-alignment-file.s
===================================================================
--- /dev/null
+++ lld/test/ELF/symbol-alignment-file.s
@@ -0,0 +1,29 @@
+# REQUIRES: x86
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -o %t.out
+# RUN: llvm-objdump -s %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:      Contents of section .foo:
+# BEFORE-NEXT:  201120 11cccccc 2233
+
+# RUN: echo "_foo2 1" > %t_alignment.txt
+# RUN: echo "_foo3 4" >> %t_alignment.txt
+
+# RUN: ld.lld --symbol-alignment-file %t_alignment.txt %t.o -o %t2.out
+# RUN: llvm-objdump -s %t2.out| FileCheck %s --check-prefix=AFTER
+
+# AFTER:      Contents of section .foo:
+# AFTER-NEXT:  201120 1122cccc 33
+
+.section .foo,"ax",@progbits,unique,1
+_foo1:
+ .byte 0x11
+
+.section .foo,"ax",@progbits,unique,2
+.align 4
+_foo2:
+ .byte 0x22
+
+.section .foo,"ax",@progbits,unique,3
+_foo3:
+ .byte 0x33