Index: lld/ELF/Propeller/CodeLayout/CodeLayout.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/CodeLayout.h
@@ -0,0 +1,41 @@
+//===- PropellerBBReordering.h  -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H
+#define LLD_ELF_PROPELLER_BB_REORDERING_H
+
+#include "NodeChainClustering.h"
+#include "PropellerCFG.h"
+
+#include <list>
+#include <vector>
+
+namespace lld {
+namespace propeller {
+
+class CodeLayout {
+private:
+  // cfgs that are processed by the reordering algorithm. These are separated
+  // into hot and cold cfgs.
+  std::vector<ControlFlowGraph *> hotCFGs, coldCFGs;
+  // The final hot and cold order containing all cfg nodes.
+  std::vector<CFGNode *> HotOrder, ColdOrder;
+  // Handle of the clustering algorithm used to further reorder the computed
+  // chains.
+  std::unique_ptr<ChainClustering> clustering;
+
+public:
+  void doSplitOrder(std::list<StringRef> &symbolList,
+                    std::list<StringRef>::iterator hotPlaceHolder,
+                    std::list<StringRef>::iterator coldPlaceHolder);
+
+  void printStats();
+};
+
+} // namespace propeller
+} // namespace lld
+#endif
Index: lld/ELF/Propeller/CodeLayout/CodeLayout.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/CodeLayout.cpp
@@ -0,0 +1,185 @@
+//===- CodeLayout.cpp  ------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and implements the entry point of code layout optimization
+// (doSplitOrder).
+//===----------------------------------------------------------------------===//
+#include "CodeLayout.h"
+
+#include "NodeChain.h"
+#include "NodeChainBuilder.h"
+#include "NodeChainClustering.h"
+#include "Propeller.h"
+#include "PropellerCFG.h"
+#include "PropellerConfig.h"
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <chrono>
+#include <vector>
+
+using llvm::DenseMap;
+
+namespace lld {
+namespace propeller {
+extern uint64_t getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward,
+                                   uint64_t srcSinkDistance);
+
+// This function iterates over the cfgs included in the Propeller profile and
+// adds them to cold and hot cfg lists. Then it appropriately performs basic
+// block reordering by calling NodeChainBuilder.doOrder() either on all cfgs (if
+// -propeller-opt=reorder-ip) or individually on every controlFlowGraph. After
+// creating all the node chains, it hands the basic block chains to a
+// ChainClustering instance for further rerodering.
+void CodeLayout::doSplitOrder(std::list<StringRef> &symbolList,
+                              std::list<StringRef>::iterator hotPlaceHolder,
+                              std::list<StringRef>::iterator coldPlaceHolder) {
+  std::chrono::steady_clock::time_point start =
+      std::chrono::steady_clock::now();
+
+  // Populate the hot and cold cfg lists by iterating over the cfgs in the
+  // propeller profile.
+  prop->forEachCfgRef([this](ControlFlowGraph &cfg) {
+    if (cfg.isHot()) {
+      hotCFGs.push_back(&cfg);
+      if (propConfig.optPrintStats) {
+        // Dump the number of basic blocks and hot basic blocks for every
+        // function
+        unsigned hotBBs = 0;
+        unsigned allBBs = 0;
+        cfg.forEachNodeRef([&hotBBs, &allBBs](CFGNode &node) {
+          if (node.freq)
+            hotBBs++;
+          allBBs++;
+        });
+        fprintf(stderr, "HISTOGRAM: %s,%u,%u\n", cfg.name.str().c_str(), allBBs,
+                hotBBs);
+      }
+    } else
+      coldCFGs.push_back(&cfg);
+  });
+
+  if (propConfig.optReorderIP || propConfig.optReorderFuncs)
+    clustering.reset(new CallChainClustering());
+  else {
+    // If function ordering is disabled, we want to conform the the initial
+    // ordering of functions in both the hot and the cold layout.
+    clustering.reset(new NoOrdering());
+  }
+
+  if (propConfig.optReorderIP) {
+    // If -propeller-opt=reorder-ip we want to run basic block reordering on all
+    // the basic blocks of the hot cfgs.
+    NodeChainBuilder(hotCFGs).doOrder(clustering);
+  } else if (propConfig.optReorderBlocks) {
+    // Otherwise we apply reordering on every controlFlowGraph separately
+    for (ControlFlowGraph *cfg : hotCFGs)
+      NodeChainBuilder(cfg).doOrder(clustering);
+  } else {
+    // If reordering is not desired, we create changes according to the initial
+    // order in the controlFlowGraph.
+    for (ControlFlowGraph *cfg : hotCFGs)
+      clustering->addChain(std::unique_ptr<NodeChain>(new NodeChain(cfg)));
+  }
+
+  // The order for cold cfgs remains unchanged.
+  for (ControlFlowGraph *cfg : coldCFGs)
+    clustering->addChain(std::unique_ptr<NodeChain>(new NodeChain(cfg)));
+
+  // After building all the chains, let the chain clustering algorithm perform
+  // the final reordering and populate the hot and cold cfg node orders.
+  clustering->doOrder(HotOrder, ColdOrder);
+
+  // Transfter the order to the symbol list.
+  for (CFGNode *n : HotOrder)
+    symbolList.insert(hotPlaceHolder, n->shName);
+
+  for (CFGNode *n : ColdOrder)
+    symbolList.insert(coldPlaceHolder, n->shName);
+
+  std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
+  warn("[Propeller]: bb reordering took: " +
+       Twine(std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
+                 .count()));
+
+  if (propConfig.optPrintStats)
+    printStats();
+}
+
+// Prints statistics of the computed layout, including the number of partitions
+// for each function across the code layout, the edge count for each distance
+// level, and the ExtTSP score achieved for each function.
+void CodeLayout::printStats() {
+
+  DenseMap<CFGNode *, uint64_t> nodeAddressMap;
+  llvm::StringMap<unsigned> functionPartitions;
+  uint64_t currentAddress = 0;
+  ControlFlowGraph *currentCFG = nullptr;
+  for (CFGNode *n : HotOrder) {
+    if (currentCFG != n->controlFlowGraph) {
+      currentCFG = n->controlFlowGraph;
+      functionPartitions[currentCFG->name]++;
+    }
+    nodeAddressMap[n] = currentAddress;
+    currentAddress += n->shSize;
+  }
+
+  for (auto &elem : functionPartitions)
+    fprintf(stderr, "FUNCTION PARTITIONS: %s,%u\n", elem.first().str().c_str(),
+            elem.second);
+
+  std::vector<uint64_t> distances({0, 128, 640, 1028, 4096, 65536, 2 << 20,
+                                   std::numeric_limits<uint64_t>::max()});
+  std::map<uint64_t, uint64_t> histogram;
+  llvm::StringMap<uint64_t> extTSPScoreMap;
+  for (CFGNode *n : HotOrder) {
+    auto scoreEntry =
+        extTSPScoreMap.try_emplace(n->controlFlowGraph->name, 0).first;
+    n->forEachOutEdgeRef([&nodeAddressMap, &distances, &histogram,
+                          &scoreEntry](CFGEdge &edge) {
+      if (!edge.weight || edge.isReturn())
+        return;
+      if (nodeAddressMap.find(edge.src) == nodeAddressMap.end() ||
+          nodeAddressMap.find(edge.sink) == nodeAddressMap.end()) {
+        warn("Found a hot edge whose source and sink do not show up in the "
+             "layout!");
+        return;
+      }
+      uint64_t srcOffset = nodeAddressMap[edge.src];
+      uint64_t sinkOffset = nodeAddressMap[edge.sink];
+      bool edgeForward = srcOffset + edge.src->shSize <= sinkOffset;
+      uint64_t srcSinkDistance =
+          edgeForward ? sinkOffset - srcOffset - edge.src->shSize
+                      : srcOffset - sinkOffset + edge.src->shSize;
+
+      if (edge.type == CFGEdge::EdgeType::INTRA_FUNC ||
+          edge.type == CFGEdge::EdgeType::INTRA_DYNA)
+        scoreEntry->second +=
+            getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance);
+
+      auto res =
+          std::lower_bound(distances.begin(), distances.end(), srcSinkDistance);
+      histogram[*res] += edge.weight;
+    });
+  }
+
+  for (auto &elem : extTSPScoreMap)
+    fprintf(stderr, "Ext TSP score: %s %lu\n", elem.first().str().c_str(),
+            elem.second);
+  fprintf(stderr, "DISTANCE HISTOGRAM: ");
+  uint64_t sumEdgeWeights = 0;
+  for (auto elem : histogram)
+    sumEdgeWeights += elem.second;
+  for (auto elem : histogram)
+    fprintf(stderr, "\t[%lu -> %lu (%.2f%%)]", elem.first, elem.second,
+            (double)elem.second * 100 / sumEdgeWeights);
+  fprintf(stderr, "\n");
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/ModifiablePriorityQueue.h
@@ -0,0 +1,387 @@
+//===- ModifiablePriorityQueue.h  ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file implements a priority-queue data structure for storing key-value
+// pairs, with compare operations for both the key and the value type given.
+// It stores these pairs in a max-heap data structure which allows for fast O(lg
+// n)-time retrieval of the maximum element.
+// To compare two key-value pairs, the values parts are compared first. The keys
+// are compared in case of a tie. No two key-value pairs could have the same
+// key.
+//
+// The key-value pairs are also accessible using a map data structure.
+//
+// In contrast to C++ STL's priority queue, this data structure allows for fast
+// O(lg n)-time value update, or removal of any element while keep the max-heap
+// data structure sound.
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H
+#define LLD_ELF_PROPELLER_MODIFIABLE_PRIORITY_QUEUE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include <algorithm>
+#include <assert.h>
+#include <functional>
+#include <memory>
+#include <string.h>
+#include <vector>
+
+// This class defines a node in the ModifiablePriorityQueue data structure
+// containing a key, a value, pointers to its parent, and its left and right
+// children. This class must be instantiated with the key-type, the value type,
+// and their comparators.
+template <class K, class V, class CmpK, class CmpV> class HeapNode {
+public:
+  K key;
+  V value;
+  // Pointer to the parent (this is nullptr for root).
+  HeapNode<K, V, CmpK, CmpV> *parent;
+  // Pointers to the two children
+  HeapNode<K, V, CmpK, CmpV> *children[2];
+
+  HeapNode(K k, V v)
+      : key(k), value(std::move(v)), parent(nullptr), children() {}
+
+  // Get pointer to the left child
+  HeapNode<K, V, CmpK, CmpV> *leftChild() { return children[0]; }
+
+  // Get pointer to the right child
+  HeapNode<K, V, CmpK, CmpV> *rightChild() { return children[1]; }
+
+  // Whether this is the left child of its parent, returns false if this is the
+  // root.
+  bool isLeftChild() {
+    if (parent)
+      return parent->leftChild() == this;
+    return false;
+  }
+
+  // Whether this is the right child of its parent, returns false if this is the
+  // root.
+  bool isRightChild() {
+    if (parent)
+      return parent->rightChild() == this;
+    return false;
+  }
+
+  // Set the left child for this node.
+  void adoptLeftChild(HeapNode<K, V, CmpK, CmpV> *c) {
+    children[0] = c;
+    if (c)
+      c->parent = this;
+  }
+
+  // Set the right child for this node.
+  void adoptRightChild(HeapNode<K, V, CmpK, CmpV> *c) {
+    children[1] = c;
+    if (c)
+      c->parent = this;
+  }
+
+  // Set both the left and right children for this node.
+  void adoptChildren(HeapNode<K, V, CmpK, CmpV> *(&_children)[2]) {
+    adoptLeftChild(_children[0]);
+    adoptRightChild(_children[1]);
+  }
+
+  // Return a string representation for this node given an indentation level
+  // which is used for pretty-printing of the heap tree.
+  // This assumes that
+  // std::to_string has been specialized for the key and value types.
+  std::string toString(unsigned level) {
+    std::string str = std::string(level, ' ');
+    str += "NODE: " + std::to_string(key) + " -> " + std::to_string(value);
+    for (auto *c : children) {
+      if (c) {
+        str += "\n";
+        str += c->toString(level + 1);
+      }
+    }
+    return str;
+  }
+};
+
+// Comparator class for nodes given comparators for the key and value types. It
+// compares the values for the two pairs first and in the case of a tie, it
+// compares the keys.
+template <class K, class V, class CmpK, class CmpV> struct CompareHeapNode {
+  CmpK KeyComparator;
+  CmpV ValueComparator;
+  bool operator()(const HeapNode<K, V, CmpK, CmpV> &n1,
+                  const HeapNode<K, V, CmpK, CmpV> &n2) const {
+    if (ValueComparator(n1.value, n2.value))
+      return true;
+    if (ValueComparator(n2.value, n1.value))
+      return false;
+    return KeyComparator(n1.key, n2.key);
+  }
+};
+
+// This class implements a pointer-based heap-like data structure for storing
+// key-value pairs with efficient O(lg n) retrieval, removal, update, and
+// insertion of nodes.
+//
+// Any insertion happens at the lowest-right-most empty leaf position. This
+// position is accessible via the root pointer of the tree using the binary
+// representation of [size of the tree + 1]. Starting with the left-most bit, we
+// take the left child if the bit is zero and the right child if the bit is one.
+//
+// Deletion of a node happens by replacing the last node (lowest-right-most
+// occupied leaf), with the given node, removing the node from the tree, and
+// finally heapifying the replaced node up and down to keep the
+// soundness of the heap.
+//
+// Value-update is the simplest, which requires updating the node's value
+// followed by heapifying up the node up and down.
+template <class K, class V, class CmpK, class CmpV>
+class ModifiablePriorityQueue {
+private:
+  // Comparator instance for comparing HeapNodes.
+  CompareHeapNode<K, V, CmpK, CmpV> HeapNodeComparator;
+
+  // Pointers to the nodes in this heap are stored in a map to allow for fast
+  // lookup of nodes given their keys.
+  llvm::DenseMap<K, std::unique_ptr<HeapNode<K, V, CmpK, CmpV>>> nodes;
+
+  // The tree-root for this priority queue, which holds the max key-value.
+  HeapNode<K, V, CmpK, CmpV> *root = nullptr;
+
+  // Total number of nodes in the tree.
+  unsigned Size = 0;
+
+  // Set the root for this tree and set it's parent to null.
+  void assignRoot(HeapNode<K, V, CmpK, CmpV> *node) {
+    root = node;
+    if (root)
+      root->parent = nullptr;
+  }
+
+  // Helper for getting a pointer to a node using a handle using its bits.
+  HeapNode<K, V, CmpK, CmpV> *getNodeWithHandle(unsigned handle) {
+    return getNodeWithHandleHelper(root, handle);
+  }
+
+  // This function traverses the tree from the given node, using an integer
+  // handle for direction. Going from left to right in the binary representation
+  // of the handle, take right if the bit is one, and left if the bit is zero.
+  HeapNode<K, V, CmpK, CmpV> *
+  getNodeWithHandleHelper(HeapNode<K, V, CmpK, CmpV> *node, unsigned handle) {
+    if (handle == 1)
+      return node;
+    auto *p = getNodeWithHandleHelper(node, handle >> 1);
+    return (handle & 1) ? p->rightChild() : p->leftChild();
+  }
+
+  // Insert a node into the heap tree.
+  void insert(HeapNode<K, V, CmpK, CmpV> *node) {
+    if (!root) {
+      // If the tree is empty, we only need to set the root.
+      assignRoot(node);
+    } else {
+      // Otherwise, insert at the lowest-right-most leaf empty-position, which
+      // is encoded by [Size + 1]
+      unsigned handle = Size + 1;
+      // We need to find the parent of the target position.
+      auto *p = getNodeWithHandleHelper(root, handle >> 1);
+      // Attach this to the left or right of the parent node, depending on the
+      // last bit in handle.
+      if (handle & 1)
+        p->adoptRightChild(node);
+      else
+        p->adoptLeftChild(node);
+      // We only need to heapify up as this node has no children.
+      heapifyUp(node);
+    }
+    // Increment the size of the tree.
+    Size++;
+  }
+
+  // This function removes a node from the heap and returns its value.
+  // Instead of removing the node directly, it moves the lowest-right-most node
+  // to the position of the given node and heapifies it up and down.
+  //
+  // After this function, the removed node's HeapNode pointer is invalid.
+  V remove(HeapNode<K, V, CmpK, CmpV> *node) {
+    // Find the lowest-right-most node
+    auto *last = getNodeWithHandleHelper(root, Size);
+    assert(last->leftChild() == nullptr && last->rightChild() == nullptr);
+
+    // Remove the node from the children of its parents.
+    if (last->parent) {
+      if (last->isLeftChild())
+        last->parent->adoptLeftChild(nullptr);
+      else
+        last->parent->adoptRightChild(nullptr);
+    }
+
+    if (node == last) {
+      // If we are removing the last node, not much needs to be done.
+      if (node->parent == nullptr) {
+        assert(node == root);
+        assignRoot(nullptr);
+      }
+    } else { // node != last
+      if (node->parent) {
+        // Move the last node in place of the to-be-removed node.
+        if (node->isLeftChild())
+          node->parent->adoptLeftChild(last);
+        else
+          node->parent->adoptRightChild(last);
+      } else { // node->parent == nullptr, so node is root
+        assert(node == root);
+        assignRoot(last);
+      }
+      // Let the moved-node adopt the children of the to-be-removed node
+      last->adoptChildren(node->children);
+      // Finally, heapify the moved node up and down.
+      if (!heapifyUp(last))
+        heapifyDown(last);
+    }
+
+    // Decrement the number of nodes.
+    Size--;
+    // Save the node's value to be returned.
+    auto nodeElement = std::move(node->value);
+    // Remove the node from the key->node mapping, effectively deallocating the
+    // HeapNode.
+    nodes.erase(node->key);
+    return nodeElement;
+  }
+
+  // Recursively rectify the max-heap upwards from a node.
+  // This takes O(lg n) time.
+  // Returns true if any modifications occurs.
+  bool heapifyUp(HeapNode<K, V, CmpK, CmpV> *node) {
+    // Heapify upwards until no more inconsistency is found.
+    if (node->parent && HeapNodeComparator(*node->parent, *node)) {
+      swapWithParent(node);
+      heapifyUp(node);
+      return true;
+    }
+    return false;
+  }
+
+  // Recursively rectify the max-heap downwards from a node.
+  // This takes O(lg n) time.
+  // Returns true if any modifications occurs.
+  bool heapifyDown(HeapNode<K, V, CmpK, CmpV> *node) {
+    auto maxChild = std::max_element(
+        node->children, node->children + 2,
+        [this](const HeapNode<K, V, CmpK, CmpV> *c1,
+               const HeapNode<K, V, CmpK, CmpV> *c2) {
+          // This compares two HeapNode pointers, relying on
+          // HeapNodeComparator for non-null pointers.
+          return c1 == nullptr
+                     ? true
+                     : (c2 == nullptr ? false : HeapNodeComparator(*c1, *c2));
+        });
+    // Move the maximum child to the parent if required and further heapify
+    // down.
+    if (*maxChild != nullptr && HeapNodeComparator(*node, **maxChild)) {
+      swapWithParent(*maxChild);
+      heapifyDown(node);
+      return true;
+    }
+    return false;
+  }
+
+  // Helper function for swapping a node with its parent.
+  // Can only be called if the given node has a parent. Also, the given node's
+  // value should compare bigger than both its parent and its sibling.
+  // For example swapWithParent(C1) results in the following.
+  //            G                    G
+  //           /                    /
+  //          P        ->          C1
+  //         / \                  / \
+  //        C1 C2                P  C2
+  //       / \                  / \
+  //      A   B                A   B
+  // This function merely properlly corrects the parent-child relationships.
+  void swapWithParent(HeapNode<K, V, CmpK, CmpV> *node) {
+    auto *par = node->parent;
+    assert(par);
+    auto *gpar = node->parent->parent;
+    if (!gpar) {
+      assert(this->root == par);
+      this->assignRoot(node);
+    } else {
+      if (par->isLeftChild())
+        gpar->adoptLeftChild(node);
+      else
+        gpar->adoptRightChild(node);
+    }
+    auto *par_old_left = par->leftChild();
+    auto *par_old_right = par->rightChild();
+
+    par->adoptChildren(node->children);
+    if (par_old_left == node) {
+      node->adoptLeftChild(par);
+      node->adoptRightChild(par_old_right);
+    } else {
+      node->adoptLeftChild(par_old_left);
+      node->adoptRightChild(par);
+    }
+  }
+
+public:
+  // Insert a key-value pair into the heap.
+  // If the same key exists in the heap, it updates the value of the existing
+  // HeapNode and heapifies the node up or down.
+  void insert(K key, V value) {
+    auto cur = nodes.find(key);
+    if (cur != nodes.end()) {
+      cur->second->value = std::move(value);
+      if (!heapifyUp(cur->second.get()))
+        heapifyDown(cur->second.get());
+    } else {
+      // If the key is not found, create a new HeapNode and insert it into the
+      // heap.
+      auto *node = new HeapNode<K, V, CmpK, CmpV>(key, std::move(value));
+      insert(node);
+      nodes.try_emplace(key, node);
+    }
+  }
+
+  // Removes the node associated with a key from the heap if one exists.
+  void erase(K k) {
+    auto cur = nodes.find(k);
+    if (cur != nodes.end())
+      remove(cur->second.get());
+  }
+
+  // Retrieves the HeapNode corresponding to a given key (returns nullptr if no
+  // such node exists).
+  HeapNode<K, V, CmpK, CmpV> *get(K k) {
+    auto it = nodes.find(k);
+    return it == nodes.end() ? nullptr : it->second.get();
+  }
+
+  // Removes the root (max-element) from the heap.
+  void pop() {
+    assert(!empty());
+    remove(root);
+  }
+
+  // Returns the value of the max-element node.
+  V top() { return std::move(root->value); }
+
+  bool empty() { return Size == 0; }
+
+  unsigned size() { return Size; };
+
+  // Returns a string representation of the heap tree.
+  std::string toString() {
+    std::string str;
+    str += "HEAP with ";
+    str += std::to_string(Size);
+    str += " nodes\n";
+    if (root)
+      str += root->toString(0);
+    return str;
+  }
+};
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChain.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChain.h
@@ -0,0 +1,131 @@
+//===- PropellerNodeChain.h  ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_H
+#define LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_H
+
+#include "PropellerCFG.h"
+#include "llvm/ADT/DenseSet.h"
+
+#include <list>
+#include <unordered_map>
+#include <vector>
+
+using llvm::DenseSet;
+
+namespace lld {
+namespace propeller {
+
+// Represents a chain of nodes (basic blocks).
+class NodeChain {
+public:
+  // Representative node of the chain, with which it is initially constructed.
+  CFGNode *delegateNode;
+
+  // controlFlowGraph of the nodes in this chain (this will be null if the nodes
+  // come from more than one cfg).
+  ControlFlowGraph *controlFlowGraph;
+
+  // Ordered list of the nodes in this chain.
+  std::list<CFGNode *> nodes;
+
+  // Iterators to the positions in the chain where the chain transitions from
+  // one function to another.
+  std::list<std::list<CFGNode *>::iterator> functionTransitions;
+
+  // Out edges for this chain, to its own nodes and nodes of other chains.
+  std::unordered_map<NodeChain *, std::vector<CFGEdge *>> outEdges;
+
+  // chains which have outgoing edges to this chain.
+  DenseSet<NodeChain *> inEdges;
+
+  // Total binary size of the chain.
+  uint64_t size;
+
+  // Total execution frequency of the chain.
+  uint64_t freq;
+
+  // Extended TSP score of the chain.
+  uint64_t score = 0;
+
+  // Whether to print out information about how this chain joins with others.
+  bool debugChain;
+
+  // Constructor for building a NodeChain from a single Node
+  NodeChain(CFGNode *node)
+      : delegateNode(node), controlFlowGraph(node->controlFlowGraph),
+        nodes(1, node), size(node->shSize), freq(node->freq),
+        debugChain(node->controlFlowGraph->debugCFG) {}
+
+  // Constructor for building a NodeChain from all nodes in the controlFlowGraph
+  // according to the initial order.
+  NodeChain(ControlFlowGraph *cfg)
+      : delegateNode(cfg->getEntryNode()), controlFlowGraph(cfg),
+        size(cfg->size), freq(0), debugChain(cfg->debugCFG) {
+    cfg->forEachNodeRef([this](CFGNode &node) {
+      nodes.push_back(&node);
+      freq += node.freq;
+    });
+  }
+
+  // Helper function to iterate over the outgoing edges of this chain to a
+  // specific chain, while applying a given function on each edge.
+  template <class Visitor>
+  void forEachOutEdgeToChain(NodeChain *chain, Visitor V) {
+    auto it = outEdges.find(chain);
+    if (it == outEdges.end())
+      return;
+    for (CFGEdge *E : it->second)
+      V(*E, this, chain);
+  }
+
+  // This returns the execution density of the chain.
+  double execDensity() const {
+    return ((double)freq) / std::max(size, (uint64_t)1);
+  }
+
+  bool isSameCFG(const NodeChain &c) {
+    return controlFlowGraph && controlFlowGraph == c.controlFlowGraph;
+  }
+};
+
+// This returns a string representation of the chain
+std::string toString(const NodeChain &c,
+                     std::list<CFGNode *>::const_iterator slicePos);
+std::string toString(const NodeChain &c);
+
+} // namespace propeller
+} // namespace lld
+
+namespace std {
+// Specialization of std::less for NodeChain, which allows for consistent
+// tie-breaking in our Map data structures.
+template <> struct less<lld::propeller::NodeChain *> {
+  bool operator()(const lld::propeller::NodeChain *c1,
+                  const lld::propeller::NodeChain *c2) const {
+    return c1->delegateNode->mappedAddr < c2->delegateNode->mappedAddr;
+  }
+};
+
+// Specialization of std::less for pair<NodeChain,NodeChain>, which allows for
+// consistent tie-breaking in our Map data structures.
+template <>
+struct less<pair<lld::propeller::NodeChain *, lld::propeller::NodeChain *>> {
+  bool operator()(
+      const pair<lld::propeller::NodeChain *, lld::propeller::NodeChain *> p1,
+      const pair<lld::propeller::NodeChain *, lld::propeller::NodeChain *> p2)
+      const {
+    if (less<lld::propeller::NodeChain *>()(p1.first, p2.first))
+      return true;
+    if (less<lld::propeller::NodeChain *>()(p2.first, p1.first))
+      return false;
+    return less<lld::propeller::NodeChain *>()(p1.second, p2.second);
+  }
+};
+} // namespace std
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChain.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChain.cpp
@@ -0,0 +1,43 @@
+//===- NodeChain.cpp  -----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "NodeChain.h"
+
+namespace lld {
+namespace propeller {
+
+std::string toString(const NodeChain &c,
+                     std::list<CFGNode *>::const_iterator slicePos) {
+  std::string str;
+  if (c.controlFlowGraph)
+    str += c.controlFlowGraph->name.str();
+  str += " [ ";
+  for (auto it = c.nodes.begin(); it != c.nodes.end(); ++it) {
+    auto *n = *it;
+    if (it == slicePos)
+      str += "\n....SLICE POSITION....\n";
+    if (!c.controlFlowGraph)
+      str +=
+          std::to_string(n->controlFlowGraph->getEntryNode()->mappedAddr) + ":";
+    str += n->controlFlowGraph->getEntryNode() == n
+               ? "Entry"
+               : std::to_string(n->shName.size() -
+                                n->controlFlowGraph->name.size() - 4);
+    str += " (size=" + std::to_string(n->shSize) +
+           ", freq=" + std::to_string(n->freq) + ")";
+    if (n != c.nodes.back())
+      str += " -> ";
+  }
+  str += " ]";
+  str += " score: " + std::to_string(c.score);
+  return str;
+}
+
+std::string toString(const NodeChain &c) { return toString(c, c.nodes.end()); }
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.h
@@ -0,0 +1,210 @@
+//===- PropellerNodeChainAssembly.h  --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file includes the declaration of the NodeChainAssembly class. Each
+// instance of this class gives a recipe for merging two NodeChains together in
+// addition to the ExtTSP score gain that will be achieved by that merge. Each
+// NodeChainAssembly consists of three NodeChainSlices from two node chains: the
+// (potentially) splitted chain, and the unsplit chain. A NodeChainSlice has
+// represents a slice of a NodeChain by storing iterators to the beginning and
+// end of that slice in the node chain, plus the binary offsets at which these
+// slices begin and end. The offsets allow effient computation of the gain in
+// ExtTSP score.
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_ASSEMBLY_H
+#define LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_ASSEMBLY_H
+
+#include "NodeChain.h"
+#include "PropellerCFG.h"
+
+#include <list>
+#include <vector>
+
+namespace lld {
+namespace propeller {
+
+uint64_t getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward,
+                            uint64_t srcSinkDistance);
+
+// This class defines a slices of a node chain, specified by iterators to the
+// beginning and end of the slice.
+class NodeChainSlice {
+public:
+  // chain from which this slice comes from
+  NodeChain *chain;
+
+  // The endpoints of the slice in the corresponding chain
+  std::list<CFGNode *>::iterator beginPosition, endPosition;
+
+  // The offsets corresponding to the two endpoints
+  uint64_t beginOffset, endOffset;
+
+  // Constructor for building a chain slice from a given chain and the two
+  // endpoints of the chain.
+  NodeChainSlice(NodeChain *c, std::list<CFGNode *>::iterator begin,
+                 std::list<CFGNode *>::iterator end)
+      : chain(c), beginPosition(begin), endPosition(end) {
+
+    beginOffset = (*begin)->chainOffset;
+    if (endPosition == chain->nodes.end())
+      endOffset = chain->size;
+    else
+      endOffset = (*end)->chainOffset;
+  }
+
+  // Constructor for building a chain slice from a node chain containing all of
+  // its nodes.
+  NodeChainSlice(NodeChain *c)
+      : chain(c), beginPosition(c->nodes.begin()), endPosition(c->nodes.end()),
+        beginOffset(0), endOffset(c->size) {}
+
+  // (Binary) size of this slice
+  uint64_t size() const { return endOffset - beginOffset; }
+
+  bool isEmpty() const { return beginPosition == endPosition; }
+};
+
+// This enum represents the order in which three slices (S1, S2, and U) are
+// merged together. We exclude S1S2U and US1S2 since they are respectively
+// equivalent to S2S1U (with S2 being empty) and U2U1S (with U2 being empty).
+enum MergeOrder {
+  Begin,
+  S2S1U = Begin,
+  BeginNext,
+  S1US2 = BeginNext,
+  S2US1,
+  US2S1,
+  End
+};
+
+// This class defines a strategy for assembling two chains together with one of
+// the chains potentially split into two chains.
+class NodeChainAssembly {
+public:
+  // The gain in ExtTSP score achieved by this NodeChainAssembly once it
+  // is accordingly applied to the two chains.
+  // This is effectively equal to "score - splitChain->score -
+  // unsplitChain->score".
+  uint64_t scoreGain = 0;
+
+  // The two chains, the first being the splitChain and the second being the
+  // unsplitChain.
+  std::pair<NodeChain *, NodeChain *> chainPair;
+
+  // The splitting position in splitchain
+  std::list<CFGNode *>::iterator slicePosition;
+
+  // The three chain slices
+  std::vector<NodeChainSlice> slices;
+
+  // The merge order of the slices
+  MergeOrder mergeOrder;
+
+  // The constructor for creating a NodeChainAssembly. slicePosition must be an
+  // iterator into splitChain->nodes.
+  NodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain,
+                    std::list<CFGNode *>::iterator slicePosition,
+                    MergeOrder mOrder)
+      : chainPair(splitChain, unsplitChain), slicePosition(slicePosition),
+        mergeOrder(mOrder) {
+    // Construct the slices.
+    NodeChainSlice s1(splitChain, splitChain->nodes.begin(), slicePosition);
+    NodeChainSlice s2(splitChain, slicePosition, splitChain->nodes.end());
+    NodeChainSlice u(unsplitChain);
+
+    switch (mergeOrder) {
+    case MergeOrder::S2S1U:
+      slices = {std::move(s2), std::move(s1), std::move(u)};
+      break;
+    case MergeOrder::S1US2:
+      slices = {std::move(s1), std::move(u), std::move(s2)};
+      break;
+    case MergeOrder::S2US1:
+      slices = {std::move(s2), std::move(u), std::move(s1)};
+      break;
+    case MergeOrder::US2S1:
+      slices = {std::move(u), std::move(s2), std::move(s1)};
+      break;
+    default:
+      assert("Invalid MergeOrder!" && false);
+    }
+
+    // Set the ExtTSP score gain as the difference between the new score after
+    // merging these chains and the current scores of the two chains.
+    auto assemblyScore = computeExtTSPScore();
+    auto chainsScore = splitChain->score + unsplitChain->score;
+    scoreGain = assemblyScore > chainsScore ? assemblyScore - chainsScore : 0;
+  }
+
+  bool isValid() { return scoreGain; }
+
+  // Find the NodeChainSlice in this NodeChainAssembly which contains the given
+  // node. If the node is not contained in this NodeChainAssembly, then return
+  // false. Otherwise, set idx equal to the index of the corresponding slice and
+  // return true.
+  bool findSliceIndex(CFGNode *node, NodeChain *chain, uint64_t offset,
+                      uint8_t &idx) const;
+
+  // This function computes the ExtTSP score for a chain assembly record. This
+  // goes over the three bb slices in the assembly record and considers all
+  // edges whose source and sink belong to the chains in the assembly record.
+  uint64_t computeExtTSPScore() const;
+
+  // First node in the resulting assembled chain.
+  CFGNode *getFirstNode() const {
+    for (auto &slice : slices)
+      if (!slice.isEmpty())
+        return *slice.beginPosition;
+    return nullptr;
+  }
+
+  // Comparator for two nodeChainAssemblies. It compare scoreGain and break ties
+  // consistently.
+  struct CompareNodeChainAssembly {
+    bool operator()(const std::unique_ptr<NodeChainAssembly> &a1,
+                    const std::unique_ptr<NodeChainAssembly> &a2) const;
+  };
+
+  // We delete the copy constructor to make sure NodeChainAssembly is moved
+  // rather than copied.
+  NodeChainAssembly(NodeChainAssembly &&) = default;
+  // copy constructor is implicitly deleted
+  // NodeChainAssembly(NodeChainAssembly&) = delete;
+  NodeChainAssembly() = delete;
+
+  // This returns a unique value for every different assembly record between two
+  // chains. When chainPair is equal, this helps differentiate and compare the
+  // two assembly records.
+  std::pair<uint8_t, size_t> assemblyStrategy() const {
+    return std::make_pair(mergeOrder, (*slicePosition)->mappedAddr);
+  }
+
+  inline bool splits() const {
+    return slicePosition != splitChain()->nodes.begin();
+  }
+
+  inline bool splitsAtFunctionTransition() const {
+    return splits() && ((*std::prev(slicePosition))->controlFlowGraph !=
+                        (*slicePosition)->controlFlowGraph);
+  }
+
+  inline bool needsSplitChainRotation() {
+    return (mergeOrder == S2S1U && splits()) || mergeOrder == S2US1 ||
+           mergeOrder == US2S1;
+  }
+
+  inline NodeChain *splitChain() const { return chainPair.first; }
+
+  inline NodeChain *unsplitChain() const { return chainPair.second; }
+};
+
+std::string toString(NodeChainAssembly &assembly);
+
+} // namespace propeller
+} // namespace lld
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainAssembly.cpp
@@ -0,0 +1,204 @@
+//===- NodeChainAssembly.cpp  ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "NodeChainAssembly.h"
+#include "NodeChain.h"
+
+namespace lld {
+namespace propeller {
+
+// Return the Extended TSP score for one edge, given its source to sink
+// direction and distance in the layout.
+uint64_t getEdgeExtTSPScore(const CFGEdge &edge, bool isEdgeForward,
+                            uint64_t srcSinkDistance) {
+
+  // Approximate callsites to be in the middle of the source basic block.
+  if (edge.isCall()) {
+    if (isEdgeForward)
+      srcSinkDistance += edge.src->shSize / 2;
+    else
+      srcSinkDistance -= edge.src->shSize / 2;
+  }
+
+  if (edge.isReturn()) {
+    if (isEdgeForward)
+      srcSinkDistance += edge.sink->shSize / 2;
+    else
+      srcSinkDistance -= edge.sink->shSize / 2;
+  }
+
+  if (srcSinkDistance == 0 && (edge.type == CFGEdge::EdgeType::INTRA_FUNC ||
+                               edge.type == CFGEdge::EdgeType::INTRA_DYNA))
+    return edge.weight * propConfig.optFallthroughWeight;
+
+  if (isEdgeForward && srcSinkDistance < propConfig.optForwardJumpDistance)
+    return edge.weight * propConfig.optForwardJumpWeight *
+           (propConfig.optForwardJumpDistance - srcSinkDistance);
+
+  if (!isEdgeForward && srcSinkDistance < propConfig.optBackwardJumpDistance)
+    return edge.weight * propConfig.optBackwardJumpWeight *
+           (propConfig.optBackwardJumpDistance - srcSinkDistance);
+  return 0;
+}
+
+bool NodeChainAssembly::findSliceIndex(CFGNode *node, NodeChain *chain,
+                                       uint64_t offset, uint8_t &idx) const {
+  for (idx = 0; idx < 3; ++idx) {
+    if (chain != slices[idx].chain)
+      continue;
+    // We find if the node's offset lies within the begin and end offset of this
+    // slice.
+    if (offset < slices[idx].beginOffset || offset > slices[idx].endOffset)
+      continue;
+    if (offset < slices[idx].endOffset && offset > slices[idx].beginOffset)
+      return true;
+    // A node can have zero size, which means multiple nodes may be associated
+    // with the same offset. This means that if the node's offset is at the
+    // beginning or the end of the slice, the node may reside in either slices
+    // of the chain.
+    if (offset == slices[idx].endOffset) {
+      // If offset is at the end of the slice, iterate backwards over the
+      // slice to find a zero-sized node.
+      for (auto nodeIt = std::prev(slices[idx].endPosition);
+           nodeIt != std::prev(slices[idx].beginPosition); nodeIt--) {
+        // Stop iterating if the node's size is non-zero as this would change
+        // the offset.
+        if ((*nodeIt)->shSize)
+          break;
+        // Have we found the node?
+        if (*nodeIt == node)
+          return true;
+      }
+    }
+    if (offset == slices[idx].beginOffset) {
+      // If offset is at the beginning of the slice, iterate forwards over the
+      // slice to find the node.
+      for (auto nodeIt = slices[idx].beginPosition;
+           nodeIt != slices[idx].endPosition; nodeIt++) {
+        if (*nodeIt == node)
+          return true;
+        // Stop iterating if the node's size is non-zero as this would change
+        // the offset.
+        if ((*nodeIt)->shSize)
+          break;
+      }
+    }
+  }
+  return false;
+}
+
+// This function computes the ExtTSP score for a chain assembly record. This
+// goes the three bb slices in the assembly record and considers all edges
+// whose source and sink belongs to the chains in the assembly record.
+uint64_t NodeChainAssembly::computeExtTSPScore() const {
+  // Zero-initialize the score.
+  uint64_t score = 0;
+
+  auto addEdgeScore = [this, &score](CFGEdge &edge, NodeChain *srcChain,
+                                     NodeChain *sinkChain) {
+    uint8_t srcSliceIdx, sinkSliceIdx;
+    auto srcNodeOffset = edge.src->chainOffset;
+    auto sinkNodeOffset = edge.sink->chainOffset;
+    if (!findSliceIndex(edge.src, srcChain, srcNodeOffset, srcSliceIdx))
+      return;
+
+    if (!findSliceIndex(edge.sink, sinkChain, sinkNodeOffset, sinkSliceIdx))
+      return;
+
+    bool edgeForward = (srcSliceIdx < sinkSliceIdx) ||
+                       (srcSliceIdx == sinkSliceIdx &&
+                        (srcNodeOffset + edge.src->shSize <= sinkNodeOffset));
+
+    uint64_t srcSinkDistance = 0;
+
+    if (srcSliceIdx == sinkSliceIdx) {
+      srcSinkDistance = edgeForward
+                            ? sinkNodeOffset - srcNodeOffset - edge.src->shSize
+                            : srcNodeOffset - sinkNodeOffset + edge.src->shSize;
+    } else {
+      const NodeChainSlice &srcSlice = slices[srcSliceIdx];
+      const NodeChainSlice &sinkSlice = slices[sinkSliceIdx];
+      srcSinkDistance =
+          edgeForward
+              ? srcSlice.endOffset - srcNodeOffset - edge.src->shSize +
+                    sinkNodeOffset - sinkSlice.beginOffset
+              : srcNodeOffset - srcSlice.beginOffset + edge.src->shSize +
+                    sinkSlice.endOffset - sinkNodeOffset;
+      // Increment the distance by the size of the middle slice if the src
+      // and sink are from the two ends.
+      if (std::abs(((int16_t)sinkSliceIdx) - ((int16_t)srcSliceIdx)) == 2)
+        srcSinkDistance += slices[1].size();
+    }
+
+    score += getEdgeExtTSPScore(edge, edgeForward, srcSinkDistance);
+  };
+
+  // No changes will be made to the score that is contributed by the unsplit
+  // chain and we can simply increment by the chain's stored score.
+  score += unsplitChain()->score;
+
+  // We need to recompute the score induced by the split chain (if it has really
+  // been split) as the offsets of the nodes have changed.
+  if (splits())
+    splitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore);
+  else
+    score += splitChain()->score;
+
+  // Consider the contribution to score for inter-chain edges.
+  splitChain()->forEachOutEdgeToChain(unsplitChain(), addEdgeScore);
+  unsplitChain()->forEachOutEdgeToChain(splitChain(), addEdgeScore);
+
+  return score;
+}
+
+bool NodeChainAssembly::CompareNodeChainAssembly::operator()(
+    const std::unique_ptr<NodeChainAssembly> &a1,
+    const std::unique_ptr<NodeChainAssembly> &a2) const {
+
+  if (a1->scoreGain == a2->scoreGain) {
+    // If score gains are equal, we pick a consistent order based on the chains
+    // in the assembly records
+    if (std::less<std::pair<NodeChain *, NodeChain *>>()(a1->chainPair,
+                                                         a2->chainPair))
+      return true;
+    if (std::less<std::pair<NodeChain *, NodeChain *>>()(a2->chainPair,
+                                                         a1->chainPair))
+      return false;
+    // When even the chain pairs are the same, we resort to the assembly
+    // strategy to pick a consistent order.
+    return a1->assemblyStrategy() < a2->assemblyStrategy();
+  }
+  return a1->scoreGain < a2->scoreGain;
+}
+
+static std::string toString(MergeOrder mOrder) {
+  switch (mOrder) {
+  case MergeOrder::S2S1U:
+    return "S2S1U";
+  case MergeOrder::S1US2:
+    return "S1US2";
+  case MergeOrder::S2US1:
+    return "S2US1";
+  case MergeOrder::US2S1:
+    return "US2S1";
+  default:
+    assert("Invalid MergeOrder!" && false);
+    return "";
+  }
+}
+
+std::string toString(NodeChainAssembly &assembly) {
+  std::string str("assembly record between:\n");
+  str += toString(*assembly.splitChain(), assembly.slicePosition) + " as S\n";
+  str += toString(*assembly.unsplitChain()) + " as U\n";
+  str += "merge order: " + toString(assembly.mergeOrder) + "\n";
+  str += "scoreGain: " + std::to_string(assembly.scoreGain);
+  return str;
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.h
@@ -0,0 +1,126 @@
+//===- PropellerNodeChainBuilder.h  ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_BUILDER_H
+#define LLD_ELF_PROPELLER_CODE_LAYOUT_NODE_CHAIN_BUILDER_H
+
+#include "ModifiablePriorityQueue.h"
+#include "NodeChain.h"
+#include "NodeChainAssembly.h"
+#include "NodeChainClustering.h"
+#include "PropellerCFG.h"
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+using llvm::DenseMap;
+
+namespace lld {
+namespace propeller {
+
+// bb chain builder based on the ExtTSP metric
+class NodeChainBuilder {
+private:
+  NodeChainAssembly::CompareNodeChainAssembly nodeChainAssemblyComparator;
+  // Cfgs repreresenting the functions that are reordered
+  std::vector<ControlFlowGraph *> cfgs;
+
+  // Set of built chains, keyed by section index of their Delegate nodes.
+  // chains are removed from this Map once they are merged into other chains.
+  DenseMap<uint64_t, std::unique_ptr<NodeChain>> chains;
+
+  // All the initial chains, seperated into connected components
+  std::vector<std::vector<NodeChain *>> components;
+
+  // NodeChainBuilder performs bb ordering component by component.
+  // This is the component number that the chain builder is currently working
+  // on.
+  unsigned currentComponent;
+
+  // These represent all the edges which are -- based on the profile -- the only
+  // (executed) outgoing edges from their source node and the only (executed)
+  // incoming edges to their sink nodes. The algorithm will make sure that these
+  // edges form fall-throughs in the final order.
+  DenseMap<CFGNode *, CFGNode *> mutuallyForcedOut;
+
+  // This maps every (ordered) pair of chains (with the first chain in the pair
+  // potentially splittable) to the highest-gain NodeChainAssembly for those
+  // chains. The Heap data structure allows fast retrieval of the maximum gain
+  // NodeChainAssembly, along with fast update.
+  ModifiablePriorityQueue<std::pair<NodeChain *, NodeChain *>,
+                          std::unique_ptr<NodeChainAssembly>,
+                          std::less<std::pair<NodeChain *, NodeChain *>>,
+                          NodeChainAssembly::CompareNodeChainAssembly>
+      nodeChainAssemblies;
+
+  // This map stores the candidate chains for each chain.
+  //
+  // For every chain, its candidate chains are the chains which can increase the
+  // overall ExtTSP score when merged with that chain. This is used to update
+  // the nodeChainAssemblies map whenever chains merge together. The candidate
+  // chains of a chain may also be updated as result of a merge.
+  std::unordered_map<NodeChain *, std::unordered_set<NodeChain *>>
+      candidateChains;
+
+  void coalesceChains();
+  void initializeExtTSP();
+
+  // This function separates builds the connected components for the chains so
+  // it can later merge each connected component separately.
+  // chains which are not connected to each other via profile edges will never
+  // be merged together by NodeChainBuilder. However, the composed chains may
+  // later be interleaved by ChainClustering.
+  void initializeChainComponents();
+  void attachFallThroughs();
+
+  // This function tries to place two nodes immediately adjacent to
+  // each other (used for fallthroughs).
+  // Returns true if this can be done.
+  bool attachNodes(CFGNode *src, CFGNode *sink);
+
+  void mergeChainEdges(NodeChain *splitChain, NodeChain *unsplitChain);
+
+  void mergeInOutEdges(NodeChain *mergerChain, NodeChain *mergeeChain);
+  void mergeChains(NodeChain *leftChain, NodeChain *rightChain);
+  void mergeChains(std::unique_ptr<NodeChainAssembly> assembly);
+
+  // Recompute the ExtTSP score of a chain
+  uint64_t computeExtTSPScore(NodeChain *chain) const;
+
+  // Update the related NodeChainAssembly records for two chains, with the
+  // assumption that unsplitChain has been merged into splitChain.
+  bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain);
+
+  void mergeAllChains();
+
+  void init();
+
+  // Initialize the mutuallyForcedOut map
+  void initMutuallyForcedEdges(ControlFlowGraph &cfg);
+
+  // Initialize basic block chains, with one chain for every node
+  void initNodeChains(ControlFlowGraph &cfg);
+
+public:
+  // This invokes the Extended TSP algorithm, orders the hot and cold basic
+  // blocks and inserts their associated symbols at the corresponding locations
+  // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the
+  // given SymbolList.
+  void doOrder(std::unique_ptr<ChainClustering> &clustering);
+
+  NodeChainBuilder(std::vector<ControlFlowGraph *> &cfgs) : cfgs(cfgs) {}
+
+  NodeChainBuilder(ControlFlowGraph *cfg) : cfgs(1, cfg) {}
+};
+
+} // namespace propeller
+} // namespace lld
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainBuilder.cpp
@@ -0,0 +1,816 @@
+//===- NodeChainBuilder.cpp  ----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and includes the implementation of intra-function basic block
+// reordering algorithm based on the Extended TSP metric as described in [1].
+//
+// The Extend TSP metric (ExtTSP) provides a score for every ordering of basic
+// blocks in a function, by combining the gains from fall-throughs and short
+// jumps.
+//
+// Given an ordering of the basic blocks, for a function f, the ExtTSP score is
+// computed as follows.
+//
+// sum_{edges e in f} frequency(e) * weight(e)
+//
+// where frequency(e) is the execution frequency and weight(e) is computed as
+// follows:
+//  * 1 if distance(src[e], sink[e]) = 0 (i.e. fallthrough)
+//
+//  * 0.1 * (1 - distance(src[e], sink[e]) / 1024) if src[e] < sink[e] and 0 <
+//    distance(src[e], sink[e]) < 1024 (i.e. short forward jump)
+//
+//  * 0.1 * (1 - distance(src[e], sink[e]) / 640) if src[e] > sink[e] and 0 <
+//    distance(src[e], sink[e]) < 640 (i.e. short backward jump)
+//
+//  * 0 otherwise
+//
+// In short, it computes a weighted sum of frequencies of all edges in the
+// control flow graph. Each edge gets its weight depending on whether the given
+// ordering makes the edge a fallthrough, a short forward jump, or a short
+// backward jump.
+//
+// Although this problem is NP-hard like the regular TSP, an iterative greedy
+// basic-block-chaining algorithm is used to find a close to optimal solution.
+// This algorithm is described as follows.
+//
+// Starting with one basic block sequence (bb chain) for every basic block, the
+// algorithm iteratively joins bb chains together in order to maximize the
+// extended TSP score of all the chains.
+//
+// Initially, it finds all mutually-forced edges in the profiled
+// controlFlowGraph. These are all the edges which are -- based on the profile
+// -- the only (executed) outgoing edge from their source node and the only
+// (executed) incoming edges to their sink nodes. Next, the source and sink of
+// all mutually-forced edges are attached together as fallthrough edges.
+//
+// Then, at every iteration, the algorithm tries to merge a pair of bb chains
+// which leads to the highest gain in the ExtTSP score. The algorithm extends
+// the search space by considering splitting short (less than 1KBs in
+// binary size) bb chains into two chains and then merging these two chains with
+// the other chain in four ways. After every merge, the new merge gains are
+// updated. The algorithm repeats joining bb chains until no additional can be
+// achieved. Eventually, it sorts all the existing chains in decreasing order
+// of their execution density, i.e., the total profiled frequency of the chain
+// divided by its binary size.
+//
+// The values used by this algorithm are reconfiguriable using lld's propeller
+// flags. Specifically, these parameters are:
+//
+//   * propeller-forward-jump-distance: maximum distance of a forward jump
+//     default-set to 1024 in the above equation).
+//
+//   * propeller-backward-jump-distance: maximum distance of a backward jump
+//     (default-set to 640 in the above equation).
+//
+//   * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1)
+//
+//   * propeller-forward-jump-weight: weight of a forward jump (default-set to
+//     0.1)
+//
+//   * propeller-backward-jump-weight: weight of a backward jump (default-set to
+//     0.1)
+//
+//   * propeller-chain-split-threshold: maximum binary size of a bb chain which
+//     the algorithm will consider for splitting (default-set to 1KB).
+//
+// References:
+//   * [1] A. Newell and S. Pupyrev, Improved Basic Block Reordering, available
+//         at https://arxiv.org/abs/1809.04676
+//===----------------------------------------------------------------------===//
+
+#include "NodeChainBuilder.h"
+
+#include "llvm/ADT/DenseSet.h"
+
+using llvm::DenseSet;
+
+using llvm::detail::DenseMapPair;
+
+namespace lld {
+namespace propeller {
+
+// Initializes the node chains for each cfg and finds all the mutually-forced
+// edges.
+void NodeChainBuilder::init() {
+  for (ControlFlowGraph *cfg : cfgs) {
+    initNodeChains(*cfg);
+    initMutuallyForcedEdges(*cfg);
+  }
+}
+
+// NodeChainBuilder calls this function after building all the chains to attach
+// as many fall-throughs as possible. Given that the algorithm already optimizes
+// the extend TSP score, this function will only affect the cold basic blocks
+// and thus we do not need to consider the edge weights.
+void NodeChainBuilder::attachFallThroughs() {
+  for (ControlFlowGraph *Cfg : cfgs) {
+    // First, try to keep the fall-throughs from the original order.
+    for (auto &Node : Cfg->nodes) {
+      if (Node->ftEdge != nullptr) {
+        attachNodes(Node.get(), Node->ftEdge->sink);
+      }
+    }
+
+    // Sometimes, the original fall-throughs cannot be kept. So we try to find
+    // new fall-through opportunities which did not exist in the original order.
+    for (auto &Edge : Cfg->intraEdges) {
+      if (Edge->type == CFGEdge::EdgeType::INTRA_FUNC ||
+          Edge->type == CFGEdge::EdgeType::INTRA_DYNA)
+        attachNodes(Edge->src, Edge->sink);
+    }
+  }
+}
+
+// This function sorts bb chains in decreasing order of their execution density.
+// NodeChainBuilder calls this function at the end to ensure that hot bb chains
+// are placed at the beginning of the function.
+void NodeChainBuilder::coalesceChains() {
+  std::vector<NodeChain *> chainOrder;
+  for (DenseMapPair<uint64_t, std::unique_ptr<NodeChain>> &elem : chains)
+    chainOrder.push_back(elem.second.get());
+
+  std::sort(
+      chainOrder.begin(), chainOrder.end(), [](NodeChain *c1, NodeChain *c2) {
+        if (!c1->isSameCFG(*c2))
+          error("Attempting to coalesce chains belonging to different "
+                "functions.");
+        // Always place the hot chains before cold ones
+        if (c1->freq == 0 ^ c2->freq == 0)
+          return c1->freq != 0;
+
+        // Place the entry node's chain before other chains
+        auto *entryNode = c1->controlFlowGraph->getEntryNode();
+        if (entryNode->chain == c1)
+          return true;
+        if (entryNode->chain == c2)
+          return false;
+
+        // Sort chains in decreasing order of execution density, and break ties
+        // according to the initial ordering.
+        double c1ExecDensity = c1->execDensity();
+        double c2ExecDensity = c2->execDensity();
+        if (c1ExecDensity == c2ExecDensity)
+          return c1->delegateNode->mappedAddr < c2->delegateNode->mappedAddr;
+        return c1ExecDensity > c2ExecDensity;
+      });
+
+  // We will merge all chains into at most two chains; a hot and cold one.
+  NodeChain *mergerChain = nullptr;
+
+  for (NodeChain *c : chainOrder) {
+    if (!mergerChain) {
+      mergerChain = c;
+      continue;
+    }
+    // Create a cold partition when -propeller-split-funcs is set.
+    if (propConfig.optSplitFuncs && (mergerChain->freq != 0 && c->freq == 0)) {
+      mergerChain = c;
+      continue;
+    }
+    // Merge this chain into the merger chain
+    mergeChains(mergerChain, c);
+  }
+}
+
+// Merge two chains in the specified order.
+void NodeChainBuilder::mergeChains(NodeChain *leftChain,
+                                   NodeChain *rightChain) {
+  if ((propConfig.optReorderIP || propConfig.optSplitFuncs) &&
+      (leftChain->freq == 0 ^ rightChain->freq == 0))
+    error("Attempting to merge hot and cold chains: \n" + toString(*leftChain) +
+          "\nAND\n" + toString(*rightChain));
+
+  if (leftChain->debugChain || rightChain->debugChain)
+    fprintf(stderr, "MERGING chains:\n%s\nAND%s\n",
+            toString(*leftChain).c_str(), toString(*rightChain).c_str());
+
+  mergeInOutEdges(leftChain, rightChain);
+
+  auto rightChainBegin = rightChain->nodes.begin();
+
+  leftChain->nodes.splice(leftChain->nodes.end(), rightChain->nodes);
+
+  for (auto it = rightChainBegin; it != leftChain->nodes.end(); ++it) {
+    (*it)->chain = leftChain;
+    (*it)->chainOffset += leftChain->size;
+  }
+
+  leftChain->size += rightChain->size;
+  leftChain->freq += rightChain->freq;
+
+  leftChain->debugChain |= rightChain->debugChain;
+  if (leftChain->controlFlowGraph &&
+      leftChain->controlFlowGraph != rightChain->controlFlowGraph)
+    leftChain->controlFlowGraph = nullptr;
+
+  chains.erase(rightChain->delegateNode->mappedAddr);
+}
+
+// This function tries to place two basic blocks immediately adjacent to each
+// other (used for fallthroughs). Returns true if the basic blocks have been
+// attached this way.
+bool NodeChainBuilder::attachNodes(CFGNode *src, CFGNode *sink) {
+  if (sink->isEntryNode())
+    return false;
+
+  // Ignore edges between hot and cold basic blocks.
+  if (src->freq == 0 ^ sink->freq == 0)
+    return false;
+  NodeChain *srcChain = src->chain;
+  NodeChain *sinkChain = sink->chain;
+  // Skip this edge if the source and sink are in the same chain
+  if (srcChain == sinkChain)
+    return false;
+
+  // It's possible to form a fall-through between src and sink only if
+  // they are respectively located at the end and beginning of their chains.
+  if (srcChain->nodes.back() != src || sinkChain->nodes.front() != sink)
+    return false;
+  // Attaching is possible. So we merge the chains in the corresponding order.
+  mergeChains(srcChain, sinkChain);
+  return true;
+}
+
+// This function merges the in-and-out chain-edges of one chain (mergeeChain)
+// into those of another (mergerChain).
+void NodeChainBuilder::mergeInOutEdges(NodeChain *mergerChain,
+                                       NodeChain *mergeeChain) {
+  // Add out-edges of mergeeChain to the out-edges of mergerChain
+  for (auto &elem : mergeeChain->outEdges) {
+    NodeChain *c = (elem.first == mergeeChain) ? mergerChain : elem.first;
+    auto res = mergerChain->outEdges.emplace(c, elem.second);
+    // If the chain-edge is already present, just add the controlFlowGraph
+    // edges, otherwise, add mergerChain to in-edges of the chain.
+    if (!res.second)
+      res.first->second.insert(res.first->second.end(), elem.second.begin(),
+                               elem.second.end());
+    else
+      c->inEdges.insert(mergerChain);
+
+    // Remove mergeeChain from in-edges
+    c->inEdges.erase(mergeeChain);
+  }
+
+  // Add in-edges of mergeeChain to in-edges of mergerChain
+  for (auto *c : mergeeChain->inEdges) {
+    // Self edges were already handled above.
+    if (c == mergeeChain)
+      continue;
+    // Move all controlFlowGraph edges from being mapped to mergeeChain to being
+    // mapped to mergerChain.
+    auto &mergeeChainEdges = c->outEdges[mergeeChain];
+    auto &mergerChainEdges = c->outEdges[mergerChain];
+
+    mergerChainEdges.insert(mergerChainEdges.end(), mergeeChainEdges.begin(),
+                            mergeeChainEdges.end());
+    mergerChain->inEdges.insert(c);
+    // Remove the defunct chain from out edges
+    c->outEdges.erase(mergeeChain);
+  }
+}
+
+// Merge two bb sequences according to the given NodeChainAssembly. A
+// NodeChainAssembly is an ordered triple of three slices from two chains.
+void NodeChainBuilder::mergeChains(
+    std::unique_ptr<NodeChainAssembly> assembly) {
+  if (assembly->splitChain()->freq == 0 ^ assembly->unsplitChain()->freq == 0)
+    error("Attempting to merge hot and cold chains: \n" +
+          toString(*assembly.get()));
+
+  if (assembly->splitChain()->freq == 0 ^ assembly->unsplitChain()->freq == 0)
+    error("Attempting to merge hot and cold chains: \n" +
+          toString(*assembly.get()));
+
+  // Decide which chain gets merged into the other chain, to make the reordering
+  // more efficient.
+  NodeChain *mergerChain = (assembly->mergeOrder == US2S1)
+                               ? assembly->unsplitChain()
+                               : assembly->splitChain();
+  NodeChain *mergeeChain = (assembly->mergeOrder == US2S1)
+                               ? assembly->splitChain()
+                               : assembly->unsplitChain();
+
+  // Merge in and out edges of the two chains
+  mergeInOutEdges(mergerChain, mergeeChain);
+
+  // Does S2 mark a function transition?
+  bool S2FuncTransition = assembly->splitsAtFunctionTransition();
+
+  // Create the new node order according the given assembly
+  auto S1Begin = assembly->splitChain()->nodes.begin();
+  auto S2Begin = assembly->slicePosition;
+  auto UBegin = assembly->unsplitChain()->nodes.begin();
+
+  // Reorder S1S2 to S2S1 if needed. This operation takes O(1) because we're
+  // splicing from and into the same list.
+  if (assembly->needsSplitChainRotation())
+    assembly->splitChain()->nodes.splice(S1Begin, assembly->splitChain()->nodes,
+                                         S2Begin,
+                                         assembly->splitChain()->nodes.end());
+
+  switch (assembly->mergeOrder) {
+  case S2S1U:
+    // Splice U at the end of S.
+    assembly->splitChain()->nodes.splice(assembly->splitChain()->nodes.end(),
+                                         assembly->unsplitChain()->nodes);
+    break;
+  case S1US2:
+    // Splice U in the middle
+    assembly->splitChain()->nodes.splice(S2Begin,
+                                         assembly->unsplitChain()->nodes);
+    break;
+  case S2US1:
+    // Splice U in the middle
+    assembly->splitChain()->nodes.splice(S1Begin,
+                                         assembly->unsplitChain()->nodes);
+    break;
+  case US2S1:
+    // Splice S at the end of U
+    assembly->unsplitChain()->nodes.splice(
+        assembly->unsplitChain()->nodes.end(), assembly->splitChain()->nodes);
+    break;
+  default:
+    break;
+  }
+
+  if (propConfig.optReorderIP && !mergerChain->isSameCFG(*mergeeChain)) {
+    // Merge function transitions of mergerChain with those of mergeeChain.
+    mergerChain->functionTransitions.splice(
+        mergerChain->functionTransitions.end(),
+        mergeeChain->functionTransitions);
+
+    // Add the beginnings of slices if they now mark a function transition.
+    std::vector<std::list<CFGNode *>::iterator> funcTransitionCandids;
+    // UBegin should always be examined.
+    funcTransitionCandids.push_back(UBegin);
+    // If S2Begin was a function transition point before, we don't examine it
+    // again as it will remain in the transition points (We never remove entries
+    // from functionTransitions).
+    if (!S2FuncTransition)
+      funcTransitionCandids.push_back(S2Begin);
+    // S1Begin will only be examined if this is a splitting assembly. Otherwise,
+    // S1 is empty.
+    if (assembly->splits())
+      funcTransitionCandids.push_back(S1Begin);
+
+    // Check if any of the candidates mark a function transition position and
+    // add those to the function transition list of the mergerChain.
+    for (auto it : funcTransitionCandids)
+      if (it != mergerChain->nodes.begin() &&
+          (*std::prev(it))->controlFlowGraph != (*it)->controlFlowGraph)
+        mergerChain->functionTransitions.push_back(it);
+  }
+
+  // Set the starting and ending point for updating the nodes's chain and offset
+  // in the new chain. This helps to reduce the computation time.
+  auto chainBegin = mergerChain->nodes.begin();
+  uint64_t chainBeginOffset = 0;
+
+  if (assembly->mergeOrder == S1US2 || assembly->mergeOrder == US2S1) {
+    // We can skip the first slice (slices[0]) in these cases.
+    chainBegin = assembly->slices[1].beginPosition;
+    chainBeginOffset = assembly->slices[0].size();
+  }
+
+  if (!assembly->splits()) {
+    // We can skip the S chain in this case.
+    chainBegin = UBegin;
+    chainBeginOffset = assembly->splitChain()->size;
+  }
+
+  uint64_t runningOffset = chainBeginOffset;
+
+  // Update chainOffset and the chain pointer for all the nodes in the sequence.
+  for (auto it = chainBegin; it != mergerChain->nodes.end(); ++it) {
+    CFGNode *node = *it;
+    node->chain = mergerChain;
+    node->chainOffset = runningOffset;
+    runningOffset += node->shSize;
+  }
+
+  mergerChain->size += mergeeChain->size;
+  assert(mergerChain->size == runningOffset &&
+         "Mismatch of merger chain's size and running offset!");
+
+  // Update the total frequency the aggregated chain
+  mergerChain->freq += mergeeChain->freq;
+
+  // We have already computed the new score in the assembly record. So we can
+  // update the score based on that and the other chain's score.
+  mergerChain->score += mergeeChain->score + assembly->scoreGain;
+
+  mergerChain->debugChain |= mergeeChain->debugChain;
+
+  if (mergerChain->controlFlowGraph &&
+      mergerChain->controlFlowGraph != mergeeChain->controlFlowGraph)
+    mergerChain->controlFlowGraph = nullptr;
+
+  // Merge the assembly candidate chains of the two chains into the candidate
+  // chains of the remaining NodeChain and remove the records for the defunct
+  // NodeChain.
+  for (NodeChain *c : candidateChains[mergeeChain]) {
+    nodeChainAssemblies.erase(std::make_pair(c, mergeeChain));
+    nodeChainAssemblies.erase(std::make_pair(mergeeChain, c));
+    candidateChains[c].erase(mergeeChain);
+    if (c != mergerChain)
+      candidateChains[mergerChain].insert(c);
+  }
+
+  // Update the NodeChainAssembly for all candidate chains of the merged
+  // NodeChain. Remove a NodeChain from the merge chain's candidates if the
+  // NodeChainAssembly update finds no gain.
+  auto &mergerChainCandidateChains = candidateChains[mergerChain];
+
+  for (auto CI = mergerChainCandidateChains.begin(),
+            CE = mergerChainCandidateChains.end();
+       CI != CE;) {
+    NodeChain *otherChain = *CI;
+    auto &otherChainCandidateChains = candidateChains[otherChain];
+
+    bool x = updateNodeChainAssembly(otherChain, mergerChain);
+
+    if (!x)
+      nodeChainAssemblies.erase(std::make_pair(otherChain, mergerChain));
+
+    bool y = updateNodeChainAssembly(mergerChain, otherChain);
+
+    if (!y)
+      nodeChainAssemblies.erase(std::make_pair(mergerChain, otherChain));
+
+    if (x || y) {
+      otherChainCandidateChains.insert(mergerChain);
+      CI++;
+    } else {
+      otherChainCandidateChains.erase(mergerChain);
+      CI = mergerChainCandidateChains.erase(CI);
+    }
+  }
+
+  // remove all the candidate chain records for the merged-in chain
+  candidateChains.erase(mergeeChain);
+
+  // Finally, remove the defunct (merged-in) chain record from the chains
+  chains.erase(mergeeChain->delegateNode->mappedAddr);
+}
+
+// Calculate the Extended TSP metric for a bb chain.
+// This function goes over all the BBs in the chain and for bb chain and
+// aggregates the score of all edges which are contained in the same chain.
+uint64_t NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const {
+  uint64_t score = 0;
+
+  auto visit = [&score](CFGEdge &edge, NodeChain *srcChain,
+                        NodeChain *sinkChain) {
+    assert(srcChain == sinkChain);
+    auto srcOffset = edge.src->chainOffset;
+    auto sinkOffset = edge.sink->chainOffset;
+    bool edgeForward = srcOffset + edge.src->shSize <= sinkOffset;
+    // Calculate the distance between src and sink
+    uint64_t distance = edgeForward ? sinkOffset - srcOffset - edge.src->shSize
+                                    : srcOffset - sinkOffset + edge.src->shSize;
+    score += getEdgeExtTSPScore(edge, edgeForward, distance);
+  };
+
+  chain->forEachOutEdgeToChain(chain, visit);
+
+  return score;
+}
+
+// Updates the best NodeChainAssembly between two NodeChains. The existing
+// record will be replaced by the new NodeChainAssembly if a non-zero gain
+// is achieved. Otherwise, it will be removed.
+// If a nodechain assembly record is (kept) inserted, returns true. Otherwise
+// returns false.
+bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain,
+                                               NodeChain *unsplitChain) {
+  // Only consider splitting the chain if the size of the chain is smaller than
+  // a threshold.
+  bool doSplit = (splitChain->size <= propConfig.optChainSplitThreshold);
+  // If we are not splitting, we only consider the slice position at the
+  // beginning of the chain (effectively no splitting).
+  auto slicePosEnd =
+      doSplit ? splitChain->nodes.end() : std::next(splitChain->nodes.begin());
+
+  std::unique_ptr<NodeChainAssembly> bestAssembly(nullptr);
+
+  // This function creates the different nodeChainAssemblies for splitChain and
+  // unsplitChain, given a slice position in splitChain, and updates the best
+  // assembly if required.
+  auto updateBestAssembly = [this, splitChain, unsplitChain, &bestAssembly](
+                                std::list<CFGNode *>::iterator slicePos) {
+    // If the split position is at the beginning (no splitting), only consider
+    // one MergeOrder
+    auto mergeOrderEnd = (slicePos == splitChain->nodes.begin())
+                             ? MergeOrder::BeginNext
+                             : MergeOrder::End;
+    for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) {
+      MergeOrder mOrder = static_cast<MergeOrder>(MI);
+
+      // Create the NodeChainAssembly representing this particular assembly.
+      auto NCA = std::unique_ptr<NodeChainAssembly>(
+          new NodeChainAssembly(splitChain, unsplitChain, slicePos, mOrder));
+
+      // Update bestAssembly if needed
+      if (NCA->isValid() &&
+          (!bestAssembly || nodeChainAssemblyComparator(bestAssembly, NCA)))
+        bestAssembly = std::move(NCA);
+    }
+  };
+
+  for (auto slicePos = splitChain->nodes.begin(); slicePos != slicePosEnd;
+       ++slicePos) {
+    // Do not split the mutually-forced edges in the chain.
+    if (slicePos != splitChain->nodes.begin() &&
+        mutuallyForcedOut.count(*std::prev(slicePos)))
+      continue;
+    updateBestAssembly(slicePos);
+  }
+
+  if (propConfig.optReorderIP && !doSplit) {
+    // For inter-procedural layout, we always try splitting at the function
+    // transition positions regardless of the splitChain's size.
+    for (auto transit = splitChain->functionTransitions.begin();
+         transit != splitChain->functionTransitions.end();) {
+      auto slicePos = *transit;
+      if (slicePos == splitChain->nodes.begin() ||
+          (*std::prev(slicePos))->controlFlowGraph ==
+              (*slicePos)->controlFlowGraph) {
+        transit = splitChain->functionTransitions.erase(transit);
+        continue;
+      }
+      updateBestAssembly(slicePos);
+      transit++;
+    }
+  }
+
+  // Insert the best assembly of the two chains.
+  if (bestAssembly) {
+    if (splitChain->debugChain || unsplitChain->debugChain)
+      fprintf(stderr, "INSERTING ASSEMBLY: %s\n",
+              toString(*bestAssembly).c_str());
+
+    nodeChainAssemblies.insert(bestAssembly->chainPair,
+                               std::move(bestAssembly));
+    return true;
+  } else
+    return false;
+}
+
+void NodeChainBuilder::initNodeChains(ControlFlowGraph &cfg) {
+  for (auto &node : cfg.nodes) {
+    NodeChain *chain = new NodeChain(node.get());
+    node->chain = chain;
+    node->chainOffset = 0;
+    chains.try_emplace(node->mappedAddr, chain);
+  }
+}
+
+// Find all the mutually-forced edges.
+// These are all the edges which are -- based on the profile -- the only
+// (executed) outgoing edge from their source node and the only (executed)
+// incoming edges to their sink nodes
+void NodeChainBuilder::initMutuallyForcedEdges(ControlFlowGraph &cfg) {
+  DenseMap<CFGNode *, CFGNode *> mutuallyForcedOut;
+
+  DenseMap<CFGNode *, std::vector<CFGEdge *>> profiledOuts;
+  DenseMap<CFGNode *, std::vector<CFGEdge *>> profiledIns;
+
+  for (auto &node : cfg.nodes) {
+    std::copy_if(node->outs.begin(), node->outs.end(),
+                 std::back_inserter(profiledOuts[node.get()]),
+                 [](CFGEdge *edge) {
+                   return (edge->type == CFGEdge::EdgeType::INTRA_FUNC ||
+                           edge->type == CFGEdge::EdgeType::INTRA_DYNA) &&
+                          edge->weight != 0;
+                 });
+    std::copy_if(node->ins.begin(), node->ins.end(),
+                 std::back_inserter(profiledIns[node.get()]),
+                 [](CFGEdge *edge) {
+                   return (edge->type == CFGEdge::EdgeType::INTRA_FUNC ||
+                           edge->type == CFGEdge::EdgeType::INTRA_DYNA) &&
+                          edge->weight != 0;
+                 });
+  }
+
+  for (auto &node : cfg.nodes) {
+    if (profiledOuts[node.get()].size() == 1) {
+      CFGEdge *edge = profiledOuts[node.get()].front();
+      if (profiledIns[edge->sink].size() == 1)
+        mutuallyForcedOut.try_emplace(node.get(), edge->sink);
+    }
+  }
+
+  // Break cycles in the mutually forced edges by cutting the edge sinking to
+  // the smallest address in every cycle (hopefully a loop backedge)
+  DenseMap<CFGNode *, unsigned> nodeToPathMap;
+  SmallVector<CFGNode *, 16> cycleCutNodes;
+  unsigned pathCount = 0;
+  for (auto it = mutuallyForcedOut.begin(); it != mutuallyForcedOut.end();
+       ++it) {
+    // Check to see if the node (and its cycle) have already been visited.
+    if (nodeToPathMap[it->first])
+      continue;
+    CFGEdge *victimEdge = nullptr;
+    auto nodeIt = it;
+    pathCount++;
+    while (nodeIt != mutuallyForcedOut.end()) {
+      CFGNode *node = nodeIt->first;
+      unsigned path = nodeToPathMap[node];
+      if (path != 0) {
+        // If this node is marked with a number, either it is the same number,
+        // in which case we have found a cycle. Or it is a different number,
+        // which means we have found a path to a previously visited path
+        // (non-cycle).
+        if (path == pathCount) {
+          // We have found a cycle: add the victim edge
+          cycleCutNodes.push_back(victimEdge->src);
+        }
+        break;
+      } else
+        nodeToPathMap[node] = pathCount;
+      if (!profiledOuts[node].empty()) {
+        CFGEdge *edge = profiledOuts[node].front();
+        if (!victimEdge ||
+            (edge->sink->mappedAddr < victimEdge->sink->mappedAddr)) {
+          victimEdge = edge;
+        }
+      }
+      nodeIt = mutuallyForcedOut.find(nodeIt->second);
+    }
+  }
+
+  // Remove the victim edges to break cycles in the mutually forced edges
+  for (CFGNode *node : cycleCutNodes)
+    mutuallyForcedOut.erase(node);
+
+  for (auto &elem : mutuallyForcedOut)
+    mutuallyForcedOut.insert(elem);
+}
+
+// This function initializes the ExtTSP algorithm's data structures;
+// the nodeChainAssemblies and the candidateChains maps.
+void NodeChainBuilder::initializeExtTSP() {
+  // For each chain, compute its ExtTSP score, add its chain assembly records
+  // and its merge candidate chain.
+  candidateChains.clear();
+  for (NodeChain *chain : components[currentComponent])
+    chain->score = chain->freq ? computeExtTSPScore(chain) : 0;
+
+  DenseSet<std::pair<NodeChain *, NodeChain *>> visited;
+
+  for (NodeChain *chain : components[currentComponent]) {
+    auto &thisCandidateChains = candidateChains[chain];
+    for (auto &chainEdge : chain->outEdges) {
+      NodeChain *otherChain = chainEdge.first;
+      if (chain == otherChain)
+        continue;
+      std::pair<NodeChain *, NodeChain *> chainPair1(chain, otherChain),
+          chainPair2(otherChain, chain);
+      auto p = std::less<std::pair<NodeChain *, NodeChain *>>()(chainPair1,
+                                                                chainPair2)
+                   ? chainPair1
+                   : chainPair2;
+      if (!visited.insert(p).second)
+        continue;
+
+      bool x = updateNodeChainAssembly(chain, otherChain);
+      bool y = updateNodeChainAssembly(otherChain, chain);
+
+      if (x || y) {
+        thisCandidateChains.insert(otherChain);
+        candidateChains[otherChain].insert(chain);
+      }
+    }
+  }
+}
+
+void NodeChainBuilder::initializeChainComponents() {
+
+  DenseMap<NodeChain *, unsigned> chainToComponentMap;
+  unsigned componentId = 0;
+  for (DenseMapPair<uint64_t, std::unique_ptr<NodeChain>> &elem : chains) {
+    NodeChain *chain = elem.second.get();
+    // Cold chains will not be placed in any component.
+    if (!chain->freq)
+      continue;
+    // Skip if this chain has already been assigned to a component.
+    if (!chainToComponentMap.try_emplace(chain, componentId).second)
+      continue;
+    // Start creating the component containing this chain and its connected
+    // chains.
+    std::vector<NodeChain *> toVisit(1, chain);
+    unsigned index = 0;
+
+    // Find the connected component using BFS.
+    while (index != toVisit.size()) {
+      auto *tchain = toVisit[index++];
+      for (auto *c : tchain->inEdges) {
+        if (chainToComponentMap.try_emplace(c, componentId).second)
+          toVisit.push_back(c);
+      }
+      for (auto &e : tchain->outEdges) {
+        if (chainToComponentMap.try_emplace(e.first, componentId).second)
+          toVisit.push_back(e.first);
+      }
+    }
+    components.push_back(std::move(toVisit));
+    componentId++;
+  }
+}
+
+void NodeChainBuilder::mergeAllChains() {
+  // Attach the mutually-foced edges (which will not be split anymore by the
+  // Extended TSP algorithm).
+  for (auto &elem : mutuallyForcedOut)
+    attachNodes(elem.first, elem.second);
+
+  // Set up the outgoing edges for every chain
+  for (auto &elem : chains) {
+    auto *chain = elem.second.get();
+    // Ignore cold chains as they cannot have any hot edges to other nodes
+    if (!chain->freq)
+      continue;
+    auto addEdge = [chain](CFGEdge &edge) {
+      // Ignore returns and zero-frequency edges as these edges will not be used
+      // by the ExtTSP score algorithm.
+      if (!edge.weight || edge.isReturn())
+        return;
+      auto *sinkNodeChain = edge.sink->chain;
+      chain->outEdges[sinkNodeChain].push_back(&edge);
+      sinkNodeChain->inEdges.insert(chain);
+    };
+
+    if (propConfig.optReorderIP) {
+      for (CFGNode *node : chain->nodes)
+        node->forEachOutEdgeRef(addEdge);
+    } else {
+      for (CFGNode *node : chain->nodes)
+        node->forEachIntraOutEdgeRef(addEdge);
+    }
+  }
+
+  initializeChainComponents();
+
+  for (currentComponent = 0; currentComponent < components.size();
+       ++currentComponent) {
+    if (propConfig.optPrintStats)
+      fprintf(stderr, "COMPONENT: %u -> SIZE: %zu\n", currentComponent,
+              components[currentComponent].size());
+    // Initialize the Extended TSP algorithm's data.
+    initializeExtTSP();
+
+    // Keep merging the chain assembly record with the highest ExtTSP gain,
+    // until no more gain is possible.
+    while (!nodeChainAssemblies.empty()) {
+      auto bestAssembly = nodeChainAssemblies.top();
+      nodeChainAssemblies.pop();
+      if (bestAssembly->splitChain()->debugChain ||
+          bestAssembly->unsplitChain()->debugChain)
+        fprintf(stderr, "MERGING for %s\n",
+                toString(*bestAssembly.get()).c_str());
+      mergeChains(std::move(bestAssembly));
+    }
+  }
+}
+
+void NodeChainBuilder::doOrder(std::unique_ptr<ChainClustering> &clustering) {
+  init();
+
+  mergeAllChains();
+
+  // Merge fallthrough basic blocks if we have missed any
+  attachFallThroughs();
+
+  if (!propConfig.optReorderIP) {
+    // If this is not inter-procedural, coalesce all hot chains into a single
+    // hot chain and all cold chains into a single cold chain.
+    coalesceChains();
+    assert(cfgs.size() == 1 && chains.size() <= 2);
+
+#ifdef PROPELLER_PROTOBUF
+    ControlFlowGraph *cfg = cfgs.back();
+    if (prop->protobufPrinter) {
+      std::list<CFGNode *> nodeOrder;
+      for (auto &elem : chains) {
+        auto *chain = elem.second.get();
+        nodeOrder.insert(chain->freq ? nodeOrder.begin() : nodeOrder.end(),
+                         chain->nodes.begin(), chain->nodes.end());
+      }
+      prop->protobufPrinter->addCFG(*cfg, &nodeOrder);
+    }
+#endif
+  }
+
+  // Hand in the built chains to the chain clustering algorithm
+  for (auto &elem : chains)
+    clustering->addChain(std::move(elem.second));
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.h
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.h
@@ -0,0 +1,143 @@
+//===- PropellerChainClustering.h  ----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H
+#define LLD_ELF_PROPELLER_CHAIN_CLUSTERING_H
+
+#include "NodeChain.h"
+
+#include "llvm/ADT/DenseMap.h"
+
+#include <vector>
+
+using llvm::DenseMap;
+
+namespace lld {
+namespace propeller {
+
+// This is the base class for clustering chains. It provides the basic data
+// structures and functions.
+class ChainClustering {
+public:
+  class Cluster {
+  public:
+    // Constructor for building a cluster from a single chain.
+    Cluster(NodeChain *);
+
+    // The chains in this cluster in the merged order.
+    std::vector<NodeChain *> chains;
+
+    // The representative chain for this cluster.
+    NodeChain *delegateChain;
+
+    // Total size of the cluster.
+    uint64_t size;
+
+    // Total frequency of the cluster.
+    uint64_t freq;
+
+    // This function merges this cluster with another cluster
+    Cluster &mergeWith(Cluster &other) {
+      chains.insert(chains.end(), other.chains.begin(), other.chains.end());
+      this->freq += other.freq;
+      this->size += other.size;
+      return *this;
+    }
+
+    // Returns the execution density of this cluster
+    double getDensity() { return ((double)freq / size); }
+  };
+
+  // This function merges two clusters in the given order, and updates
+  // chainToClusterMap for the chains in the mergee cluster and also removes the
+  // mergee cluster from the clusters.
+  void mergeTwoClusters(Cluster *predecessorCluster, Cluster *cluster) {
+    // Join the two clusters into predecessorCluster.
+    predecessorCluster->mergeWith(*cluster);
+
+    // Update chain to cluster mapping, because all chains that were
+    // previsously in cluster are now in predecessorCluster.
+    for (NodeChain *c : cluster->chains) {
+      chainToClusterMap[c] = predecessorCluster;
+    }
+
+    // Delete the defunct cluster
+    clusters.erase(cluster->delegateChain->delegateNode->mappedAddr);
+  }
+
+  // Adds a chain to the input of the clustering process
+  void addChain(std::unique_ptr<NodeChain> &&chain_ptr);
+
+  // Any subclass should override this function.
+  virtual void doOrder(std::vector<CFGNode *> &hotOrder,
+                       std::vector<CFGNode *> &coldOrder);
+
+  virtual ~ChainClustering() = default;
+
+protected:
+  // Optional function for merging clusters, which could be overriden by
+  // subclasses.
+  virtual void mergeClusters(){};
+
+  // This function sorts the final clusters in decreasing order of their
+  // execution density.
+  void sortClusters(std::vector<Cluster *> &);
+
+  // This function initializes clusters with each cluster including a single
+  // chain.
+  void initClusters() {
+    for (auto &c_ptr : hotChains) {
+      NodeChain *chain = c_ptr.get();
+      Cluster *cl = new Cluster(chain);
+      cl->freq = chain->freq;
+      cl->size = std::max(chain->size, (uint64_t)1);
+      chainToClusterMap[chain] = cl;
+      clusters.try_emplace(cl->delegateChain->delegateNode->mappedAddr, cl);
+    }
+  }
+
+  // chains processed by the clustering algorithm separated into hot and cold
+  // ones based on their frequency.
+  std::vector<std::unique_ptr<NodeChain>> hotChains, coldChains;
+
+  // All clusters currently in process.
+  DenseMap<uint64_t, std::unique_ptr<Cluster>> clusters;
+
+  // This maps every chain to its containing cluster.
+  DenseMap<NodeChain *, Cluster *> chainToClusterMap;
+};
+
+// This class computes an ordering for the input chains which conforms to the
+// initial ordering of nodes in the binary.
+class NoOrdering : public ChainClustering {
+public:
+  void doOrder(std::vector<CFGNode *> &hotOrder,
+               std::vector<CFGNode *> &coldOrder);
+};
+
+// This class implements the call-chain-clustering algorithm.
+class CallChainClustering : public ChainClustering {
+private:
+  Cluster *getMostLikelyPredecessor(NodeChain *chain, Cluster *cluster);
+  void mergeClusters();
+};
+
+} // namespace propeller
+} // namespace lld
+
+namespace std {
+template <> struct less<lld::propeller::ChainClustering::Cluster *> {
+  bool operator()(const lld::propeller::ChainClustering::Cluster *c1,
+                  const lld::propeller::ChainClustering::Cluster *c2) const {
+    return less<lld::propeller::NodeChain *>()(c1->delegateChain,
+                                               c2->delegateChain);
+  }
+};
+
+} // namespace std
+
+#endif
Index: lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp
===================================================================
--- /dev/null
+++ lld/ELF/Propeller/CodeLayout/NodeChainClustering.cpp
@@ -0,0 +1,233 @@
+//===- PropellerChainClustering.cpp  --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and includes the implementation of the Call-chain-Clustering
+// algorithm as described in [1].
+//
+// The algorithm iteratively merges chains into clusters. To do so, it processes
+// the basic block chains in decreasing order of
+// their execution density and for each chain, merges its cluster with its
+// most-frequent predecessor cluster. The merging of a cluster stops when it
+// reaches the -propeller-cluster-merge-size-threshold (set to 2MB by default).
+//
+//   [1] Guilherme Ottoni and Bertrand Maher. Optimizing Function Placement for
+//   Large-Scale Data-Center Applications. Available at
+//   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+//===----------------------------------------------------------------------===//
+#include "NodeChainClustering.h"
+#include "PropellerConfig.h"
+
+using llvm::detail::DenseMapPair;
+
+namespace lld {
+namespace propeller {
+
+void ChainClustering::addChain(std::unique_ptr<NodeChain> &&chain_ptr) {
+  for (CFGNode *n : chain_ptr->nodes)
+    n->chain = chain_ptr.get();
+  auto &chainList = ((propConfig.optReorderIP || propConfig.optSplitFuncs ||
+                      propConfig.optReorderFuncs) &&
+                     chain_ptr->freq == 0)
+                        ? coldChains
+                        : hotChains;
+  chainList.push_back(std::move(chain_ptr));
+}
+
+// Initializes a cluster containing a single chain an associates it with a
+// unique id.
+ChainClustering::Cluster::Cluster(NodeChain *chain)
+    : chains(1, chain), delegateChain(chain) {}
+
+// Returns the most frequent predecessor of a chain. This function also gets as
+// the second parameter the cluster containing the chain to save a lookup
+// into chainToClusterMap.
+ChainClustering::Cluster *
+CallChainClustering::getMostLikelyPredecessor(NodeChain *chain,
+                                              Cluster *cluster) {
+  // This map stores the total weight of the incoming edges to the given cluster
+  // from every other cluster.
+  DenseMap<Cluster *, uint64_t> clusterEdge;
+
+  for (CFGNode *n : chain->nodes) {
+    // For non-inter-procedural, we only consider the function entries.
+    if (!propConfig.optReorderIP && !n->isEntryNode())
+      continue;
+    auto visit = [&clusterEdge, n, chain, cluster, this](CFGEdge &edge) {
+      if (!edge.weight || edge.isReturn())
+        return;
+      if (!propConfig.optReorderIP && !edge.isCall())
+        return;
+      auto *callerChain = edge.src->chain;
+      if (!callerChain) {
+        warn("Caller for node: " + n->shName + " does not have a chain!");
+        return;
+      }
+      auto *callerCluster = chainToClusterMap[callerChain];
+      if (callerChain == chain || callerCluster == cluster)
+        return;
+      // Ignore clusters which are too big
+      if (callerCluster->size > propConfig.optClusterMergeSizeThreshold)
+        return;
+      // Ignore edges which are cold relative to the sink node
+      if (edge.weight * 10 < n->freq)
+        return;
+      // Do not merge if the caller cluster's density would degrade by more than
+      // 1/8 by the merge.
+      if (8 * callerCluster->size * (cluster->freq * callerCluster->freq) <
+          callerCluster->freq * (cluster->size + callerCluster->size))
+        return;
+      clusterEdge[callerCluster] += edge.weight;
+    };
+    n->forEachInEdgeRef(visit);
+  }
+
+  // Get the highest-frequency caller
+  auto bestCaller =
+      std::max_element(clusterEdge.begin(), clusterEdge.end(),
+                       [](const DenseMapPair<Cluster *, uint64_t> &p1,
+                          const DenseMapPair<Cluster *, uint64_t> &p2) {
+                         if (p1.second == p2.second) // Consistently break ties
+                           return std::less<Cluster *>()(p1.first, p2.first);
+                         return p1.second < p2.second;
+                       });
+
+  if (bestCaller == clusterEdge.end())
+    return nullptr;
+  return bestCaller->first;
+}
+
+void ChainClustering::sortClusters(std::vector<Cluster *> &clusterOrder) {
+  for (auto &p : clusters)
+    clusterOrder.push_back(p.second.get());
+
+  auto clusterComparator = [](Cluster *c1, Cluster *c2) -> bool {
+    // Set a deterministic order when execution densities are equal.
+    if (c1->getDensity() == c2->getDensity())
+      return c1->delegateChain->delegateNode->mappedAddr <
+             c2->delegateChain->delegateNode->mappedAddr;
+    return c1->getDensity() > c2->getDensity();
+  };
+
+  std::sort(clusterOrder.begin(), clusterOrder.end(), clusterComparator);
+}
+
+void NoOrdering::doOrder(std::vector<CFGNode *> &hotOrder,
+                         std::vector<CFGNode *> &coldOrder) {
+  auto chainComparator = [](const std::unique_ptr<NodeChain> &c_ptr1,
+                            const std::unique_ptr<NodeChain> &c_ptr2) -> bool {
+    return c_ptr1->delegateNode->mappedAddr < c_ptr2->delegateNode->mappedAddr;
+  };
+
+  std::sort(hotChains.begin(), hotChains.end(), chainComparator);
+  std::sort(coldChains.begin(), coldChains.end(), chainComparator);
+
+  for (auto &c_ptr : hotChains)
+    for (CFGNode *n : c_ptr->nodes)
+      hotOrder.push_back(n);
+
+  for (auto &c_ptr : coldChains)
+    for (CFGNode *n : c_ptr->nodes)
+      coldOrder.push_back(n);
+}
+
+// Merge clusters together based on the CallChainClustering algorithm.
+void CallChainClustering::mergeClusters() {
+  // Build a map for the execution density of each chain.
+  DenseMap<NodeChain *, double> chainWeightMap;
+
+  for (auto &c_ptr : hotChains) {
+    NodeChain *chain = c_ptr.get();
+    chainWeightMap.try_emplace(chain, chain->execDensity());
+  }
+
+  // Sort the hot chains in decreasing order of their execution density.
+  std::sort(hotChains.begin(), hotChains.end(),
+            [&chainWeightMap](const std::unique_ptr<NodeChain> &c_ptr1,
+                              const std::unique_ptr<NodeChain> &c_ptr2) {
+              auto chain1Weight = chainWeightMap[c_ptr1.get()];
+              auto chain2Weight = chainWeightMap[c_ptr2.get()];
+              if (chain1Weight == chain2Weight)
+                return c_ptr1->delegateNode->mappedAddr <
+                       c_ptr2->delegateNode->mappedAddr;
+              return chain1Weight > chain2Weight;
+            });
+
+  for (auto &c_ptr : hotChains) {
+    NodeChain *chain = c_ptr.get();
+    if (chainWeightMap[chain] <= 0.005)
+      break;
+    auto *cluster = chainToClusterMap[chain];
+    // Ignore merging if the cluster containing this function is bigger than
+    // 2MBs (size of a large page).
+    if (cluster->size > propConfig.optClusterMergeSizeThreshold)
+      continue;
+    assert(cluster);
+
+    Cluster *predecessorCluster = getMostLikelyPredecessor(chain, cluster);
+    if (!predecessorCluster)
+      continue;
+
+    mergeTwoClusters(predecessorCluster, cluster);
+  }
+}
+
+// This function orders the hot chains based on mergeClusters and sortClusters
+// and uses a consistent ordering for the cold part.
+void ChainClustering::doOrder(std::vector<CFGNode *> &hotOrder,
+                              std::vector<CFGNode *> &coldOrder) {
+  initClusters();
+  mergeClusters();
+  std::vector<Cluster *> clusterOrder;
+  sortClusters(clusterOrder);
+  // This maps every controlFlowGraph to the position of its first hot node in
+  // the layout.
+  DenseMap<ControlFlowGraph *, size_t> hotCFGOrder;
+
+  for (Cluster *cl : clusterOrder)
+    for (NodeChain *c : cl->chains)
+      for (CFGNode *n : c->nodes) {
+        hotCFGOrder.try_emplace(n->controlFlowGraph, hotOrder.size());
+        hotOrder.push_back(n);
+      }
+
+  auto coldChainComparator =
+      [&hotCFGOrder](const std::unique_ptr<NodeChain> &c_ptr1,
+                     const std::unique_ptr<NodeChain> &c_ptr2) -> bool {
+    // If each of the chains comes from a single controlFlowGraph, we can order
+    // the chains consistently based on the hot layout
+    if (c_ptr1->controlFlowGraph && c_ptr2->controlFlowGraph) {
+      // If one controlFlowGraph is cold and the other is hot, make sure the hot
+      // controlFlowGraph's chain comes earlier in the order.
+      if (c_ptr1->controlFlowGraph->isHot() !=
+          c_ptr2->controlFlowGraph->isHot())
+        return c_ptr1->controlFlowGraph->isHot();
+      // If both cfgs are hot, make the order for cold chains consistent with
+      // the order for hot ones.
+      if (c_ptr1->controlFlowGraph->isHot() &&
+          c_ptr2->controlFlowGraph->isHot() &&
+          (c_ptr1->controlFlowGraph != c_ptr2->controlFlowGraph))
+        return hotCFGOrder[c_ptr1->controlFlowGraph] <
+               hotCFGOrder[c_ptr2->controlFlowGraph];
+      // Otherwise, use a consistent order based on the representative nodes.
+      return c_ptr1->delegateNode->mappedAddr <
+             c_ptr2->delegateNode->mappedAddr;
+    }
+    // Use an order consistent with the initial ordering of the representative
+    // nodes.
+    return c_ptr1->delegateNode->mappedAddr < c_ptr2->delegateNode->mappedAddr;
+  };
+
+  std::sort(coldChains.begin(), coldChains.end(), coldChainComparator);
+
+  for (auto &c_ptr : coldChains)
+    for (CFGNode *n : c_ptr->nodes)
+      coldOrder.push_back(n);
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/test/ELF/propeller/codelayout/function-ordering.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/function-ordering.s
@@ -0,0 +1,78 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises function reordering on three functions.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld  %t.o -o %t.out
+
+# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	0000000000201120 0000000000000008 t foo
+# BEFORE-NEXT:	0000000000201128 0000000000000008 t bar
+# BEFORE-NEXT:	0000000000201130 0000000000000008 t baz
+
+## Create a propeller profile based on the following inter-procedural calls.
+##
+##       100        100
+##  baz -----> bar -----> foo
+##
+
+# RUN: echo "!foo" > %t_prof.propeller
+# RUN: echo "!bar" >> %t_prof.propeller
+# RUN: echo "!baz" >> %t_prof.propeller
+# RUN: echo "Symbols" >> %t_prof.propeller
+# RUN: echo "1 8 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 8 Nbar" >> %t_prof.propeller
+# RUN: echo "3 8 Nbaz" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "3 2 100 C" >> %t_prof.propeller
+# RUN: echo "2 1 100 C" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+
+## Link with the propeller profile and expect the functions to be reordered.
+#
+# RUN: ld.lld  %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out
+# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER:	0000000000201120 0000000000000008 t baz
+# REORDER-NEXT:	0000000000201128 0000000000000008 t bar
+# REORDER-NEXT:	0000000000201130 0000000000000008 t foo
+
+## Disable function reordering and expect that the original function order is retained.
+#
+# RUN: ld.lld  %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out
+# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER
+
+# NOREORDER:		0000000000201120 0000000000000008 t foo
+# NOREORDER-NEXT:	0000000000201128 0000000000000008 t bar
+# NOREORDER-NEXT:	0000000000201130 0000000000000008 t baz
+
+.section	.text,"ax",@progbits,unique,1
+# -- Begin function foo
+.type	foo,@function
+foo:
+ .quad 0x11
+
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
+# -- End function foo
+
+.section	.text,"ax",@progbits,unique,2
+# -- Begin function bar
+.type	bar,@function
+bar:
+ .quad 0x22
+
+.Lbar_end:
+ .size	bar, .Lbar_end-bar
+# -- End function bar
+
+.section	.text,"ax",@progbits,unique,3
+# -- Begin function baz
+.type	baz,@function
+baz:
+ .quad 0x33
+
+.Lbaz_end:
+ .size	baz, .Lbaz_end-baz
+# -- End function baz
Index: lld/test/ELF/propeller/codelayout/function-with-loop.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/function-with-loop.s
@@ -0,0 +1,134 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises basic block reordering on a single function with a simple loop.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld  %t.o -optimize-bb-jumps -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	0000000000201120 foo:
+# BEFORE-NEXT:	nopl    (%rax)
+
+# BEFORE:	0000000000201123 a.BB.foo:
+# BEFORE-NEXT:	nopl	(%rax)
+# BEFORE-NEXT:	je      3 <aaa.BB.foo>
+
+# BEFORE:	0000000000201128 aa.BB.foo:
+# BEFORE-NEXT:	nopl    (%rax)
+
+# BEFORE:	000000000020112b aaa.BB.foo:
+# BEFORE-NEXT:	nopl    (%rax)
+# BEFORE-NEXT:	jne     -13 <a.BB.foo>
+
+# BEFORE:	0000000000201130 aaaa.BB.foo:
+# BEFORE-NEXT:	nopl    (%rax)
+# BEFORE-NEXT:	retq
+
+
+## Create a propeller profile for foo, based on the cfg below:
+##
+##             foo
+##              |
+##              |5
+##              V
+##      +--> a.BB.foo <--+
+##      |      / \       |
+##     0|    0/   \95    |90
+##      |    /     \     |
+##      |   v       \    |
+##  aa.BB.foo        v   |
+##      \        aaa.BB.foo
+##       \           /
+##        \         /
+##        0\       /10
+##          \     /
+##           v   v
+##         aaaa.BB.foo
+##
+## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo
+## as a fallthrough chain in the layout.
+
+# RUN: echo "!foo" > %t_prof.propeller
+# RUN: echo "!!1" >> %t_prof.propeller
+# RUN: echo "!!3" >> %t_prof.propeller
+# RUN: echo "!!4" >> %t_prof.propeller
+# RUN: echo "Symbols" >> %t_prof.propeller
+# RUN: echo "1 14 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 5 1.1" >> %t_prof.propeller
+# RUN: echo "3 3 1.2" >> %t_prof.propeller
+# RUN: echo "4 5 1.3" >> %t_prof.propeller
+# RUN: echo "5 4 1.4" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "4 2 90" >> %t_prof.propeller
+# RUN: echo "2 4 95" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "4 5 10" >> %t_prof.propeller
+# RUN: echo "1 2 5" >> %t_prof.propeller
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out
+# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER:	0000000000201120 foo:
+# REORDER-NEXT:	nopl	(%rax)
+
+# REORDER:	0000000000201123 a.BB.foo:
+# REORDER-NEXT:	nopl	(%rax)
+# REORDER-NEXT:	jne      9 <aa.BB.foo>
+
+# REORDER:	0000000000201128 aaa.BB.foo:
+# REORDER-NEXT:	nopl    (%rax)
+# REORDER-NEXT:	jne     -10 <a.BB.foo>
+
+# REORDER:	000000000020112d aaaa.BB.foo:
+# REORDER-NEXT:	nopl    (%rax)
+# REORDER-NEXT:	retq
+
+# REORDER:	0000000000201131 aa.BB.foo:
+# REORDER-NEXT:	nopl    (%rax)
+# REORDER-NEXT:	jmp	-14 <aaa.BB.foo>
+
+## Disable basic block reordering and expect that the original bb order is retained.
+#
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out
+# RUN: diff %t.propeller.noreorder.out %t.out
+
+.section	.text,"ax",@progbits
+# -- Begin function foo
+.type	foo,@function
+foo:
+ nopl (%rax)
+ jmp	a.BB.foo
+
+.section	.text,"ax",@progbits,unique,1
+a.BB.foo:
+ nopl (%rax)
+ je	aaa.BB.foo
+ jmp	aa.BB.foo
+.La.BB.foo_end:
+ .size	a.BB.foo, .La.BB.foo_end-a.BB.foo
+
+.section	.text,"ax",@progbits,unique,2
+aa.BB.foo:
+ nopl (%rax)
+ jmp	aaa.BB.foo
+.Laa.BB.foo_end:
+ .size	aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo
+
+.section	.text,"ax",@progbits,unique,3
+aaa.BB.foo:
+ nopl (%rax)
+ jne	a.BB.foo
+ jmp	aaaa.BB.foo
+.Laaa.BB.foo_end:
+ .size	aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo
+
+.section	.text,"ax",@progbits,unique,4
+aaaa.BB.foo:
+ nopl (%rax)
+ ret
+.Laaaa.BB.foo_end:
+ .size	aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo
+
+.section	.text,"ax",@progbits
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
Index: lld/test/ELF/propeller/codelayout/opt-all-combinations.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/opt-all-combinations.s
@@ -0,0 +1,301 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises all combinations of propeller options (reorder-funcs, reorder-blocks,
+## and split-funcs) on four functions.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out
+# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	Disassembly of section .text:
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	foo:
+# BEFORE-NEXT:	xorb	%al, 0
+# BEFORE-NEXT:	int3
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	bar:
+# BEFORE-NEXT:	xorb	%al, 1
+# BEFORE-NEXT:	je	9 <aa.BB.bar>
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	a.BB.bar:
+# BEFORE-NEXT:	xorb	%al, 2
+# BEFORE-NEXT:	jmp      7 <aaa.BB.bar>
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	aa.BB.bar:
+# BEFORE-NEXT:	xorb	%al, 3
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	aaa.BB.bar:
+# BEFORE-NEXT:	xorb	%al, 4
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	baz:
+# BEFORE-NEXT:	xorb	%al, 5
+# BEFORE-NEXT:	int3
+# BEFORE-EMPTY:
+# BEFORE-NEXT:	qux:
+# BEFORE-NEXT:	xorb	%al, 6
+#
+
+## Create a propeller profile for four functions foo, bar, baz, and qux based on the cfg below:
+##
+##
+##                   bar
+##                  /   \
+##                 /     \100
+##                /       \
+##       100     v         v
+## foo <----- a.BB.bar  aa.BB.bar     baz     qux ---+
+##               \        /                    ^     |
+##                \      /100                  |   1 |
+##                 \    /                      +-----+
+##                  v  v
+##                aaa.BB.bar
+##
+
+# RUN: echo "!foo" > %t_prof.propeller
+# RUN: echo "!bar" >> %t_prof.propeller
+# RUN: echo "!!1" >> %t_prof.propeller
+# RUN: echo "!!2" >> %t_prof.propeller
+# RUN: echo "!!3" >> %t_prof.propeller
+# RUN: echo "!qux" >> %t_prof.propeller
+# RUN: echo "Symbols" >> %t_prof.propeller
+# RUN: echo "1 8 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 20 Nbar" >> %t_prof.propeller
+# RUN: echo "3 9 2.1" >> %t_prof.propeller
+# RUN: echo "4 7 2.2" >> %t_prof.propeller
+# RUN: echo "5 7 2.3" >> %t_prof.propeller
+# RUN: echo "6 8 Nbaz" >> %t_prof.propeller
+# RUN: echo "7 7 Nqux" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "2 4 100" >> %t_prof.propeller
+# RUN: echo "4 1 100 C" >> %t_prof.propeller
+# RUN: echo "7 7 10 C" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "4 5 100" >> %t_prof.propeller
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller --verbose -propeller-keep-named-symbols -o %t.propeller.reorder.out
+# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER:	Disassembly of section .text:
+# REORDER-EMPTY:
+# REORDER-NEXT:	bar:
+# REORDER-NEXT: xorb	%al, 1
+# REORDER-NEXT:	jne	30 <a.BB.bar>
+# REORDER-EMPTY:
+# REORDER-NEXT:	aa.BB.bar:
+# REORDER-NEXT:	xorb	%al, 3
+# REORDER-EMPTY:
+# REORDER-NEXT:	aaa.BB.bar:
+# REORDER-NEXT:	xorb	%al, 4
+# REORDER-NEXT:	int3
+# REORDER-EMPTY:
+# REORDER-NEXT:	foo:
+# REORDER-NEXT:	xorb	%al, 0
+# REORDER-NEXT:	int3
+# REORDER-EMPTY:
+# REORDER-NEXT:	qux:
+# REORDER-NEXT:	xorb	%al, 6
+# REORDER-EMPTY:
+# REORDER-NEXT:	a.BB.bar:
+# REORDER-NEXT:	xorb	%al, 2
+# REORDER-NEXT:	jmp	-32 <aaa.BB.bar>
+# REORDER-EMPTY:
+# REORDER-NEXT:	baz:
+# REORDER-NEXT:	xorb	%al, 5
+#
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorderblocks.out  2>&1 | FileCheck %s --check-prefixes=WARN,IMPLICITNOSPLIT
+# WARN-NOT:		warning:
+# IMPLICITNOSPLIT:	warning: propeller: no-reorder-blocks implicitly sets no-split-funcs.
+#
+# RUN: llvm-objdump -d %t.propeller.noreorderblocks.out| FileCheck %s --check-prefix=NO_REORDER_BB
+#
+# NO_REORDER_BB:		Disassembly of section .text:
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 1
+# NO_REORDER_BB-NEXT:	je	9 <aa.BB.bar>
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	a.BB.bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 2
+# NO_REORDER_BB-NEXT:	jmp	7 <aaa.BB.bar>
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	aa.BB.bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 3
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	aaa.BB.bar:
+# NO_REORDER_BB-NEXT:	xorb	%al, 4
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	foo:
+# NO_REORDER_BB-NEXT:	xorb	%al, 0
+# NO_REORDER_BB-NEXT:	int3
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	qux:
+# NO_REORDER_BB-NEXT:	xorb	%al, 6
+# NO_REORDER_BB-NEXT:	int3
+# NO_REORDER_BB-EMPTY:
+# NO_REORDER_BB-NEXT:	baz:
+# NO_REORDER_BB-NEXT:	xorb	%al, 5
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.noreorderfuncs.out
+# RUN: llvm-objdump -d %t.propeller.noreorderfuncs.out| FileCheck %s --check-prefix=NO_REORDER_FUNC
+#
+# NO_REORDER_FUNC:	Disassembly of section .text:
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	foo:
+# NO_REORDER_FUNC-NEXT:	xorb	%al, 0
+# NO_REORDER_FUNC-NEXT:	int3
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 1
+# NO_REORDER_FUNC-NEXT:	jne	22 <a.BB.bar>
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	aa.BB.bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 3
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	aaa.BB.bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 4
+# NO_REORDER_FUNC-NEXT:	int3
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	qux:
+# NO_REORDER_FUNC-NEXT:	xorb	%al, 6
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	a.BB.bar:
+# NO_REORDER_FUNC-NEXT: xorb	%al, 2
+# NO_REORDER_FUNC-NEXT:	jmp	-24 <aaa.BB.bar>
+# NO_REORDER_FUNC-EMPTY:
+# NO_REORDER_FUNC-NEXT:	baz:
+# NO_REORDER_FUNC-NEXT:	xorb	%al, 5
+#
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitfuncs.out
+# RUN: llvm-objdump -d %t.propeller.nosplitfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_FUNC
+#
+# NO_SPLIT_FUNC:	Disassembly of section .text:
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 1
+# NO_SPLIT_FUNC-NEXT:	jne	14 <a.BB.bar>
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	aa.BB.bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 3
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	aaa.BB.bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 4
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	a.BB.bar:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 2
+# NO_SPLIT_FUNC-NEXT:	jmp	-16 <aaa.BB.bar>
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	foo:
+# NO_SPLIT_FUNC-NEXT: 	xorb	%al, 0
+# NO_SPLIT_FUNC-NEXT:	int3
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	qux:
+# NO_SPLIT_FUNC-NEXT:	xorb	%al, 6
+# NO_SPLIT_FUNC-NEXT:	int3
+# NO_SPLIT_FUNC-EMPTY:
+# NO_SPLIT_FUNC-NEXT:	baz:
+# NO_SPLIT_FUNC-NEXT:	xorb	%al, 5
+#
+
+## Check that the combination of no-reorder-blocks and split-funcs makes lld fail.
+# RUN: not ld.lld  %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-opt=split-funcs 2>&1 -o /dev/null | FileCheck %s --check-prefix=FAIL
+# FAIL:		propeller: Inconsistent combination of propeller optimizations 'split-funcs' and 'no-reorder-blocks'.
+
+
+# RUN: ld.lld  %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitreorderfuncs.out
+# RUN: llvm-objdump -d %t.propeller.nosplitreorderfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_REORDER_FUNC
+#
+# NO_SPLIT_REORDER_FUNC:	Disassembly of section .text:
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	foo:
+# NO_SPLIT_REORDER_FUNC-NEXT:	xorb	%al, 0
+# NO_SPLIT_REORDER_FUNC-NEXT:	int3
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 1
+# NO_SPLIT_REORDER_FUNC-NEXT:	jne	14 <a.BB.bar>
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	aa.BB.bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 3
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	aaa.BB.bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 4
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	a.BB.bar:
+# NO_SPLIT_REORDER_FUNC-NEXT: 	xorb	%al, 2
+# NO_SPLIT_REORDER_FUNC-NEXT:	jmp	-16 <aaa.BB.bar>
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	baz:
+# NO_SPLIT_REORDER_FUNC-NEXT:	xorb	%al, 5
+# NO_SPLIT_REORDER_FUNC-NEXT:	int3
+# NO_SPLIT_REORDER_FUNC-EMPTY:
+# NO_SPLIT_REORDER_FUNC-NEXT:	qux:
+# NO_SPLIT_REORDER_FUNC-NEXT:	xorb	%al, 6
+
+.section	.text,"ax",@progbits
+# -- Begin function foo
+.type	foo,@function
+.align	4
+foo:
+ xor %al,0
+
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
+# -- End function foo
+
+.section	.text,"ax",@progbits,unique,1
+# -- Begin function bar
+.type	bar,@function
+.align	4
+bar:
+ xor %al,1
+ je	aa.BB.bar
+ jmp	a.BB.bar
+
+.section	.text,"ax",@progbits,unique,2
+a.BB.bar:
+ xor %al,2
+ jmp	aaa.BB.bar
+.La.BB.bar_end:
+ .size	a.BB.bar, .La.BB.bar_end-a.BB.bar
+
+.section	.text,"ax",@progbits,unique,3
+aa.BB.bar:
+ xor %al,3
+ jmp	aaa.BB.bar
+.Laa.BB.bar_end:
+ .size	aa.BB.bar, .Laa.BB.bar_end-aa.BB.bar
+
+.section	.text,"ax",@progbits,unique,4
+aaa.BB.bar:
+ xor %al,4
+.Laaa.BB.bar_end:
+ .size	aaa.BB.bar, .Laaa.BB.bar_end-aaa.BB.bar
+
+.section	.text,"ax",@progbits,unique,1
+.Lbar_end:
+ .size	bar, .Lbar_end-bar
+# -- End function bar
+
+.section	.text,"ax",@progbits,unique,5
+# -- Begin function baz
+.type	baz,@function
+.align	4
+baz:
+ xor %al,5
+
+.Lbaz_end:
+ .size	baz, .Lbaz_end-baz
+# -- End function baz
+
+.section	.text,"ax",@progbits,unique,6
+# -- Begin function qux
+.type	qux,@function
+.align	4
+qux:
+ xor %al,6
+
+.Lqux_end:
+ .size	qux, .Lqux_end-qux
+# -- End function qux
Index: lld/test/ELF/propeller/codelayout/optimal-fallthrough.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/codelayout/optimal-fallthrough.s
@@ -0,0 +1,90 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises optimal basic block reordering one a single function.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+# RUN: ld.lld  %t.o -optimize-bb-jumps -o %t.out
+# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE
+
+# BEFORE:	0000000000201120 0000000000000005 t foo
+# BEFORE-NEXT:	0000000000201125 0000000000000005 t a.BB.foo
+# BEFORE-NEXT:	000000000020112a 0000000000000004 t aa.BB.foo
+# BEFORE-NEXT:	000000000020112e 0000000000000004 t aaa.BB.foo
+
+## Create a propeller profile for foo, based on the cfg below:
+##
+##
+##     foo
+##      |\
+##      | \100
+##      |  \
+##      |   v
+##   150|  a.BB.foo
+##      |   /  \
+##      |  /100 \0
+##      | /      \
+##      vv        v
+##   aaa.BB.foo  aa.BB.foo
+##
+## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and
+## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo.
+## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf.
+
+# RUN: echo "!foo" > %t_prof.propeller
+# RUN: echo "!!1" >> %t_prof.propeller
+# RUN: echo "!!3" >> %t_prof.propeller
+# RUN: echo "Symbols" >> %t_prof.propeller
+# RUN: echo "1 12 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 5 1.1" >> %t_prof.propeller
+# RUN: echo "3 4 1.2" >> %t_prof.propeller
+# RUN: echo "4 4 1.3" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "1 4 150" >> %t_prof.propeller
+# RUN: echo "2 4 100" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "1 2 100" >> %t_prof.propeller
+
+# RUN: ld.lld %t.o -propeller-debug-symbol=foo -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out
+# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER
+
+# AFTER:	0000000000201120 0000000000000005 t foo
+# AFTER-NEXT:	0000000000201125 0000000000000005 t a.BB.foo
+# AFTER-NEXT:	000000000020112a 0000000000000004 t aaa.BB.foo
+# AFTER-NEXT:	000000000020112e 0000000000000004 t aa.BB.foo
+
+#.global _start
+#_start:
+# -- Begin function foo
+.section	.text,"ax",@progbits
+.type	foo,@function
+foo:
+ nopl (%rax)
+ je	aaa.BB.foo
+ jmp	a.BB.foo
+
+.section	.text,"ax",@progbits,unique,1
+a.BB.foo:
+ nopl (%rax)
+ je	aaa.BB.foo
+ jmp	aa.BB.foo
+.La.BB.foo_end:
+ .size	a.BB.foo, .La.BB.foo_end-a.BB.foo
+
+.section	.text,"ax",@progbits,unique,2
+aa.BB.foo:
+ nopl (%rax)
+ ret
+.Laa.BB.foo_end:
+ .size	aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo
+
+.section	.text,"ax",@progbits,unique,3
+aaa.BB.foo:
+ nopl (%rax)
+ ret
+.Laaa.BB.foo_end:
+ .size	aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo
+
+.section	.text,"ax",@progbits
+.Lfoo_end:
+ .size	foo, .Lfoo_end-foo
+# -- End function foo