Index: lld/ELF/PropellerBBReordering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.h @@ -0,0 +1,355 @@ +//===- PropellerBBReordering.h ----------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM +//Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "PropellerELFCfg.h" + +#include "lld/Common/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +enum MergeOrder { + Begin, + X2X1Y = Begin, + BeginNext, + X1YX2 = BeginNext, + X2YX1, + YX2X1, + End +}; + +// Represents a chain of nodes (basic blocks). +class NodeChain { +public: + // Representative node of the chain, with which it is initially constructed. + const ELFCfgNode *DelegateNode = nullptr; + std::vector Nodes; + + // Total binary size of the chain + uint32_t Size = 0; + + // Total execution frequency of the chain + uint64_t Freq = 0; + + // Extended TSP score of the chain + double Score = 0; + + // Constructor for building a NodeChain from a single Node + NodeChain(const ELFCfgNode *Node) { + DelegateNode = Node; + Nodes.push_back(Node); + Size = Node->ShSize; + Freq = Node->Freq; + } + + double execDensity() const { + return ((double)Freq) / std::max(Size, (uint32_t)1); + } + + const ELFCfgNode *getFirstNode() const { return Nodes.front(); } + + const ELFCfgNode *getLastNode() const { return Nodes.back(); } +}; + +// BB Chain builder based on the ExtTSP metric +class NodeChainBuilder { +private: + class NodeChainAssembly; + class NodeChainSlice; + + // Cfg representing a single function. + const ELFCfg *CFG; + + // Set of built chains, keyed by section index of their Delegate Nodes. + DenseMap> Chains; + + // Map from every node to its containing chain. + // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeToChainMap; + + // Map from every node to its (binary) offset in its containing chain. + // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeOffsetMap; + + // These represent all the edges which are -- based on the profile -- the only + // (executed) outgoing edges from their source node and the only (executed) + // incoming edges to their sink nodes. The algorithm will make sure that these + // edges form fall-throughs in the final order. + DenseMap MutuallyForcedOut; + + // This maps every (ordered) pair of chains (with the first chain in the pair + // potentially splittable) to the highest-gain NodeChainAssembly for those + // chains. + DenseMap, + std::unique_ptr> + NodeChainAssemblies; + + // This map stores the candidate chains for each chain. + // + // For every chain, its candidate chains are the chains which can increase the + // overall ExtTSP score when merged with that chain. This is used to update + // the NodeChainAssemblies map whenever chains merge together. The candidate + // chains of a chain may also be updated as result of a merge. 
+ std::unordered_map> + CandidateChains; + + void sortChainsByExecutionDensity(std::vector &ChainOrder); + + void initializeExtTSP(); + void attachFallThroughs(); + + // This function tries to place two nodes immediately adjacent to + // each other (used for fallthroughs). + // Returns true if this can be done. + bool attachNodes(const ELFCfgNode *src, const ELFCfgNode *sink); + + void mergeChainEdges(NodeChain *splitChain, NodeChain *unSplitChain); + + void mergeChains(NodeChain *leftChain, NodeChain *rightChain); + void mergeChains(std::unique_ptr assembly); + + // Recompute the ExtTSP score of a chain + double computeExtTSPScore(NodeChain *chain) const; + + // Update the related NodeChainAssembly records for two chains, with the + // assumption that UnsplitChain has been merged into SplitChain. + bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain); + + void computeChainOrder(std::vector &chainOrder); + + // Initialize the mutuallyForcedOut map + void initMutuallyForcedEdges(); + + // Initialize basic block chains, with one chain for every node + void initNodeChains(); + + uint32_t getNodeOffset(const ELFCfgNode *node) const { + auto it = NodeOffsetMap.find(node); + assert("Node does not exist in the offset map." && + it != NodeOffsetMap.end()); + return it->second; + } + + NodeChain *getNodeChain(const ELFCfgNode *node) const { + auto it = NodeToChainMap.find(node); + assert("Node does not exist in the chain map." && + it != NodeToChainMap.end()); + return it->second; + } + +public: + NodeChainBuilder(const ELFCfg *_CFG) : CFG(_CFG) { + initNodeChains(); + initMutuallyForcedEdges(); + } + + // This invokes the Extended TSP algorithm, orders the hot and cold basic + // blocks and inserts their associated symbols at the corresponding locations + // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the + // given SymbolList. This function also modifies the alignment of basic blocks + // based on the new order if the feature is requested by + // -propeller-align-basic-blocks. + void doSplitOrder(std::list &SymbolList, + std::list::iterator HotPlaceHolder, + std::list::iterator ColdPlaceHolder); +}; + +class NodeChainBuilder::NodeChainSlice { +private: + // Chain from which this slice comes from + NodeChain *Chain; + + // The endpoints of the slice in the corresponding chain + std::vector::iterator Begin, End; + + // The offsets corresponding to the two endpoints + uint32_t BeginOffset, EndOffset; + + // Constructor for building a chain slice from a given chain and the two + // endpoints of the chain. 
+ NodeChainSlice(NodeChain *c, std::vector::iterator begin, + std::vector::iterator end, + const NodeChainBuilder &chainBuilder) + : Chain(c), Begin(begin), End(end) { + + BeginOffset = chainBuilder.getNodeOffset(*begin); + if (End == Chain->Nodes.end()) + EndOffset = Chain->Size; + else + EndOffset = chainBuilder.getNodeOffset(*end); + } + + // (Binary) size of this slice + uint32_t size() const { return EndOffset - BeginOffset; } + + friend class NodeChainBuilder; + friend class NodeChainAssembly; +}; + +class NodeChainBuilder::NodeChainAssembly { +private: + // Total ExtTSP score of this NodeChainAssembly + double Score = 0; + + // Whether the ExtTSP score has been computed for this NodeChainAssembly + bool ScoreComputed = false; + + // Corresponding NodeChainBuilder of the assembled chains + const NodeChainBuilder *ChainBuilder; + + // The two assembled chains + NodeChain *SplitChain, *UnsplitChain; + + // The slice position of Splitchain + std::vector::iterator SlicePosition; + + // The three chain slices + std::vector Slices; + + NodeChainAssembly(NodeChain *chainX, NodeChain *chainY, + std::vector::iterator slicePosition, + MergeOrder mergeOrder, const NodeChainBuilder *chainBuilder) + : ChainBuilder(chainBuilder), SplitChain(chainX), UnsplitChain(chainY), + SlicePosition(slicePosition) { + NodeChainSlice x1(chainX, chainX->Nodes.begin(), SlicePosition, + *chainBuilder); + NodeChainSlice x2(chainX, SlicePosition, chainX->Nodes.end(), + *chainBuilder); + NodeChainSlice y(chainY, chainY->Nodes.begin(), chainY->Nodes.end(), + *chainBuilder); + + switch (mergeOrder) { + case MergeOrder::X2X1Y: + Slices = {x2, x1, y}; + break; + case MergeOrder::X1YX2: + Slices = {x1, y, x2}; + break; + case MergeOrder::X2YX1: + Slices = {x2, y, x1}; + break; + case MergeOrder::YX2X1: + Slices = {y, x2, x1}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + } + + double extTSPScore() { + if (ScoreComputed) + return Score; + else { + ScoreComputed = true; + return (Score = computeExtTSPScore()); + } + } + + // Return the gain in ExtTSP score achieved by this NodeChainAssembly once it + // is accordingly applied to the two chains. + double extTSPScoreGain() { + return extTSPScore() - SplitChain->Score - UnsplitChain->Score; + } + + // Find the NodeChainSlice in this NodeChainAssembly which contains the given + // node. If the node is not contained in this NodeChainAssembly, then return + // false. Otherwise, set idx equal to the index of the corresponding slice and + // return true. + bool findSliceIndex(const ELFCfgNode *node, uint8_t &idx) const { + // First find the chain containing the given node. + NodeChain *chain = ChainBuilder->getNodeChain(node); + + // Return false if the found chain is not used in this NodeChainAssembly. + if (SplitChain != chain && UnsplitChain != chain) + return false; + + // Find the slice containing the node using the node's offset in the chain + auto offset = ChainBuilder->getNodeOffset(node); + + for (idx = 0; idx < 3; ++idx) { + if (chain != Slices[idx].Chain) + continue; + if (offset < Slices[idx].BeginOffset) + continue; + if (offset > Slices[idx].EndOffset) + continue; + if (offset < Slices[idx].EndOffset && offset > Slices[idx].BeginOffset) + return true; + // A node can have zero size, which means multiple nodes may be associated + // with the same offset. This means that if the node's offset is at the + // beginning or the end of the slice, the node may reside in either slices + // of the chain. 
+      if (offset == Slices[idx].EndOffset) {
+        // If offset is at the end of the slice, iterate backwards over the
+        // slice to find a zero-sized node.
+        for (auto nodeIt = std::prev(Slices[idx].End);
+             nodeIt != std::prev(Slices[idx].Begin); nodeIt--) {
+          if (*nodeIt == node)
+            return true;
+          // Stop iterating if the node's size is non-zero as this would change
+          // the offset.
+          if ((*nodeIt)->ShSize)
+            break;
+        }
+      }
+      if (offset == Slices[idx].BeginOffset) {
+        // If offset is at the beginning of the slice, iterate forwards over
+        // the slice to find the node.
+        for (auto nodeIt = Slices[idx].Begin; nodeIt != Slices[idx].End;
+             nodeIt++) {
+          if (*nodeIt == node)
+            return true;
+          // Stop iterating if the node's size is non-zero as this would change
+          // the offset.
+          if ((*nodeIt)->ShSize)
+            break;
+        }
+      }
+    }
+    return false;
+  }
+
+  // Total Extended TSP score of this NodeChainAssembly once it is assembled
+  // accordingly.
+  double computeExtTSPScore() const;
+
+  // First node in the resulting assembled chain.
+  const ELFCfgNode *getFirstNode() const {
+    for (auto &slice : Slices)
+      if (slice.Begin != slice.End)
+        return *slice.Begin;
+    return nullptr;
+  }
+
+  friend class NodeChainBuilder;
+
+public:
+  // Declaring the move constructor implicitly deletes the copy constructor,
+  // which makes sure a NodeChainAssembly is moved rather than copied.
+  NodeChainAssembly(NodeChainAssembly &&) = default;
+  NodeChainAssembly() = delete;
+};
+
+} // namespace propeller
+} // namespace lld
+#endif
Index: lld/ELF/PropellerBBReordering.cpp
===================================================================
--- /dev/null
+++ lld/ELF/PropellerBBReordering.cpp
@@ -0,0 +1,627 @@
+//===- PropellerBBReordering.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the Propeller infrastructure for doing code layout
+// optimization and includes the implementation of the intra-function basic
+// block reordering algorithm based on the Extended TSP metric
+// (https://arxiv.org/abs/1809.04676).
+//
+// The Extended TSP metric (ExtTSP) provides a score for every ordering of
+// basic blocks in a function, by combining the gains from fall-throughs and
+// short jumps.
+//
+// Given an ordering of the basic blocks of a function f, the ExtTSP score is
+// computed as follows.
+//
+// sum_{edges e in f} frequency(e) * weight(e)
+//
+// where frequency(e) is the execution frequency and weight(e) is computed as
+// follows:
+// * 1 if distance(Src[e], Sink[e]) = 0 (i.e. fallthrough)
+// * 0.1 * (1 - distance(Src[e], Sink[e]) / 1024) if Src[e] < Sink[e] and
+//   0 < distance(Src[e], Sink[e]) < 1024 (i.e. short forward jump)
+// * 0.1 * (1 - distance(Src[e], Sink[e]) / 640) if Src[e] > Sink[e] and
+//   0 < distance(Src[e], Sink[e]) < 640 (i.e. short backward jump)
+// * 0 otherwise
+//
+// In short, it computes a weighted sum of frequencies of all edges in the
+// control flow graph. Each edge gets its weight depending on whether the given
+// ordering makes the edge a fallthrough, a short forward jump, or a short
+// backward jump.
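+//
+// As a small worked illustration (with hypothetical block sizes and edge
+// frequencies, using the default parameters above): consider three blocks A, B
+// and C, each 16 bytes, laid out in the order A, B, C, with profiled
+// intra-function edges A->B (frequency 100) and A->C (frequency 10). A->B
+// becomes a fallthrough (distance 0) and A->C becomes a forward jump of
+// distance 16, so the ExtTSP score of this layout is
+//
+//   100 * 1 + 10 * 0.1 * (1 - 16 / 1024) = 100.984375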
+//
+// Although this problem is NP-hard like the regular TSP, an iterative greedy
+// basic-block-chaining algorithm is used to find a close-to-optimal solution.
+// This algorithm is described as follows.
+//
+// Starting with one basic block sequence (BB chain) for every basic block, the
+// algorithm iteratively joins BB chains together in order to maximize the
+// extended TSP score of all the chains.
+//
+// Initially, it finds all mutually-forced edges in the profiled CFG. These are
+// all the edges which are -- based on the profile -- the only (executed)
+// outgoing edge from their source node and the only (executed) incoming edges
+// to their sink nodes. Next, the source and sink of all mutually-forced edges
+// are attached together as fallthrough edges.
+//
+// Then, at every iteration, the algorithm tries to merge the pair of BB chains
+// which leads to the highest gain in the ExtTSP score. The algorithm extends
+// the search space by considering splitting short (less than 128 bytes in
+// binary size) BB chains into two chains and then merging these two chains
+// with the other chain in four ways. After every merge, the new merge gains
+// are updated. The algorithm repeats joining BB chains until no additional
+// gain can be achieved. Finally, it sorts all the existing chains in
+// decreasing order of their execution density, i.e., the total profiled
+// frequency of the chain divided by its binary size.
+//
+// The values used by this algorithm are reconfigurable using lld's propeller
+// flags. Specifically, these parameters are:
+// * propeller-forward-jump-distance: maximum distance of a forward jump
+//   (default-set to 1024 in the above equation).
+// * propeller-backward-jump-distance: maximum distance of a backward jump
+//   (default-set to 640 in the above equation).
+// * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1)
+// * propeller-forward-jump-weight: weight of a forward jump (default-set to
+//   0.1)
+// * propeller-backward-jump-weight: weight of a backward jump (default-set
+//   to 0.1)
+// * propeller-chain-split-threshold: maximum binary size of a BB chain
+//   which the algorithm will consider for splitting (default-set to 128).
+//===----------------------------------------------------------------------===//
+#include "PropellerBBReordering.h"
+
+#include "Config.h"
+#include "llvm/ADT/DenseSet.h"
+
+#include
+
+#include
+#include
+
+using lld::elf::config;
+
+using llvm::DenseSet;
+using llvm::detail::DenseMapPair;
+
+namespace lld {
+namespace propeller {
+
+// Return the Extended TSP score for one edge, given its source to sink
+// direction and distance in the layout.
+double getEdgeExtTSPScore(const ELFCfgEdge *edge, bool isEdgeForward,
+                          uint32_t srcSinkDistance) {
+  if (edge->Weight == 0)
+    return 0;
+
+  if (srcSinkDistance == 0 && (edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC))
+    return edge->Weight * config->propellerFallthroughWeight;
+
+  if (isEdgeForward && srcSinkDistance < config->propellerForwardJumpDistance)
+    return edge->Weight * config->propellerForwardJumpWeight *
+           (1.0 -
+            ((double)srcSinkDistance) / config->propellerForwardJumpDistance);
+
+  if (!isEdgeForward && srcSinkDistance < config->propellerBackwardJumpDistance)
+    return edge->Weight * config->propellerBackwardJumpWeight *
+           (1.0 -
+            ((double)srcSinkDistance) / config->propellerBackwardJumpDistance);
+  return 0;
+}
+
+// Sort BB chains in decreasing order of their execution density.
+// NodeChainBuilder calls this function at the end to ensure that hot BB chains
+// are placed at the beginning of the function.
+void NodeChainBuilder::sortChainsByExecutionDensity(
+    std::vector<NodeChain *> &chainOrder) {
+  for (auto CI = Chains.begin(), CE = Chains.end(); CI != CE; ++CI) {
+    chainOrder.push_back(CI->second.get());
+  }
+
+  std::sort(
+      chainOrder.begin(), chainOrder.end(),
+      [this](const NodeChain *c1, const NodeChain *c2) {
+        if (c1->getFirstNode() == this->CFG->getEntryNode())
+          return true;
+        if (c2->getFirstNode() == this->CFG->getEntryNode())
+          return false;
+        double c1ExecDensity = c1->execDensity();
+        double c2ExecDensity = c2->execDensity();
+        if (c1ExecDensity == c2ExecDensity) {
+          if (c1->DelegateNode->MappedAddr == c2->DelegateNode->MappedAddr)
+            return c1->DelegateNode->Shndx < c2->DelegateNode->Shndx;
+          return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr;
+        }
+        return c1ExecDensity > c2ExecDensity;
+      });
+}
+
+// NodeChainBuilder calls this function after building all the chains to attach
+// as many fall-throughs as possible. Given that the algorithm has already
+// optimized the extended TSP score, this function will only affect the cold
+// basic blocks and thus we do not need to consider the edge weights.
+void NodeChainBuilder::attachFallThroughs() {
+  // First, try to keep the fall-throughs from the original order.
+  for (auto &node : CFG->Nodes) {
+    if (node->FTEdge != nullptr) {
+      attachNodes(node.get(), node->FTEdge->Sink);
+    }
+  }
+
+  // Sometimes, the original fall-throughs cannot be kept. So we try to find new
+  // fall-through opportunities which did not exist in the original order.
+  for (auto &edge : CFG->IntraEdges) {
+    attachNodes(edge->Src, edge->Sink);
+  }
+}
+
+// Merge two chains in the specified order.
+void NodeChainBuilder::mergeChains(NodeChain *leftChain,
+                                   NodeChain *rightChain) {
+  for (const ELFCfgNode *node : rightChain->Nodes) {
+    leftChain->Nodes.push_back(node);
+    NodeToChainMap[node] = leftChain;
+    NodeOffsetMap[node] += leftChain->Size;
+  }
+  leftChain->Size += rightChain->Size;
+  leftChain->Freq += rightChain->Freq;
+  Chains.erase(rightChain->DelegateNode->Shndx);
+}
+
+// This function tries to place two basic blocks immediately adjacent to each
+// other (used for fallthroughs). Returns true if the basic blocks have been
+// attached this way.
+bool NodeChainBuilder::attachNodes(const ELFCfgNode *src,
+                                   const ELFCfgNode *sink) {
+  // An edge cannot fall through to the entry basic block.
+  if (sink == CFG->getEntryNode())
+    return false;
+
+  // Ignore edges between hot and cold basic blocks.
+  if ((src->Freq == 0) ^ (sink->Freq == 0))
+    return false;
+  NodeChain *srcChain = getNodeChain(src);
+  NodeChain *sinkChain = getNodeChain(sink);
+  // Skip this edge if the source and sink are in the same chain.
+  if (srcChain == sinkChain)
+    return false;
+
+  // It's only possible to form a fall-through between src and sink if they are
+  // respectively located at the end and at the beginning of their chains.
+  if (srcChain->getLastNode() != src || sinkChain->getFirstNode() != sink)
+    return false;
+  // Attaching is possible. So we merge the chains in the corresponding order.
+  mergeChains(srcChain, sinkChain);
+  return true;
+}
+
+// Merge two BB sequences according to the given NodeChainAssembly. A
+// NodeChainAssembly is an ordered triple of three slices from two chains.
+void NodeChainBuilder::mergeChains( + std::unique_ptr assembly) { + // Create the new node order according the given assembly + std::vector newNodeOrder; + for (NodeChainSlice &slice : assembly->Slices) { + std::copy(slice.Begin, slice.End, std::back_inserter(newNodeOrder)); + } + // We merge the nodes into the SplitChain + assembly->SplitChain->Nodes = std::move(newNodeOrder); + + // Update nodeOffset and nodeToChainMap for all the nodes in the sequence. + uint32_t runningOffset = 0; + for (const ELFCfgNode *node : assembly->SplitChain->Nodes) { + NodeToChainMap[node] = assembly->SplitChain; + NodeOffsetMap[node] = runningOffset; + runningOffset += node->ShSize; + } + assembly->SplitChain->Size = runningOffset; + + // Update the total frequency and ExtTSP score of the aggregated chain + assembly->SplitChain->Freq += assembly->UnsplitChain->Freq; + // We have already computed the gain in the assembly record. So we can just + // increment the aggregated chain's score by that gain. + assembly->SplitChain->Score += assembly->extTSPScoreGain(); + + // Merge the assembly candidate chains of the two chains into the candidate + // chains of the remaining NodeChain and remove the records for the defunct + // NodeChain. + for (NodeChain *c : CandidateChains[assembly->UnsplitChain]) { + NodeChainAssemblies.erase(std::make_pair(c, assembly->UnsplitChain)); + NodeChainAssemblies.erase(std::make_pair(assembly->UnsplitChain, c)); + CandidateChains[c].erase(assembly->UnsplitChain); + if (c != assembly->SplitChain) + CandidateChains[assembly->SplitChain].insert(c); + } + + // Update the NodeChainAssembly for all candidate chains of the merged + // NodeChain. Remove a NodeChain from the merge chain's candidates if the + // NodeChainAssembly update finds no gain. + auto &splitChainCandidateChains = CandidateChains[assembly->SplitChain]; + + for (auto CI = splitChainCandidateChains.begin(), + CE = splitChainCandidateChains.end(); + CI != CE;) { + NodeChain *otherChain = *CI; + auto &otherChainCandidateChains = CandidateChains[otherChain]; + bool x = updateNodeChainAssembly(otherChain, assembly->SplitChain); + bool y = updateNodeChainAssembly(assembly->SplitChain, otherChain); + if (x || y) { + otherChainCandidateChains.insert(assembly->SplitChain); + CI++; + } else { + otherChainCandidateChains.erase(assembly->SplitChain); + CI = splitChainCandidateChains.erase(CI); + } + } + + // remove all the candidate chain records for the merged-in chain + CandidateChains.erase(assembly->UnsplitChain); + + // Finally, remove the defunct (merged-in) chain record from the chains + Chains.erase(assembly->UnsplitChain->DelegateNode->Shndx); +} + +// Calculate the Extended TSP metric for a BB chain. +// This function goes over all the BBs in the chain and for BB chain and +// aggregates the score of all edges which are contained in the same chain. +double NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const { + double score = 0; + uint32_t srcOffset = 0; + for (const ELFCfgNode *node : chain->Nodes) { + for (const ELFCfgEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + NodeChain *sinkChain = getNodeChain(edge->Sink); + if (sinkChain != chain) + continue; + auto sinkOffset = getNodeOffset(edge->Sink); + bool edgeForward = srcOffset + node->ShSize <= sinkOffset; + // Calculate the distance between src and sink + uint32_t distance = edgeForward ? 
sinkOffset - srcOffset - node->ShSize
+                                   : srcOffset - sinkOffset + node->ShSize;
+      score += getEdgeExtTSPScore(edge, edgeForward, distance);
+    }
+    srcOffset += node->ShSize;
+  }
+  return score;
+}
+
+// Updates the best NodeChainAssembly between two NodeChains. The existing
+// record will be replaced by the new NodeChainAssembly if a positive gain
+// is achieved. Otherwise, it will be removed.
+// Returns true if a NodeChainAssembly record is inserted (kept), and false
+// otherwise.
+bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain,
+                                               NodeChain *unsplitChain) {
+  // Remove the existing assembly record.
+  NodeChainAssemblies.erase(std::make_pair(splitChain, unsplitChain));
+
+  // Only consider splitting the chain if the size of the chain is smaller than
+  // a threshold.
+  bool doSplit = (splitChain->Size <= config->propellerChainSplitThreshold);
+  auto slicePosEnd =
+      doSplit ? splitChain->Nodes.end() : std::next(splitChain->Nodes.begin());
+
+  SmallVector<unique_ptr<NodeChainAssembly>, 128> candidateNCAs;
+
+  for (auto slicePos = splitChain->Nodes.begin(); slicePos != slicePosEnd;
+       ++slicePos) {
+    // Do not split the mutually-forced edges in the chain.
+    if (slicePos != splitChain->Nodes.begin() &&
+        MutuallyForcedOut[*std::prev(slicePos)] == *slicePos)
+      continue;
+
+    // If the split position is at the beginning (no splitting), only consider
+    // one MergeOrder.
+    auto mergeOrderEnd = (slicePos == splitChain->Nodes.begin())
+                             ? MergeOrder::BeginNext
+                             : MergeOrder::End;
+
+    for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) {
+      MergeOrder mOrder = static_cast<MergeOrder>(MI);
+
+      // Create the NodeChainAssembly representing this particular assembly.
+      auto NCA = unique_ptr<NodeChainAssembly>(new NodeChainAssembly(
+          splitChain, unsplitChain, slicePos, mOrder, this));
+
+      ELFCfgNode *entryNode = CFG->getEntryNode();
+      if ((NCA->SplitChain->getFirstNode() == entryNode ||
+           NCA->UnsplitChain->getFirstNode() == entryNode) &&
+          NCA->getFirstNode() != entryNode)
+        continue;
+      candidateNCAs.push_back(std::move(NCA));
+    }
+  }
+
+  // Insert the best assembly of the two chains (only if it has positive gain).
+  auto bestCandidate = std::max_element(
+      candidateNCAs.begin(), candidateNCAs.end(),
+      [](unique_ptr<NodeChainAssembly> &c1, unique_ptr<NodeChainAssembly> &c2) {
+        return c1->extTSPScoreGain() < c2->extTSPScoreGain();
+      });
+
+  if (bestCandidate != candidateNCAs.end() &&
+      (*bestCandidate)->extTSPScoreGain() > 0) {
+    NodeChainAssemblies.try_emplace(std::make_pair(splitChain, unsplitChain),
+                                    std::move(*bestCandidate));
+    return true;
+  }
+  return false;
+}
+
+void NodeChainBuilder::initNodeChains() {
+  for (auto &node : CFG->Nodes) {
+    NodeChain *chain = new NodeChain(node.get());
+    NodeToChainMap[node.get()] = chain;
+    NodeOffsetMap[node.get()] = 0;
+    Chains.try_emplace(node->Shndx, chain);
+  }
+}
+
+// Find all the mutually-forced edges.
+// These are all the edges which are -- based on the profile -- the only +// (executed) outgoing edge from their source node and the only (executed) +// incoming edges to their sink nodes +void NodeChainBuilder::initMutuallyForcedEdges() { + DenseMap> profiledOuts; + DenseMap> profiledIns; + + for (auto &node : CFG->Nodes) { + std::copy_if(node->Outs.begin(), node->Outs.end(), + std::back_inserter(profiledOuts[node.get()]), + [](const ELFCfgEdge *edge) { + return edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + std::copy_if(node->Ins.begin(), node->Ins.end(), + std::back_inserter(profiledIns[node.get()]), + [](const ELFCfgEdge *edge) { + return edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + } + + for (auto &node : CFG->Nodes) { + if (profiledOuts[node.get()].size() == 1) { + ELFCfgEdge *edge = profiledOuts[node.get()].front(); + if (profiledIns[edge->Sink].size() == 1) + MutuallyForcedOut[node.get()] = edge->Sink; + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + DenseMap nodeToPathMap; + SmallVector cycleCutNodes; + unsigned pathCount = 0; + for (auto it = MutuallyForcedOut.begin(); it != MutuallyForcedOut.end(); + ++it) { + // Check to see if the node (and its cycle) have already been visited. + if (nodeToPathMap[it->first]) + continue; + const ELFCfgEdge *victimEdge = nullptr; + auto nodeIt = it; + pathCount++; + while (nodeIt != MutuallyForcedOut.end()) { + const ELFCfgNode *node = nodeIt->first; + unsigned path = nodeToPathMap[node]; + if (path != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (path == pathCount) { + // We have found a cycle: add the victim edge + cycleCutNodes.push_back(victimEdge->Src); + } + break; + } else + nodeToPathMap[node] = pathCount; + const ELFCfgEdge *edge = profiledOuts[node].front(); + if (!victimEdge || + (edge->Sink->MappedAddr < victimEdge->Sink->MappedAddr)) { + victimEdge = edge; + } + nodeIt = MutuallyForcedOut.find(nodeIt->second); + } + } + + // Remove the victim edges to break cycles in the mutually forced edges + for (const ELFCfgNode *node : cycleCutNodes) + MutuallyForcedOut.erase(node); +} + +// This function initializes the ExtTSP algorithm's data structures. This +// the NodeChainAssemblies and the CandidateChains maps. +void NodeChainBuilder::initializeExtTSP() { + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. 
+ DenseSet> visited; + for (auto &c : Chains) { + NodeChain *chain = c.second.get(); + chain->Score = computeExtTSPScore(chain); + for (const ELFCfgNode *node : chain->Nodes) { + for (const ELFCfgEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + NodeChain *otherChain = getNodeChain(edge->Sink); + if (chain == otherChain) + continue; + if (visited.count(std::make_pair(chain, otherChain))) + continue; + bool x = updateNodeChainAssembly(chain, otherChain); + bool y = updateNodeChainAssembly(otherChain, chain); + if (x || y) { + CandidateChains[chain].insert(otherChain); + CandidateChains[otherChain].insert(chain); + } + visited.insert(std::make_pair(chain, otherChain)); + visited.insert(std::make_pair(otherChain, chain)); + } + } + } +} + +void NodeChainBuilder::computeChainOrder( + std::vector &chainOrder) { + // Attach the mutually-foced edges (which will not be split anymore by the + // Extended TSP algorithm). + for (auto &kv : MutuallyForcedOut) + attachNodes(kv.first, kv.second); + + // Initialize the Extended TSP algorithm's data. + initializeExtTSP(); + + // Keep merging the chain assembly record with the highest ExtTSP gain, until + // no more gain is possible. + bool merged = false; + do { + merged = false; + auto bestCandidate = std::max_element( + NodeChainAssemblies.begin(), NodeChainAssemblies.end(), + [](DenseMapPair, + unique_ptr> &c1, + DenseMapPair, + unique_ptr> &c2) { + return c1.second->extTSPScoreGain() < c2.second->extTSPScoreGain(); + }); + + if (bestCandidate != NodeChainAssemblies.end() && + bestCandidate->second->extTSPScoreGain() > 0) { + unique_ptr bestCandidateNCA = + std::move(bestCandidate->second); + NodeChainAssemblies.erase(bestCandidate); + mergeChains(std::move(bestCandidateNCA)); + merged = true; + } + } while (merged); + + // Merge fallthrough basic blocks if we have missed any + attachFallThroughs(); + + sortChainsByExecutionDensity(chainOrder); +} + +// This function computes the ExtTSP score for a chain assembly record. This +// goes the three BB slices in the assembly record and considers all edges +// whose source and sink belongs to the chains in the assembly record. +double NodeChainBuilder::NodeChainAssembly::computeExtTSPScore() const { + double score = 0; + for (uint8_t srcSliceIdx = 0; srcSliceIdx < 3; ++srcSliceIdx) { + const NodeChainSlice &srcSlice = Slices[srcSliceIdx]; + uint32_t srcNodeOffset = srcSlice.BeginOffset; + for (auto nodeIt = srcSlice.Begin; nodeIt != srcSlice.End; + srcNodeOffset += (*nodeIt)->ShSize, ++nodeIt) { + const ELFCfgNode *node = *nodeIt; + for (const ELFCfgEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + + uint8_t sinkSliceIdx; + + if (findSliceIndex(edge->Sink, sinkSliceIdx)) { + auto sinkNodeOffset = ChainBuilder->getNodeOffset(edge->Sink); + bool edgeForward = (srcSliceIdx < sinkSliceIdx) || + (srcSliceIdx == sinkSliceIdx && + (srcNodeOffset + node->ShSize <= sinkNodeOffset)); + + uint32_t distance = 0; + + if (srcSliceIdx == sinkSliceIdx) { + distance = edgeForward + ? sinkNodeOffset - srcNodeOffset - node->ShSize + : srcNodeOffset - sinkNodeOffset + node->ShSize; + } else { + const NodeChainSlice &sinkSlice = Slices[sinkSliceIdx]; + distance = edgeForward + ? srcSlice.EndOffset - srcNodeOffset - node->ShSize + + sinkNodeOffset - sinkSlice.BeginOffset + : srcNodeOffset - srcSlice.BeginOffset + + node->ShSize + sinkSlice.EndOffset - + sinkNodeOffset; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. 
+ if (std::abs(sinkSliceIdx - srcSliceIdx) == 2) + distance += Slices[1].size(); + } + score += getEdgeExtTSPScore(edge, edgeForward, distance); + } + } + } + } + return score; +} + +void NodeChainBuilder::doSplitOrder(list &symbolList, + list::iterator hotPlaceHolder, + list::iterator coldPlaceHolder) { + std::vector chainOrder; + computeChainOrder(chainOrder); + + DenseMap address; + unsigned currentAddress = 0; + for (const NodeChain *c : chainOrder) { + list::iterator insertPos = + c->Freq ? hotPlaceHolder : coldPlaceHolder; + for (const ELFCfgNode *n : c->Nodes) { + symbolList.insert(insertPos, n->ShName); + if (c->Freq) { + address[n] = currentAddress; + currentAddress += n->ShSize; + } + } + } + + if (config->propellerAlignBasicBlocks) { + + enum VisitStatus { NONE = 0, DURING, FINISHED }; + + DenseMap backEdgeFreq; + DenseMap visited; + + std::function visit; + visit = [&address, &visited, &backEdgeFreq, &visit](const ELFCfgNode *n) { + if (visited[n] != NONE) + return; + if (!n->Freq) + return; + visited[n] = DURING; + if (n->FTEdge) + visit(n->FTEdge->Sink); + for (const ELFCfgEdge *e : n->Outs) { + if (e->Sink->Freq && address[e->Sink] > address[n]) + visit(e->Sink); + } + for (const ELFCfgEdge *e : n->Outs) { + if (e->Sink->Freq && address[e->Sink] <= address[n]) { + if (visited[e->Sink] == DURING) { + backEdgeFreq[e->Sink] += e->Weight; + } + } + } + visited[n] = FINISHED; + }; + + for (const NodeChain *c : chainOrder) + if (c->Freq != 0) { + for (const ELFCfgNode *n : c->Nodes) + visit(n); + } + + for (auto &n : CFG->Nodes) { + if (n.get() == CFG->getEntryNode()) + continue; + if (n->Freq && (n->Freq >= 10 * CFG->getEntryNode()->Freq) && + (backEdgeFreq[n.get()] * 5 >= n->Freq * 4)) { + config->symbolAlignmentFile.insert(std::make_pair(n->ShName, 16)); + } else + config->symbolAlignmentFile.insert(std::make_pair(n->ShName, 1)); + } + } +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/PropellerFuncOrdering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.h @@ -0,0 +1,74 @@ +//===- PropellerFuncReordering.h +//--------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_PROPELLER_FUNC_ORDERING_H +#define LLD_ELF_PROPELLER_FUNC_ORDERING_H + +#include "Propeller.h" + +#include +#include +#include + +namespace lld { +namespace propeller { + +class ELFCfg; + +class CallChainClustering { +public: + class Cluster { + public: + Cluster(ELFCfg *cfg, unsigned); + // All cfgs in this cluster + std::vector CFGs; + // Unique id associated with the cluster + unsigned Id; + // Total binary size of this cluster (only the hot part if using + // split-funcs. + uint64_t Size; + // Total byte-level execution frequency of the cluster + uint64_t Weight; + + // Merge the "other" cluster into this cluster. 
+ Cluster &mergeWith(Cluster &other) { + CFGs.insert(CFGs.end(), other.CFGs.begin(), other.CFGs.end()); + this->Weight += other.Weight; + this->Size += other.Size; + return *this; + } + + // Returns the per-byte execution density of this cluster + double getDensity() { return ((double)Weight) / Size; } + }; + + CallChainClustering() {} + + void init(Propeller &propeller); + + unsigned doOrder(std::list &cfgOrder); + +private: + unsigned ClusterCount = 0; + + ELFCfg *getMostLikelyPredecessor(ELFCfg *cfg, Cluster *cluster); + + void mergeClusters(); + void sortClusters(std::vector &); + + std::vector HotCFGs, ColdCFGs; + std::map CFGToClusterMap; + std::map> Clusters; +}; + +} // namespace propeller +} // namespace lld + +#endif Index: lld/ELF/PropellerFuncOrdering.cpp =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.cpp @@ -0,0 +1,193 @@ +//===- PropellerFuncReordering.cpp +//-------------------------------------------===// +//// +//// Part of the LLVM Project, under the Apache License v2.0 with LLVM +// Exceptions. +//// See https://llvm.org/LICENSE.txt for license information. +//// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +//// +////===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastcture for doing code layout +// optimization and includes the implementation of function reordering based on +// the CallChainClustering algorithm as published in [1]. +// +// This algorithm keeps merging functions together into clusters until the +// cluster sizes reach a limit. The algorithm iterates over functions in +// decreasing order of their execution density (total frequency divided by size) +// and for each function, it first finds the cluster containing the +// most-frequent caller of that function and then places the caller's cluster +// right before the callee's cluster. Finally, all the remaining clusters are +// ordered in decreasing order of their execution density. +// +// [1] G.Ottoni and B.Maher, Optimizing Function Placement for Large-Scale +// Data-Center Applications, CGO 2017. available at +// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +#include "PropellerFuncOrdering.h" + +#include "Config.h" +#include "PropellerELFCfg.h" + +#include +#include + +using lld::elf::config; + +namespace lld { +namespace propeller { + +const unsigned ClusterMergeSizeThreshold = 1 << 21; + +// Initializes the CallChainClustering algorithm with the cfgs from propeller. +// It separates cfgs into hot and cold cfgs and initially orders each collection +// of cfgs based on the address of their corresponding functions in the original +// binary. +void CallChainClustering::init(Propeller &propeller) { + propeller.forEachCfgRef([this](ELFCfg &cfg) { + if (cfg.isHot()) + this->HotCFGs.push_back(&cfg); + else + this->ColdCFGs.push_back(&cfg); + }); + + auto cfgComparator = [](ELFCfg *cfg1, ELFCfg *cfg2) -> bool { + return cfg1->getEntryNode()->MappedAddr < cfg2->getEntryNode()->MappedAddr; + }; + auto sortCFGs = [&cfgComparator](std::vector &cfgs) { + std::sort(cfgs.begin(), cfgs.end(), cfgComparator); + }; + sortCFGs(HotCFGs); + sortCFGs(ColdCFGs); +} + +// Initialize a cluster containing a single cfg an associates it with a unique +// id. +CallChainClustering::Cluster::Cluster(ELFCfg *cfg, unsigned id) + : CFGs(1, cfg), Id(id) {} + +// Returns the most frequent caller of a function. 
This function also takes the
+// cluster containing this function as its second parameter, to save a lookup
+// into the CFGToClusterMap.
+ELFCfg *CallChainClustering::getMostLikelyPredecessor(ELFCfg *cfg,
+                                                      Cluster *cluster) {
+  ELFCfgNode *entry = cfg->getEntryNode();
+  if (!entry)
+    return nullptr;
+  ELFCfgEdge *bestCallIn = nullptr;
+
+  // Iterate over all callers of the entry basic block of the function.
+  for (ELFCfgEdge *callIn : entry->CallIns) {
+    auto *caller = callIn->Src->Cfg;
+    auto *callerCluster = CFGToClusterMap[caller];
+    assert(caller->isHot());
+    // Ignore callers from the same function, or the same cluster.
+    if (caller == cfg || callerCluster == cluster)
+      continue;
+    // Ignore callers with overly large clusters.
+    if (callerCluster->Size > ClusterMergeSizeThreshold)
+      continue;
+    // Ignore calls which are cold relative to the callee.
+    if (callIn->Weight * 10 < entry->Freq)
+      continue;
+    // Do not merge if the caller cluster's density would drop below 1/8 of its
+    // current value after the merge.
+    if (8 * callerCluster->Size * (cluster->Weight + callerCluster->Weight) <
+        callerCluster->Weight * (cluster->Size + callerCluster->Size))
+      continue;
+    // Update the best CallIn edge if needed.
+    if (!bestCallIn || bestCallIn->Weight < callIn->Weight)
+      bestCallIn = callIn;
+  }
+  return bestCallIn ? bestCallIn->Src->Cfg : nullptr;
+}
+
+// Merge clusters together based on the CallChainClustering algorithm.
+void CallChainClustering::mergeClusters() {
+  // Build a map for the execution density of each cfg. This value will depend
+  // on whether function-splitting is used or not.
+  std::map<ELFCfg *, double> cfgWeightMap;
+  for (ELFCfg *cfg : HotCFGs) {
+    uint64_t cfgWeight = 0;
+    uint64_t cfgSize = 0;
+    cfg->forEachNodeRef([&cfgSize, &cfgWeight](ELFCfgNode &n) {
+      cfgWeight += n.Freq * n.ShSize;
+      if (!config->propellerSplitFuncs || n.Freq)
+        cfgSize += n.ShSize;
+    });
+
+    Cluster *c = new Cluster(cfg, ClusterCount++);
+    c->Weight = cfgWeight;
+    c->Size = std::max(cfgSize, (uint64_t)1);
+    cfgWeightMap.emplace(cfg, c->getDensity());
+    Clusters.emplace(c->Id, c);
+    CFGToClusterMap[cfg] = c;
+  }
+
+  // Sort the hot cfgs in decreasing order of their execution density.
+  std::stable_sort(HotCFGs.begin(), HotCFGs.end(),
+                   [&cfgWeightMap](ELFCfg *cfg1, ELFCfg *cfg2) {
+                     return cfgWeightMap[cfg1] > cfgWeightMap[cfg2];
+                   });
+
+  for (ELFCfg *cfg : HotCFGs) {
+    if (cfgWeightMap[cfg] <= 0.005)
+      break;
+    auto *cluster = CFGToClusterMap[cfg];
+    assert(cluster);
+    // Ignore merging if the cluster containing this function is bigger than
+    // 2MB (the size of a large page).
+    if (cluster->Size > ClusterMergeSizeThreshold)
+      continue;
+
+    ELFCfg *predecessorCfg = getMostLikelyPredecessor(cfg, cluster);
+    if (!predecessorCfg)
+      continue;
+    auto *predecessorCluster = CFGToClusterMap[predecessorCfg];
+
+    assert(predecessorCluster != cluster && predecessorCfg != cfg);
+
+    // Join the two clusters into predecessorCluster.
+    predecessorCluster->mergeWith(*cluster);
+
+    // Update the cfg to cluster mapping, because all cfgs that were
+    // previously in cluster are now in predecessorCluster.
+    for (ELFCfg *cfg : cluster->CFGs) {
+      CFGToClusterMap[cfg] = predecessorCluster;
+    }
+
+    // Delete the defunct cluster.
+    Clusters.erase(cluster->Id);
+  }
+}
+
+// This function sorts all remaining clusters in decreasing order of their
+// execution density.
+void CallChainClustering::sortClusters(std::vector &clusterOrder) { + for (auto &p : Clusters) + clusterOrder.push_back(p.second.get()); + std::sort(clusterOrder.begin(), clusterOrder.end(), + [](Cluster *c1, Cluster *c2) { + // Set a deterministic order when execution densities are equal. + if (c1->getDensity() == c2->getDensity()) + return c1->CFGs.front()->getEntryNode()->MappedAddr < + c2->CFGs.front()->getEntryNode()->MappedAddr; + return c1->getDensity() > c2->getDensity(); + }); +} + +// This function performs CallChainClustering on all cfgs and then orders all +// the built clusters based on their execution density. It places all cold +// functions after hot functions and returns the number of hot functions. +unsigned CallChainClustering::doOrder(std::list &cfgOrder) { + mergeClusters(); + std::vector clusterOrder; + sortClusters(clusterOrder); + for (Cluster *c : clusterOrder) { + cfgOrder.insert(cfgOrder.end(), c->CFGs.begin(), c->CFGs.end()); + } + + cfgOrder.insert(cfgOrder.end(), ColdCFGs.begin(), ColdCFGs.end()); + return HotCFGs.size(); +} + +} // namespace propeller +} // namespace lld Index: lld/test/ELF/propeller/propeller-layout-function-ordering.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-ordering.s @@ -0,0 +1,75 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises function reordering on three functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. +## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. 
+# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/propeller-layout-function-with-loop.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-with-loop.s @@ -0,0 +1,130 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 14 Nfoo" > %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 3 1.2" >> %t_prof.propeller +# RUN: echo "4 5 1.3" >> %t_prof.propeller +# RUN: echo "5 4 1.4" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "4 2 90" >> %t_prof.propeller +# RUN: echo "2 4 95" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 10" >> %t_prof.propeller +# RUN: echo "1 2 5" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 foo: +# REORDER-NEXT: nopl (%rax) + +# REORDER: 0000000000201123 a.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne 9 + +# REORDER: 0000000000201128 aaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne -10 + +# REORDER: 000000000020112d aaaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: retq + +# REORDER: 0000000000201131 aa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jmp -14 + +## Disable basic block reordering and expect that the original bb order is retained. 
+# +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out +# RUN: diff %t.propeller.noreorder.out %t.out + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +foo: + nopl (%rax) + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + jmp aaa.BB.foo +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + jne a.BB.foo + jmp aaaa.BB.foo +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits,unique,4 +aaaa.BB.foo: + nopl (%rax) + ret +.Laaaa.BB.foo_end: + .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo Index: lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s @@ -0,0 +1,87 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises optimal basic block reordering one a single function. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. 
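+##
+## As a rough sketch (using the default ExtTSP weights and jump distances, and
+## the block sizes listed above), the two layouts score approximately:
+##   foo, a.BB.foo, aaa.BB.foo, aa.BB.foo:
+##     100 * 1 + 100 * 1 + 150 * 0.1 * (1 - 5/1024)                 ~= 214.9
+##   foo, aaa.BB.foo, a.BB.foo, aa.BB.foo (naive fallthrough choice):
+##     150 * 1 + 100 * 0.1 * (1 - 4/1024) + 100 * 0.1 * (1 - 9/640) ~= 169.8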
+ +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo