Index: lld/ELF/PropellerBBReordering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.h @@ -0,0 +1,344 @@ +//===- PropellerBBReordering.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "PropellerELFCfg.h" + +#include "lld/Common/LLVM.h" +#include "llvm/ADT/DenseMap.h" + +#include +#include +#include + +using llvm::DenseMap; + +namespace lld { +namespace propeller { + +enum MergeOrder { + Begin, + X2X1Y = Begin, + BeginNext, + X1YX2 = BeginNext, + X2YX1, + YX2X1, + End +}; + +// Represents a chain of nodes (basic blocks). +class NodeChain { +public: + // Representative node of the chain, with which it is initially constructed. + const CFGNode *DelegateNode; + std::vector Nodes; + + // Total binary size of the chain + uint32_t Size; + + // Total execution frequency of the chain + uint64_t Freq; + + // Extended TSP score of the chain + double Score; + + // Constructor for building a NodeChain from a single Node + NodeChain(const CFGNode *Node) + : DelegateNode(Node), Nodes(1, Node), Size(Node->ShSize), + Freq(Node->Freq) {} + + double execDensity() const { + return ((double)Freq) / std::max(Size, (uint32_t)1); + } +}; + +// BB Chain builder based on the ExtTSP metric +class NodeChainBuilder { +private: + class NodeChainAssembly; + class NodeChainSlice; + + // Cfg representing a single function. + const ControlFlowGraph *CFG; + + // Set of built chains, keyed by section index of their Delegate Nodes. + DenseMap> Chains; + + // Map from every node to its containing chain. 
+ // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeToChainMap; + + // Map from every node to its (binary) offset in its containing chain. + // This map will be updated as chains keep merging together during the + // algorithm. + DenseMap NodeOffsetMap; + + // These represent all the edges which are -- based on the profile -- the only + // (executed) outgoing edges from their source node and the only (executed) + // incoming edges to their sink nodes. The algorithm will make sure that these + // edges form fall-throughs in the final order. + DenseMap MutuallyForcedOut; + + // This maps every (ordered) pair of chains (with the first chain in the pair + // potentially splittable) to the highest-gain NodeChainAssembly for those + // chains. + DenseMap, + std::unique_ptr> + NodeChainAssemblies; + + // This map stores the candidate chains for each chain. + // + // For every chain, its candidate chains are the chains which can increase the + // overall ExtTSP score when merged with that chain. This is used to update + // the NodeChainAssemblies map whenever chains merge together. The candidate + // chains of a chain may also be updated as result of a merge. + std::unordered_map> + CandidateChains; + + void sortChainsByExecutionDensity(std::vector &ChainOrder); + + void initializeExtTSP(); + void attachFallThroughs(); + + // This function tries to place two nodes immediately adjacent to + // each other (used for fallthroughs). + // Returns true if this can be done. 
+ bool attachNodes(const CFGNode *src, const CFGNode *sink); + + void mergeChainEdges(NodeChain *splitChain, NodeChain *unSplitChain); + + void mergeChains(NodeChain *leftChain, NodeChain *rightChain); + void mergeChains(std::unique_ptr assembly); + + // Recompute the ExtTSP score of a chain + double computeExtTSPScore(NodeChain *chain) const; + + // Update the related NodeChainAssembly records for two chains, with the + // assumption that UnsplitChain has been merged into SplitChain. + bool updateNodeChainAssembly(NodeChain *splitChain, NodeChain *unsplitChain); + + void computeChainOrder(std::vector &chainOrder); + + // Initialize the mutuallyForcedOut map + void initMutuallyForcedEdges(); + + // Initialize basic block chains, with one chain for every node + void initNodeChains(); + + uint32_t getNodeOffset(const CFGNode *node) const { + auto it = NodeOffsetMap.find(node); + assert("Node does not exist in the offset map." && + it != NodeOffsetMap.end()); + return it->second; + } + + NodeChain *getNodeChain(const CFGNode *node) const { + auto it = NodeToChainMap.find(node); + assert("Node does not exist in the chain map." && + it != NodeToChainMap.end()); + return it->second; + } + +public: + NodeChainBuilder(const ControlFlowGraph *_CFG) : CFG(_CFG) { + initNodeChains(); + initMutuallyForcedEdges(); + } + + // This invokes the Extended TSP algorithm, orders the hot and cold basic + // blocks and inserts their associated symbols at the corresponding locations + // specified by the parameters (HotPlaceHolder and ColdPlaceHolder) in the + // given SymbolList. 
+ void doSplitOrder(std::list &SymbolList, + std::list::iterator HotPlaceHolder, + std::list::iterator ColdPlaceHolder); +}; + +class NodeChainBuilder::NodeChainSlice { +private: + // Chain from which this slice comes from + NodeChain *Chain; + + // The endpoints of the slice in the corresponding chain + std::vector::iterator Begin, End; + + // The offsets corresponding to the two endpoints + uint32_t BeginOffset, EndOffset; + + // Constructor for building a chain slice from a given chain and the two + // endpoints of the chain. + NodeChainSlice(NodeChain *c, std::vector::iterator begin, + std::vector::iterator end, + const NodeChainBuilder &chainBuilder) + : Chain(c), Begin(begin), End(end) { + + BeginOffset = chainBuilder.getNodeOffset(*begin); + if (End == Chain->Nodes.end()) + EndOffset = Chain->Size; + else + EndOffset = chainBuilder.getNodeOffset(*end); + } + + // (Binary) size of this slice + uint32_t size() const { return EndOffset - BeginOffset; } + + friend class NodeChainBuilder; + friend class NodeChainAssembly; +}; + +class NodeChainBuilder::NodeChainAssembly { +private: + // Total ExtTSP score of this NodeChainAssembly + double Score = 0; + + // Whether the ExtTSP score has been computed for this NodeChainAssembly + bool ScoreComputed = false; + + // Corresponding NodeChainBuilder of the assembled chains + const NodeChainBuilder *ChainBuilder; + + // The two assembled chains + NodeChain *SplitChain, *UnsplitChain; + + // The slice position of Splitchain + std::vector::iterator SlicePosition; + + // The three chain slices + std::vector Slices; + + NodeChainAssembly(NodeChain *chainX, NodeChain *chainY, + std::vector::iterator slicePosition, + MergeOrder mergeOrder, const NodeChainBuilder *chainBuilder) + : ChainBuilder(chainBuilder), SplitChain(chainX), UnsplitChain(chainY), + SlicePosition(slicePosition) { + NodeChainSlice x1(chainX, chainX->Nodes.begin(), SlicePosition, + *chainBuilder); + NodeChainSlice x2(chainX, SlicePosition, chainX->Nodes.end(), 
+ *chainBuilder); + NodeChainSlice y(chainY, chainY->Nodes.begin(), chainY->Nodes.end(), + *chainBuilder); + + switch (mergeOrder) { + case MergeOrder::X2X1Y: + Slices = {x2, x1, y}; + break; + case MergeOrder::X1YX2: + Slices = {x1, y, x2}; + break; + case MergeOrder::X2YX1: + Slices = {x2, y, x1}; + break; + case MergeOrder::YX2X1: + Slices = {y, x2, x1}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + } + + double extTSPScore() { + if (ScoreComputed) + return Score; + else { + ScoreComputed = true; + return (Score = computeExtTSPScore()); + } + } + + // Return the gain in ExtTSP score achieved by this NodeChainAssembly once it + // is accordingly applied to the two chains. + double extTSPScoreGain() { + return extTSPScore() - SplitChain->Score - UnsplitChain->Score; + } + + // Find the NodeChainSlice in this NodeChainAssembly which contains the given + // node. If the node is not contained in this NodeChainAssembly, then return + // false. Otherwise, set idx equal to the index of the corresponding slice and + // return true. + bool findSliceIndex(const CFGNode *node, uint8_t &idx) const { + // First find the chain containing the given node. + NodeChain *chain = ChainBuilder->getNodeChain(node); + + // Return false if the found chain is not used in this NodeChainAssembly. + if (SplitChain != chain && UnsplitChain != chain) + return false; + + // Find the slice containing the node using the node's offset in the chain + auto offset = ChainBuilder->getNodeOffset(node); + + for (idx = 0; idx < 3; ++idx) { + if (chain != Slices[idx].Chain) + continue; + if (offset < Slices[idx].BeginOffset) + continue; + if (offset > Slices[idx].EndOffset) + continue; + if (offset < Slices[idx].EndOffset && offset > Slices[idx].BeginOffset) + return true; + // A node can have zero size, which means multiple nodes may be associated + // with the same offset. 
This means that if the node's offset is at the + // beginning or the end of the slice, the node may reside in either slices + // of the chain. + if (offset == Slices[idx].EndOffset) { + // If offset is at the end of the slice, iterate backwards over the + // slice to find a zero-sized node. + for (auto nodeIt = std::prev(Slices[idx].End); + nodeIt != std::prev(Slices[idx].Begin); nodeIt--) { + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if (!(*nodeIt)->ShSize) + break; + if (*nodeIt == node) + return true; + } + } + if (offset == Slices[idx].BeginOffset) { + // If offset is at the beginning of the slice, iterate forwards over the + // slice to find the node. + for (auto nodeIt = Slices[idx].Begin; nodeIt != Slices[idx].End; + nodeIt++) { + if (*nodeIt == node) + return true; + // Stop iterating if the node's size is non-zero as this would change + // the offset. + if ((*nodeIt)->ShSize) + break; + } + } + } + return false; + } + + // Total Extended TSP score of this NodeChainAssembly once it is assembled + // accordingly. + double computeExtTSPScore() const; + + // First node in the resulting assembled chain. + const CFGNode *getFirstNode() const { + for (auto &slice : Slices) + if (slice.Begin != slice.End) + return *slice.Begin; + return nullptr; + } + + friend class NodeChainBuilder; + +public: + // We delete the copy constructor to make sure NodeChainAssembly is moved + // rather than copied. 
+ NodeChainAssembly(NodeChainAssembly &&) = default; + // copy constructor is implicitly deleted + // NodeChainAssembly(const NodeChainAssembly&) = delete; + NodeChainAssembly() = delete; +}; + +} // namespace propeller +} // namespace lld +#endif Index: lld/ELF/PropellerBBReordering.cpp =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.cpp @@ -0,0 +1,584 @@ +//===- PropellerBBReordering.cpp -----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of intra-function basic block +// reordering algorithm based on the Extended TSP metric as described in [1]. +// +// The Extend TSP metric (ExtTSP) provides a score for every ordering of basic +// blocks in a function, by combining the gains from fall-throughs and short +// jumps. +// +// Given an ordering of the basic blocks, for a function f, the ExtTSP score is +// computed as follows. +// +// sum_{edges e in f} frequency(e) * weight(e) +// +// where frequency(e) is the execution frequency and weight(e) is computed as +// follows: +// * 1 if distance(Src[e], Sink[e]) = 0 (i.e. fallthrough) +// +// * 0.1 * (1 - distance(Src[e], Sink[e]) / 1024) if Src[e] < Sink[e] and 0 < +// distance(Src[e], Sink[e]) < 1024 (i.e. short forward jump) +// +// * 0.1 * (1 - distance(Src[e], Sink[e]) / 640) if Src[e] > Sink[e] and 0 < +// distance(Src[e], Sink[e]) < 640 (i.e. short backward jump) +// +// * 0 otherwise +// +// +// In short, it computes a weighted sum of frequencies of all edges in the +// control flow graph. 
Each edge gets its weight depending on whether the given
+// ordering makes the edge a fallthrough, a short forward jump, or a short
+// backward jump.
+//
+// Although this problem is NP-hard like the regular TSP, an iterative greedy
+// basic-block-chaining algorithm is used to find a close to optimal solution.
+// This algorithm is described as follows.
+//
+// Starting with one basic block sequence (BB chain) for every basic block, the
+// algorithm iteratively joins BB chains together in order to maximize the
+// extended TSP score of all the chains.
+//
+// Initially, it finds all mutually-forced edges in the profiled CFG. These are
+// all the edges which are -- based on the profile -- the only (executed)
+// outgoing edge from their source node and the only (executed) incoming edges
+// to their sink nodes. Next, the source and sink of all mutually-forced edges
+// are attached together as fallthrough edges.
+//
+// Then, at every iteration, the algorithm tries to merge a pair of BB chains
+// which leads to the highest gain in the ExtTSP score. The algorithm extends
+// the search space by considering splitting short (less than 128 bytes in
+// binary size) BB chains into two chains and then merging these two chains with
+// the other chain in four ways. After every merge, the new merge gains are
+// updated. The algorithm repeats joining BB chains until no additional gain can
+// be achieved. Eventually, it sorts all the existing chains in decreasing order
+// of their execution density, i.e., the total profiled frequency of the chain
+// divided by its binary size.
+//
+// The values used by this algorithm are reconfigurable using lld's propeller
+// flags. Specifically, these parameters are:
+//
+// * propeller-forward-jump-distance: maximum distance of a forward jump
+// (default-set to 1024 in the above equation).
+//
+// * propeller-backward-jump-distance: maximum distance of a backward jump
+// (default-set to 640 in the above equation).
+// +// * propeller-fallthrough-weight: weight of a fallthrough (default-set to 1) +// +// * propeller-forward-jump-weight: weight of a forward jump (default-set to +// 0.1) +// +// * propeller-backward-jump-weight: weight of a backward jump (default-set to +// 0.1) +// +// * propeller-chain-split-threshold: maximum binary size of a BB chain which +// the algorithm will consider for splitting (default-set to 128). +// +// References: +// * [1] A. Newell and S. Pupyrev, Improved Basic Block Reordering, available +// at https://arxiv.org/abs/1809.04676 +//===----------------------------------------------------------------------===// +#include "PropellerBBReordering.h" + +#include "Config.h" +#include "llvm/ADT/DenseSet.h" + +#include +#include + +using lld::elf::config; + +using llvm::DenseSet; +using llvm::detail::DenseMapPair; + +namespace lld { +namespace propeller { + +// Return the Extended TSP score for one edge, given its source to sink +// direction and distance in the layout. +double getEdgeExtTSPScore(const CFGEdge *edge, bool isEdgeForward, + uint32_t srcSinkDistance) { + if (edge->Weight == 0) + return 0; + + if (srcSinkDistance == 0 && (edge->Type == CFGEdge::EdgeType::INTRA_FUNC)) + return edge->Weight * config->propellerFallthroughWeight; + + if (isEdgeForward && srcSinkDistance < config->propellerForwardJumpDistance) + return edge->Weight * config->propellerForwardJumpWeight * + (1.0 - + ((double)srcSinkDistance) / config->propellerForwardJumpDistance); + + if (!isEdgeForward && srcSinkDistance < config->propellerBackwardJumpDistance) + return edge->Weight * config->propellerBackwardJumpWeight * + (1.0 - + ((double)srcSinkDistance) / config->propellerBackwardJumpDistance); + return 0; +} + +// Sort BB chains in decreasing order of their execution density. +// NodeChainBuilder calls this function at the end to ensure that hot BB chains +// are placed at the beginning of the function. 
+// NOTE(review): template arguments appear to have been stripped from this
+// patch text (e.g. "std::vector &chainOrder" and "DenseMapPair>") -- verify
+// the parameter and loop types against the original change before applying.
+void NodeChainBuilder::sortChainsByExecutionDensity(
+    std::vector &chainOrder) {
+  // Collect every live chain; Chains is keyed by the delegate node's section
+  // index and owns the NodeChain objects.
+  for (DenseMapPair> &elem : Chains)
+    chainOrder.push_back(elem.second.get());
+
+  std::sort(
+      chainOrder.begin(), chainOrder.end(),
+      [this](const NodeChain *c1, const NodeChain *c2) {
+        // The chain beginning with the function's entry block always sorts
+        // first, regardless of its execution density.
+        if (c1->Nodes.front() == this->CFG->getEntryNode())
+          return true;
+        if (c2->Nodes.front() == this->CFG->getEntryNode())
+          return false;
+        double c1ExecDensity = c1->execDensity();
+        double c2ExecDensity = c2->execDensity();
+        // Tie-break equal densities on the delegate node's original mapped
+        // address (then its section index) so the order is deterministic.
+        if (c1ExecDensity == c2ExecDensity) {
+          if (c1->DelegateNode->MappedAddr == c2->DelegateNode->MappedAddr)
+            return c1->DelegateNode->Shndx < c2->DelegateNode->Shndx;
+          return c1->DelegateNode->MappedAddr < c2->DelegateNode->MappedAddr;
+        }
+        return c1ExecDensity > c2ExecDensity;
+      });
+}
+
+// NodeChainBuilder calls this function after building all the chains to attach
+// as many fall-throughs as possible. Given that the algorithm already optimizes
+// the extended TSP score, this function will only affect the cold basic blocks
+// and thus we do not need to consider the edge weights.
+void NodeChainBuilder::attachFallThroughs() {
+  // First, try to keep the fall-throughs from the original order.
+  for (auto &node : CFG->Nodes) {
+    if (node->FTEdge != nullptr) {
+      attachNodes(node.get(), node->FTEdge->Sink);
+    }
+  }
+
+  // Sometimes, the original fall-throughs cannot be kept. So we try to find new
+  // fall-through opportunities which did not exist in the original order.
+  for (auto &edge : CFG->IntraEdges) {
+    attachNodes(edge->Src, edge->Sink);
+  }
+}
+
+// Merge two chains in the specified order.
+void NodeChainBuilder::mergeChains(NodeChain *leftChain,
+                                   NodeChain *rightChain) {
+  for (const CFGNode *node : rightChain->Nodes) {
+    leftChain->Nodes.push_back(node);
+    NodeToChainMap[node] = leftChain;
+    // rightChain's nodes are laid out after leftChain's current contents, so
+    // their offsets shift up by leftChain's pre-merge size (leftChain->Size is
+    // only updated after this loop).
+    NodeOffsetMap[node] += leftChain->Size;
+  }
+  leftChain->Size += rightChain->Size;
+  leftChain->Freq += rightChain->Freq;
+  // Drop the defunct (merged-in) chain; Chains owns it, keyed by the section
+  // index of its delegate node.
+  Chains.erase(rightChain->DelegateNode->Shndx);
+}
+
+// This function tries to place two basic blocks immediately adjacent to each
+// other (used for fallthroughs). Returns true if the basic blocks have been
+// attached this way.
+bool NodeChainBuilder::attachNodes(const CFGNode *src, const CFGNode *sink) {
+  // No edge can fall through to the entry basic block.
+  if (sink == CFG->getEntryNode())
+    return false;
+
+  // Ignore edges between hot and cold basic blocks, i.e. where exactly one of
+  // the two endpoints has zero frequency. (Written as an explicit boolean
+  // inequality instead of "src->Freq == 0 ^ sink->Freq == 0", which relies on
+  // subtle operator precedence and triggers -Wparentheses.)
+  if ((src->Freq == 0) != (sink->Freq == 0))
+    return false;
+  NodeChain *srcChain = getNodeChain(src);
+  NodeChain *sinkChain = getNodeChain(sink);
+  // Skip this edge if the source and sink are in the same chain.
+  if (srcChain == sinkChain)
+    return false;
+
+  // It's possible to form a fall-through between src and sink only if
+  // they are respectively located at the end and beginning of their chains.
+  if (srcChain->Nodes.back() != src || sinkChain->Nodes.front() != sink)
+    return false;
+  // Attaching is possible. So we merge the chains in the corresponding order.
+  mergeChains(srcChain, sinkChain);
+  return true;
+}
+
+// Merge two BB sequences according to the given NodeChainAssembly. A
+// NodeChainAssembly is an ordered triple of three slices from two chains.
+void NodeChainBuilder::mergeChains( + std::unique_ptr assembly) { + // Create the new node order according the given assembly + std::vector newNodeOrder; + for (NodeChainSlice &slice : assembly->Slices) { + std::copy(slice.Begin, slice.End, std::back_inserter(newNodeOrder)); + } + // We merge the nodes into the SplitChain + assembly->SplitChain->Nodes = std::move(newNodeOrder); + + // Update nodeOffsetMap and nodeToChainMap for all the nodes in the sequence. + uint32_t runningOffset = 0; + for (const CFGNode *node : assembly->SplitChain->Nodes) { + NodeToChainMap[node] = assembly->SplitChain; + NodeOffsetMap[node] = runningOffset; + runningOffset += node->ShSize; + } + assembly->SplitChain->Size = runningOffset; + + // Update the total frequency and ExtTSP score of the aggregated chain + assembly->SplitChain->Freq += assembly->UnsplitChain->Freq; + // We have already computed the gain in the assembly record. So we can just + // increment the aggregated chain's score by that gain. + assembly->SplitChain->Score += assembly->extTSPScoreGain(); + + // Merge the assembly candidate chains of the two chains into the candidate + // chains of the remaining NodeChain and remove the records for the defunct + // NodeChain. + for (NodeChain *c : CandidateChains[assembly->UnsplitChain]) { + NodeChainAssemblies.erase(std::make_pair(c, assembly->UnsplitChain)); + NodeChainAssemblies.erase(std::make_pair(assembly->UnsplitChain, c)); + CandidateChains[c].erase(assembly->UnsplitChain); + if (c != assembly->SplitChain) + CandidateChains[assembly->SplitChain].insert(c); + } + + // Update the NodeChainAssembly for all candidate chains of the merged + // NodeChain. Remove a NodeChain from the merge chain's candidates if the + // NodeChainAssembly update finds no gain. 
+  auto &splitChainCandidateChains = CandidateChains[assembly->SplitChain];
+
+  for (auto CI = splitChainCandidateChains.begin(),
+            CE = splitChainCandidateChains.end();
+       CI != CE;) {
+    NodeChain *otherChain = *CI;
+    auto &otherChainCandidateChains = CandidateChains[otherChain];
+    // Recompute the best assembly in both directions; keep the candidacy only
+    // if at least one direction still yields a positive gain.
+    bool x = updateNodeChainAssembly(otherChain, assembly->SplitChain);
+    bool y = updateNodeChainAssembly(assembly->SplitChain, otherChain);
+    if (x || y) {
+      otherChainCandidateChains.insert(assembly->SplitChain);
+      CI++;
+    } else {
+      otherChainCandidateChains.erase(assembly->SplitChain);
+      // erase returns the iterator following the removed element, so the loop
+      // advances safely while mutating the set.
+      CI = splitChainCandidateChains.erase(CI);
+    }
+  }
+
+  // remove all the candidate chain records for the merged-in chain
+  CandidateChains.erase(assembly->UnsplitChain);
+
+  // Finally, remove the defunct (merged-in) chain record from the chains
+  Chains.erase(assembly->UnsplitChain->DelegateNode->Shndx);
+}
+
+// Calculate the Extended TSP metric for a BB chain.
+// This function goes over all the BBs in the chain and aggregates the score
+// of all edges whose source and sink are both contained in this chain.
+double NodeChainBuilder::computeExtTSPScore(NodeChain *chain) const {
+  double score = 0;
+  uint32_t srcOffset = 0;
+  for (const CFGNode *node : chain->Nodes) {
+    for (const CFGEdge *edge : node->Outs) {
+      if (!edge->Weight)
+        continue;
+      // Only intra-chain edges contribute to a chain's own score.
+      NodeChain *sinkChain = getNodeChain(edge->Sink);
+      if (sinkChain != chain)
+        continue;
+      auto sinkOffset = getNodeOffset(edge->Sink);
+      // The jump originates at the end of the src node, i.e. at
+      // srcOffset + node->ShSize.
+      bool edgeForward = srcOffset + node->ShSize <= sinkOffset;
+      // Calculate the distance between src and sink
+      uint32_t distance = edgeForward ? sinkOffset - srcOffset - node->ShSize
+                                      : srcOffset - sinkOffset + node->ShSize;
+      score += getEdgeExtTSPScore(edge, edgeForward, distance);
+    }
+    srcOffset += node->ShSize;
+  }
+  return score;
+}
+
+// Updates the best NodeChainAssembly between two NodeChains. The existing
+// record will be replaced by the new NodeChainAssembly if a non-zero gain
+// is achieved.
Otherwise, it will be removed. +// If a nodechain assembly record is (kept) inserted, returns true. Otherwise +// returns false. +bool NodeChainBuilder::updateNodeChainAssembly(NodeChain *splitChain, + NodeChain *unsplitChain) { + // Remove the assembly record + NodeChainAssemblies.erase(std::make_pair(splitChain, unsplitChain)); + + // Only consider splitting the chain if the size of the chain is smaller than + // a threshold. + bool doSplit = (splitChain->Size <= config->propellerChainSplitThreshold); + auto slicePosEnd = + doSplit ? splitChain->Nodes.end() : std::next(splitChain->Nodes.begin()); + + SmallVector, 128> candidateNCAs; + + for (auto slicePos = splitChain->Nodes.begin(); slicePos != slicePosEnd; + ++slicePos) { + // Do not split the mutually-forced edges in the chain. + if (slicePos != splitChain->Nodes.begin() && + MutuallyForcedOut[*std::prev(slicePos)] == *slicePos) + continue; + + // If the split position is at the beginning (no splitting), only consider + // one MergeOrder + auto mergeOrderEnd = (slicePos == splitChain->Nodes.begin()) + ? MergeOrder::BeginNext + : MergeOrder::End; + + for (uint8_t MI = MergeOrder::Begin; MI != mergeOrderEnd; MI++) { + MergeOrder mOrder = static_cast(MI); + + // Create the NodeChainAssembly representing this particular assembly. + auto NCA = std::unique_ptr(new NodeChainAssembly( + splitChain, unsplitChain, slicePos, mOrder, this)); + + CFGNode *entryNode = CFG->getEntryNode(); + if ((NCA->SplitChain->Nodes.front() == entryNode || + NCA->UnsplitChain->Nodes.back() == entryNode) && + NCA->getFirstNode() != entryNode) + continue; + candidateNCAs.push_back(std::move(NCA)); + } + } + + // Insert the best assembly of the two chains (only if it has positive gain). 
+ auto bestCandidate = + std::max_element(candidateNCAs.begin(), candidateNCAs.end(), + [](std::unique_ptr &c1, + std::unique_ptr &c2) { + return c1->extTSPScoreGain() < c2->extTSPScoreGain(); + }); + + if (bestCandidate != candidateNCAs.end() && + (*bestCandidate)->extTSPScoreGain() > 0) { + NodeChainAssemblies.try_emplace(std::make_pair(splitChain, unsplitChain), + std::move(*bestCandidate)); + return true; + } else + return false; +} + +void NodeChainBuilder::initNodeChains() { + for (auto &node : CFG->Nodes) { + NodeChain *chain = new NodeChain(node.get()); + NodeToChainMap[node.get()] = chain; + NodeOffsetMap[node.get()] = 0; + Chains.try_emplace(node->Shndx, chain); + } +} + +// Find all the mutually-forced edges. +// These are all the edges which are -- based on the profile -- the only +// (executed) outgoing edge from their source node and the only (executed) +// incoming edges to their sink nodes +void NodeChainBuilder::initMutuallyForcedEdges() { + DenseMap> profiledOuts; + DenseMap> profiledIns; + + for (auto &node : CFG->Nodes) { + std::copy_if(node->Outs.begin(), node->Outs.end(), + std::back_inserter(profiledOuts[node.get()]), + [](const CFGEdge *edge) { + return edge->Type == CFGEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + std::copy_if(node->Ins.begin(), node->Ins.end(), + std::back_inserter(profiledIns[node.get()]), + [](const CFGEdge *edge) { + return edge->Type == CFGEdge::EdgeType::INTRA_FUNC && + edge->Weight != 0; + }); + } + + for (auto &node : CFG->Nodes) { + if (profiledOuts[node.get()].size() == 1) { + CFGEdge *edge = profiledOuts[node.get()].front(); + if (profiledIns[edge->Sink].size() == 1) + MutuallyForcedOut[node.get()] = edge->Sink; + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + DenseMap nodeToPathMap; + SmallVector cycleCutNodes; + unsigned pathCount = 0; + for (auto it = MutuallyForcedOut.begin(); it != 
MutuallyForcedOut.end(); + ++it) { + // Check to see if the node (and its cycle) have already been visited. + if (nodeToPathMap[it->first]) + continue; + const CFGEdge *victimEdge = nullptr; + auto nodeIt = it; + pathCount++; + while (nodeIt != MutuallyForcedOut.end()) { + const CFGNode *node = nodeIt->first; + unsigned path = nodeToPathMap[node]; + if (path != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (path == pathCount) { + // We have found a cycle: add the victim edge + cycleCutNodes.push_back(victimEdge->Src); + } + break; + } else + nodeToPathMap[node] = pathCount; + const CFGEdge *edge = profiledOuts[node].front(); + if (!victimEdge || + (edge->Sink->MappedAddr < victimEdge->Sink->MappedAddr)) { + victimEdge = edge; + } + nodeIt = MutuallyForcedOut.find(nodeIt->second); + } + } + + // Remove the victim edges to break cycles in the mutually forced edges + for (const CFGNode *node : cycleCutNodes) + MutuallyForcedOut.erase(node); +} + +// This function initializes the ExtTSP algorithm's data structures. This +// the NodeChainAssemblies and the CandidateChains maps. +void NodeChainBuilder::initializeExtTSP() { + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. 
+ DenseSet> visited; + for (DenseMapPair> &elem : Chains) { + NodeChain *chain = elem.second.get(); + chain->Score = computeExtTSPScore(chain); + for (const CFGNode *node : chain->Nodes) { + for (const CFGEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + NodeChain *otherChain = getNodeChain(edge->Sink); + if (chain == otherChain) + continue; + if (visited.count(std::make_pair(chain, otherChain))) + continue; + bool x = updateNodeChainAssembly(chain, otherChain); + bool y = updateNodeChainAssembly(otherChain, chain); + if (x || y) { + CandidateChains[chain].insert(otherChain); + CandidateChains[otherChain].insert(chain); + } + visited.insert(std::make_pair(chain, otherChain)); + visited.insert(std::make_pair(otherChain, chain)); + } + } + } +} + +void NodeChainBuilder::computeChainOrder( + std::vector &chainOrder) { + // Attach the mutually-foced edges (which will not be split anymore by the + // Extended TSP algorithm). + for (auto &kv : MutuallyForcedOut) + attachNodes(kv.first, kv.second); + + // Initialize the Extended TSP algorithm's data. + initializeExtTSP(); + + // Keep merging the chain assembly record with the highest ExtTSP gain, until + // no more gain is possible. 
+ bool merged = false; + do { + merged = false; + auto bestCandidate = std::max_element( + NodeChainAssemblies.begin(), NodeChainAssemblies.end(), + [](DenseMapPair, + std::unique_ptr> &c1, + DenseMapPair, + std::unique_ptr> &c2) { + return c1.second->extTSPScoreGain() < c2.second->extTSPScoreGain(); + }); + + if (bestCandidate != NodeChainAssemblies.end() && + bestCandidate->second->extTSPScoreGain() > 0) { + std::unique_ptr bestCandidateNCA = + std::move(bestCandidate->second); + NodeChainAssemblies.erase(bestCandidate); + mergeChains(std::move(bestCandidateNCA)); + merged = true; + } + } while (merged); + + // Merge fallthrough basic blocks if we have missed any + attachFallThroughs(); + + sortChainsByExecutionDensity(chainOrder); +} + +// This function computes the ExtTSP score for a chain assembly record. This +// goes the three BB slices in the assembly record and considers all edges +// whose source and sink belongs to the chains in the assembly record. +double NodeChainBuilder::NodeChainAssembly::computeExtTSPScore() const { + double score = 0; + for (uint8_t srcSliceIdx = 0; srcSliceIdx < 3; ++srcSliceIdx) { + const NodeChainSlice &srcSlice = Slices[srcSliceIdx]; + uint32_t srcNodeOffset = srcSlice.BeginOffset; + for (auto nodeIt = srcSlice.Begin; nodeIt != srcSlice.End; + srcNodeOffset += (*nodeIt)->ShSize, ++nodeIt) { + const CFGNode *node = *nodeIt; + for (const CFGEdge *edge : node->Outs) { + if (!edge->Weight) + continue; + + uint8_t sinkSliceIdx; + + if (findSliceIndex(edge->Sink, sinkSliceIdx)) { + auto sinkNodeOffset = ChainBuilder->getNodeOffset(edge->Sink); + bool edgeForward = (srcSliceIdx < sinkSliceIdx) || + (srcSliceIdx == sinkSliceIdx && + (srcNodeOffset + node->ShSize <= sinkNodeOffset)); + + uint32_t distance = 0; + + if (srcSliceIdx == sinkSliceIdx) { + distance = edgeForward + ? 
sinkNodeOffset - srcNodeOffset - node->ShSize + : srcNodeOffset - sinkNodeOffset + node->ShSize; + } else { + const NodeChainSlice &sinkSlice = Slices[sinkSliceIdx]; + distance = edgeForward + ? srcSlice.EndOffset - srcNodeOffset - node->ShSize + + sinkNodeOffset - sinkSlice.BeginOffset + : srcNodeOffset - srcSlice.BeginOffset + + node->ShSize + sinkSlice.EndOffset - + sinkNodeOffset; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. + if (std::abs(sinkSliceIdx - srcSliceIdx) == 2) + distance += Slices[1].size(); + } + score += getEdgeExtTSPScore(edge, edgeForward, distance); + } + } + } + } + return score; +} + +void NodeChainBuilder::doSplitOrder( + std::list &symbolList, + std::list::iterator hotPlaceHolder, + std::list::iterator coldPlaceHolder) { + std::vector chainOrder; + computeChainOrder(chainOrder); + + for (const NodeChain *c : chainOrder) { + std::list::iterator insertPos = + c->Freq ? hotPlaceHolder : coldPlaceHolder; + for (const CFGNode *n : c->Nodes) + symbolList.insert(insertPos, n->ShName); + } +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/PropellerFuncOrdering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.h @@ -0,0 +1,72 @@ +//===- PropellerFuncReordering.h ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLD_ELF_PROPELLER_FUNC_ORDERING_H
+#define LLD_ELF_PROPELLER_FUNC_ORDERING_H
+
+#include "Propeller.h"
+
+#include <list>
+#include <map>
+#include <vector>
+
+namespace lld {
+namespace propeller {
+
+class ControlFlowGraph;
+
+class CallChainClustering {
+public:
+  class Cluster {
+  public:
+    Cluster(ControlFlowGraph *cfg, unsigned);
+    // All cfgs in this cluster
+    std::vector<ControlFlowGraph *> CFGs;
+    // Unique id associated with the cluster
+    unsigned Id;
+    // Total binary size of this cluster (only the hot part if using
+    // split-funcs).
+    uint64_t Size;
+    // Total byte-level execution frequency of the cluster
+    uint64_t Weight;
+
+    // Merge the "other" cluster into this cluster.
+    Cluster &mergeWith(Cluster &other) {
+      CFGs.insert(CFGs.end(), other.CFGs.begin(), other.CFGs.end());
+      this->Weight += other.Weight;
+      this->Size += other.Size;
+      return *this;
+    }
+
+    // Returns the per-byte execution density of this cluster
+    double getDensity() { return ((double)Weight) / Size; }
+  };
+
+  CallChainClustering() {}
+
+  void init(Propeller &propeller);
+
+  unsigned doOrder(std::list<ControlFlowGraph *> &cfgOrder);
+
+private:
+  unsigned ClusterCount = 0;
+
+  ControlFlowGraph *getMostLikelyPredecessor(ControlFlowGraph *cfg,
+                                             Cluster *cluster);
+
+  void mergeClusters();
+  void sortClusters(std::vector<Cluster *> &);
+
+  std::vector<ControlFlowGraph *> HotCFGs, ColdCFGs;
+  std::map<ControlFlowGraph *, Cluster *> CFGToClusterMap;
+  std::map<unsigned, std::unique_ptr<Cluster>> Clusters;
+};
+
+} // namespace propeller
+} // namespace lld
+
+#endif
Index: lld/ELF/PropellerFuncOrdering.cpp
===================================================================
--- /dev/null
+++ lld/ELF/PropellerFuncOrdering.cpp
@@ -0,0 +1,198 @@
+//===- PropellerFuncOrdering.cpp ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM
+// Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// This file is part of the Propeller infrastructure for doing code layout +// optimization and includes the implementation of function reordering based on +// the CallChainClustering algorithm as published in [1]. +// +// This algorithm keeps merging functions together into clusters until the +// cluster sizes reach a limit. The algorithm iterates over functions in +// decreasing order of their execution density (total frequency divided by size) +// and for each function, it first finds the cluster containing the +// most-frequent caller of that function and then places the caller's cluster +// right before the callee's cluster. Finally, all the remaining clusters are +// ordered in decreasing order of their execution density. +// +// References: +// * [1] G.Ottoni and B.Maher, Optimizing Function Placement for Large-Scale +// Data-Center Applications, CGO 2017. available at +// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +//===----------------------------------------------------------------------===// +#include "PropellerFuncOrdering.h" + +#include "Config.h" +#include "Propeller.h" +#include "PropellerELFCfg.h" + +#include +#include + +using lld::elf::config; + +namespace lld { +namespace propeller { + +const unsigned ClusterMergeSizeThreshold = 1 << 21; + +// Initializes the CallChainClustering algorithm with the cfgs from propeller. +// It separates cfgs into hot and cold cfgs and initially orders each collection +// of cfgs based on the address of their corresponding functions in the original +// binary. 
+void CallChainClustering::init(Propeller &propeller) {
+  propeller.forEachCfgRef([this](ControlFlowGraph &cfg) {
+    if (cfg.isHot())
+      this->HotCFGs.push_back(&cfg);
+    else
+      this->ColdCFGs.push_back(&cfg);
+  });
+
+  auto cfgComparator = [](ControlFlowGraph *cfg1,
+                          ControlFlowGraph *cfg2) -> bool {
+    return cfg1->getEntryNode()->MappedAddr < cfg2->getEntryNode()->MappedAddr;
+  };
+  auto sortCFGs = [&cfgComparator](std::vector<ControlFlowGraph *> &cfgs) {
+    std::sort(cfgs.begin(), cfgs.end(), cfgComparator);
+  };
+  sortCFGs(HotCFGs);
+  sortCFGs(ColdCFGs);
+}
+
+// Initializes a cluster containing a single cfg and associates it with a
+// unique id.
+CallChainClustering::Cluster::Cluster(ControlFlowGraph *cfg, unsigned id)
+    : CFGs(1, cfg), Id(id) {}
+
+// Returns the most frequent caller of a function. This function also gets as
+// the second parameter the cluster containing this function to save a lookup
+// into the CFGToClusterMap.
+ControlFlowGraph *
+CallChainClustering::getMostLikelyPredecessor(ControlFlowGraph *cfg,
+                                              Cluster *cluster) {
+  CFGNode *entry = cfg->getEntryNode();
+  if (!entry)
+    return nullptr;
+  CFGEdge *bestCallIn = nullptr;
+
+  // Iterate over all callers of the entry basic block of the function.
+  for (CFGEdge *callIn : entry->CallIns) {
+    auto *caller = callIn->Src->CFG;
+    auto *callerCluster = CFGToClusterMap[caller];
+    assert(caller->isHot());
+    // Ignore callers from the same function, or the same cluster
+    if (caller == cfg || callerCluster == cluster)
+      continue;
+    // Ignore callers with overly large clusters
+    if (callerCluster->Size > ClusterMergeSizeThreshold)
+      continue;
+    // Ignore calls which are cold relative to the callee
+    if (callIn->Weight * 10 < entry->Freq)
+      continue;
+    // Do not merge if the caller cluster's density would degrade by more than
+    // 1/8 if merged.
+    // Skip when mergedDensity = (W1+W2)/(S1+S2) < callerDensity/8 = W2/(8*S2),
+    // i.e. 8*S2*(W1+W2) < W2*(S1+S2). The weights must be ADDED here; the
+    // original "cluster->Weight * callerCluster->Weight" compared weight^2
+    // against weight and made the guard meaningless.
+    if (8 * callerCluster->Size * (cluster->Weight + callerCluster->Weight) <
+        callerCluster->Weight * (cluster->Size + callerCluster->Size))
+      continue;
+    // Update the best CallIn edge if needed
+    if (!bestCallIn || bestCallIn->Weight < callIn->Weight)
+      bestCallIn = callIn;
+  }
+  return bestCallIn ? bestCallIn->Src->CFG : nullptr;
+}
+
+// Merge clusters together based on the CallChainClustering algorithm.
+void CallChainClustering::mergeClusters() {
+  // Build a map for the execution density of each cfg. This value will depend
+  // on whether function-splitting is used or not.
+  std::map<ControlFlowGraph *, double> cfgWeightMap;
+  for (ControlFlowGraph *cfg : HotCFGs) {
+    uint64_t cfgWeight = 0;
+    uint64_t cfgSize = 0;
+    cfg->forEachNodeRef([&cfgSize, &cfgWeight](CFGNode &n) {
+      cfgWeight += n.Freq * n.ShSize;
+      if (!config->propellerSplitFuncs || n.Freq)
+        cfgSize += n.ShSize;
+    });
+
+    Cluster *c = new Cluster(cfg, ClusterCount++);
+    c->Weight = cfgWeight;
+    c->Size = std::max(cfgSize, (uint64_t)1);
+    cfgWeightMap.emplace(cfg, c->getDensity());
+    Clusters.emplace(c->Id, c);
+    CFGToClusterMap[cfg] = c;
+  }
+
+  // Sort the hot cfgs in decreasing order of their execution density.
+  std::stable_sort(
+      HotCFGs.begin(), HotCFGs.end(),
+      [&cfgWeightMap](ControlFlowGraph *cfg1, ControlFlowGraph *cfg2) {
+        return cfgWeightMap[cfg1] > cfgWeightMap[cfg2];
+      });
+
+  for (ControlFlowGraph *cfg : HotCFGs) {
+    if (cfgWeightMap[cfg] <= 0.005)
+      break;
+    auto *cluster = CFGToClusterMap[cfg];
+    // Check the pointer before it is dereferenced below.
+    assert(cluster);
+    // Ignore merging if the cluster containing this function is bigger than
+    // 2MBs (size of a large page).
+    if (cluster->Size > ClusterMergeSizeThreshold)
+      continue;
+
+    ControlFlowGraph *predecessorCfg = getMostLikelyPredecessor(cfg, cluster);
+    if (!predecessorCfg)
+      continue;
+    auto *predecessorCluster = CFGToClusterMap[predecessorCfg];
+
+    assert(predecessorCluster != cluster && predecessorCfg != cfg);
+
+    // Join the two clusters into predecessorCluster.
+    predecessorCluster->mergeWith(*cluster);
+
+    // Update cfg to cluster mapping, because all cfgs that were
+    // previously in cluster are now in predecessorCluster.
+    for (ControlFlowGraph *cfg : cluster->CFGs) {
+      CFGToClusterMap[cfg] = predecessorCluster;
+    }
+
+    // Delete the defunct cluster
+    Clusters.erase(cluster->Id);
+  }
+}
+
+// This function sorts all remaining clusters in decreasing order of their
+// execution density.
+void CallChainClustering::sortClusters(std::vector<Cluster *> &clusterOrder) {
+  for (auto &p : Clusters)
+    clusterOrder.push_back(p.second.get());
+  std::sort(clusterOrder.begin(), clusterOrder.end(),
+            [](Cluster *c1, Cluster *c2) {
+              // Set a deterministic order when execution densities are equal.
+              if (c1->getDensity() == c2->getDensity())
+                return c1->CFGs.front()->getEntryNode()->MappedAddr <
+                       c2->CFGs.front()->getEntryNode()->MappedAddr;
+              return c1->getDensity() > c2->getDensity();
+            });
+}
+
+// This function performs CallChainClustering on all cfgs and then orders all
+// the built clusters based on their execution density. It places all cold
+// functions after hot functions and returns the number of hot functions.
+unsigned CallChainClustering::doOrder(std::list<ControlFlowGraph *> &cfgOrder) {
+  mergeClusters();
+  std::vector<Cluster *> clusterOrder;
+  sortClusters(clusterOrder);
+  for (Cluster *c : clusterOrder) {
+    cfgOrder.insert(cfgOrder.end(), c->CFGs.begin(), c->CFGs.end());
+  }
+
+  cfgOrder.insert(cfgOrder.end(), ColdCFGs.begin(), ColdCFGs.end());
+  return HotCFGs.size();
+}
+
+} // namespace propeller
+} // namespace lld
Index: lld/test/ELF/propeller/propeller-layout-function-ordering.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/propeller-layout-function-ordering.s
@@ -0,0 +1,75 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises function reordering on three functions.
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out | FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. +## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. 
+# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/propeller-layout-function-with-loop.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-with-loop.s @@ -0,0 +1,130 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 14 Nfoo" > %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 3 1.2" >> %t_prof.propeller +# RUN: echo "4 5 1.3" >> %t_prof.propeller +# RUN: echo "5 4 1.4" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "4 2 90" >> %t_prof.propeller +# RUN: echo "2 4 95" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 10" >> %t_prof.propeller +# RUN: echo "1 2 5" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 foo: +# REORDER-NEXT: nopl (%rax) + +# REORDER: 0000000000201123 a.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne 9 + +# REORDER: 0000000000201128 aaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jne -10 + +# 
REORDER: 000000000020112d aaaa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: retq + +# REORDER: 0000000000201131 aa.BB.foo: +# REORDER-NEXT: nopl (%rax) +# REORDER-NEXT: jmp -14 + +## Disable basic block reordering and expect that the original bb order is retained. +# +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out +# RUN: diff %t.propeller.noreorder.out %t.out + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +foo: + nopl (%rax) + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + jmp aaa.BB.foo +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + jne a.BB.foo + jmp aaaa.BB.foo +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits,unique,4 +aaaa.BB.foo: + nopl (%rax) + ret +.Laaaa.BB.foo_end: + .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo Index: lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s @@ -0,0 +1,87 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises optimal basic block reordering one a single function. 
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + 
jmp aa.BB.foo +.La.BB.foo_end: + .size a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo Index: lld/test/ELF/propeller/propeller-opt-all-combinations.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-opt-all-combinations.s @@ -0,0 +1,295 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises all combinations of propeller options (reorder-funcs, reorder-blocks, +## and split-funcs) on four functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -optimize-bb-jumps -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: Disassembly of section .text: +# BEFORE-EMPTY: +# BEFORE-NEXT: foo: +# BEFORE-NEXT: xorb %al, 0 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: bar: +# BEFORE-NEXT: xorb %al, 1 +# BEFORE-NEXT: je 9 +# BEFORE-EMPTY: +# BEFORE-NEXT: a.BB.bar: +# BEFORE-NEXT: xorb %al, 2 +# BEFORE-NEXT: jmp 7 +# BEFORE-EMPTY: +# BEFORE-NEXT: aa.BB.bar: +# BEFORE-NEXT: xorb %al, 3 +# BEFORE-EMPTY: +# BEFORE-NEXT: aaa.BB.bar: +# BEFORE-NEXT: xorb %al, 4 +# BEFORE-EMPTY: +# BEFORE-NEXT: baz: +# BEFORE-NEXT: xorb %al, 5 +# BEFORE-NEXT: int3 +# BEFORE-EMPTY: +# BEFORE-NEXT: qux: +# BEFORE-NEXT: xorb %al, 6 +# + +## Create a propeller profile for four functions foo, bar, baz, and qux based on the cfg below: +## +## +## bar +## / \ +## / \100 +## / \ +## 100 v v +## foo <----- a.BB.bar aa.BB.bar baz qux ---+ +## \ / ^ | +## \ /100 | 1 | +## \ / +-----+ +## v v +## aaa.BB.bar +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> 
%t_prof.propeller +# RUN: echo "2 20 Nbar" >> %t_prof.propeller +# RUN: echo "3 9 2.1" >> %t_prof.propeller +# RUN: echo "4 7 2.2" >> %t_prof.propeller +# RUN: echo "5 7 2.3" >> %t_prof.propeller +# RUN: echo "6 8 Nbaz" >> %t_prof.propeller +# RUN: echo "7 7 Nqux" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "4 1 100 C" >> %t_prof.propeller +# RUN: echo "7 7 10 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "4 5 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller --verbose -propeller-keep-named-symbols -o %t.propeller.reorder.out +# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: Disassembly of section .text: +# REORDER-EMPTY: +# REORDER-NEXT: bar: +# REORDER-NEXT: xorb %al, 1 +# REORDER-NEXT: jne 30 +# REORDER-EMPTY: +# REORDER-NEXT: aa.BB.bar: +# REORDER-NEXT: xorb %al, 3 +# REORDER-EMPTY: +# REORDER-NEXT: aaa.BB.bar: +# REORDER-NEXT: xorb %al, 4 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: foo: +# REORDER-NEXT: xorb %al, 0 +# REORDER-NEXT: int3 +# REORDER-EMPTY: +# REORDER-NEXT: qux: +# REORDER-NEXT: xorb %al, 6 +# REORDER-EMPTY: +# REORDER-NEXT: a.BB.bar: +# REORDER-NEXT: xorb %al, 2 +# REORDER-NEXT: jmp -32 +# REORDER-EMPTY: +# REORDER-NEXT: baz: +# REORDER-NEXT: xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorderblocks.out 2>&1 | FileCheck %s --check-prefixes=WARN,IMPLICITNOSPLIT +# WARN-NOT: warning: +# IMPLICITNOSPLIT: warning: propeller: no-reorder-blocks implicitly sets no-split-funcs. 
+# +# RUN: llvm-objdump -d %t.propeller.noreorderblocks.out| FileCheck %s --check-prefix=NO_REORDER_BB +# +# NO_REORDER_BB: Disassembly of section .text: +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: bar: +# NO_REORDER_BB-NEXT: xorb %al, 1 +# NO_REORDER_BB-NEXT: je 9 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: a.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 2 +# NO_REORDER_BB-NEXT: jmp 7 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: aaa.BB.bar: +# NO_REORDER_BB-NEXT: xorb %al, 4 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: foo: +# NO_REORDER_BB-NEXT: xorb %al, 0 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: qux: +# NO_REORDER_BB-NEXT: xorb %al, 6 +# NO_REORDER_BB-NEXT: int3 +# NO_REORDER_BB-EMPTY: +# NO_REORDER_BB-NEXT: baz: +# NO_REORDER_BB-NEXT: xorb %al, 5 + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.noreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.noreorderfuncs.out| FileCheck %s --check-prefix=NO_REORDER_FUNC +# +# NO_REORDER_FUNC: Disassembly of section .text: +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: foo: +# NO_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_REORDER_FUNC-NEXT: jne 22 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_REORDER_FUNC-NEXT: int3 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: qux: +# NO_REORDER_FUNC-NEXT: xorb %al, 6 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: a.BB.bar: +# NO_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_REORDER_FUNC-NEXT: jmp -24 +# NO_REORDER_FUNC-EMPTY: +# NO_REORDER_FUNC-NEXT: baz: +# NO_REORDER_FUNC-NEXT: 
xorb %al, 5 +# + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_FUNC +# +# NO_SPLIT_FUNC: Disassembly of section .text: +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_FUNC-NEXT: jne 14 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_FUNC-NEXT: jmp -16 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: foo: +# NO_SPLIT_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: qux: +# NO_SPLIT_FUNC-NEXT: xorb %al, 6 +# NO_SPLIT_FUNC-NEXT: int3 +# NO_SPLIT_FUNC-EMPTY: +# NO_SPLIT_FUNC-NEXT: baz: +# NO_SPLIT_FUNC-NEXT: xorb %al, 5 +# + +## Check that the combination of no-reorder-blocks and split-funcs makes lld fail. +# RUN: not ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-opt=split-funcs 2>&1 -o /dev/null | FileCheck %s --check-prefix=FAIL +# FAIL: propeller: Inconsistent combination of propeller optimizations 'split-funcs' and 'no-reorder-blocks'. 
+ + +# RUN: ld.lld %t.o -optimize-bb-jumps -propeller=%t_prof.propeller -propeller-opt=no-split-funcs -propeller-opt=no-reorder-funcs -propeller-keep-named-symbols -o %t.propeller.nosplitreorderfuncs.out +# RUN: llvm-objdump -d %t.propeller.nosplitreorderfuncs.out| FileCheck %s --check-prefix=NO_SPLIT_REORDER_FUNC +# +# NO_SPLIT_REORDER_FUNC: Disassembly of section .text: +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: foo: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 0 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 1 +# NO_SPLIT_REORDER_FUNC-NEXT: jne 14 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: aaa.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 4 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: a.BB.bar: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 2 +# NO_SPLIT_REORDER_FUNC-NEXT: jmp -16 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: baz: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 5 +# NO_SPLIT_REORDER_FUNC-NEXT: int3 +# NO_SPLIT_REORDER_FUNC-EMPTY: +# NO_SPLIT_REORDER_FUNC-NEXT: qux: +# NO_SPLIT_REORDER_FUNC-NEXT: xorb %al, 6 + +.section .text,"ax",@progbits +# -- Begin function foo +.type foo,@function +.align 4 +foo: + xor %al,0 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,1 +# -- Begin function bar +.type bar,@function +.align 4 +bar: + xor %al,1 + je aa.BB.bar + jmp a.BB.bar + +.section .text,"ax",@progbits,unique,2 +a.BB.bar: + xor %al,2 + jmp aaa.BB.bar +.La.BB.bar_end: + .size a.BB.bar, .La.BB.bar_end-a.BB.bar + +.section .text,"ax",@progbits,unique,3 +aa.BB.bar: + xor %al,3 + jmp aaa.BB.bar +.Laa.BB.bar_end: + .size aa.BB.bar, .Laa.BB.bar_end-aa.BB.bar + +.section .text,"ax",@progbits,unique,4 +aaa.BB.bar: + xor %al,4 
+.Laaa.BB.bar_end: + .size aaa.BB.bar, .Laaa.BB.bar_end-aaa.BB.bar + +.section .text,"ax",@progbits,unique,1 +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,5 +# -- Begin function baz +.type baz,@function +.align 4 +baz: + xor %al,5 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz + +.section .text,"ax",@progbits,unique,6 +# -- Begin function qux +.type qux,@function +.align 4 +qux: + xor %al,6 + +.Lqux_end: + .size qux, .Lqux_end-qux +# -- End function qux