Index: ELF/CMakeLists.txt
===================================================================
--- ELF/CMakeLists.txt
+++ ELF/CMakeLists.txt
@@ -19,6 +19,7 @@
   Arch/SPARCV9.cpp
   Arch/X86.cpp
   Arch/X86_64.cpp
+  CallGraphSort.cpp
   Driver.cpp
   DriverUtils.cpp
   EhFrame.cpp
Index: ELF/CallGraphSort.h
===================================================================
--- /dev/null
+++ ELF/CallGraphSort.h
@@ -0,0 +1,23 @@
+//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_CALL_GRAPH_SORT_H
+#define LLD_ELF_CALL_GRAPH_SORT_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace lld {
+namespace elf {
+class InputSectionBase;
+
+// Returns a section order computed from Config->CallGraphProfile, mapping
+// each profiled input section to its 1-based layout priority.
+llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
+} // namespace elf
+} // namespace lld
+
+#endif
Index: ELF/CallGraphSort.cpp
===================================================================
--- /dev/null
+++ ELF/CallGraphSort.cpp
@@ -0,0 +1,379 @@
+//===- CallGraphSort.cpp --------------------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
+/// for Large-Scale Data-Center Applications
+/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+///
+/// The goal of this algorithm is to improve runtime performance of the final
+/// executable by arranging code sections such that page table and i-cache
+/// misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C3) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weight
+///     edge from cluster _u_ to _v_ then move the sections in _v_ and append
+///     them to _u_ unless the combined size would be larger than the page
+///     size.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is
+///     a proxy for the amount of execution time spent per byte of the
+///     cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a call graph from the profile
+/// * While there are unresolved edges
+///   * Find the edge with the highest weight
+///   * Check if merging the two clusters would create a cluster larger than
+///     the target page size
+///   * If not, contract that edge putting the callee after the caller
+/// * Sort remaining clusters by density
+///
+//===----------------------------------------------------------------------===//
+
+#include "CallGraphSort.h"
+#include "OutputSections.h"
+#include "SymbolTable.h"
+#include "Symbols.h"
+#include "Target.h"
+
+#include "llvm/Support/MathExtras.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <set>
+
+using namespace llvm;
+using namespace lld;
+using namespace lld::elf;
+
+namespace {
+using NodeIndex = std::ptrdiff_t;
+using EdgeIndex = std::ptrdiff_t;
+
+struct Edge;
+
+// Orders edge indices by the weight of the referenced edge, so that the
+// highest-weight edge sorts last in the priority multiset.
+struct EdgePriorityCmp {
+  std::vector<Edge> &Edges;
+  bool operator()(EdgeIndex A, EdgeIndex B) const;
+};
+
+using PriorityQueue = std::multiset<EdgeIndex, EdgePriorityCmp>;
+
+// A cluster of input sections. Starts as a single section; sections are
+// appended as edges are contracted.
+struct Node {
+  Node() = default;
+  Node(const InputSectionBase *IS);
+  std::vector<const InputSectionBase *> Sections;
+  std::vector<EdgeIndex> IncidentEdges;
+  int64_t Size = 0;
+  uint64_t Weight = 0;
+};
+
+// A weighted directed edge between two nodes. PriorityPos is this edge's
+// position in the work queue, or WorkQueue.end() if the edge is dead.
+struct Edge {
+  NodeIndex From;
+  NodeIndex To;
+  uint64_t Weight;
+  PriorityQueue::iterator PriorityPos;
+  bool operator==(const Edge Other) const;
+  bool operator<(const Edge Other) const;
+};
+
+bool EdgePriorityCmp::operator()(EdgeIndex A, EdgeIndex B) const {
+  return Edges[A].Weight < Edges[B].Weight;
+}
+
+// DenseMap traits for Edge; identity is the (From, To) pair only, so
+// duplicate profile entries for the same section pair map to one edge.
+struct EdgeDenseMapInfo {
+  static Edge getEmptyKey() {
+    return {DenseMapInfo<NodeIndex>::getEmptyKey(),
+            DenseMapInfo<NodeIndex>::getEmptyKey(), 0,
+            PriorityQueue::iterator()};
+  }
+  static Edge getTombstoneKey() {
+    return {DenseMapInfo<NodeIndex>::getTombstoneKey(),
+            DenseMapInfo<NodeIndex>::getTombstoneKey(), 0,
+            PriorityQueue::iterator()};
+  }
+  static unsigned getHashValue(const Edge &Val) {
+    return hash_combine(DenseMapInfo<NodeIndex>::getHashValue(Val.From),
+                        DenseMapInfo<NodeIndex>::getHashValue(Val.To));
+  }
+  static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; }
+};
+
+class CallGraphSort {
+public:
+  CallGraphSort();
+
+  DenseMap<const InputSectionBase *, int> run();
+
+private:
+  std::vector<Node> Nodes;
+  std::vector<Edge> Edges;
+
+  // Live edges ordered by weight; the comparator references Edges.
+  PriorityQueue WorkQueue{EdgePriorityCmp{Edges}};
+
+  bool killEdge(EdgeIndex EI);
+  void contractEdge(EdgeIndex CEI);
+  void generateClusters();
+};
+} // end anonymous namespace
+
+Node::Node(const InputSectionBase *IS) {
+  Sections.push_back(IS);
+  Size = IS->getSize();
+}
+
+bool Edge::operator==(const Edge Other) const {
+  return From == Other.From && To == Other.To;
+}
+
+bool Edge::operator<(const Edge Other) const {
+  if (From != Other.From)
+    return From < Other.From;
+  return To < Other.To;
+}
+
+// Sections in these output sections have a fixed semantic order (init/fini
+// routines, ctor/dtor priorities) and must not be rearranged by the profile.
+static bool isKnownNonreorderableSection(const OutputSection *OS) {
+  return llvm::StringSwitch<bool>(OS->Name)
+      .Cases(".init", ".fini", ".init_array.", ".fini_array.", ".ctors.",
+             ".dtors.", true)
+      .Default(false);
+}
+
+// Take the edge list in Config->CallGraphProfile, resolve symbol names to
+// Symbols, and generate a graph between InputSections with the provided
+// weights.
+CallGraphSort::CallGraphSort() {
+  MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
+            uint64_t> &Profile = Config->CallGraphProfile;
+  DenseMap<const InputSectionBase *, NodeIndex> SecToNode;
+  DenseMap<Edge, EdgeIndex, EdgeDenseMapInfo> EdgeMap;
+
+  auto GetOrCreateNode = [&](const InputSectionBase *IS) -> NodeIndex {
+    auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size()));
+    if (Res.second)
+      Nodes.emplace_back(IS);
+    return Res.first->second;
+  };
+
+  // Create the graph.
+  for (const auto &C : Profile) {
+    const InputSectionBase *FromSB = C.first.first;
+    const InputSectionBase *ToSB = C.first.second;
+    uint64_t Weight = C.second;
+
+    if (Weight == 0)
+      continue;
+
+    // Ignore edges with input sections from different output sections. This is
+    // done so that we don't form clusters which contain input sections which
+    // can't be laid out adjacently. This also prevents accidental reordering
+    // of input sections due to getting assigned an order in the order map.
+    if (FromSB->getOutputSection() != ToSB->getOutputSection())
+      continue;
+
+    if (isKnownNonreorderableSection(FromSB->getOutputSection()))
+      continue;
+
+    NodeIndex From = GetOrCreateNode(FromSB);
+    NodeIndex To = GetOrCreateNode(ToSB);
+
+    Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, Weight);
+
+    // Reject self edges after adding node weight. This ends up with the same
+    // weight as if we had merged the nodes.
+    if (From == To)
+      continue;
+
+    Edge E{From, To, Weight, WorkQueue.end()};
+
+    // Add or increment an edge
+    auto Res = EdgeMap.insert(std::make_pair(E, Edges.size()));
+    EdgeIndex EI = Res.first->second;
+    if (Res.second) {
+      Edges.push_back(E);
+      Nodes[From].IncidentEdges.push_back(EI);
+      Nodes[To].IncidentEdges.push_back(EI);
+    } else
+      Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight);
+  }
+}
+
+/// Like std::unique, but calls Merge on equal values. Merge is allowed
+/// to modify its first argument.
+///
+/// Merge is a callable with signature
+///   Merge(*declval<ForwardIt>(), *declval<ForwardIt>())
+///
+/// Example:
+///
+/// int a[] = {1, 2, 2, 3, 4, 5, 5};
+/// auto end = merge_unique(std::begin(a), std::end(a),
+///                         [](int a, int b) { return a == b; },
+///                         [](int &a, int b) { a += b; });
+///
+/// for (auto i = a; i != end; ++i)
+///   std::cout << *i << " ";
+///
+/// -- 1 4 3 4 10
+template <class ForwardIt, class PredTy, class MergeTy>
+static ForwardIt merge_unique(ForwardIt First, ForwardIt Last, PredTy Pred,
+                              MergeTy Merge) {
+  if (First == Last)
+    return Last;
+
+  ForwardIt I = First;
+  while (++I != Last) {
+    if (Pred(*First, *I))
+      Merge(*First, *I);
+    else if (++First != I)
+      *First = std::move(*I);
+  }
+  return ++First;
+}
+
+/// Marks an edge as dead and removes it from the work queue.
+/// Returns true if the edge was killed, false if it was already dead.
+bool CallGraphSort::killEdge(EdgeIndex EI) {
+  Edge &E = Edges[EI];
+  if (E.PriorityPos != WorkQueue.end()) {
+    WorkQueue.erase(E.PriorityPos);
+    E.PriorityPos = WorkQueue.end();
+    return true;
+  }
+  return false;
+}
+
+/// Remove edge \p CEI from the graph while simultaneously merging its two
+/// incident vertices u and v. This merges any duplicate edges between u and v
+/// by accumulating their weights.
+void CallGraphSort::contractEdge(EdgeIndex CEI) {
+  // Make a copy of the edge as the original will be marked killed while being
+  // used.
+  Edge CE = Edges[CEI];
+  assert(CE.From != CE.To && "Got self edge!");
+  std::vector<EdgeIndex> &FE = Nodes[CE.From].IncidentEdges;
+
+  // Remove the self edge from From.
+  FE.erase(std::remove(FE.begin(), FE.end(), CEI));
+  std::vector<EdgeIndex> &TE = Nodes[CE.To].IncidentEdges;
+
+  // Update all edges incident with To to reference From instead. Then if they
+  // aren't self edges add them to From.
+  for (EdgeIndex EI : TE) {
+    Edge &E = Edges[EI];
+    if (E.From == CE.To)
+      E.From = CE.From;
+    if (E.To == CE.To)
+      E.To = CE.From;
+    if (E.To == E.From) {
+      killEdge(EI);
+      continue;
+    }
+    FE.push_back(EI);
+  }
+
+  // Free memory. Otherwise we end up with N^2 memory usage.
+  std::vector<EdgeIndex>().swap(TE);
+
+  if (FE.empty())
+    return;
+
+  // Sort edges so they can be merged. The stability of this sort doesn't
+  // matter as equal edges will be merged in an order independent manner.
+  std::sort(FE.begin(), FE.end(),
+            [&](EdgeIndex AI, EdgeIndex BI) { return Edges[AI] < Edges[BI]; });
+
+  FE.erase(merge_unique(FE.begin(), FE.end(),
+                        [&](EdgeIndex AI, EdgeIndex BI) {
+                          return Edges[AI] == Edges[BI];
+                        },
+                        [&](EdgeIndex AI, EdgeIndex BI) {
+                          Edge &A = Edges[AI];
+                          Edge &B = Edges[BI];
+                          killEdge(BI);
+                          bool Restore = killEdge(AI);
+                          A.Weight = SaturatingAdd(A.Weight, B.Weight);
+                          if (Restore)
+                            A.PriorityPos = WorkQueue.insert(AI);
+                        }),
+           FE.end());
+}
+
+// Group InputSections into clusters using the Call-Chain Clustering heuristic
+// then sort the clusters by density.
+void CallGraphSort::generateClusters() {
+  for (size_t I = 0; I < Edges.size(); ++I) {
+    Edges[I].PriorityPos = WorkQueue.insert(I);
+  }
+
+  // Collapse the graph.
+  while (!WorkQueue.empty()) {
+    PriorityQueue::const_iterator I = --WorkQueue.end();
+    EdgeIndex MaxI = *I;
+    const Edge MaxE = Edges[MaxI];
+    killEdge(MaxI);
+    // Merge the Nodes.
+    Node &From = Nodes[MaxE.From];
+    Node &To = Nodes[MaxE.To];
+    if (From.Size + To.Size > Target->PageSize)
+      continue;
+    contractEdge(MaxI);
+    From.Sections.insert(From.Sections.end(), To.Sections.begin(),
+                         To.Sections.end());
+    From.Size += To.Size;
+    From.Weight = SaturatingAdd(From.Weight, To.Weight);
+    To.Sections.clear();
+    To.Size = 0;
+    To.Weight = 0;
+  }
+
+  // Remove empty or dead nodes.
+  Nodes.erase(std::remove_if(Nodes.begin(), Nodes.end(),
+                             [](const Node &N) {
+                               return N.Size == 0 || N.Sections.empty();
+                             }),
+              Nodes.end());
+
+  // Sort by density. Invalidates all NodeIndexes.
+  std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) {
+    return (APFloat(APFloat::IEEEdouble(), A.Weight) /
+            APFloat(APFloat::IEEEdouble(), A.Size))
+               .compare(APFloat(APFloat::IEEEdouble(), B.Weight) /
+                        APFloat(APFloat::IEEEdouble(), B.Size)) ==
+           APFloat::cmpLessThan;
+  });
+}
+
+DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
+  generateClusters();
+
+  // Generate order.
+  llvm::DenseMap<const InputSectionBase *, int> OrderMap;
+  ssize_t CurOrder = 1;
+
+  for (const Node &N : Nodes)
+    for (const InputSectionBase *IS : N.Sections)
+      OrderMap[IS] = CurOrder++;
+
+  return OrderMap;
+}
+
+// Sort sections by the profile data provided by -callgraph-profile-file
+//
+// This first builds a call graph based on the profile data then iteratively
+// merges the hottest call edges as long as it would not create a cluster
+// larger than the page size. All clusters are then sorted by a density metric
+// to further improve locality.
+DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  return CallGraphSort().run();
+}
Index: ELF/Config.h
===================================================================
--- ELF/Config.h
+++ ELF/Config.h
@@ -25,6 +25,7 @@
 namespace elf {
 
 class InputFile;
+class InputSectionBase;
 
 enum ELFKind {
   ELFNoneKind,
@@ -104,6 +105,9 @@
   std::vector<SymbolVersion> VersionScriptGlobals;
   std::vector<SymbolVersion> VersionScriptLocals;
   std::vector<uint8_t> BuildIdVector;
+  llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
+                  uint64_t>
+      CallGraphProfile;
   bool AllowMultipleDefinition;
   bool AndroidPackDynRelocs = false;
   bool ARMHasBlx = false;
Index: ELF/Driver.cpp
===================================================================
--- ELF/Driver.cpp
+++ ELF/Driver.cpp
@@ -570,6 +570,31 @@
   return {BuildIdKind::None, {}};
 }
 
+// Parse a call graph profile of "<from> <to> <count>" lines, resolving the
+// symbol names to the input sections that define them. Edges whose endpoints
+// can't be resolved are silently dropped.
+static void readCallGraph(MemoryBufferRef MB) {
+  // Build a map from symbol name to section
+  DenseMap<StringRef, InputSectionBase *> SymbolSection;
+  for (InputFile *File : ObjectFiles)
+    for (Symbol *Sym : File->getSymbols())
+      if (auto *D = dyn_cast<Defined>(Sym))
+        if (auto *IS = dyn_cast_or_null<InputSectionBase>(D->Section))
+          SymbolSection[D->getName()] = IS;
+
+  std::vector<StringRef> Lines = args::getLines(MB);
+  for (StringRef L : Lines) {
+    SmallVector<StringRef, 3> Fields;
+    L.split(Fields, ' ');
+    if (Fields.size() != 3)
+      fatal("parse error");
+    uint64_t Count;
+    if (!to_integer(Fields[2], Count))
+      fatal("parse error");
+    InputSectionBase *FromSec = SymbolSection.lookup(Fields[0]);
+    InputSectionBase *ToSec = SymbolSection.lookup(Fields[1]);
+    if (FromSec && ToSec)
+      Config->CallGraphProfile[std::make_pair(FromSec, ToSec)] = Count;
+  }
+}
+
 static bool getCompressDebugSections(opt::InputArgList &Args) {
   StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none");
   if (S == "none")
@@ -1097,6 +1122,10 @@
   // Apply symbol renames for -wrap.
   Symtab->applySymbolWrap();
 
+  if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file))
+    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
+      readCallGraph(*Buffer);
+
   // Now that we have a complete list of input files.
   // Beyond this point, no new files are added.
   // Aggregate all input sections into one place.
Index: ELF/Options.td
===================================================================
--- ELF/Options.td
+++ ELF/Options.td
@@ -66,6 +66,9 @@
     "Only set DT_NEEDED for shared libraries if used",
     "Always set DT_NEEDED for shared libraries">;
 
+def call_graph_ordering_file: S<"call-graph-ordering-file">,
+  HelpText<"Layout sections to optimize the given callgraph">;
+
 // -chroot doesn't have a help text because it is an internal option.
 def chroot: S<"chroot">;
 
Index: ELF/Writer.cpp
===================================================================
--- ELF/Writer.cpp
+++ ELF/Writer.cpp
@@ -9,6 +9,7 @@
 
 #include "Writer.h"
 #include "AArch64ErrataFix.h"
+#include "CallGraphSort.h"
 #include "Config.h"
 #include "Filesystem.h"
 #include "LinkerScript.h"
@@ -1058,6 +1059,17 @@
 // If no layout was provided by linker script, we want to apply default
 // sorting for special input sections. This also handles --symbol-ordering-file.
template void Writer::sortInputSections() { + // Use the rarely used option -call-graph-ordering-file to sort sections. + if (!Config->CallGraphProfile.empty()) { + DenseMap OrderMap = + computeCallGraphProfileOrder(); + + for (BaseCommand *Base : Script->SectionCommands) + if (auto *Sec = dyn_cast(Base)) + if (Sec->Live) + Sec->sort([&](InputSectionBase *S) { return OrderMap.lookup(S); }); + } + // Sort input sections by priority using the list provided // by --symbol-ordering-file. DenseMap Order = buildSectionOrder(); Index: test/ELF/cgprofile-txt.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-txt.s @@ -0,0 +1,106 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: ld.lld -e A %t -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "B C 30" >> %t.call_graph +# RUN: echo "C D 90" >> %t.call_graph +# RUN: echo "PP TS 100" >> %t.call_graph +# RUN: echo "_init2 _init 24567837" >> %t.call_graph +# RUN: echo "TS QC 9001" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s + + .section .text.D,"ax",@progbits +D: + retq + + .section .text.C,"ax",@progbits + .globl C +C: + retq + + .section .text.B,"ax",@progbits + .globl B +B: + retq + + .section .text.A,"ax",@progbits + .globl A +A: + retq + + .section .ponies,"ax",@progbits,unique,1 + .globl TS +TS: + retq + + .section .ponies,"ax",@progbits,unique,2 + .globl PP +PP: + retq + + .section .other,"ax",@progbits,unique,1 + .globl QC +QC: + retq + + .section .other,"ax",@progbits,unique,2 + .globl GB +GB: + retq + + .section .init,"ax",@progbits,unique,1 + .globl _init +_init: + retq + + .section .init,"ax",@progbits,unique,2 + .globl _init2 +_init2: + retq + +# CHECK: Name: D +# CHECK-NEXT: Value: 0x201003 +# CHECK: 
Name: A +# CHECK-NEXT: Value: 0x201000 +# CHECK: Name: B +# CHECK-NEXT: Value: 0x201001 +# CHECK: Name: C +# CHECK-NEXT: Value: 0x201002 +# CHECK: Name: GB +# CHECK-NEXT: Value: 0x201007 +# CHECK: Name: PP +# CHECK-NEXT: Value: 0x201004 +# CHECK: Name: QC +# CHECK-NEXT: Value: 0x201006 +# CHECK: Name: TS +# CHECK-NEXT: Value: 0x201005 +# CHECK: Name: _init +# CHECK-NEXT: Value: 0x201008 +# CHECK: Name: _init2 +# CHECK-NEXT: Value: 0x201009 + +# NOSORT: Name: D +# NOSORT-NEXT: Value: 0x201000 +# NOSORT: Name: A +# NOSORT-NEXT: Value: 0x201003 +# NOSORT: Name: B +# NOSORT-NEXT: Value: 0x201002 +# NOSORT: Name: C +# NOSORT-NEXT: Value: 0x201001 +# NOSORT: Name: GB +# NOSORT-NEXT: Value: 0x201007 +# NOSORT: Name: PP +# NOSORT-NEXT: Value: 0x201005 +# NOSORT: Name: QC +# NOSORT-NEXT: Value: 0x201006 +# NOSORT: Name: TS +# NOSORT-NEXT: Value: 0x201004 +# NOSORT: Name: _init +# NOSORT-NEXT: Value: 0x201008 +# NOSORT: Name: _init2 +# NOSORT-NEXT: Value: 0x201009