diff --git a/lld/COFF/CMakeLists.txt b/lld/COFF/CMakeLists.txt
--- a/lld/COFF/CMakeLists.txt
+++ b/lld/COFF/CMakeLists.txt
@@ -8,6 +8,7 @@
 
 add_lld_library(lldCOFF
   Chunks.cpp
+  CallGraphSort.cpp
   DebugTypes.cpp
   DLL.cpp
   Driver.cpp
diff --git a/lld/COFF/CallGraphSort.h b/lld/COFF/CallGraphSort.h
new file mode 100644
--- /dev/null
+++ b/lld/COFF/CallGraphSort.h
@@ -0,0 +1,22 @@
+//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_COFF_CALL_GRAPH_SORT_H
+#define LLD_COFF_CALL_GRAPH_SORT_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace lld {
+namespace coff {
+class SectionChunk;
+
+llvm::DenseMap<const SectionChunk *, int> computeCallGraphProfileOrder();
+} // namespace coff
+} // namespace lld
+
+#endif
diff --git a/lld/COFF/CallGraphSort.cpp b/lld/COFF/CallGraphSort.cpp
new file mode 100644
--- /dev/null
+++ b/lld/COFF/CallGraphSort.cpp
@@ -0,0 +1,271 @@
+//===- CallGraphSort.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
+/// for Large-Scale Data-Center Applications
+/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+///
+/// The goal of this algorithm is to improve runtime performance of the final
+/// executable by arranging code sections such that page table and i-cache
+/// misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C³) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weighted
+///     input section then add it to its most likely predecessor if it wouldn't
+///     penalize it too much.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is a
+///     proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+///   * Find its most likely predecessor cluster
+///   * Check if the combined cluster would be too large, or would have too low
+///     a density.
+///   * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
+///
+//===----------------------------------------------------------------------===//
+
+#include "CallGraphSort.h"
+#include "InputFiles.h"
+#include "SymbolTable.h"
+#include "Symbols.h"
+#include "lld/Common/ErrorHandler.h"
+
+#include <cstdio>
+#include <numeric>
+
+using namespace llvm;
+using namespace lld;
+using namespace lld::coff;
+
+namespace {
+struct Edge {
+  int from;
+  uint64_t weight;
+};
+
+struct Cluster {
+  Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {}
+
+  double getDensity() const {
+    if (size == 0)
+      return 0;
+    return double(weight) / double(size);
+  }
+
+  int next;
+  int prev;
+  size_t size = 0;
+  uint64_t weight = 0;
+  uint64_t initialWeight = 0;
+  Edge bestPred = {-1, 0};
+};
+
+class CallGraphSort {
+public:
+  CallGraphSort();
+
+  DenseMap<const SectionChunk *, int> run();
+
+private:
+  std::vector<Cluster> clusters;
+  std::vector<const SectionChunk *> sections;
+};
+
+// Maximum amount the combined cluster density can be worse than the original
+// cluster to consider merging.
+constexpr int MAX_DENSITY_DEGRADATION = 8;
+
+// Maximum cluster size in bytes.
+constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024;
+} // end anonymous namespace
+
+using SectionPair = std::pair<const SectionChunk *, const SectionChunk *>;
+
+// Take the edge list in Config->CallGraphProfile, resolve symbol names to
+// Symbols, and generate a graph between InputSections with the provided
+// weights.
+CallGraphSort::CallGraphSort() {
+  MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
+  DenseMap<const SectionChunk *, int> secToCluster;
+
+  auto getOrCreateNode = [&](const SectionChunk *isec) -> int {
+    auto res = secToCluster.try_emplace(isec, clusters.size());
+    if (res.second) {
+      sections.push_back(isec);
+      clusters.emplace_back(clusters.size(), isec->getSize());
+    }
+    return res.first->second;
+  };
+
+  // Create the graph.
+  for (std::pair<SectionPair, uint64_t> &c : profile) {
+    const auto *fromSB = cast<SectionChunk>(c.first.first->repl);
+    const auto *toSB = cast<SectionChunk>(c.first.second->repl);
+    uint64_t weight = c.second;
+
+    // Ignore edges between input sections belonging to different output
+    // sections.  This is done because otherwise we would end up with clusters
+    // containing input sections that can't actually be placed adjacently in the
+    // output.  This messes with the cluster size and density calculations.  We
+    // would also end up moving input sections in other output sections without
+    // moving them closer to what calls them.
+    if (fromSB->getOutputSection() != toSB->getOutputSection())
+      continue;
+
+    int from = getOrCreateNode(fromSB);
+    int to = getOrCreateNode(toSB);
+
+    clusters[to].weight += weight;
+
+    if (from == to)
+      continue;
+
+    // Remember the best edge.
+    Cluster &toC = clusters[to];
+    if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) {
+      toC.bestPred.from = from;
+      toC.bestPred.weight = weight;
+    }
+  }
+  for (Cluster &c : clusters)
+    c.initialWeight = c.weight;
+}
+
+// It's bad to merge clusters which would degrade the density too much.
+static bool isNewDensityBad(Cluster &a, Cluster &b) {
+  double newDensity = double(a.weight + b.weight) / double(a.size + b.size);
+  return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION;
+}
+
+// Find the leader of V's belonged cluster (represented as an equivalence
+// class). We apply union-find path-halving technique (simple to implement) in
+// the meantime as it decreases depths and the time complexity.
+static int getLeader(std::vector<int> &leaders, int v) {
+  while (leaders[v] != v) {
+    leaders[v] = leaders[leaders[v]];
+    v = leaders[v];
+  }
+  return v;
+}
+
+static void mergeClusters(std::vector<Cluster> &cs, Cluster &into, int intoIdx,
+                          Cluster &from, int fromIdx) {
+  int tail1 = into.prev, tail2 = from.prev;
+  into.prev = tail2;
+  cs[tail2].next = intoIdx;
+  from.prev = tail1;
+  cs[tail1].next = fromIdx;
+  into.size += from.size;
+  into.weight += from.weight;
+  from.size = 0;
+  from.weight = 0;
+}
+
+// Group InputSections into clusters using the Call-Chain Clustering heuristic
+// then sort the clusters by density.
+DenseMap<const SectionChunk *, int> CallGraphSort::run() {
+  std::vector<int> sorted(clusters.size());
+  std::vector<int> leaders(clusters.size());
+
+  std::iota(leaders.begin(), leaders.end(), 0);
+  std::iota(sorted.begin(), sorted.end(), 0);
+  llvm::stable_sort(sorted, [&](int a, int b) {
+    return clusters[a].getDensity() > clusters[b].getDensity();
+  });
+
+  for (int l : sorted) {
+    // The cluster index is the same as the index of its leader here because
+    // clusters[L] has not been merged into another cluster yet.
+    Cluster &c = clusters[l];
+
+    // Don't consider merging if the edge is unlikely.
+    if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight)
+      continue;
+
+    int predL = getLeader(leaders, c.bestPred.from);
+    if (l == predL)
+      continue;
+
+    Cluster *predC = &clusters[predL];
+    if (c.size + predC->size > MAX_CLUSTER_SIZE)
+      continue;
+
+    if (isNewDensityBad(*predC, c))
+      continue;
+
+    leaders[l] = predL;
+    mergeClusters(clusters, *predC, predL, c, l);
+  }
+
+  // Sort remaining non-empty clusters by density.
+  sorted.clear();
+  for (int i = 0, e = (int)clusters.size(); i != e; ++i)
+    if (clusters[i].size > 0)
+      sorted.push_back(i);
+  llvm::stable_sort(sorted, [&](int a, int b) {
+    return clusters[a].getDensity() > clusters[b].getDensity();
+  });
+
+  DenseMap<const SectionChunk *, int> orderMap;
+  int curOrder = INT_MIN;
+  for (int leader : sorted)
+    for (int i = leader;;) {
+      orderMap[sections[i]] = curOrder++;
+      i = clusters[i].next;
+      if (i == leader)
+        break;
+    }
+
+  if (!config->printSymbolOrder.empty()) {
+    std::error_code ec;
+    raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None);
+    if (ec) {
+      error("cannot open " + config->printSymbolOrder + ": " + ec.message());
+      return orderMap;
+    }
+    // Print the symbols ordered by C3, in the order of increasing curOrder
+    // Instead of sorting all the orderMap, just repeat the loops above.
+    for (int leader : sorted)
+      for (int i = leader;;) {
+        const SectionChunk *sc = sections[i];
+
+        // Search all the symbols in the file of the section
+        // and find out a Defined symbol with name that is within the section.
+        for (Symbol *sym : sc->file->getSymbols())
+          if (auto *d = dyn_cast_or_null<DefinedCOFF>(sym))
+            if (d->isCOMDAT && !d->getCOFFSymbol().isSection())
+              if (sc == d->getChunk())
+                os << sym->getName() << "\n";
+        i = clusters[i].next;
+        if (i == leader)
+          break;
+      }
+  }
+
+  return orderMap;
+}
+
+// Sort sections by the profile data provided by /callgraph-profile-file
+//
+// This first builds a call graph based on the profile data then merges sections
+// according to the C³ heuristic. All clusters are then sorted by a density
+// metric to further improve locality.
+DenseMap<const SectionChunk *, int> coff::computeCallGraphProfileOrder() {
+  return CallGraphSort().run();
+}
diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h
--- a/lld/COFF/Config.h
+++ b/lld/COFF/Config.h
@@ -9,6 +9,7 @@
 #ifndef LLD_COFF_CONFIG_H
 #define LLD_COFF_CONFIG_H
 
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Object/COFF.h"
@@ -29,6 +30,7 @@
 class StringChunk;
 class Symbol;
 class InputFile;
+class SectionChunk;
 
 // Short aliases.
 static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64;
@@ -200,6 +202,14 @@
   // Used for /lto-obj-path:
   llvm::StringRef ltoObjPath;
 
+  // Used for /call-graph-ordering-file:
+  llvm::MapVector<std::pair<const SectionChunk *, const SectionChunk *>,
+                  uint64_t>
+      callGraphProfile;
+
+  // Used for /print-symbol-order:
+  StringRef printSymbolOrder;
+
   uint64_t align = 4096;
   uint64_t imageBase = -1;
   uint64_t fileAlign = 512;
diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -924,6 +924,46 @@
   }
 }
 
+static void parseCallGraphFile(StringRef path) {
+  std::unique_ptr<MemoryBuffer> mb = CHECK(
+      MemoryBuffer::getFile(path, -1, false, true), "could not open " + path);
+
+  // Build a map from symbol name to section
+  DenseMap<StringRef, Symbol *> map;
+  for (ObjFile *file : ObjFile::instances)
+    for (Symbol *sym : file->getSymbols())
+      if (sym)
+        map[sym->getName()] = sym;
+
+  auto findSection = [&](StringRef name) -> SectionChunk * {
+    Symbol *sym = map.lookup(name);
+    if (!sym) {
+      if (config->warnMissingOrderSymbol)
+        warn(path + ": no such symbol: " + name);
+      return nullptr;
+    }
+
+    if (DefinedCOFF *dr = dyn_cast_or_null<DefinedCOFF>(sym))
+      return dyn_cast_or_null<SectionChunk>(dr->getChunk());
+    return nullptr;
+  };
+
+  for (StringRef line : args::getLines(*mb)) {
+    SmallVector<StringRef, 3> fields;
+    line.split(fields, ' ');
+    uint64_t count;
+
+    if (fields.size() != 3 || !to_integer(fields[2], count)) {
+      error(path + ": parse error");
+      return;
+    }
+
+    if (SectionChunk *from = findSection(fields[0]))
+      if (SectionChunk *to = findSection(fields[1]))
+        config->callGraphProfile[std::make_pair(from, to)] += count;
+  }
+}
+
 static void markAddrsig(Symbol *s) {
   if (auto *d = dyn_cast_or_null<Defined>(s))
     if (SectionChunk *c = dyn_cast_or_null<SectionChunk>(d->getChunk()))
@@ -2020,12 +2060,23 @@
   if (config->manifest == Configuration::SideBySide)
     createSideBySideManifest();
 
+  if (args.hasArg(OPT_order) && args.hasArg(OPT_call_graph_ordering_file))
+    error("/order and /call-graph-order-file may not be used together");
+
   // Handle /order. We want to do this at this moment because we
   // need a complete list of comdat sections to warn on nonexistent
   // functions.
   if (auto *arg = args.getLastArg(OPT_order))
     parseOrderFile(arg->getValue());
 
+  // Handle /call-graph-ordering-file
+  if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
+    parseCallGraphFile(arg->getValue());
+
+  // Handle /print-symbol-order
+  if (auto *arg = args.getLastArg(OPT_print_symbol_order))
+    config->printSymbolOrder = arg->getValue();
+
   // Identify unreferenced COMDAT sections.
   if (config->doGC)
     markLive(symtab->getChunks());
diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td
--- a/lld/COFF/Options.td
+++ b/lld/COFF/Options.td
@@ -234,6 +234,13 @@
 def threads
     : P<"threads", "Number of threads. '1' disables multi-threading. By "
                    "default all available hardware threads are used">;
+def call_graph_ordering_file: P<
+    "call-graph-ordering-file", 
+    "Layout sections to optimize the given callgraph">;
+def print_symbol_order: P<
+    "print-symbol-order",
+    "Print a symbol order specified by /call-graph-ordering-file into the "
+    "specified file">;
 
 // Flags for debugging
 def lldmap : F<"lldmap">;
diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "Writer.h"
+#include "CallGraphSort.h"
 #include "Config.h"
 #include "DLL.h"
 #include "InputFiles.h"
@@ -229,6 +230,7 @@
   void setSectionPermissions();
   void writeSections();
   void writeBuildId();
+  void sortSections();
   void sortExceptionTable();
   void sortCRTSectionChunks(std::vector<Chunk *> &chunks);
   void addSyntheticIdata();
@@ -798,6 +800,19 @@
          name.startswith(".xdata$") || name.startswith(".eh_frame$");
 }
 
+void Writer::sortSections() {
+  if (!config->callGraphProfile.empty()) {
+    DenseMap<const SectionChunk *, int> order = computeCallGraphProfileOrder();
+    for (auto I : order) {
+      if (DefinedRegular *sym = I.first->sym)
+        config->order[sym->getName()] = I.second;
+    }
+  }
+  if (!config->order.empty())
+    for (auto it : partialSections)
+      sortBySectionOrder(it.second->chunks);
+}
+
 // Create output section objects and add them to OutputSections.
 void Writer::createSections() {
   // First, create the builtin sections.
@@ -861,10 +876,7 @@
   if (hasIdata)
     addSyntheticIdata();
 
-  // Process an /order option.
-  if (!config->order.empty())
-    for (auto it : partialSections)
-      sortBySectionOrder(it.second->chunks);
+  sortSections();
 
   if (hasIdata)
     locateImportTables();
diff --git a/lld/test/COFF/cgprofile-bad-clusters.s b/lld/test/COFF/cgprofile-bad-clusters.s
new file mode 100644
--- /dev/null
+++ b/lld/test/COFF/cgprofile-bad-clusters.s
@@ -0,0 +1,61 @@
+# REQUIRES: x86
+# This test checks that CallGraphSort ignores edges that would form "bad"
+# clusters.
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t
+# RUN: echo "A C 1" > %t.call_graph
+# RUN: echo "E B 4" >> %t.call_graph
+# RUN: echo "C D 2" >> %t.call_graph
+# RUN: echo "B D 1" >> %t.call_graph
+# RUN: echo "F G 6" >> %t.call_graph
+# RUN: echo "G H 5" >> %t.call_graph
+# RUN: echo "H I 4" >> %t.call_graph
+# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab
+# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s
+
+    .section    .text,"ax",one_only,A
+    .globl A
+A:
+    retq
+
+    .section    .text,"ax",one_only,D
+D:
+    .fill 1000, 1, 0
+
+    .section    .text,"ax",one_only,E
+E:
+    retq
+
+    .section    .text,"ax",one_only,C
+C:
+    retq
+
+    .section    .text,"ax",one_only,B
+B:
+    .fill 1000, 1, 0
+
+    .section    .text,"ax",one_only,F
+F:
+    .fill (1024 * 1024) - 1, 1, 0
+
+    .section    .text,"ax",one_only,G
+G:
+    retq
+
+    .section    .text,"ax",one_only,H
+H:
+    retq
+
+    .section    .text,"ax",one_only,I
+I:
+    .fill 13, 1, 0
+
+# CHECK: 140001000 t H
+# CHECK: 140001001 t I
+# CHECK: 14000100e T A
+# CHECK: 14000100f t C
+# CHECK: 140001010 t E
+# CHECK: 140001011 t B
+# CHECK: 1400013f9 t D
+# CHECK: 1400017e1 t F
+# CHECK: 1401017e0 t G
diff --git a/lld/test/COFF/cgprofile-icf.s b/lld/test/COFF/cgprofile-icf.s
new file mode 100644
--- /dev/null
+++ b/lld/test/COFF/cgprofile-icf.s
@@ -0,0 +1,45 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t
+
+# RUN: echo "A B 100" > %t.call_graph
+# RUN: echo "A C 40" >> %t.call_graph
+# RUN: echo "C D 61" >> %t.call_graph
+# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab /opt:icf
+# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s
+# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab
+# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOICF
+
+    .section    .text,"x",one_only,D
+    .globl  D
+D:
+    mov $60, %rax
+    retq
+
+    .section    .text,"x",one_only,C
+    .globl  C
+C:
+    mov $60, %rax
+    retq
+
+    .section    .text,"x",one_only,B
+    .globl  B
+B:
+    mov $2, %rax
+    retq
+
+    .section    .text,"x",one_only,A
+    .globl  A
+A:
+    mov $42, %rax
+    retq
+
+# CHECK: 140001000 T A
+# CHECK: 140001008 T C
+# CHECK: 140001008 T D
+# CHECK: 140001010 T B
+
+# NOICF: 140001000 T A
+# NOICF: 140001008 T B
+# NOICF: 140001010 T C
+# NOICF: 140001018 T D
diff --git a/lld/test/COFF/cgprofile-print.s b/lld/test/COFF/cgprofile-print.s
new file mode 100644
--- /dev/null
+++ b/lld/test/COFF/cgprofile-print.s
@@ -0,0 +1,35 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t
+# RUN: echo "A B 5" > %t.call_graph
+# RUN: echo "B C 50" >> %t.call_graph
+# RUN: echo "C D 40" >> %t.call_graph
+# RUN: echo "D B 10" >> %t.call_graph
+# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /print-symbol-order:%t3
+# RUN: FileCheck %s --input-file %t3
+
+# CHECK: B
+# CHECK-NEXT: C
+# CHECK-NEXT: D
+# CHECK-NEXT: A
+
+.section    .text, "x", one_only, A
+.globl  A
+A:
+ nop
+
+.section    .text, "x", one_only, B
+.globl  B
+B:
+ nop
+
+.section    .text, "x", one_only, C
+.globl  C
+C:
+ nop
+
+.section    .text, "x", one_only, D
+.globl  D
+D:
+ nop
+
diff --git a/lld/test/COFF/cgprofile-txt.s b/lld/test/COFF/cgprofile-txt.s
new file mode 100644
--- /dev/null
+++ b/lld/test/COFF/cgprofile-txt.s
@@ -0,0 +1,43 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t
+# RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab
+# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOSORT 
+
+# RUN: echo "A B 5" > %t.call_graph
+# RUN: echo "B C 50" >> %t.call_graph
+# RUN: echo "C D 40" >> %t.call_graph
+# RUN: echo "D B 10" >> %t.call_graph
+# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab
+# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s 
+
+# NOSORT: 140001000 T A
+# NOSORT: 140001001 T B
+# NOSORT: 140001002 T C
+# NOSORT: 140001003 T D
+
+# CHECK: 140001000 T B
+# CHECK: 140001001 T C
+# CHECK: 140001002 T D
+# CHECK: 140001003 T A
+
+.section    .text, "x", one_only, A
+.globl  A
+A:
+ nop
+
+.section    .text, "x", one_only, B
+.globl  B
+B:
+ nop
+
+.section    .text, "x", one_only, C
+.globl  C
+C:
+ nop
+
+.section    .text, "x", one_only, D
+.globl  D
+D:
+ nop
+