diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -72,6 +72,7 @@
   Option
   Passes
   Support
+  TransformUtils
   TargetParser
 
   LINK_LIBS
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -14,6 +14,8 @@
 namespace lld::elf {
 class InputSectionBase;
 
+llvm::DenseMap<const InputSectionBase *, int> applyCDSort();
+
 llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
 } // namespace lld::elf
 
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -6,38 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
-/// for Large-Scale Data-Center Applications
-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
-///
-/// The goal of this algorithm is to improve runtime performance of the final
-/// executable by arranging code sections such that page table and i-cache
-/// misses are minimized.
-///
-/// Definitions:
-/// * Cluster
-///   * An ordered list of input sections which are laid out as a unit. At the
-///     beginning of the algorithm each input section has its own cluster and
-///     the weight of the cluster is the sum of the weight of all incoming
-///     edges.
-/// * Call-Chain Clustering (C³) Heuristic
-///   * Defines when and how clusters are combined. Pick the highest weighted
-///     input section then add it to its most likely predecessor if it wouldn't
-///     penalize it too much.
-/// * Density
-///   * The weight of the cluster divided by the size of the cluster. This is a
-///     proxy for the amount of execution time spent per byte of the cluster.
-///
-/// It does so given a call graph profile by the following:
-/// * Build a weighted call graph from the call graph profile
-/// * Sort input sections by weight
-/// * For each input section starting with the highest weight
-///   * Find its most likely predecessor cluster
-///   * Check if the combined cluster would be too large, or would have too low
-///     a density.
-///   * If not, then combine the clusters.
-/// * Sort non-empty clusters by density
+/// The file is responsible for sorting sections using the profile data by
+/// placing frequently executed code sections together. The goal of the
+/// placement is to improve the runtime performance of the final executable by
+/// arranging code sections such that page table and i-cache misses are reduced.
 ///
+/// The algorithm first builds a call graph based on the profile data and then
+/// iteratively merges "chains" (ordered lists) of input sections which will be
+/// laid out as a unit. There are two implementations for deciding how to
+/// merge a pair of chains:
+///  - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
+///    "Optimizing Function Placement for Large-Scale Data-Center Applications"
+/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
+///   typically produces layouts with higher locality, and hence, yields fewer
+///   instruction cache misses on large binaries.
 //===----------------------------------------------------------------------===//
 
 #include "CallGraphSort.h"
@@ -45,6 +28,7 @@
 #include "InputSection.h"
 #include "Symbols.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 
 #include <numeric>
 
@@ -75,6 +59,33 @@
   Edge bestPred = {-1, 0};
 };
 
+/// Implementation of the Call-Chain Clustering (C^3). The goal of this
+/// algorithm is to improve runtime performance of the executable by arranging
+/// code sections such that page table and i-cache misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C³) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weighted
+///     input section then add it to its most likely predecessor if it wouldn't
+///     penalize it too much.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is a
+///     proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+///   * Find its most likely predecessor cluster
+///   * Check if the combined cluster would be too large, or would have too low
+///     a density.
+///   * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
 class CallGraphSort {
 public:
   CallGraphSort();
@@ -260,11 +271,85 @@
   return orderMap;
 }
 
+using CallT = std::pair<uint64_t, uint64_t>;
+
+// Fill in necessary data for Cache-Directed Sort.
+static void buildCallGraph(std::vector<uint64_t> &funcSizes,
+                           std::vector<uint64_t> &funcCounts,
+                           std::vector<std::pair<CallT, uint64_t>> &callCounts,
+                           std::vector<uint64_t> &callOffsets,
+                           std::vector<const InputSectionBase *> &sections) {
+  MapVector<SectionPair, uint64_t> &profile = config->callGraphProfile;
+  DenseMap<const InputSectionBase *, size_t> secToTargetId;
+
+  auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
+    auto res = secToTargetId.try_emplace(inSec, sections.size());
+    if (res.second) {
+      // inSec does not appear before in the graph.
+      sections.push_back(inSec);
+      assert(inSec->getSize() > 0 && "found a function with zero size");
+      funcSizes.push_back(inSec->getSize());
+      funcCounts.push_back(0);
+    }
+    return res.first->second;
+  };
+
+  // Create the graph
+  for (std::pair<SectionPair, uint64_t> &c : profile) {
+    const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
+    const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
+    // Ignore edges between input sections belonging to different sections.
+    if (fromSB->getOutputSection() != toSB->getOutputSection())
+      continue;
+
+    uint64_t weight = c.second;
+    // Ignore edges with zero weight.
+    if (weight == 0)
+      continue;
+
+    size_t from = getOrCreateNode(fromSB);
+    size_t to = getOrCreateNode(toSB);
+    // Ignore self-edges (recursive calls).
+    if (from == to)
+      continue;
+
+    callCounts.push_back(std::make_pair(std::make_pair(from, to), weight));
+    callOffsets.push_back((funcSizes[from] + 1) / 2);
+    funcCounts[to] += weight;
+  }
+}
+
+// Sort sections by the profile data using the Cache-Directed Sort algorithm.
+// The placement is done by optimizing the locality by co-locating frequently
+// executed code sections together.
+DenseMap<const InputSectionBase *, int> elf::applyCDSort() {
+  // Creating a call graph and necessary data.
+  std::vector<uint64_t> funcSizes;
+  std::vector<uint64_t> funcCounts;
+  std::vector<std::pair<CallT, uint64_t>> callCounts;
+  std::vector<uint64_t> callOffsets;
+  std::vector<const InputSectionBase *> sections;
+  buildCallGraph(funcSizes, funcCounts, callCounts, callOffsets, sections);
+
+  // Run the layout algorithm.
+  std::vector<uint64_t> sortedSections =
+      applyCDSLayout(funcSizes, funcCounts, callCounts, callOffsets);
+
+  // Create the final order.
+  DenseMap<const InputSectionBase *, int> orderMap;
+  int curOrder = 1;
+  for (uint64_t secIdx : sortedSections)
+    orderMap[sections[secIdx]] = curOrder++;
+
+  return orderMap;
+}
+
 // Sort sections by the profile data provided by --callgraph-profile-file.
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according either to the C³ or Cache-Directed-Sort ordering algorithm.
 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  if (config->useCDSort)
+    return applyCDSort();
   return CallGraphSort().run();
 }
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -289,6 +289,7 @@
   bool undefinedVersion;
   bool unique;
   bool useAndroidRelrTags = false;
+  bool useCDSort;
   bool warnBackrefs;
   llvm::SmallVector<llvm::GlobPattern, 0> warnBackrefsExclude;
   bool warnCommon;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -1364,6 +1364,7 @@
   config->unique = args.hasArg(OPT_unique);
   config->useAndroidRelrTags = args.hasFlag(
       OPT_use_android_relr_tags, OPT_no_use_android_relr_tags, false);
+  config->useCDSort = args.hasFlag(OPT_use_cdsort, OPT_no_use_cdsort, true);
   config->warnBackrefs =
       args.hasFlag(OPT_warn_backrefs, OPT_no_warn_backrefs, false);
   config->warnCommon = args.hasFlag(OPT_warn_common, OPT_no_warn_common, false);
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -337,6 +337,10 @@
     "Use SHT_ANDROID_RELR / DT_ANDROID_RELR* tags instead of SHT_RELR / DT_RELR*",
     "Use SHT_RELR / DT_RELR* tags (default)">;
 
+defm use_cdsort: BB<"use-cdsort",
+    "Use cache-directed-sort to reorder functions in the binary (default)",
+    "Do not use cache-directed-sort to reorder functions in the binary">;
+
 def pic_veneer: F<"pic-veneer">,
   HelpText<"Always generate position independent thunks (veneers)">;
 
diff --git a/lld/test/ELF/cgprofile-bad-clusters.s b/lld/test/ELF/cgprofile-bad-clusters.s
--- a/lld/test/ELF/cgprofile-bad-clusters.s
+++ b/lld/test/ELF/cgprofile-bad-clusters.s
@@ -10,7 +10,7 @@
 # RUN: echo "F G 6" >> %t.call_graph
 # RUN: echo "G H 5" >> %t.call_graph
 # RUN: echo "H I 4" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2
 # RUN: llvm-readobj --symbols %t2 | FileCheck %s
 
     .section    .text.A,"ax",@progbits
diff --git a/lld/test/ELF/cgprofile-icf.s b/lld/test/ELF/cgprofile-icf.s
--- a/lld/test/ELF/cgprofile-icf.s
+++ b/lld/test/ELF/cgprofile-icf.s
@@ -5,9 +5,9 @@
 # RUN: echo "A B 100" > %t.call_graph
 # RUN: echo "A C 40" >> %t.call_graph
 # RUN: echo "C D 61" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t.out -icf=all
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t.out -icf=all
 # RUN: llvm-readobj --symbols %t.out | FileCheck %s
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2.out
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2.out
 # RUN: llvm-readobj --symbols %t2.out | FileCheck %s --check-prefix=NOICF
 
     .section    .text.D,"ax",@progbits
diff --git a/lld/test/ELF/cgprofile-rela.test b/lld/test/ELF/cgprofile-rela.test
--- a/lld/test/ELF/cgprofile-rela.test
+++ b/lld/test/ELF/cgprofile-rela.test
@@ -3,7 +3,7 @@
 # REQUIRES: x86
 
 # RUN: yaml2obj %s -o %t.o
-# RUN: ld.lld %t.o -o %t
+# RUN: ld.lld --no-use-cdsort %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s
 # RUN: ld.lld --no-call-graph-profile-sort %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG
diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s
--- a/lld/test/ELF/cgprofile-txt.s
+++ b/lld/test/ELF/cgprofile-txt.s
@@ -24,8 +24,10 @@
 # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph
 # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph
 # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2
 # RUN: llvm-readobj --symbols %t2 | FileCheck %s
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --use-cdsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT
 
     .section    .text.D,"ax",@progbits
 D:
@@ -159,6 +161,31 @@
 # CHECK:          Name: _init2
 # CHECK-NEXT:     Value: 0x201141
 
+# CDSORT:          Name: D
+# CDSORT-NEXT:     Value: 0x201123
+# CDSORT:          Name: TooManyPreds
+# CDSORT-NEXT:     Value: 0x20112F
+# CDSORT:          Name: TooManyPreds10
+# CDSORT-NEXT:     Value: 0x20112E
+# CDSORT:          Name: C
+# CDSORT-NEXT:     Value: 0x201122
+# CDSORT:          Name: B
+# CDSORT-NEXT:     Value: 0x201121
+# CDSORT:          Name: A
+# CDSORT-NEXT:     Value: 0x201120
+# CDSORT:          Name: TS
+# CDSORT-NEXT:     Value: 0x20113D
+# CDSORT:          Name: PP
+# CDSORT-NEXT:     Value: 0x20113C
+# CDSORT:          Name: QC
+# CDSORT-NEXT:     Value: 0x20113E
+# CDSORT:          Name: GB
+# CDSORT-NEXT:     Value: 0x20113F
+# CDSORT:          Name: _init
+# CDSORT-NEXT:     Value: 0x201140
+# CDSORT:          Name: _init2
+# CDSORT-NEXT:     Value: 0x201141
+
 # NOSORT:          Name: D
 # NOSORT-NEXT:     Value: 0x201120
 # NOSORT:          Name: TooManyPreds
diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s
--- a/lld/test/ELF/cgprofile-txt2.s
+++ b/lld/test/ELF/cgprofile-txt2.s
@@ -5,17 +5,28 @@
 # RUN: echo "B C 50" >> %t.call_graph
 # RUN: echo "C D 40" >> %t.call_graph
 # RUN: echo "D B 10" >> %t.call_graph
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3
 # RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
-# RUN: llvm-readobj --symbols %t2 | FileCheck %s
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS
 
-# CHECK:      Name: A
-# CHECK-NEXT: Value: 0x201123
-# CHECK:      Name: B
-# CHECK-NEXT: Value: 0x201120
-# CHECK:      Name: C
-# CHECK-NEXT: Value: 0x201121
-# CHECK:      Name: D
-# CHECK-NEXT: Value: 0x201122
+# CHECKC3:      Name: A
+# CHECKC3-NEXT: Value: 0x201123
+# CHECKC3:      Name: B
+# CHECKC3-NEXT: Value: 0x201120
+# CHECKC3:      Name: C
+# CHECKC3-NEXT: Value: 0x201121
+# CHECKC3:      Name: D
+# CHECKC3-NEXT: Value: 0x201122
+
+# CHECKCDS:      Name: A
+# CHECKCDS-NEXT: Value: 0x201120
+# CHECKCDS:      Name: B
+# CHECKCDS-NEXT: Value: 0x201121
+# CHECKCDS:      Name: C
+# CHECKCDS-NEXT: Value: 0x201122
+# CHECKCDS:      Name: D
+# CHECKCDS-NEXT: Value: 0x201123
 
 .section    .text.A,"ax",@progbits
 .globl  A