diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -32,6 +32,7 @@
     RT_EXEC_COUNT,
     RT_HFSORT,
     RT_HFSORT_PLUS,
+    RT_CDS,
     RT_PETTIS_HANSEN,
     RT_RANDOM,
     RT_USER
diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp
--- a/bolt/lib/Passes/ReorderFunctions.cpp
+++ b/bolt/lib/Passes/ReorderFunctions.cpp
@@ -15,6 +15,7 @@
 #include "bolt/Utils/Utils.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 #include <fstream>
 
 #define DEBUG_TYPE "hfsort"
@@ -29,33 +30,27 @@
 
 extern size_t padFunction(const bolt::BinaryFunction &Function);
 
-cl::opt<bolt::ReorderFunctions::ReorderType>
-ReorderFunctions("reorder-functions",
-  cl::desc("reorder and cluster functions (works only with relocations)"),
-  cl::init(bolt::ReorderFunctions::RT_NONE),
-  cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE,
-      "none",
-      "do not reorder functions"),
-    clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT,
-      "exec-count",
-      "order by execution count"),
-    clEnumValN(bolt::ReorderFunctions::RT_HFSORT,
-      "hfsort",
-      "use hfsort algorithm"),
-    clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS,
-      "hfsort+",
-      "use hfsort+ algorithm"),
-    clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN,
-      "pettis-hansen",
-      "use Pettis-Hansen algorithm"),
-    clEnumValN(bolt::ReorderFunctions::RT_RANDOM,
-      "random",
-      "reorder functions randomly"),
-    clEnumValN(bolt::ReorderFunctions::RT_USER,
-      "user",
-      "use function order specified by -function-order")),
-  cl::ZeroOrMore,
-  cl::cat(BoltOptCategory));
+cl::opt<bolt::ReorderFunctions::ReorderType> ReorderFunctions(
+    "reorder-functions",
+    cl::desc("reorder and cluster functions (works only with relocations)"),
+    cl::init(bolt::ReorderFunctions::RT_NONE),
+    cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none",
+                          "do not reorder functions"),
+               clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count",
+                          "order by execution count"),
+               clEnumValN(bolt::ReorderFunctions::RT_HFSORT, "hfsort",
+                          "use hfsort algorithm"),
+               clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+",
+                          "use hfsort+ algorithm"),
+               clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds",
+                          "use cache-directed sort"),
+               clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN,
+                          "pettis-hansen", "use Pettis-Hansen algorithm"),
+               clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random",
+                          "reorder functions randomly"),
+               clEnumValN(bolt::ReorderFunctions::RT_USER, "user",
+                          "use function order specified by -function-order")),
+    cl::ZeroOrMore, cl::cat(BoltOptCategory));
 
 static cl::opt<bool> ReorderFunctionsUseHotSize(
     "reorder-functions-use-hot-size",
@@ -292,29 +287,33 @@
     break;
   case RT_EXEC_COUNT:
     {
-      std::vector<BinaryFunction *> SortedFunctions(BFs.size());
-      uint32_t Index = 0;
-      llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(),
-                      [](BinaryFunction &BF) { return &BF; });
-      llvm::stable_sort(SortedFunctions, [&](const BinaryFunction *A,
-                                             const BinaryFunction *B) {
-        if (A->isIgnored())
-          return false;
-        const size_t PadA = opts::padFunction(*A);
-        const size_t PadB = opts::padFunction(*B);
-        if (!PadA || !PadB) {
-          if (PadA)
-            return true;
-          if (PadB)
-            return false;
-        }
-        return !A->hasProfile() &&
-               (B->hasProfile() ||
-                (A->getExecutionCount() > B->getExecutionCount()));
-      });
-      for (BinaryFunction *BF : SortedFunctions)
-        if (BF->hasProfile())
-          BF->setIndex(Index++);
+    std::vector<BinaryFunction *> SortedFunctions(BFs.size());
+    llvm::transform(llvm::make_second_range(BFs), SortedFunctions.begin(),
+                    [](BinaryFunction &BF) { return &BF; });
+    llvm::stable_sort(SortedFunctions,
+                      [&](const BinaryFunction *A, const BinaryFunction *B) {
+                        if (A->isIgnored())
+                          return false;
+                        if (B->isIgnored())
+                          return true;
+                        const size_t PadA = opts::padFunction(*A);
+                        const size_t PadB = opts::padFunction(*B);
+                        if (!PadA || !PadB) {
+                          if (PadA)
+                            return true;
+                          if (PadB)
+                            return false;
+                        }
+                        if (!A->hasProfile())
+                          return false;
+                        if (!B->hasProfile())
+                          return true;
+                        return A->getExecutionCount() > B->getExecutionCount();
+                      });
+    uint32_t Index = 0;
+    for (BinaryFunction *BF : SortedFunctions)
+      if (BF->hasProfile())
+        BF->setIndex(Index++);
     }
     break;
   case RT_HFSORT:
@@ -323,6 +322,39 @@
   case RT_HFSORT_PLUS:
     Clusters = hfsortPlus(Cg);
     break;
+  case RT_CDS: {
+    // It is required that the sum of incoming arc weights is not greater
+    // than the number of samples for every function. Ensuring the call graph
+    // obeys the property before running the algorithm.
+    Cg.adjustArcWeights();
+
+    // Initialize CFG nodes and their data
+    std::vector<uint64_t> FuncSizes;
+    std::vector<uint64_t> FuncCounts;
+    using JumpT = std::pair<uint64_t, uint64_t>;
+    std::vector<std::pair<JumpT, uint64_t>> CallCounts;
+    std::vector<uint64_t> CallOffsets;
+    for (NodeId F = 0; F < Cg.numNodes(); ++F) {
+      FuncSizes.push_back(Cg.size(F));
+      FuncCounts.push_back(Cg.samples(F));
+      for (NodeId Succ : Cg.successors(F)) {
+        const Arc &Arc = *Cg.findArc(F, Succ);
+        auto It = std::make_pair(F, Succ);
+        CallCounts.push_back(std::make_pair(It, Arc.weight()));
+        CallOffsets.push_back(uint64_t(Arc.avgCallOffset()));
+      }
+    }
+
+    // Run the layout algorithm
+    CDSortConfig Config;
+    std::vector<uint64_t> Result =
+        applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
+
+    // Set the order of hot functions based on the layout
+    uint32_t Index = 0;
+    for (uint64_t FuncId : Result)
+      Cg.nodeIdToFunc(FuncId)->setIndex(Index++);
+  } break;
   case RT_PETTIS_HANSEN:
     Clusters = pettisAndHansen(Cg);
     break;