diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
--- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
@@ -53,6 +53,34 @@
                        const std::vector<uint64_t> &NodeCounts,
                        const std::vector<EdgeCountT> &EdgeCounts);
 
+/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for
+/// the best performance of large-scale front-end bound binaries.
+struct CDSortConfig {
+  /// The size of the cache, that is, the number of entries (lines) in it.
+  unsigned CacheEntries = 16;
+  /// The size of a line in the cache.
+  unsigned CacheSize = 2048;
+  /// The power exponent for the distance-locality (score ~ Dist^-Power).
+  double DistancePower = 0.25;
+  /// The scale factor (weight) of the frequency-locality in the merge gain.
+  double FrequencyScale = 0.25;
+};
+
+/// Apply a Cache-Directed Sort for functions represented by a call graph.
+/// The placement is done by optimizing the call locality by co-locating
+/// frequently executed functions.
+/// \p FuncSizes: The sizes of the nodes (in bytes).
+/// \p FuncCounts: The execution counts of the nodes in the profile.
+/// \p CallCounts: The execution counts of every edge (call) in the profile. The
+///    map also defines the call graph edges and should include 0-count edges.
+/// \p CallOffsets: The call offsets from source nodes, indexed as CallCounts.
+/// \returns The best function order found.
+std::vector<uint64_t> applyCDSLayout(const CDSortConfig &Config,
+                                     const std::vector<uint64_t> &FuncSizes,
+                                     const std::vector<uint64_t> &FuncCounts,
+                                     const std::vector<EdgeCountT> &CallCounts,
+                                     const std::vector<uint64_t> &CallOffsets);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -985,6 +985,393 @@
   std::vector<ChainT *> HotChains;
 };
 
+/// Cache-Directed Sort (CDS): greedily merges chains of call-graph functions
+/// while the objective improves, then orders the chains by execution density.
+class CDSortImpl {
+public:
+  CDSortImpl(const CDSortConfig &Config, const std::vector<uint64_t> &NodeSizes,
+             const std::vector<uint64_t> &NodeCounts,
+             const std::vector<EdgeCountT> &EdgeCounts,
+             const std::vector<uint64_t> &EdgeOffsets)
+      : Config(Config), NumNodes(NodeSizes.size()) {
+    initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
+  }
+
+  /// Run the algorithm and append the resulting order of nodes to \p Result.
+  void run(std::vector<uint64_t> &Result) {
+    // Merge pairs of chains while improving the objective
+    mergeChainPairs();
+
+    LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
+                      << " of chains from " << NumNodes << " to "
+                      << HotChains.size() << "\n");
+
+    // Concatenate the remaining chains into the final order of nodes
+    concatChains(Result);
+  }
+
+private:
+  /// Build nodes, jumps, single-node chains, and chain edges from the input.
+  void initialize(const std::vector<uint64_t> &NodeSizes,
+                  const std::vector<uint64_t> &NodeCounts,
+                  const std::vector<EdgeCountT> &EdgeCounts,
+                  const std::vector<uint64_t> &EdgeOffsets) {
+    // Initialize nodes; a zero-sized node is treated as having size 1
+    AllNodes.reserve(NumNodes);
+    for (uint64_t Node = 0; Node < NumNodes; Node++) {
+      uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
+      uint64_t ExecutionCount = NodeCounts[Node];
+      AllNodes.emplace_back(Node, Size, ExecutionCount);
+      TotalSamples += ExecutionCount;
+      if (ExecutionCount > 0)
+        TotalSize += Size;
+    }
+
+    // Initialize jumps; EdgeOffsets[I] is the offset of the call EdgeCounts[I]
+    SuccNodes.resize(NumNodes);
+    PredNodes.resize(NumNodes);
+    AllJumps.reserve(EdgeCounts.size());
+    for (size_t I = 0; I < EdgeCounts.size(); I++) {
+      auto It = EdgeCounts[I];
+      uint64_t Pred = It.first.first;
+      uint64_t Succ = It.first.second;
+      // Ignore recursive calls
+      if (Pred == Succ)
+        continue;
+
+      SuccNodes[Pred].push_back(Succ);
+      PredNodes[Succ].push_back(Pred);
+      uint64_t ExecutionCount = It.second;
+      if (ExecutionCount > 0) {
+        NodeT &PredNode = AllNodes[Pred];
+        NodeT &SuccNode = AllNodes[Succ];
+        AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount);
+        AllJumps.back().Offset = EdgeOffsets[I];
+        SuccNode.InJumps.push_back(&AllJumps.back());
+        PredNode.OutJumps.push_back(&AllJumps.back());
+      }
+    }
+
+    // Initialize chains: one chain per node; executed nodes are the hot chains
+    AllChains.reserve(NumNodes);
+    HotChains.reserve(NumNodes);
+    for (NodeT &Node : AllNodes) {
+      // Make the count at least as large as inCount() and outCount()
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
+      // Create chain
+      AllChains.emplace_back(Node.Index, &Node);
+      Node.CurChain = &AllChains.back();
+      if (Node.ExecutionCount > 0) {
+        HotChains.push_back(&AllChains.back());
+      }
+    }
+
+    // Initialize chain edges; both chains of an edge share one ChainEdge
+    AllEdges.reserve(AllJumps.size());
+    for (NodeT &PredNode : AllNodes) {
+      for (JumpT *Jump : PredNode.OutJumps) {
+        NodeT *SuccNode = Jump->Target;
+        ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
+        // The edge between the chains exists already: extend it with the jump
+        if (CurEdge != nullptr) {
+          assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
+          CurEdge->appendJump(Jump);
+          continue;
+        }
+        // Otherwise, create a new edge and register it with both chains
+        AllEdges.emplace_back(Jump);
+        PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
+        SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
+      }
+    }
+  }
+
+  /// Greedily merge pairs of chains while the best merge has a positive gain.
+  void mergeChainPairs() {
+    // An ordered set serves as the queue: highest gain first, ties by chain Ids
+    auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
+      return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
+             std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
+    };
+    std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
+
+    // Insert the edges of the hot chains into the queue
+    for (ChainT *ChainPred : HotChains) {
+      for (auto EdgeIt : ChainPred->Edges) {
+        ChainEdge *Edge = EdgeIt.second;
+        // Ignore self-edges
+        if (Edge->isSelfEdge())
+          continue;
+        // Ignore already processed edges (a gain of -1.0 means not computed yet)
+        if (Edge->gain() != -1.0)
+          continue;
+
+        // Compute the gain of merging the two chains
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+
+    // Merge the chains while the gain of merging is positive
+    while (!Queue.empty()) {
+      // Extract the best (top) edge for merging
+      ChainEdge *BestEdge = *Queue.begin();
+      Queue.erase(Queue.begin());
+      // Ignore self-edges
+      if (BestEdge->isSelfEdge())
+        continue;
+      // Ignore edges with non-positive gains
+      if (BestEdge->gain() <= EPS)
+        continue;
+
+      ChainT *BestSrcChain = BestEdge->srcChain();
+      ChainT *BestDstChain = BestEdge->dstChain();
+
+      // Drop the edges of both chains before the merge changes their sort keys
+      for (std::pair<ChainT *, ChainEdge *> EdgeIt : BestSrcChain->Edges)
+        Queue.erase(EdgeIt.second);
+      for (std::pair<ChainT *, ChainEdge *> EdgeIt : BestDstChain->Edges)
+        Queue.erase(EdgeIt.second);
+
+      // Merge the best pair of chains
+      MergeGainT BestGain = BestEdge->getMergeGain();
+      mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
+                  BestGain.mergeType());
+
+      // Re-evaluate the edges of the merged chain and put them into the queue
+      for (auto EdgeIt : BestSrcChain->Edges) {
+        ChainEdge *Edge = EdgeIt.second;
+        // Ignore loop edges
+        if (Edge->isSelfEdge())
+          continue;
+
+        // Compute the gain of merging the two chains
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+  }
+
+  /// Compute the gain of merging two chains.
+  ///
+  /// The function tries both ways of concatenating the chains of \p Edge
+  /// (X_Y and Y_X) and keeps the one with the largest score. The result holds
+  /// the score together with the corresponding merge offset and merge type;
+  /// near-equal scores are resolved using the identifiers of the chains.
+  MergeGainT getBestMergeGain(ChainEdge *Edge) const {
+    // Collect the jumps between the two chains connected by the edge
+    auto Jumps = Edge->jumps();
+    assert(!Jumps.empty() && "trying to merge chains w/o jumps");
+    ChainT *SrcChain = Edge->srcChain();
+    ChainT *DstChain = Edge->dstChain();
+
+    // The object holds the best currently chosen gain of merging the two chains
+    MergeGainT Gain = MergeGainT();
+
+    /// Given a list of merge types, try to merge two chains and update Gain
+    /// with a better alternative.
+    auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
+      // Apply the merge, compute the corresponding gain, and update the best
+      // value, if the merge is beneficial
+      for (const MergeTypeT &MergeType : MergeTypes) {
+        MergeGainT NewGain =
+            computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
+
+        // When forward and backward gains are the same, prioritize merging that
+        // preserves the original order of the functions in the binary
+        if (std::abs(Gain.score() - NewGain.score()) < EPS) {
+          if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
+              (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
+            Gain = NewGain;
+          }
+        } else if (NewGain.score() > Gain.score() + EPS) {
+          Gain = NewGain;
+        }
+      }
+    };
+
+    // Try to concatenate two chains w/o splitting
+    tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
+
+    return Gain;
+  }
+
+  /// Evaluate one way (MergeType) of concatenating two chains.
+  ///
+  /// Neither of the chains is modified; only the resulting gain is returned.
+  MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
+                              const std::vector<JumpT *> &Jumps,
+                              MergeTypeT MergeType) const {
+    // The chains are concatenated as a whole, hence the zero merge offset
+    size_t MergeOffset = 0;
+    // The frequency term is independent of the resulting order of the nodes
+    const double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
+    // The distance term is evaluated on the tentatively merged layout
+    const double DistGain = distBasedLocalityGain(
+        mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType),
+        Jumps);
+
+    double Score = DistGain + Config.FrequencyScale * FreqGain;
+    // Normalize non-negative gains so that merging short chains is favored
+    if (Score >= 0.0)
+      Score /= std::min(ChainPred->Size, ChainSucc->Size);
+
+    return MergeGainT(Score, MergeOffset, MergeType);
+  }
+
+  /// Estimate the reduction of cache misses achieved by merging two chains.
+  double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
+    // Probability of a cache miss for code with the given sample density
+    auto MissRate = [&](double Density) -> double {
+      const double SamplesPerPage = Density * Config.CacheSize;
+      if (SamplesPerPage >= TotalSamples)
+        return 0.0;
+      const double Share = SamplesPerPage / TotalSamples;
+      return std::pow(1.0 - Share, static_cast<double>(Config.CacheEntries));
+    };
+
+    // Expected misses while the two chains are kept apart
+    const double SeparateMisses =
+        ChainPred->ExecutionCount * MissRate(ChainPred->density()) +
+        ChainSucc->ExecutionCount * MissRate(ChainSucc->density());
+
+    // Expected misses once the chains are merged and share one density
+    const double Counts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
+    const double Bytes = ChainPred->Size + ChainSucc->Size;
+    const double JointMisses = Counts * MissRate(Counts / Bytes);
+
+    return SeparateMisses - JointMisses;
+  }
+
+  /// Score a jump / call by its count and distance (0 is treated as 0.1).
+  double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
+    uint64_t Gap = std::max(SrcAddr, DstAddr) - std::min(SrcAddr, DstAddr);
+    double Length = Gap != 0 ? static_cast<double>(Gap) : 0.1;
+    return std::pow(Length, -Config.DistancePower) * static_cast<double>(Count);
+  }
+
+  /// Compare the distance locality of the jumps in the merged layout against
+  /// a baseline in which every jump spans a distance of TotalSize.
+  double distBasedLocalityGain(const MergedChain &MergedBlocks,
+                               const std::vector<JumpT *> &Jumps) const {
+    if (Jumps.empty())
+      return 0.0;
+    uint64_t NextAddr = 0;
+    MergedBlocks.forEach([&NextAddr](const NodeT *Node) {
+      Node->EstimatedAddr = NextAddr;
+      NextAddr += Node->Size;
+    });
+    double MergedScore = 0;
+    double BaseScore = 0;
+    for (const JumpT *Jump : Jumps) {
+      const uint64_t From = Jump->Source->EstimatedAddr + Jump->Offset;
+      const uint64_t To = Jump->Target->EstimatedAddr;
+      MergedScore += distScore(From, To, Jump->ExecutionCount);
+      BaseScore += distScore(0, TotalSize, Jump->ExecutionCount);
+    }
+    return MergedScore - BaseScore;
+  }
+
+  /// Merge chain From into chain Into: Into receives the merged nodes and the
+  /// edges of From; From is cleared and removed from the list of active chains.
+  void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
+                   MergeTypeT MergeType) {
+    assert(Into != From && "a chain cannot be merged with itself");
+
+    // Compute the merged order of the nodes and hand it over to Into
+    MergedChain MergedNodes =
+        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
+    Into->merge(From, MergedNodes.getNodes());
+
+    // Merge the edges of From into Into, then clear From
+    Into->mergeEdges(From);
+    From->clear();
+
+    // Remove the merged-away chain from the list of active chains
+    llvm::erase_value(HotChains, From);
+  }
+
+  /// Concatenate all chains into the final order.
+  /// Chains are sorted by decreasing execution density, and the indices of
+  /// their nodes are appended to \p Order.
+  void concatChains(std::vector<uint64_t> &Order) {
+    // Gather the non-empty chains together with their densities
+    std::vector<const ChainT *> SortedChains;
+    DenseMap<const ChainT *, double> ChainDensity;
+    for (ChainT &Chain : AllChains) {
+      // Skip chains that have no nodes
+      if (Chain.Nodes.empty())
+        continue;
+      SortedChains.push_back(&Chain);
+
+      // Using doubles to avoid overflow of ExecutionCounts
+      double Size = 0;
+      double ExecutionCount = 0;
+      for (NodeT *Node : Chain.Nodes) {
+        Size += static_cast<double>(Node->Size);
+        ExecutionCount += static_cast<double>(Node->ExecutionCount);
+      }
+      assert(Size > 0 && "a chain of zero size");
+      ChainDensity[&Chain] = ExecutionCount / Size;
+    }
+
+    // Order chains by decreasing density; equal densities are ordered by Id
+    auto DenserFirst = [&](const ChainT *L, const ChainT *R) {
+      const double DL = ChainDensity[L];
+      const double DR = ChainDensity[R];
+      return std::make_tuple(-DL, L->Id) < std::make_tuple(-DR, R->Id);
+    };
+    std::stable_sort(SortedChains.begin(), SortedChains.end(), DenserFirst);
+
+    // Emit the node indices chain by chain
+    Order.reserve(NumNodes);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+  }
+
+private:
+  /// Config for the algorithm (held by reference; must outlive the object).
+  const CDSortConfig &Config;
+
+  /// The number of nodes in the graph.
+  const size_t NumNodes;
+
+  /// Successors of each node; filled by initialize() and not read afterwards.
+  std::vector<std::vector<uint64_t>> SuccNodes;
+
+  /// Predecessors of each node; filled by initialize() and not read afterwards.
+  std::vector<std::vector<uint64_t>> PredNodes;
+
+  /// All nodes (functions) in the graph.
+  std::vector<NodeT> AllNodes;
+
+  /// All jumps (function calls) between the nodes.
+  std::vector<JumpT> AllJumps;
+
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;
+
+  /// All edges between the chains.
+  std::vector<ChainEdge> AllEdges;
+
+  /// Active chains. The vector gets updated at runtime when chains are merged.
+  std::vector<ChainT *> HotChains;
+
+  /// The sum of the input execution counts of all nodes.
+  uint64_t TotalSamples{0};
+
+  /// The total size of the nodes with a positive input execution count.
+  uint64_t TotalSize{0};
+};
+
 } // end of anonymous namespace
 
 std::vector<uint64_t>
@@ -1043,3 +1430,22 @@
   }
   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
 }
+
+// Entry point of CDS; see CodeLayout.h for the description of the parameters.
+std::vector<uint64_t>
+llvm::applyCDSLayout(const CDSortConfig &Config,
+                     const std::vector<uint64_t> &FuncSizes,
+                     const std::vector<uint64_t> &FuncCounts,
+                     const std::vector<EdgeCountT> &CallCounts,
+                     const std::vector<uint64_t> &CallOffsets) {
+  // Verify the input: per-function and per-call arrays must be parallel,
+  // since the implementation reads CallOffsets with the index of CallCounts
+  assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
+  assert(CallOffsets.size() == CallCounts.size() && "Incorrect input");
+  // Apply the reordering algorithm
+  CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
+  std::vector<uint64_t> Result;
+  Alg.run(Result);
+  // Verify correctness of the output
+  assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
+  return Result;
+}