diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/BalancedPartitioning.h
@@ -0,0 +1,203 @@
+//===- BalancedPartitioning.h ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+// The algorithm is used to find an ordering of FunctionNodes while optimizing
+// a specified objective. The algorithm uses recursive bisection; it starts
+// with a collection of unordered FunctionNodes and tries to split them into
+// two sets (buckets) of equal cardinality. Each bisection step is comprised of
+// iterations that greedily swap the FunctionNodes between the two buckets while
+// there is an improvement of the objective. Once the process converges, the
+// problem is divided into two sub-problems of half the size, which are
+// recursively applied for the two buckets. The final ordering of the
+// FunctionNodes is obtained by concatenating the two (recursively computed)
+// orderings.
+//
+// In order to speed up the computation, we limit the depth of the recursive
+// tree by a specified constant (SplitDepth) and apply at most a constant
+// number of greedy iterations per split (IterationsPerSplit). The worst-case
+// time complexity of the implementation is bounded by O(M*log^2 N), where
+// N is the number of FunctionNodes and M is the number of
+// FunctionNode-UtilityNode edges; (assuming that any collection of D
+// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
+// recursive sub-problems are independent and thus can be efficiently processed
+// in parallel.
+//
+// Reference:
+//   * Optimizing Function Layout for Mobile Applications,
+//     https://arxiv.org/abs/2211.09285
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
+#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <random>
+#include <string>
+#include <vector>
+
+namespace llvm {
+
+/// A function with a set of utility nodes where it is beneficial to order two
+/// functions close together if they have similar utility nodes
+class BPFunctionNode {
+  friend class BalancedPartitioning;
+
+public:
+  using IDT = uint64_t;
+  using UtilityNodeT = uint32_t;
+
+  /// \param UtilityNodes the set of utility nodes (must be unique'd)
+  BPFunctionNode(IDT Id, std::vector<UtilityNodeT> &UtilityNodes)
+      : Id(Id), UtilityNodes(UtilityNodes) {}
+
+  /// The ID of this node
+  IDT Id;
+
+  /// Construct a list of nodes from a set of temporal profile traces
+  static std::vector<BPFunctionNode>
+  fromTemporalProfTraces(const SmallVectorImpl<TemporalProfTraceTy> &Traces,
+                         ArrayRef<uint32_t> Cutoffs);
+
+protected:
+  /// The list of utility nodes associated with this node
+  std::vector<UtilityNodeT> UtilityNodes;
+  /// The bucket assigned by balanced partitioning
+  std::optional<uint32_t> Bucket;
+  /// The index of the input order of the FunctionNodes
+  uint64_t InputOrderIndex = 0;
+};
+
+/// Algorithm parameters; default values are tuned on real-world binaries
+struct BalancedPartitioningConfig {
+  /// The depth of the recursive bisection
+  uint32_t SplitDepth = 16;
+  /// The maximum number of bp iterations per split
+  uint32_t IterationsPerSplit = 40;
+  /// The probability for a vertex to skip a move from its current bucket to
+  /// another bucket; it often helps to escape from a local optima
+  double SkipProbability = 0.1;
+  /// Recursive subtasks up to the given depth are added to the queue and
+  /// distributed among threads by ThreadPool; all subsequent calls are executed
+  /// on the same thread
+  uint32_t TaskSplitDepth = 9;
+};
+
+class BalancedPartitioning {
+public:
+  BalancedPartitioning(const BalancedPartitioningConfig &Config);
+
+  /// Run recursive graph partitioning that optimizes a given objective.
+  void run(std::vector<BPFunctionNode> &Nodes) const;
+
+private:
+  struct UtilitySignature;
+  using SignatureMapT =
+      DenseMap<BPFunctionNode::UtilityNodeT, UtilitySignature>;
+  using FunctionNodeRange =
+      iterator_range<std::vector<BPFunctionNode>::iterator>;
+
+  /// A special ThreadPool that allows for spawning new tasks after blocking on
+  /// wait(). BalancedPartitioning recursively spawns new threads inside other
+  /// threads, so we need to track how many active threads that could spawn more
+  /// threads.
+  struct BPThreadPool {
+    ThreadPool TheThreadPool;
+    std::mutex mtx;
+    std::condition_variable cv;
+    /// The number of threads that could spawn more threads
+    std::atomic<int> NumActiveThreads = 0;
+    /// Only true when all threads are down spawning new threads
+    bool IsFinishedSpawning = false;
+    /// Asynchronous submission of the task to the pool
+    template <typename Func> void async(Func &&F);
+    /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
+    /// acceptable for other threads to add more tasks while blocking on this
+    /// call.
+    void wait();
+  };
+
+  /// Run a recursive bisection of a given list of FunctionNodes
+  /// \param RecDepth the current depth of recursion
+  /// \param RootBucket the initial bucket of the dataVertices
+  /// \param Offset the assigned buckets are the range [Offset, Offset +
+  /// Nodes.size()]
+  void bisect(const FunctionNodeRange Nodes, uint32_t RecDepth,
+              uint32_t RootBucket, uint32_t Offset,
+              std::optional<BPThreadPool> &TP) const;
+
+  /// Run bisection iterations
+  /// \returns true iff a progress has been made
+  bool runIterations(const FunctionNodeRange Nodes, uint32_t RecDepth,
+                     uint32_t LeftBucket, uint32_t RightBucket,
+                     std::mt19937 &RNG) const;
+
+  /// Run a bisection iteration to improve the optimization goal
+  /// \returns the total number of moved FunctionNodes
+  uint32_t runIteration(const FunctionNodeRange Nodes, uint32_t LeftBucket,
+                        uint32_t RightBucket, SignatureMapT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Try to move \p N from one bucket to another
+  /// \returns true iff \p N is moved
+  bool moveFunctionNode(BPFunctionNode &N, uint32_t LeftBucket,
+                        uint32_t RightBucket, SignatureMapT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
+  /// 1 The method is used for an initial assignment before a bisection step
+  void split(const FunctionNodeRange Nodes, uint32_t StartBucket) const;
+
+  /// Initialize utility node signature before a bisection iteration
+  // void prepareSignature(UtilitySignature &Signature) const;
+
+  /// Compute the move gain for uniform log-gap cost
+  double moveGain(const BPFunctionNode &N, bool FromLeftToRight,
+                  const SignatureMapT &Signatures) const;
+
+  /// The cost of the uniform log-gap cost, assuming a utility node has \p X
+  /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
+  double logCost(uint32_t X, uint32_t Y) const;
+
+  /// Compute an average optimization goal for a given utility node signature
+  double computeGoal(const SignatureMapT &Signatures) const;
+
+  double log2Cached(uint32_t i) const;
+
+private:
+  const BalancedPartitioningConfig &Config;
+
+  /// Precomputed values of log2(x). Table size is small enough to fit in cache.
+  static constexpr uint32_t LOG_CACHE_SIZE = 16384;
+  double Log2Cache[LOG_CACHE_SIZE];
+
+  /// The signature of a particular utility node used for the bisection step,
+  /// i.e., the number of \p FunctionNodes in each of the two buckets
+  struct UtilitySignature {
+    /// The number of \p FunctionNodes in the left bucket
+    uint32_t LeftCount = 0;
+    /// The number of \p FunctionNodes in the right bucket
+    uint32_t RightCount = 0;
+    /// The cached cost of moving a \p FunctionNode between buckets. The first
+    /// element is from left to right and the second is from right to left.
+    std::optional<std::pair<double, double>> CachedCost = {};
+  };
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Support/BalancedPartitioning.cpp
@@ -0,0 +1,371 @@
+//===- BalancedPartitioning.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "balanced-partitioning"
+
+std::vector<BPFunctionNode> BPFunctionNode::fromTemporalProfTraces(
+    const SmallVectorImpl<TemporalProfTraceTy> &Traces,
+    ArrayRef<uint32_t> Cutoffs) {
+  // Collect all function IDs ordered by their smallest timestamp. This will be
+  // used as the initial FunctionNode order.
+  SetVector<IDT> FunctionIds;
+  size_t LargestTraceSize = 0;
+  for (auto &Trace : Traces)
+    LargestTraceSize = std::max(LargestTraceSize, Trace.size());
+  for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
+    for (auto &Trace : Traces)
+      if (Timestamp < Trace.size())
+        FunctionIds.insert(Trace[Timestamp]);
+
+  DenseMap<IDT, std::vector<UtilityNodeT>> FuncGroups;
+  for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
+    auto &Trace = Traces[TraceIdx];
+    for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
+      for (size_t CutoffIdx = 0; CutoffIdx < Cutoffs.size(); CutoffIdx++) {
+        if (Timestamp <= Cutoffs[CutoffIdx]) {
+          auto &FunctionId = Trace[Timestamp];
+          UtilityNodeT GroupId = TraceIdx * Cutoffs.size() + CutoffIdx;
+          FuncGroups[FunctionId].push_back(GroupId);
+        }
+      }
+    }
+  }
+
+  std::vector<BPFunctionNode> Nodes;
+  for (auto &Id : FunctionIds) {
+    auto &UNs = FuncGroups[Id];
+    llvm::sort(UNs);
+    UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+    Nodes.emplace_back(Id, UNs);
+  }
+  return Nodes;
+}
+
+template <typename Func>
+void BalancedPartitioning::BPThreadPool::async(Func &&F) {
+  // This new thread could spawn more threads, so mark it as active
+  ++NumActiveThreads;
+  TheThreadPool.async([=]() {
+    // Run the task
+    F();
+
+    // This thread will no longer spawn new threads, so mark it as inactive
+    if (--NumActiveThreads == 0) {
+      // There are no more active threads, so mark as finished and notify
+      {
+        std::unique_lock<std::mutex> lock(mtx);
+        assert(!IsFinishedSpawning);
+        IsFinishedSpawning = true;
+      }
+      cv.notify_one();
+    }
+  });
+}
+
+void BalancedPartitioning::BPThreadPool::wait() {
+  // TODO: We could remove the mutex and condition variable and use
+  // std::atomic::wait() instead, but that isn't available until C++20
+  {
+    std::unique_lock<std::mutex> lock(mtx);
+    cv.wait(lock, [&]() { return IsFinishedSpawning; });
+    assert(IsFinishedSpawning && NumActiveThreads == 0);
+  }
+  // Now we can call ThreadPool::wait() since all tasks have been submitted
+  TheThreadPool.wait();
+}
+
+BalancedPartitioning::BalancedPartitioning(
+    const BalancedPartitioningConfig &Config)
+    : Config(Config) {
+  // Pre-computing log2 values
+  Log2Cache[0] = 0.0;
+  for (uint32_t I = 1; I < LOG_CACHE_SIZE; I++)
+    Log2Cache[I] = std::log2(I);
+}
+
+void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const {
+  LLVM_DEBUG(dbgs() << "Partitioning " << Nodes.size() << " nodes using depth "
+                    << Config.SplitDepth << " and " << Config.IterationsPerSplit
+                    << " iterations per split\n");
+  std::optional<BPThreadPool> TP;
+  if (Config.TaskSplitDepth > 1)
+    TP.emplace();
+
+  // Record the input order
+  for (unsigned I = 0; I < Nodes.size(); I++)
+    Nodes[I].InputOrderIndex = I;
+
+  auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end());
+  auto BisectTask = [=, &TP]() {
+    bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP);
+  };
+  if (TP) {
+    TP->async(std::move(BisectTask));
+    TP->wait();
+  } else {
+    BisectTask();
+  }
+
+  llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) {
+    return L.Bucket < R.Bucket;
+  });
+
+  LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n");
+}
+
+void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
+                                  uint32_t RecDepth, uint32_t RootBucket,
+                                  uint32_t Offset,
+                                  std::optional<BPThreadPool> &TP) const {
+  uint32_t NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  if (NumNodes < 1 || RecDepth >= Config.SplitDepth) {
+    // We've reach the lowest level of the recursion tree. Fall back to the
+    // original order and assign to buckets.
+    llvm::stable_sort(Nodes, [](const auto &L, const auto &R) {
+      return L.InputOrderIndex < R.InputOrderIndex;
+    });
+    for (auto &N : Nodes)
+      N.Bucket = Offset++;
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << "Bisect with " << NumNodes << " nodes and root bucket "
+                    << RootBucket << "\n");
+
+  std::mt19937 RNG(RootBucket);
+
+  uint32_t LeftBucket = 2 * RootBucket;
+  uint32_t RightBucket = 2 * RootBucket + 1;
+
+  // Split into two and assign to the left and right buckets
+  split(Nodes, LeftBucket);
+
+  bool Improvement =
+      runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);
+  if (!Improvement) {
+    // No improvement, so assign buckets and stop
+    for (auto &N : Nodes)
+      N.Bucket = Offset++;
+    return;
+  }
+
+  // Split nodes wrt the resulting buckets
+  auto NodesMid =
+      llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
+  uint32_t MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);
+
+  auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
+  auto RightNodes = llvm::make_range(NodesMid, Nodes.end());
+
+  auto LeftRecTask = [=, &TP]() {
+    bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
+  };
+  auto RightRecTask = [=, &TP]() {
+    bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
+  };
+
+  if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
+    TP->async(std::move(LeftRecTask));
+    TP->async(std::move(RightRecTask));
+  } else {
+    LeftRecTask();
+    RightRecTask();
+  }
+}
+
+bool BalancedPartitioning::runIterations(const FunctionNodeRange Nodes,
+                                         uint32_t RecDepth, uint32_t LeftBucket,
+                                         uint32_t RightBucket,
+                                         std::mt19937 &RNG) const {
+  uint32_t NumNodes = std::distance(Nodes.begin(), Nodes.end());
+
+  // Initialize signatures. Empirically the number of corresponding UtilityNodes
+  // is 4x larger than the number of FunctionNodes.
+  SignatureMapT Signatures;
+  Signatures.reserve(NumNodes * 4);
+  for (auto &N : Nodes) {
+    for (auto &UN : N.UtilityNodes) {
+      if (N.Bucket == LeftBucket) {
+        Signatures[UN].LeftCount++;
+      } else {
+        Signatures[UN].RightCount++;
+      }
+    }
+  }
+
+  double InitialGoal = computeGoal(Signatures);
+
+  for (uint32_t Iter = 0; Iter < Config.IterationsPerSplit; Iter++) {
+    uint32_t NumMovedNodes =
+        runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG);
+    if (NumMovedNodes == 0)
+      break;
+  }
+
+  double FinalGoal = computeGoal(Signatures);
+  // Return true iff progress has been made
+  return InitialGoal > FinalGoal;
+}
+
+uint32_t BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
+                                            uint32_t LeftBucket,
+                                            uint32_t RightBucket,
+                                            SignatureMapT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Init signature cost caches
+  for (auto &[UN, Signature] : Signatures) {
+    if (Signature.CachedCost.has_value())
+      continue;
+    uint32_t L = Signature.LeftCount;
+    uint32_t R = Signature.RightCount;
+    assert((L > 0 || R > 0) && "incorrect signature");
+    // cost = x * log(U / (x+1)) + y * log(U / (y+1)) =
+    //      = x * log(U) + y * log(U) - (x * log(x+1) + y * log(y+1)) =
+    //      = U * log(U) - (x * log(x+1) + y * log(y+1))
+    double cost = logCost(L, R);
+    double CostLR = 0, CostRL = 0;
+    if (L > 0)
+      CostLR = cost - logCost(L - 1, R + 1);
+    if (R > 0)
+      CostRL = cost - logCost(L + 1, R - 1);
+    Signature.CachedCost = std::make_pair(CostLR, CostRL);
+  }
+
+  // Compute move gains
+  typedef std::pair<double, BPFunctionNode *> GainPair;
+  std::vector<GainPair> Gains;
+  for (auto &N : Nodes) {
+    bool FromLeftToRight = (N.Bucket == LeftBucket);
+    double Gain = moveGain(N, FromLeftToRight, Signatures);
+    Gains.push_back(std::make_pair(Gain, &N));
+  }
+
+  // Collect left and right gains
+  auto LeftGains = Gains.begin();
+  auto LeftEnd = llvm::partition(
+      Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
+
+  auto RightGains = LeftEnd;
+  auto RightEnd = Gains.end();
+
+  // Sort gains
+  auto LargerGain = [](const auto &L, const auto &R) {
+    return L.first > R.first;
+  };
+  std::stable_sort(LeftGains, LeftEnd, LargerGain);
+  std::stable_sort(RightGains, RightEnd, LargerGain);
+
+  // Exchange: change buckets and update queryVertex signatures
+  uint32_t NumMovedDataVertices = 0;
+  uint32_t MinSize = std::min(std::distance(LeftGains, LeftEnd),
+                              std::distance(RightGains, RightEnd));
+  for (uint32_t I = 0; I < MinSize; I++) {
+    if (LeftGains[I].first + RightGains[I].first <= 0.0)
+      break;
+    // Try to swap the two nodes
+    NumMovedDataVertices += moveFunctionNode(*LeftGains[I].second, LeftBucket,
+                                             RightBucket, Signatures, RNG);
+    NumMovedDataVertices += moveFunctionNode(*RightGains[I].second, LeftBucket,
+                                             RightBucket, Signatures, RNG);
+  }
+  return NumMovedDataVertices;
+}
+
+bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N,
+                                            uint32_t LeftBucket,
+                                            uint32_t RightBucket,
+                                            SignatureMapT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Sometimes we skip the move. This helps to escape local optima
+  if (std::uniform_real_distribution<double>(0.0, 1.0)(RNG) <=
+      Config.SkipProbability)
+    return false;
+
+  bool FromLeftToRight = (N.Bucket == LeftBucket);
+
+  // Update the current bucket
+  N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
+
+  // Update signatures and invalidate cost cache
+  for (auto &UN : N.UtilityNodes) {
+    auto &Signature = Signatures[UN];
+    if (FromLeftToRight) {
+      Signature.LeftCount--;
+      Signature.RightCount++;
+    } else {
+      Signature.LeftCount++;
+      Signature.RightCount--;
+    }
+    Signature.CachedCost = {};
+  }
+
+  return true;
+}
+
+void BalancedPartitioning::split(const FunctionNodeRange Nodes,
+                                 uint32_t StartBucket) const {
+  uint32_t NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
+
+  std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) {
+    return L.InputOrderIndex < R.InputOrderIndex;
+  });
+
+  for (auto &N : llvm::make_range(Nodes.begin(), NodesMid))
+    N.Bucket = StartBucket;
+  for (auto &N : llvm::make_range(NodesMid, Nodes.end()))
+    N.Bucket = StartBucket + 1;
+}
+
+double BalancedPartitioning::moveGain(const BPFunctionNode &N,
+                                      bool FromLeftToRight,
+                                      const SignatureMapT &Signatures) const {
+  double Gain = 0;
+  for (auto &UN : N.UtilityNodes) {
+    auto &Signature = Signatures.find(UN)->second;
+    auto &[CostLR, CostRL] = Signature.CachedCost.value();
+    Gain += (FromLeftToRight ? CostLR : CostRL);
+  }
+  return Gain;
+}
+
+double BalancedPartitioning::logCost(uint32_t X, uint32_t Y) const {
+  return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1));
+}
+
+double
+BalancedPartitioning::computeGoal(const SignatureMapT &Signatures) const {
+  double Sum = 0;
+  double Cnt = 0;
+  for (auto &[UN, Signature] : Signatures) {
+    uint32_t L = Signature.LeftCount;
+    uint32_t R = Signature.RightCount;
+    assert((L > 0 || R > 0) && "incorrect signature");
+    // Shifting opt goal to the interval [0..xxx)
+    Sum += logCost(L, R) - logCost(L + R, 0);
+    Cnt += L + R;
+  }
+  return Cnt > 0 ? Sum / Cnt : 0;
+}
+
+double BalancedPartitioning::log2Cached(uint32_t i) const {
+  return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -129,6 +129,7 @@
   Allocator.cpp
   AutoConvert.cpp
   Base64.cpp
+  BalancedPartitioning.cpp
   BinaryStreamError.cpp
   BinaryStreamReader.cpp
   BinaryStreamRef.cpp
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -0,0 +1,40 @@
+# RUN: llvm-profdata order --cutoffs=0,1,2,3 %s | FileCheck %s
+
+# CHECK: a
+# CHECK: b
+# CHECK: c
+# CHECK: x
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+3
+# Trace Stream Size:
+3
+a, main.c:b, c
+a, x, main.c:b, c
+a, main.c:b, c
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+main.c:b
+0x5678
+1
+202
+
+c
+0xabcd
+1
+303
+
+x
+0xefff
+1
+404
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ProfileData/RawMemProfReader.h"
 #include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
 #include "llvm/Support/Errc.h"
@@ -3034,6 +3035,58 @@
   return showMemProfProfile(Filename, ProfiledBinary, SFormat, OS);
 }
 
+static int order_main(int argc, const char *argv[]) {
+  cl::opt<std::string> Filename(cl::Positional, cl::desc("<profdata-file>"));
+  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                      cl::init("-"), cl::desc("Output file"));
+  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                            cl::aliasopt(OutputFilename));
+  cl::list<uint32_t> Cutoffs(
+      "cutoffs", cl::CommaSeparated,
+      cl::desc("Timestamp cutoff values used to initialize function nodes for "
+               "ordering functions"),
+      cl::list_init(
+          ArrayRef<uint32_t>({1000, 2000, 4000, 8000, 15000, 20000, 30000})));
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data order\n");
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = InstrProfReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  for (auto &I : *Reader) {
+    // Read all entries
+    (void)I;
+  }
+  auto &Traces = Reader->getTemporalProfTraces();
+
+  auto Nodes = BPFunctionNode::fromTemporalProfTraces(Traces, Cutoffs);
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning BP(Config);
+  BP.run(Nodes);
+
+  WithColor::note() << "# Ordered " << Nodes.size() << " functions\n";
+  for (auto &N : Nodes) {
+    auto FuncName = Reader->getSymtab().getFuncName(N.Id);
+    if (FuncName.contains(':')) {
+      // GlobalValue::getGlobalIdentifier() prefixes the filename if the symbol
+      // is local. This logic will break if there is a colon in the filename,
+      // but we cannot use rsplit() because ObjC symbols can have colons.
+      auto [Filename, ParsedFuncName] = FuncName.split(':');
+      // Emit a comment describing where this symbol came from
+      OS << "# " << Filename << "\n";
+      FuncName = ParsedFuncName;
+    }
+    OS << FuncName << "\n";
+  }
+  return 0;
+}
+
 int llvm_profdata_main(int argc, char **argvNonConst,
                        const llvm::ToolContext &) {
   const char **argv = const_cast<const char **>(argvNonConst);
@@ -3049,6 +3102,8 @@
       func = show_main;
     else if (strcmp(argv[1], "overlap") == 0)
       func = overlap_main;
+    else if (strcmp(argv[1], "order") == 0)
+      func = order_main;
 
     if (func) {
       std::string Invocation(ProgName.str() + " " + argv[1]);
@@ -3073,6 +3128,6 @@
   else
     errs() << ProgName << ": Unknown command!\n";
 
-  errs() << "USAGE: " << ProgName << " <merge|show|overlap> [args...]\n";
+  errs() << "USAGE: " << ProgName << " <merge|show|overlap|order> [args...]\n";
   return 1;
 }
diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp
@@ -0,0 +1,73 @@
+//===- BalancedPartitioningTest.cpp - BalancedPartitioning tests ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace {
+using ::testing::ElementsAre;
+using ::testing::UnorderedElementsAreArray;
+
+class BalancedPartitioningTest : public ::testing::Test {
+protected:
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning Bp;
+  BalancedPartitioningTest() : Bp(Config) {}
+};
+
+static std::vector<BPFunctionNode::IDT>
+getOrderedIds(std::vector<BPFunctionNode> Nodes) {
+  std::vector<BPFunctionNode::IDT> OrderedIds;
+  for (auto &N : Nodes)
+    OrderedIds.push_back(N.Id);
+  return OrderedIds;
+}
+
+TEST_F(BalancedPartitioningTest, Basic) {
+  std::vector<BPFunctionNode::UtilityNodeT> UN1 = {1, 2}, UN2 = {3, 4},
+                                            UN3 = {4};
+  std::vector<BPFunctionNode> Nodes = {
+      BPFunctionNode(1, UN1), //
+      BPFunctionNode(3, UN2), //
+      BPFunctionNode(2, UN1), //
+      BPFunctionNode(4, UN2), //
+      BPFunctionNode(5, UN3), //
+  };
+
+  Bp.run(Nodes);
+
+  EXPECT_THAT(getOrderedIds(Nodes), ElementsAre(1, 2, 3, 4, 5));
+}
+
+TEST_F(BalancedPartitioningTest, Large) {
+  std::vector<BPFunctionNode::UtilityNodeT> AllUNs;
+  for (int i = 0; i < 100; i++)
+    AllUNs.emplace_back(i);
+
+  std::mt19937 RNG;
+  std::vector<BPFunctionNode> Nodes;
+  for (int i = 0; i < 1000; i++) {
+    std::vector<BPFunctionNode::UtilityNodeT> UNs;
+    int SampleSize =
+        std::uniform_int_distribution<int>(0, AllUNs.size() - 1)(RNG);
+    std::sample(AllUNs.begin(), AllUNs.end(), std::back_inserter(UNs),
+                SampleSize, RNG);
+    Nodes.emplace_back(i, UNs);
+  }
+
+  auto OrigOrderedIds = getOrderedIds(Nodes);
+
+  Bp.run(Nodes);
+
+  EXPECT_THAT(getOrderedIds(Nodes), UnorderedElementsAreArray(OrigOrderedIds));
+}
+
+} // namespace
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -14,6 +14,7 @@
   BinaryStreamTest.cpp
   BLAKE3Test.cpp
   BlockFrequencyTest.cpp
+  BalancedPartitioningTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
   CrashRecoveryTest.cpp