diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst
--- a/llvm/docs/CommandGuide/llvm-profdata.rst
+++ b/llvm/docs/CommandGuide/llvm-profdata.rst
@@ -20,6 +20,7 @@
 * :ref:`merge <profdata-merge>`
 * :ref:`show <profdata-show>`
 * :ref:`overlap <profdata-overlap>`
+* :ref:`order <profdata-order>`
 
 .. program:: llvm-profdata merge
 
@@ -418,6 +419,40 @@
  Only show overlap for the context sensitive profile counts. The default is to show
  non-context sensitive profile counts.
 
+.. program:: llvm-profdata order
+
+.. _profdata-order:
+
+ORDER
+-------
+
+SYNOPSIS
+^^^^^^^^
+
+:program:`llvm-profdata order` [*options*] [*filename*]
+
+DESCRIPTION
+^^^^^^^^^^^
+
+:program:`llvm-profdata order` uses temporal profiling traces from a profile and
+finds a function order that reduces the number of page faults for those traces.
+This output can be directly passed to ``lld`` via ``--symbol-ordering-file=``
+for ELF or ``-order-file`` for Mach-O. If the traces found in the profile are
+representative of the real world, then this order should improve startup
+performance.
+
+OPTIONS
+^^^^^^^
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. option:: --output=<output>, -o
+
+ Specify the output file name.  If *output* is ``-`` or it isn't specified,
+ then the output is sent to standard output.
+
 EXIT STATUS
 -----------
 
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -336,8 +336,11 @@
 /// An ordered list of functions identified by their NameRef found in
 /// INSTR_PROF_DATA
 struct TemporalProfTraceTy {
-  uint64_t Weight = 1;
   std::vector<uint64_t> FunctionNameRefs;
+  uint64_t Weight;
+  TemporalProfTraceTy(std::initializer_list<uint64_t> Trace = {},
+                      uint64_t Weight = 1)
+      : FunctionNameRefs(Trace), Weight(Weight) {}
 };
 
 inline std::error_code make_error_code(instrprof_error E) {
diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/BalancedPartitioning.h
@@ -0,0 +1,200 @@
+//===- BalancedPartitioning.h ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+// The algorithm is used to find an ordering of FunctionNodes while optimizing
+// a specified objective. The algorithm uses recursive bisection; it starts
+// with a collection of unordered FunctionNodes and tries to split them into
+// two sets (buckets) of equal cardinality. Each bisection step is comprised of
+// iterations that greedily swap the FunctionNodes between the two buckets while
+// there is an improvement of the objective. Once the process converges, the
+// problem is divided into two sub-problems of half the size, which are
+// recursively applied for the two buckets. The final ordering of the
+// FunctionNodes is obtained by concatenating the two (recursively computed)
+// orderings.
+//
+// In order to speed up the computation, we limit the depth of the recursive
+// tree by a specified constant (SplitDepth) and apply at most a constant
+// number of greedy iterations per split (IterationsPerSplit). The worst-case
+// time complexity of the implementation is bounded by O(M*log^2 N), where
+// N is the number of FunctionNodes and M is the number of
+// FunctionNode-UtilityNode edges; (assuming that any collection of D
+// FunctionNodes contains O(D) UtilityNodes). Notice that the two different
+// recursive sub-problems are independent and thus can be efficiently processed
+// in parallel.
+//
+// Reference:
+//   * Optimizing Function Layout for Mobile Applications,
+//     https://arxiv.org/abs/2211.09285
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
+#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
+
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/ThreadPool.h"
+
+#include <random>
+#include <vector>
+
+namespace llvm {
+
+/// A function with a set of utility nodes where it is beneficial to order two
+/// functions close together if they have similar utility nodes
+class BPFunctionNode {
+  friend class BalancedPartitioning;
+
+public:
+  using IDT = uint64_t;
+  using UtilityNodeT = uint32_t;
+  /// \param Id the ID of this node
+  /// \param UtilityNodes the set of utility nodes (must be unique'd)
+  BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
+      : Id(Id), UtilityNodes(UtilityNodes) {}
+
+  /// The ID of this node
+  IDT Id;
+
+  /// Construct a list of nodes from a set of temporal profile traces
+  static std::vector<BPFunctionNode>
+  fromTemporalProfTraces(const SmallVectorImpl<TemporalProfTraceTy> &Traces);
+  /// Print the ID, utility nodes, and bucket of this node to \p OS
+  void dump(raw_ostream &OS) const;
+
+protected:
+  /// The list of utility nodes associated with this node
+  SmallVector<UtilityNodeT, 4> UtilityNodes;
+  /// The bucket assigned by balanced partitioning; empty until then
+  std::optional<unsigned> Bucket;
+  /// The position of this node in the input of BalancedPartitioning::run()
+  uint64_t InputOrderIndex = 0;
+  // Give the unit tests access to the protected members
+  friend class BPFunctionNodeTest_Basic_Test;
+  friend class BalancedPartitioningTest_Basic_Test;
+  friend class BalancedPartitioningTest_Large_Test;
+};
+
+/// Algorithm parameters; default values are tuned on real-world binaries
+struct BalancedPartitioningConfig {
+  /// The maximum depth of the recursive bisection
+  unsigned SplitDepth = 18;
+  /// The maximum number of bp iterations per split
+  unsigned IterationsPerSplit = 40;
+  /// The probability for a vertex to skip a move from its current bucket to
+  /// another bucket; it often helps to escape from a local optimum
+  float SkipProbability = 0.1;
+  /// Recursive subtasks up to the given depth are added to the queue and
+  /// distributed among threads by ThreadPool; all subsequent calls are executed
+  /// on the same thread
+  unsigned TaskSplitDepth = 9;
+};
+
+class BalancedPartitioning {
+public:
+  BalancedPartitioning(const BalancedPartitioningConfig &Config);
+
+  /// Run recursive graph partitioning on \p Nodes and reorder them by bucket.
+  void run(std::vector<BPFunctionNode> &Nodes) const;
+
+private:
+  struct UtilitySignature;
+  using SignaturesT = SmallVector<UtilitySignature, 4>;
+  using FunctionNodeRange =
+      iterator_range<std::vector<BPFunctionNode>::iterator>;
+
+  /// A special ThreadPool that allows for spawning new tasks after blocking on
+  /// wait(). BalancedPartitioning recursively spawns new threads inside other
+  /// threads, so we need to track how many active threads could still spawn
+  /// more threads.
+  struct BPThreadPool {
+    ThreadPool TheThreadPool;
+    std::mutex mtx;
+    std::condition_variable cv;
+    /// The number of threads that could spawn more threads
+    std::atomic<int> NumActiveThreads = 0;
+    /// Only true when all threads are done spawning new threads
+    bool IsFinishedSpawning = false;
+    /// Asynchronous submission of the task to the pool
+    template <typename Func> void async(Func &&F);
+    /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
+    /// acceptable for other threads to add more tasks while blocking on this
+    /// call.
+    void wait();
+  };
+
+  /// Run a recursive bisection of a given list of FunctionNodes
+  /// \param RecDepth the current depth of recursion
+  /// \param RootBucket the initial bucket of the FunctionNodes
+  /// \param Offset the assigned buckets are the range [Offset, Offset +
+  /// Nodes.size())
+  void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
+              unsigned RootBucket, unsigned Offset,
+              std::optional<BPThreadPool> &TP) const;
+
+  /// Run bisection iterations
+  void runIterations(const FunctionNodeRange Nodes, unsigned RecDepth,
+                     unsigned LeftBucket, unsigned RightBucket,
+                     std::mt19937 &RNG) const;
+
+  /// Run a bisection iteration to improve the optimization goal
+  /// \returns the total number of moved FunctionNodes
+  unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
+                        unsigned RightBucket, SignaturesT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Try to move \p N from one bucket to another
+  /// \returns true iff \p N is moved
+  bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
+                        unsigned RightBucket, SignaturesT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Split all the FunctionNodes into 2 buckets, StartBucket and StartBucket +
+  /// 1. The method is used for an initial assignment before a bisection step
+  void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
+
+  /// Compute the move gain for uniform log-gap cost
+  float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
+                 const SignaturesT &Signatures) const;
+
+  /// The uniform log-gap cost, assuming a utility node has \p X
+  /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
+  float logCost(unsigned X, unsigned Y) const;
+  /// \returns log2(\p i), using Log2Cache when \p i is within the table
+  float log2Cached(unsigned i) const;
+
+private:
+  const BalancedPartitioningConfig &Config;
+
+  /// Precomputed values of log2(x). Table size is small enough to fit in cache.
+  static constexpr unsigned LOG_CACHE_SIZE = 16384;
+  float Log2Cache[LOG_CACHE_SIZE];
+
+  /// The signature of a particular utility node used for the bisection step,
+  /// i.e., the number of \p FunctionNodes in each of the two buckets
+  struct UtilitySignature {
+    /// The number of \p FunctionNodes in the left bucket
+    unsigned LeftCount = 0;
+    /// The number of \p FunctionNodes in the right bucket
+    unsigned RightCount = 0;
+    /// The cached cost of moving a \p FunctionNode from the left bucket to the
+    /// right bucket
+    float CachedCostLR;
+    /// The cached cost of moving a \p FunctionNode from the right bucket to the
+    /// left bucket
+    float CachedCostRL;
+    /// Whether \p CachedCostLR and \p CachedCostRL are valid
+    bool CachedCostIsValid = false;
+  };
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Support/BalancedPartitioning.cpp
@@ -0,0 +1,376 @@
+//===- BalancedPartitioning.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "balanced-partitioning"
+
+std::vector<BPFunctionNode> BPFunctionNode::fromTemporalProfTraces(
+    const SmallVectorImpl<TemporalProfTraceTy> &Traces) {
+  // Collect all function IDs ordered by their smallest timestamp. This will be
+  // used as the initial FunctionNode order.
+  SetVector<IDT> FunctionIds;
+  size_t LargestTraceSize = 0;
+  for (auto &Trace : Traces)
+    LargestTraceSize =
+        std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
+  for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
+    for (auto &Trace : Traces)
+      if (Timestamp < Trace.FunctionNameRefs.size())
+        FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
+
+  // log2(0) is -inf and converting it to int is undefined; use 0 groups then.
+  int N = LargestTraceSize ? std::ceil(std::log2(LargestTraceSize)) : 0;
+
+  // A function at position Timestamp of trace TraceIdx joins the groups I of
+  // that trace with floor(log2(Timestamp + 1)) <= I < N (N groups per trace).
+  // TODO: We need to use the Trace.Weight field to give more weight to more
+  // important utilities
+  DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
+  for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
+    auto &Trace = Traces[TraceIdx].FunctionNameRefs;
+    for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++)
+      for (int I = std::floor(std::log2(Timestamp + 1)); I < N; I++)
+        FuncGroups[Trace[Timestamp]].push_back(UtilityNodeT(TraceIdx * N + I));
+  }
+
+  // Emit the nodes in FunctionIds order with sorted, unique'd utility nodes
+  std::vector<BPFunctionNode> Nodes;
+  for (auto &Id : FunctionIds) {
+    auto &UNs = FuncGroups[Id];
+    llvm::sort(UNs);
+    UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+    Nodes.emplace_back(Id, UNs);
+  }
+  return Nodes;
+}
+
+void BPFunctionNode::dump(raw_ostream &OS) const {
+  OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id,
+                make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket);
+}
+
+template <typename Func>
+void BalancedPartitioning::BPThreadPool::async(Func &&F) {
+  // Count the task as active before it is queued: it may submit more tasks
+  ++NumActiveThreads;
+  TheThreadPool.async([=]() {
+    F();
+
+    // This task can no longer submit tasks. Only the task that drops the
+    // active count to zero reports that spawning has finished.
+    if (--NumActiveThreads != 0)
+      return;
+    {
+      std::lock_guard<std::mutex> Guard(mtx);
+      assert(!IsFinishedSpawning);
+      IsFinishedSpawning = true;
+    }
+    // Notify outside of the critical section
+    cv.notify_one();
+  });
+}
+
+void BalancedPartitioning::BPThreadPool::wait() {
+  // Block until the last active task has reported that no more tasks will be
+  // submitted. TODO: Use std::atomic::wait() once C++20 is available
+  {
+    std::unique_lock<std::mutex> Lock(mtx);
+    while (!IsFinishedSpawning)
+      cv.wait(Lock);
+    assert(IsFinishedSpawning && NumActiveThreads == 0);
+  }
+  TheThreadPool.wait(); // Safe now: all tasks have been submitted
+}
+
+BalancedPartitioning::BalancedPartitioning(
+    const BalancedPartitioningConfig &Config)
+    : Config(Config) {
+  // Fill the log2 lookup table. log2(0) is not finite, so slot 0 holds 0.
+  Log2Cache[0] = 0.0;
+  for (unsigned Arg = 1; Arg != LOG_CACHE_SIZE; ++Arg)
+    Log2Cache[Arg] = std::log2(Arg);
+}
+
+void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const {
+  LLVM_DEBUG(
+      dbgs() << format(
+          "Partitioning %d nodes using depth %d and %d iterations per split\n",
+          Nodes.size(), Config.SplitDepth, Config.IterationsPerSplit));
+  std::optional<BPThreadPool> TP;
+  if (Config.TaskSplitDepth > 1)
+    TP.emplace();
+
+  // Record the input order; split() and the leaves of bisect() rely on it
+  for (unsigned I = 0; I < Nodes.size(); I++)
+    Nodes[I].InputOrderIndex = I;
+  // Bisect the whole range; the leaves assign buckets starting at offset 0
+  auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end());
+  auto BisectTask = [=, &TP]() {
+    bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP);
+  };
+  if (TP) {
+    TP->async(std::move(BisectTask));
+    TP->wait();
+  } else {
+    BisectTask();
+  }
+  // Order the nodes by their assigned bucket, keeping ties in place
+  llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) {
+    return L.Bucket < R.Bucket;
+  });
+
+  LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n");
+}
+
+void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
+                                  unsigned RecDepth, unsigned RootBucket,
+                                  unsigned Offset,
+                                  std::optional<BPThreadPool> &TP) const {
+  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) {
+    // Nothing is left to split or the depth limit has been reached. Fall back
+    // to the original order and assign consecutive buckets from Offset.
+    llvm::stable_sort(Nodes, [](const auto &L, const auto &R) {
+      return L.InputOrderIndex < R.InputOrderIndex;
+    });
+    for (auto &N : Nodes)
+      N.Bucket = Offset++;
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << format("Bisect with %d nodes and root bucket %d\n",
+                              NumNodes, RootBucket));
+  // Each bisection owns a generator that is seeded with its bucket id
+  std::mt19937 RNG(RootBucket);
+
+  unsigned LeftBucket = 2 * RootBucket;
+  unsigned RightBucket = 2 * RootBucket + 1;
+
+  // Split into two and assign to the left and right buckets
+  split(Nodes, LeftBucket);
+  // Greedily exchange nodes between the two buckets
+  runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);
+
+  // Split nodes wrt the resulting buckets
+  auto NodesMid =
+      llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
+  unsigned MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);
+
+  auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
+  auto RightNodes = llvm::make_range(NodesMid, Nodes.end());
+  // Capture by value: queued tasks may run after this call has returned
+  auto LeftRecTask = [=, &TP]() {
+    bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
+  };
+  auto RightRecTask = [=, &TP]() {
+    bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
+  };
+  // Only shallow, large enough subproblems are handed to the thread pool
+  if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
+    TP->async(std::move(LeftRecTask));
+    TP->async(std::move(RightRecTask));
+  } else {
+    LeftRecTask();
+    RightRecTask();
+  }
+}
+
+void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes,
+                                         unsigned RecDepth, unsigned LeftBucket,
+                                         unsigned RightBucket,
+                                         std::mt19937 &RNG) const {
+  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeDegree;
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      ++UtilityNodeDegree[UN];
+  // Remove utility nodes if they have just one edge or are connected to all
+  // functions. Note that this modifies the nodes' utility lists in place.
+  for (auto &N : Nodes)
+    llvm::erase_if(N.UtilityNodes, [&](auto &UN) {
+      return UtilityNodeDegree[UN] <= 1 || UtilityNodeDegree[UN] >= NumNodes;
+    });
+
+  // Renumber utility nodes so they can be used to index into Signatures
+  DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeIndex;
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      if (!UtilityNodeIndex.count(UN))
+        UtilityNodeIndex[UN] = UtilityNodeIndex.size();
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      UN = UtilityNodeIndex[UN];
+
+  // Count, per utility node, the functions in the left and right bucket
+  SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size());
+  for (auto &N : Nodes) {
+    for (auto &UN : N.UtilityNodes) {
+      assert(UN < Signatures.size());
+      if (N.Bucket == LeftBucket) {
+        Signatures[UN].LeftCount++;
+      } else {
+        Signatures[UN].RightCount++;
+      }
+    }
+  }
+  // Iterate until no node is moved or the iteration limit is reached
+  for (unsigned I = 0; I < Config.IterationsPerSplit; I++) {
+    unsigned NumMovedNodes =
+        runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG);
+    if (NumMovedNodes == 0)
+      break;
+  }
+}
+
+unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
+                                            unsigned LeftBucket,
+                                            unsigned RightBucket,
+                                            SignaturesT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Recompute the cached move costs of the signatures that are not valid
+  for (auto &Signature : Signatures) {
+    if (Signature.CachedCostIsValid)
+      continue;
+    unsigned L = Signature.LeftCount;
+    unsigned R = Signature.RightCount;
+    assert((L > 0 || R > 0) && "incorrect signature");
+    // cost = x * log(U / (x+1)) + y * log(U / (y+1)) =
+    //      = x * log(U) + y * log(U) - (x * log(x+1) + y * log(y+1)) =
+    //      = U * log(U) - (x * log(x+1) + y * log(y+1))
+    float cost = logCost(L, R);
+    float CostLR = 0, CostRL = 0;
+    if (L > 0)
+      CostLR = cost - logCost(L - 1, R + 1);
+    if (R > 0)
+      CostRL = cost - logCost(L + 1, R - 1);
+    Signature.CachedCostLR = CostLR;
+    Signature.CachedCostRL = CostRL;
+    Signature.CachedCostIsValid = true;
+  }
+
+  // Compute the gain of moving each node to the opposite bucket
+  typedef std::pair<float, BPFunctionNode *> GainPair;
+  std::vector<GainPair> Gains;
+  for (auto &N : Nodes) {
+    bool FromLeftToRight = (N.Bucket == LeftBucket);
+    float Gain = moveGain(N, FromLeftToRight, Signatures);
+    Gains.push_back(std::make_pair(Gain, &N));
+  }
+
+  // Group the gains by the current bucket of their node: left ones first
+  auto LeftGains = Gains.begin();
+  auto LeftEnd = llvm::partition(
+      Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
+
+  auto RightGains = LeftEnd;
+  auto RightEnd = Gains.end();
+
+  // Sort each group by decreasing gain
+  auto LargerGain = [](const auto &L, const auto &R) {
+    return L.first > R.first;
+  };
+  std::stable_sort(LeftGains, LeftEnd, LargerGain);
+  std::stable_sort(RightGains, RightEnd, LargerGain);
+  // Exchange: swap the best remaining left/right pair while the combined gain
+  // is positive, changing buckets and updating utility node signatures
+  unsigned NumMovedDataVertices = 0;
+  unsigned MinSize = std::min(std::distance(LeftGains, LeftEnd),
+                              std::distance(RightGains, RightEnd));
+  for (unsigned I = 0; I < MinSize; I++) {
+    if (LeftGains[I].first + RightGains[I].first <= 0.0)
+      break;
+    // Try to swap the two nodes; either move may be skipped randomly
+    NumMovedDataVertices += moveFunctionNode(*LeftGains[I].second, LeftBucket,
+                                             RightBucket, Signatures, RNG);
+    NumMovedDataVertices += moveFunctionNode(*RightGains[I].second, LeftBucket,
+                                             RightBucket, Signatures, RNG);
+  }
+  return NumMovedDataVertices;
+}
+
+bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N,
+                                            unsigned LeftBucket,
+                                            unsigned RightBucket,
+                                            SignaturesT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Randomly skip the move, governed by Config.SkipProbability; this
+  // perturbation helps the greedy search to escape local optima
+  std::uniform_real_distribution<float> Uniform(0.0, 1.0);
+  const bool SkipMove = Uniform(RNG) <= Config.SkipProbability;
+  if (SkipMove)
+    return false;
+
+  // Flip the bucket of the node
+  const bool WasInLeft = (N.Bucket == LeftBucket);
+  N.Bucket = WasInLeft ? RightBucket : LeftBucket;
+
+  // Shift one unit of every touched signature from the source side to the
+  // destination side and drop its cached costs, which are now stale
+  for (auto &UN : N.UtilityNodes) {
+    auto &Sig = Signatures[UN];
+    if (WasInLeft) {
+      --Sig.LeftCount;
+      ++Sig.RightCount;
+    } else {
+      ++Sig.LeftCount;
+      --Sig.RightCount;
+    }
+    Sig.CachedCostIsValid = false;
+  }
+
+  return true;
+}
+
+void BalancedPartitioning::split(const FunctionNodeRange Nodes,
+                                 unsigned StartBucket) const {
+  // The first half (rounded up) in input order goes to StartBucket, the rest
+  // to StartBucket + 1; the order within each half is left unspecified
+  unsigned Count = std::distance(Nodes.begin(), Nodes.end());
+  auto Mid = Nodes.begin() + (Count + 1) / 2;
+  auto ByInputOrder = [](auto &L, auto &R) {
+    return L.InputOrderIndex < R.InputOrderIndex;
+  };
+  std::nth_element(Nodes.begin(), Mid, Nodes.end(), ByInputOrder);
+
+  for (auto It = Nodes.begin(); It != Nodes.end(); ++It)
+    It->Bucket = (It < Mid) ? StartBucket : StartBucket + 1;
+}
+
+float BalancedPartitioning::moveGain(const BPFunctionNode &N,
+                                     bool FromLeftToRight,
+                                     const SignaturesT &Signatures) const {
+  // The gain of moving N is the sum, over all of its utility nodes, of the
+  // cached cost change for the requested direction. Every term has to be
+  // added to the running total; returning just the current term would make
+  // the gain depend only on the last utility node.
+  float Gain = 0.f;
+  for (auto &UN : N.UtilityNodes)
+    Gain += FromLeftToRight ? Signatures[UN].CachedCostLR
+                            : Signatures[UN].CachedCostRL;
+  return Gain;
+}
+
+float BalancedPartitioning::logCost(unsigned X, unsigned Y) const {
+  return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1));
+}
+
+float BalancedPartitioning::log2Cached(unsigned i) const {
+  return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -129,6 +129,7 @@
   Allocator.cpp
   AutoConvert.cpp
   Base64.cpp
+  BalancedPartitioning.cpp
   BinaryStreamError.cpp
   BinaryStreamReader.cpp
   BinaryStreamRef.cpp
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -0,0 +1,46 @@
+# RUN: llvm-profdata order %s | FileCheck %s
+
+# CHECK: a
+# CHECK: b
+# CHECK: c
+# CHECK: x
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+3
+# Trace Stream Size:
+3
+# Weight
+1
+a, main.c:b, c
+# Weight
+1
+a, x, main.c:b, c
+# Weight
+1
+a, main.c:b, c
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+main.c:b
+0x5678
+1
+202
+
+c
+0xabcd
+1
+303
+
+x
+0xefff
+1
+404
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ProfileData/RawMemProfReader.h"
 #include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
 #include "llvm/Support/Errc.h"
@@ -3034,6 +3035,51 @@
   return showMemProfProfile(Filename, ProfiledBinary, SFormat, OS);
 }
 
+static int order_main(int argc, const char *argv[]) {
+  cl::opt<std::string> Filename(cl::Positional, cl::desc("<profdata-file>"));
+  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                      cl::init("-"), cl::desc("Output file"));
+  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                            cl::aliasopt(OutputFilename));
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data order\n");
+  // Open the output stream ("-" is the default) and the profile reader
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = InstrProfReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  for (auto &I : *Reader) {
+    // Read all entries before querying the traces and the symbol table
+    (void)I;
+  }
+  auto &Traces = Reader->getTemporalProfTraces();
+  auto Nodes = BPFunctionNode::fromTemporalProfTraces(Traces);
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning BP(Config);
+  BP.run(Nodes);
+  // Report the node count as a note, then emit one symbol name per line
+  WithColor::note() << "# Ordered " << Nodes.size() << " functions\n";
+  for (auto &N : Nodes) {
+    auto FuncName = Reader->getSymtab().getFuncName(N.Id);
+    if (FuncName.contains(':')) {
+      // GlobalValue::getGlobalIdentifier() prefixes the filename if the symbol
+      // is local. This logic will break if there is a colon in the filename,
+      // but we cannot use rsplit() because ObjC symbols can have colons.
+      auto [Filename, ParsedFuncName] = FuncName.split(':');
+      // Emit a comment describing where this symbol came from
+      OS << "# " << Filename << "\n";
+      FuncName = ParsedFuncName;
+    }
+    OS << FuncName << "\n";
+  }
+  return 0;
+}
+
 int llvm_profdata_main(int argc, char **argvNonConst,
                        const llvm::ToolContext &) {
   const char **argv = const_cast<const char **>(argvNonConst);
@@ -3049,6 +3095,8 @@
       func = show_main;
     else if (strcmp(argv[1], "overlap") == 0)
       func = overlap_main;
+    else if (strcmp(argv[1], "order") == 0)
+      func = order_main;
 
     if (func) {
       std::string Invocation(ProgName.str() + " " + argv[1]);
@@ -3073,6 +3121,6 @@
   else
     errs() << ProgName << ": Unknown command!\n";
 
-  errs() << "USAGE: " << ProgName << " <merge|show|overlap> [args...]\n";
+  errs() << "USAGE: " << ProgName << " <merge|show|overlap|order> [args...]\n";
   return 1;
 }
diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp
@@ -0,0 +1,108 @@
+//===- BalancedPartitioningTest.cpp - BalancedPartitioning tests ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using testing::Each;
+using testing::Field;
+using testing::Not;
+using testing::UnorderedElementsAre;
+using testing::UnorderedElementsAreArray;
+
+namespace llvm {
+
+void PrintTo(const BPFunctionNode &Node, std::ostream *OS) {
+  raw_os_ostream ROS(*OS);
+  Node.dump(ROS);
+}
+
+TEST(BPFunctionNodeTest, Basic) {
+  SmallVector<TemporalProfTraceTy> Traces = {
+      TemporalProfTraceTy({0, 1, 2, 3, 4}),
+      TemporalProfTraceTy({4, 2}),
+  };
+  auto Nodes = BPFunctionNode::fromTemporalProfTraces(Traces);
+  // Matches a node with the given Id and utility nodes (in any order)
+  auto NodeIs = [](BPFunctionNode::IDT Id,
+                   ArrayRef<BPFunctionNode::UtilityNodeT> UNs) {
+    return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+                 Field("UtilityNodes", &BPFunctionNode::UtilityNodes,
+                       UnorderedElementsAreArray(UNs)));
+  };
+
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
+                                   NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
+                                   NodeIs(4, {2, 3, 4, 5})));
+}
+
+class BalancedPartitioningTest : public ::testing::Test {
+protected:
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning Bp;
+  BalancedPartitioningTest() : Bp(Config) {}
+  /// \returns the Ids of \p Nodes, in order
+  static std::vector<BPFunctionNode::IDT>
+  getIds(std::vector<BPFunctionNode> Nodes) {
+    std::vector<BPFunctionNode::IDT> Ids;
+    for (auto &N : Nodes)
+      Ids.push_back(N.Id);
+    return Ids;
+  }
+};
+
+TEST_F(BalancedPartitioningTest, Basic) {
+  std::vector<BPFunctionNode> Nodes = {
+      BPFunctionNode(0, {1, 2}), BPFunctionNode(2, {3, 4}),
+      BPFunctionNode(1, {1, 2}), BPFunctionNode(3, {3, 4}),
+      BPFunctionNode(4, {4}),
+  };
+
+  Bp.run(Nodes);
+  // Matches a node with the given Id and Bucket
+  auto NodeIs = [](BPFunctionNode::IDT Id, std::optional<uint32_t> Bucket) {
+    return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+                 Field("Bucket", &BPFunctionNode::Bucket, Bucket));
+  };
+  // The node with Id I is expected to be assigned bucket I
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, 0), NodeIs(1, 1), NodeIs(2, 2),
+                                   NodeIs(3, 3), NodeIs(4, 4)));
+}
+
+TEST_F(BalancedPartitioningTest, Large) {
+  const int ProblemSize = 1000;
+  std::vector<BPFunctionNode::UtilityNodeT> AllUNs;
+  for (int i = 0; i < ProblemSize; i++)
+    AllUNs.emplace_back(i);
+  // Give every node a random sample of the utility nodes
+  std::mt19937 RNG;
+  std::vector<BPFunctionNode> Nodes;
+  for (int i = 0; i < ProblemSize; i++) {
+    std::vector<BPFunctionNode::UtilityNodeT> UNs;
+    int SampleSize =
+        std::uniform_int_distribution<int>(0, AllUNs.size() - 1)(RNG);
+    std::sample(AllUNs.begin(), AllUNs.end(), std::back_inserter(UNs),
+                SampleSize, RNG);
+    Nodes.emplace_back(i, UNs);
+  }
+
+  auto OrigIds = getIds(Nodes);
+
+  Bp.run(Nodes);
+  // All nodes must have a bucket and the set of Ids must be unchanged
+  EXPECT_THAT(
+      Nodes, Each(Not(Field("Bucket", &BPFunctionNode::Bucket, std::nullopt))));
+  EXPECT_THAT(getIds(Nodes), UnorderedElementsAreArray(OrigIds));
+}
+
+} // end namespace llvm
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -17,6 +17,7 @@
   BinaryStreamTest.cpp
   BLAKE3Test.cpp
   BlockFrequencyTest.cpp
+  BalancedPartitioningTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
   CrashRecoveryTest.cpp