diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst
--- a/llvm/docs/CommandGuide/llvm-profdata.rst
+++ b/llvm/docs/CommandGuide/llvm-profdata.rst
@@ -20,6 +20,7 @@
 * :ref:`merge <profdata-merge>`
 * :ref:`show <profdata-show>`
 * :ref:`overlap <profdata-overlap>`
+* :ref:`order <profdata-order>`
 
 .. program:: llvm-profdata merge
 
@@ -418,6 +419,40 @@
 Only show overlap for the context sensitive profile counts. The default is to
 show non-context sensitive profile counts.
 
+.. program:: llvm-profdata order
+
+.. _profdata-order:
+
+ORDER
+-----
+
+SYNOPSIS
+^^^^^^^^
+
+:program:`llvm-profdata order` [*options*] [*filename*]
+
+DESCRIPTION
+^^^^^^^^^^^
+
+:program:`llvm-profdata order` uses temporal profiling traces from a profile to
+find a function order that reduces the number of page faults for those traces.
+This output can be passed directly to ``lld`` via ``--symbol-ordering-file=``
+for ELF or ``-order_file`` for Mach-O. If the traces found in the profile are
+representative of real-world usage, then this order should improve startup
+performance.
+
+OPTIONS
+^^^^^^^
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. option:: --output=<output>, -o=<output>
+
+ Specify the output file name. If *output* is ``-`` or it isn't specified,
+ then the output is sent to standard output.
+
 EXIT STATUS
 -----------
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -336,8 +336,11 @@
 /// An ordered list of functions identified by their NameRef found in
 /// INSTR_PROF_DATA
 struct TemporalProfTraceTy {
-  uint64_t Weight = 1;
   std::vector<uint64_t> FunctionNameRefs;
+  uint64_t Weight;
+  TemporalProfTraceTy(std::initializer_list<uint64_t> Trace = {},
+                      uint64_t Weight = 1)
+      : FunctionNameRefs(Trace), Weight(Weight) {}
};
 
 inline std::error_code make_error_code(instrprof_error E) {
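The new converting constructor makes traces cheap to construct inline, which the unit tests below rely on. A minimal sketch (the NameRef values are invented for illustration):

```cpp
#include "llvm/ProfileData/InstrProf.h"

using namespace llvm;

void buildTraces() {
  // Default weight of 1.
  TemporalProfTraceTy T1({101, 202, 303});
  // A trace that should count twice wherever weights are honored.
  TemporalProfTraceTy T2({101, 404}, /*Weight=*/2);
  (void)T1;
  (void)T2;
}
```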
diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/BalancedPartitioning.h
@@ -0,0 +1,208 @@
+//===- BalancedPartitioning.h ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+// The algorithm is used to find an ordering of FunctionNodes while optimizing
+// a specified objective. The algorithm uses recursive bisection; it starts
+// with a collection of unordered FunctionNodes and tries to split them into
+// two sets (buckets) of equal cardinality. Each bisection step consists of
+// iterations that greedily swap the FunctionNodes between the two buckets
+// while there is an improvement of the objective. Once the process converges,
+// the problem is divided into two sub-problems of half the size, which are
+// recursively applied for the two buckets. The final ordering of the
+// FunctionNodes is obtained by concatenating the two (recursively computed)
+// orderings.
+//
+// In order to speed up the computation, we limit the depth of the recursive
+// tree by a specified constant (SplitDepth) and apply at most a constant
+// number of greedy iterations per split (IterationsPerSplit). The worst-case
+// time complexity of the implementation is bounded by O(M*log^2 N), where
+// N is the number of FunctionNodes and M is the number of
+// FunctionNode-UtilityNode edges, assuming that any collection of D
+// FunctionNodes contains O(D) UtilityNodes. Notice that the two recursive
+// sub-problems are independent and thus can be efficiently processed in
+// parallel.
+//
+// Reference:
+//   * Optimizing Function Layout for Mobile Applications,
+//     https://arxiv.org/abs/2211.09285
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
+#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <atomic>
+#include <condition_variable>
+#include <mutex>
+#include <optional>
+#include <random>
+
+namespace llvm {
+
+/// A function with a set of utility nodes; it is beneficial to order two
+/// functions close together if they have similar utility nodes
+class BPFunctionNode {
+  friend class BalancedPartitioning;
+
+public:
+  using IDT = uint64_t;
+  using UtilityNodeT = uint32_t;
+
+  /// \param UtilityNodes the set of utility nodes (must be unique'd)
+  BPFunctionNode(IDT Id, const std::vector<UtilityNodeT> &UtilityNodes)
+      : Id(Id), UtilityNodes(UtilityNodes) {}
+
+  /// The ID of this node
+  IDT Id;
+
+  /// Construct a list of nodes from a set of temporal profile traces
+  static std::vector<BPFunctionNode>
+  fromTemporalProfTraces(const SmallVectorImpl<TemporalProfTraceTy> &Traces);
+
+  void dump(raw_ostream &OS) const;
+
+protected:
+  /// The list of utility nodes associated with this node
+  std::vector<UtilityNodeT> UtilityNodes;
+  /// The bucket assigned by balanced partitioning
+  std::optional<uint64_t> Bucket;
+  /// The index of the input order of the FunctionNodes
+  uint64_t InputOrderIndex = 0;
+
+  friend class BPFunctionNodeTest_Basic_Test;
+  friend class BalancedPartitioningTest_Basic_Test;
+  friend class BalancedPartitioningTest_Large_Test;
+};
+
+/// Algorithm parameters; default values are tuned on real-world binaries
+struct BalancedPartitioningConfig {
+  /// The depth of the recursive bisection
+  uint32_t SplitDepth = 16;
+  /// The maximum number of bisection iterations per split
+  uint32_t IterationsPerSplit = 40;
+  /// The probability for a vertex to skip a move from its current bucket to
+  /// another bucket; it often helps to escape from a local optimum
+  double SkipProbability = 0.1;
+  /// Recursive subtasks up to the given depth are added to the queue and
+  /// distributed among threads by ThreadPool; all subsequent calls are
+  /// executed on the same thread
+  uint32_t TaskSplitDepth = 9;
+};
+
+class BalancedPartitioning {
+public:
+  BalancedPartitioning(const BalancedPartitioningConfig &Config);
+
+  /// Run recursive graph partitioning that optimizes a given objective.
+  void run(std::vector<BPFunctionNode> &Nodes) const;
+
+private:
+  struct UtilitySignature;
+  using SignatureMapT =
+      DenseMap<BPFunctionNode::UtilityNodeT, UtilitySignature>;
+  using FunctionNodeRange =
+      iterator_range<std::vector<BPFunctionNode>::iterator>;
+
+  /// A special ThreadPool that allows for spawning new tasks after blocking on
+  /// wait(). BalancedPartitioning recursively spawns new threads inside other
+  /// threads, so we need to track the number of active threads that could
+  /// spawn more threads.
+  struct BPThreadPool {
+    ThreadPool TheThreadPool;
+    std::mutex mtx;
+    std::condition_variable cv;
+    /// The number of threads that could spawn more threads
+    std::atomic<int> NumActiveThreads = 0;
+    /// Only true when all threads are done spawning new threads
+    bool IsFinishedSpawning = false;
+    /// Asynchronous submission of the task to the pool
+    template <typename Func> void async(Func &&F);
+    /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
+    /// acceptable for other threads to add more tasks while blocking on this
+    /// call.
+    void wait();
+  };
+
+  /// Run a recursive bisection of a given list of FunctionNodes
+  /// \param RecDepth the current depth of recursion
+  /// \param RootBucket the initial bucket of the FunctionNodes
+  /// \param Offset the assigned buckets are the range [Offset, Offset +
+  /// Nodes.size())
+  void bisect(const FunctionNodeRange Nodes, uint32_t RecDepth,
+              uint32_t RootBucket, uint32_t Offset,
+              std::optional<BPThreadPool> &TP) const;
+
+  /// Run bisection iterations
+  /// \returns true iff progress has been made
+  bool runIterations(const FunctionNodeRange Nodes, uint32_t RecDepth,
+                     uint32_t LeftBucket, uint32_t RightBucket,
+                     std::mt19937 &RNG) const;
+
+  /// Run a bisection iteration to improve the optimization goal
+  /// \returns the total number of moved FunctionNodes
+  uint32_t runIteration(const FunctionNodeRange Nodes, uint32_t LeftBucket,
+                        uint32_t RightBucket, SignatureMapT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Try to move \p N from one bucket to another
+  /// \returns true iff \p N is moved
+  bool moveFunctionNode(BPFunctionNode &N, uint32_t LeftBucket,
+                        uint32_t RightBucket, SignatureMapT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Split all the FunctionNodes into 2 buckets, StartBucket and
+  /// StartBucket + 1. The method is used for an initial assignment before a
+  /// bisection step.
+  void split(const FunctionNodeRange Nodes, uint32_t StartBucket) const;
+
+  /// Compute the move gain for uniform log-gap cost
+  double moveGain(const BPFunctionNode &N, bool FromLeftToRight,
+                  const SignatureMapT &Signatures) const;
+
+  /// Compute the uniform log-gap cost, assuming a utility node has \p X
+  /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
+  double logCost(uint32_t X, uint32_t Y) const;
+
+  /// Compute an average optimization goal for a given utility node signature
+  double computeGoal(const SignatureMapT &Signatures) const;
+
+  double log2Cached(uint32_t i) const;
+
+private:
+  const BalancedPartitioningConfig &Config;
+
+  /// Precomputed values of log2(x). Table size is small enough to fit in cache.
+  static constexpr uint32_t LOG_CACHE_SIZE = 16384;
+  double Log2Cache[LOG_CACHE_SIZE];
+
+  /// The signature of a particular utility node used for the bisection step,
+  /// i.e., the number of \p FunctionNodes in each of the two buckets
+  struct UtilitySignature {
+    /// The number of \p FunctionNodes in the left bucket
+    uint32_t LeftCount = 0;
+    /// The number of \p FunctionNodes in the right bucket
+    uint32_t RightCount = 0;
+    /// The cached cost of moving a \p FunctionNode between buckets. The first
+    /// element is the cost of moving from left to right and the second is
+    /// from right to left.
+    std::optional<std::pair<double, double>> CachedCost = {};
+  };
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
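Taken together, the header boils down to a three-step workflow: build nodes from traces, configure, run. A minimal driver sketch against the API above (the traces are made up and `orderFunctions` is not part of this patch):

```cpp
#include "llvm/Support/BalancedPartitioning.h"

using namespace llvm;

// Order a small set of functions so that functions appearing close together
// in the traces end up in nearby buckets.
static std::vector<BPFunctionNode::IDT> orderFunctions() {
  SmallVector<TemporalProfTraceTy> Traces = {
      TemporalProfTraceTy({1, 2, 3}),
      TemporalProfTraceTy({1, 3}),
  };
  auto Nodes = BPFunctionNode::fromTemporalProfTraces(Traces);

  BalancedPartitioningConfig Config; // defaults tuned on real-world binaries
  BalancedPartitioning BP(Config);   // note: BP keeps a reference to Config
  BP.run(Nodes);                     // assigns buckets and sorts Nodes by them

  std::vector<BPFunctionNode::IDT> Order;
  for (auto &N : Nodes)
    Order.push_back(N.Id);
  return Order;
}
```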
diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Support/BalancedPartitioning.cpp
@@ -0,0 +1,386 @@
+//===- BalancedPartitioning.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <cmath>
+
+using namespace llvm;
+#define DEBUG_TYPE "balanced-partitioning"
+
+std::vector<BPFunctionNode> BPFunctionNode::fromTemporalProfTraces(
+    const SmallVectorImpl<TemporalProfTraceTy> &Traces) {
+  // Collect all function IDs ordered by their smallest timestamp. This will be
+  // used as the initial FunctionNode order.
+  SetVector<IDT> FunctionIds;
+  size_t LargestTraceSize = 0;
+  for (auto &Trace : Traces)
+    LargestTraceSize =
+        std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
+  for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
+    for (auto &Trace : Traces)
+      if (Timestamp < Trace.FunctionNameRefs.size())
+        FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
+
+  int N = std::ceil(std::log2(LargestTraceSize));
+
+  // For each function, collect the utility nodes (groups) it belongs to. A
+  // function at timestamp Timestamp in trace TraceIdx joins groups
+  // floor(log2(Timestamp+1)) .. N-1 of that trace.
+  DenseMap<IDT, std::vector<UtilityNodeT>> FuncGroups;
+  for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
+    auto &Trace = Traces[TraceIdx].FunctionNameRefs;
+    for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
+      for (int i = std::floor(std::log2(Timestamp + 1)); i < N; i++) {
+        auto &FunctionId = Trace[Timestamp];
+        UtilityNodeT GroupId = TraceIdx * N + i;
+        FuncGroups[FunctionId].push_back(GroupId);
+      }
+    }
+  }
+
+  std::vector<BPFunctionNode> Nodes;
+  for (auto &Id : FunctionIds) {
+    auto &UNs = FuncGroups[Id];
+    llvm::sort(UNs);
+    UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+    Nodes.emplace_back(Id, UNs);
+  }
+  return Nodes;
+}
+
+void BPFunctionNode::dump(raw_ostream &OS) const {
+  OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id,
+                make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket);
+}
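To make the grouping loop above concrete, here is a worked example using the exact traces from `BPFunctionNodeTest.Basic` further down in this patch:

```cpp
// For Traces = { {0,1,2,3,4}, {4,2} }: LargestTraceSize = 5, so
// N = ceil(log2(5)) = 3, and trace TraceIdx owns utility-node IDs
// [TraceIdx * 3, TraceIdx * 3 + 3).
//
// Trace 0 (IDs 0..2):                Trace 1 (IDs 3..5):
//   T=0, func 0 -> {0, 1, 2}           T=0, func 4 -> {3, 4, 5}
//   T=1, func 1 -> {1, 2}              T=1, func 2 -> {4, 5}
//   T=2, func 2 -> {1, 2}
//   T=3, func 3 -> {2}
//   T=4, func 4 -> {2}
//
// Merging per function gives func 2 -> {1, 2, 4, 5} and
// func 4 -> {2, 3, 4, 5}, which is exactly what the unit test expects.
```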
+
+template <typename Func>
+void BalancedPartitioning::BPThreadPool::async(Func &&F) {
+  // This new thread could spawn more threads, so mark it as active
+  ++NumActiveThreads;
+  TheThreadPool.async([=]() {
+    // Run the task
+    F();
+
+    // This thread will no longer spawn new threads, so mark it as inactive
+    if (--NumActiveThreads == 0) {
+      // There are no more active threads, so mark as finished and notify
+      {
+        std::unique_lock<std::mutex> lock(mtx);
+        assert(!IsFinishedSpawning);
+        IsFinishedSpawning = true;
+      }
+      cv.notify_one();
+    }
+  });
+}
+
+void BalancedPartitioning::BPThreadPool::wait() {
+  // TODO: We could remove the mutex and condition variable and use
+  // std::atomic::wait() instead, but that isn't available until C++20
+  {
+    std::unique_lock<std::mutex> lock(mtx);
+    cv.wait(lock, [&]() { return IsFinishedSpawning; });
+    assert(IsFinishedSpawning && NumActiveThreads == 0);
+  }
+  // Now we can call ThreadPool::wait() since all tasks have been submitted
+  TheThreadPool.wait();
+}
+
+BalancedPartitioning::BalancedPartitioning(
+    const BalancedPartitioningConfig &Config)
+    : Config(Config) {
+  // Pre-compute log2 values
+  Log2Cache[0] = 0.0;
+  for (uint32_t I = 1; I < LOG_CACHE_SIZE; I++)
+    Log2Cache[I] = std::log2(I);
+}
+
+void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const {
+  LLVM_DEBUG(dbgs() << "Partitioning " << Nodes.size() << " nodes using depth "
+                    << Config.SplitDepth << " and " << Config.IterationsPerSplit
+                    << " iterations per split\n");
+  std::optional<BPThreadPool> TP;
+  if (Config.TaskSplitDepth > 1)
+    TP.emplace();
+
+  // Record the input order
+  for (unsigned I = 0; I < Nodes.size(); I++)
+    Nodes[I].InputOrderIndex = I;
+
+  auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end());
+  auto BisectTask = [=, &TP]() {
+    bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP);
+  };
+  if (TP) {
+    TP->async(std::move(BisectTask));
+    TP->wait();
+  } else {
+    BisectTask();
+  }
+
+  llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) {
+    return L.Bucket < R.Bucket;
+  });
+
+  LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n");
+}
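The `BPThreadPool` bookkeeping above is a general wait-for-quiescence pattern: count the tasks that may still spawn more tasks, and only release the waiter once the count drains to zero. A standalone sketch of the same idea, with the boolean flag folded into the counter check (the `TaskTracker` name is illustrative, not part of this patch):

```cpp
#include <atomic>
#include <condition_variable>
#include <mutex>

class TaskTracker {
  std::mutex Mtx;
  std::condition_variable CV;
  std::atomic<int> NumActive{0};

public:
  // Call before submitting a task that may itself submit more tasks.
  void onSubmit() { ++NumActive; }
  // Call when a task finishes; the last one wakes up the waiter.
  void onFinish() {
    if (--NumActive == 0) {
      std::unique_lock<std::mutex> Lock(Mtx);
      CV.notify_one();
    }
  }
  // Blocks until every submitted task (including transitively spawned
  // ones, provided onSubmit ran before the parent finished) completed.
  void waitUntilIdle() {
    std::unique_lock<std::mutex> Lock(Mtx);
    CV.wait(Lock, [&] { return NumActive.load() == 0; });
  }
};
```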
+
+void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
+                                  uint32_t RecDepth, uint32_t RootBucket,
+                                  uint32_t Offset,
+                                  std::optional<BPThreadPool> &TP) const {
+  uint32_t NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) {
+    // We've reached the lowest level of the recursion tree. Fall back to the
+    // original order and assign to buckets.
+    llvm::stable_sort(Nodes, [](const auto &L, const auto &R) {
+      return L.InputOrderIndex < R.InputOrderIndex;
+    });
+    for (auto &N : Nodes)
+      N.Bucket = Offset++;
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << "Bisect with " << NumNodes << " nodes and root bucket "
+                    << RootBucket << "\n");
+
+  std::mt19937 RNG(RootBucket);
+
+  uint32_t LeftBucket = 2 * RootBucket;
+  uint32_t RightBucket = 2 * RootBucket + 1;
+
+  // Split into two and assign to the left and right buckets
+  split(Nodes, LeftBucket);
+
+  bool Improvement =
+      runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);
+  if (!Improvement) {
+    // No improvement, so assign buckets and stop
+    for (auto &N : Nodes)
+      N.Bucket = Offset++;
+    return;
+  }
+
+  // Split nodes wrt the resulting buckets
+  auto NodesMid =
+      llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
+  uint32_t MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);
+
+  auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
+  auto RightNodes = llvm::make_range(NodesMid, Nodes.end());
+
+  auto LeftRecTask = [=, &TP]() {
+    bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
+  };
+  auto RightRecTask = [=, &TP]() {
+    bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
+  };
+
+  if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
+    TP->async(std::move(LeftRecTask));
+    TP->async(std::move(RightRecTask));
+  } else {
+    LeftRecTask();
+    RightRecTask();
+  }
+}
+
+bool BalancedPartitioning::runIterations(const FunctionNodeRange Nodes,
+                                         uint32_t RecDepth, uint32_t LeftBucket,
+                                         uint32_t RightBucket,
+                                         std::mt19937 &RNG) const {
+  int NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  DenseMap<BPFunctionNode::UtilityNodeT, uint32_t> UtilityNodeDegree;
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      ++UtilityNodeDegree[UN];
+  // Remove utility nodes if they have just one edge or are connected to all
+  // functions
+  for (auto &N : Nodes)
+    llvm::erase_if(N.UtilityNodes, [&](auto &UN) {
+      return UtilityNodeDegree[UN] <= 1 || UtilityNodeDegree[UN] >= NumNodes;
+    });
+
+  // Initialize signatures; empirically, the number of utility nodes is about
+  // 4x larger than the number of FunctionNodes.
+  SignatureMapT Signatures(/*InitialReserve=*/NumNodes * 4);
+  for (auto &N : Nodes) {
+    for (auto &UN : N.UtilityNodes) {
+      if (N.Bucket == LeftBucket) {
+        Signatures[UN].LeftCount++;
+      } else {
+        Signatures[UN].RightCount++;
+      }
+    }
+  }
+
+  double InitialGoal = computeGoal(Signatures);
+
+  for (uint32_t Iter = 0; Iter < Config.IterationsPerSplit; Iter++) {
+    uint32_t NumMovedNodes =
+        runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG);
+    if (NumMovedNodes == 0)
+      break;
+  }
+
+  double FinalGoal = computeGoal(Signatures);
+  // Return true iff progress has been made
+  return InitialGoal > FinalGoal;
+}
+
+uint32_t BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
+                                            uint32_t LeftBucket,
+                                            uint32_t RightBucket,
+                                            SignatureMapT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Initialize signature cost caches
+  for (auto &[UN, Signature] : Signatures) {
+    if (Signature.CachedCost.has_value())
+      continue;
+    uint32_t L = Signature.LeftCount;
+    uint32_t R = Signature.RightCount;
+    assert((L > 0 || R > 0) && "incorrect signature");
+    // cost = x * log(U / (x+1)) + y * log(U / (y+1)) =
+    //      = x * log(U) + y * log(U) - (x * log(x+1) + y * log(y+1)) =
+    //      = U * log(U) - (x * log(x+1) + y * log(y+1))
+    double Cost = logCost(L, R);
+    double CostLR = 0, CostRL = 0;
+    if (L > 0)
+      CostLR = Cost - logCost(L - 1, R + 1);
+    if (R > 0)
+      CostRL = Cost - logCost(L + 1, R - 1);
+    Signature.CachedCost = std::make_pair(CostLR, CostRL);
+  }
+
+  // Compute move gains
+  typedef std::pair<double, BPFunctionNode *> GainPair;
+  std::vector<GainPair> Gains;
+  for (auto &N : Nodes) {
+    bool FromLeftToRight = (N.Bucket == LeftBucket);
+    double Gain = moveGain(N, FromLeftToRight, Signatures);
+    Gains.push_back(std::make_pair(Gain, &N));
+  }
+
+  // Collect left and right gains
+  auto LeftGains = Gains.begin();
+  auto LeftEnd = llvm::partition(
+      Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
+  auto RightGains = LeftEnd;
+  auto RightEnd = Gains.end();
+
+  // Sort gains
+  auto LargerGain = [](const auto &L, const auto &R) {
+    return L.first > R.first;
+  };
+  std::stable_sort(LeftGains, LeftEnd, LargerGain);
+  std::stable_sort(RightGains, RightEnd, LargerGain);
+
+  // Exchange: change buckets and update utility node signatures
+  uint32_t NumMovedNodes = 0;
+  uint32_t MinSize = std::min(std::distance(LeftGains, LeftEnd),
+                              std::distance(RightGains, RightEnd));
+  for (uint32_t I = 0; I < MinSize; I++) {
+    if (LeftGains[I].first + RightGains[I].first <= 0.0)
+      break;
+    // Try to swap the two nodes
+    NumMovedNodes += moveFunctionNode(*LeftGains[I].second, LeftBucket,
+                                      RightBucket, Signatures, RNG);
+    NumMovedNodes += moveFunctionNode(*RightGains[I].second, LeftBucket,
+                                      RightBucket, Signatures, RNG);
+  }
+  return NumMovedNodes;
+}
+
+bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N,
+                                            uint32_t LeftBucket,
+                                            uint32_t RightBucket,
+                                            SignatureMapT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Sometimes we skip the move. This helps to escape local optima
+  if (std::uniform_real_distribution<double>(0.0, 1.0)(RNG) <=
+      Config.SkipProbability)
+    return false;
+
+  bool FromLeftToRight = (N.Bucket == LeftBucket);
+
+  // Update the current bucket
+  N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
+
+  // Update signatures and invalidate cost cache
+  for (auto &UN : N.UtilityNodes) {
+    auto &Signature = Signatures[UN];
+    if (FromLeftToRight) {
+      Signature.LeftCount--;
+      Signature.RightCount++;
+    } else {
+      Signature.LeftCount++;
+      Signature.RightCount--;
+    }
+    Signature.CachedCost = {};
+  }
+
+  return true;
+}
+
+void BalancedPartitioning::split(const FunctionNodeRange Nodes,
+                                 uint32_t StartBucket) const {
+  uint32_t NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
+
+  std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) {
+    return L.InputOrderIndex < R.InputOrderIndex;
+  });
+
+  for (auto &N : llvm::make_range(Nodes.begin(), NodesMid))
+    N.Bucket = StartBucket;
+  for (auto &N : llvm::make_range(NodesMid, Nodes.end()))
+    N.Bucket = StartBucket + 1;
+}
+
+double BalancedPartitioning::moveGain(const BPFunctionNode &N,
+                                      bool FromLeftToRight,
+                                      const SignatureMapT &Signatures) const {
+  double Gain = 0;
+  for (auto &UN : N.UtilityNodes) {
+    auto &Signature = Signatures.find(UN)->second;
+    auto &[CostLR, CostRL] = Signature.CachedCost.value();
+    Gain += (FromLeftToRight ? CostLR : CostRL);
+  }
+  return Gain;
+}
+
+double BalancedPartitioning::logCost(uint32_t X, uint32_t Y) const {
+  return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1));
+}
+
+double
+BalancedPartitioning::computeGoal(const SignatureMapT &Signatures) const {
+  double Sum = 0;
+  double Cnt = 0;
+  for (auto &[UN, Signature] : Signatures) {
+    uint32_t L = Signature.LeftCount;
+    uint32_t R = Signature.RightCount;
+    assert((L > 0 || R > 0) && "incorrect signature");
+    // Shift the optimization goal so that the best possible cost (all
+    // FunctionNodes of a utility node in one bucket) maps to zero
+    Sum += logCost(L, R) - logCost(L + R, 0);
+    Cnt += L + R;
+  }
+  return Cnt > 0 ? Sum / Cnt : 0;
+}
+
+double BalancedPartitioning::log2Cached(uint32_t i) const {
+  return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
+}
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -129,6 +129,7 @@
   Allocator.cpp
   AutoConvert.cpp
+  BalancedPartitioning.cpp
   Base64.cpp
   BinaryStreamError.cpp
   BinaryStreamReader.cpp
   BinaryStreamRef.cpp
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -0,0 +1,46 @@
+# RUN: llvm-profdata order %s | FileCheck %s
+
+# CHECK: a
+# CHECK: b
+# CHECK: x
+# CHECK: c
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+3
+# Trace Stream Size:
+3
+# Weight
+1
+a, main.c:b, c
+# Weight
+1
+a, x, main.c:b, c
+# Weight
+1
+a, main.c:b, c
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+main.c:b
+0x5678
+1
+202
+
+c
+0xabcd
+1
+303
+
+x
+0xefff
+1
+404
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ProfileData/RawMemProfReader.h"
 #include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
 #include "llvm/Support/Errc.h"
@@ -3034,6 +3035,51 @@
   return showMemProfProfile(Filename, ProfiledBinary, SFormat, OS);
 }
 
+static int order_main(int argc, const char *argv[]) {
+  cl::opt<std::string> Filename(cl::Positional, cl::desc("<profdata-file>"));
+  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                      cl::init("-"), cl::desc("Output file"));
+  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                            cl::aliasopt(OutputFilename));
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data order\n");
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = InstrProfReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  for (auto &I : *Reader) {
+    // Read all entries so the temporal profile traces are populated
+    (void)I;
+  }
+  auto &Traces = Reader->getTemporalProfTraces();
+  auto Nodes = BPFunctionNode::fromTemporalProfTraces(Traces);
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning BP(Config);
+  BP.run(Nodes);
+
+  WithColor::note() << "# Ordered " << Nodes.size() << " functions\n";
+  for (auto &N : Nodes) {
+    auto FuncName = Reader->getSymtab().getFuncName(N.Id);
+    if (FuncName.contains(':')) {
+      // GlobalValue::getGlobalIdentifier() prefixes the symbol name with the
+      // filename if the symbol is local. This logic will break if there is a
+      // colon in the filename, but we cannot use rsplit() because ObjC
+      // symbols can have colons.
+      auto [Filename, ParsedFuncName] = FuncName.split(':');
+      // Emit a comment describing where this symbol came from
+      OS << "# " << Filename << "\n";
+      FuncName = ParsedFuncName;
+    }
+    OS << FuncName << "\n";
+  }
+  return 0;
+}
+
 int llvm_profdata_main(int argc, char **argvNonConst,
                        const llvm::ToolContext &) {
   const char **argv = const_cast<const char **>(argvNonConst);
@@ -3049,6 +3095,8 @@
     func = show_main;
   else if (strcmp(argv[1], "overlap") == 0)
     func = overlap_main;
+  else if (strcmp(argv[1], "order") == 0)
+    func = order_main;
 
   if (func) {
     std::string Invocation(ProgName.str() + " " + argv[1]);
@@ -3073,6 +3121,6 @@
   else
     errs() << ProgName << ": Unknown command!\n";
 
-  errs() << "USAGE: " << ProgName << " <merge|show|overlap> [args...]\n";
+  errs() << "USAGE: " << ProgName << " <merge|show|overlap|order> [args...]\n";
   return 1;
 }
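The symbol-emission loop in `order_main` splits local symbols of the form `<filename>:<symbol>` on the first colon and emits the filename as a `#` comment line, which ordering-file parsers should tolerate as comments. An isolated sketch of that behavior (the `emitSymbol` helper is hypothetical, not part of the patch):

```cpp
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

// Mirrors the local-symbol handling in order_main: split on the first colon
// and emit the filename prefix as a comment.
void emitSymbol(StringRef FuncName, raw_ostream &OS) {
  if (FuncName.contains(':')) {
    auto [Filename, ParsedFuncName] = FuncName.split(':');
    OS << "# " << Filename << "\n";
    FuncName = ParsedFuncName;
  }
  OS << FuncName << "\n";
}

// emitSymbol("main.c:b", outs()) prints "# main.c" followed by "b", which is
// why the show-order.proftext test above can simply CHECK for "b".
```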
diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp
@@ -0,0 +1,108 @@
+//===- BalancedPartitioningTest.cpp - BalancedPartitioning tests ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using testing::AllOf;
+using testing::Each;
+using testing::Field;
+using testing::Not;
+using testing::UnorderedElementsAre;
+using testing::UnorderedElementsAreArray;
+
+namespace llvm {
+
+void PrintTo(const BPFunctionNode &Node, std::ostream *OS) {
+  raw_os_ostream ROS(*OS);
+  Node.dump(ROS);
+}
+
+TEST(BPFunctionNodeTest, Basic) {
+  SmallVector<TemporalProfTraceTy> Traces = {
+      TemporalProfTraceTy({0, 1, 2, 3, 4}),
+      TemporalProfTraceTy({4, 2}),
+  };
+  auto Nodes = BPFunctionNode::fromTemporalProfTraces(Traces);
+
+  auto NodeIs = [](BPFunctionNode::IDT Id,
+                   std::initializer_list<BPFunctionNode::UtilityNodeT> UNs) {
+    return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+                 Field("UtilityNodes", &BPFunctionNode::UtilityNodes,
+                       UnorderedElementsAreArray(UNs)));
+  };
+
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
+                                   NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
+                                   NodeIs(4, {2, 3, 4, 5})));
+}
+
+class BalancedPartitioningTest : public ::testing::Test {
+protected:
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning Bp;
+  BalancedPartitioningTest() : Bp(Config) {}
+
+  static std::vector<BPFunctionNode::IDT>
+  getIds(std::vector<BPFunctionNode> Nodes) {
+    std::vector<BPFunctionNode::IDT> Ids;
+    for (auto &N : Nodes)
+      Ids.push_back(N.Id);
+    return Ids;
+  }
+};
+
+TEST_F(BalancedPartitioningTest, Basic) {
+  std::vector<BPFunctionNode::UtilityNodeT> UN1 = {1, 2}, UN2 = {3, 4},
+                                            UN3 = {4};
+  std::vector<BPFunctionNode> Nodes = {
+      BPFunctionNode(0, UN1), BPFunctionNode(2, UN2), BPFunctionNode(1, UN1),
+      BPFunctionNode(3, UN2), BPFunctionNode(4, UN3),
+  };
+
+  Bp.run(Nodes);
+
+  auto NodeIs = [](BPFunctionNode::IDT Id, std::optional<uint64_t> Bucket) {
+    return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+                 Field("Bucket", &BPFunctionNode::Bucket, Bucket));
+  };
+
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, 0), NodeIs(1, 1), NodeIs(2, 2),
+                                   NodeIs(3, 3), NodeIs(4, 4)));
+}
+
+TEST_F(BalancedPartitioningTest, Large) {
+  std::vector<BPFunctionNode::UtilityNodeT> AllUNs;
+  for (int i = 0; i < 100; i++)
+    AllUNs.emplace_back(i);
+
+  std::mt19937 RNG;
+  std::vector<BPFunctionNode> Nodes;
+  for (int i = 0; i < 1000; i++) {
+    std::vector<BPFunctionNode::UtilityNodeT> UNs;
+    int SampleSize =
+        std::uniform_int_distribution<int>(0, AllUNs.size() - 1)(RNG);
+    std::sample(AllUNs.begin(), AllUNs.end(), std::back_inserter(UNs),
+                SampleSize, RNG);
+    Nodes.emplace_back(i, UNs);
+  }
+
+  auto OrigIds = getIds(Nodes);
+
+  Bp.run(Nodes);
+
+  EXPECT_THAT(
+      Nodes, Each(Not(Field("Bucket", &BPFunctionNode::Bucket, std::nullopt))));
+  EXPECT_THAT(getIds(Nodes), UnorderedElementsAreArray(OrigIds));
+}
+
+} // end namespace llvm
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -14,6 +14,7 @@
+  BalancedPartitioningTest.cpp
   BinaryStreamTest.cpp
   BLAKE3Test.cpp
   BlockFrequencyTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
   CrashRecoveryTest.cpp