diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst
--- a/llvm/docs/CommandGuide/llvm-profdata.rst
+++ b/llvm/docs/CommandGuide/llvm-profdata.rst
@@ -20,6 +20,7 @@
 * :ref:`merge <profdata-merge>`
 * :ref:`show <profdata-show>`
 * :ref:`overlap <profdata-overlap>`
+* :ref:`order <profdata-order>`
 
 .. program:: llvm-profdata merge
 
@@ -418,6 +419,40 @@
 Only show overlap for the context sensitive profile counts. The default is to
 show non-context sensitive profile counts.
 
+.. program:: llvm-profdata order
+
+.. _profdata-order:
+
+ORDER
+-----
+
+SYNOPSIS
+^^^^^^^^
+
+:program:`llvm-profdata order` [*options*] [*filename*]
+
+DESCRIPTION
+^^^^^^^^^^^
+
+:program:`llvm-profdata order` uses the temporal profiling traces in a profile
+to find a function order that reduces the number of page faults incurred by
+those traces. The output can be passed directly to ``lld`` via
+``--symbol-ordering-file=`` for ELF or ``-order_file`` for Mach-O. If the
+traces in the profile are representative of real-world usage, this order
+should improve startup performance.
+
+OPTIONS
+^^^^^^^
+
+.. option:: --help
+
+ Print a summary of command line options.
+
+.. option:: --output=<output>, -o <output>
+
+ Specify the output file name. If *output* is ``-`` or it isn't specified,
+ then the output is sent to standard output.
+
 EXIT STATUS
 -----------
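For orientation (not part of the patch): a typical end-to-end use of the new subcommand, using the linker flags the DESCRIPTION above mentions. The profile and object file names here are hypothetical.

```console
$ llvm-profdata order default.profdata -o order.txt
$ clang -fuse-ld=lld -Wl,--symbol-ordering-file=order.txt main.o -o a.out
```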
diff --git a/llvm/include/llvm/ProfileData/InstrProf.h b/llvm/include/llvm/ProfileData/InstrProf.h
--- a/llvm/include/llvm/ProfileData/InstrProf.h
+++ b/llvm/include/llvm/ProfileData/InstrProf.h
@@ -23,6 +23,7 @@
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/InstrProfData.inc"
+#include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
@@ -337,8 +338,17 @@
 /// An ordered list of functions identified by their NameRef found in
 /// INSTR_PROF_DATA
 struct TemporalProfTraceTy {
-  uint64_t Weight = 1;
   std::vector<uint64_t> FunctionNameRefs;
+  uint64_t Weight;
+  TemporalProfTraceTy(std::initializer_list<uint64_t> Trace = {},
+                      uint64_t Weight = 1)
+      : FunctionNameRefs(Trace), Weight(Weight) {}
+
+  /// Use a set of temporal profile traces to create a list of balanced
+  /// partitioning function nodes used by BalancedPartitioning to generate a
+  /// function order that reduces page faults during startup.
+  static std::vector<BPFunctionNode>
+  createBPFunctionNodes(ArrayRef<TemporalProfTraceTy> Traces);
 };
 
 inline std::error_code make_error_code(instrprof_error E) {
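A minimal sketch (not part of the patch) of how the new TemporalProfTraceTy hooks compose with BalancedPartitioning; it mirrors what the `order` subcommand does further down, with toy trace contents taken from the unit test:

```cpp
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/BalancedPartitioning.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static void printToyOrder() {
  std::vector<TemporalProfTraceTy> Traces = {
      TemporalProfTraceTy({0, 1, 2, 3, 4}),
      TemporalProfTraceTy({4, 2}),
  };
  // Convert traces to nodes, partition them, and print the resulting order.
  std::vector<BPFunctionNode> Nodes =
      TemporalProfTraceTy::createBPFunctionNodes(Traces);
  BalancedPartitioningConfig Config;
  BalancedPartitioning BP(Config);
  BP.run(Nodes); // leaves Nodes sorted by their assigned buckets
  for (const BPFunctionNode &N : Nodes)
    outs() << N.Id << "\n"; // function name refs in the optimized order
}
```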
diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Support/BalancedPartitioning.h
@@ -0,0 +1,197 @@
+//===- BalancedPartitioning.h ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+// The algorithm is used to find an ordering of FunctionNodes while optimizing
+// a specified objective. The algorithm uses recursive bisection; it starts
+// with a collection of unordered FunctionNodes and tries to split them into
+// two sets (buckets) of equal cardinality. Each bisection step consists of
+// iterations that greedily swap the FunctionNodes between the two buckets
+// while there is an improvement of the objective. Once the process converges,
+// the problem is divided into two sub-problems of half the size, which are
+// solved recursively for the two buckets. The final ordering of the
+// FunctionNodes is obtained by concatenating the two (recursively computed)
+// orderings.
+//
+// In order to speed up the computation, we limit the depth of the recursive
+// tree by a specified constant (SplitDepth) and apply at most a constant
+// number of greedy iterations per split (IterationsPerSplit). The worst-case
+// time complexity of the implementation is bounded by O(M*log^2 N), where
+// N is the number of FunctionNodes and M is the number of
+// FunctionNode-UtilityNode edges, assuming that any collection of D
+// FunctionNodes contains O(D) UtilityNodes. Notice that the two recursive
+// sub-problems are independent and thus can be efficiently processed in
+// parallel.
+//
+// Reference:
+//   * Optimizing Function Layout for Mobile Applications,
+//     https://arxiv.org/abs/2211.09285
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BALANCED_PARTITIONING_H
+#define LLVM_SUPPORT_BALANCED_PARTITIONING_H
+
+#include "raw_ostream.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/ThreadPool.h"
+
+#include <optional>
+#include <random>
+#include <vector>
+
+namespace llvm {
+
+/// A function with a set of utility nodes where it is beneficial to order two
+/// functions close together if they have similar utility nodes
+class BPFunctionNode {
+  friend class BalancedPartitioning;
+
+public:
+  using IDT = uint64_t;
+  using UtilityNodeT = uint32_t;
+
+  /// \param UtilityNodes the set of utility nodes (must be unique'd)
+  BPFunctionNode(IDT Id, ArrayRef<UtilityNodeT> UtilityNodes)
+      : Id(Id), UtilityNodes(UtilityNodes) {}
+
+  /// The ID of this node
+  IDT Id;
+
+  void dump(raw_ostream &OS) const;
+
+protected:
+  /// The list of utility nodes associated with this node
+  SmallVector<UtilityNodeT, 4> UtilityNodes;
+  /// The bucket assigned by balanced partitioning
+  std::optional<unsigned> Bucket;
+  /// The index of this node in the input order
+  uint64_t InputOrderIndex = 0;
+
+  friend class BPFunctionNodeTest_Basic_Test;
+  friend class BalancedPartitioningTest_Basic_Test;
+  friend class BalancedPartitioningTest_Large_Test;
+};
+
+/// Algorithm parameters; default values are tuned on real-world binaries
+struct BalancedPartitioningConfig {
+  /// The depth of the recursive bisection
+  unsigned SplitDepth = 18;
+  /// The maximum number of bisection iterations per split
+  unsigned IterationsPerSplit = 40;
+  /// The probability for a vertex to skip a move from its current bucket to
+  /// another bucket; it often helps to escape from a local optimum
+  float SkipProbability = 0.1f;
+  /// Recursive subtasks up to the given depth are added to the queue and
+  /// distributed among threads by ThreadPool; all subsequent calls are
+  /// executed on the same thread
+  unsigned TaskSplitDepth = 9;
+};
+
+class BalancedPartitioning {
+public:
+  BalancedPartitioning(const BalancedPartitioningConfig &Config);
+
+  /// Run recursive graph partitioning that optimizes a given objective.
+  void run(std::vector<BPFunctionNode> &Nodes) const;
+
+private:
+  struct UtilitySignature;
+  using SignaturesT = SmallVector<UtilitySignature, 16>;
+  using FunctionNodeRange =
+      iterator_range<std::vector<BPFunctionNode>::iterator>;
+
+  /// A special ThreadPool that allows for spawning new tasks after blocking on
+  /// wait(). BalancedPartitioning recursively spawns new threads inside other
+  /// threads, so we need to track how many active threads could spawn more
+  /// threads.
+  struct BPThreadPool {
+    ThreadPool TheThreadPool;
+    std::mutex mtx;
+    std::condition_variable cv;
+    /// The number of threads that could spawn more threads
+    std::atomic<int> NumActiveThreads = 0;
+    /// Only true when all threads are done spawning new threads
+    bool IsFinishedSpawning = false;
+    /// Asynchronous submission of the task to the pool
+    template <typename Func> void async(Func &&F);
+    /// Blocking wait for all threads to complete. Unlike ThreadPool, it is
+    /// acceptable for other threads to add more tasks while blocking on this
+    /// call.
+    void wait();
+  };
+
+  /// Run a recursive bisection of a given list of FunctionNodes
+  /// \param RecDepth the current depth of recursion
+  /// \param RootBucket the initial bucket of the Nodes
+  /// \param Offset the assigned buckets are in the range [Offset, Offset +
+  /// Nodes.size())
+  void bisect(const FunctionNodeRange Nodes, unsigned RecDepth,
+              unsigned RootBucket, unsigned Offset,
+              std::optional<BPThreadPool> &TP) const;
+
+  /// Run bisection iterations
+  void runIterations(const FunctionNodeRange Nodes, unsigned RecDepth,
+                     unsigned LeftBucket, unsigned RightBucket,
+                     std::mt19937 &RNG) const;
+
+  /// Run a single bisection iteration to improve the optimization goal
+  /// \returns the total number of moved FunctionNodes
+  unsigned runIteration(const FunctionNodeRange Nodes, unsigned LeftBucket,
+                        unsigned RightBucket, SignaturesT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Try to move \p N from one bucket to another
+  /// \returns true iff \p N is moved
+  bool moveFunctionNode(BPFunctionNode &N, unsigned LeftBucket,
+                        unsigned RightBucket, SignaturesT &Signatures,
+                        std::mt19937 &RNG) const;
+
+  /// Split all the FunctionNodes into 2 buckets, StartBucket and
+  /// StartBucket + 1. The method is used for an initial assignment before a
+  /// bisection step.
+  void split(const FunctionNodeRange Nodes, unsigned StartBucket) const;
+
+  /// The uniform log-gap cost, assuming a utility node has \p X
+  /// FunctionNodes in the left bucket and \p Y FunctionNodes in the right one.
+  float logCost(unsigned X, unsigned Y) const;
+
+  float log2Cached(unsigned i) const;
+
+  const BalancedPartitioningConfig &Config;
+
+  /// Precomputed values of log2(x). The table is small enough to fit in the
+  /// cache.
+  static constexpr unsigned LOG_CACHE_SIZE = 16384;
+  float Log2Cache[LOG_CACHE_SIZE];
+
+  /// The signature of a particular utility node used for the bisection step,
+  /// i.e., the number of \p FunctionNodes in each of the two buckets
+  struct UtilitySignature {
+    /// The number of \p FunctionNodes in the left bucket
+    unsigned LeftCount = 0;
+    /// The number of \p FunctionNodes in the right bucket
+    unsigned RightCount = 0;
+    /// The cached gain of moving a \p FunctionNode from the left bucket to the
+    /// right bucket
+    float CachedGainLR;
+    /// The cached gain of moving a \p FunctionNode from the right bucket to
+    /// the left bucket
+    float CachedGainRL;
+    /// Whether \p CachedGainLR and \p CachedGainRL are valid
+    bool CachedGainIsValid = false;
+  };
+
+protected:
+  /// Compute the move gain for uniform log-gap cost
+  static float moveGain(const BPFunctionNode &N, bool FromLeftToRight,
+                        const SignaturesT &Signatures);
+  friend class BalancedPartitioningTest_MoveGain_Test;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BALANCED_PARTITIONING_H
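To make the interface concrete, a hand-built instance (adapted from the Basic unit test below): nodes 0/1 share utility nodes {1,2} and nodes 2/3 share {3,4}, so each pair should land in adjacent buckets after partitioning:

```cpp
std::vector<BPFunctionNode> Nodes = {
    BPFunctionNode(0, {1, 2}), BPFunctionNode(2, {3, 4}),
    BPFunctionNode(1, {1, 2}), BPFunctionNode(3, {3, 4}),
    BPFunctionNode(4, {4}),
};
BalancedPartitioningConfig Config;
BalancedPartitioning BP(Config);
BP.run(Nodes); // the {0,1} and {2,3} pairs end up next to each other
```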
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -13,6 +13,7 @@
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
@@ -803,6 +804,48 @@
   ValueSites.emplace_back(VData, VData + N);
 }
 
+std::vector<BPFunctionNode> TemporalProfTraceTy::createBPFunctionNodes(
+    ArrayRef<TemporalProfTraceTy> Traces) {
+  using IDT = BPFunctionNode::IDT;
+  using UtilityNodeT = BPFunctionNode::UtilityNodeT;
+  // Collect all function IDs ordered by their smallest timestamp. This will be
+  // used as the initial FunctionNode order.
+  SetVector<IDT> FunctionIds;
+  size_t LargestTraceSize = 0;
+  for (auto &Trace : Traces)
+    LargestTraceSize =
+        std::max(LargestTraceSize, Trace.FunctionNameRefs.size());
+  for (size_t Timestamp = 0; Timestamp < LargestTraceSize; Timestamp++)
+    for (auto &Trace : Traces)
+      if (Timestamp < Trace.FunctionNameRefs.size())
+        FunctionIds.insert(Trace.FunctionNameRefs[Timestamp]);
+
+  // Each trace gets N utility nodes, one per power-of-two window of
+  // timestamps; functions seen in the same window share a utility node.
+  int N = std::ceil(std::log2(LargestTraceSize));
+
+  // TODO: We need to use the Trace.Weight field to give more weight to more
+  // important utilities
+  DenseMap<IDT, SmallVector<UtilityNodeT, 4>> FuncGroups;
+  for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) {
+    auto &Trace = Traces[TraceIdx].FunctionNameRefs;
+    for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) {
+      for (int I = std::floor(std::log2(Timestamp + 1)); I < N; I++) {
+        auto &FunctionId = Trace[Timestamp];
+        UtilityNodeT GroupId = TraceIdx * N + I;
+        FuncGroups[FunctionId].push_back(GroupId);
+      }
+    }
+  }
+
+  std::vector<BPFunctionNode> Nodes;
+  for (auto &Id : FunctionIds) {
+    auto &UNs = FuncGroups[Id];
+    llvm::sort(UNs);
+    UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end());
+    Nodes.emplace_back(Id, UNs);
+  }
+  return Nodes;
+}
+
 #define INSTR_PROF_COMMON_API_IMPL
 #include "llvm/ProfileData/InstrProfData.inc"
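Tracing the grouping above by hand for the two toy traces used in BPFunctionNodeTest below: LargestTraceSize = 5, so N = ceil(log2(5)) = 3 utility nodes per trace, and a function at timestamp t joins groups TraceIdx*N + I for every I in [floor(log2(t+1)), N):

```
Trace 0 = [0, 1, 2, 3, 4]   (TraceIdx 0, GroupIds 0..2)
  t=0: floor(log2(1)) = 0  ->  func 0 joins {0, 1, 2}
  t=1: floor(log2(2)) = 1  ->  func 1 joins {1, 2}
  t=2: floor(log2(3)) = 1  ->  func 2 joins {1, 2}
  t=3: floor(log2(4)) = 2  ->  func 3 joins {2}
  t=4: floor(log2(5)) = 2  ->  func 4 joins {2}
Trace 1 = [4, 2]            (TraceIdx 1, GroupIds 3..5)
  t=0: floor(log2(1)) = 0  ->  func 4 joins {3, 4, 5}
  t=1: floor(log2(2)) = 1  ->  func 2 joins {4, 5}

Result: 0->{0,1,2}  1->{1,2}  2->{1,2,4,5}  3->{2}  4->{2,3,4,5}
```

Functions that appear early in a trace share more utility nodes, which pulls them together in the final order; this matches the expectations in the BPFunctionNodeTest.Basic unit test.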
diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Support/BalancedPartitioning.cpp
@@ -0,0 +1,325 @@
+//===- BalancedPartitioning.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements BalancedPartitioning, a recursive balanced graph
+// partitioning algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+#define DEBUG_TYPE "balanced-partitioning"
+
+void BPFunctionNode::dump(raw_ostream &OS) const {
+  OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id,
+                make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket);
+}
+
+template <typename Func>
+void BalancedPartitioning::BPThreadPool::async(Func &&F) {
+  // This new thread could spawn more threads, so mark it as active
+  ++NumActiveThreads;
+  TheThreadPool.async([=]() {
+    // Run the task
+    F();
+
+    // This thread will no longer spawn new threads, so mark it as inactive
+    if (--NumActiveThreads == 0) {
+      // There are no more active threads, so mark as finished and notify
+      {
+        std::unique_lock<std::mutex> lock(mtx);
+        assert(!IsFinishedSpawning);
+        IsFinishedSpawning = true;
+      }
+      cv.notify_one();
+    }
+  });
+}
+
+void BalancedPartitioning::BPThreadPool::wait() {
+  // TODO: We could remove the mutex and condition variable and use
+  // std::atomic::wait() instead, but that isn't available until C++20
+  {
+    std::unique_lock<std::mutex> lock(mtx);
+    cv.wait(lock, [&]() { return IsFinishedSpawning; });
+    assert(IsFinishedSpawning && NumActiveThreads == 0);
+  }
+  // Now we can call ThreadPool::wait() since all tasks have been submitted
+  TheThreadPool.wait();
+}
+
+BalancedPartitioning::BalancedPartitioning(
+    const BalancedPartitioningConfig &Config)
+    : Config(Config) {
+  // Pre-compute log2 values
+  Log2Cache[0] = 0.0;
+  for (unsigned I = 1; I < LOG_CACHE_SIZE; I++)
+    Log2Cache[I] = std::log2(I);
+}
+
+void BalancedPartitioning::run(std::vector<BPFunctionNode> &Nodes) const {
+  LLVM_DEBUG(
+      dbgs() << format(
+          "Partitioning %d nodes using depth %d and %d iterations per split\n",
+          Nodes.size(), Config.SplitDepth, Config.IterationsPerSplit));
+  std::optional<BPThreadPool> TP;
+  if (Config.TaskSplitDepth > 1)
+    TP.emplace();
+
+  // Record the input order
+  for (unsigned I = 0; I < Nodes.size(); I++)
+    Nodes[I].InputOrderIndex = I;
+
+  auto NodesRange = llvm::make_range(Nodes.begin(), Nodes.end());
+  auto BisectTask = [=, &TP]() {
+    bisect(NodesRange, /*RecDepth=*/0, /*RootBucket=*/1, /*Offset=*/0, TP);
+  };
+  if (TP) {
+    TP->async(std::move(BisectTask));
+    TP->wait();
+  } else {
+    BisectTask();
+  }
+
+  llvm::stable_sort(NodesRange, [](const auto &L, const auto &R) {
+    return L.Bucket < R.Bucket;
+  });
+
+  LLVM_DEBUG(dbgs() << "Balanced partitioning completed\n");
+}
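For reference, my reading of the formatv string above is that dump() renders a node roughly like the following (the exact rendering of the std::optional bucket may differ):

```
{ID=1 Utilities={2,3} Bucket=4}
```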
+void BalancedPartitioning::bisect(const FunctionNodeRange Nodes,
+                                  unsigned RecDepth, unsigned RootBucket,
+                                  unsigned Offset,
+                                  std::optional<BPThreadPool> &TP) const {
+  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  if (NumNodes <= 1 || RecDepth >= Config.SplitDepth) {
+    // We've reached the lowest level of the recursion tree. Fall back to the
+    // original order and assign to buckets.
+    llvm::stable_sort(Nodes, [](const auto &L, const auto &R) {
+      return L.InputOrderIndex < R.InputOrderIndex;
+    });
+    for (auto &N : Nodes)
+      N.Bucket = Offset++;
+    return;
+  }
+
+  LLVM_DEBUG(dbgs() << format("Bisect with %d nodes and root bucket %d\n",
+                              NumNodes, RootBucket));
+
+  std::mt19937 RNG(RootBucket);
+
+  unsigned LeftBucket = 2 * RootBucket;
+  unsigned RightBucket = 2 * RootBucket + 1;
+
+  // Split into two and assign to the left and right buckets
+  split(Nodes, LeftBucket);
+
+  runIterations(Nodes, RecDepth, LeftBucket, RightBucket, RNG);
+
+  // Split nodes wrt the resulting buckets
+  auto NodesMid =
+      llvm::partition(Nodes, [&](auto &N) { return N.Bucket == LeftBucket; });
+  unsigned MidOffset = Offset + std::distance(Nodes.begin(), NodesMid);
+
+  auto LeftNodes = llvm::make_range(Nodes.begin(), NodesMid);
+  auto RightNodes = llvm::make_range(NodesMid, Nodes.end());
+
+  auto LeftRecTask = [=, &TP]() {
+    bisect(LeftNodes, RecDepth + 1, LeftBucket, Offset, TP);
+  };
+  auto RightRecTask = [=, &TP]() {
+    bisect(RightNodes, RecDepth + 1, RightBucket, MidOffset, TP);
+  };
+
+  if (TP && RecDepth < Config.TaskSplitDepth && NumNodes >= 4) {
+    TP->async(std::move(LeftRecTask));
+    TP->async(std::move(RightRecTask));
+  } else {
+    LeftRecTask();
+    RightRecTask();
+  }
+}
+
+void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes,
+                                         unsigned RecDepth,
+                                         unsigned LeftBucket,
+                                         unsigned RightBucket,
+                                         std::mt19937 &RNG) const {
+  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeDegree;
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      ++UtilityNodeDegree[UN];
+  // Remove utility nodes if they have just one edge or are connected to all
+  // functions
+  for (auto &N : Nodes)
+    llvm::erase_if(N.UtilityNodes, [&](auto &UN) {
+      return UtilityNodeDegree[UN] <= 1 || UtilityNodeDegree[UN] >= NumNodes;
+    });
+
+  // Renumber utility nodes so they can be used to index into Signatures
+  DenseMap<BPFunctionNode::UtilityNodeT, unsigned> UtilityNodeIndex;
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      if (!UtilityNodeIndex.count(UN))
+        UtilityNodeIndex[UN] = UtilityNodeIndex.size();
+  for (auto &N : Nodes)
+    for (auto &UN : N.UtilityNodes)
+      UN = UtilityNodeIndex[UN];
+
+  // Initialize signatures
+  SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size());
+  for (auto &N : Nodes) {
+    for (auto &UN : N.UtilityNodes) {
+      assert(UN < Signatures.size());
+      if (N.Bucket == LeftBucket) {
+        Signatures[UN].LeftCount++;
+      } else {
+        Signatures[UN].RightCount++;
+      }
+    }
+  }
+
+  for (unsigned I = 0; I < Config.IterationsPerSplit; I++) {
+    unsigned NumMovedNodes =
+        runIteration(Nodes, LeftBucket, RightBucket, Signatures, RNG);
+    if (NumMovedNodes == 0)
+      break;
+  }
+}
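The bucket numbering in bisect() follows the implicit-heap convention, which is why run() starts with RootBucket = 1; for example, with two levels of splitting:

```
            1              <- initial RootBucket
          /   \
         2     3           <- LeftBucket = 2r, RightBucket = 2r + 1
        / \   / \
       4   5 6   7         <- at the leaves, nodes receive final positions
                              [Offset, Offset + size) in input order
```

The final stable_sort by Bucket in run() then concatenates the leaf orderings into one sequence.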
+unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes,
+                                            unsigned LeftBucket,
+                                            unsigned RightBucket,
+                                            SignaturesT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Init signature cost caches
+  for (auto &Signature : Signatures) {
+    if (Signature.CachedGainIsValid)
+      continue;
+    unsigned L = Signature.LeftCount;
+    unsigned R = Signature.RightCount;
+    assert((L > 0 || R > 0) && "incorrect signature");
+    float Cost = logCost(L, R);
+    Signature.CachedGainLR = 0;
+    Signature.CachedGainRL = 0;
+    if (L > 0)
+      Signature.CachedGainLR = Cost - logCost(L - 1, R + 1);
+    if (R > 0)
+      Signature.CachedGainRL = Cost - logCost(L + 1, R - 1);
+    Signature.CachedGainIsValid = true;
+  }
+
+  // Compute move gains
+  typedef std::pair<float, BPFunctionNode *> GainPair;
+  std::vector<GainPair> Gains;
+  for (auto &N : Nodes) {
+    bool FromLeftToRight = (N.Bucket == LeftBucket);
+    float Gain = moveGain(N, FromLeftToRight, Signatures);
+    Gains.push_back(std::make_pair(Gain, &N));
+  }
+
+  // Collect left and right gains
+  auto LeftEnd = llvm::partition(
+      Gains, [&](const auto &GP) { return GP.second->Bucket == LeftBucket; });
+  auto LeftRange = llvm::make_range(Gains.begin(), LeftEnd);
+  auto RightRange = llvm::make_range(LeftEnd, Gains.end());
+
+  // Sort gains in descending order
+  auto LargerGain = [](const auto &L, const auto &R) {
+    return L.first > R.first;
+  };
+  llvm::stable_sort(LeftRange, LargerGain);
+  llvm::stable_sort(RightRange, LargerGain);
+
+  unsigned NumMovedDataVertices = 0;
+  for (auto [LeftPair, RightPair] : llvm::zip(LeftRange, RightRange)) {
+    auto &[LeftGain, LeftNode] = LeftPair;
+    auto &[RightGain, RightNode] = RightPair;
+    // Stop when the gain is no longer beneficial
+    if (LeftGain + RightGain <= 0.0)
+      break;
+    // Try to exchange the nodes between buckets
+    if (moveFunctionNode(*LeftNode, LeftBucket, RightBucket, Signatures, RNG))
+      ++NumMovedDataVertices;
+    if (moveFunctionNode(*RightNode, LeftBucket, RightBucket, Signatures, RNG))
+      ++NumMovedDataVertices;
+  }
+  return NumMovedDataVertices;
+}
+
+bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N,
+                                            unsigned LeftBucket,
+                                            unsigned RightBucket,
+                                            SignaturesT &Signatures,
+                                            std::mt19937 &RNG) const {
+  // Sometimes we skip the move. This helps to escape a local optimum
+  if (std::uniform_real_distribution<float>(0.0, 1.0)(RNG) <=
+      Config.SkipProbability)
+    return false;
+
+  bool FromLeftToRight = (N.Bucket == LeftBucket);
+  // Update the current bucket
+  N.Bucket = (FromLeftToRight ? RightBucket : LeftBucket);
+
+  // Update signatures and invalidate gain cache
+  if (FromLeftToRight) {
+    for (auto &UN : N.UtilityNodes) {
+      auto &Signature = Signatures[UN];
+      Signature.LeftCount--;
+      Signature.RightCount++;
+      Signature.CachedGainIsValid = false;
+    }
+  } else {
+    for (auto &UN : N.UtilityNodes) {
+      auto &Signature = Signatures[UN];
+      Signature.LeftCount++;
+      Signature.RightCount--;
+      Signature.CachedGainIsValid = false;
+    }
+  }
+  return true;
+}
+void BalancedPartitioning::split(const FunctionNodeRange Nodes,
+                                 unsigned StartBucket) const {
+  unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end());
+  auto NodesMid = Nodes.begin() + (NumNodes + 1) / 2;
+
+  std::nth_element(Nodes.begin(), NodesMid, Nodes.end(), [](auto &L, auto &R) {
+    return L.InputOrderIndex < R.InputOrderIndex;
+  });
+
+  for (auto &N : llvm::make_range(Nodes.begin(), NodesMid))
+    N.Bucket = StartBucket;
+  for (auto &N : llvm::make_range(NodesMid, Nodes.end()))
+    N.Bucket = StartBucket + 1;
+}
+
+float BalancedPartitioning::moveGain(const BPFunctionNode &N,
+                                     bool FromLeftToRight,
+                                     const SignaturesT &Signatures) {
+  float Gain = 0;
+  for (auto &UN : N.UtilityNodes)
+    Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR
+                             : Signatures[UN].CachedGainRL);
+  return Gain;
+}
+
+float BalancedPartitioning::logCost(unsigned X, unsigned Y) const {
+  return -(X * log2Cached(X + 1) + Y * log2Cached(Y + 1));
+}
+
+float BalancedPartitioning::log2Cached(unsigned i) const {
+  return (i < LOG_CACHE_SIZE) ? Log2Cache[i] : std::log2(i);
+}
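Written out in the notation of the code: for a utility node u with LeftCount L and RightCount R, the functions above evaluate

```
cost(L, R)  = -( L * log2(L + 1) + R * log2(R + 1) )
gainLR(u)   =  cost(L, R) - cost(L - 1, R + 1)    // CachedGainLR
gainRL(u)   =  cost(L, R) - cost(L + 1, R - 1)    // CachedGainRL
moveGain(N) =  sum of gainLR(u) (or gainRL(u)) over u in N.UtilityNodes
```

so a positive moveGain means moving N to the other bucket lowers the total cost.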
diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -129,6 +129,7 @@
   Allocator.cpp
   AutoConvert.cpp
+  BalancedPartitioning.cpp
   Base64.cpp
   BinaryStreamError.cpp
   BinaryStreamReader.cpp
   BinaryStreamRef.cpp
diff --git a/llvm/test/tools/llvm-profdata/show-order.proftext b/llvm/test/tools/llvm-profdata/show-order.proftext
new file mode 100644
--- /dev/null
+++ b/llvm/test/tools/llvm-profdata/show-order.proftext
@@ -0,0 +1,46 @@
+# RUN: llvm-profdata order %s | FileCheck %s
+
+# CHECK: a
+# CHECK: b
+# CHECK: c
+# CHECK: x
+
+# Header
+:ir
+:temporal_prof_traces
+# Num Traces
+3
+# Trace Stream Size:
+3
+# Weight
+1
+a, main.c:b, c
+# Weight
+1
+a, x, main.c:b, c
+# Weight
+1
+a, main.c:b, c
+
+a
+# Func Hash:
+0x1234
+# Num Counters:
+1
+# Counter Values:
+101
+
+main.c:b
+0x5678
+1
+202
+
+c
+0xabcd
+1
+303
+
+x
+0xefff
+1
+404
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -23,6 +23,7 @@
 #include "llvm/ProfileData/RawMemProfReader.h"
 #include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/ProfileData/SampleProfWriter.h"
+#include "llvm/Support/BalancedPartitioning.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Discriminator.h"
 #include "llvm/Support/Errc.h"
@@ -3038,6 +3039,51 @@
   return showMemProfProfile(Filename, ProfiledBinary, SFormat, OS);
 }
 
+static int order_main(int argc, const char *argv[]) {
+  cl::opt<std::string> Filename(cl::Positional, cl::desc("<profdata-file>"));
+  cl::opt<std::string> OutputFilename("output", cl::value_desc("output"),
+                                      cl::init("-"), cl::desc("Output file"));
+  cl::alias OutputFilenameA("o", cl::desc("Alias for --output"),
+                            cl::aliasopt(OutputFilename));
+  cl::ParseCommandLineOptions(argc, argv, "LLVM profile data order\n");
+
+  std::error_code EC;
+  raw_fd_ostream OS(OutputFilename.data(), EC, sys::fs::OF_TextWithCRLF);
+  if (EC)
+    exitWithErrorCode(EC, OutputFilename);
+  auto FS = vfs::getRealFileSystem();
+  auto ReaderOrErr = InstrProfReader::create(Filename, *FS);
+  if (Error E = ReaderOrErr.takeError())
+    exitWithError(std::move(E), Filename);
+
+  auto Reader = std::move(ReaderOrErr.get());
+  for (auto &I : *Reader) {
+    // Read all entries so the reader collects the temporal profile traces
+    (void)I;
+  }
+  auto &Traces = Reader->getTemporalProfTraces();
+  auto Nodes = TemporalProfTraceTy::createBPFunctionNodes(Traces);
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning BP(Config);
+  BP.run(Nodes);
+
+  WithColor::note() << "# Ordered " << Nodes.size() << " functions\n";
+  for (auto &N : Nodes) {
+    auto FuncName = Reader->getSymtab().getFuncName(N.Id);
+    if (FuncName.contains(':')) {
+      // GlobalValue::getGlobalIdentifier() prefixes the filename if the symbol
+      // is local. This logic will break if there is a colon in the filename,
+      // but we cannot use rsplit() because ObjC symbols can have colons.
+      auto [Filename, ParsedFuncName] = FuncName.split(':');
+      // Emit a comment describing where this symbol came from
+      OS << "# " << Filename << "\n";
+      FuncName = ParsedFuncName;
+    }
+    OS << FuncName << "\n";
+  }
+  return 0;
+}
+
 int llvm_profdata_main(int argc, char **argvNonConst,
                        const llvm::ToolContext &) {
   const char **argv = const_cast<const char **>(argvNonConst);
@@ -3053,6 +3099,8 @@
     func = show_main;
   else if (strcmp(argv[1], "overlap") == 0)
     func = overlap_main;
+  else if (strcmp(argv[1], "order") == 0)
+    func = order_main;
 
   if (func) {
     std::string Invocation(ProgName.str() + " " + argv[1]);
@@ -3083,6 +3131,6 @@
   else
     errs() << ProgName << ": Unknown command!\n";
 
-  errs() << "USAGE: " << ProgName << " <merge|show|overlap> [args...]\n";
+  errs() << "USAGE: " << ProgName << " <merge|show|overlap|order> [args...]\n";
   return 1;
 }
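For the show-order.proftext test above, the order file written by order_main looks like this (the order shown is the one the test's CHECK lines pin down; the "# main.c" line is the filename comment emitted for the local symbol main.c:b, and the "# Ordered 4 functions" note goes to stderr, not into the output):

```
a
# main.c
b
c
x
```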
diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp
@@ -0,0 +1,121 @@
+//===- BalancedPartitioningTest.cpp - BalancedPartitioning tests ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BalancedPartitioning.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/Testing/Support/SupportHelpers.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using testing::AllOf;
+using testing::Each;
+using testing::Field;
+using testing::Not;
+using testing::UnorderedElementsAre;
+using testing::UnorderedElementsAreArray;
+
+namespace llvm {
+
+void PrintTo(const BPFunctionNode &Node, std::ostream *OS) {
+  raw_os_ostream ROS(*OS);
+  Node.dump(ROS);
+}
+
+TEST(BPFunctionNodeTest, Basic) {
+  auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({
+      TemporalProfTraceTy({0, 1, 2, 3, 4}),
+      TemporalProfTraceTy({4, 2}),
+  });
+
+  auto NodeIs = [](BPFunctionNode::IDT Id,
+                   ArrayRef<BPFunctionNode::UtilityNodeT> UNs) {
+    return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+                 Field("UtilityNodes", &BPFunctionNode::UtilityNodes,
+                       UnorderedElementsAreArray(UNs)));
+  };
+
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}),
+                                   NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}),
+                                   NodeIs(4, {2, 3, 4, 5})));
+}
+
+class BalancedPartitioningTest : public ::testing::Test {
+protected:
+  BalancedPartitioningConfig Config;
+  BalancedPartitioning Bp;
+  BalancedPartitioningTest() : Bp(Config) {}
+
+  static std::vector<BPFunctionNode::IDT>
+  getIds(std::vector<BPFunctionNode> Nodes) {
+    std::vector<BPFunctionNode::IDT> Ids;
+    for (auto &N : Nodes)
+      Ids.push_back(N.Id);
+    return Ids;
+  }
+};
+
+TEST_F(BalancedPartitioningTest, Basic) {
+  std::vector<BPFunctionNode> Nodes = {
+      BPFunctionNode(0, {1, 2}), BPFunctionNode(2, {3, 4}),
+      BPFunctionNode(1, {1, 2}), BPFunctionNode(3, {3, 4}),
+      BPFunctionNode(4, {4}),
+  };
+
+  Bp.run(Nodes);
+
+  auto NodeIs = [](BPFunctionNode::IDT Id, std::optional<unsigned> Bucket) {
+    return AllOf(Field("Id", &BPFunctionNode::Id, Id),
+                 Field("Bucket", &BPFunctionNode::Bucket, Bucket));
+  };
+
+  EXPECT_THAT(Nodes,
+              UnorderedElementsAre(NodeIs(0, 0), NodeIs(1, 1), NodeIs(2, 2),
+                                   NodeIs(3, 3), NodeIs(4, 4)));
+}
+
+TEST_F(BalancedPartitioningTest, Large) {
+  const int ProblemSize = 1000;
+  std::vector<BPFunctionNode::UtilityNodeT> AllUNs;
+  for (int i = 0; i < ProblemSize; i++)
+    AllUNs.emplace_back(i);
+
+  std::mt19937 RNG;
+  std::vector<BPFunctionNode> Nodes;
+  for (int i = 0; i < ProblemSize; i++) {
+    std::vector<BPFunctionNode::UtilityNodeT> UNs;
+    int SampleSize =
+        std::uniform_int_distribution<int>(0, AllUNs.size() - 1)(RNG);
+    std::sample(AllUNs.begin(), AllUNs.end(), std::back_inserter(UNs),
+                SampleSize, RNG);
+    Nodes.emplace_back(i, UNs);
+  }
+
+  auto OrigIds = getIds(Nodes);
+
+  Bp.run(Nodes);
+
+  EXPECT_THAT(
+      Nodes, Each(Not(Field("Bucket", &BPFunctionNode::Bucket, std::nullopt))));
+  EXPECT_THAT(getIds(Nodes), UnorderedElementsAreArray(OrigIds));
+}
+
+TEST_F(BalancedPartitioningTest, MoveGain) {
+  BalancedPartitioning::SignaturesT Signatures = {
+      {10, 10, 10.f, 0.f, true}, // 0
+      {10, 10, 0.f, 10.f, true}, // 1
+      {10, 10, 0.f, 20.f, true}, // 2
+  };
+  EXPECT_FLOAT_EQ(Bp.moveGain(BPFunctionNode(0, {}), true, Signatures), 0.f);
+  EXPECT_FLOAT_EQ(Bp.moveGain(BPFunctionNode(0, {0, 1}), true, Signatures),
+                  10.f);
+  EXPECT_FLOAT_EQ(Bp.moveGain(BPFunctionNode(0, {1, 2}), false, Signatures),
+                  30.f);
+}
+
+} // end namespace llvm
diff --git a/llvm/unittests/Support/CMakeLists.txt b/llvm/unittests/Support/CMakeLists.txt
--- a/llvm/unittests/Support/CMakeLists.txt
+++ b/llvm/unittests/Support/CMakeLists.txt
@@ -1,4 +1,5 @@
 set(LLVM_LINK_COMPONENTS
+  ProfileData
   Support
   TargetParser
   )
@@ -14,6 +15,7 @@
+  BalancedPartitioningTest.cpp
   BinaryStreamTest.cpp
   BLAKE3Test.cpp
   BlockFrequencyTest.cpp
   BranchProbabilityTest.cpp
   CachePruningTest.cpp
   CrashRecoveryTest.cpp