diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h
new file mode 100644
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileInference.h
@@ -0,0 +1,50 @@
+//===- Transforms/Utils/SampleProfileInference.h ----------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This file provides the interface for the profile inference algorithm.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H
+#define LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Function.h"
+
+namespace llvm {
+
+using Edge = std::pair<const BasicBlock *, const BasicBlock *>;
+using BlockWeightMap = DenseMap<const BasicBlock *, uint64_t>;
+using EdgeWeightMap = DenseMap<Edge, uint64_t>;
+using BlockEdgeMap =
+    DenseMap<const BasicBlock *, SmallVector<const BasicBlock *, 8>>;
+
+/// Sample profile inference pass.
+class SampleProfileInference {
+public:
+  SampleProfileInference(Function &F, BlockEdgeMap &Successors,
+                         BlockWeightMap &SampleBlockWeights)
+      : F(F), Successors(Successors), SampleBlockWeights(SampleBlockWeights) {}
+
+  void apply(BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights);
+
+private:
+  /// Function.
+  const Function &F;
+
+  /// Successors for each basic block in the CFG.
+  BlockEdgeMap &Successors;
+
+  /// Map basic blocks to their sampled weights.
+  BlockWeightMap &SampleBlockWeights;
+};
+
+} // end namespace llvm
+#endif // LLVM_TRANSFORMS_UTILS_SAMPLEPROFILEINFERENCE_H
diff --git a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
--- a/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
+++ b/llvm/include/llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h
@@ -38,6 +38,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
 
 namespace llvm {
@@ -77,6 +78,7 @@
 extern cl::opt<unsigned> SampleProfileMaxPropagateIterations;
 extern cl::opt<unsigned> SampleProfileRecordCoverage;
 extern cl::opt<unsigned> SampleProfileSampleCoverage;
+extern cl::opt<bool> SampleProfileUseProfi;
 extern cl::opt<bool> NoWarnSampleUnused;
 
 template <typename BT> class SampleProfileLoaderBaseImpl {
@@ -147,6 +149,10 @@
                           ArrayRef<BasicBlockT *> Descendants,
                           PostDominatorTreeT *DomTree);
   void propagateWeights(FunctionT &F);
+  inline void applyProfi(Function &F, BlockEdgeMap &Successors,
+                         BlockWeightMap &SampleBlockWeights,
+                         BlockWeightMap &BlockWeights,
+                         EdgeWeightMap &EdgeWeights);
   uint64_t visitEdge(Edge E, unsigned *NumUnknownEdges, Edge *UnknownEdge);
   void buildEdges(FunctionT &F);
   bool propagateThroughEdges(FunctionT &F, bool UpdateBlockCount);
@@ -155,6 +161,11 @@
   bool
   computeAndPropagateWeights(FunctionT &F,
                              const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+  void initWeightPropagation(FunctionT &F,
+                             const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
+  void
+  finalizeWeightPropagation(FunctionT &F,
+                            const DenseSet<GlobalValue::GUID> &InlinedGUIDs);
   void emitCoverageRemarks(FunctionT &F);
 
   /// Map basic blocks to their computed weights.
@@ -746,50 +757,71 @@
 /// known).
 template <typename BT>
 void SampleProfileLoaderBaseImpl<BT>::propagateWeights(FunctionT &F) {
-  bool Changed = true;
-  unsigned I = 0;
-
-  // If BB weight is larger than its corresponding loop's header BB weight,
-  // use the BB weight to replace the loop header BB weight.
-  for (auto &BI : F) {
-    BasicBlockT *BB = &BI;
-    LoopT *L = LI->getLoopFor(BB);
-    if (!L) {
-      continue;
+  // Flow-based profile inference is only usable with BasicBlock instantiation
+  // of SampleProfileLoaderBaseImpl.
+  if (SampleProfileUseProfi && std::is_same<BasicBlockT, BasicBlock>::value) {
+    // Prepare block sample counts for inference.
+    BlockWeightMap SampleBlockWeights;
+    for (const auto &BI : F) {
+      ErrorOr<uint64_t> Weight = getBlockWeight(&BI);
+      if (Weight)
+        SampleBlockWeights[&BI] = Weight.get();
     }
-    BasicBlockT *Header = L->getHeader();
-    if (Header && BlockWeights[BB] > BlockWeights[Header]) {
-      BlockWeights[Header] = BlockWeights[BB];
+    // Fill in BlockWeights and EdgeWeights using an inference algorithm.
+    applyProfi(getFunction(F), Successors, SampleBlockWeights, BlockWeights,
+               EdgeWeights);
+  } else {
+    bool Changed = true;
+    unsigned I = 0;
+
+    // If BB weight is larger than its corresponding loop's header BB weight,
+    // use the BB weight to replace the loop header BB weight.
+    for (auto &BI : F) {
+      BasicBlockT *BB = &BI;
+      LoopT *L = LI->getLoopFor(BB);
+      if (!L) {
+        continue;
+      }
+      BasicBlockT *Header = L->getHeader();
+      if (Header && BlockWeights[BB] > BlockWeights[Header]) {
+        BlockWeights[Header] = BlockWeights[BB];
+      }
     }
-  }
 
-  // Before propagation starts, build, for each block, a list of
-  // unique predecessors and successors. This is necessary to handle
-  // identical edges in multiway branches. Since we visit all blocks and all
-  // edges of the CFG, it is cleaner to build these lists once at the start
-  // of the pass.
-  buildEdges(F);
+    // Propagate until we converge or we go past the iteration limit.
+    while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+      Changed = propagateThroughEdges(F, false);
+    }
 
-  // Propagate until we converge or we go past the iteration limit.
-  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
-    Changed = propagateThroughEdges(F, false);
-  }
+    // The first propagation propagates BB counts from annotated BBs to unknown
+    // BBs. The 2nd propagation pass resets edge weights and uses all BB
+    // weights to propagate edge weights.
+    VisitedEdges.clear();
+    Changed = true;
+    while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+      Changed = propagateThroughEdges(F, false);
+    }
 
-  // The first propagation propagates BB counts from annotated BBs to unknown
-  // BBs. The 2nd propagation pass resets edges weights, and use all BB weights
-  // to propagate edge weights.
-  VisitedEdges.clear();
-  Changed = true;
-  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
-    Changed = propagateThroughEdges(F, false);
+    // The 3rd propagation pass allows adjusting annotated BB weights that are
+    // obviously wrong.
+    Changed = true;
+    while (Changed && I++ < SampleProfileMaxPropagateIterations) {
+      Changed = propagateThroughEdges(F, true);
+    }
   }
+}
-
-  // The 3rd propagation pass allows adjust annotated BB weights that are
-  // obviously wrong.
-  Changed = true;
-  while (Changed && I++ < SampleProfileMaxPropagateIterations) {
-    Changed = propagateThroughEdges(F, true);
-  }
+template <typename BT>
+inline void SampleProfileLoaderBaseImpl<BT>::applyProfi(
+    Function &F, BlockEdgeMap &Successors, BlockWeightMap &SampleBlockWeights,
+    BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights) {}
+
+template <>
+inline void SampleProfileLoaderBaseImpl<BasicBlock>::applyProfi(
+    Function &F, BlockEdgeMap &Successors, BlockWeightMap &SampleBlockWeights,
+    BlockWeightMap &BlockWeights, EdgeWeightMap &EdgeWeights) {
+  auto Infer = SampleProfileInference(F, Successors, SampleBlockWeights);
+  Infer.apply(BlockWeights, EdgeWeights);
 }
 
 /// Generate branch weight metadata for all branches in \p F.
@@ -847,26 +879,64 @@
   Changed |= computeBlockWeights(F);
 
   if (Changed) {
-    // Add an entry count to the function using the samples gathered at the
-    // function entry.
-    // Sets the GUIDs that are inlined in the profiled binary. This is used
-    // for ThinLink to make correct liveness analysis, and also make the IR
-    // match the profiled binary before annotation.
-    getFunction(F).setEntryCount(
-        ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
-        &InlinedGUIDs);
+    // Initialize propagation.
+    initWeightPropagation(F, InlinedGUIDs);
+
+    // Propagate weights to all edges.
+    propagateWeights(F);
+
+    // Post-process propagated weights.
+    finalizeWeightPropagation(F, InlinedGUIDs);
+  }
+
+  return Changed;
+}
+
+template <typename BT>
+void SampleProfileLoaderBaseImpl<BT>::initWeightPropagation(
+    FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+  // Add an entry count to the function using the samples gathered at the
+  // function entry.
+  // Sets the GUIDs that are inlined in the profiled binary. This is used
+  // for ThinLink to perform correct liveness analysis, and also to make the
+  // IR match the profiled binary before annotation.
+  getFunction(F).setEntryCount(
+      ProfileCount(Samples->getHeadSamples() + 1, Function::PCT_Real),
+      &InlinedGUIDs);
+
+  if (!SampleProfileUseProfi) {
     // Compute dominance and loop info needed for propagation.
     computeDominanceAndLoopInfo(F);
 
     // Find equivalence classes.
     findEquivalenceClasses(F);
-
-    // Propagate weights to all edges.
-    propagateWeights(F);
   }
 
-  return Changed;
+  // Before propagation starts, build, for each block, a list of
+  // unique predecessors and successors. This is necessary to handle
+  // identical edges in multiway branches. Since we visit all blocks and all
+  // edges of the CFG, it is cleaner to build these lists once at the start
+  // of the pass.
+  buildEdges(F);
+}
+
+template <typename BT>
+void SampleProfileLoaderBaseImpl<BT>::finalizeWeightPropagation(
+    FunctionT &F, const DenseSet<GlobalValue::GUID> &InlinedGUIDs) {
+  // If we utilize a flow-based count inference, then we trust the computed
+  // counts and set the entry count as computed by the algorithm. This is
+  // primarily done to sync the counts produced by profi and BFI inference,
+  // which uses the entry count for mass propagation.
+  // If profi produces a zero value for the entry count, we fall back to
+  // Samples->getHeadSamples() + 1 to avoid functions with a zero count.
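+  // (This can happen, e.g., when none of the sampled blocks forces a positive
+  // flow through the entry block.)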
+  if (SampleProfileUseProfi) {
+    const BasicBlockT *EntryBB = getEntryBB(&F);
+    if (BlockWeights[EntryBB] > 0) {
+      getFunction(F).setEntryCount(
+          ProfileCount(BlockWeights[EntryBB], Function::PCT_Real),
+          &InlinedGUIDs);
+    }
+  }
+}
 
 template <typename BT>
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -84,6 +84,7 @@
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/CallPromotionUtils.h"
 #include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseImpl.h"
 #include "llvm/Transforms/Utils/SampleProfileLoaderBaseUtil.h"
 #include <algorithm>
@@ -1557,6 +1558,19 @@
   SmallVector<uint32_t, 4> Weights;
   uint32_t MaxWeight = 0;
   Instruction *MaxDestInst;
+  // Since profi treats multiple edges (multiway branches) as a single edge,
+  // we need to distribute the computed weight among the branches. We do
+  // this by evenly splitting the edge weight among destinations.
+  DenseMap<const BasicBlock *, uint64_t> EdgeMultiplicity;
+  std::vector<uint64_t> EdgeIndex;
+  if (SampleProfileUseProfi) {
+    EdgeIndex.resize(TI->getNumSuccessors());
+    for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
+      const BasicBlock *Succ = TI->getSuccessor(I);
+      EdgeIndex[I] = EdgeMultiplicity[Succ];
+      EdgeMultiplicity[Succ]++;
+    }
+  }
   for (unsigned I = 0; I < TI->getNumSuccessors(); ++I) {
     BasicBlock *Succ = TI->getSuccessor(I);
     Edge E = std::make_pair(BB, Succ);
@@ -1569,9 +1583,19 @@
       LLVM_DEBUG(dbgs() << " (saturated due to uint32_t overflow)");
       Weight = std::numeric_limits<uint32_t>::max();
     }
-    // Weight is added by one to avoid propagation errors introduced by
-    // 0 weights.
-    Weights.push_back(static_cast<uint32_t>(Weight + 1));
+    if (!SampleProfileUseProfi) {
+      // Weight is added by one to avoid propagation errors introduced by
+      // 0 weights.
+      Weights.push_back(static_cast<uint32_t>(Weight + 1));
+    } else {
+      // Profi creates proper weights that do not require "+1" adjustments,
+      // but we evenly split the weight among branches with the same
+      // destination.
+      uint64_t W = Weight / EdgeMultiplicity[Succ];
+      // Round up, if needed, so that the first branches are hotter.
+      if (EdgeIndex[I] < Weight % EdgeMultiplicity[Succ])
+        W++;
+      Weights.push_back(static_cast<uint32_t>(W));
+    }
     if (Weight != 0) {
       if (Weight > MaxWeight) {
         MaxWeight = Weight;
diff --git a/llvm/lib/Transforms/Utils/CMakeLists.txt b/llvm/lib/Transforms/Utils/CMakeLists.txt
--- a/llvm/lib/Transforms/Utils/CMakeLists.txt
+++ b/llvm/lib/Transforms/Utils/CMakeLists.txt
@@ -60,6 +60,7 @@
   StripGCRelocates.cpp
   SSAUpdater.cpp
   SSAUpdaterBulk.cpp
+  SampleProfileInference.cpp
   SampleProfileLoaderBaseUtil.cpp
   SanitizerStats.cpp
   SimplifyCFG.cpp
diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp
@@ -0,0 +1,642 @@
+//===- SampleProfileInference.cpp - Adjust sample profiles in the IR ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a profile inference algorithm.
+// Given incomplete and possibly imprecise block counts, the algorithm
+// reconstructs realistic block and edge counts that satisfy flow conservation
+// rules, while minimally modifying the input block counts.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/SampleProfileInference.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/Debug.h"
+#include <queue>
+#include <set>
+
+using namespace llvm;
+#define DEBUG_TYPE "sample-profile-inference"
+
+namespace {
+
+/// A value indicating an infinite flow/capacity/weight of a block/edge.
+/// Not using std::numeric_limits<int64_t>::max(), as the values can be summed
+/// up during the execution.
+static constexpr int64_t INF = ((int64_t)1) << 50;
+
+struct FlowJump;
+
+/// A wrapper of a binary basic block.
+struct FlowBlock {
+  uint64_t Index;
+  uint64_t Weight{0};
+  bool Dangling{false};
+  uint64_t Flow{0};
+  bool HasSelfEdge{false};
+  std::vector<FlowJump *> SuccJumps;
+  std::vector<FlowJump *> PredJumps;
+
+  /// Check if it is the entry block in the function.
+  bool isEntry() const { return PredJumps.empty(); }
+
+  /// Check if it is an exit block in the function.
+  bool isExit() const { return SuccJumps.empty(); }
+};
+
+/// A wrapper of a jump between two basic blocks.
+struct FlowJump {
+  uint64_t Source;
+  uint64_t Target;
+  uint64_t Flow{0};
+  bool IsUnlikely{false};
+};
+
+/// A wrapper of a binary function with basic blocks and jumps.
+struct FlowFunction {
+  std::vector<FlowBlock> Blocks;
+  std::vector<FlowJump> Jumps;
+  /// The index of the entry block.
+  uint64_t Entry;
+};
+
+/// The minimum-cost maximum flow algorithm.
+///
+/// The algorithm finds the maximum flow of minimum cost on a given (directed)
+/// network using a modified version of the classical Moore-Bellman-Ford
+/// approach. The algorithm applies a number of augmentation iterations in
+/// which flow is sent along paths of positive capacity from the source to the
+/// sink. The worst-case time complexity of the implementation is O(v(f)*m*n),
+/// where m is the number of edges, n is the number of vertices, and v(f) is
+/// the value of the maximum flow. However, the observed running time on
+/// typical instances is sub-quadratic, that is, o(n^2).
+///
+/// The input is a set of edges with specified costs and capacities, and a pair
+/// of nodes (source and sink). The output is the flow along each edge of the
+/// minimum total cost respecting the given edge capacities.
+class MinCostMaxFlow {
+public:
+  // Initialize the algorithm's data structures for a network of a given size.
+  void initialize(uint64_t NodeCount, uint64_t SourceNode, uint64_t SinkNode) {
+    Source = SourceNode;
+    Target = SinkNode;
+
+    Nodes = std::vector<Node>(NodeCount);
+    Edges = std::vector<std::vector<Edge>>(NodeCount, std::vector<Edge>());
+  }
+
+  // Run the algorithm.
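+  // Returns the total cost of the resulting flow. A minimal usage sketch
+  // (with made-up node ids, capacities, and costs):
+  //   MinCostMaxFlow Flow;
+  //   Flow.initialize(/*NodeCount=*/3, /*SourceNode=*/0, /*SinkNode=*/2);
+  //   Flow.addEdge(0, 1, /*Capacity=*/10, /*Cost=*/1);
+  //   Flow.addEdge(1, 2, /*Capacity=*/10, /*Cost=*/1);
+  //   int64_t Cost = Flow.run(); // routes 10 units along 0->1->2; Cost == 20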
+  int64_t run() {
+    // Find an augmenting path and update the flow along the path.
+    size_t AugmentationIters = 0;
+    while (findAugmentingPath()) {
+      augmentFlowAlongPath();
+      AugmentationIters++;
+    }
+
+    // Compute the total flow and its cost.
+    int64_t TotalCost = 0;
+    int64_t TotalFlow = 0;
+    for (uint64_t Src = 0; Src < Nodes.size(); Src++) {
+      for (auto &Edge : Edges[Src]) {
+        if (Edge.Flow > 0) {
+          TotalCost += Edge.Cost * Edge.Flow;
+          if (Src == Source)
+            TotalFlow += Edge.Flow;
+        }
+      }
+    }
+    LLVM_DEBUG(dbgs() << "Completed profi after " << AugmentationIters
+                      << " iterations with " << TotalFlow << " total flow"
+                      << " of " << TotalCost << " cost\n");
+    return TotalCost;
+  }
+
+  /// Add an edge to the network with a specified capacity and a cost.
+  /// Multiple edges between a pair of nodes are allowed, but self-edges
+  /// are not supported.
+  void addEdge(uint64_t Src, uint64_t Dst, int64_t Capacity, int64_t Cost) {
+    assert(Capacity > 0 && "adding an edge of zero capacity");
+    assert(Src != Dst && "self-edges are not supported");
+
+    Edge SrcEdge;
+    SrcEdge.Dst = Dst;
+    SrcEdge.Cost = Cost;
+    SrcEdge.Capacity = Capacity;
+    SrcEdge.Flow = 0;
+    SrcEdge.RevEdgeIndex = Edges[Dst].size();
+
+    Edge DstEdge;
+    DstEdge.Dst = Src;
+    DstEdge.Cost = -Cost;
+    DstEdge.Capacity = 0;
+    DstEdge.Flow = 0;
+    DstEdge.RevEdgeIndex = Edges[Src].size();
+
+    Edges[Src].push_back(SrcEdge);
+    Edges[Dst].push_back(DstEdge);
+  }
+
+  /// Add an edge to the network of infinite capacity and a given cost.
+  void addEdge(uint64_t Src, uint64_t Dst, int64_t Cost) {
+    addEdge(Src, Dst, INF, Cost);
+  }
+
+  /// Get the total flow from a given source node.
+  /// Returns a list of pairs (target node, amount of flow to the target).
+  const std::vector<std::pair<uint64_t, int64_t>> getFlow(uint64_t Src) const {
+    std::vector<std::pair<uint64_t, int64_t>> Flow;
+    for (auto &Edge : Edges[Src]) {
+      if (Edge.Flow > 0)
+        Flow.push_back(std::make_pair(Edge.Dst, Edge.Flow));
+    }
+    return Flow;
+  }
+
+  /// Get the total flow between a pair of nodes.
+  int64_t getFlow(uint64_t Src, uint64_t Dst) const {
+    int64_t Flow = 0;
+    for (auto &Edge : Edges[Src]) {
+      if (Edge.Dst == Dst) {
+        Flow += Edge.Flow;
+      }
+    }
+    return Flow;
+  }
+
+  /// A cost of increasing a block's count by one.
+  static constexpr int64_t AuxCostInc = 10;
+  /// A cost of decreasing a block's count by one.
+  static constexpr int64_t AuxCostDec = 20;
+  /// A cost of increasing the count of a zero-weight block by one.
+  static constexpr int64_t AuxCostIncZero = 11;
+  /// A cost of increasing the entry block's count by one.
+  static constexpr int64_t AuxCostIncEntry = 40;
+  /// A cost of decreasing the entry block's count by one.
+  static constexpr int64_t AuxCostDecEntry = 10;
+  /// A cost of taking an unlikely jump.
+  static constexpr int64_t AuxCostUnlikely = ((int64_t)1) << 20;
+
+private:
+  /// Check for existence of an augmenting path with a positive capacity.
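+  /// The search is a queue-based variant of Bellman-Ford (in the spirit of
+  /// SPFA): a node is re-enqueued whenever its distance from the source
+  /// improves, which tolerates the negative-cost residual edges that would
+  /// break a plain Dijkstra search.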
+  bool findAugmentingPath() {
+    // Initialize data structures.
+    for (auto &Node : Nodes) {
+      Node.Distance = INF;
+      Node.ParentNode = uint64_t(-1);
+      Node.ParentEdgeIndex = uint64_t(-1);
+      Node.Taken = false;
+    }
+
+    std::queue<uint64_t> Queue;
+    Queue.push(Source);
+    Nodes[Source].Distance = 0;
+    Nodes[Source].Taken = true;
+    while (!Queue.empty()) {
+      uint64_t Src = Queue.front();
+      Queue.pop();
+      Nodes[Src].Taken = false;
+      // Although the residual network contains edges with negative costs
+      // (in particular, backward edges), it can be shown that there are no
+      // negative-weight cycles and the following two invariants are
+      // maintained:
+      // (i) Dist[Source, V] >= 0 and (ii) Dist[V, Target] >= 0 for all nodes
+      // V, where Dist is the length of the shortest path between two nodes.
+      // This allows us to prune the search-space of the path-finding
+      // algorithm using the following early-stop criteria:
+      // -- If we find a path with zero-distance from Source to Target, stop
+      //    the search, as the path is the shortest since
+      //    Dist[Source, Target] >= 0;
+      // -- If we have Dist[Source, V] > Dist[Source, Target], then do not
+      //    process node V, as it is guaranteed _not_ to be on a shortest path
+      //    from Source to Target; it follows from inequalities
+      //    Dist[Source, Target] >= Dist[Source, V] + Dist[V, Target]
+      //                         >= Dist[Source, V]
+      if (Nodes[Target].Distance == 0)
+        break;
+      if (Nodes[Src].Distance > Nodes[Target].Distance)
+        continue;
+
+      // Process adjacent edges.
+      for (uint64_t EdgeIdx = 0; EdgeIdx < Edges[Src].size(); EdgeIdx++) {
+        auto &Edge = Edges[Src][EdgeIdx];
+        if (Edge.Flow < Edge.Capacity) {
+          uint64_t Dst = Edge.Dst;
+          int64_t NewDistance = Nodes[Src].Distance + Edge.Cost;
+          if (Nodes[Dst].Distance > NewDistance) {
+            // Update the distance and the parent node/edge.
+            Nodes[Dst].Distance = NewDistance;
+            Nodes[Dst].ParentNode = Src;
+            Nodes[Dst].ParentEdgeIndex = EdgeIdx;
+            // Add the node to the queue, if it is not there yet.
+            if (!Nodes[Dst].Taken) {
+              Queue.push(Dst);
+              Nodes[Dst].Taken = true;
+            }
+          }
+        }
+      }
+    }
+
+    return Nodes[Target].Distance != INF;
+  }
+
+  /// Update the current flow along the augmenting path.
+  void augmentFlowAlongPath() {
+    // Find the path capacity.
+    int64_t PathCapacity = INF;
+    uint64_t Now = Target;
+    while (Now != Source) {
+      uint64_t Pred = Nodes[Now].ParentNode;
+      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+      PathCapacity = std::min(PathCapacity, Edge.Capacity - Edge.Flow);
+      Now = Pred;
+    }
+
+    assert(PathCapacity > 0 && "found an incorrect augmenting path");
+
+    // Update the flow along the path.
+    Now = Target;
+    while (Now != Source) {
+      uint64_t Pred = Nodes[Now].ParentNode;
+      auto &Edge = Edges[Pred][Nodes[Now].ParentEdgeIndex];
+      auto &RevEdge = Edges[Now][Edge.RevEdgeIndex];
+
+      Edge.Flow += PathCapacity;
+      RevEdge.Flow -= PathCapacity;
+
+      Now = Pred;
+    }
+  }
+
+  /// A node in a flow network.
+  struct Node {
+    /// The cost of the cheapest path from the source to the current node.
+    int64_t Distance;
+    /// The node preceding the current one in the path.
+    uint64_t ParentNode;
+    /// The index of the edge between ParentNode and the current node.
+    uint64_t ParentEdgeIndex;
+    /// An indicator of whether the current node is in a queue.
+    bool Taken;
+  };
+  /// An edge in a flow network.
+  struct Edge {
+    /// The cost of the edge.
+    int64_t Cost;
+    /// The capacity of the edge.
+    int64_t Capacity;
+    /// The current flow on the edge.
+    int64_t Flow;
+    /// The destination node of the edge.
+    uint64_t Dst;
+    /// The index of the reverse edge between Dst and the current node.
+    uint64_t RevEdgeIndex;
+  };
+
+  /// The set of network nodes.
+  std::vector<Node> Nodes;
+  /// The set of network edges.
+  std::vector<std::vector<Edge>> Edges;
+  /// Source node of the flow.
+  uint64_t Source;
+  /// Target (sink) node of the flow.
+  uint64_t Target;
+};
+
+/// Initializing the flow network for a given function.
+///
+/// Every block is split into three nodes that are responsible for (i) an
+/// incoming flow, (ii) an outgoing flow, and (iii) penalizing an increase or
+/// reduction of the block weight.
+void initializeNetwork(MinCostMaxFlow &Network, FlowFunction &Func) {
+  uint64_t NumBlocks = Func.Blocks.size();
+  assert(NumBlocks > 1 && "Too few blocks in a function");
+  LLVM_DEBUG(dbgs() << "Initializing profi for " << NumBlocks << " blocks\n");
+
+  // Pre-process data: make sure the entry weight is at least 1.
+  if (Func.Blocks[Func.Entry].Weight == 0) {
+    Func.Blocks[Func.Entry].Weight = 1;
+  }
+  // Introducing dummy source/sink pairs to allow flow circulation.
+  // The nodes corresponding to blocks of Func have indices in the range
+  // [0..3 * NumBlocks); the dummy nodes are indexed by the next four values.
+  uint64_t S = 3 * NumBlocks;
+  uint64_t T = S + 1;
+  uint64_t S1 = S + 2;
+  uint64_t T1 = S + 3;
+
+  Network.initialize(3 * NumBlocks + 4, S1, T1);
+
+  // Create three nodes for every block of the function.
+  for (uint64_t B = 0; B < NumBlocks; B++) {
+    auto &Block = Func.Blocks[B];
+    assert((!Block.Dangling || Block.Weight == 0 || Block.isEntry()) &&
+           "non-zero weight of a dangling block except for a dangling entry");
+
+    // Split every block into three nodes.
+    uint64_t Bin = 3 * B;
+    uint64_t Bout = 3 * B + 1;
+    uint64_t Baux = 3 * B + 2;
+    if (Block.Weight > 0) {
+      Network.addEdge(S1, Bout, Block.Weight, 0);
+      Network.addEdge(Bin, T1, Block.Weight, 0);
+    }
+
+    // Edges from S and to T.
+    assert((!Block.isEntry() || !Block.isExit()) &&
+           "a block cannot be an entry and an exit");
+    if (Block.isEntry()) {
+      Network.addEdge(S, Bin, 0);
+    } else if (Block.isExit()) {
+      Network.addEdge(Bout, T, 0);
+    }
+
+    // An auxiliary node to allow increase/reduction of block counts:
+    // We assume that decreasing block counts is more expensive than
+    // increasing, and thus we set separate costs here. In the future we may
+    // want to tune the relative costs so as to maximize the quality of
+    // generated profiles.
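+    // To illustrate the cost model with the default constants: raising a
+    // regular block's count by 3 costs 3 * AuxCostInc = 30, while lowering it
+    // by 3 costs 3 * AuxCostDec = 60, so the solver prefers bumping counts up
+    // over discarding sampled ones.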
+    int64_t AuxCostInc = MinCostMaxFlow::AuxCostInc;
+    int64_t AuxCostDec = MinCostMaxFlow::AuxCostDec;
+    if (Block.Dangling) {
+      // Do not penalize changing weights of dangling blocks.
+      AuxCostInc = 0;
+      AuxCostDec = 0;
+    } else {
+      // Increasing the count for "cold" blocks with zero initial count is
+      // more expensive than for "hot" ones.
+      if (Block.Weight == 0) {
+        AuxCostInc = MinCostMaxFlow::AuxCostIncZero;
+      }
+      // Modifying the count of the entry block is expensive.
+      if (Block.isEntry()) {
+        AuxCostInc = MinCostMaxFlow::AuxCostIncEntry;
+        AuxCostDec = MinCostMaxFlow::AuxCostDecEntry;
+      }
+    }
+    // For blocks with self-edges, do not penalize a reduction of the count,
+    // as all of the increase can be attributed to the self-edge.
+    if (Block.HasSelfEdge) {
+      AuxCostDec = 0;
+    }
+
+    Network.addEdge(Bin, Baux, AuxCostInc);
+    Network.addEdge(Baux, Bout, AuxCostInc);
+    if (Block.Weight > 0) {
+      Network.addEdge(Bout, Baux, AuxCostDec);
+      Network.addEdge(Baux, Bin, AuxCostDec);
+    }
+  }
+
+  // Creating edges for every jump.
+  for (auto &Jump : Func.Jumps) {
+    uint64_t Src = Jump.Source;
+    uint64_t Dst = Jump.Target;
+    if (Src != Dst) {
+      uint64_t SrcOut = 3 * Src + 1;
+      uint64_t DstIn = 3 * Dst;
+      uint64_t Cost = Jump.IsUnlikely ? MinCostMaxFlow::AuxCostUnlikely : 0;
+      Network.addEdge(SrcOut, DstIn, Cost);
+    }
+  }
+
+  // Make sure we have a valid flow circulation.
+  Network.addEdge(T, S, 0);
+}
+
+/// Try to infer branch probabilities mimicking the implementation of
+/// BranchProbabilityInfo. Unlikely taken branches are marked so that the
+/// inference algorithm can avoid sending flow along corresponding edges.
+void findUnlikelyJumps(const std::vector<const BasicBlock *> &BasicBlocks,
+                       BlockEdgeMap &Successors, FlowFunction &Func) {
+  for (auto &Jump : Func.Jumps) {
+    const auto *BB = BasicBlocks[Jump.Source];
+    const auto *Succ = BasicBlocks[Jump.Target];
+    const Instruction *TI = BB->getTerminator();
+    // Check if a block ends with InvokeInst and mark the non-taken branch
+    // unlikely. In that case block Succ should be a landing pad.
+    if (Successors[BB].size() == 2 && Successors[BB].back() == Succ) {
+      if (isa<InvokeInst>(TI)) {
+        Jump.IsUnlikely = true;
+      }
+    }
+    const Instruction *SuccTI = Succ->getTerminator();
+    // Check if the target block contains UnreachableInst and mark it
+    // unlikely.
+    if (SuccTI->getNumSuccessors() == 0) {
+      if (isa<UnreachableInst>(SuccTI)) {
+        Jump.IsUnlikely = true;
+      }
+    }
+  }
+}
+
+/// Extract resulting block and edge counts from the flow network.
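+///
+/// A block B is modeled by the node triple (3*B, 3*B+1, 3*B+2): its count is
+/// the total flow leaving the "out" node 3*B+1, excluding the flow routed to
+/// the auxiliary node 3*B+2 (unless B has a self-edge); a jump count is the
+/// flow on the edge between the source's "out" node and the target's "in"
+/// node.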
+void extractWeights(MinCostMaxFlow &Network, FlowFunction &Func) {
+  uint64_t NumBlocks = Func.Blocks.size();
+
+  // Extract resulting block counts.
+  for (uint64_t Src = 0; Src < NumBlocks; Src++) {
+    auto &Block = Func.Blocks[Src];
+    uint64_t SrcOut = 3 * Src + 1;
+    int64_t Flow = 0;
+    for (auto &Adj : Network.getFlow(SrcOut)) {
+      uint64_t DstIn = Adj.first;
+      int64_t DstFlow = Adj.second;
+      bool IsAuxNode = (DstIn < 3 * NumBlocks && DstIn % 3 == 2);
+      if (!IsAuxNode || Block.HasSelfEdge) {
+        Flow += DstFlow;
+      }
+    }
+    Block.Flow = Flow;
+    assert(Flow >= 0 && "negative block flow");
+  }
+
+  // Extract resulting jump counts.
+  for (auto &Jump : Func.Jumps) {
+    uint64_t Src = Jump.Source;
+    uint64_t Dst = Jump.Target;
+    int64_t Flow = 0;
+    if (Src != Dst) {
+      uint64_t SrcOut = 3 * Src + 1;
+      uint64_t DstIn = 3 * Dst;
+      Flow = Network.getFlow(SrcOut, DstIn);
+    } else {
+      uint64_t SrcOut = 3 * Src + 1;
+      uint64_t SrcAux = 3 * Src + 2;
+      int64_t AuxFlow = Network.getFlow(SrcOut, SrcAux);
+      if (AuxFlow > 0)
+        Flow = AuxFlow;
+    }
+    Jump.Flow = Flow;
+    assert(Flow >= 0 && "negative jump flow");
+  }
+}
+
+#ifndef NDEBUG
+/// Verify that the computed flow values satisfy flow conservation rules.
+void verifyWeights(const FlowFunction &Func) {
+  const uint64_t NumBlocks = Func.Blocks.size();
+  auto InFlow = std::vector<uint64_t>(NumBlocks, 0);
+  auto OutFlow = std::vector<uint64_t>(NumBlocks, 0);
+  for (auto &Jump : Func.Jumps) {
+    InFlow[Jump.Target] += Jump.Flow;
+    OutFlow[Jump.Source] += Jump.Flow;
+  }
+
+  uint64_t TotalInFlow = 0;
+  uint64_t TotalOutFlow = 0;
+  for (uint64_t I = 0; I < NumBlocks; I++) {
+    auto &Block = Func.Blocks[I];
+    if (Block.isEntry()) {
+      TotalInFlow += Block.Flow;
+      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+    } else if (Block.isExit()) {
+      TotalOutFlow += Block.Flow;
+      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+    } else {
+      assert(Block.Flow == OutFlow[I] && "incorrectly computed control flow");
+      assert(Block.Flow == InFlow[I] && "incorrectly computed control flow");
+    }
+  }
+  assert(TotalInFlow == TotalOutFlow && "incorrectly computed control flow");
+}
+#endif
+
+} // end of anonymous namespace
+
+/// Apply the profile inference algorithm for a given function.
+void SampleProfileInference::apply(BlockWeightMap &BlockWeights,
+                                   EdgeWeightMap &EdgeWeights) {
+  // Find all forward-reachable blocks, on which the inference algorithm will
+  // be applied.
+  df_iterator_default_set<const BasicBlock *> Reachable;
+  for (auto *BB : depth_first_ext(&F, Reachable))
+    (void)BB /* Mark all reachable blocks */;
+
+  // Find all backward-reachable blocks, on which the inference algorithm will
+  // be applied.
+  df_iterator_default_set<const BasicBlock *> InverseReachable;
+  for (const auto &BB : F) {
+    // An exit block is a block without any successors.
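+    // (In LLVM IR, typically a block terminated by a ReturnInst or an
+    // UnreachableInst.)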
+    if (succ_empty(&BB)) {
+      for (auto *RBB : inverse_depth_first_ext(&BB, InverseReachable))
+        (void)RBB;
+    }
+  }
+
+  // Keep a stable order for reachable blocks.
+  DenseMap<const BasicBlock *, uint64_t> BlockIndex;
+  std::vector<const BasicBlock *> BasicBlocks;
+  BlockIndex.reserve(Reachable.size());
+  BasicBlocks.reserve(Reachable.size());
+  for (const auto &BB : F) {
+    if (Reachable.count(&BB) && InverseReachable.count(&BB)) {
+      BlockIndex[&BB] = BasicBlocks.size();
+      BasicBlocks.push_back(&BB);
+    }
+  }
+
+  BlockWeights.clear();
+  EdgeWeights.clear();
+  bool HasSamples = false;
+  for (const auto *BB : BasicBlocks) {
+    auto It = SampleBlockWeights.find(BB);
+    if (It != SampleBlockWeights.end() && It->second > 0) {
+      HasSamples = true;
+      BlockWeights[BB] = It->second;
+    }
+  }
+  // Quit early for functions with a single block or ones w/o samples.
+  if (BasicBlocks.size() <= 1 || !HasSamples) {
+    return;
+  }
+
+  // Create necessary objects.
+  FlowFunction Func;
+  Func.Blocks.reserve(BasicBlocks.size());
+  // Create FlowBlocks.
+  for (const auto *BB : BasicBlocks) {
+    FlowBlock Block;
+    if (SampleBlockWeights.find(BB) != SampleBlockWeights.end()) {
+      Block.Dangling = false;
+      Block.Weight = SampleBlockWeights[BB];
+    } else {
+      Block.Dangling = true;
+      Block.Weight = 0;
+    }
+    Block.Index = Func.Blocks.size();
+    Func.Blocks.push_back(Block);
+  }
+  // Create FlowEdges.
+  for (const auto *BB : BasicBlocks) {
+    for (auto *Succ : Successors[BB]) {
+      if (!BlockIndex.count(Succ))
+        continue;
+      FlowJump Jump;
+      Jump.Source = BlockIndex[BB];
+      Jump.Target = BlockIndex[Succ];
+      Func.Jumps.push_back(Jump);
+      if (BB == Succ) {
+        Func.Blocks[BlockIndex[BB]].HasSelfEdge = true;
+      }
+    }
+  }
+  for (auto &Jump : Func.Jumps) {
+    Func.Blocks[Jump.Source].SuccJumps.push_back(&Jump);
+    Func.Blocks[Jump.Target].PredJumps.push_back(&Jump);
+  }
+
+  // Try to infer probabilities of jumps based on the content of basic blocks.
+  findUnlikelyJumps(BasicBlocks, Successors, Func);
+
+  // Find the entry block.
+  for (size_t I = 0; I < Func.Blocks.size(); I++) {
+    if (Func.Blocks[I].isEntry()) {
+      Func.Entry = I;
+      break;
+    }
+  }
+
+  // Create and apply an inference network model.
+  auto InferenceNetwork = MinCostMaxFlow();
+  initializeNetwork(InferenceNetwork, Func);
+  InferenceNetwork.run();
+
+  // Extract flow values for every block and every edge.
+  extractWeights(InferenceNetwork, Func);
+
+#ifndef NDEBUG
+  // Verify the result.
+  verifyWeights(Func);
+#endif
+
+  // Extract the resulting weights from the control flow.
+  for (const auto *BB : BasicBlocks) {
+    BlockWeights[BB] = Func.Blocks[BlockIndex[BB]].Flow;
+  }
+  for (auto &Jump : Func.Jumps) {
+    Edge E = std::make_pair(BasicBlocks[Jump.Source], BasicBlocks[Jump.Target]);
+    EdgeWeights[E] = Jump.Flow;
+  }
+
+#ifndef NDEBUG
+  // Unreachable blocks and edges should not have a weight.
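+  // Only blocks that are both forward- and backward-reachable were assigned
+  // indices, so every block or edge that received a weight must lie on some
+  // path from the entry to an exit.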
+  for (auto &I : BlockWeights) {
+    assert(Reachable.contains(I.first));
+    assert(InverseReachable.contains(I.first));
+  }
+  for (auto &I : EdgeWeights) {
+    assert(Reachable.contains(I.first.first) &&
+           Reachable.contains(I.first.second));
+    assert(InverseReachable.contains(I.first.first) &&
+           InverseReachable.contains(I.first.second));
+  }
+#endif
+}
diff --git a/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp b/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
--- a/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
+++ b/llvm/lib/Transforms/Utils/SampleProfileLoaderBaseUtil.cpp
@@ -34,6 +34,10 @@
     cl::desc("Use this option to turn off/on warnings about function with "
              "samples but without debug information to use those samples. "));
 
+cl::opt<bool> SampleProfileUseProfi(
+    "sample-profile-use-profi", cl::init(false), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Use profi to infer block and edge counts."));
+
 namespace sampleprofutil {
 
 /// Return true if the given callsite is hot wrt to hot cutoff threshold.
diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference.prof
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference.prof
@@ -0,0 +1,23 @@
+test_1:23968:0
+ 1: 100
+ 2: 60
+ 3: 40
+ !CFGChecksum: 4294967295
+
+test_2:23968:0
+ 1: 100
+ 3: 10
+ !CFGChecksum: 37753817093
+
+test_3:10000:0
+ 3: 13
+ 5: 89
+ !CFGChecksum: 69502983527
+
+sum_of_squares:23968:0
+ 2: 5993
+ 3: 1
+ 4: 5992
+ 5: 5992
+ 8: 5992
+ !CFGChecksum: 175862120757
diff --git a/llvm/test/Transforms/SampleProfile/profile-inference.ll b/llvm/test/Transforms/SampleProfile/profile-inference.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/SampleProfile/profile-inference.ll
@@ -0,0 +1,245 @@
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s
+; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference.prof | opt -analyze -block-freq -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2
+
+; The test verifies that profile inference correctly builds branch
+; probabilities from sampling-based block counts.
+;
+; +---------+     +----------+
+; | b3 [40] | <-- | b1 [100] |
+; +---------+     +----------+
+;                   |
+;                   |
+;                   v
+;                 +----------+
+;                 | b2 [60]  |
+;                 +----------+
+
+@yydebug = dso_local global i32 0, align 4
+
+; Function Attrs: nounwind uwtable
+define dso_local i32 @test_1() #0 {
+b1:
+  call void @llvm.pseudoprobe(i64 7964825052912775246, i64 1, i32 0, i64 -1)
+  %0 = load i32, i32* @yydebug, align 4
+  %cmp = icmp ne i32 %0, 0
+  br i1 %cmp, label %b2, label %b3
+; CHECK: edge b1 -> b2 probability is 0x4ccccccd / 0x80000000 = 60.00%
+; CHECK: edge b1 -> b3 probability is 0x33333333 / 0x80000000 = 40.00%
+; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 100
+
+b2:
+  call void @llvm.pseudoprobe(i64 7964825052912775246, i64 2, i32 0, i64 -1)
+  ret i32 %0
+; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 60
+
+b3:
+  call void @llvm.pseudoprobe(i64 7964825052912775246, i64 3, i32 0, i64 -1)
+  ret i32 %0
+; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 40
+}
+
+
+; The test verifies that profile inference correctly builds branch
+; probabilities from sampling-based block counts in the presence of "dangling"
+; probes (whose block counts are missing).
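+; Here the count of b2 is not sampled; profi infers it from flow conservation:
+; count(b2) = count(b1) - count(b3) = 100 - 10 = 90.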
+; +; +---------+ +----------+ +; | b3 [10] | <-- | b1 [100] | +; +---------+ +----------+ +; | +; | +; v +; +----------+ +; | b2 [?] | +; +----------+ + +; Function Attrs: nounwind uwtable +define dso_local i32 @test_2() #0 { +b1: + call void @llvm.pseudoprobe(i64 -6216829535442445639, i64 1, i32 0, i64 -1) + %0 = load i32, i32* @yydebug, align 4 + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %b2, label %b3 +; CHECK: edge b1 -> b2 probability is 0x73333333 / 0x80000000 = 90.00% +; CHECK: edge b1 -> b3 probability is 0x0ccccccd / 0x80000000 = 10.00% +; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 100 + +b2: + call void @llvm.pseudoprobe(i64 -6216829535442445639, i64 2, i32 0, i64 -1) + ret i32 %0 +; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 90 + +b3: + call void @llvm.pseudoprobe(i64 -6216829535442445639, i64 3, i32 0, i64 -1) + ret i32 %0 +} +; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 10 + + +; The test verifies that profi is able to infer block counts from hot subgraphs. +; +; +---------+ +---------+ +; | b4 [?] | <-- | b1 [?] | +; +---------+ +---------+ +; | | +; | | +; v v +; +---------+ +---------+ +; | b5 [89] | | b2 [?] | +; +---------+ +---------+ +; | +; | +; v +; +---------+ +; | b3 [13] | +; +---------+ + +; Function Attrs: nounwind uwtable +define dso_local i32 @test_3() #0 { +b1: + call void @llvm.pseudoprobe(i64 1649282507922421973, i64 1, i32 0, i64 -1) + %0 = load i32, i32* @yydebug, align 4 + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %b2, label %b4 +; CHECK: edge b1 -> b2 probability is 0x10505050 / 0x80000000 = 12.75% +; CHECK: edge b1 -> b4 probability is 0x6fafafb0 / 0x80000000 = 87.25% +; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 102 + +b2: + call void @llvm.pseudoprobe(i64 1649282507922421973, i64 2, i32 0, i64 -1) + br label %b3 +; CHECK: edge b2 -> b3 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 13 + +b3: + call void @llvm.pseudoprobe(i64 1649282507922421973, i64 3, i32 0, i64 -1) + ret i32 %0 +; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 13 + +b4: + call void @llvm.pseudoprobe(i64 1649282507922421973, i64 4, i32 0, i64 -1) + br label %b5 +; CHECK: edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 89 + +b5: + call void @llvm.pseudoprobe(i64 1649282507922421973, i64 5, i32 0, i64 -1) + ret i32 %0 +; CHECK2: - b5: float = {{.*}}, int = {{.*}}, count = 89 +} + + +; A larger test to verify that profile inference correctly identifies hot parts +; of the control-flow graph. +; +; +-----------+ +; | b1 [?] | +; +-----------+ +; | +; | +; v +; +--------+ +-----------+ +; | b3 [1] | <-- | b2 [5993] | +; +--------+ +-----------+ +; | | +; | | +; | v +; | +-----------+ +--------+ +; | | b4 [5992] | --> | b6 [?] | +; | +-----------+ +--------+ +; | | | +; | | | +; | v | +; | +-----------+ | +; | | b5 [5992] | | +; | +-----------+ | +; | | | +; | | | +; | v | +; | +-----------+ | +; | | b7 [?] | | +; | +-----------+ | +; | | | +; | | | +; | v | +; | +-----------+ | +; | | b8 [5992] | <-----+ +; | +-----------+ +; | | +; | | +; | v +; | +-----------+ +; +----------> | b9 [?] 
| +; +-----------+ + +; Function Attrs: nounwind uwtable +define dso_local i32 @sum_of_squares() #0 { +b1: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 1, i32 0, i64 -1) + %0 = load i32, i32* @yydebug, align 4 + %cmp = icmp ne i32 %0, 0 + br label %b2 +; CHECK: edge b1 -> b2 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 5993 + +b2: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 2, i32 0, i64 -1) + br i1 %cmp, label %b4, label %b3 +; CHECK: edge b2 -> b4 probability is 0x7ffa8844 / 0x80000000 = 99.98% +; CHECK: edge b2 -> b3 probability is 0x000577bc / 0x80000000 = 0.02% +; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 5993 + +b3: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 3, i32 0, i64 -1) + br label %b9 +; CHECK: edge b3 -> b9 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1 + +b4: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 4, i32 0, i64 -1) + br i1 %cmp, label %b5, label %b6 +; CHECK: edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK: edge b4 -> b6 probability is 0x00000000 / 0x80000000 = 0.00% +; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 5992 + +b5: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 5, i32 0, i64 -1) + br label %b7 +; CHECK: edge b5 -> b7 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b5: float = {{.*}}, int = {{.*}}, count = 5992 + +b6: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 6, i32 0, i64 -1) + br label %b8 +; CHECK: edge b6 -> b8 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b6: float = {{.*}}, int = {{.*}}, count = 0 + +b7: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 7, i32 0, i64 -1) + br label %b8 +; CHECK: edge b7 -> b8 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b7: float = {{.*}}, int = {{.*}}, count = 5992 + +b8: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 8, i32 0, i64 -1) + br label %b9 +; CHECK: edge b8 -> b9 probability is 0x80000000 / 0x80000000 = 100.00% +; CHECK2: - b8: float = {{.*}}, int = {{.*}}, count = 5992 + +b9: + call void @llvm.pseudoprobe(i64 -907520326213521421, i64 9, i32 0, i64 -1) + ret i32 %0 +} +; CHECK2: - b9: float = {{.*}}, int = {{.*}}, count = 5993 + +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1 + +attributes #0 = { noinline nounwind uwtable "use-sample-profile"} +attributes #1 = { nounwind } + +!llvm.pseudo_probe_desc = !{!6, !7, !8, !9} + +!6 = !{i64 7964825052912775246, i64 4294967295, !"test_1", null} +!7 = !{i64 -6216829535442445639, i64 37753817093, !"test_2", null} +!8 = !{i64 1649282507922421973, i64 69502983527, !"test_3", null} +!9 = !{i64 -907520326213521421, i64 175862120757, !"sum_of_squares", null}