diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h --- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -42,7 +42,9 @@ #include #include #include +#include #include +#include #include #include @@ -51,6 +53,10 @@ namespace llvm { extern llvm::cl::opt CheckBFIUnknownBlockQueries; +extern llvm::cl::opt UseIterativeBFIInference; +extern llvm::cl::opt IterativeBFIMaxIterationsPerBlock; +extern llvm::cl::opt IterativeBFIPrecision; + class BranchProbabilityInfo; class Function; class Loop; @@ -967,6 +973,45 @@ return bfi_detail::getBlockName(getBlock(Node)); } + /// The current implementation for computing relative block frequencies does + /// not handle correctly control-flow graphs containing irreducible loops. To + /// resolve the problem, we apply a post-processing step, which iteratively + /// updates block frequencies based on the frequencies of their predesessors. + /// This corresponds to finding the stationary point of the Markov chain by + /// an iterative method aka "PageRank computation". + /// The algorithm takes at most O(|E| * IterativeBFIMaxIterations) steps but + /// typically converges faster. + /// + /// Decide whether we want to apply iterative inference for a given function. + bool needIterativeInference() const; + + /// Apply an iterative post-processing to infer correct counts for irr loops. + void applyIterativeInference(); + + using ProbMatrixType = std::vector>>; + + /// Run iterative inference for a probability matrix and initial frequencies. + void iterativeInference(const ProbMatrixType &ProbMatrix, + std::vector &Freq) const; + + /// Find all blocks to apply inference on, that is, reachable from the entry + /// and backward reachable from exists along edges with positive probability. + void findReachableBlocks(std::vector &Blocks) const; + + /// Build a matrix of probabilities with transitions (edges) between the + /// blocks: ProbMatrix[I] holds pairs (J, P), where Pr[J -> I | J] = P + void initTransitionProbabilities( + const std::vector &Blocks, + const DenseMap &BlockIndex, + ProbMatrixType &ProbMatrix) const; + +#ifndef NDEBUG + /// Compute the discrepancy between current block frequencies and the + /// probability matrix. + Scaled64 discrepancy(const ProbMatrixType &ProbMatrix, + const std::vector &Freq) const; +#endif + public: BlockFrequencyInfoImpl() = default; @@ -1093,6 +1138,10 @@ computeMassInLoops(); computeMassInFunction(); unwrapLoops(); + // Apply a post-processing step improving computed frequencies for functions + // with irreducible loops. + if (needIterativeInference()) + applyIterativeInference(); finalizeMetrics(); if (CheckBFIUnknownBlockQueries) { @@ -1313,6 +1362,294 @@ llvm_unreachable("unhandled irreducible control flow"); } +template +bool BlockFrequencyInfoImpl::needIterativeInference() const { + if (!UseIterativeBFIInference) + return false; + if (!F->getFunction().hasProfileData()) + return false; + // Apply iterative inference only if the function contains irreducible loops; + // otherwise, computed block frequencies are reasonably correct. + for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) { + if (L->isIrreducible()) + return true; + } + return false; +} + +template void BlockFrequencyInfoImpl::applyIterativeInference() { + // Extract blocks for processing: a block is considered for inference iff it + // can be reached from the entry by edges with a positive probability. + // Non-processed blocks are assigned with the zero frequency and are ignored + // in the computation + std::vector ReachableBlocks; + findReachableBlocks(ReachableBlocks); + if (ReachableBlocks.empty()) + return; + + // The map is used to to index successors/predecessors of reachable blocks in + // the ReachableBlocks vector + DenseMap BlockIndex; + // Extract initial frequencies for the reachable blocks + auto Freq = std::vector(ReachableBlocks.size()); + Scaled64 SumFreq; + for (size_t I = 0; I < ReachableBlocks.size(); I++) { + const BlockT *BB = ReachableBlocks[I]; + BlockIndex[BB] = I; + Freq[I] = getFloatingBlockFreq(BB); + SumFreq += Freq[I]; + } + assert(!SumFreq.isZero() && "empty initial block frequencies"); + + LLVM_DEBUG(dbgs() << "Applying iterative inference for " << F->getName() + << " with " << ReachableBlocks.size() << " blocks\n"); + + // Normalizing frequencies so they sum up to 1.0 + for (auto &Value : Freq) { + Value /= SumFreq; + } + + // Setting up edge probabilities using sparse matrix representation: + // ProbMatrix[I] holds a vector of pairs (J, P) where Pr[J -> I | J] = P + ProbMatrixType ProbMatrix; + initTransitionProbabilities(ReachableBlocks, BlockIndex, ProbMatrix); + + // Run the propagation + iterativeInference(ProbMatrix, Freq); + + // Assign computed frequency values + for (const BlockT &BB : *F) { + auto Node = getNode(&BB); + if (!Node.isValid()) + continue; + if (BlockIndex.count(&BB)) { + Freqs[Node.Index].Scaled = Freq[BlockIndex[&BB]]; + } else { + Freqs[Node.Index].Scaled = Scaled64::getZero(); + } + } +} + +template +void BlockFrequencyInfoImpl::iterativeInference( + const ProbMatrixType &ProbMatrix, std::vector &Freq) const { + assert(0.0 < IterativeBFIPrecision && IterativeBFIPrecision < 1.0 && + "incorrectly specified precision"); + // Convert double precision to Scaled64 + const auto Precision = + Scaled64::getInverse(static_cast(1.0 / IterativeBFIPrecision)); + const size_t MaxIterations = IterativeBFIMaxIterationsPerBlock * Freq.size(); + +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << " Initial discrepancy = " + << discrepancy(ProbMatrix, Freq).toString() << "\n"); +#endif + + // Successors[I] holds unique sucessors of the I-th block + auto Successors = std::vector>(Freq.size()); + for (size_t I = 0; I < Freq.size(); I++) { + for (auto &Jump : ProbMatrix[I]) { + Successors[Jump.first].push_back(I); + } + } + + // To speedup computation, we maintain a set of "active" blocks whose + // frequencies need to be updated based on the incoming edges. + // The set is dynamic and changes after every update. Initially all blocks + // with a positive frequency are active + auto IsActive = std::vector(Freq.size(), false); + std::queue ActiveSet; + for (size_t I = 0; I < Freq.size(); I++) { + if (Freq[I] > 0) { + ActiveSet.push(I); + IsActive[I] = true; + } + } + + // Iterate over the blocks propagating frequencies + size_t It = 0; + while (It++ < MaxIterations && !ActiveSet.empty()) { + size_t I = ActiveSet.front(); + ActiveSet.pop(); + IsActive[I] = false; + + // Compute a new frequency for the block: NewFreq := Freq \times ProbMatrix. + // A special care is taken for self-edges that needs to be scaled by + // (1.0 - SelfProb), where SelfProb is the sum of probabilities on the edges + Scaled64 NewFreq; + Scaled64 OneMinusSelfProb = Scaled64::getOne(); + for (auto &Jump : ProbMatrix[I]) { + if (Jump.first == I) { + OneMinusSelfProb -= Jump.second; + } else { + NewFreq += Freq[Jump.first] * Jump.second; + } + } + if (OneMinusSelfProb != Scaled64::getOne()) + NewFreq /= OneMinusSelfProb; + + // If the block's frequency has changed enough, then + // make sure the block and its successors are in the active set + auto Change = Freq[I] >= NewFreq ? Freq[I] - NewFreq : NewFreq - Freq[I]; + if (Change > Precision) { + ActiveSet.push(I); + IsActive[I] = true; + for (size_t Succ : Successors[I]) { + if (!IsActive[Succ]) { + ActiveSet.push(Succ); + IsActive[Succ] = true; + } + } + } + + // Update the frequency for the block + Freq[I] = NewFreq; + } + + LLVM_DEBUG(dbgs() << " Completed " << It << " inference iterations" + << format(" (%0.0f per block)", double(It) / Freq.size()) + << "\n"); +#ifndef NDEBUG + LLVM_DEBUG(dbgs() << " Final discrepancy = " + << discrepancy(ProbMatrix, Freq).toString() << "\n"); +#endif +} + +template +void BlockFrequencyInfoImpl::findReachableBlocks( + std::vector &Blocks) const { + // Find all blocks to apply inference on, that is, reachable from the entry + // along edges with non-zero probablities + std::queue Queue; + std::unordered_set Reachable; + const BlockT *Entry = &F->front(); + Queue.push(Entry); + Reachable.insert(Entry); + while (!Queue.empty()) { + const BlockT *SrcBB = Queue.front(); + Queue.pop(); + for (const BlockT *DstBB : children(SrcBB)) { + auto EP = BPI->getEdgeProbability(SrcBB, DstBB); + if (EP.isZero()) + continue; + if (Reachable.find(DstBB) == Reachable.end()) { + Queue.push(DstBB); + Reachable.insert(DstBB); + } + } + } + + // Find all blocks to apply inference on, that is, backward reachable from + // the entry along (backward) edges with non-zero probablities + std::unordered_set InverseReachable; + for (const BlockT &BB : *F) { + // An exit block is a block without any successors + bool HasSucc = GraphTraits::child_begin(&BB) != + GraphTraits::child_end(&BB); + if (!HasSucc && Reachable.count(&BB)) { + Queue.push(&BB); + InverseReachable.insert(&BB); + } + } + while (!Queue.empty()) { + const BlockT *SrcBB = Queue.front(); + Queue.pop(); + for (const BlockT *DstBB : children>(SrcBB)) { + auto EP = BPI->getEdgeProbability(DstBB, SrcBB); + if (EP.isZero()) + continue; + if (InverseReachable.find(DstBB) == InverseReachable.end()) { + Queue.push(DstBB); + InverseReachable.insert(DstBB); + } + } + } + + // Collect the result + Blocks.reserve(F->size()); + for (const BlockT &BB : *F) { + if (Reachable.count(&BB) && InverseReachable.count(&BB)) { + Blocks.push_back(&BB); + } + } +} + +template +void BlockFrequencyInfoImpl::initTransitionProbabilities( + const std::vector &Blocks, + const DenseMap &BlockIndex, + ProbMatrixType &ProbMatrix) const { + const size_t NumBlocks = Blocks.size(); + auto Succs = std::vector>>(NumBlocks); + auto SumProb = std::vector(NumBlocks); + + // Find unique successors and corresponding probabilities for every block + for (size_t Src = 0; Src < NumBlocks; Src++) { + const BlockT *BB = Blocks[Src]; + std::unordered_set UniqueSuccs; + for (const auto SI : children(BB)) { + // Ignore cold blocks + if (BlockIndex.find(SI) == BlockIndex.end()) + continue; + // Ignore parallel edges between BB and SI blocks + if (UniqueSuccs.find(SI) != UniqueSuccs.end()) + continue; + UniqueSuccs.insert(SI); + // Ignore jumps with zero probability + auto EP = BPI->getEdgeProbability(BB, SI); + if (EP.isZero()) + continue; + + auto EdgeProb = + Scaled64::getFraction(EP.getNumerator(), EP.getDenominator()); + size_t Dst = BlockIndex.find(SI)->second; + Succs[Src].push_back(std::make_pair(Dst, EdgeProb)); + SumProb[Src] += EdgeProb; + } + } + + // Add transitions for every jump with positive branch probability + ProbMatrix = ProbMatrixType(NumBlocks); + for (size_t Src = 0; Src < NumBlocks; Src++) { + // Ignore blocks w/o successors + if (Succs[Src].empty()) + continue; + + assert(!SumProb[Src].isZero() && "Zero sum probability of non-exit block"); + for (auto &Jump : Succs[Src]) { + size_t Dst = Jump.first; + Scaled64 Prob = Jump.second; + ProbMatrix[Dst].push_back(std::make_pair(Src, Prob / SumProb[Src])); + } + } + + // Add transitions from sinks to the source + size_t EntryIdx = BlockIndex.find(&F->front())->second; + for (size_t Src = 0; Src < NumBlocks; Src++) { + if (Succs[Src].empty()) { + ProbMatrix[EntryIdx].push_back(std::make_pair(Src, Scaled64::getOne())); + } + } +} + +#ifndef NDEBUG +template +BlockFrequencyInfoImplBase::Scaled64 BlockFrequencyInfoImpl::discrepancy( + const ProbMatrixType &ProbMatrix, const std::vector &Freq) const { + assert(Freq[0] > 0 && "Incorrectly computed frequency of the entry block"); + Scaled64 Discrepancy; + for (size_t I = 0; I < ProbMatrix.size(); I++) { + Scaled64 Sum; + for (const auto &Jump : ProbMatrix[I]) { + Sum += Freq[Jump.first] * Jump.second; + } + Discrepancy += Freq[I] >= Sum ? Freq[I] - Sum : Sum - Freq[I]; + } + // Normalizing by the frequency of the entry block + return Discrepancy / Freq[0]; +} +#endif + /// \note This should be a lambda, but that crashes GCC 4.7. namespace bfi_detail { diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp --- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -46,6 +46,20 @@ cl::init(false), cl::Hidden, cl::desc("Check if block frequency is queried for an unknown block " "for debugging missed BFI updates")); + +cl::opt UseIterativeBFIInference( + "use-iterative-bfi-inference", cl::init(false), cl::Hidden, + cl::desc("Apply an iterative post-processing to infer correct BFI counts")); + +cl::opt IterativeBFIMaxIterationsPerBlock( + "iterative-bfi-max-iterations-per-block", cl::init(1000), cl::Hidden, + cl::desc("Iterative inference: maximum number of update iterations " + "per block")); + +cl::opt IterativeBFIPrecision( + "iterative-bfi-precision", cl::init(1e-12), cl::Hidden, + cl::desc("Iterative inference: delta convergence precision; smaller values " + "typically lead to better results at the cost of worsen runtime")); } ScaledNumber BlockMass::toScaled() const { diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-correlation-irreducible-loops.prof @@ -0,0 +1,19 @@ +yyparse_1:10822:0 + 1: 1 + 2: 1003 + 3: 1002 + 4: 1002 + 5: 1 + 6: 0 + 7: 1 + 8: 0 + 9: 1 + !CFGChecksum: 158496288380146391 + +foo1:2297361:0 + 1: 1 + 2: 86 + 3: 8212 + 4: 1 + 5: 17747 + !CFGChecksum: 404850113186107133 diff --git a/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/profile-correlation-irreducible-loops.ll @@ -0,0 +1,187 @@ +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-correlation-irreducible-loops.prof | opt -analyze -block-freq -enable-new-pm=0 -use-iterative-bfi-inference | FileCheck %s +; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/profile-correlation-irreducible-loops.prof -S | FileCheck %s --check-prefix=CHECK2 +; RUN: opt < %s -analyze -block-freq -enable-new-pm=0 -use-iterative-bfi-inference | FileCheck %s --check-prefix=CHECK3 + +; The C++ code for this test case is from c-parse.c in 403.gcc (SPEC2006) +; The problem with BFI for the test is solved by applying iterative inference. +; The corresponding CFG graph is shown below, with intended counts for every +; basic block. The hot loop, b3->b4->b2, is not getting proper (large) counts +; unless the -use-iterative-bfi-inference option is specified. +; +; +-------------------------------------------+ +; | | +; | +----------+ | +; | | b1 [1] | | +; | +----------+ | +; | | | +; | | | +; | v | +; | +----------+ | +; | +------------> | b2 [625] | -+ | +; | | +----------+ | | +; | | | | | +; | | | | | +; | | v | | +; | +----------+ +----------+ | | +; | | b4 [624] | <-- | b3 [625] | <+---------+ +; | +----------+ +----------+ | +; | | | +; +----+ | | +; | v v +; +----------+ +--------------------+ +; | b8 [1] | <-- | b7 [2] | +; +----------+ +--------------------+ +; | ^ +; | | +; v | +; +----------+ +----------+ | +; | b9 [1] | <-- | b5 [2] | | +; +----------+ +----------+ | +; | | +; | | +; v | +; +----------+ | +; | b6 [1] | -+ +; +----------+ + +@yydebug = dso_local global i32 0, align 4 + +; Function Attrs: noinline nounwind uwtable +define dso_local i32 @yyparse_1() #0 { +b1: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 1, i32 0, i64 -1) + %0 = load i32, i32* @yydebug, align 4 + %cmp = icmp ne i32 %0, 0 + br label %b2 +; CHECK: - b1: float = {{.*}}, int = {{.*}}, count = 1 + +b2: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 2, i32 0, i64 -1) + br i1 %cmp, label %b7, label %b3 +; CHECK: - b2: float = {{.*}}, int = {{.*}}, count = 625 + +b3: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 3, i32 0, i64 -1) + br i1 %cmp, label %b7, label %b4 +; CHECK: - b3: float = {{.*}}, int = {{.*}}, count = 625 +; CHECK2: br i1 %cmp, label %b7, label %b4, +; CHECK2-SAME: !prof ![[END172_PROF:[0-9]+]] + +b4: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 4, i32 0, i64 -1) + br label %b2 +; CHECK: - b4: float = {{.*}}, int = {{.*}}, count = 624 + +b5: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 5, i32 0, i64 -1) + br i1 %cmp, label %b9, label %b6 +; CHECK: - b5: float = {{.*}}, int = {{.*}}, count = 2 + +b6: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 6, i32 0, i64 -1) + br label %b7 +; CHECK: - b6: float = {{.*}}, int = {{.*}}, count = 1 + +b7: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 7, i32 0, i64 -1) + br i1 %cmp, label %b5, label %b8 +; CHECK: - b7: float = {{.*}}, int = {{.*}}, count = 2 +; CHECK2: br i1 %cmp, label %b5, label %b8, +; CHECK2-SAME: !prof ![[FALSE4858_PROF:[0-9]+]] + +b8: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 8, i32 0, i64 -1) + br label %b3 +; CHECK: - b8: float = {{.*}}, int = {{.*}}, count = 1 + +b9: + call void @llvm.pseudoprobe(i64 -7702751003264189226, i64 9, i32 0, i64 -1) + %1 = load i32, i32* @yydebug, align 4 + ret i32 %1 +; CHECK: - b9: float = {{.*}}, int = {{.*}}, count = 1 + +} + +; Another difficult (for BFI) instance with irreducible loops, +; containing 'indirectbr'. The corresponding CFG graph is shown below, with +; intended counts for every basic block. +; +; +-----------+ +; | b1 [1] | +; +-----------+ +; | +; | +; v +; +------------------------+ +; +- | b2 [86] | <+ +; | +------------------------+ | +; | | | | +; | | | | +; | v | | +; | +-----------+ | | +; | | b3 [8212] | <+-------+ | +; | +-----------+ | | | +; | | | | | +; | | | | | +; | v v | | +; | +------------------------+ | +; | | indirectgoto [17747] | -+ +; | +------------------------+ +; | | ^ | +; | | +--+ +; | v +; | +-----------+ +; +> | b4 [1] | +; +-----------+ + +; Function Attrs: nounwind uwtable +define dso_local i32 @foo1() #0 !prof !132 { +b1: + call void @llvm.pseudoprobe(i64 7682762345278052905, i64 1, i32 0, i64 -1) + %0 = load i32, i32* @yydebug, align 4 + %cmp = icmp ne i32 %0, 0 + br label %b2 +; CHECK3: - b1: float = {{.*}}, int = {{.*}}, count = 1 + +b2: + call void @llvm.pseudoprobe(i64 7682762345278052905, i64 2, i32 0, i64 -1) + %1 = load i32, i32* @yydebug, align 4 + switch i32 %1, label %b4 [ + i32 1, label %indirectgoto + i32 2, label %b3 + ], !prof !133 +; CHECK3: - b2: float = {{.*}}, int = {{.*}}, count = 86 + +b3: + call void @llvm.pseudoprobe(i64 7682762345278052905, i64 3, i32 0, i64 -1) + br label %indirectgoto +; CHECK3: - b3: float = {{.*}}, int = {{.*}}, count = 8212 + +b4: + call void @llvm.pseudoprobe(i64 7682762345278052905, i64 4, i32 0, i64 -1) + %2 = load i32, i32* @yydebug, align 4 + ret i32 %2 +; CHECK3: - b4: float = {{.*}}, int = {{.*}}, count = 1 + +indirectgoto: + %indirect.goto.dest = alloca i8, align 4 + call void @llvm.pseudoprobe(i64 7682762345278052905, i64 5, i32 0, i64 -1) + indirectbr i8* %indirect.goto.dest, [label %b2, label %indirectgoto, label %b4, label %b3], !prof !134 +; CHECK3: - indirectgoto: float = {{.*}}, int = {{.*}}, count = 17747 + +} + +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #1 + +attributes #0 = { noinline nounwind uwtable "use-sample-profile"} +attributes #1 = { nounwind } + +!llvm.pseudo_probe_desc = !{!1079, !4496} +!1079 = !{i64 -7702751003264189226, i64 158496288380146391, !"yyparse_1", null} +!4496 = !{i64 7682762345278052905, i64 404850113186107133, !"foo1", null} +!132 = !{!"function_entry_count", i64 1} +!133 = !{!"branch_weights", i32 0, i32 86, i32 0} +!134 = !{!"branch_weights", i32 85, i32 9449, i32 1, i32 8212} + +; CHECK2: ![[END172_PROF]] = !{!"branch_weights", i32 1, i32 1003} +; CHECK2: ![[FALSE4858_PROF]] = !{!"branch_weights", i32 2, i32 1}