diff --git a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp --- a/llvm/lib/Transforms/Utils/SampleProfileInference.cpp +++ b/llvm/lib/Transforms/Utils/SampleProfileInference.cpp @@ -255,7 +255,7 @@ Now = Pred; } - assert(PathCapacity > 0 && "found incorrect augmenting path"); + assert(PathCapacity > 0 && "found an incorrect augmenting path"); // Update the flow along the path Now = Target; @@ -306,7 +306,22 @@ uint64_t Target; }; -/// Post-processing adjustment of the control flow. +/// A post-processing adjustment of control flow. It applies two steps by +/// rerouting some flow and making it more realistic: +/// +/// - First, it removes all isolated components ("islands") with a positive flow +/// that are unreachable from the entry block. For every such component, we +/// find the shortest from the entry to an exit passing through the component, +/// and increase the flow by one unit along the path. +/// +/// - Second, it identifies all "dangling subgraphs" consisting of basic blocks +/// with no sampled counts. Then it rebalnces the flow that goes through such +/// a subgraph so that each branch is taken with probability 50%. +/// A dangling subgraph contains nodes such that for every two nodes u and v: +/// - u dominates v and u is not dangling; +/// - v post-dominates u; and +/// - all inner-nodes of all (u,v)-paths are dangling. +/// class FlowAdjuster { public: FlowAdjuster(FlowFunction &Func) : Func(Func) { @@ -316,14 +331,16 @@ // Run the post-processing void run() { - /// We adjust the control flow in a function so as to remove all - /// "isolated" components with positive flow that are unreachable - /// from the entry block. For every such component, we find the shortest - /// path from the entry to an exit passing through the component, and - /// increase the flow by one unit along the path. + /// Adjust the flow to get rid of isolated components. joinIsolatedComponents(); + + /// Rebalance the flow inside dangling subgraphs. + rebalanceDanglingSubgraphs(); } + /// The probability for the first successor of a dangling subgraph + static constexpr double DanglingFirstSuccProbability = 0.5; + private: void joinIsolatedComponents() { // Find blocks that are reachable from the source @@ -350,7 +367,7 @@ } } - /// Run bfs from a given block along the jumps with a positive flow and mark + /// Run BFS from a given block along the jumps with a positive flow and mark /// all reachable blocks. void findReachable(uint64_t Src, std::vector &Visited) { if (Visited[Src]) @@ -470,6 +487,164 @@ uint64_t NumBlocks() const { return Func.Blocks.size(); } + /// Rebalance dangling subgraphs so as each branch splits with probabilities + /// DanglingFirstSuccProbability and 1 - DanglingFirstSuccProbability + void rebalanceDanglingSubgraphs() { + assert(DanglingFirstSuccProbability >= 0.0 && + DanglingFirstSuccProbability <= 1.0 && + "the share of the dangling successor should be between 0 and 1"); + // Try to find dangling subgraphs from each non-dangling block + for (uint64_t I = 0; I < Func.Blocks.size(); I++) { + auto SrcBlock = &Func.Blocks[I]; + // Do not attempt to find dangling successors from a dangling or a + // zero-flow block + if (SrcBlock->Dangling || SrcBlock->Flow == 0) + continue; + + std::vector DanglingSuccs; + FlowBlock *DstBlock = nullptr; + // Find a dangling subgraphs starting at block SrcBlock + if (!findDanglingSubgraph(SrcBlock, DstBlock, DanglingSuccs)) + continue; + // At the moment, we do not rebalance subgraphs containing cycles among + // dangling blocks + if (!isAcyclicSubgraph(SrcBlock, DstBlock, DanglingSuccs)) + continue; + + // Rebalance the flow + rebalanceDanglingSubgraph(SrcBlock, DstBlock, DanglingSuccs); + } + } + + /// Find a dangling subgraph starting at block SrcBlock. + /// If the search is successful, the method sets DstBlock and DanglingSuccs. + bool findDanglingSubgraph(FlowBlock *SrcBlock, FlowBlock *&DstBlock, + std::vector &DanglingSuccs) { + // Run BFS from SrcBlock and make sure all paths are going through dangling + // blocks and end at a non-dangling DstBlock + auto Visited = std::vector(NumBlocks(), false); + std::queue Queue; + DstBlock = nullptr; + + Queue.push(SrcBlock->Index); + Visited[SrcBlock->Index] = true; + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Process blocks reachable from Block + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + if (Visited[Dst]) + continue; + Visited[Dst] = true; + if (!Func.Blocks[Dst].Dangling) { + // If we see non-unique non-dangling block reachable from SrcBlock, + // stop processing and skip rebalancing + FlowBlock *CandidateDstBlock = &Func.Blocks[Dst]; + if (DstBlock != nullptr && DstBlock != CandidateDstBlock) + return false; + DstBlock = CandidateDstBlock; + } else { + Queue.push(Dst); + DanglingSuccs.push_back(&Func.Blocks[Dst]); + } + } + } + + // If the list of dangling blocks is empty, we don't need rebalancing + if (DanglingSuccs.empty()) + return false; + // If all reachable nodes from SrcBlock are dangling, skip rebalancing + if (DstBlock == nullptr) + return false; + // If any of the dangling blocks is an exit block, skip rebalancing + for (auto Block : DanglingSuccs) { + if (Block->isExit()) + return false; + } + + return true; + } + + /// Verify if the given dangling subgraph is acyclic, and if yes, reorder + /// DanglingSuccs in the topological order (so that all jumps are "forward"). + bool isAcyclicSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector &DanglingSuccs) { + // Extract local in-degrees in the considered subgraph + auto LocalInDegree = std::vector(NumBlocks(), 0); + for (auto Jump : SrcBlock->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + for (uint64_t I = 0; I < DanglingSuccs.size(); I++) { + for (auto Jump : DanglingSuccs[I]->SuccJumps) { + LocalInDegree[Jump->Target]++; + } + } + // A loop containing SrcBlock + if (LocalInDegree[SrcBlock->Index] > 0) + return false; + + std::vector AcyclicOrder; + std::queue Queue; + Queue.push(SrcBlock->Index); + while (!Queue.empty()) { + auto &Block = Func.Blocks[Queue.front()]; + Queue.pop(); + // Stop propagation once we reach DstBlock + if (Block.Index == DstBlock->Index) + break; + + AcyclicOrder.push_back(&Block); + // Add to the queue all successors with zero local in-degree + for (auto Jump : Block.SuccJumps) { + uint64_t Dst = Jump->Target; + LocalInDegree[Dst]--; + if (LocalInDegree[Dst] == 0) { + Queue.push(Dst); + } + } + } + + // If there is a cycle in the subgraph, AcyclicOrder contains only a subset + // of all blocks + if (DanglingSuccs.size() + 1 != AcyclicOrder.size()) + return false; + DanglingSuccs = AcyclicOrder; + return true; + } + + /// Rebalance a given subgraph. + void rebalanceDanglingSubgraph(FlowBlock *SrcBlock, FlowBlock *DstBlock, + std::vector &DanglingSuccs) { + assert(SrcBlock->Flow > 0 && "zero-flow block in dangling subgraph"); + assert(DanglingSuccs.front() == SrcBlock && "incorrect order of dangles"); + + for (auto Block : DanglingSuccs) { + // Block's flow is the sum of incoming flows + uint64_t TotalFlow = 0; + if (Block == SrcBlock) { + TotalFlow = Block->Flow; + } else { + for (auto Jump : Block->PredJumps) { + TotalFlow += Jump->Flow; + } + Block->Flow = TotalFlow; + } + + // Process all successor jumps and update corresponding flow values + for (uint64_t I = 0; I < Block->SuccJumps.size(); I++) { + auto Jump = Block->SuccJumps[I]; + if (I + 1 == Block->SuccJumps.size()) { + Jump->Flow = TotalFlow; + continue; + } + uint64_t Flow = uint64_t(TotalFlow * DanglingFirstSuccProbability); + Jump->Flow = Flow; + TotalFlow -= Flow; + } + } + } + /// A constant indicating an arbitrary exit block of a function. static constexpr uint64_t AnyExitBlock = uint64_t(-1); @@ -683,7 +858,7 @@ } } - // Run bfs from the source along edges with positive flow + // Run BFS from the source along edges with positive flow std::queue Queue; auto Visited = std::vector(NumBlocks, false); Queue.push(Func.Entry); @@ -796,7 +971,7 @@ findUnlikelyJumps(BasicBlocks, Successors, Func); // Find the entry block - for (size_t I = 0; I < Func.Blocks.size(); I++) { + for (uint64_t I = 0; I < Func.Blocks.size(); I++) { if (Func.Blocks[I].isEntry()) { Func.Entry = I; break; diff --git a/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/Inputs/profile-inference-rebalance.prof @@ -0,0 +1,27 @@ +countMultipliers:37078302:0 + 2: 65536 + 3: 10 + 4: 65536 + 7: 65536 + 9: 65536 + 10: 65536 + !CFGChecksum: 223598586707 + +countMultipliers2:37078302:0 + 1: 2100 + 2: 2000 + 6: 2100 + !CFGChecksum: 2235985 + +countMultipliers3:37078302:0 + 1: 100 + 2: 100 + 3: 100 + !CFGChecksum: 22985 + +countMultipliers4:37078302:0 + 1: 100 + 2: 50 + 3: 50 + 5: 100 + !CFGChecksum: 2298578 diff --git a/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/SampleProfile/profile-inference-rebalance.ll @@ -0,0 +1,290 @@ +; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance.prof | opt -analyze -branch-prob -enable-new-pm=0 | FileCheck %s +; RUN: opt < %s -passes=pseudo-probe,sample-profile -sample-profile-use-profi -sample-profile-file=%S/Inputs/profile-inference-rebalance.prof | opt -analyze -block-freq -enable-new-pm=0 | FileCheck %s --check-prefix=CHECK2 + +; The test contains a "dimanond" and a "triangle" that needs to be rebalanced +; after basic profile inference. +; +; +----------------+ +; | b11 [?] | +; +----------------+ +; | +; v +; +----------+ +----------------+ +; | b13 [10] | <-- | b12 [65536] | +; +----------+ +----------------+ +; | +; v +; +----------+ +----------------+ +; | b16 [?] | <-- | b14 [65536] | +; +----------+ +----------------+ +; | | +; | v +; | +----------------+ +; | | b15 [?] | +; | +----------------+ +; | | +; | v +; | +----------------+ +; +------------> | b17 [65536] | -+ +; +----------------+ | +; | | +; v | +; +----------------+ | +; | b18 [?] | | +; +----------------+ | +; | | +; v | +; +----------------+ | +; | b19 [65536] | <+ +; +----------------+ +; | +; v +; +----------------+ +; | b110 [65536] | +; +----------------+ + +@yydebug = dso_local global i32 0, align 4 + +; Function Attrs: nounwind uwtable +define dso_local i32 @countMultipliers(i32 %0, i32 %1) #0 { +b11: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 1, i32 0, i64 -1) + %cmp = icmp ne i32 %0, 0 + br label %b12 +; CHECK2: - b11: float = {{.*}}, int = {{.*}}, count = 65546 + +b12: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 2, i32 0, i64 -1) + br i1 %cmp, label %b14, label %b13 +; CHECK2: - b12: float = {{.*}}, int = {{.*}}, count = 65546 + +b13: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 3, i32 0, i64 -1) + ret i32 %1 +; CHECK2: - b13: float = {{.*}}, int = {{.*}}, count = 10 + +b14: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 4, i32 0, i64 -1) + br i1 %cmp, label %b15, label %b16 +; CHECK: edge b14 -> b15 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge b14 -> b16 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK2: - b14: float = {{.*}}, int = {{.*}}, count = 65536 + +b15: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 5, i32 0, i64 -1) + br label %b17 +; CHECK2: - b15: float = {{.*}}, int = {{.*}}, count = 32768 + +b16: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 6, i32 0, i64 -1) + br label %b17 +; CHECK2: - b16: float = {{.*}}, int = {{.*}}, count = 32768 + +b17: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 7, i32 0, i64 -1) + br i1 %cmp, label %b18, label %b19 +; CHECK: edge b17 -> b18 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge b17 -> b19 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK2: - b17: float = {{.*}}, int = {{.*}}, count = 65536 + +b18: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 8, i32 0, i64 -1) + br label %b19 +; CHECK2: - b18: float = {{.*}}, int = {{.*}}, count = 32768 + +b19: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 9, i32 0, i64 -1) + br label %b110 +; CHECK2: - b19: float = {{.*}}, int = {{.*}}, count = 65536 + +b110: + call void @llvm.pseudoprobe(i64 -5758218299531803684, i64 10, i32 0, i64 -1) + ret i32 %1 +; CHECK2: - b110: float = {{.*}}, int = {{.*}}, count = 65536 +} + + +; The test contains a triangle comprised of dangling blocks. +; +; +-----------+ +; | b0 [2100] | -+ +; +-----------+ | +; | | +; | | +; v | +; +-----------+ | +; +- | b1 [2000] | | +; | +-----------+ | +; | | | +; | | | +; | v | +; +--------+ | +-----------+ | +; | b4 [?] | <-----+- | b2 [?] | | +; +--------+ | +-----------+ | +; | | | | +; | | | | +; | | v | +; | | +-----------+ | +; | +> | b3 [?] | | +; | +-----------+ | +; | | | +; | | | +; | v | +; | +-----------+ | +; +---------------> | b5 [2100] | <+ +; +-----------+ + +define dso_local i32 @countMultipliers2(i32 %0, i32 %1) #0 { +b0: + call void @llvm.pseudoprobe(i64 2506109673213838996, i64 1, i32 0, i64 -1) + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %b1, label %b5 +; CHECK: edge b0 -> b1 probability is 0x79e79e7a / 0x80000000 = 95.24% [HOT edge] +; CHECK: edge b0 -> b5 probability is 0x06186186 / 0x80000000 = 4.76% +; CHECK2: - b0: float = {{.*}}, int = {{.*}}, count = 2100 + +b1: + call void @llvm.pseudoprobe(i64 2506109673213838996, i64 2, i32 0, i64 -1) + br i1 %cmp, label %b2, label %b3 +; CHECK: edge b1 -> b2 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge b1 -> b3 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK2: - b1: float = {{.*}}, int = {{.*}}, count = 1973 + +b2: + call void @llvm.pseudoprobe(i64 2506109673213838996, i64 3, i32 0, i64 -1) + br i1 %cmp, label %b3, label %b4 +; CHECK: edge b2 -> b3 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge b2 -> b4 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK2: - b2: float = {{.*}}, int = {{.*}}, count = 955 + +b3: + call void @llvm.pseudoprobe(i64 2506109673213838996, i64 4, i32 0, i64 -1) + br label %b5 +; CHECK: edge b3 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] +; CHECK2: - b3: float = {{.*}}, int = {{.*}}, count = 1527 + +b4: + call void @llvm.pseudoprobe(i64 2506109673213838996, i64 5, i32 0, i64 -1) + br label %b5 +; CHECK: edge b4 -> b5 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] +; CHECK2: - b4: float = {{.*}}, int = {{.*}}, count = 445 + +b5: + call void @llvm.pseudoprobe(i64 2506109673213838996, i64 6, i32 0, i64 -1) + ret i32 %1 +; CHECK2: - b5: float = {{.*}}, int = {{.*}}, count = 2100 + +} + + +; The test contains a dangling subgraph that contains an exit dangling block. +; No rebalancing is necessary here. +; +; +-----------+ +; | b31 [100] | +; +-----------+ +; | +; | +; v +; +---------+ +-----------+ +; | b34 [?] | <-- | b32 [100] | +; +---------+ +-----------+ +; | +; | +; v +; +-----------+ +; | b33 [100] | +; +-----------+ + +define dso_local i32 @countMultipliers3(i32 %0, i32 %1) #0 { +b31: + call void @llvm.pseudoprobe(i64 -544905447084884130, i64 1, i32 0, i64 -1) + br label %b32 +; CHECK2: - b31: float = {{.*}}, int = {{.*}}, count = 100 + +b32: + call void @llvm.pseudoprobe(i64 -544905447084884130, i64 2, i32 0, i64 -1) + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %b34, label %b33 +; CHECK: edge b32 -> b34 probability is 0x00000000 / 0x80000000 = 0.00% +; CHECK: edge b32 -> b33 probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge] +; CHECK2: - b32: float = {{.*}}, int = {{.*}}, count = 100 + +b33: + call void @llvm.pseudoprobe(i64 -544905447084884130, i64 3, i32 0, i64 -1) + ret i32 %1 +; CHECK2: - b33: float = {{.*}}, int = {{.*}}, count = 100 + +b34: + call void @llvm.pseudoprobe(i64 -544905447084884130, i64 4, i32 0, i64 -1) + ret i32 %1 +; CHECK2: - b34: float = {{.*}}, int = {{.*}}, count = 0 + +} + +; Another dangling subgraph (b42, b43, b44) containing a single dangling block. +; +; +----------+ +-----------+ +; +- | b42 [50] | <-- | b40 [100] | +; | +----------+ +-----------+ +; | | | +; | | | +; | | v +; | | +-----------+ +; | | | b41 [50] | +; | | +-----------+ +; | | | +; | | | +; | | v +; | | +-----------+ +; | +------------> | b43 [?] | +; | +-----------+ +; | | +; | | +; | v +; | +-----------+ +; +-----------------> | b44 [100] | +; +-----------+ + +define dso_local i32 @countMultipliers4(i32 %0, i32 %1) #0 { +b40: + call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 1, i32 0, i64 -1) + %cmp = icmp ne i32 %0, 0 + br i1 %cmp, label %b41, label %b42 +; CHECK2: - b40: float = {{.*}}, int = {{.*}}, count = 100 + +b41: + call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 2, i32 0, i64 -1) + br label %b43 +; CHECK2: - b41: float = {{.*}}, int = {{.*}}, count = 50 + +b42: + call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 3, i32 0, i64 -1) + br i1 %cmp, label %b43, label %b44 +; CHECK: edge b42 -> b43 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK: edge b42 -> b44 probability is 0x40000000 / 0x80000000 = 50.00% +; CHECK2: - b42: float = {{.*}}, int = {{.*}}, count = 50 + +b43: + call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 4, i32 0, i64 -1) + br label %b44 +; CHECK2: - b43: float = {{.*}}, int = {{.*}}, count = 75 + +b44: + call void @llvm.pseudoprobe(i64 -2989539179265513123, i64 5, i32 0, i64 -1) + ret i32 %1 +; CHECK2: - b44: float = {{.*}}, int = {{.*}}, count = 100 +} + +; Function Attrs: inaccessiblememonly nounwind willreturn +declare void @llvm.pseudoprobe(i64, i64, i32, i64) #4 + +attributes #0 = { noinline nounwind uwtable "use-sample-profile" } +attributes #4 = { inaccessiblememonly nounwind willreturn } + +!llvm.pseudo_probe_desc = !{!7, !8, !9, !10} + +!7 = !{i64 -5758218299531803684, i64 223598586707, !"countMultipliers", null} +!8 = !{i64 2506109673213838996, i64 2235985, !"countMultipliers2", null} +!9 = !{i64 -544905447084884130, i64 22985, !"countMultipliers3", null} +!10 = !{i64 -2989539179265513123, i64 2298578, !"countMultipliers4", null}