Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -292,6 +292,13 @@ const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb); + BranchProbability + getLayoutSuccessorProbThreshold(MachineBasicBlock *BB, + MachineBasicBlock *Succ, + BranchProbability SuccProb, + BlockChain &SuccChain, + BlockChain &Chain, + const BlockFilterSet *BlockFilter); bool hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain, BranchProbability SuccProb, @@ -521,29 +528,180 @@ return false; } -// When profile is not present, return the StaticLikelyProb. -// When profile is available, we need to handle the triangle-shape CFG. -static BranchProbability getLayoutSuccessorProbThreshold( - MachineBasicBlock *BB) { +// Computes the branch probability threshold that is applied to edge +// BB->Succ where Succ that still has unscheduled predecessor(s). When +// the probability of BB->Succ is greater than the threshold, it is +// definitely benefifical to lay out Succ as BB's layout successor, otherwise +// it is better to connect Succ with one of its predecessors instead. The +// computed probability threshold is either forward branch probability (i.e. +// probabilty to branch out), or backward probability (i.e. the probability +// of an edge merging into the destination block) -- depending on the +// scenarios. For details see algorithm description below. +BranchProbability MachineBlockPlacement::getLayoutSuccessorProbThreshold( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BranchProbability SuccProb, + BlockChain &SuccChain, BlockChain &Chain, const BlockFilterSet *BlockFilter) { + + // No profile, just use the pre-set threshold. if (!BB->getParent()->getFunction()->getEntryCount()) return BranchProbability(StaticLikelyProb, 100); - if (BB->succ_size() == 2) { - const MachineBasicBlock *Succ1 = *BB->succ_begin(); - const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1); - if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) { - /* See case 1 below for the cost analysis. For BB->Succ to - * be taken with smaller cost, the following needs to hold: - * Prob(BB->Succ) > 2* Prob(BB->Pred) - * So the threshold T - * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1, - * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified - * branch bias, we have - * T = (2/3)*(ProfileLikelyProb/50) - * = (2*ProfileLikelyProb)/150) - */ - return BranchProbability(2 * ProfileLikelyProb, 150); + + // When Profile data is available : + + // (We don't yet handle cases with fan-out > 2) + if (BB->succ_size() <= 2) { + // + // BB BB + // | \ | \ + // | \ | \ + // | Pred0 | Pred + // | / \ | / + // | / PredK | / + // | / / \ Succ + // | | / \ + // \ | / / .... + // Succ + // + // (A) (B) + // + // (A) is a general form of triangular CFG (B). In (A), Let + // + // P1 = Prob(BB->Succ), + // P2 = Prob(BB->Pred0), + // F2 = Freq(BB->Pred0), + // F3 = Freq(PredK, Succ), and + // P3 = F3/F2, where + // PredK is the unscheduled predecessor of Succ + // with the largest edge frequency from PredK to Succ. + // + // The objective is to find the probability threshold of + // BB->Succ edge to make laying out BB->Succ profitable. + // Assuming the freuqency of BB is 1, the fall through + // benefit of picking BB->Succ is P1 (e.g, taken branches + // saved when not selected). If Pred0 is selected as + // BB's layout successor, the layout benefit will be + // P2 + P2 * P3. Note that the internal edges between Pred* + // blocks do not count because the benefit of laying them + // out sequentially will be materialized independent of the + // decision here. To compute the threshold T of P1, + // T = P2 + P2 * P3, i.e, P2 = T/(1 + P3) + // Since P1 + P2 = 1, substitue P1 with T, we get + // T = (1 + P3)/(2 + P3) + // = (F2 + F3)/(2*F2 + F3) + // Finally, adding user specified branch bias, we have + // T = (F2 + F3) * ProfileLikelyProb/((2 * F2 + F3) * 50) + // Example: + // 1. For (B), F2 == F3, and P3 is 1, so T = 67%. + // 2. For (A), When P3 is 0.5, T = 60%. + // + // The same formula also applies to the backward probability + // threshold computation when the 'heaviest' predecessor is not + // dominated by the source block BB. + // + // BB Pred2 + // / | / + // / | | BB Pred2 + // Pred1 | | | | + // \ | | \ / + // \ | / \ / + // \ | / \ / + // Succ Succ + // + // (C) (D) + // (D) is a degenerated case of (C) where Pred1 does not exist. In (D), + // Let + // F1 = Freq(BB->Succ), + // F2 = Freq(Pred2->Succ), and + // F3 = Freq(BB->Pred1) + // + // Here we assume F2 > F3. Otherwise, the cost model will be the same as + // scenarios (A)/(B) above. + // + // The savings (eliminated taken branches) of laying out Succ after BB + // is F1, and the cost (introduced taken branches) consists of two parts: + // one is the branch from source block BB to Pred1 (F3), and the other + // part is the branch from Pred2 to the destination block Succ (F3). In + // order for BB->Succ to be selected, we must have F1 > F2 + F3. So the + // backward probability threshold is (backward checking igores other + // predecessors other than Pred2): + // T = (F2 + F3)/(F2 + F3 + F2) + // = (F2 + F3)/(2*F2 + F3) + // + // QED + // + // Examples: + // 1. For (D), F3 == 0, so we have T = 50% + // 2. For (C) where F2 = 10, F3 = 20, we have T = 30/50 = 60%. It matches + // the expection: Freq(BB->Succ) needs to be at least 30 in order to be + // profitable to layout Succ after BB. + // + BlockFrequency MaxPredEdgeFreq = 0; + MachineBasicBlock *PredWithMaxFreq = nullptr; + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // This filters out the loop backedge predecessor + // (e.g, loops are laid out first. Later when + // connecting pre-header (BB) with the loop header (Succ), we need + // to skip the Pred which is already in Succ's chain. + if ( BlockToChain[Pred] == &SuccChain || + Succ == Pred || + // Skip if either already placed or filtered out due outlining + (BlockFilter && !BlockFilter->count(Pred)) || + BlockToChain[Pred] == &Chain) + continue; + + // Find one unscheduled Pred: + BlockFrequency PredEdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ); + if (PredEdgeFreq > MaxPredEdgeFreq) { + MaxPredEdgeFreq = PredEdgeFreq; + PredWithMaxFreq = Pred; + } } + + if (PredWithMaxFreq == nullptr || MaxPredEdgeFreq.getFrequency() == 0) + return BranchProbability(ProfileLikelyProb, 100); + + BlockFrequency F2, F3; + if (MDT->properlyDominates(BB, PredWithMaxFreq)) { + // Scenario (A) + F2 = MBFI->getBlockFreq(BB) * SuccProb.getCompl(); + F3 = MaxPredEdgeFreq; + if (F2 < F3) { + // handle rounding errors in block frequency propagation + F2 = F3; + } + } else { + // Scenario (C) + F2 = MaxPredEdgeFreq; + F3 = MBFI->getBlockFreq(BB) * SuccProb.getCompl(); + } + + APInt ProbBias(128, ProfileLikelyProb); + // N = (F2 + F3) * ProfileLikelyProb + APInt NV(128, F2.getFrequency()); + APInt F3V(128, F3.getFrequency()); + NV += F3V; + NV *= ProbBias; + + // D = (2 * F2 + F3) * 50 + APInt DV(128, F2.getFrequency()); + DV *= APInt(128, 2); + DV += F3V; + DV *= APInt(128, 50); + + if (DV.ult(NV)) + DV = NV; + + APInt L(128, APInt::getMaxValue(64).getLimitedValue()); + if (DV.ugt(L)) { + APInt Scale = DV.udiv(L); + DV = DV.udiv(Scale); + NV = NV.udiv(Scale); + } + + return BranchProbability::getBranchProbability(NV.getLimitedValue(), DV.getLimitedValue()); } + + // For other shapes, return the default. return BranchProbability(ProfileLikelyProb, 100); } @@ -554,11 +712,13 @@ BranchProbability SuccProb, BranchProbability RealSuccProb, BlockChain &Chain, const BlockFilterSet *BlockFilter) { - // This is no global conflict, just return false. + // There is no global conflict, just return false. if (SuccChain.UnscheduledPredecessors == 0) return false; - // There are two basic scenarios here: + // Descriptions of two basic scenarios. + // For more general cases and detailed description of the cost analysis + // algorithm, see getLayoutSuccessorProbThreshold method // ------------------------------------- // Case 1: triagular shape CFG: // BB @@ -585,6 +745,8 @@ // // When real profile data is available, we can precisely compute the the // probabililty threshold that is needed for edge BB->Succ to be considered. + // See getLayoutSuccessorProbThrehold for description on cost analysis of + // general triangular shape. // With out profile data, the heuristic requires the branch bias to be // a lot larger to make sure the signal is very strong (e.g. 80% default). // ----------------------------------------------------------------- @@ -611,7 +773,9 @@ // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without // profile data). - BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB); + BranchProbability HotProb = + getLayoutSuccessorProbThreshold(BB, Succ, RealSuccProb, SuccChain, + Chain, BlockFilter); // Forward checking. For case 2, SuccProb will be 1. if (SuccProb < HotProb) { @@ -630,7 +794,7 @@ (BlockFilter && !BlockFilter->count(Pred)) || BlockToChain[Pred] == &Chain) continue; - // Do backward checking. For case 1, it is actually redundant check. For + // Now do backward checking. For case 1, it is actually redundant check. For // case 2 above, we need a backward checking to filter out edges that are // not 'strongly' biased. With profile data available, the check is mostly // redundant too (when threshold prob is set at 50%) unless S has more than Index: test/CodeGen/X86/code_placement_pgo_cost.ll =================================================================== --- test/CodeGen/X86/code_placement_pgo_cost.ll +++ test/CodeGen/X86/code_placement_pgo_cost.ll @@ -0,0 +1,475 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s + +declare void @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() + +;; Simple double triangles +;; a +;; | \ 20 +;; | b +;; 70 |10/\ 10 +;; | / c +;; | | / 10 +;; \|/ +;; d +;; The optimal layout is with cost = 20 + 10 + 10 = 40. +;; The default layout (without profile) is with cost 80. +; CHECK-LABEL: test_double_triangles_1: +; CHECK: callq a +; CHECK: callq d +; CHECK: callq b +; CHECK: callq c + +define void @test_double_triangles_1(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %else, !prof !8 + +else: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !9 + +else2: + call void @c() + br label %merge + +merge: + call void @d() + ret void +} + +;; a +;; | \ 0 +;; | b +;; 70 | /\ +;; | / c +;; | | / +;; \|/ +;; d +;; Same as above with more extreme probability. +; CHECK-LABEL: test_double_triangles_2: +; CHECK: callq a +; CHECK: callq d +; CHECK: callq b +; CHECK: callq c +define void @test_double_triangles_2(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %else, !prof !10 + +else: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2 + +else2: + call void @c() + br label %merge + +merge: + call void @d() + ret void +} + +;; a +;; | \ 40 +;; | b +;; 70 |25/\ 15 +;; | / c +;; | | / 15 +;; \|/ +;; d +;; Similar to above -- the optimal layout is still with cost +;; 40 + 15 + 25 = 80. The default layout has cost 95. +; CHECK-LABEL: test_double_triangles_3: +; CHECK: callq a +; CHECK: callq d +; CHECK: callq b +; CHECK: callq c +define void @test_double_triangles_3(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %else, !prof !11 + +else: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !12 + +else2: + call void @c() + br label %merge + +merge: + call void @d() + ret void +} + +;; a +;; | \ 45 +;; | b +;; 60 |35/\ 10 +;; | / c +;; | | / 10 +;; \|/ +;; d +;; The optimal layout in this case is . It has cost +;; 60 + 10 + 10 = 80. The default layout has cost = 95. +;; Layout has cost = 45 + 35 + 10 = 90 +; CHECK-LABEL: test_double_triangles_4: +; CHECK: callq a +; CHECK: callq b +; CHECK: callq d +; CHECK: callq c +; +define void @test_double_triangles_4(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %else, !prof !13 + +else: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !14 + +else2: + call void @c() + br label %merge + +merge: + call void @d() + ret void +} + +;; a +;; | \ 40 +;; | b +;; 60 |10/\ 30 +;; | / c +;; | | / 30 +;; \|/ +;; d +;; Optimal layout is with cost = 60 + 10 = 70 +;; Alternate layout has cost = 40 + 10 + 30 = 80 +;; a b d c has cost = 120 +; CHECK-LABEL: test_double_triangles_5: +; CHECK: callq a +; CHECK: callq b +; CHECK: callq c +; CHECK: callq d + +define void @test_double_triangles_5(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %else, !prof !15 + +else: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !16 + +else2: + call void @c() + br label %merge + +merge: + call void @d() + ret void +} + +;; The following CFG has a shape with double triangle nested in a +;; diamond. +;; For the following CFG, the optimal layout is which has +;; cost of 80. If using the default layout strategy (without PGO), the +;; order is which has cost of 90. +;; a +;; | \ 40 +;; 60 | \ +;; | b +;; | 0/ \40 +;; | / c +;; | / / \ +;; \| /10 | 30 +;; d | +;; 70 \ / +;; \ / +;; e +; CHECK-LABEL: test_multi_merge_1: +; CHECK: callq a +; CHECK: callq d +; CHECK: callq e +; CHECK: callq b +; CHECK: callq c +define void @test_multi_merge_1(i32 %t1, i32 %t2, i32 %t3) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %then, !prof !2 + +then: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %then2, !prof !3 + +then2: + call void @c() + %cond3 = icmp eq i32 %t3, 0 + br i1 %cond3, label %merge, label %exit, !prof !4 + +merge: + call void @d() + br label %exit +exit: + call void @e() + ret void +} + +;; Similar to the above with the same optimal layout (with cost 80) +;; The default no optimal layout has cost of 100. +;; a +;; | \ 40 +;; 60 | \ +;; | b +;; | 0/ \40 +;; | / c +;; | / / \ +;; \| / 0 | 40 +;; d | +;; 60 \ / +;; \ / +;; e +; CHECK-LABEL: test_multi_merge_2: +; CHECK: callq a +; CHECK: callq d +; CHECK: callq e +; CHECK: callq b +; CHECK: callq c +define void @test_multi_merge_2(i32 %t1, i32 %t2, i32 %t3) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %then, !prof !2 + +then: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %then2, !prof !3 + +then2: + call void @c() + %cond3 = icmp eq i32 %t3, 0 + br i1 %cond3, label %merge, label %exit, !prof !5 + +merge: + call void @d() + br label %exit +exit: + call void @e() + ret void +} + +;; The CFG is the same, but the count distribution changes here. It is no +;; longer optimal to layout d after a. The optimal layout is +;; +;; a +;; | \ 40 +;; 60 | \ +;; | b +;; | 40/ \ 0 +;; | / c +;; | / / \ +;; \| / 0 | 0 +;; d | +;; 100 \ / +;; \ / +;; e +; CHECK-LABEL: test_multi_merge_3: +; CHECK: callq a +; CHECK: callq b +; CHECK: callq d +; CHECK: callq e +; CHECK: callq c +define void @test_multi_merge_3(i32 %t1, i32 %t2, i32 %t3) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %merge, label %then, !prof !2 + +then: + call void @b() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %then2, !prof !6 + +then2: + call void @c() + %cond3 = icmp eq i32 %t3, 0 + br i1 %cond3, label %merge, label %exit, !prof !7 + +merge: + call void @d() + br label %exit +exit: + call void @e() + ret void +} + +;; Single triangle inside (lower half) of a diamond +;; +;; a +;; 45 / \ 55 +;; / \ +;; b c +;; \ 40| \ 15 +;; 45 \ | d +;; \ | / 15 +;; \|/ +;; e +;; Optimal layout is with cost 45 + 15 + 40 = 100 +; CHECK-LABEL: test_triangle_in_diamond_1 +; CHECK: callq a +; CHECK: callq c +; CHECK: callq d +; CHECK: callq b +; CHECK: callq e +define void @test_triangle_in_diamond_1(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %then, label %else, !prof !17 + +then: + call void @b() + br label %merge + +else: + call void @c() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !18 + +else2: + call void @d() + br label %merge + +merge: + call void @e() + ret void +} + +;; a +;; 25 / \ 75 +;; / \ +;; b c +;; \ 40| \ 35 +;; 25 \ | d +;; \ | / 35 +;; \|/ +;; e +;; Optimal layout with cost = 25 + 25 + 40 = 90 +;; The default layout 's cost is 40 + 35 + 25 = 100 +;; +; CHECK-LABEL: test_triangle_in_diamond_2 +; CHECK: callq a +; CHECK: callq c +; CHECK: callq d +; CHECK: callq e +; CHECK: callq b +define void @test_triangle_in_diamond_2(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %then, label %else, !prof !19 + +then: + call void @b() + br label %merge + +else: + call void @c() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !20 + +else2: + call void @d() + br label %merge + +merge: + call void @e() + ret void +} + +;; a +;; 25 / \ 75 +;; / \ +;; b c +;; \ 55| \ 20 +;; 25 \ | d +;; \ | / 20 +;; \|/ +;; e +;; The optimal layout is with cost +;; 25 + 25 + 20 + 20 = 90 +;; Default layout has cost 25 + 55 + 20 = 100 + +; CHECK-LABEL: test_triangle_in_diamond_3 +; CHECK: callq a +; CHECK: callq c +; CHECK: callq e +; CHECK: callq b +; CHECK: callq d + +define void @test_triangle_in_diamond_3(i32 %t1, i32 %t2) !prof !1 { +entry: + call void @a() + %cond1 = icmp eq i32 %t1, 0 + br i1 %cond1, label %then, label %else, !prof !19 + +then: + call void @b() + br label %merge + +else: + call void @c() + %cond2 = icmp eq i32 %t2, 0 + br i1 %cond2, label %merge, label %else2, !prof !21 + +else2: + call void @d() + br label %merge + +merge: + call void @e() + ret void +} + +;; Double triangle inside (lower half) of a diamond + +!1 = !{!"function_entry_count", i32 10} +!2 = !{!"branch_weights", i32 60, i32 40} +!3 = !{!"branch_weights", i32 0, i32 40} +!4 = !{!"branch_weights", i32 10, i32 30} +!5 = !{!"branch_weights", i32 0, i32 40} +!6 = !{!"branch_weights", i32 40, i32 0} +!7 = !{!"branch_weights", i32 0, i32 0} +!8 = !{!"branch_weights", i32 70, i32 20} +!9 = !{!"branch_weights", i32 10, i32 10} +!10 = !{!"branch_weights", i32 70, i32 0} +!11 = !{!"branch_weights", i32 70, i32 40} +!12 = !{!"branch_weights", i32 25, i32 15} +!13 = !{!"branch_weights", i32 65, i32 45} +!14 = !{!"branch_weights", i32 35, i32 10} +!15 = !{!"branch_weights", i32 60, i32 40} +!16 = !{!"branch_weights", i32 10, i32 30} +!17 = !{!"branch_weights", i32 45, i32 55} +!18 = !{!"branch_weights", i32 40, i32 15} +!19 = !{!"branch_weights", i32 25, i32 75} +!20 = !{!"branch_weights", i32 40, i32 35} +!21 = !{!"branch_weights", i32 55, i32 20}