Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -292,6 +292,13 @@
const BlockFilterSet *BlockFilter,
BranchProbability SuccProb,
BranchProbability HotProb);
+ BranchProbability
+ getLayoutSuccessorProbThreshold(MachineBasicBlock *BB,
+ MachineBasicBlock *Succ,
+ BranchProbability SuccProb,
+ BlockChain &SuccChain,
+ BlockChain &Chain,
+ const BlockFilterSet *BlockFilter);
bool
hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
BlockChain &SuccChain, BranchProbability SuccProb,
@@ -521,29 +528,180 @@
return false;
}
-// When profile is not present, return the StaticLikelyProb.
-// When profile is available, we need to handle the triangle-shape CFG.
-static BranchProbability getLayoutSuccessorProbThreshold(
- MachineBasicBlock *BB) {
+// Computes the branch probability threshold that is applied to edge
+// BB->Succ where Succ that still has unscheduled predecessor(s). When
+// the probability of BB->Succ is greater than the threshold, it is
+// definitely benefifical to lay out Succ as BB's layout successor, otherwise
+// it is better to connect Succ with one of its predecessors instead. The
+// computed probability threshold is either forward branch probability (i.e.
+// probabilty to branch out), or backward probability (i.e. the probability
+// of an edge merging into the destination block) -- depending on the
+// scenarios. For details see algorithm description below.
+BranchProbability MachineBlockPlacement::getLayoutSuccessorProbThreshold(
+ MachineBasicBlock *BB, MachineBasicBlock *Succ, BranchProbability SuccProb,
+ BlockChain &SuccChain, BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+
+ // No profile, just use the pre-set threshold.
if (!BB->getParent()->getFunction()->getEntryCount())
return BranchProbability(StaticLikelyProb, 100);
- if (BB->succ_size() == 2) {
- const MachineBasicBlock *Succ1 = *BB->succ_begin();
- const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
- if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
- /* See case 1 below for the cost analysis. For BB->Succ to
- * be taken with smaller cost, the following needs to hold:
- * Prob(BB->Succ) > 2* Prob(BB->Pred)
- * So the threshold T
- * T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
- * We have T + T/2 = 1, i.e. T = 2/3. Also adding user specified
- * branch bias, we have
- * T = (2/3)*(ProfileLikelyProb/50)
- * = (2*ProfileLikelyProb)/150)
- */
- return BranchProbability(2 * ProfileLikelyProb, 150);
+
+ // When Profile data is available :
+
+ // (We don't yet handle cases with fan-out > 2)
+ if (BB->succ_size() <= 2) {
+ //
+ // BB BB
+ // | \ | \
+ // | \ | \
+ // | Pred0 | Pred
+ // | / \ | /
+ // | / PredK | /
+ // | / / \ Succ
+ // | | / \
+ // \ | / / ....
+ // Succ
+ //
+ // (A) (B)
+ //
+ // (A) is a general form of triangular CFG (B). In (A), Let
+ //
+ // P1 = Prob(BB->Succ),
+ // P2 = Prob(BB->Pred0),
+ // F2 = Freq(BB->Pred0),
+ // F3 = Freq(PredK, Succ), and
+ // P3 = F3/F2, where
+ // PredK is the unscheduled predecessor of Succ
+ // with the largest edge frequency from PredK to Succ.
+ //
+ // The objective is to find the probability threshold of
+ // BB->Succ edge to make laying out BB->Succ profitable.
+ // Assuming the freuqency of BB is 1, the fall through
+ // benefit of picking BB->Succ is P1 (e.g, taken branches
+ // saved when not selected). If Pred0 is selected as
+ // BB's layout successor, the layout benefit will be
+ // P2 + P2 * P3. Note that the internal edges between Pred*
+ // blocks do not count because the benefit of laying them
+ // out sequentially will be materialized independent of the
+ // decision here. To compute the threshold T of P1,
+ // T = P2 + P2 * P3, i.e, P2 = T/(1 + P3)
+ // Since P1 + P2 = 1, substitue P1 with T, we get
+ // T = (1 + P3)/(2 + P3)
+ // = (F2 + F3)/(2*F2 + F3)
+ // Finally, adding user specified branch bias, we have
+ // T = (F2 + F3) * ProfileLikelyProb/((2 * F2 + F3) * 50)
+ // Example:
+ // 1. For (B), F2 == F3, and P3 is 1, so T = 67%.
+ // 2. For (A), When P3 is 0.5, T = 60%.
+ //
+ // The same formula also applies to the backward probability
+ // threshold computation when the 'heaviest' predecessor is not
+ // dominated by the source block BB.
+ //
+ // BB Pred2
+ // / | /
+ // / | | BB Pred2
+ // Pred1 | | | |
+ // \ | | \ /
+ // \ | / \ /
+ // \ | / \ /
+ // Succ Succ
+ //
+ // (C) (D)
+ // (D) is a degenerated case of (C) where Pred1 does not exist. In (D),
+ // Let
+ // F1 = Freq(BB->Succ),
+ // F2 = Freq(Pred2->Succ), and
+ // F3 = Freq(BB->Pred1)
+ //
+ // Here we assume F2 > F3. Otherwise, the cost model will be the same as
+ // scenarios (A)/(B) above.
+ //
+ // The savings (eliminated taken branches) of laying out Succ after BB
+ // is F1, and the cost (introduced taken branches) consists of two parts:
+ // one is the branch from source block BB to Pred1 (F3), and the other
+ // part is the branch from Pred2 to the destination block Succ (F3). In
+ // order for BB->Succ to be selected, we must have F1 > F2 + F3. So the
+ // backward probability threshold is (backward checking igores other
+ // predecessors other than Pred2):
+ // T = (F2 + F3)/(F2 + F3 + F2)
+ // = (F2 + F3)/(2*F2 + F3)
+ //
+ // QED
+ //
+ // Examples:
+ // 1. For (D), F3 == 0, so we have T = 50%
+ // 2. For (C) where F2 = 10, F3 = 20, we have T = 30/50 = 60%. It matches
+ // the expection: Freq(BB->Succ) needs to be at least 30 in order to be
+ // profitable to layout Succ after BB.
+ //
+ BlockFrequency MaxPredEdgeFreq = 0;
+ MachineBasicBlock *PredWithMaxFreq = nullptr;
+ for (MachineBasicBlock *Pred : Succ->predecessors()) {
+ // This filters out the loop backedge predecessor
+ // (e.g, loops are laid out first. Later when
+ // connecting pre-header (BB) with the loop header (Succ), we need
+ // to skip the Pred which is already in Succ's chain.
+ if ( BlockToChain[Pred] == &SuccChain ||
+ Succ == Pred ||
+ // Skip if either already placed or filtered out due outlining
+ (BlockFilter && !BlockFilter->count(Pred)) ||
+ BlockToChain[Pred] == &Chain)
+ continue;
+
+ // Find one unscheduled Pred:
+ BlockFrequency PredEdgeFreq =
+ MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
+ if (PredEdgeFreq > MaxPredEdgeFreq) {
+ MaxPredEdgeFreq = PredEdgeFreq;
+ PredWithMaxFreq = Pred;
+ }
}
+
+ if (PredWithMaxFreq == nullptr || MaxPredEdgeFreq.getFrequency() == 0)
+ return BranchProbability(ProfileLikelyProb, 100);
+
+ BlockFrequency F2, F3;
+ if (MDT->properlyDominates(BB, PredWithMaxFreq)) {
+ // Scenario (A)
+ F2 = MBFI->getBlockFreq(BB) * SuccProb.getCompl();
+ F3 = MaxPredEdgeFreq;
+ if (F2 < F3) {
+ // handle rounding errors in block frequency propagation
+ F2 = F3;
+ }
+ } else {
+ // Scenario (C)
+ F2 = MaxPredEdgeFreq;
+ F3 = MBFI->getBlockFreq(BB) * SuccProb.getCompl();
+ }
+
+ APInt ProbBias(128, ProfileLikelyProb);
+ // N = (F2 + F3) * ProfileLikelyProb
+ APInt NV(128, F2.getFrequency());
+ APInt F3V(128, F3.getFrequency());
+ NV += F3V;
+ NV *= ProbBias;
+
+ // D = (2 * F2 + F3) * 50
+ APInt DV(128, F2.getFrequency());
+ DV *= APInt(128, 2);
+ DV += F3V;
+ DV *= APInt(128, 50);
+
+ if (DV.ult(NV))
+ DV = NV;
+
+ APInt L(128, APInt::getMaxValue(64).getLimitedValue());
+ if (DV.ugt(L)) {
+ APInt Scale = DV.udiv(L);
+ DV = DV.udiv(Scale);
+ NV = NV.udiv(Scale);
+ }
+
+ return BranchProbability::getBranchProbability(NV.getLimitedValue(), DV.getLimitedValue());
}
+
+ // For other shapes, return the default.
return BranchProbability(ProfileLikelyProb, 100);
}
@@ -554,11 +712,13 @@
BranchProbability SuccProb, BranchProbability RealSuccProb,
BlockChain &Chain, const BlockFilterSet *BlockFilter) {
- // This is no global conflict, just return false.
+ // There is no global conflict, just return false.
if (SuccChain.UnscheduledPredecessors == 0)
return false;
- // There are two basic scenarios here:
+ // Descriptions of two basic scenarios.
+ // For more general cases and detailed description of the cost analysis
+ // algorithm, see getLayoutSuccessorProbThreshold method
// -------------------------------------
// Case 1: triagular shape CFG:
// BB
@@ -585,6 +745,8 @@
//
// When real profile data is available, we can precisely compute the the
// probabililty threshold that is needed for edge BB->Succ to be considered.
+ // See getLayoutSuccessorProbThrehold for description on cost analysis of
+ // general triangular shape.
// With out profile data, the heuristic requires the branch bias to be
// a lot larger to make sure the signal is very strong (e.g. 80% default).
// -----------------------------------------------------------------
@@ -611,7 +773,9 @@
// edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
// profile data).
- BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
+ BranchProbability HotProb =
+ getLayoutSuccessorProbThreshold(BB, Succ, RealSuccProb, SuccChain,
+ Chain, BlockFilter);
// Forward checking. For case 2, SuccProb will be 1.
if (SuccProb < HotProb) {
@@ -630,7 +794,7 @@
(BlockFilter && !BlockFilter->count(Pred)) ||
BlockToChain[Pred] == &Chain)
continue;
- // Do backward checking. For case 1, it is actually redundant check. For
+ // Now do backward checking. For case 1, it is actually redundant check. For
// case 2 above, we need a backward checking to filter out edges that are
// not 'strongly' biased. With profile data available, the check is mostly
// redundant too (when threshold prob is set at 50%) unless S has more than
Index: test/CodeGen/X86/code_placement_pgo_cost.ll
===================================================================
--- test/CodeGen/X86/code_placement_pgo_cost.ll
+++ test/CodeGen/X86/code_placement_pgo_cost.ll
@@ -0,0 +1,475 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
+
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+
+;; Simple double triangles
+;; a
+;; | \ 20
+;; | b
+;; 70 |10/\ 10
+;; | / c
+;; | | / 10
+;; \|/
+;; d
+;; The optimal layout is with cost = 20 + 10 + 10 = 40.
+;; The default layout (without profile) is with cost 80.
+; CHECK-LABEL: test_double_triangles_1:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq c
+
+define void @test_double_triangles_1(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %else, !prof !8
+
+else:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !9
+
+else2:
+ call void @c()
+ br label %merge
+
+merge:
+ call void @d()
+ ret void
+}
+
+;; a
+;; | \ 0
+;; | b
+;; 70 | /\
+;; | / c
+;; | | /
+;; \|/
+;; d
+;; Same as above with more extreme probability.
+; CHECK-LABEL: test_double_triangles_2:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq c
+define void @test_double_triangles_2(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %else, !prof !10
+
+else:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2
+
+else2:
+ call void @c()
+ br label %merge
+
+merge:
+ call void @d()
+ ret void
+}
+
+;; a
+;; | \ 40
+;; | b
+;; 70 |25/\ 15
+;; | / c
+;; | | / 15
+;; \|/
+;; d
+;; Similar to above -- the optimal layout is still with cost
+;; 40 + 15 + 25 = 80. The default layout has cost 95.
+; CHECK-LABEL: test_double_triangles_3:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq c
+define void @test_double_triangles_3(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %else, !prof !11
+
+else:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !12
+
+else2:
+ call void @c()
+ br label %merge
+
+merge:
+ call void @d()
+ ret void
+}
+
+;; a
+;; | \ 45
+;; | b
+;; 60 |35/\ 10
+;; | / c
+;; | | / 10
+;; \|/
+;; d
+;; The optimal layout in this case is . It has cost
+;; 60 + 10 + 10 = 80. The default layout has cost = 95.
+;; Layout has cost = 45 + 35 + 10 = 90
+; CHECK-LABEL: test_double_triangles_4:
+; CHECK: callq a
+; CHECK: callq b
+; CHECK: callq d
+; CHECK: callq c
+;
+define void @test_double_triangles_4(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %else, !prof !13
+
+else:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !14
+
+else2:
+ call void @c()
+ br label %merge
+
+merge:
+ call void @d()
+ ret void
+}
+
+;; a
+;; | \ 40
+;; | b
+;; 60 |10/\ 30
+;; | / c
+;; | | / 30
+;; \|/
+;; d
+;; Optimal layout is with cost = 60 + 10 = 70
+;; Alternate layout has cost = 40 + 10 + 30 = 80
+;; a b d c has cost = 120
+; CHECK-LABEL: test_double_triangles_5:
+; CHECK: callq a
+; CHECK: callq b
+; CHECK: callq c
+; CHECK: callq d
+
+define void @test_double_triangles_5(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %else, !prof !15
+
+else:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !16
+
+else2:
+ call void @c()
+ br label %merge
+
+merge:
+ call void @d()
+ ret void
+}
+
+;; The following CFG has a shape with double triangle nested in a
+;; diamond.
+;; For the following CFG, the optimal layout is which has
+;; cost of 80. If using the default layout strategy (without PGO), the
+;; order is which has cost of 90.
+;; a
+;; | \ 40
+;; 60 | \
+;; | b
+;; | 0/ \40
+;; | / c
+;; | / / \
+;; \| /10 | 30
+;; d |
+;; 70 \ /
+;; \ /
+;; e
+; CHECK-LABEL: test_multi_merge_1:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq c
+define void @test_multi_merge_1(i32 %t1, i32 %t2, i32 %t3) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %then, !prof !2
+
+then:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %then2, !prof !3
+
+then2:
+ call void @c()
+ %cond3 = icmp eq i32 %t3, 0
+ br i1 %cond3, label %merge, label %exit, !prof !4
+
+merge:
+ call void @d()
+ br label %exit
+exit:
+ call void @e()
+ ret void
+}
+
+;; Similar to the above with the same optimal layout (with cost 80)
+;; The default no optimal layout has cost of 100.
+;; a
+;; | \ 40
+;; 60 | \
+;; | b
+;; | 0/ \40
+;; | / c
+;; | / / \
+;; \| / 0 | 40
+;; d |
+;; 60 \ /
+;; \ /
+;; e
+; CHECK-LABEL: test_multi_merge_2:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq c
+define void @test_multi_merge_2(i32 %t1, i32 %t2, i32 %t3) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %then, !prof !2
+
+then:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %then2, !prof !3
+
+then2:
+ call void @c()
+ %cond3 = icmp eq i32 %t3, 0
+ br i1 %cond3, label %merge, label %exit, !prof !5
+
+merge:
+ call void @d()
+ br label %exit
+exit:
+ call void @e()
+ ret void
+}
+
+;; The CFG is the same, but the count distribution changes here. It is no
+;; longer optimal to layout d after a. The optimal layout is
+;;
+;; a
+;; | \ 40
+;; 60 | \
+;; | b
+;; | 40/ \ 0
+;; | / c
+;; | / / \
+;; \| / 0 | 0
+;; d |
+;; 100 \ /
+;; \ /
+;; e
+; CHECK-LABEL: test_multi_merge_3:
+; CHECK: callq a
+; CHECK: callq b
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq c
+define void @test_multi_merge_3(i32 %t1, i32 %t2, i32 %t3) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %merge, label %then, !prof !2
+
+then:
+ call void @b()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %then2, !prof !6
+
+then2:
+ call void @c()
+ %cond3 = icmp eq i32 %t3, 0
+ br i1 %cond3, label %merge, label %exit, !prof !7
+
+merge:
+ call void @d()
+ br label %exit
+exit:
+ call void @e()
+ ret void
+}
+
+;; Single triangle inside (lower half) of a diamond
+;;
+;; a
+;; 45 / \ 55
+;; / \
+;; b c
+;; \ 40| \ 15
+;; 45 \ | d
+;; \ | / 15
+;; \|/
+;; e
+;; Optimal layout is with cost 45 + 15 + 40 = 100
+; CHECK-LABEL: test_triangle_in_diamond_1
+; CHECK: callq a
+; CHECK: callq c
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq e
+define void @test_triangle_in_diamond_1(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %then, label %else, !prof !17
+
+then:
+ call void @b()
+ br label %merge
+
+else:
+ call void @c()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !18
+
+else2:
+ call void @d()
+ br label %merge
+
+merge:
+ call void @e()
+ ret void
+}
+
+;; a
+;; 25 / \ 75
+;; / \
+;; b c
+;; \ 40| \ 35
+;; 25 \ | d
+;; \ | / 35
+;; \|/
+;; e
+;; Optimal layout with cost = 25 + 25 + 40 = 90
+;; The default layout 's cost is 40 + 35 + 25 = 100
+;;
+; CHECK-LABEL: test_triangle_in_diamond_2
+; CHECK: callq a
+; CHECK: callq c
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+define void @test_triangle_in_diamond_2(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %then, label %else, !prof !19
+
+then:
+ call void @b()
+ br label %merge
+
+else:
+ call void @c()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !20
+
+else2:
+ call void @d()
+ br label %merge
+
+merge:
+ call void @e()
+ ret void
+}
+
+;; a
+;; 25 / \ 75
+;; / \
+;; b c
+;; \ 55| \ 20
+;; 25 \ | d
+;; \ | / 20
+;; \|/
+;; e
+;; The optimal layout is with cost
+;; 25 + 25 + 20 + 20 = 90
+;; Default layout has cost 25 + 55 + 20 = 100
+
+; CHECK-LABEL: test_triangle_in_diamond_3
+; CHECK: callq a
+; CHECK: callq c
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq d
+
+define void @test_triangle_in_diamond_3(i32 %t1, i32 %t2) !prof !1 {
+entry:
+ call void @a()
+ %cond1 = icmp eq i32 %t1, 0
+ br i1 %cond1, label %then, label %else, !prof !19
+
+then:
+ call void @b()
+ br label %merge
+
+else:
+ call void @c()
+ %cond2 = icmp eq i32 %t2, 0
+ br i1 %cond2, label %merge, label %else2, !prof !21
+
+else2:
+ call void @d()
+ br label %merge
+
+merge:
+ call void @e()
+ ret void
+}
+
+;; Double triangle inside (lower half) of a diamond
+
+!1 = !{!"function_entry_count", i32 10}
+!2 = !{!"branch_weights", i32 60, i32 40}
+!3 = !{!"branch_weights", i32 0, i32 40}
+!4 = !{!"branch_weights", i32 10, i32 30}
+!5 = !{!"branch_weights", i32 0, i32 40}
+!6 = !{!"branch_weights", i32 40, i32 0}
+!7 = !{!"branch_weights", i32 0, i32 0}
+!8 = !{!"branch_weights", i32 70, i32 20}
+!9 = !{!"branch_weights", i32 10, i32 10}
+!10 = !{!"branch_weights", i32 70, i32 0}
+!11 = !{!"branch_weights", i32 70, i32 40}
+!12 = !{!"branch_weights", i32 25, i32 15}
+!13 = !{!"branch_weights", i32 65, i32 45}
+!14 = !{!"branch_weights", i32 35, i32 10}
+!15 = !{!"branch_weights", i32 60, i32 40}
+!16 = !{!"branch_weights", i32 10, i32 30}
+!17 = !{!"branch_weights", i32 45, i32 55}
+!18 = !{!"branch_weights", i32 40, i32 15}
+!19 = !{!"branch_weights", i32 25, i32 75}
+!20 = !{!"branch_weights", i32 40, i32 35}
+!21 = !{!"branch_weights", i32 55, i32 20}