Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -292,6 +292,13 @@
                                  const BlockFilterSet *BlockFilter,
                                  BranchProbability SuccProb,
                                  BranchProbability HotProb);
+  BranchProbability
+  getLayoutSuccessorProbThreshold(MachineBasicBlock *BB,
+                                  MachineBasicBlock *Succ,
+                                  BranchProbability SuccProb, 
+                                  BlockChain &SuccChain,
+                                  BlockChain &Chain,
+                                  const BlockFilterSet *BlockFilter);
   bool
   hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
                              BlockChain &SuccChain, BranchProbability SuccProb,
@@ -521,29 +528,180 @@
     return false;
 }
 
-// When profile is not present, return the StaticLikelyProb.
-// When profile is available, we need to handle the triangle-shape CFG.
-static BranchProbability getLayoutSuccessorProbThreshold(
-      MachineBasicBlock *BB) {
+// Computes the branch probability threshold that is applied to edge
+// BB->Succ where Succ that still has unscheduled predecessor(s). When
+// the probability of BB->Succ is greater than the threshold, it is
+// definitely benefifical to lay out Succ as BB's layout successor, otherwise
+// it is better to connect Succ with one of its predecessors instead. The
+// computed probability threshold is either forward branch probability (i.e.
+// probabilty to branch out), or backward probability (i.e. the probability
+// of an edge merging into the destination block) -- depending on the
+// scenarios. For details see algorithm description below.
+BranchProbability MachineBlockPlacement::getLayoutSuccessorProbThreshold(
+    MachineBasicBlock *BB, MachineBasicBlock *Succ, BranchProbability SuccProb,
+    BlockChain &SuccChain, BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+
+  // No profile, just use the pre-set threshold.
   if (!BB->getParent()->getFunction()->getEntryCount())
     return BranchProbability(StaticLikelyProb, 100);
-  if (BB->succ_size() == 2) {
-    const MachineBasicBlock *Succ1 = *BB->succ_begin();
-    const MachineBasicBlock *Succ2 = *(BB->succ_begin() + 1);
-    if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
-      /* See case 1 below for the cost analysis. For BB->Succ to
-       * be taken with smaller cost, the following needs to hold:
-       *   Prob(BB->Succ) > 2* Prob(BB->Pred)
-       *   So the threshold T
-       *   T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
-       * We have  T + T/2 = 1, i.e. T = 2/3. Also adding user specified
-       * branch bias, we have
-       *   T = (2/3)*(ProfileLikelyProb/50)
-       *     = (2*ProfileLikelyProb)/150)
-       */
-      return BranchProbability(2 * ProfileLikelyProb, 150);
+
+  // When Profile data is available :
+
+  // (We don't yet handle cases with fan-out > 2)
+  if (BB->succ_size() <= 2) {
+    //
+    //      BB                     BB
+    //     |   \                  |  \
+    //     |    \                 |   \
+    //     |    Pred0             |  Pred
+    //     |    /  \              |   /
+    //     |   /  PredK           |  /
+    //     |  /    / \            Succ
+    //     |  |   /   \
+    //      \ |  /   / ....
+    //      Succ
+    //
+    //      (A)                   (B)
+    //
+    //  (A) is a general form of triangular CFG (B). In (A), Let
+    //
+    //    P1 = Prob(BB->Succ),
+    //    P2 = Prob(BB->Pred0),
+    //    F2 = Freq(BB->Pred0),
+    //    F3 = Freq(PredK, Succ), and
+    //    P3 = F3/F2, where
+    //  PredK is the unscheduled predecessor of Succ
+    //  with the largest edge frequency from PredK to Succ.
+    //
+    //  The objective is to find the probability threshold of
+    //  BB->Succ edge to make laying out BB->Succ profitable.
+    //  Assuming the freuqency of BB is 1, the fall through
+    //  benefit of picking BB->Succ is P1 (e.g, taken branches
+    //  saved when not selected). If Pred0 is selected as
+    //  BB's layout successor, the layout benefit will be
+    //  P2 + P2 * P3.  Note that the internal edges between Pred*
+    //  blocks do not count because the benefit of laying them
+    //  out sequentially will be materialized independent of the
+    //  decision here. To compute the threshold T of P1,
+    //   T = P2 + P2 * P3, i.e,  P2 = T/(1 + P3)
+    //  Since P1 + P2 = 1, substitue P1 with T, we get
+    //       T = (1 + P3)/(2 + P3)
+    //         = (F2 + F3)/(2*F2 + F3)
+    //  Finally, adding user specified branch bias, we have
+    //   T  = (F2 + F3) * ProfileLikelyProb/((2 * F2 + F3) * 50)
+    //  Example:
+    //     1. For (B), F2 == F3, and P3 is 1, so T = 67%.
+    //     2. For (A), When P3 is 0.5, T = 60%.
+    //
+    //  The same formula also applies to the backward probability
+    //  threshold computation when the 'heaviest' predecessor is not
+    //  dominated by the source block BB.
+    //
+    //         BB  Pred2
+    //        / |  /
+    //       /  |  |            BB   Pred2
+    //    Pred1 |  |             |     |
+    //      \   |  |             \     /
+    //       \  |  /              \   /
+    //        \ | /                \ /
+    //        Succ                Succ
+    //
+    //         (C)                 (D)
+    //  (D) is a degenerated case of (C) where Pred1 does not exist. In (D),
+    //  Let
+    //      F1 = Freq(BB->Succ),
+    //      F2 = Freq(Pred2->Succ), and
+    //      F3 = Freq(BB->Pred1)
+    //
+    //  Here we assume F2 > F3. Otherwise, the cost model will be the same as
+    //  scenarios (A)/(B) above.
+    //
+    //  The savings (eliminated taken branches) of laying out Succ after BB
+    //  is F1, and the cost (introduced taken branches) consists of two parts:
+    //  one is the branch from source block BB to Pred1 (F3), and the other
+    //  part is the  branch from Pred2 to the destination block Succ (F3). In
+    //  order for BB->Succ to be selected, we must have F1 > F2 + F3. So the
+    //  backward probability threshold is (backward checking igores other
+    //  predecessors other than Pred2):
+    //     T = (F2 + F3)/(F2 + F3 + F2)
+    //       = (F2 + F3)/(2*F2 + F3)
+    //
+    //  QED
+    //
+    //  Examples:
+    //  1. For (D), F3 == 0, so we have T = 50%
+    //  2. For (C) where F2 = 10, F3 = 20, we have T = 30/50 = 60%. It matches
+    //   the expection: Freq(BB->Succ) needs to be at least 30 in order to be
+    //   profitable to layout Succ after BB.
+    //
+    BlockFrequency MaxPredEdgeFreq = 0;
+    MachineBasicBlock *PredWithMaxFreq = nullptr;
+    for (MachineBasicBlock *Pred : Succ->predecessors()) {
+      // This filters out the loop backedge predecessor
+      // (e.g, loops are laid out first. Later when
+      // connecting pre-header (BB) with the loop header (Succ), we need
+      // to skip the Pred which is already in Succ's chain.
+      if ( BlockToChain[Pred] == &SuccChain ||
+          Succ == Pred ||
+          // Skip if either already placed or filtered out due outlining
+          (BlockFilter && !BlockFilter->count(Pred)) ||
+          BlockToChain[Pred] == &Chain)
+        continue;
+
+      // Find one unscheduled Pred:
+      BlockFrequency PredEdgeFreq =
+          MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
+      if (PredEdgeFreq > MaxPredEdgeFreq) {
+        MaxPredEdgeFreq = PredEdgeFreq;
+        PredWithMaxFreq = Pred;
+      }
     }
+
+    if (PredWithMaxFreq == nullptr || MaxPredEdgeFreq.getFrequency() == 0)
+      return BranchProbability(ProfileLikelyProb, 100);
+
+    BlockFrequency F2, F3;
+    if (MDT->properlyDominates(BB, PredWithMaxFreq)) {
+      // Scenario (A)
+      F2 = MBFI->getBlockFreq(BB) * SuccProb.getCompl();
+      F3 = MaxPredEdgeFreq;
+      if (F2 < F3) {
+        // handle rounding errors in block frequency propagation
+        F2 = F3;
+      }
+    } else {
+      // Scenario (C)
+      F2 = MaxPredEdgeFreq;
+      F3 = MBFI->getBlockFreq(BB) * SuccProb.getCompl();
+    }
+
+    APInt ProbBias(128, ProfileLikelyProb);
+    // N = (F2 + F3) * ProfileLikelyProb
+    APInt NV(128, F2.getFrequency());
+    APInt F3V(128, F3.getFrequency());
+    NV += F3V;
+    NV *= ProbBias;
+
+    // D = (2 * F2 + F3) * 50
+    APInt DV(128, F2.getFrequency());
+    DV *= APInt(128, 2);
+    DV += F3V;
+    DV *= APInt(128, 50);
+
+    if (DV.ult(NV))
+      DV = NV;
+
+    APInt L(128, APInt::getMaxValue(64).getLimitedValue());
+    if (DV.ugt(L)) {
+      APInt Scale = DV.udiv(L);
+      DV = DV.udiv(Scale);
+      NV = NV.udiv(Scale);
+    }
+
+    return BranchProbability::getBranchProbability(NV.getLimitedValue(), DV.getLimitedValue());
   }
+
+  // For other shapes, return the default.
   return BranchProbability(ProfileLikelyProb, 100);
 }
 
@@ -554,11 +712,13 @@
     BranchProbability SuccProb, BranchProbability RealSuccProb,
     BlockChain &Chain, const BlockFilterSet *BlockFilter) {
 
-  // This is no global conflict, just return false.
+  // There is no global conflict, just return false.
   if (SuccChain.UnscheduledPredecessors == 0)
     return false;
 
-  // There are two basic scenarios here:
+  // Descriptions of two basic scenarios.
+  // For more general cases and detailed description of the cost analysis
+  // algorithm, see getLayoutSuccessorProbThreshold method
   // -------------------------------------
   // Case 1: triagular shape CFG:
   //     BB
@@ -585,6 +745,8 @@
   //
   // When real profile data is available, we can precisely compute the the
   // probabililty threshold that is needed for edge BB->Succ to be considered.
+  // See getLayoutSuccessorProbThrehold for description on cost analysis of
+  // general triangular shape.
   // With out profile data, the heuristic requires the branch bias to be
   // a lot larger to make sure the signal is very strong (e.g. 80% default).
   // -----------------------------------------------------------------
@@ -611,7 +773,9 @@
   // edge: Prob(Succ->BB) needs to >= HotProb in order to be selected (without
   // profile data).
 
-  BranchProbability HotProb = getLayoutSuccessorProbThreshold(BB);
+  BranchProbability HotProb =
+      getLayoutSuccessorProbThreshold(BB, Succ, RealSuccProb, SuccChain,
+                                      Chain, BlockFilter);
 
   // Forward checking. For case 2, SuccProb will be 1.
   if (SuccProb < HotProb) {
@@ -630,7 +794,7 @@
         (BlockFilter && !BlockFilter->count(Pred)) ||
         BlockToChain[Pred] == &Chain)
       continue;
-    // Do backward checking. For case 1, it is actually redundant check. For
+    // Now do backward checking. For case 1, it is actually redundant check. For
     // case 2 above, we need a backward checking to filter out edges that are
     // not 'strongly' biased. With profile data available, the check is mostly
     // redundant too (when threshold prob is set at 50%) unless S has more than
Index: test/CodeGen/X86/code_placement_pgo_cost.ll
===================================================================
--- test/CodeGen/X86/code_placement_pgo_cost.ll
+++ test/CodeGen/X86/code_placement_pgo_cost.ll
@@ -0,0 +1,475 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
+
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+
+;; Simple double triangles
+;;      a
+;;      | \ 20
+;;      |  b
+;;   70 |10/\ 10
+;;      | /  c
+;;      | | / 10
+;;       \|/
+;;        d
+;; The optimal layout is <a, d, b, c> with cost = 20 + 10 + 10 = 40.
+;; The default layout (without profile) is <a, b, c, d> with cost 80.
+; CHECK-LABEL: test_double_triangles_1:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq c
+
+define void @test_double_triangles_1(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %else, !prof !8
+
+else:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !9
+
+else2:
+  call void @c()
+  br label %merge
+
+merge:
+  call void @d()
+  ret void
+}
+
+;;      a
+;;      | \ 0
+;;      |  b
+;;   70 |  /\
+;;      | /  c
+;;      | | / 
+;;       \|/
+;;        d
+;; Same as above with more extreme probability.
+; CHECK-LABEL: test_double_triangles_2:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq c
+define void @test_double_triangles_2(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %else, !prof !10
+
+else:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2
+
+else2:
+  call void @c()
+  br label %merge
+
+merge:
+  call void @d()
+  ret void
+}
+
+;;      a
+;;      | \ 40
+;;      |  b
+;;   70 |25/\ 15
+;;      | /  c
+;;      | | / 15
+;;       \|/
+;;        d
+;; Similar to above -- the optimal layout is still <a,d,b,c> with cost
+;; 40 + 15 + 25 = 80. The default layout has cost 95.
+; CHECK-LABEL: test_double_triangles_3:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq c
+define void @test_double_triangles_3(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %else, !prof !11
+
+else:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !12
+
+else2:
+  call void @c()
+  br label %merge
+
+merge:
+  call void @d()
+  ret void
+}
+
+;;      a
+;;      | \ 45
+;;      |  b
+;;   60 |35/\ 10
+;;      | /  c
+;;      | | / 10
+;;       \|/
+;;        d
+;; The optimal layout in this case is <a,b,d,c>. It has cost
+;; 60 + 10 + 10 = 80. The default layout <a,b,c,d> has cost = 95.
+;; Layout <a,d,b,c> has cost = 45 + 35 + 10 = 90
+; CHECK-LABEL: test_double_triangles_4:
+; CHECK: callq a
+; CHECK: callq b
+; CHECK: callq d
+; CHECK: callq c
+; 
+define void @test_double_triangles_4(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %else, !prof !13
+
+else:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !14
+
+else2:
+  call void @c()
+  br label %merge
+
+merge:
+  call void @d()
+  ret void
+}
+
+;;      a
+;;      | \ 40
+;;      |  b
+;;   60 |10/\ 30
+;;      | /  c
+;;      | | / 30
+;;       \|/
+;;        d
+;; Optimal layout is <a,b,c,d> with cost = 60 + 10 = 70
+;; Alternate layout <a,d,b,c> has cost = 40 + 10 + 30 = 80
+;; a b d c has cost = 120
+; CHECK-LABEL: test_double_triangles_5:
+; CHECK: callq a
+; CHECK: callq b
+; CHECK: callq c
+; CHECK: callq d
+
+define void @test_double_triangles_5(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %else, !prof !15
+
+else:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !16
+
+else2:
+  call void @c()
+  br label %merge
+
+merge:
+  call void @d()
+  ret void
+}
+
+;; The following CFG has a shape with double triangle nested in a
+;; diamond.
+;; For the following CFG, the optimal layout is <a, d, e, b, c> which has
+;; cost of 80. If using the default layout strategy (without PGO), the
+;; order is <a, b, c, d, e> which has cost of 90.
+;;         a
+;;        |  \ 40
+;;     60 |   \
+;;        |    b
+;;        |  0/ \40
+;;        |  /  c
+;;        | /  /  \
+;;         \| /10 | 30
+;;          d     | 
+;;       70  \   /
+;;            \ /
+;;             e
+; CHECK-LABEL: test_multi_merge_1:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq c
+define void @test_multi_merge_1(i32 %t1, i32 %t2, i32 %t3) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %then, !prof !2
+
+then:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %then2, !prof !3
+
+then2:
+  call void @c()
+  %cond3 = icmp eq i32 %t3, 0
+  br i1 %cond3, label %merge, label %exit, !prof !4
+
+merge:
+  call void @d()
+  br label %exit
+exit:
+  call void @e()
+  ret void
+}
+
+;; Similar to the above with the same optimal layout (with cost 80)
+;; The default no optimal layout has cost of 100.
+;;         a
+;;        |  \ 40
+;;     60 |   \
+;;        |    b
+;;        |  0/ \40
+;;        |  /  c
+;;        | /  /  \
+;;         \| / 0 | 40
+;;          d     | 
+;;       60  \   /
+;;            \ /
+;;             e
+; CHECK-LABEL: test_multi_merge_2:
+; CHECK: callq a
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq c
+define void @test_multi_merge_2(i32 %t1, i32 %t2, i32 %t3) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %then, !prof !2
+
+then:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %then2, !prof !3
+
+then2:
+  call void @c()
+  %cond3 = icmp eq i32 %t3, 0
+  br i1 %cond3, label %merge, label %exit, !prof !5
+
+merge:
+  call void @d()
+  br label %exit
+exit:
+  call void @e()
+  ret void
+}
+
+;; The CFG is the same, but the count distribution changes here. It is no
+;; longer optimal to layout d after a. The optimal layout is 
+;;      <a, b, d, e, c>
+;;         a
+;;        |  \ 40
+;;     60 |   \
+;;        |    b
+;;        | 40/ \ 0
+;;        |  /  c
+;;        | /  /  \
+;;         \| / 0 | 0
+;;          d     | 
+;;       100 \   /
+;;            \ /
+;;             e
+; CHECK-LABEL: test_multi_merge_3:
+; CHECK: callq a
+; CHECK: callq b
+; CHECK: callq d
+; CHECK: callq e 
+; CHECK: callq c
+define void @test_multi_merge_3(i32 %t1, i32 %t2, i32 %t3) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %merge, label %then, !prof !2
+
+then:
+  call void @b()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %then2, !prof !6
+
+then2:
+  call void @c()
+  %cond3 = icmp eq i32 %t3, 0
+  br i1 %cond3, label %merge, label %exit, !prof !7
+
+merge:
+  call void @d()
+  br label %exit
+exit:
+  call void @e()
+  ret void
+}
+
+;;  Single triangle inside (lower half) of a diamond
+;;
+;;           a
+;;       45 / \ 55
+;;         /   \
+;;        b    c
+;;         \ 40| \ 15
+;;       45 \  |  d
+;;           \ | / 15
+;;            \|/
+;;             e
+;; Optimal layout is <a,c,d,b,e>  with cost 45 + 15 + 40 = 100
+; CHECK-LABEL: test_triangle_in_diamond_1
+; CHECK: callq a
+; CHECK: callq c
+; CHECK: callq d
+; CHECK: callq b
+; CHECK: callq e
+define void @test_triangle_in_diamond_1(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %then, label %else, !prof !17
+
+then:
+  call void @b()
+  br label %merge
+
+else:
+  call void @c()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !18
+
+else2:
+  call void @d()
+  br label %merge
+
+merge:
+  call void @e()
+  ret void
+}
+
+;;           a
+;;       25 / \ 75
+;;         /   \
+;;        b    c
+;;         \ 40| \ 35
+;;       25 \  |  d
+;;           \ | / 35
+;;            \|/
+;;             e
+;; Optimal layout <a,c,d,e,b> with cost = 25 + 25 + 40 = 90
+;; The default layout <a,c,d,b,e>'s cost is 40 + 35 + 25 = 100
+;;
+; CHECK-LABEL: test_triangle_in_diamond_2
+; CHECK: callq a
+; CHECK: callq c
+; CHECK: callq d
+; CHECK: callq e
+; CHECK: callq b
+define void @test_triangle_in_diamond_2(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %then, label %else, !prof !19
+
+then:
+  call void @b()
+  br label %merge
+
+else:
+  call void @c()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !20
+
+else2:
+  call void @d()
+  br label %merge
+
+merge:
+  call void @e()
+  ret void
+}
+
+;;           a
+;;       25 / \ 75
+;;         /   \
+;;        b    c
+;;         \ 55| \ 20
+;;       25 \  |  d
+;;           \ | / 20
+;;            \|/
+;;             e
+;; The optimal layout is <a, c, e, b, d> with cost
+;;   25 + 25 + 20 + 20 = 90
+;; Default layout <a,c,d,b,e> has cost 25 + 55 + 20 = 100
+
+; CHECK-LABEL: test_triangle_in_diamond_3
+; CHECK: callq a
+; CHECK: callq c
+; CHECK: callq e
+; CHECK: callq b
+; CHECK: callq d
+
+define void @test_triangle_in_diamond_3(i32 %t1, i32 %t2) !prof !1 {
+entry:
+  call void @a()
+  %cond1 = icmp eq i32 %t1, 0
+  br i1 %cond1, label %then, label %else, !prof !19
+
+then:
+  call void @b()
+  br label %merge
+
+else:
+  call void @c()
+  %cond2 = icmp eq i32 %t2, 0
+  br i1 %cond2, label %merge, label %else2, !prof !21
+
+else2:
+  call void @d()
+  br label %merge
+
+merge:
+  call void @e()
+  ret void
+}
+
+;;  Double triangle inside (lower half) of a diamond
+
+!1 = !{!"function_entry_count", i32 10}
+!2 = !{!"branch_weights", i32 60, i32 40}
+!3 = !{!"branch_weights", i32 0, i32 40}
+!4 = !{!"branch_weights", i32 10, i32 30}
+!5 = !{!"branch_weights", i32 0, i32 40}
+!6 = !{!"branch_weights", i32 40, i32 0}
+!7 = !{!"branch_weights", i32 0, i32 0}
+!8 = !{!"branch_weights", i32 70, i32 20}
+!9 = !{!"branch_weights", i32 10, i32 10}
+!10 = !{!"branch_weights", i32 70, i32 0}
+!11 = !{!"branch_weights", i32 70, i32 40}
+!12 = !{!"branch_weights", i32 25, i32 15}
+!13 = !{!"branch_weights", i32 65, i32 45}
+!14 = !{!"branch_weights", i32 35, i32 10}
+!15 = !{!"branch_weights", i32 60, i32 40}
+!16 = !{!"branch_weights", i32 10, i32 30}
+!17 = !{!"branch_weights", i32 45, i32 55}
+!18 = !{!"branch_weights", i32 40, i32 15}
+!19 = !{!"branch_weights", i32 25, i32 75}
+!20 = !{!"branch_weights", i32 40, i32 35}
+!21 = !{!"branch_weights", i32 55, i32 20}