Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -413,6 +413,8 @@
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  /// Returns true if a block should be tail-duplicated to increase fallthrough
+  /// opportunities.
   bool shouldTailDuplicate(MachineBasicBlock *BB);
   /// Check the edge frequencies to see if tail duplication will increase
   /// fallthroughs.
@@ -586,7 +588,22 @@
   return SuccProb;
 }
 
-/// Check if a block should be tail duplicated.
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool hasSameSuccessors(
+    MachineBasicBlock &BB, SmallPtrSetImpl<MachineBasicBlock *> &Successors) {
+  if (BB.succ_size() != Successors.size())
+    return false;
+  // We don't want to count self-loops
+  if (Successors.count(&BB))
+    return false;
+  for (MachineBasicBlock *Succ : BB.successors())
+    if (!Successors.count(Succ))
+      return false;
+  return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
 /// \p BB Block to check.
 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
   // Blocks with single successors don't create additional fallthrough
@@ -683,23 +700,21 @@
   //    | /       |     C' (+Succ)
   //    Succ      Succ /|
   //    / \       |  \/ |
-  //  U/   =V     =  /= =
+  //  U/   =V     |  == |
   //  /     \     | /  \|
   //  D      E    D     E
   //  '=' : Branch taken for that CFG edge
   //  Cost in the first case is: P + V
   //  For this calculation, we always assume P > Qout. If Qout > P
   //  The result of this function will be ignored at the caller.
-  //  Cost in the second case is: Qout + Qin * V + P * U + P * V
-  //  TODO(iteratee): If we lay out D after Succ, the P * U term
-  //  goes away. This logic is coming in D28522.
+  //  Cost in the second case is: Qout + Qin * U + P * V
   if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
     BranchProbability UProb = BestSuccSucc;
     BranchProbability VProb = AdjustedSuccSumProb - UProb;
     BlockFrequency V = SuccFreq * VProb;
-    BlockFrequency QinV = Qin * VProb;
+    BlockFrequency QinU = Qin * UProb;
     BlockFrequency BaseCost = P + V;
-    BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
+    BlockFrequency DupCost = Qout + QinU + P * VProb;
     return (BaseCost > DupCost);
   }
   BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
@@ -716,7 +731,7 @@
   // Succ       Succ /|            Succ        Succ /|
   // | \  V     |  \/ |            | \  V      |  \/ |
   // |U \       |U /\ |            |U =        |U /\ |
-  // =   D      = =  =|            |   D       | =  =|
+  // =   D      | =  =|            |   D       | =  =|
   // |  /       |/    D            |  /        |/    D
   // | /        |    /             | =         |    /
   // |/         |   /              |/          |   =
@@ -739,7 +754,7 @@
     // Cases 3 & 4
     return (P + V) > (Qout + Qin * UProb + P * VProb);
   // Cases 1 & 2
-  return (P + U) > (Qout + Qin * UProb + P * AdjustedSuccSumProb);
+  return (P + U) > (Qout + Qin * UProb + P * VProb);
 }
 
 
@@ -752,14 +767,54 @@
   if (!shouldTailDuplicate(Succ))
     return false;
 
+  // For CFG checking.
+  SmallPtrSet<MachineBasicBlock *, 4> Successors(BB->succ_begin(), BB->succ_end());
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     // Make sure all unplaced and unfiltered predecessors can be
     // tail-duplicated into.
     if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
         || BlockToChain[Pred] == &Chain)
       continue;
-    if (!TailDup.canTailDuplicate(Succ, Pred))
+    if (!TailDup.canTailDuplicate(Succ, Pred)) {
+      if (Successors.size() > 1
+          && hasSameSuccessors(*Pred, Successors))
+        // This looks like a tail-duplicated block. Skip it.
+        // We are attempting to identify the CFG that matches a tail-duplicated
+        // block, rather than keeping a list of blocks for 2 reasons:
+        // 1) Tail Merging during layout can cause layout to run again, and we
+        // need to try to be repeatable in that case.
+        // 2) If the user code created a lattice outside of layout, we would
+        // also like to lay it out in a chain.
+        // By checking for the CFG rather than keeping track of the blocks that
+        // received a copy, we accomplish these 2 goals in addition to laying
+        // out chains of blocks that can be tail-duplicated sequentially.
+        // For example:
+        // A            A
+        // |\           |\
+        // | \          | \
+        // |  C         |  C+BB
+        // | /          |  |
+        // |/           |  |
+        // BB    =>     BB |
+        // |\           |\/|
+        // | \          |/\|
+        // |  D         |  D
+        // | /          | /
+        // |/           |/
+        // Succ         Succ
+        //
+        // After BB was duplicated into C, the layout looks like the one on the
+        // right. BB and C now have the same successors. When considering whether
+        // Succ can be duplicated into all its unplaced predecessors, we ignore C.
+        // This allows lattices to be laid out in 2 separate chains (ABE...) and
+        // later (CD...) This is a reasonable heuristic because it allows the
+        // creation of 2 fallthrough paths with links between them.
+        // As above we want to lay out the CFG on the right the same whether it
+        // was generated by duplication during layout, or by something before
+        // layout.
+        continue;
       return false;
+    }
   }
   return true;
 }
@@ -943,11 +998,12 @@
   //  |  Pred----|                     |  S1----
   //  |  |                             |       |
   //  --(S1 or S2)                     ---Pred--
+  //                                        |
+  //                                       S2
   //
   // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
   //    + min(freq(Pred->S1), freq(Pred->S2))
   // Non-topo-order cost:
-  // In the worst case, S2 will not get laid out after Pred.
   // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
   // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
   // is 0. Then the non topo layout is better when
@@ -962,6 +1018,7 @@
   BlockFrequency CandidateEdgeFreq = MBFI->getBlockFreq(BB) * RealSuccProb;
   bool BadCFGConflict = false;
 
+  SmallPtrSet<MachineBasicBlock *, 4> Successors(BB->succ_begin(), BB->succ_end());
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
         (BlockFilter && !BlockFilter->count(Pred)) ||
@@ -971,6 +1028,17 @@
         // placed yet.
         (Pred == BB))
       continue;
+    BlockFrequency PredEdgeFreq =
+        MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
+    // Lattice checking
+    if (Successors.size() != 1 && hasSameSuccessors(*Pred, Successors)) {
+      if (PredEdgeFreq > CandidateEdgeFreq) {
+        BadCFGConflict = true;
+        break;
+      }
+      DEBUG(dbgs() << "Skipping Pred " << getBlockName(Pred) << " because of lattice.\n");
+      continue;
+    }
     // Do backward checking.
     // For all cases above, we need a backward checking to filter out edges that
     // are not 'strongly' biased.
@@ -984,8 +1052,6 @@
     //      i.e. freq((BB->Succ) * (1 - HotProb) > freq(Pred->Succ) * HotProb
     // Case 1 is covered too, because the first equation reduces to:
     // prob(BB->Succ) > HotProb. (freq(Succ) = freq(BB) for a triangle)
-    BlockFrequency PredEdgeFreq =
-        MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Succ);
     if (PredEdgeFreq * HotProb >= CandidateEdgeFreq * HotProb.getCompl()) {
       BadCFGConflict = true;
       break;
Index: test/CodeGen/AArch64/branch-relax-cbz.ll
===================================================================
--- test/CodeGen/AArch64/branch-relax-cbz.ll
+++ test/CodeGen/AArch64/branch-relax-cbz.ll
@@ -6,23 +6,18 @@
 
 ; CHECK-NEXT: ; BB#1: ; %b3
 ; CHECK: ldr [[LOAD:w[0-9]+]]
-; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
-
-; CHECK-NEXT: [[SKIP_LONG_B]]:
+; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
 
+; CHECK-NEXT: [[B8]]: ; %b8
+; CHECK-NEXT: ret
+
 ; CHECK-NEXT: [[B2]]: ; %b2
 ; CHECK: mov w{{[0-9]+}}, #93
 ; CHECK: bl _extfunc
 ; CHECK: cbz w{{[0-9]+}}, [[B7]]
+; CHECK-NEXT: b [[B8]]
 
-; CHECK-NEXT: [[B8]]: ; %b8
-; CHECK-NEXT: ret
-
-; CHECK-NEXT: [[B7]]: ; %b7
-; CHECK: mov w{{[0-9]+}}, #13
-; CHECK: b _extfunc
 define void @split_block_no_fallthrough(i64 %val) #0 {
 bb:
   %c0 = icmp sgt i64 %val, -5
Index: test/CodeGen/AArch64/optimize-cond-branch.ll
===================================================================
--- test/CodeGen/AArch64/optimize-cond-branch.ll
+++ test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -11,7 +11,8 @@
 ;
 ; CHECK-LABEL: func
 ; CHECK-NOT: and
-; CHECK: tbnz
+; Layout reverses the test.
+; CHECK: tbz
 define void @func() {
   %c0 = icmp sgt i64 0, 0
   br i1 %c0, label %b1, label %b6
Index: test/CodeGen/AMDGPU/basic-branch.ll
===================================================================
--- test/CodeGen/AMDGPU/basic-branch.ll
+++ test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,13 +8,10 @@
 ; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
 
-
-; GCN: ; BB#1
 ; GCNNOOPT: v_readlane_b32
 ; GCNNOOPT: v_readlane_b32
 ; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-relaxation.ll
+++ test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -491,8 +491,8 @@
 
 ; GCN-LABEL: {{^}}long_branch_hang:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
-; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
+; GCN-NEXT: s_cbranch_scc1 BB{{[0-9]+_[0-9]+}}
+; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 
 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
 ; GCN: s_setpc_b64
Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
 
 ; GCN-LABEL: {{^}}test_loop:
-; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}}
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll
===================================================================
--- test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -29,6 +29,7 @@
 ; GCN: v_cmp_ne_u32_e64
 
 ; GCN: BB{{[0-9]+_[0-9]+}}:
+
 define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -439,7 +439,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN-NOHSA: buffer_store_dword [[ONE]]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
+; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
 define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 bb3:                                              ; preds = %bb2
Index: test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
===================================================================
--- test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
+++ test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
@@ -12,11 +12,11 @@
 ; CHECK: bl _quux
 ; CHECK-NOT: bl _quux
 
-; NOMERGE: bl _baz
-; NOMERGE: bl _baz
+; NOMERGE-DAG: bl _baz
+; NOMERGE-DAG: bl _baz
 
-; NOMERGE: bl _quux
-; NOMERGE: bl _quux
+; NOMERGE-DAG: bl _quux
+; NOMERGE-DAG: bl _quux
 
 ; ModuleID = 'tail.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
Index: test/CodeGen/ARM/atomic-cmpxchg.ll
===================================================================
--- test/CodeGen/ARM/atomic-cmpxchg.ll
+++ test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -66,14 +66,14 @@
 ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
 ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
-; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
+; CHECK-ARMV7-NEXT: moveq r0, #1
 ; CHECK-ARMV7-NEXT: bxeq lr
 ; CHECK-ARMV7-NEXT: [[TRY]]:
-; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
-; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
+; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
+; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
 ; CHECK-ARMV7-NEXT: beq [[HEAD]]
 ; CHECK-ARMV7-NEXT: clrex
-; CHECK-ARMV7-NEXT: mov [[RES]], #0
+; CHECK-ARMV7-NEXT: mov r0, #0
 ; CHECK-ARMV7-NEXT: bx lr
 
 ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
Index: test/CodeGen/ARM/fold-stack-adjust.ll
===================================================================
--- test/CodeGen/ARM/fold-stack-adjust.ll
+++ test/CodeGen/ARM/fold-stack-adjust.ll
@@ -135,7 +135,7 @@
 
   ; Important to check for beginning of basic block, because if it gets
   ; if-converted the test is probably no longer checking what it should.
-; CHECK: {{LBB[0-9]+_2}}:
+; CHECK: %end
 ; CHECK-NEXT: vpop {d7, d8}
 ; CHECK-NEXT: pop {r4, pc}
 
Index: test/CodeGen/PowerPC/tail-dup-break-cfg.ll
===================================================================
--- test/CodeGen/PowerPC/tail-dup-break-cfg.ll
+++ test/CodeGen/PowerPC/tail-dup-break-cfg.ll
@@ -17,12 +17,12 @@
 ;CHECK-NEXT: # %test2
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
 ;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
+;CHECK: b [[EXITLABEL:[._0-9A-Za-z]+]]
 ;CHECK-NEXT: [[BODY1LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[BODY2LABEL]]
-;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK-NEXT: bne 0, [[BODY2LABEL]]
+;CHECK: [[EXITLABEL]]: # %exit
 ;CHECK: blr
 define void @tail_dup_break_cfg(i32 %tag) {
 entry:
@@ -79,7 +79,7 @@
 test2:
   %tagbit2 = and i32 %tag, 2
   %tagbit2eq0 = icmp ne i32 %tagbit2, 0
-  br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely
+  br i1 %tagbit2eq0, label %body2, label %exit, !prof !4 ; %body2 more likely
 body2:
   call void @b()
   call void @b()
@@ -138,3 +138,4 @@
 !1 = !{!"branch_weights", i32 5, i32 3}
 !2 = !{!"branch_weights", i32 95, i32 5}
 !3 = !{!"branch_weights", i32 50, i32 50}
+!4 = !{!"branch_weights", i32 8, i32 3}
Index: test/CodeGen/PowerPC/tail-dup-layout.ll
===================================================================
--- test/CodeGen/PowerPC/tail-dup-layout.ll
+++ test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,22 +1,22 @@
-; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+; RUN: llc -O2 < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; Intended layout:
-; The outlining flag produces the layout
+; The chain-based outlining produces the layout
 ; test1
 ; test2
 ; test3
 ; test4
-; exit
 ; optional1
 ; optional2
 ; optional3
 ; optional4
+; exit
 ; Tail duplication puts test n+1 at the end of optional n
 ; so optional1 includes a copy of test2 at the end, and branches
 ; to test3 (at the top) or falls through to optional 2.
-; The CHECK statements check for the whole string of tests and exit block,
+; The CHECK statements check for the whole string of tests
 ; and then check that the correct test has been duplicated into the end of
 ; the optional blocks and that the optional blocks are in the correct order.
 ;CHECK-LABEL: straight_test:
@@ -32,9 +32,9 @@
 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
 ;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
-;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
-;CHECK: blr
+;CHECK-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[OPT4LABEL:[_0-9A-Za-z]+]]:
+;CHECK: b .[[EXITLABEL]]
 ;CHECK-NEXT: .[[OPT1LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
 ;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
@@ -43,9 +43,9 @@
 ;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
 ;CHECK-NEXT: .[[OPT3LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: beq 0, .[[EXITLABEL]]
-;CHECK-NEXT: .[[OPT4LABEL]]
-;CHECK: b .[[EXITLABEL]]
+;CHECK-NEXT: bne 0, .[[OPT4LABEL]]
+;CHECK: .[[EXITLABEL]]: # %exit
+;CHECK: blr
 
 define void @straight_test(i32 %tag) {
 entry:
@@ -53,7 +53,7 @@
 test1:
   %tagbit1 = and i32 %tag, 1
   %tagbit1eq0 = icmp eq i32 %tagbit1, 0
-  br i1 %tagbit1eq0, label %test2, label %optional1
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
 optional1:
   call void @a()
   call void @a()
@@ -63,7 +63,7 @@
 test2:
   %tagbit2 = and i32 %tag, 2
   %tagbit2eq0 = icmp eq i32 %tagbit2, 0
-  br i1 %tagbit2eq0, label %test3, label %optional2
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
 optional2:
   call void @b()
   call void @b()
@@ -73,7 +73,7 @@
 test3:
   %tagbit3 = and i32 %tag, 4
   %tagbit3eq0 = icmp eq i32 %tagbit3, 0
-  br i1 %tagbit3eq0, label %test4, label %optional3
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
 optional3:
   call void @c()
   call void @c()
@@ -83,7 +83,7 @@
 test4:
   %tagbit4 = and i32 %tag, 8
   %tagbit4eq0 = icmp eq i32 %tagbit4, 0
-  br i1 %tagbit4eq0, label %exit, label %optional4
+  br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
 optional4:
   call void @d()
   call void @d()
@@ -94,6 +94,113 @@
   ret void
 }
 
+; Intended layout:
+; The chain-based outlining produces the layout
+; entry
+; --- Begin loop ---
+; for.latch
+; for.check
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; --- End loop ---
+; exit
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: loop_test:
+;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
+;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
+;CHECK: addi
+;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
+;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
+;CHECK: # %test1
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL:[._0-9A-Za-z]+]]:
+;CHECK: b .[[LATCHLABEL]]
+;CHECK: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK: b .[[OPT4LABEL]]
+define void @loop_test(i32* %tags, i32 %count) {
+entry:
+  br label %for.check
+for.check:
+  %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
+  %done.count = icmp ugt i32 %count.loop, 0
+  %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
+  %tag = load i32, i32* %tag_ptr
+  %done.tag = icmp eq i32 %tag, 0
+  %done = and i1 %done.count, %done.tag
+  br i1 %done, label %test1, label %exit, !prof !1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
+optional1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
+optional2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
+optional3:
+  call void @c()
+  call void @c()
+  call void @c()
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
+optional4:
+  call void @d()
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %for.latch
+for.latch:
+  %count.sub = sub i32 %count.loop, 1
+  br label %for.check
+exit:
+  ret void
+}
+
 ; The block then2 is not unavoidable, but since it can be tail-duplicated, it
 ; should be placed as a fallthrough from test2 and copied.
 ; CHECK-LABEL: avoidable_test:
@@ -104,8 +211,7 @@
 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
 ; CHECK: # %then2
 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
-; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
-; CHECK: # %end2
+; CHECK: # %end1
 ; CHECK: # %else1
 ; CHECK: bl a
 ; CHECK: bl a
@@ -113,6 +219,7 @@
 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
 ; CHECK: # %else2
 ; CHECK: bl c
+; CHECK: # %end2
 define void @avoidable_test(i32 %tag) {
 entry:
   br label %test1
@@ -142,9 +249,62 @@
   ret void
 }
 
+; CHECK-LABEL: lattice_test
+; CHECK: # %entry
+; CHECK: # %c
+; CHECK: # %e
+; CHECK: # %f
+; CHECK: # %b
+; CHECK: # %d
+; CHECK: # %ret
+define void @lattice_test(i32 %tag) {
+entry:
+  br label %a
+a:
+  call void @a()
+  call void @a()
+  %tagbits.a = and i32 %tag, 3
+  %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+  br i1 %tagbits.a.eq0, label %c, label %b, !prof !2 ; balanced
+c:
+  call void @c()
+  call void @c()
+  %tagbits.c = and i32 %tag, 12
+  %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+  br i1 %tagbits.c.eq0, label %e, label %d, !prof !2 ; balanced
+e:
+  call void @e()
+  call void @e()
+  %tagbits.e = and i32 %tag, 48
+  %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+  br i1 %tagbits.e.eq0, label %f, label %ret, !prof !2 ; balanced
+b:
+  call void @b()
+  call void @b()
+  %tagbits.b = and i32 %tag, 12
+  %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+  br i1 %tagbits.b.eq1, label %d, label %e, !prof !2 ; balanced
+d:
+  call void @d()
+  call void @d()
+  %tagbits.d = and i32 %tag, 48
+  %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+  br i1 %tagbits.d.eq1, label %f, label %ret, !prof !2 ; balanced
+f:
+  call void @f()
+  call void @f()
+  br label %ret
+ret:
+  ret void
+}
+
+
 declare void @a()
 declare void @b()
 declare void @c()
 declare void @d()
+declare void @e()
+declare void @f()
 
-!1 = !{!"branch_weights", i32 2, i32 1}
+!1 = !{!"branch_weights", i32 3, i32 2}
+!2 = !{!"branch_weights", i32 50, i32 50}
Index: test/CodeGen/SPARC/sjlj.ll
===================================================================
--- test/CodeGen/SPARC/sjlj.ll
+++ test/CodeGen/SPARC/sjlj.ll
@@ -67,14 +67,16 @@
 ; CHECK:  nop
 ; CHECK:.LBB1_1:                                ! %entry
 ; CHECK:  mov  %g0, %i0
+; CHECK:                                        ! %entry
 ; CHECK:  cmp %i0, 0
-; CHECK:  bne  .LBB1_4
-; CHECK:  ba   .LBB1_5
-; CHECK:.LBB1_2:                                ! Block address taken
-; CHECK:  mov  1, %i0
 ; CHECK:  be   .LBB1_5
+; CHECK:  nop
 ; CHECK:.LBB1_4:
-; CHECK:  ba   .LBB1_6
+; CHECK:.LBB1_2:                                ! Block address taken
+; CHECK:  mov  1, %i0
+; CHECK:  cmp %i0, 0
+; CHECK:  bne   .LBB1_4
+; CHECK:  nop
 }
 declare i8* @llvm.frameaddress(i32) #2
 
Index: test/CodeGen/WebAssembly/mem-intrinsics.ll
===================================================================
--- test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 
Index: test/CodeGen/X86/atom-bypass-slow-division.ll
===================================================================
--- test/CodeGen/X86/atom-bypass-slow-division.ll
+++ test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -46,9 +46,9 @@
 define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
 ; CHECK-LABEL: Test_use_div_and_idiv:
 ; CHECK: idivl
-; CHECK: divb
 ; CHECK: divl
 ; CHECK: divb
+; CHECK: divb
 ; CHECK: addl
 ; CHECK: ret
   %resultidiv = sdiv i32 %a, %b
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -314,7 +314,7 @@
 define void @unnatural_cfg1() {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -352,7 +352,11 @@
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
-; CHECK: unnatural_cfg2
+; The tail-duplication outlining algorithm places
+; %loop.body3 and %loop.inner1.begin out-of-line at the end of the loop,
+; because %loop.body4 is unnavoidable within the loop and short,
+; and %loop.inner1.begin has an alternate fallthrough of %loop.body3
+; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -559,7 +563,7 @@
 ; didn't correctly locate the fallthrough successor, assuming blindly that the
 ; first one was the fallthrough successor. As a result, we would add an
 ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
 ; CHECK: %entry
 ; CHECK-NOT: jmp
 ; CHECK: %loop
@@ -587,7 +591,7 @@
 ; fallthrough simply won't occur. Make sure we don't crash trying to update
 ; terminators for such constructs.
 ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
 ; CHECK: %entry
 ; CHECK: %cleanup
 
@@ -609,7 +613,7 @@
 ; attempt to merge onto the wrong end of the inner loop just because we find it
 ; first. This was reduced from a crasher in GCC's single source.
 ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
 ; CHECK: %loop2b
 ; CHECK: %loop1
@@ -649,7 +653,7 @@
 ; fallthrough because that happens to always produce unanalyzable branches on
 ; x86.
 ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
 ; CHECK: %entry
 ; CHECK: %loop
 ; CHECK: %exit
@@ -673,7 +677,7 @@
 ; This branch is now analyzable and hence the destination block becomes the
 ; hotter one. The right order is entry->bar->exit->foo.
 ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
 ; CHECK: %entry
 ; CHECK: %bar
 ; CHECK: %exit
@@ -699,7 +703,7 @@
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
 ; CHECK: %entry
 ; CHECK: %a
 ; CHECK: %b
@@ -729,7 +733,7 @@
 ; Ensure that we don't crash as we're building up many unanalyzable branches,
 ; blocks, and loops.
 ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
 ; CHECK: %entry
 ; CHECK: %exit
 
@@ -948,7 +952,7 @@
 ;    strange layouts that are siginificantly less efficient, often times maing
 ;    it discontiguous.
 ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
 ; CHECK: %entry
 ; First rotated loop top.
 ; CHECK: .p2align
Index: test/CodeGen/X86/sse1.ll
===================================================================
--- test/CodeGen/X86/sse1.ll
+++ test/CodeGen/X86/sse1.ll
@@ -59,24 +59,22 @@
 ; X32-NEXT:  # BB#2: # %entry
 ; X32-NEXT:    xorps %xmm1, %xmm1
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    jne .LBB1_5
-; X32-NEXT:    jmp .LBB1_4
-; X32-NEXT:  .LBB1_1:
-; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_4
 ; X32-NEXT:  .LBB1_5: # %entry
 ; X32-NEXT:    xorps %xmm2, %xmm2
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    jne .LBB1_8
-; X32-NEXT:    jmp .LBB1_7
-; X32-NEXT:  .LBB1_4:
-; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_7
 ; X32-NEXT:  .LBB1_8: # %entry
 ; X32-NEXT:    xorps %xmm3, %xmm3
 ; X32-NEXT:    jmp .LBB1_9
+; X32-NEXT:  .LBB1_1:
+; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_5
+; X32-NEXT:  .LBB1_4:
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_8
 ; X32-NEXT:  .LBB1_7:
 ; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; X32-NEXT:  .LBB1_9: # %entry
@@ -98,24 +96,22 @@
 ; X64-NEXT:  # BB#2: # %entry
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    testl %edx, %edx
-; X64-NEXT:    jne .LBB1_5
-; X64-NEXT:    jmp .LBB1_4
-; X64-NEXT:  .LBB1_1:
-; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    je .LBB1_4
 ; X64-NEXT:  .LBB1_5: # %entry
 ; X64-NEXT:    xorps %xmm2, %xmm2
 ; X64-NEXT:    testl %r8d, %r8d
-; X64-NEXT:    jne .LBB1_8
-; X64-NEXT:    jmp .LBB1_7
-; X64-NEXT:  .LBB1_4:
-; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:    testl %r8d, %r8d
 ; X64-NEXT:    je .LBB1_7
 ; X64-NEXT:  .LBB1_8: # %entry
 ; X64-NEXT:    xorps %xmm3, %xmm3
 ; X64-NEXT:    jmp .LBB1_9
+; X64-NEXT:  .LBB1_1:
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    testl %edx, %edx
+; X64-NEXT:    jne .LBB1_5
+; X64-NEXT:  .LBB1_4:
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    testl %r8d, %r8d
+; X64-NEXT:    jne .LBB1_8
 ; X64-NEXT:  .LBB1_7:
 ; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; X64-NEXT:  .LBB1_9: # %entry
Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll
===================================================================
--- test/CodeGen/X86/tail-dup-merge-loop-headers.ll
+++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -6,13 +6,13 @@
 ; CHECK-LABEL: tail_dup_merge_loops
 ; CHECK: # %entry
 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
 ; CHECK: # %inner_loop_exit
 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
 ; CHECK: # %inner_loop_latch
 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
 ; CHECK: # %inner_loop_test
-; CHECK-NOT: # %{{[a-zA-Z_]+}}
-; CHECK: # %exit
 define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
 entry:
   %notlhs674.i = icmp eq i32 %a, 0
Index: test/CodeGen/X86/tail-dup-repeat.ll
===================================================================
--- test/CodeGen/X86/tail-dup-repeat.ll
+++ test/CodeGen/X86/tail-dup-repeat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
Index: test/CodeGen/X86/tail-opts.ll
===================================================================
--- test/CodeGen/X86/tail-opts.ll
+++ test/CodeGen/X86/tail-opts.ll
@@ -112,14 +112,13 @@
 ; CHECK:        ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_3
 ; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
-; CHECK-NEXT:   ja .LBB2_4
-; CHECK-NEXT:   jmp .LBB2_2
-; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_2
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT:   xorl %eax, %eax
 ; CHECK-NEXT:   ret
+; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
+; CHECK-NEXT:   ja .LBB2_4
 ; CHECK-NEXT: .LBB2_2:
 ; CHECK-NEXT:   movb $1, %al
 ; CHECK-NEXT:   ret
Index: test/CodeGen/X86/twoaddr-coalesce-3.ll
===================================================================
--- test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@
 
 ; Check that only one mov will be generated in the kernel loop.
 ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
 ; CHECK-NOT: mov
@@ -56,7 +56,7 @@
 
 ; Check that only two mov will be generated in the kernel loop.
 ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
 ; CHECK-NOT: mov
Index: test/CodeGen/X86/win-alloca-expander.ll
===================================================================
--- test/CodeGen/X86/win-alloca-expander.ll
+++ test/CodeGen/X86/win-alloca-expander.ll
@@ -115,27 +115,28 @@
 ; Test that the blocks are analyzed in the correct order.
 ; CHECK-LABEL: cfg:
 entry:
-  br i1 %x, label %bb1, label %bb2
+  br i1 %x, label %bb1, label %bb3
 
 bb1:
   %p1 = alloca %struct.S
 ; CHECK: pushl %eax
 ; CHECK: subl $1020, %esp
-  br label %bb3
+  br label %bb4
+
 bb2:
+  %p4 = alloca %struct.S
+; CHECK: subl $1024, %esp
+  call void @f(%struct.S* %p4)
+  ret void
+
+bb3:
   %p2 = alloca %struct.T
 ; CHECK: pushl %eax
 ; CHECK: subl $2996, %esp
-  br label %bb3
-
-bb3:
-  br i1 %y, label %bb4, label %bb5
+  br label %bb4
 
 bb4:
-  %p4 = alloca %struct.S
-; CHECK: subl $1024, %esp
-  call void @f(%struct.S* %p4)
-  ret void
+  br i1 %y, label %bb2, label %bb5
 
 bb5:
   %p5 = alloca %struct.T