Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -407,11 +407,15 @@
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  /// Returns true if a block should be tail-duplicated to increase fallthrough
+  /// opportunities.
   bool shouldTailDuplicate(MachineBasicBlock *BB);
   /// Check the edge frequencies to see if tail duplication will increase
   /// fallthroughs.
   bool probabilityJustifiesTailDuplicate(
       MachineBasicBlock *BB, MachineBasicBlock *Succ);
+  /// Returns true if \p Succ can be tail-duplicated into all of its unplaced
+  /// predecessors. Predecessors not in \p BlockFilter are ignored.
   bool canTailDuplicateUnplacedPreds(
       MachineBasicBlock *BB, MachineBasicBlock *Succ,
       BlockChain &Chain, const BlockFilterSet *BlockFilter);
@@ -576,7 +580,22 @@
   return SuccProb;
 }
 
-/// Check if a block should be tail duplicated.
+/// True iff \p BB's successors are exactly \p Successors and exclude \p BB.
+static bool hasSameSuccessors(
+    MachineBasicBlock &BB, SmallPtrSetImpl<MachineBasicBlock *> &Successors) {
+  if (BB.succ_size() != Successors.size())
+    return false;
+  // Reject the case where BB itself is in Successors (a self-loop edge).
+  if (Successors.count(&BB))
+    return false;
+  for (MachineBasicBlock *Succ : BB.successors())
+    if (!Successors.count(Succ))
+      return false;
+  return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
 /// \p BB Block to check.
 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
   // Blocks with single successors don't create additional fallthrough
@@ -631,22 +650,23 @@
   //    | /       |     |
   //    Succ      Succ /|
   //    / \       |  \/ |
-  //  U/   =V     =  /= =
+  //  U/   =V     |  == |
   //  /     \     | /  \|
   //  D      E    D     E
   //  Cost in the first case is: P + V
-  //  Cost in the second case is: Q + QV + PU + PV
+  //  Cost in the second case is: Q + QU + PV
   if (Dom == nullptr || !Succ->isSuccessor(Dom)) {
     BranchProbability P = (MBPI->getEdgeProbability(BB, Succ));
     BranchProbability Q = P.getCompl();
     BranchProbability U = BestSuccSucc;
     BranchProbability V = U.getCompl();
-    BranchProbability QV = Q * V;
+    BranchProbability PV = P * V;
+    BranchProbability QU = Q * U;
     uint64_t BaseCost = static_cast<uint64_t>(P.getNumerator()) +
         static_cast<uint64_t>(V.getNumerator());
     uint64_t DupCost = static_cast<uint64_t>(Q.getNumerator()) +
-        static_cast<uint64_t>(QV.getNumerator()) +
-        static_cast<uint64_t>(P.getNumerator());
+        static_cast<uint64_t>(QU.getNumerator()) +
+        static_cast<uint64_t>(PV.getNumerator());
     return (BaseCost > DupCost);
   }
   BranchProbability U = MBPI->getEdgeProbability(Succ, Dom);
@@ -690,14 +710,54 @@
   if (!shouldTailDuplicate(Succ))
     return false;
 
+  // For CFG checking.
+  SmallPtrSet<MachineBasicBlock *, 4> Successors(BB->succ_begin(), BB->succ_end());
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     // Make sure all unplaced and unfiltered predecessors can be
     // tail-duplicated into.
     if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
         || BlockToChain[Pred] == &Chain)
       continue;
-    if (!TailDup.canTailDuplicate(Succ, Pred))
+    if (!TailDup.canTailDuplicate(Succ, Pred)) {
+      if (Successors.size() > 1
+          && hasSameSuccessors(*Pred, Successors))
+        // This looks like a tail-duplicated block. Skip it.
+        // We are attempting to identify the CFG that matches a tail-duplicated
+        // block, rather than keeping a list of blocks for 2 reasons:
+        // 1) Tail Merging during layout can cause layout to run again, and we
+        // need to try to be repeatable in that case.
+        // 2) If the user code created a lattice outside of layout, we would
+        // also like to lay it out in a chain.
+        // By checking for the CFG rather than keeping track of the blocks that
+        // received a copy, we accomplish these 2 goals in addition to laying
+        // out chains of blocks that can be tail-duplicated sequentially.
+        // For example:
+        // A            A
+        // |\           |\
+        // | \          | \
+        // |  C         |  C+BB
+        // | /          |  |
+        // |/           |  |
+        // BB    =>     BB |
+        // |\           |\/|
+        // | \          |/\|
+        // |  D         |  D
+        // | /          | /
+        // |/           |/
+        // Succ         Succ
+        //
+        // After BB was duplicated into C, the layout looks like the one on the
+        // right. BB and C now have the same successors. When considering whether
+        // Succ can be duplicated into all its unplaced predecessors, we ignore C.
+        // This allows lattices to be laid out in 2 separate chains (A, BB,
+        // Succ, ...) and later (C, D, ...). This is a reasonable heuristic
+        // because it creates 2 fallthrough paths with links between them.
+        // As above we want to lay out the CFG on the right the same whether it
+        // was generated by duplication during layout, or by something before
+        // layout.
+        continue;
       return false;
+    }
   }
   return true;
 }
Index: test/CodeGen/AArch64/addsub.ll
===================================================================
--- test/CodeGen/AArch64/addsub.ll
+++ test/CodeGen/AArch64/addsub.ll
@@ -140,12 +140,17 @@
 
 test5:
 ; CHECK: cmn {{w[0-9]+}}, #444
-; CHECK: b.gt [[RET]]
+; CHECK: b.le [[TEST6:.?LBB[0-9]+_[0-9]+]]
   %newval5 = add i32 %val, 4
   store i32 %newval5, i32* @var_i32
   %cmp_neg_uge = icmp sgt i32 %val2, -444
   br i1 %cmp_neg_uge, label %ret, label %test6
 
+; CHECK: {{^}}[[RET]]:
+; CHECK: ret
+; CHECK: {{^}}[[TEST6]]:
+; CHECK: ret
+
 test6:
   %newval6 = add i32 %val, 5
   store i32 %newval6, i32* @var_i32
Index: test/CodeGen/AArch64/arm64-atomic.ll
===================================================================
--- test/CodeGen/AArch64/arm64-atomic.ll
+++ test/CodeGen/AArch64/arm64-atomic.ll
@@ -9,10 +9,10 @@
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -27,10 +27,12 @@
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: mov    x0, x[[ADDR]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: mov    x0, x[[ADDR]]
+; CHECK-NEXT: ret
   %new = load i32, i32* %pnew
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
@@ -41,15 +43,15 @@
 ; CHECK-LABEL: val_compare_and_swap_rel:
 ; CHECK-NEXT: mov    x[[ADDR:[0-9]+]], x0
 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
-; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]
+; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp    [[RESULT]], w1
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
-; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
+; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -64,10 +66,10 @@
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
Index: test/CodeGen/AArch64/arm64-ccmp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ccmp.ll
+++ test/CodeGen/AArch64/arm64-ccmp.ll
@@ -108,10 +108,10 @@
 ; CHECK: cmp w0, #1
 ; CHECK: sdiv [[DIVRES:w[0-9]+]], w1, w0
 ; CHECK: ccmp [[DIVRES]], #16, #0, ge
-; CHECK: b.gt [[BLOCK:LBB[0-9_]+]]
-; CHECK: bl _foo
-; CHECK: [[BLOCK]]:
+; CHECK: b.le [[BLOCK:LBB[0-9_]+]]
 ; CHECK: orr w0, wzr, #0x7
+; CHECK: [[BLOCK]]:
+; CHECK: bl _foo
 define i32 @speculate_division(i32 %a, i32 %b) nounwind ssp {
 entry:
   %cmp = icmp sgt i32 %a, 0
@@ -135,7 +135,7 @@
 ; CHECK: cmp
 ; CHECK-NOT: b.
 ; CHECK: fccmp {{.*}}, #8, ge
-; CHECK: b.lt
+; CHECK: b.ge
 define i32 @single_fcmp(i32 %a, float %b) nounwind ssp {
 entry:
   %cmp = icmp sgt i32 %a, 0
Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -346,19 +346,15 @@
 ; CHECK-NEXT: sub w1, w1, #1
 ; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
 ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
-; DISABLE-NEXT: b [[IFEND_LABEL]]
-;
-; DISABLE: [[ELSE_LABEL]]: ; %if.else
-; DISABLE: lsl w0, w1, #1
-;
-; CHECK: [[IFEND_LABEL]]:
+; CHECK-NEXT: [[IFEND_LABEL]]:
 ; Epilogue code.
 ; CHECK: add sp, sp, #16
 ; CHECK-NEXT: ret
 ;
-; ENABLE: [[ELSE_LABEL]]: ; %if.else
-; ENABLE-NEXT: lsl w0, w1, #1
-; ENABLE_NEXT: ret
+; CHECK: [[ELSE_LABEL]]: ; %if.else
+; CHECK-NEXT: lsl w0, w1, #1
+; DISABLE-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
 define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
 entry:
   %ap = alloca i8*, align 8
Index: test/CodeGen/AArch64/branch-relax-cbz.ll
===================================================================
--- test/CodeGen/AArch64/branch-relax-cbz.ll
+++ test/CodeGen/AArch64/branch-relax-cbz.ll
@@ -6,23 +6,18 @@
 
 ; CHECK-NEXT: ; BB#1: ; %b3
 ; CHECK: ldr [[LOAD:w[0-9]+]]
-; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
-
-; CHECK-NEXT: [[SKIP_LONG_B]]:
+; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
 
+; CHECK-NEXT: [[B8]]: ; %b8
+; CHECK-NEXT: ret
+
 ; CHECK-NEXT: [[B2]]: ; %b2
 ; CHECK: mov w{{[0-9]+}}, #93
 ; CHECK: bl _extfunc
 ; CHECK: cbz w{{[0-9]+}}, [[B7]]
+; CHECK-NEXT: b [[B8]]
 
-; CHECK-NEXT: [[B8]]: ; %b8
-; CHECK-NEXT: ret
-
-; CHECK-NEXT: [[B7]]: ; %b7
-; CHECK: mov w{{[0-9]+}}, #13
-; CHECK: b _extfunc
 define void @split_block_no_fallthrough(i64 %val) #0 {
 bb:
   %c0 = icmp sgt i64 %val, -5
Index: test/CodeGen/AArch64/compare-branch.ll
===================================================================
--- test/CodeGen/AArch64/compare-branch.ll
+++ test/CodeGen/AArch64/compare-branch.ll
@@ -27,7 +27,7 @@
   %val4 = load volatile i64, i64* @var64
   %tst4 = icmp ne i64 %val4, 0
   br i1 %tst4, label %end, label %test5, !prof !1
-; CHECK: cbnz {{x[0-9]+}}, .LBB
+; CHECK: cbz {{x[0-9]+}}, .LBB
 
 test5:
   store volatile i64 %val4, i64* @var64
Index: test/CodeGen/AArch64/logical_shifted_reg.ll
===================================================================
--- test/CodeGen/AArch64/logical_shifted_reg.ll
+++ test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -210,7 +210,7 @@
 
 test3:
 ; CHECK: tst {{x[0-9]+}}, {{x[0-9]+}}, asr #12
-; CHECK: b.gt .L
+; CHECK: b.le .L
   %asr_op = ashr i64 %val2, 12
   %asr_and = and i64 %asr_op, %val1
   %tst3 = icmp sgt i64 %asr_and, 0
Index: test/CodeGen/AArch64/optimize-cond-branch.ll
===================================================================
--- test/CodeGen/AArch64/optimize-cond-branch.ll
+++ test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -11,7 +11,8 @@
 ;
 ; CHECK-LABEL: func
 ; CHECK-NOT: and
-; CHECK: tbnz
+; Layout reverses the test.
+; CHECK: tbz
 define void @func() {
   %c0 = icmp sgt i64 0, 0
   br i1 %c0, label %b1, label %b6
Index: test/CodeGen/AArch64/tbz-tbnz.ll
===================================================================
--- test/CodeGen/AArch64/tbz-tbnz.ll
+++ test/CodeGen/AArch64/tbz-tbnz.ll
@@ -10,7 +10,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbz [[CMP]], #31
+; CHECK: tbnz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -28,7 +28,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:x[0-9]+]], x0, #12
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -118,7 +118,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbz [[CMP]], #31
+; CHECK: tbnz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -178,7 +178,7 @@
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -194,7 +194,7 @@
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -209,7 +209,7 @@
 
 ; CHECK: ldr [[CMP:x[0-9]+]], [x1]
 ; CHECK-NOT: cmp
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
   %val = load i64, i64* %ptr
   %tst = icmp slt i64 %val, 0
@@ -229,7 +229,7 @@
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -247,7 +247,7 @@
 
 ; CHECK: orr [[CMP:x[0-9]+]], x0, x1
 ; CHECK-NOT: cmp
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -262,7 +262,7 @@
   br i1 %cond, label %if.end, label %if.then
 
 ; CHECK-NOT: and
-; CHECK: tbnz w0, #0
+; CHECK: tbz w0, #0
 
 if.then:
   call void @t()
Index: test/CodeGen/AMDGPU/basic-branch.ll
===================================================================
--- test/CodeGen/AMDGPU/basic-branch.ll
+++ test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,13 +8,10 @@
 ; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
 
-
-; GCN: ; BB#1
 ; GCNNOOPT: v_readlane_b32
 ; GCNNOOPT: v_readlane_b32
 ; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll
===================================================================
--- test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
 
 ; GCN-LABEL: {{^}}test_loop:
-; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]: ; %for.body{{$}}
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll
===================================================================
--- test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -29,6 +29,7 @@
 ; GCN: v_cmp_ne_u32_e64
 
 ; GCN: BB{{[0-9]+_[0-9]+}}:
+
 define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -439,7 +439,7 @@
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN-NOHSA: buffer_store_dword [[ONE]]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
+; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
 define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 bb3:                                              ; preds = %bb2
Index: test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
===================================================================
--- test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -37,7 +37,10 @@
 ; OPT-NOT: call i1 @llvm.amdgcn.loop
 
 ; GCN-LABEL: {{^}}annotate_ret_noloop:
-; GCN: s_cbranch_scc1
+; GCN: s_cbranch_scc0 [[BODY:BB[0-9]+_[0-9]+]]
+; GCN: s_endpgm
+
+; GCN: {{^}}[[BODY]]:
 ; GCN: s_endpgm
 ; GCN: .Lfunc_end1
 define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -252,10 +252,12 @@
 ; GCN: s_cmp_lt_i32 [[COND]], 1
 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
 ; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}}
-; GCN: s_cbranch_vccnz [[EXIT]]
-; GCN: buffer_store
+; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
 ; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
+; GCN: {{^}}[[BODY]]:
+; GCN: buffer_store
+; GCN: s_endpgm
 define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -302,9 +304,10 @@
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
-; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[ENDIF_LABEL]]
+; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
+; GCN: s_endpgm
+; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
@@ -328,14 +331,13 @@
 
 ; GCN-LABEL: {{^}}divergent_inside_uniform:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+; GCN: [[IF_LABEL]]:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-; GCN: [[ENDIF_LABEL]]:
-; GCN: s_endpgm
 define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %u_cmp = icmp eq i32 %cond, 0
@@ -363,11 +365,11 @@
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_or_b64 exec, exec, [[MASK]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
+; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]]
+; GCN: s_endpgm
+; GCN: [[IF_UNIFORM]]:
 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[TWO]]
-; GCN: [[EXIT]]:
-; GCN: s_endpgm
 define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
@@ -398,16 +400,20 @@
 ; GCN-LABEL: {{^}}cse_uniform_condition_different_blocks:
 ; GCN: s_load_dword [[COND:s[0-9]+]]
 ; GCN: s_cmp_lt_i32 [[COND]], 1
-; GCN: s_cbranch_scc1 BB[[FNNUM:[0-9]+]]_3
+; GCN: s_cbranch_scc1 [[FN:BB[0-9_]+]]
 
 ; GCN: BB#1:
 ; GCN-NOT: cmp
 ; GCN: buffer_load_dword
 ; GCN: buffer_store_dword
-; GCN: s_cbranch_scc1 BB[[FNNUM]]_3
+; GCN: s_cbranch_scc0 [[BB7:BB[0-9_]+]]
 
-; GCN: BB[[FNNUM]]_3:
+; GCN: [[FN]]:
 ; GCN: s_endpgm
+
+; GCN: [[BB7]]:
+; GCN: s_endpgm
+
 define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
Index: test/CodeGen/ARM/arm-and-tst-peephole.ll
===================================================================
--- test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -49,9 +49,9 @@
 ; V8-NEXT: beq
 ; V8-NEXT: %tailrecurse.switch
 ; V8: cmp
-; V8-NEXT: bne
-; V8-NEXT: b	
-; The trailing space in the last line checks that the branch is unconditional
+; V8-NEXT: beq
+; V8-NEXT: %sw.epilog
+; V8-NEXT: bx lr
   switch i32 %and, label %sw.epilog [
     i32 1, label %sw.bb
     i32 3, label %sw.bb6
Index: test/CodeGen/ARM/atomic-cmpxchg.ll
===================================================================
--- test/CodeGen/ARM/atomic-cmpxchg.ll
+++ test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -66,14 +66,14 @@
 ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
 ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
-; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
+; CHECK-ARMV7-NEXT: moveq r0, #1
 ; CHECK-ARMV7-NEXT: bxeq lr
 ; CHECK-ARMV7-NEXT: [[TRY]]:
-; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
-; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
+; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
+; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
 ; CHECK-ARMV7-NEXT: beq [[HEAD]]
 ; CHECK-ARMV7-NEXT: clrex
-; CHECK-ARMV7-NEXT: mov [[RES]], #0
+; CHECK-ARMV7-NEXT: mov r0, #0
 ; CHECK-ARMV7-NEXT: bx lr
 
 ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
Index: test/CodeGen/ARM/atomic-op.ll
===================================================================
--- test/CodeGen/ARM/atomic-op.ll
+++ test/CodeGen/ARM/atomic-op.ll
@@ -320,10 +320,10 @@
 ; CHECK:     strex   [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
 ; CHECK:     cmp     [[SUCCESS]], #0
 ; CHECK:     bne     [[LOOP_BB]]
-; CHECK:     b       [[END_BB:\.?LBB[0-9]+_[0-9]+]]
+; CHECK:     dmb     ish
+; CHECK:     bx      lr
 ; CHECK: [[FAIL_BB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[END_BB]]:
 ; CHECK:     dmb     ish
 ; CHECK:     bx      lr
 
Index: test/CodeGen/ARM/atomic-ops-v8.ll
===================================================================
--- test/CodeGen/ARM/atomic-ops-v8.ll
+++ test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1045,20 +1045,21 @@
   ;  function there.
 ; CHECK-ARM-NEXT:   cmp r[[OLD]], r0
 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK-ARM-NEXT: bx lr
    ret i8 %old
 }
 
@@ -1078,20 +1079,21 @@
   ;  function there.
 ; CHECK-ARM-NEXT:   cmp r[[OLD]], r0
 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK-ARM-NEXT: bx lr
    ret i16 %old
 }
 
@@ -1110,20 +1112,21 @@
   ; r0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 ; CHECK-NEXT: cmp r[[OLD]], r0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK: str{{(.w)?}} r[[OLD]],
+; CHECK-NEXT: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK: str{{(.w)?}} r[[OLD]],
+; CHECK-ARM-NEXT: bx lr
    ret void
 }
 
@@ -1148,16 +1151,16 @@
 ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]]
 ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r2, r3 is a reasonable guess.
 ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+; CHECK-NEXT: pop
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
Index: test/CodeGen/ARM/fold-stack-adjust.ll
===================================================================
--- test/CodeGen/ARM/fold-stack-adjust.ll
+++ test/CodeGen/ARM/fold-stack-adjust.ll
@@ -135,7 +135,7 @@
 
   ; Important to check for beginning of basic block, because if it gets
   ; if-converted the test is probably no longer checking what it should.
-; CHECK: {{LBB[0-9]+_2}}:
+; CHECK: %end
 ; CHECK-NEXT: vpop {d7, d8}
 ; CHECK-NEXT: pop {r4, pc}
 
Index: test/CodeGen/ARM/machine-cse-cmp.ll
===================================================================
--- test/CodeGen/ARM/machine-cse-cmp.ll
+++ test/CodeGen/ARM/machine-cse-cmp.ll
@@ -52,7 +52,7 @@
 ; CHECK-LABEL: f3:
 ; CHECK-NOT: sub
 ; CHECK: cmp
-; CHECK: blt
+; CHECK: bge
 %0 = load i32, i32* %offset, align 4
 %cmp = icmp slt i32 %0, %size
 %s = sub nsw i32 %0, %size
Index: test/CodeGen/Mips/llvm-ir/ashr.ll
===================================================================
--- test/CodeGen/Mips/llvm-ir/ashr.ll
+++ test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -91,12 +91,13 @@
   ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
   ; M2:         or        $3, $[[T3]], $[[T2]]
   ; M2:         $[[BB0]]:
-  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
   ; M2:         nop
-  ; M2:         sra       $2, $4, 31
-  ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
   ; M2:         nop
+  ; M2:         $[[BB1]]:
+  ; M2:         jr        $ra
+  ; M2:         sra       $2, $4, 31
 
   ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -177,12 +178,13 @@
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
   ; M3:             [[BB0]]:
-  ; M3:             beqz      $[[T3]], [[BB1:.LBB[0-9_]+]]
+  ; M3:             bnez      $[[T3]], [[BB1:.LBB[0-9_]+]]
   ; M3:             nop
-  ; M3:             dsra      $2, $4, 63
-  ; M3:             [[BB1]]:
   ; M3:             jr        $ra
   ; M3:             nop
+  ; M3:             [[BB1]]:
+  ; M3:             jr        $ra
+  ; M3:             dsra      $2, $4, 63
 
   ; GP64-NOT-R6:    dsrlv     $[[T0:[0-9]+]], $5, $7
   ; GP64-NOT-R6:    dsll      $[[T1:[0-9]+]], $4, 1
Index: test/CodeGen/PowerPC/tail-dup-layout.ll
===================================================================
--- test/CodeGen/PowerPC/tail-dup-layout.ll
+++ test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,51 +1,51 @@
-; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+; RUN: llc -O2 < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; Intended layout:
-; The outlining flag produces the layout
+; The chain-based outlining produces the layout
 ; test1
 ; test2
 ; test3
 ; test4
-; exit
 ; optional1
 ; optional2
 ; optional3
 ; optional4
+; exit
 ; Tail duplication puts test n+1 at the end of optional n
 ; so optional1 includes a copy of test2 at the end, and branches
 ; to test3 (at the top) or falls through to optional 2.
-; The CHECK statements check for the whole string of tests and exit block,
+; The CHECK statements check for the whole string of tests
 ; and then check that the correct test has been duplicated into the end of
 ; the optional blocks and that the optional blocks are in the correct order.
 ;CHECK-LABEL: straight_test:
 ; test1 may have been merged with entry
 ;CHECK: mr [[TAGREG:[0-9]+]], 3
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
 ;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
-;CHECK: blr
-;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK-NEXT: beq 0, .[[EXITLABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[OPT1LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[TEST3LABEL]]
-;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: beq 0, [[TEST4LABEL]]
-;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[OPT4LABEL]]
-;CHECK: b [[EXITLABEL]]
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]
+;CHECK: .[[EXITLABEL]]: # %exit
+;CHECK: blr
 
 define void @straight_test(i32 %tag) {
 entry:
@@ -53,7 +53,7 @@
 test1:
   %tagbit1 = and i32 %tag, 1
   %tagbit1eq0 = icmp eq i32 %tagbit1, 0
-  br i1 %tagbit1eq0, label %test2, label %optional1
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
 optional1:
   call void @a()
   call void @a()
@@ -63,7 +63,7 @@
 test2:
   %tagbit2 = and i32 %tag, 2
   %tagbit2eq0 = icmp eq i32 %tagbit2, 0
-  br i1 %tagbit2eq0, label %test3, label %optional2
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
 optional2:
   call void @b()
   call void @b()
@@ -73,7 +73,7 @@
 test3:
   %tagbit3 = and i32 %tag, 4
   %tagbit3eq0 = icmp eq i32 %tagbit3, 0
-  br i1 %tagbit3eq0, label %test4, label %optional3
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
 optional3:
   call void @c()
   call void @c()
@@ -83,7 +83,7 @@
 test4:
   %tagbit4 = and i32 %tag, 8
   %tagbit4eq0 = icmp eq i32 %tagbit4, 0
-  br i1 %tagbit4eq0, label %exit, label %optional4
+  br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
 optional4:
   call void @d()
   call void @d()
@@ -94,6 +94,113 @@
   ret void
 }
 
+; Intended layout:
+; The chain-based outlining produces the layout
+; entry
+; --- Begin loop ---
+; for.latch
+; for.check
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; --- End loop ---
+; exit
+; The CHECK statements check for the latch, the loop check and the string of
+; tests, and then check that the correct test has been duplicated into the end
+; of the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: loop_test:
+;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
+;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
+;CHECK: addi
+;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
+;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
+;CHECK: # %test1
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]
+;CHECK: b .[[LATCHLABEL]]
+define void @loop_test(i32* %tags, i32 %count) {
+entry:
+  br label %for.check
+for.check:
+  %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
+  %done.count = icmp ugt i32 %count.loop, 0
+  %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
+  %tag = load i32, i32* %tag_ptr
+  %done.tag = icmp eq i32 %tag, 0
+  %done = and i1 %done.count, %done.tag
+  br i1 %done, label %test1, label %exit, !prof !1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
+optional1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
+optional2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
+optional3:
+  call void @c()
+  call void @c()
+  call void @c()
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
+optional4:
+  call void @d()
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %for.latch
+for.latch:
+  %count.sub = sub i32 %count.loop, 1
+  br label %for.check
+exit:
+  ret void
+}
+
 ; The block then2 is not unavoidable, but since it can be tail-duplicated, it
 ; should be placed as a fallthrough from test2 and copied.
 ; CHECK-LABEL: avoidable_test:
@@ -105,7 +212,6 @@
 ; CHECK: # %then2
 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
 ; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
-; CHECK: # %end2
 ; CHECK: # %else1
 ; CHECK: bl a
 ; CHECK: bl a
@@ -113,6 +219,7 @@
 ; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
 ; CHECK: # %else2
 ; CHECK: bl c
+; CHECK: # %end2
 define void @avoidable_test(i32 %tag) {
 entry:
   br label %test1
@@ -141,10 +248,9 @@
   call void @d()
   ret void
 }
-
 declare void @a()
 declare void @b()
 declare void @c()
 declare void @d()
 
-!1 = !{!"branch_weights", i32 2, i32 1}
+!1 = !{!"branch_weights", i32 5, i32 3}
Index: test/CodeGen/SystemZ/asm-18.ll
===================================================================
--- test/CodeGen/SystemZ/asm-18.ll
+++ test/CodeGen/SystemZ/asm-18.ll
@@ -297,7 +297,7 @@
 ; CHECK: iihf [[REG]], 2102030405
 ; CHECK: blah [[REG]]
 ; CHECK: br %r14
-  %cmp = icmp eq i32 %x, 0
+  %cmp = icmp ne i32 %x, 0
   %val = select i1 %cmp, i32 0, i32 2102030405
   call void asm sideeffect "blah $0", "h"(i32 %val)
   ret void
@@ -311,7 +311,7 @@
 ; CHECK: iilf [[REG]], 2102030405
 ; CHECK: blah [[REG]]
 ; CHECK: br %r14
-  %cmp = icmp eq i32 %x, 0
+  %cmp = icmp ne i32 %x, 0
   %val = select i1 %cmp, i32 0, i32 2102030405
   call void asm sideeffect "blah $0", "r"(i32 %val)
   ret void
Index: test/CodeGen/SystemZ/cond-store-01.ll
===================================================================
--- test/CodeGen/SystemZ/cond-store-01.ll
+++ test/CodeGen/SystemZ/cond-store-01.ll
@@ -297,8 +297,11 @@
 define void @f18(i8 *%ptr, i8 %alt, i32 %limit) {
 ; CHECK-LABEL: f18:
 ; CHECK: lb {{%r[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: stc {{%r[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: stc {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
@@ -331,8 +334,11 @@
 ; FIXME: should use a normal load instead of CS.
 ; CHECK-LABEL: f20:
 ; CHECK: lb {{%r[0-9]+}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: stc {{%r[0-9]+}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: stc {{%r[0-9]+}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
Index: test/CodeGen/SystemZ/cond-store-02.ll
===================================================================
--- test/CodeGen/SystemZ/cond-store-02.ll
+++ test/CodeGen/SystemZ/cond-store-02.ll
@@ -297,8 +297,11 @@
 define void @f18(i16 *%ptr, i16 %alt, i32 %limit) {
 ; CHECK-LABEL: f18:
 ; CHECK: lh {{%r[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: sth {{%r[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: sth {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
@@ -331,8 +334,11 @@
 ; FIXME: should use a normal load instead of CS.
 ; CHECK-LABEL: f20:
 ; CHECK: lh {{%r[0-9]+}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: sth {{%r[0-9]+}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lr {{%r[0-9]+}}, {{%r[0-9]+}}
 ; CHECK: sth {{%r[0-9]+}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
Index: test/CodeGen/SystemZ/cond-store-03.ll
===================================================================
--- test/CodeGen/SystemZ/cond-store-03.ll
+++ test/CodeGen/SystemZ/cond-store-03.ll
@@ -226,8 +226,11 @@
 define void @f14(i32 *%ptr, i32 %alt, i32 %limit) {
 ; CHECK-LABEL: f14:
 ; CHECK: l {{%r[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: st {{%r[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: st {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
@@ -260,8 +263,11 @@
 ; FIXME: should use a normal load instead of CS.
 ; CHECK-LABEL: f16:
 ; CHECK: l {{%r[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: st {{%r[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: st {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
Index: test/CodeGen/SystemZ/cond-store-04.ll
===================================================================
--- test/CodeGen/SystemZ/cond-store-04.ll
+++ test/CodeGen/SystemZ/cond-store-04.ll
@@ -124,8 +124,11 @@
 define void @f8(i64 *%ptr, i64 %alt, i32 %limit) {
 ; CHECK-LABEL: f8:
 ; CHECK: lg {{%r[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: stg {{%r[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lgr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: stg {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
@@ -158,8 +161,11 @@
 ; FIXME: should use a normal load instead of CSG.
 ; CHECK-LABEL: f10:
 ; CHECK: lg {{%r[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: stg {{%r[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: lgr {{%r[0-5]}}, {{%r[0-5]}}
 ; CHECK: stg {{%r[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
Index: test/CodeGen/SystemZ/cond-store-05.ll
===================================================================
--- test/CodeGen/SystemZ/cond-store-05.ll
+++ test/CodeGen/SystemZ/cond-store-05.ll
@@ -156,8 +156,11 @@
 define void @f10(float *%ptr, float %alt, i32 %limit) {
 ; CHECK-LABEL: f10:
 ; CHECK: le {{%f[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: ste {{%f[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: ler {{%f[0-5]}}, {{%f[0-5]}}
 ; CHECK: ste {{%f[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
Index: test/CodeGen/SystemZ/cond-store-06.ll
===================================================================
--- test/CodeGen/SystemZ/cond-store-06.ll
+++ test/CodeGen/SystemZ/cond-store-06.ll
@@ -156,8 +156,11 @@
 define void @f10(double *%ptr, double %alt, i32 %limit) {
 ; CHECK-LABEL: f10:
 ; CHECK: ld {{%f[0-5]}}, 0(%r2)
-; CHECK: {{jl|jnl}} [[LABEL:[^ ]*]]
+; CHECK: {{jhe|jnhe}} [[LABEL:[^ ]*]]
+; CHECK: std {{%f[0-5]}}, 0(%r2)
+; CHECK: br %r14
 ; CHECK: [[LABEL]]:
+; CHECK: ldr {{%f[0-5]}}, {{%f[0-5]}}
 ; CHECK: std {{%f[0-5]}}, 0(%r2)
 ; CHECK: br %r14
   %cond = icmp ult i32 %limit, 420
Index: test/CodeGen/SystemZ/int-cmp-48.ll
===================================================================
--- test/CodeGen/SystemZ/int-cmp-48.ll
+++ test/CodeGen/SystemZ/int-cmp-48.ll
@@ -52,7 +52,7 @@
 define double @f3(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f3:
 ; CHECK: tm 0(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %byte = load i8 , i8 *%src
   %and = and i8 %byte, 1
@@ -80,7 +80,7 @@
 define double @f5(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f5:
 ; CHECK: tm 0(%r2), 1
-; CHECK: jne {{\.L.*}}
+; CHECK: je {{\.L.*}}
 ; CHECK: br %r14
   %byte = load i8 , i8 *%src
   %and = and i8 %byte, 1
@@ -93,7 +93,7 @@
 define double @f6(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f6:
 ; CHECK: tm 0(%r2), 254
-; CHECK: jo {{\.L.*}}
+; CHECK: jno {{\.L.*}}
 ; CHECK: br %r14
   %byte = load i8 , i8 *%src
   %and = and i8 %byte, 254
@@ -106,7 +106,7 @@
 define double @f7(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f7:
 ; CHECK: tm 0(%r2), 254
-; CHECK: jno {{\.L.*}}
+; CHECK: jo {{\.L.*}}
 ; CHECK: br %r14
   %byte = load i8 , i8 *%src
   %and = and i8 %byte, 254
@@ -121,7 +121,7 @@
 ; CHECK-LABEL: f8:
 ; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
 ; CHECK: tmll [[REG]], 3
-; CHECK: jh {{\.L.*}}
+; CHECK: jnh {{\.L.*}}
 ; CHECK: br %r14
   %byte = load i8 , i8 *%src
   %and = and i8 %byte, 3
@@ -135,7 +135,7 @@
 ; CHECK-LABEL: f9:
 ; CHECK: llc [[REG:%r[0-5]]], 0(%r2)
 ; CHECK: tmll [[REG]], 3
-; CHECK: jl {{\.L.*}}
+; CHECK: jnl {{\.L.*}}
 ; CHECK: br %r14
   %byte = load i8 , i8 *%src
   %and = and i8 %byte, 3
@@ -148,7 +148,7 @@
 define double @f10(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f10:
 ; CHECK: tm 4095(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 4095
   %byte = load i8 , i8 *%ptr
@@ -162,7 +162,7 @@
 define double @f11(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f11:
 ; CHECK: tmy 4096(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 4096
   %byte = load i8 , i8 *%ptr
@@ -176,7 +176,7 @@
 define double @f12(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f12:
 ; CHECK: tmy 524287(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 524287
   %byte = load i8 , i8 *%ptr
@@ -191,7 +191,7 @@
 ; CHECK-LABEL: f13:
 ; CHECK: agfi %r2, 524288
 ; CHECK: tm 0(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 524288
   %byte = load i8 , i8 *%ptr
@@ -205,7 +205,7 @@
 define double @f14(i8 *%src, double %a, double %b) {
 ; CHECK-LABEL: f14:
 ; CHECK: tmy -524288(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 -524288
   %byte = load i8 , i8 *%ptr
@@ -220,7 +220,7 @@
 ; CHECK-LABEL: f15:
 ; CHECK: agfi %r2, -524289
 ; CHECK: tm 0(%r2), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 -524289
   %byte = load i8 , i8 *%ptr
@@ -234,7 +234,7 @@
 define double @f16(i8 *%src, i64 %index, double %a, double %b) {
 ; CHECK-LABEL: f16:
 ; CHECK: tm 0({{%r[1-5]}}), 1
-; CHECK: je {{\.L.*}}
+; CHECK: jne {{\.L.*}}
 ; CHECK: br %r14
   %ptr = getelementptr i8, i8 *%src, i64 %index
   %byte = load i8 , i8 *%ptr
Index: test/CodeGen/SystemZ/tdc-06.ll
===================================================================
--- test/CodeGen/SystemZ/tdc-06.ll
+++ test/CodeGen/SystemZ/tdc-06.ll
@@ -26,25 +26,27 @@
 nonzeroord:
 ; CHECK: lhi %r2, 2
 ; CHECK: tcdb %f0, 48
-; CHECK: jl [[RET]]
+; CHECK: je [[FINITE:.]]
   %abs = tail call double @llvm.fabs.f64(double %x)
   %testinf = fcmp oeq double %abs, 0x7FF0000000000000
   br i1 %testinf, label %ret, label %finite, !prof !1
 
+ret:
+; CHECK: [[RET]]:
+; CHECK: br %r14
+  %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ]
+  ret i32 %res
+
 finite:
 ; CHECK: lhi %r2, 3
 ; CHECK: tcdb %f0, 831
 ; CHECK: blr %r14
 ; CHECK: lhi %r2, 4
+; CHECK: br %r14
   %testnormal = fcmp uge double %abs, 0x10000000000000
   %finres = select i1 %testnormal, i32 3, i32 4
   br label %ret
 
-ret:
-; CHECK: [[RET]]:
-; CHECK: br %r14
-  %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ]
-  ret i32 %res
 }
 
 !1 = !{!"branch_weights", i32 1, i32 1}
Index: test/CodeGen/WebAssembly/mem-intrinsics.ll
===================================================================
--- test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -62,8 +62,10 @@
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    ## implicit-def: %YMM0
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    jne LBB4_2
-; CHECK-NEXT:  ## BB#1: ## %load.i1247
+; CHECK-NEXT:    je LBB4_1
+; CHECK-NEXT:  ## BB#2: ## %__load_and_broadcast_32.exit1249
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  LBB4_1: ## %load.i1247
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    andq $-32, %rsp
@@ -71,7 +73,6 @@
 ; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:  LBB4_2: ## %__load_and_broadcast_32.exit1249
 ; CHECK-NEXT:    retq
 allocas:
   %udx495 = alloca [18 x [18 x float]], align 32
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -314,7 +314,7 @@
 define void @unnatural_cfg1() {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -352,7 +352,11 @@
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
-; CHECK: unnatural_cfg2
+; The tail-duplication outlining algorithm places
+; %loop.body3 and %loop.inner1.begin out-of-line at the end of the loop,
+; because %loop.body4 is unavoidable within the loop and short,
+; and %loop.inner1.begin has an alternate fallthrough of %loop.body3.
+; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -559,7 +563,7 @@
 ; didn't correctly locate the fallthrough successor, assuming blindly that the
 ; first one was the fallthrough successor. As a result, we would add an
 ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
 ; CHECK: %entry
 ; CHECK-NOT: jmp
 ; CHECK: %loop
@@ -587,7 +591,7 @@
 ; fallthrough simply won't occur. Make sure we don't crash trying to update
 ; terminators for such constructs.
 ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
 ; CHECK: %entry
 ; CHECK: %cleanup
 
@@ -609,7 +613,7 @@
 ; attempt to merge onto the wrong end of the inner loop just because we find it
 ; first. This was reduced from a crasher in GCC's single source.
 ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
 ; CHECK: %loop2b
 ; CHECK: %loop1
@@ -649,7 +653,7 @@
 ; fallthrough because that happens to always produce unanalyzable branches on
 ; x86.
 ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
 ; CHECK: %entry
 ; CHECK: %loop
 ; CHECK: %exit
@@ -673,7 +677,7 @@
 ; This branch is now analyzable and hence the destination block becomes the
 ; hotter one. The right order is entry->bar->exit->foo.
 ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
 ; CHECK: %entry
 ; CHECK: %bar
 ; CHECK: %exit
@@ -699,7 +703,7 @@
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
 ; CHECK: %entry
 ; CHECK: %a
 ; CHECK: %b
@@ -729,7 +733,7 @@
 ; Ensure that we don't crash as we're building up many unanalyzable branches,
 ; blocks, and loops.
 ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
 ; CHECK: %entry
 ; CHECK: %exit
 
@@ -948,7 +952,7 @@
 ;    strange layouts that are siginificantly less efficient, often times maing
 ;    it discontiguous.
 ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
 ; CHECK: %entry
 ; First rotated loop top.
 ; CHECK: .p2align
Index: test/CodeGen/X86/critical-edge-split-2.ll
===================================================================
--- test/CodeGen/X86/critical-edge-split-2.ll
+++ test/CodeGen/X86/critical-edge-split-2.ll
@@ -24,6 +24,7 @@
 
 ; CHECK-LABEL: test1:
 ; CHECK: testb %dil, %dil
-; CHECK: jne LBB0_2
+; CHECK: je LBB0_1
+; CHECK: retq
+; CHECK: LBB0_1:
 ; CHECK: divl
-; CHECK: LBB0_2:
Index: test/CodeGen/X86/shift-double.ll
===================================================================
--- test/CodeGen/X86/shift-double.ll
+++ test/CodeGen/X86/shift-double.ll
@@ -14,11 +14,13 @@
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    shldl %cl, %esi, %edx
 ; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    je .LBB0_2
-; CHECK-NEXT:  # BB#1:
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB0_1:
 ; CHECK-NEXT:    movl %eax, %edx
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:  .LBB0_2:
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
         %shift.upgrd.1 = zext i8 %C to i64              ; <i64> [#uses=1]
@@ -37,12 +39,14 @@
 ; CHECK-NEXT:    sarl %cl, %edx
 ; CHECK-NEXT:    shrdl %cl, %esi, %eax
 ; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    je .LBB1_2
-; CHECK-NEXT:  # BB#1:
+; CHECK-NEXT:    jne .LBB1_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB1_1:
 ; CHECK-NEXT:    sarl $31, %esi
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    movl %esi, %edx
-; CHECK-NEXT:  .LBB1_2:
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
         %shift.upgrd.2 = zext i8 %C to i64              ; <i64> [#uses=1]
@@ -61,11 +65,13 @@
 ; CHECK-NEXT:    shrl %cl, %edx
 ; CHECK-NEXT:    shrdl %cl, %esi, %eax
 ; CHECK-NEXT:    testb $32, %cl
-; CHECK-NEXT:    je .LBB2_2
-; CHECK-NEXT:  # BB#1:
+; CHECK-NEXT:    jne .LBB2_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB2_1:
 ; CHECK-NEXT:    movl %edx, %eax
 ; CHECK-NEXT:    xorl %edx, %edx
-; CHECK-NEXT:  .LBB2_2:
 ; CHECK-NEXT:    popl %esi
 ; CHECK-NEXT:    retl
         %shift.upgrd.3 = zext i8 %C to i64              ; <i64> [#uses=1]
Index: test/CodeGen/X86/sink-hoist.ll
===================================================================
--- test/CodeGen/X86/sink-hoist.ll
+++ test/CodeGen/X86/sink-hoist.ll
@@ -26,7 +26,8 @@
 
 ; CHECK-LABEL: split:
 ; CHECK-NEXT: testb $1, %dil
-; CHECK-NEXT: je
+; CHECK-NEXT: jne
+; CHECK:      ret
 ; CHECK:      divsd
 ; CHECK:      movapd
 ; CHECK:      ret
Index: test/CodeGen/X86/sse-scalar-fp-arith.ll
===================================================================
--- test/CodeGen/X86/sse-scalar-fp-arith.ll
+++ test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -1110,10 +1110,12 @@
 ; AVX1-LABEL: add_ss_mask:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    testb $1, %dil
-; AVX1-NEXT:    je .LBB62_2
-; AVX1-NEXT:  # BB#1:
+; AVX1-NEXT:    jne .LBB62_1
+; AVX1-NEXT:  # BB#2:
+; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB62_1:
 ; AVX1-NEXT:    vaddss %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:  .LBB62_2:
 ; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
 ; AVX1-NEXT:    retq
 ;
@@ -1165,10 +1167,12 @@
 ; AVX1-LABEL: add_sd_mask:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    testb $1, %dil
-; AVX1-NEXT:    je .LBB63_2
-; AVX1-NEXT:  # BB#1:
+; AVX1-NEXT:    jne .LBB63_1
+; AVX1-NEXT:  # BB#2:
+; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; AVX1-NEXT:    retq
+; AVX1-NEXT:  .LBB63_1:
 ; AVX1-NEXT:    vaddsd %xmm1, %xmm0, %xmm2
-; AVX1-NEXT:  .LBB63_2:
 ; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
 ; AVX1-NEXT:    retq
 ;
Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll
===================================================================
--- test/CodeGen/X86/tail-dup-merge-loop-headers.ll
+++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll
@@ -6,13 +6,13 @@
 ; CHECK-LABEL: tail_dup_merge_loops
 ; CHECK: # %entry
 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
+; CHECK: # %exit
+; CHECK-NOT: # %{{[a-zA-Z_]+}}
 ; CHECK: # %inner_loop_exit
 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
 ; CHECK: # %inner_loop_latch
 ; CHECK-NOT: # %{{[a-zA-Z_]+}}
 ; CHECK: # %inner_loop_test
-; CHECK-NOT: # %{{[a-zA-Z_]+}}
-; CHECK: # %exit
 define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 {
 entry:
   %notlhs674.i = icmp eq i32 %a, 0
Index: test/CodeGen/X86/tail-dup-repeat.ll
===================================================================
--- test/CodeGen/X86/tail-dup-repeat.ll
+++ test/CodeGen/X86/tail-dup-repeat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
Index: test/CodeGen/X86/tail-opts.ll
===================================================================
--- test/CodeGen/X86/tail-opts.ll
+++ test/CodeGen/X86/tail-opts.ll
@@ -112,14 +112,13 @@
 ; CHECK:        ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_3
 ; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
-; CHECK-NEXT:   ja .LBB2_4
-; CHECK-NEXT:   jmp .LBB2_2
-; CHECK-NEXT: .LBB2_3:
-; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_2
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT:   xorl %eax, %eax
 ; CHECK-NEXT:   ret
+; CHECK-NEXT: .LBB2_3:
+; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
+; CHECK-NEXT:   ja .LBB2_4
 ; CHECK-NEXT: .LBB2_2:
 ; CHECK-NEXT:   movb $1, %al
 ; CHECK-NEXT:   ret
Index: test/CodeGen/X86/twoaddr-coalesce-3.ll
===================================================================
--- test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@
 
 ; Check that only one mov will be generated in the kernel loop.
 ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
 ; CHECK-NOT: mov
@@ -56,7 +56,7 @@
 
 ; Check that only two mov will be generated in the kernel loop.
 ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
 ; CHECK-NOT: mov