Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -440,6 +440,8 @@
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  /// Returns true if a block should be tail-duplicated to increase fallthrough
+  /// opportunities.
   bool shouldTailDuplicate(MachineBasicBlock *BB);
   /// Check the edge frequencies to see if tail duplication will increase
   /// fallthroughs.
@@ -447,6 +449,17 @@
       const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
       BranchProbability AdjustedSumProb, const BlockChain &Chain,
       const BlockFilterSet *BlockFilter);
+  /// Check for a lattice layout.
+  bool isLattice(
+      const MachineBasicBlock *BB,
+      const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+      const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Get the best successor given a lattice layout.
+  BlockAndTailDupResult getBestLatticeSuccessor(
+      const MachineBasicBlock *BB,
+      const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+      BranchProbability AdjustedSumProb,
+      const BlockChain &Chain, const BlockFilterSet *BlockFilter);
   /// Returns true if a block can tail duplicate into all unplaced
   /// predecessors. Filters based on loop.
   bool canTailDuplicateUnplacedPreds(
@@ -614,7 +627,23 @@
   return SuccProb;
 }
 
-/// Check if a block should be tail duplicated.
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool hasSameSuccessors(
+    MachineBasicBlock &BB,
+    SmallPtrSetImpl<MachineBasicBlock *> &Successors) {
+  if (BB.succ_size() != Successors.size())
+    return false;
+  // We don't want to count self-loops
+  if (Successors.count(&BB))
+    return false;
+  for (MachineBasicBlock *Succ : BB.successors())
+    if (!Successors.count(Succ))
+      return false;
+  return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
 /// \p BB Block to check.
 bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
   // Blocks with single successors don't create additional fallthrough
@@ -724,24 +753,22 @@
   //  |  /          |  C' (+Succ)
   // Succ           Succ /|
   // /  \           |  \/ |
-  // U/  =V         =  /= =
+  // U/  =V         |  == |
   // /    \         | /  \|
   // D      E       D     E
   // '=' : Branch taken for that CFG edge
   // Cost in the first case is: P + V
   // For this calculation, we always assume P > Qout. If Qout > P
   // The result of this function will be ignored at the caller.
-  // Cost in the second case is: Qout + Qin * V + P * U + P * V
-  // TODO(iteratee): If we lay out D after Succ, the P * U term
-  // goes away. This logic is coming in D28522.
+  // Cost in the second case is: Qout + Qin * U + P * V
   if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
     BranchProbability UProb = BestSuccSucc;
     BranchProbability VProb = AdjustedSuccSumProb - UProb;
     BlockFrequency V = SuccFreq * VProb;
-    BlockFrequency QinV = Qin * VProb;
+    BlockFrequency QinU = Qin * UProb;
     BlockFrequency BaseCost = P + V;
-    BlockFrequency DupCost = Qout + QinV + P * AdjustedSuccSumProb;
+    BlockFrequency DupCost = Qout + QinU + P * VProb;
     return greaterWithBias(BaseCost, DupCost, EntryFreq);
   }
   BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
@@ -758,7 +785,7 @@
   // Succ        Succ /|     Succ        Succ /|
   // |  \ V      |  \/ |     |  \ V      |  \/ |
   // |U  \       |U /\ |     |U  =       |U /\ |
-  // =    D      =  =  =|    |  D        |  = =|
+  // =    D      =  = \=     |  D        |  = =|
   // |  /        |/ D        |  /        |/ D
   // | /         | /         |  =        | /
   // |/          | /         |/          |  =
@@ -768,7 +795,7 @@
   // The cost in the second case (assuming independence), given the layout:
   // BB, Succ, (C+Succ), D, Dom
   // is Qout + P * V + Qin * U
-  // compare P + U vs Qout + P + Qin * U.
+  // compare P + U vs Qout + P * U + Qin.
   //
   // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
   //
@@ -777,16 +804,185 @@
   // We choose 4 over 3 when (P + V) > Qout + Qin * U + P * V
   if (UProb > AdjustedSuccSumProb / 2 &&
       !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom],
-                                  UProb, UProb, Chain, BlockFilter)) {
+                                  UProb, UProb, Chain, BlockFilter))
     // Cases 3 & 4
     return greaterWithBias((P + V), (Qout + Qin * UProb + P * VProb),
                            EntryFreq);
-  }
   // Cases 1 & 2
   return greaterWithBias(
-      (P + U), (Qout + Qin * UProb + P * AdjustedSuccSumProb), EntryFreq);
+      (P + U), (Qout + Qin * AdjustedSuccSumProb + P * UProb), EntryFreq);
 }
 
+/// Check for a lattice layout. \p BB is part of a lattice if its successors
+/// form a lattice. A successor set S forms a lattice if all of the
+/// predecessors of S are either in S or have all of S as successors. We
+/// ignore lattices where BB doesn't have 2 successors because for fewer than
+/// 2 it's trivial, and for 3 or more they are very uncommon and complex to
+/// compute optimally.
+bool MachineBlockPlacement::isLattice(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // Technically BB could form a lattice with branching factor higher than 2.
+  // But that's extremely uncommon.
+  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
+    return false;
+
+  SmallPtrSet<MachineBasicBlock *, 4>
+      Successors(BB->succ_begin(), BB->succ_end());
+  // To avoid reviewing the same predecessors twice.
+  SmallPtrSet<MachineBasicBlock *, 8> SeenPreds;
+
+  for (auto Succ : ViableSuccs) {
+    int PredCount = 0;
+    for (auto SuccPred : Succ->predecessors()) {
+      // Allow triangle successors, but don't count them.
+      if (Successors.count(SuccPred))
+        continue;
+      if (SuccPred == BB
+          || (BlockFilter && !BlockFilter->count(SuccPred))
+          || BlockToChain[SuccPred] == &Chain
+          || BlockToChain[SuccPred] == BlockToChain[Succ])
+        continue;
+      ++PredCount;
+      // Perform the successor check only once.
+      if (!SeenPreds.insert(SuccPred).second)
+        continue;
+      if (!hasSameSuccessors(*SuccPred, Successors))
+        return false;
+    }
+    // If one of the successors has only BB as a predecessor, it is not a
+    // lattice.
+    if (PredCount < 1)
+      return false;
+  }
+  return true;
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a lattice.
+/// We only handle lattices with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the lattice. If those
+/// conflict, we consider which of them should be replaced with the second
+/// best.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestLatticeSuccessor(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    BranchProbability AdjustedSumProb,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+
+  BlockAndTailDupResult Result = { nullptr, false };
+  SmallPtrSet<MachineBasicBlock *, 4>
+      Successors(BB->succ_begin(), BB->succ_end());
+  SmallVector<SmallVector<std::tuple<BlockFrequency, MachineBasicBlock *,
+                                     MachineBasicBlock *>, 8>, 2> BestEdges(2);
+
+  // We assume size 2 because it's common. For general n, we would have to do
+  // the Hungarian algorithm, but it's not worth the complexity because more
+  // than 2 successors is fairly uncommon, and a lattice with more than 2
+  // successors is basically non-existent.
+  if (Successors.size() != 2 || ViableSuccs.size() != 2)
+    return Result;
+
+  // Collect the edge frequencies of all edges that form the lattice.
+  int SuccIndex = 0;
+  for (auto Succ : ViableSuccs) {
+    for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+      // Skip any placed predecessors that are not BB.
+      if (SuccPred != BB)
+        if ((BlockFilter && !BlockFilter->count(SuccPred))
+            || BlockToChain[SuccPred] == &Chain
+            || BlockToChain[SuccPred] == BlockToChain[Succ])
+          continue;
+      BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred)
+                                * MBPI->getEdgeProbability(SuccPred, Succ);
+      BestEdges[SuccIndex].push_back(
+          std::make_tuple(EdgeFreq, SuccPred, Succ));
+    }
+    ++SuccIndex;
+  }
+
+  auto Cmp = [](
+      std::tuple<BlockFrequency, MachineBasicBlock *, MachineBasicBlock *> A,
+      std::tuple<BlockFrequency, MachineBasicBlock *, MachineBasicBlock *> B) {
+    return std::get<0>(A) > std::get<0>(B);
+  };
+
+  // Sort the edges, and then for each successor, find the best and second best
+  // incoming predecessor. If the best incoming predecessors aren't the same,
+  // then that is clearly the best layout. If there is a conflict, one of the
+  // successors will have to fall through from the second best predecessor. We
+  // compare which combination is better overall. Given that we have split them
+  // out by successor, we know that the second best will not conflict if the
+  // best edges conflict.
+  std::stable_sort(BestEdges[0].begin(), BestEdges[0].end(), Cmp);
+  std::stable_sort(BestEdges[1].begin(), BestEdges[1].end(), Cmp);
+  auto BestA = BestEdges[0].begin();
+  auto BestB = BestEdges[1].begin();
+  auto SecondBestA = std::next(BestA);
+  auto SecondBestB = std::next(BestB);
+  assert(BestA != BestEdges[0].end() && BestB != BestEdges[1].end()
+         && SecondBestA != BestEdges[0].end()
+         && SecondBestB != BestEdges[1].end()
+         && "Should have found predecessors for both successors.");
+  // Arrange for the correct answer to be in BestA and BestB.
+  // If the 2 best edges don't conflict, the answer is already there.
+  if (std::get<1>(*BestA) == std::get<1>(*BestB)) {
+    // Compare the total fallthrough frequency for each second best and pick
+    // the higher.
+    BlockFrequency BestAScore = std::get<0>(*BestA) + std::get<0>(*SecondBestB);
+    BlockFrequency BestBScore = std::get<0>(*BestB) + std::get<0>(*SecondBestA);
+    if (BestAScore < BestBScore)
+      BestA = SecondBestA;
+    else
+      BestB = SecondBestB;
+  }
+  // Arrange for the BB edge to be in BestA if it exists.
+  if (std::get<1>(*BestB) == BB) {
+    std::swap(BestA, BestB);
+  }
+  if (std::get<1>(*BestA) == BB) {
+    // Did we pick the triangle edge? If tail-duplication is profitable, do
+    // that instead. Otherwise merge the triangle edge now while we know it is
+    // optimal.
+    if (std::get<2>(*BestA) == std::get<1>(*BestB)) {
+      // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+      // would be better.
+      auto Succ1 = std::get<2>(*BestA);
+      auto Succ2 = std::get<2>(*BestB);
+      // Check to see if tail-duplication would be profitable.
+      if (TailDupPlacement &&
+          shouldTailDuplicate(Succ2) &&
+          canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+          isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+                                Chain, BlockFilter)) {
+        DEBUG(
+            BranchProbability Succ2Prob = getAdjustedProbability(
+                MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+            dbgs() << " Selected: " << getBlockName(Succ2)
+                   << ", probability: " << Succ2Prob
+                   << " (Tail Duplicate)\n");
+        Result.BB = Succ2;
+        Result.ShouldTailDup = true;
+        return Result;
+      }
+      // Else merge.
+      BlockToChain[Succ1]->merge(Succ2, BlockToChain[Succ2]);
+    }
+    auto LatticeSucc = std::get<2>(*BestA);
+    DEBUG(
+        BranchProbability SuccProb = getAdjustedProbability(
+            MBPI->getEdgeProbability(BB, LatticeSucc), AdjustedSumProb);
+        dbgs() << " Selected: " << getBlockName(LatticeSucc)
+               << ", probability: " << SuccProb << " (Lattice)\n");
+    Result.BB = LatticeSucc;
+    return Result;
+  }
+  // If we have a lattice, and BB doesn't have the best fallthrough edges,
+  // we shouldn't choose any successor. We've already looked and there's a
+  // better fallthrough edge for all the successors.
+  DEBUG(dbgs() << "Lattice, but not one of the chosen edges.\n");
+  return Result;
+}
 
 /// When the option TailDupPlacement is on, this method checks if the
 /// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
@@ -797,6 +993,8 @@
   if (!shouldTailDuplicate(Succ))
     return false;
 
+  // For CFG checking.
+  SmallPtrSet<MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                 BB->succ_end());
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     // Make sure all unplaced and unfiltered predecessors can be
     // tail-duplicated into.
@@ -804,8 +1002,43 @@
     if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
         || BlockToChain[Pred] == &Chain)
       continue;
-    if (!TailDup.canTailDuplicate(Succ, Pred))
+    if (!TailDup.canTailDuplicate(Succ, Pred)) {
+      if (Successors.size() > 1
+          && hasSameSuccessors(*Pred, Successors))
+        // This will result in a lattice after tail duplication, so we don't
+        // need to copy Succ into this predecessor. In the presence of a
+        // lattice, tail duplication can continue to be profitable.
+        // For example:
+        // A            A
+        // |\           |\
+        // | \          | \
+        // |  C         |  C+BB
+        // | /          |  |
+        // |/           |  |
+        // BB    =>     BB |
+        // |\           |\/|
+        // | \          |/\|
+        // |  D         |  D
+        // | /          |  /
+        // |/           | /
+        // Succ         Succ
+        //
+        // After BB was duplicated into C, the layout looks like the one on the
+        // right. BB and C now have the same successors. When considering
+        // whether Succ can be duplicated into all its unplaced predecessors,
+        // we ignore C. We can do this because C already has a profitable
+        // fallthrough, namely D. TODO(iteratee): ignore sufficiently cold
+        // predecessors for duplication and for this test.
+        //
+        // This allows lattices to be laid out in 2 separate chains
+        // (A,B,Succ,...) and later (C,D,...). This is a reasonable heuristic
+        // because it allows the creation of 2 fallthrough paths with links
+        // between them, and we correctly identify the best layout for these
+        // CFGs. We want to extend lattices that the user created in addition
+        // to lattices created by tail-duplication, so we just look for the
+        // CFG.
+        continue;
       return false;
+    }
   }
   return true;
 }
 
@@ -990,11 +1223,12 @@
   //  |  Pred----|      |  S1----
   //  |          |      |       |
   //  --(S1 or S2)      ---Pred--
+  //                        |
+  //                        S2
   //
   // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
   //            + min(freq(Pred->S1), freq(Pred->S2))
   // Non-topo-order cost:
-  // In the worst case, S2 will not get laid out after Pred.
   // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
   // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
   // is 0. Then the non topo layout is better when
@@ -1073,6 +1307,13 @@
   DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB)
                << "\n");
 
+  // If BB is part of a lattice, use the lattice to determine the optimal
+  // fallthrough edges.
+  if (isLattice(BB, Successors, Chain, BlockFilter)) {
+    return getBestLatticeSuccessor(
+        BB, Successors, AdjustedSumProb, Chain, BlockFilter);
+  }
+
   // For blocks with CFG violations, we may be able to lay them out anyway with
   // tail-duplication. We keep this vector so we can perform the probability
   // calculations the minimum number of times.
Index: test/CodeGen/AArch64/branch-relax-cbz.ll
===================================================================
--- test/CodeGen/AArch64/branch-relax-cbz.ll
+++ test/CodeGen/AArch64/branch-relax-cbz.ll
@@ -6,23 +6,22 @@
 ; CHECK-NEXT: ; BB#1: ; %b3
 ; CHECK: ldr [[LOAD:w[0-9]+]]
-; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
-
-; CHECK-NEXT: [[SKIP_LONG_B]]:
+; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
 
+; CHECK-NEXT: [[B8]]: ; %b8
+; CHECK-NEXT: ret
+
 ; CHECK-NEXT: [[B2]]: ; %b2
 ; CHECK: mov w{{[0-9]+}}, #93
 ; CHECK: bl _extfunc
 ; CHECK: cbz w{{[0-9]+}}, [[B7]]
-
-; CHECK-NEXT: [[B8]]: ; %b8
-; CHECK-NEXT: ret
+; CHECK-NEXT: b [[B8]]
 
 ; CHECK-NEXT: [[B7]]: ; %b7
 ; CHECK: mov w{{[0-9]+}}, #13
 ; CHECK: b _extfunc
+
 define void @split_block_no_fallthrough(i64 %val) #0 {
 bb:
   %c0 = icmp sgt i64 %val, -5
Index: test/CodeGen/AArch64/combine-comparisons-by-cse.ll
===================================================================
--- test/CodeGen/AArch64/combine-comparisons-by-cse.ll
+++ test/CodeGen/AArch64/combine-comparisons-by-cse.ll
@@ -264,7 +264,7 @@
 define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 {
 ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ
 ; CHECK: cmn
-; CHECK: b.gt
+; CHECK: b.le
 ; CHECK: cmp
 ; CHECK: b.gt
 entry:
Index: test/CodeGen/AArch64/optimize-cond-branch.ll
===================================================================
--- test/CodeGen/AArch64/optimize-cond-branch.ll
+++ test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -11,7 +11,7 @@
 ;
 ; CHECK-LABEL: func
 ; CHECK-NOT: and
-; CHECK: tbnz
+; CHECK: tbz
 define void @func() {
   %c0 = icmp sgt i64 0, 0
   br i1 %c0, label %b1, label %b6
Index: test/CodeGen/AMDGPU/basic-branch.ll
===================================================================
--- test/CodeGen/AMDGPU/basic-branch.ll
+++ test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,13 +8,10 @@
 ; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
-
-; GCN: ; BB#1
 ; GCNNOOPT: v_readlane_b32
 ; GCNNOOPT: v_readlane_b32
 ; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
Index: test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-relaxation.ll
+++ test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -491,7 +491,8
@@ ; GCN-LABEL: {{^}}long_branch_hang: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6 -; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]] +; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}} +; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]] ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-( Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -2,7 +2,7 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s ; GCN-LABEL: {{^}}test_loop: -; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: +; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}} ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll =================================================================== --- test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -29,6 +29,7 @@ ; GCN: v_cmp_ne_u32_e64 ; GCN: BB{{[0-9]+_[0-9]+}}: + define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -439,7 +439,7 @@ ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-NOHSA: buffer_store_dword [[ONE]] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] -; GCN; {{^}}[[EXIT]]: +; GCN: {{^}}[[EXIT]]: ; GCN: s_endpgm define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 Index: test/CodeGen/ARM/2007-05-22-tailmerge-3.ll =================================================================== --- test/CodeGen/ARM/2007-05-22-tailmerge-3.ll +++ test/CodeGen/ARM/2007-05-22-tailmerge-3.ll @@ -12,11 +12,11 @@ ; CHECK: bl _quux ; CHECK-NOT: bl _quux -; NOMERGE: bl _baz -; NOMERGE: bl _baz +; NOMERGE-DAG: bl _baz +; NOMERGE-DAG: bl _baz -; NOMERGE: bl _quux -; NOMERGE: bl _quux +; NOMERGE-DAG: bl _quux +; NOMERGE-DAG: bl _quux ; ModuleID = 'tail.c' target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64" Index: test/CodeGen/ARM/atomic-cmpxchg.ll =================================================================== --- test/CodeGen/ARM/atomic-cmpxchg.ll +++ test/CodeGen/ARM/atomic-cmpxchg.ll @@ -66,14 +66,14 @@ ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]: ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0] ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 -; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1 +; CHECK-ARMV7-NEXT: moveq r0, #1 ; CHECK-ARMV7-NEXT: bxeq lr ; CHECK-ARMV7-NEXT: [[TRY]]: -; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0] -; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]] +; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0] +; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1 ; CHECK-ARMV7-NEXT: beq [[HEAD]] ; CHECK-ARMV7-NEXT: clrex -; CHECK-ARMV7-NEXT: mov [[RES]], #0 +; CHECK-ARMV7-NEXT: mov r0, #0 ; CHECK-ARMV7-NEXT: bx lr ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8: Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -135,7 +135,7 @@ ; Important to check for beginning of basic block, because if it gets ; if-converted the test is probably no longer checking what it 
should. -; CHECK: {{LBB[0-9]+_2}}: +; CHECK: %end ; CHECK-NEXT: vpop {d7, d8} ; CHECK-NEXT: pop {r4, pc} Index: test/CodeGen/PowerPC/tail-dup-break-cfg.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-break-cfg.ll +++ test/CodeGen/PowerPC/tail-dup-break-cfg.ll @@ -16,14 +16,14 @@ ;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: # %test2 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 -;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]] +;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK: blr ;CHECK-NEXT: [[BODY1LABEL]] ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 ;CHECK-NEXT: beq 0, [[EXITLABEL]] -;CHECK-NEXT: [[BODY2LABEL]] -;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit -;CHECK: blr +;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]: +;CHECK: b [[EXITLABEL]] define void @tail_dup_break_cfg(i32 %tag) { entry: br label %test1 @@ -79,7 +79,7 @@ test2: %tagbit2 = and i32 %tag, 2 %tagbit2eq0 = icmp ne i32 %tagbit2, 0 - br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely + br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely body2: call void @b() call void @b() @@ -137,4 +137,4 @@ !1 = !{!"branch_weights", i32 5, i32 3} !2 = !{!"branch_weights", i32 95, i32 5} -!3 = !{!"branch_weights", i32 7, i32 3} +!3 = !{!"branch_weights", i32 8, i32 3} Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,59 +1,59 @@ -; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +; RUN: llc -O2 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" ; Intended layout: -; The outlining flag produces the layout +; The chain-based outlining produces the layout ; test1 ; test2 ; test3 ; test4 -; exit ; optional1 ; optional2 ; optional3 ; optional4 +; exit ; Tail duplication puts test n+1 at the end of optional n ; so optional1 includes a copy of test2 at the end, and branches ; to test3 (at the top) or falls through to optional 2. -; The CHECK statements check for the whole string of tests and exit block, +; The CHECK statements check for the whole string of tests ; and then check that the correct test has been duplicated into the end of ; the optional blocks and that the optional blocks are in the correct order. -;CHECK-LABEL: f: +;CHECK-LABEL: straight_test: ; test1 may have been merged with entry ;CHECK: mr [[TAGREG:[0-9]+]], 3 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 -;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2 +;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 -;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 -;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4 ;CHECK-NEXT: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 -;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit +;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]] +;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit ;CHECK: blr -;CHECK-NEXT: [[OPT1LABEL]] +;CHECK-NEXT: .[[OPT1LABEL]]: ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 -;CHECK-NEXT: beq 0, [[TEST3LABEL]] -;CHECK-NEXT: [[OPT2LABEL]] +;CHECK-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-NEXT: .[[OPT2LABEL]]: ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 -;CHECK-NEXT: beq 0, [[TEST4LABEL]] -;CHECK-NEXT: [[OPT3LABEL]] +;CHECK-NEXT: beq 0, .[[TEST4LABEL]] +;CHECK-NEXT: .[[OPT3LABEL]]: ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 -;CHECK-NEXT: beq 0, [[EXITLABEL]] -;CHECK-NEXT: [[OPT4LABEL]] -;CHECK: b [[EXITLABEL]] +;CHECK-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-NEXT: .[[OPT4LABEL]]: +;CHECK: b .[[EXITLABEL]] -define void @f(i32 %tag) { +define void @straight_test(i32 %tag) { entry: br label %test1 test1: %tagbit1 = and i32 %tag, 1 %tagbit1eq0 = icmp eq i32 %tagbit1, 0 - br i1 %tagbit1eq0, label %test2, label %optional1 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1 optional1: call void @a() call void @a() @@ -63,7 +63,7 @@ test2: %tagbit2 = and i32 %tag, 2 %tagbit2eq0 = icmp eq i32 %tagbit2, 0 - br i1 %tagbit2eq0, label %test3, label %optional2 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1 optional2: call void @b() call void @b() @@ -73,7 +73,7 @@ test3: %tagbit3 = and i32 %tag, 4 %tagbit3eq0 = icmp eq i32 %tagbit3, 0 - br i1 %tagbit3eq0, label %test4, label %optional3 + br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1 optional3: call void @c() call void @c() @@ -83,7 +83,7 @@ test4: %tagbit4 = and i32 %tag, 8 %tagbit4eq0 = icmp eq i32 %tagbit4, 0 - br i1 %tagbit4eq0, label %exit, label %optional4 + br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1 optional4: call void @d() call void @d() @@ -94,7 +94,289 @@ ret void } +; Intended layout: +; The chain-based outlining produces the layout +; entry +; --- Begin loop --- +; for.latch +; for.check +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; --- End loop --- +; exit +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: loop_test: +;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4 +;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch +;CHECK: addi +;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check +;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]]) +;CHECK: # %test1 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}} +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, .[[LATCHLABEL]] +;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-NEXT: .[[OPT2LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, .[[TEST4LABEL]] +;CHECK-NEXT: .[[OPT3LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, .[[LATCHLABEL]] +;CHECK: [[OPT4LABEL]]: +;CHECK: b .[[LATCHLABEL]] +define void @loop_test(i32* %tags, i32 %count) { +entry: + br label %for.check +for.check: + %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch] + %done.count = icmp ugt i32 %count.loop, 0 + %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count + %tag = load i32, i32* %tag_ptr + %done.tag = icmp eq i32 %tag, 0 + %done = and i1 %done.count, %done.tag + br i1 %done, label %test1, label %exit, !prof !1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %for.latch +for.latch: + %count.sub = sub i32 %count.loop, 1 + br label %for.check +exit: + ret void +} + +; The block then2 is not unavoidable, but since it can be tail-duplicated, it +; should be placed as a fallthrough from test2 and copied. +; CHECK-LABEL: avoidable_test: +; CHECK: # %entry +; CHECK: andi. +; CHECK: # %test2 +; Make sure then2 falls through from test2 +; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}} +; CHECK: # %then2 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK: # %else1 +; CHECK: bl a +; CHECK: bl a +; Make sure then2 was copied into else1 +; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29 +; CHECK: # %end1 +; CHECK: bl d +; CHECK: # %else2 +; CHECK: bl c +; CHECK: # %end2 +define void @avoidable_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely +else1: + call void @a() + call void @a() + br label %then2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely +then2: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely +else2: + call void @c() + br label %end2 +end2: + ret void +end1: + call void @d() + ret void +} + +; CHECK-LABEL: lattice_test +; The most important edge here is f->ret. 
It's twice as hot as the rest +; We should also see: +; entry -> (b|c) +; b -> (d|e) +; c -> (d|e) +; (d|e) -> f +; CHECK: # %entry +; CHECK: # %c +; CHECK: # %d +; CHECK: # %f +; CHECK: # %ret +; CHECK: # %b +; CHECK: # %e +define void @lattice_test(i32 %tag) { +entry: + br label %a +a: + call void @a() + call void @a() + %tagbits.a = and i32 %tag, 3 + %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0 + br i1 %tagbits.a.eq0, label %c, label %b, !prof !2 ; balanced +c: + call void @c() + call void @c() + %tagbits.c = and i32 %tag, 12 + %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0 + br i1 %tagbits.c.eq0, label %e, label %d, !prof !2 ; balanced +e: + call void @e() + call void @e() + %tagbits.e = and i32 %tag, 48 + %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0 + br i1 %tagbits.e.eq0, label %f, label %ret, !prof !2 ; balanced +b: + call void @b() + call void @b() + %tagbits.b = and i32 %tag, 12 + %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8 + br i1 %tagbits.b.eq1, label %d, label %e, !prof !2 ; balanced +d: + call void @d() + call void @d() + %tagbits.d = and i32 %tag, 48 + %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32 + br i1 %tagbits.d.eq1, label %f, label %ret, !prof !2 ; balanced +f: + call void @f() + call void @f() + br label %ret +ret: + ret void +} + +; Verify that we still consider tail-duplication opportunities if we find a +; triangle lattice. Here D->F->G is the triangle, and D;E are both predecessors +; of both F and G. The basic lattice algorithm picks the F->G edge, but after +; checking, it's profitable to duplicate G into F. +; CHECK-LABEL: lattice_then_dup_test +; CHECK: # %entry +; CHECK: # %b +; CHECK: # %e +; CHECK: # %g +; CHECK: # %ret1 +; CHECK: # %c +; CHECK: # %d +; CHECK: # %f +; CHECK: # %ret2 +; CHECK: # %ret +define void @lattice_then_dup_test(i32 %tag) { +entry: + br label %a +a: + call void @a() + call void @a() + %tagbits.a = and i32 %tag, 3 + %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0 + br i1 %tagbits.a.eq0, label %b, label %c, !prof !2 ; balanced +b: + call void @b() + call void @b() + %tagbits.b = and i32 %tag, 12 + %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8 + br i1 %tagbits.b.eq1, label %d, label %e, !prof !2 ; balanced +d: + call void @d() + call void @d() + %tagbits.d = and i32 %tag, 48 + %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32 + br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; balanced +f: + call void @f() + call void @f() + br label %g +g: + %tagbits.g = and i32 %tag, 192 + %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0 + br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced +c: + call void @c() + call void @c() + %tagbits.c = and i32 %tag, 12 + %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0 + br i1 %tagbits.c.eq0, label %e, label %d, !prof !2 ; balanced +e: + call void @e() + call void @e() + %tagbits.e = and i32 %tag, 48 + %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0 + br i1 %tagbits.e.eq0, label %f, label %g, !prof !2 ; balanced +ret1: + call void @a() + br label %ret +ret2: + call void @b() + br label %ret +ret: + ret void +} + declare void @a() declare void @b() declare void @c() declare void @d() +declare void @e() +declare void @f() + +!1 = !{!"branch_weights", i32 5, i32 3} +!2 = !{!"branch_weights", i32 50, i32 50} Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -67,14 +67,18 @@ ; CHECK: nop ; CHECK:.LBB1_1: ! %entry ; CHECK: mov %g0, %i0 +; CHECK: ! 
%entry ; CHECK: cmp %i0, 0 -; CHECK: bne .LBB1_4 -; CHECK: ba .LBB1_5 -; CHECK:.LBB1_2: ! Block address taken -; CHECK: mov 1, %i0 ; CHECK: be .LBB1_5 +; CHECK: nop ; CHECK:.LBB1_4: -; CHECK: ba .LBB1_6 +; CHECK: mov 1, %i0 +; CHECK: ba .LBB1_6 +; CHECK:.LBB1_2: ! Block address taken +; CHECK: mov 1, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: nop } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/WebAssembly/mem-intrinsics.ll =================================================================== --- test/CodeGen/WebAssembly/mem-intrinsics.ll +++ test/CodeGen/WebAssembly/mem-intrinsics.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s +; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s ; Test memcpy, memmove, and memset intrinsics. Index: test/CodeGen/X86/block-placement.ll =================================================================== --- test/CodeGen/X86/block-placement.ll +++ test/CodeGen/X86/block-placement.ll @@ -314,7 +314,7 @@ define void @unnatural_cfg1() { ; Test that we can handle a loop with an inner unnatural loop at the end of ; a function. This is a gross CFG reduced out of the single source GCC. -; CHECK: unnatural_cfg1 +; CHECK-LABEL: unnatural_cfg1 ; CHECK: %entry ; CHECK: %loop.body1 ; CHECK: %loop.body2 @@ -352,17 +352,15 @@ ; Test that we can handle a loop with a nested natural loop *and* an unnatural ; loop. This was reduced from a crash on block placement when run over ; single-source GCC. -; CHECK: unnatural_cfg2 +; CHECK-LABEL: unnatural_cfg2 ; CHECK: %entry ; CHECK: %loop.body1 ; CHECK: %loop.body2 -; CHECK: %loop.body3 -; CHECK: %loop.inner1.begin -; The end block is folded with %loop.body3... -; CHECK-NOT: %loop.inner1.end ; CHECK: %loop.body4 ; CHECK: %loop.inner2.begin -; The loop.inner2.end block is folded +; CHECK: %loop.inner2.begin +; CHECK: %loop.body3 +; CHECK: %loop.inner1.begin ; CHECK: %loop.header ; CHECK: %bail @@ -559,7 +557,7 @@ ; didn't correctly locate the fallthrough successor, assuming blindly that the ; first one was the fallthrough successor. As a result, we would add an ; erroneous jump to the landing pad thinking *that* was the default successor. -; CHECK: test_eh_lpad_successor +; CHECK-LABEL: test_eh_lpad_successor ; CHECK: %entry ; CHECK-NOT: jmp ; CHECK: %loop @@ -587,7 +585,7 @@ ; fallthrough simply won't occur. Make sure we don't crash trying to update ; terminators for such constructs. ; -; CHECK: test_eh_throw +; CHECK-LABEL: test_eh_throw ; CHECK: %entry ; CHECK: %cleanup @@ -609,7 +607,7 @@ ; attempt to merge onto the wrong end of the inner loop just because we find it ; first. This was reduced from a crasher in GCC's single source. ; -; CHECK: test_unnatural_cfg_backwards_inner_loop +; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop ; CHECK: %entry ; CHECK: %loop2b ; CHECK: %loop1 @@ -649,7 +647,7 @@ ; fallthrough because that happens to always produce unanalyzable branches on ; x86. ; -; CHECK: unanalyzable_branch_to_loop_header +; CHECK-LABEL: unanalyzable_branch_to_loop_header ; CHECK: %entry ; CHECK: %loop ; CHECK: %exit @@ -673,7 +671,7 @@ ; This branch is now analyzable and hence the destination block becomes the ; hotter one. The right order is entry->bar->exit->foo. 
; -; CHECK: unanalyzable_branch_to_best_succ +; CHECK-LABEL: unanalyzable_branch_to_best_succ ; CHECK: %entry ; CHECK: %bar ; CHECK: %exit @@ -699,7 +697,7 @@ ; Ensure that we can handle unanalyzable branches where the destination block ; gets selected as the best free block in the CFG. ; -; CHECK: unanalyzable_branch_to_free_block +; CHECK-LABEL: unanalyzable_branch_to_free_block ; CHECK: %entry ; CHECK: %a ; CHECK: %b @@ -729,7 +727,7 @@ ; Ensure that we don't crash as we're building up many unanalyzable branches, ; blocks, and loops. ; -; CHECK: many_unanalyzable_branches +; CHECK-LABEL: many_unanalyzable_branches ; CHECK: %entry ; CHECK: %exit @@ -948,7 +946,7 @@ ; strange layouts that are siginificantly less efficient, often times maing ; it discontiguous. ; -; CHECK: @benchmark_heapsort +; CHECK-LABEL: @benchmark_heapsort ; CHECK: %entry ; First rotated loop top. ; CHECK: .p2align Index: test/CodeGen/X86/bypass-slow-division-32.ll =================================================================== --- test/CodeGen/X86/bypass-slow-division-32.ll +++ test/CodeGen/X86/bypass-slow-division-32.ll @@ -95,20 +95,19 @@ ; CHECK-NEXT: idivl %ebx ; CHECK-NEXT: movl %eax, %esi ; CHECK-NEXT: testl $-256, %edi -; CHECK-NEXT: jne .LBB3_5 -; CHECK-NEXT: jmp .LBB3_4 -; CHECK-NEXT: .LBB3_1: -; CHECK-NEXT: movzbl %cl, %eax -; CHECK-NEXT: # kill: %EAX %EAX %AX -; CHECK-NEXT: divb %bl -; CHECK-NEXT: movzbl %al, %esi -; CHECK-NEXT: testl $-256, %edi ; CHECK-NEXT: je .LBB3_4 ; CHECK-NEXT: .LBB3_5: ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: divl %ebx ; CHECK-NEXT: jmp .LBB3_6 +; CHECK-NEXT: .LBB3_1: +; CHECK-NEXT: movzbl %cl, %eax +; CHECK-NEXT: # kill: %EAX %EAX %AX +; CHECK-NEXT: divb %bl +; CHECK-NEXT: movzbl %al, %esi +; CHECK-NEXT: testl $-256, %edi +; CHECK-NEXT: jne .LBB3_5 ; CHECK-NEXT: .LBB3_4: ; CHECK-NEXT: movzbl %cl, %eax ; CHECK-NEXT: # kill: %EAX %EAX %AX Index: test/CodeGen/X86/sse1.ll =================================================================== --- test/CodeGen/X86/sse1.ll +++ test/CodeGen/X86/sse1.ll @@ -60,7 +60,13 @@ ; X32-NEXT: xorps %xmm1, %xmm1 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: jne .LBB1_5 -; X32-NEXT: jmp .LBB1_4 +; X32-NEXT: .LBB1_4: +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: jne .LBB1_8 +; X32-NEXT: .LBB1_7: +; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: jmp .LBB1_9 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) @@ -68,17 +74,9 @@ ; X32-NEXT: .LBB1_5: # %entry ; X32-NEXT: xorps %xmm2, %xmm2 ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) -; X32-NEXT: jne .LBB1_8 -; X32-NEXT: jmp .LBB1_7 -; X32-NEXT: .LBB1_4: -; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: je .LBB1_7 ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: jmp .LBB1_9 -; X32-NEXT: .LBB1_7: -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; X32-NEXT: .LBB1_9: # %entry ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -99,7 +97,13 @@ ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: testl %edx, %edx ; X64-NEXT: jne .LBB1_5 -; X64-NEXT: jmp .LBB1_4 +; X64-NEXT: .LBB1_4: +; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: testl %r8d, %r8d +; X64-NEXT: jne .LBB1_8 +; X64-NEXT: .LBB1_7: +; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X64-NEXT: jmp .LBB1_9 ; 
X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: testl %edx, %edx @@ -107,17 +111,9 @@ ; X64-NEXT: .LBB1_5: # %entry ; X64-NEXT: xorps %xmm2, %xmm2 ; X64-NEXT: testl %r8d, %r8d -; X64-NEXT: jne .LBB1_8 -; X64-NEXT: jmp .LBB1_7 -; X64-NEXT: .LBB1_4: -; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X64-NEXT: testl %r8d, %r8d ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: jmp .LBB1_9 -; X64-NEXT: .LBB1_7: -; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero ; X64-NEXT: .LBB1_9: # %entry ; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] @@ -215,7 +211,7 @@ ret <4 x i32> %zext } -; Fragile test warning - we need to induce the generation of a vselect +; Fragile test warning - we need to induce the generation of a vselect ; post-legalization to cause the crash seen in: ; https://llvm.org/bugs/show_bug.cgi?id=31672 ; Is there a way to do that without an unsafe/fast sqrt intrinsic call? Index: test/CodeGen/X86/tail-dup-merge-loop-headers.ll =================================================================== --- test/CodeGen/X86/tail-dup-merge-loop-headers.ll +++ test/CodeGen/X86/tail-dup-merge-loop-headers.ll @@ -6,13 +6,13 @@ ; CHECK-LABEL: tail_dup_merge_loops ; CHECK: # %entry ; CHECK-NOT: # %{{[a-zA-Z_]+}} +; CHECK: # %exit +; CHECK-NOT: # %{{[a-zA-Z_]+}} ; CHECK: # %inner_loop_exit ; CHECK-NOT: # %{{[a-zA-Z_]+}} ; CHECK: # %inner_loop_latch ; CHECK-NOT: # %{{[a-zA-Z_]+}} ; CHECK: # %inner_loop_test -; CHECK-NOT: # %{{[a-zA-Z_]+}} -; CHECK: # %exit define void @tail_dup_merge_loops(i32 %a, i8* %b, i8* %c) local_unnamed_addr #0 { entry: %notlhs674.i = icmp eq i32 %a, 0 Index: test/CodeGen/X86/tail-dup-repeat.ll =================================================================== --- test/CodeGen/X86/tail-dup-repeat.ll +++ test/CodeGen/X86/tail-dup-repeat.ll @@ -1,4 +1,4 @@ -; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s +; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" Index: test/CodeGen/X86/tail-opts.ll =================================================================== --- test/CodeGen/X86/tail-opts.ll +++ test/CodeGen/X86/tail-opts.ll @@ -113,16 +113,15 @@ ; CHECK-NEXT: jbe .LBB2_3 ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} ; CHECK-NEXT: ja .LBB2_4 -; CHECK-NEXT: jmp .LBB2_2 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: movb $1, %al +; CHECK-NEXT: ret ; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}} ; CHECK-NEXT: jbe .LBB2_2 ; CHECK-NEXT: .LBB2_4: ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: ret -; CHECK-NEXT: .LBB2_2: -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: ret define i1 @dont_merge_oddly(float* %result) nounwind { entry: Index: test/CodeGen/X86/twoaddr-coalesce-3.ll =================================================================== --- test/CodeGen/X86/twoaddr-coalesce-3.ll +++ test/CodeGen/X86/twoaddr-coalesce-3.ll @@ -19,7 +19,7 @@ ; Check that only one mov will be generated in the kernel loop. ; CHECK-LABEL: foo: -; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body +; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}} ; CHECK-NOT: mov ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]] ; CHECK-NOT: mov @@ -56,7 +56,7 @@ ; Check that only two mov will be generated in the kernel loop. 
; CHECK-LABEL: goo: -; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body +; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}} ; CHECK-NOT: mov ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]] ; CHECK-NOT: mov Index: test/CodeGen/X86/win-alloca-expander.ll =================================================================== --- test/CodeGen/X86/win-alloca-expander.ll +++ test/CodeGen/X86/win-alloca-expander.ll @@ -115,34 +115,36 @@ ; Test that the blocks are analyzed in the correct order. ; CHECK-LABEL: cfg: entry: - br i1 %x, label %bb1, label %bb2 + br i1 %x, label %bb1, label %bb3 bb1: %p1 = alloca %struct.S ; CHECK: pushl %eax ; CHECK: subl $1020, %esp - br label %bb3 + br label %bb4 + bb2: - %p2 = alloca %struct.T + %p5 = alloca %struct.T ; CHECK: pushl %eax ; CHECK: subl $2996, %esp - br label %bb3 + call void @g(%struct.T* %p5) + ret void bb3: - br i1 %y, label %bb4, label %bb5 + %p2 = alloca %struct.T +; CHECK: pushl %eax +; CHECK: subl $2996, %esp + br label %bb4 bb4: + br i1 %y, label %bb5, label %bb2 + +bb5: %p4 = alloca %struct.S ; CHECK: subl $1024, %esp call void @f(%struct.S* %p4) ret void -bb5: - %p5 = alloca %struct.T -; CHECK: pushl %eax -; CHECK: subl $2996, %esp - call void @g(%struct.T* %p5) - ret void }
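
Note (reviewer illustration, not part of the patch): the corrected profitability comparison in isProfitableToTailDup reduces to a small arithmetic check, BaseCost = P + V versus DupCost = Qout + Qin * U + P * V. The sketch below restates it with plain doubles standing in for BlockFrequency/BranchProbability; the entry-frequency bias applied by greaterWithBias is omitted, and all names and numbers are invented for the example.

// tail_dup_cost_sketch.cpp - standalone, not LLVM API.
#include <iostream>

// P  = frequency of the BB->Succ edge we want to make a fallthrough.
// Qout/Qin = frequency of BB's other outgoing edge / Succ's other incoming edge.
// U/V = probabilities of Succ's two successors (U is the hotter one).
static bool profitableToTailDup(double P, double Qout, double Qin,
                                double U, double V) {
  double BaseCost = P + V;               // cost of the non-duplicated layout
  double DupCost = Qout + Qin * U + P * V; // cost after duplicating Succ
  return BaseCost > DupCost;
}

int main() {
  // Hot edge BB->Succ (P = 0.8), cold side edges (Qout = Qin = 0.2),
  // Succ splits 60/40 between its successors U and V.
  std::cout << profitableToTailDup(0.8, 0.2, 0.2, 0.6, 0.4) << "\n"; // prints 1
  return 0;
}

With these numbers BaseCost = 1.2 and DupCost = 0.64, so duplication wins; the patch's change is precisely that the Qin term is multiplied by U rather than V.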
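Note (illustrative sketch, not part of the patch): the structural test in isLattice boils down to a simple CFG predicate. The toy version below uses a map-based graph instead of MachineBasicBlock, and omits the chain/filter bookkeeping and the requirement that each successor have at least one predecessor besides BB; all names are invented.

// lattice_predicate_sketch.cpp - standalone, not LLVM API.
#include <iostream>
#include <map>
#include <set>
#include <string>

using Block = std::string;
using CFG = std::map<Block, std::set<Block>>; // block -> successor set

static std::set<Block> predecessorsOf(const CFG &G, const Block &B) {
  std::set<Block> Preds;
  for (const auto &KV : G)
    if (KV.second.count(B))
      Preds.insert(KV.first);
  return Preds;
}

// True if every predecessor of Succs (other than BB or a member of Succs)
// branches to exactly the same successor set.
static bool isLatticeLike(const CFG &G, const Block &BB,
                          const std::set<Block> &Succs) {
  if (Succs.size() != 2)
    return false;
  for (const Block &S : Succs)
    for (const Block &P : predecessorsOf(G, S)) {
      if (P == BB || Succs.count(P))
        continue;
      if (G.at(P) != Succs)
        return false;
    }
  return true;
}

int main() {
  // BB and C both branch to {D, E}, matching the lattice diagrams above.
  CFG G = {{"BB", {"D", "E"}}, {"C", {"D", "E"}}, {"D", {}}, {"E", {}}};
  std::cout << isLatticeLike(G, "BB", {"D", "E"}) << "\n"; // prints 1
  return 0;
}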
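Note (illustrative sketch, not part of the patch): the edge selection in getBestLatticeSuccessor for the two-successor case can be shown with plain structs. It keeps the highest-frequency incoming edge of each successor and, on a conflict, demotes whichever choice keeps less total fallthrough frequency. Tail duplication and the triangle special case are omitted, and all names are invented.

// lattice_edge_pick_sketch.cpp - standalone, not LLVM API.
#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

struct Edge {
  double Freq;
  std::string Pred;
  std::string Succ;
};

// Returns the two chosen, non-conflicting fallthrough edges.
static std::pair<Edge, Edge> pickLatticeEdges(std::vector<Edge> EdgesTo0,
                                              std::vector<Edge> EdgesTo1) {
  auto ByFreq = [](const Edge &A, const Edge &B) { return A.Freq > B.Freq; };
  std::stable_sort(EdgesTo0.begin(), EdgesTo0.end(), ByFreq);
  std::stable_sort(EdgesTo1.begin(), EdgesTo1.end(), ByFreq);
  assert(EdgesTo0.size() >= 2 && EdgesTo1.size() >= 2 &&
         "a lattice gives each successor at least two incoming edges");
  Edge BestA = EdgesTo0[0], BestB = EdgesTo1[0];
  if (BestA.Pred == BestB.Pred) {
    // Conflict: keep whichever best edge pairs with the larger second best.
    if (BestA.Freq + EdgesTo1[1].Freq >= BestB.Freq + EdgesTo0[1].Freq)
      BestB = EdgesTo1[1];
    else
      BestA = EdgesTo0[1];
  }
  return {BestA, BestB};
}

int main() {
  // BB and C both branch to D and E, as in the lattice comments above.
  std::vector<Edge> ToD = {{40, "BB", "D"}, {20, "C", "D"}};
  std::vector<Edge> ToE = {{35, "BB", "E"}, {30, "C", "E"}};
  auto Picked = pickLatticeEdges(ToD, ToE);
  std::cout << Picked.first.Pred << "->" << Picked.first.Succ << ", "
            << Picked.second.Pred << "->" << Picked.second.Succ << "\n";
  // Prints "BB->D, C->E": keeping BB->D and demoting E to its second-best
  // predecessor preserves 70 units of fallthrough frequency versus 55.
  return 0;
}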