Index: include/llvm/Analysis/LoopInfoImpl.h =================================================================== --- include/llvm/Analysis/LoopInfoImpl.h +++ include/llvm/Analysis/LoopInfoImpl.h @@ -185,8 +185,13 @@ template void LoopBase:: addBasicBlockToLoop(BlockT *NewBB, LoopInfoBase &LIB) { - assert((Blocks.empty() || LIB[getHeader()] == this) && - "Incorrect LI specified for this loop!"); +#ifndef NDEBUG + if (!Blocks.empty()) { + auto SameHeader = LIB[getHeader()]; + assert(contains(SameHeader) && getHeader() == SameHeader->getHeader() + && "Incorrect LI specified for this loop!"); + } +#endif assert(NewBB && "Cannot add a null basic block to the loop!"); assert(!LIB[NewBB] && "BasicBlock already in the loop!"); Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -306,10 +306,21 @@ /// must be done inline. TailDuplicator TailDup; - /// \brief A set of blocks that are unavoidably execute, i.e. they dominate - /// all terminators of the MachineFunction. + /// \brief A set of blocks that are unavoidably executed. + /// + /// i.e. they dominate + /// all terminators of the MachineFunction. Also used within loops for blocks + /// that are unavoidable within the loop. SmallPtrSet UnavoidableBlocks; + /// \brief A set of delayed blocks for tail-duplication. + /// + /// These blocks form a second spine through a loop/function, and so + /// predecessors within this set do not need to be able to placed. + /// This allows the tail-duplicated spine (or similar cfg) to grow beyond + /// 2 blocks. See the description of canTailDuplicateAllPreds. + SmallPtrSet TailDupDelayBlocks; + /// \brief Allocator and owner of BlockChain structures. /// /// We build BlockChains lazily while processing the loop structure of @@ -389,12 +400,25 @@ const BlockFilterSet &LoopBlockSet); void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet); - void collectMustExecuteBBs(); void buildCFGChains(); void optimizeBranches(); void alignBlocks(); + /// Compute the set of blocks that are unavoidable within a loop's sub-CFG + void computeLoopUnavoidableBlocks(MachineLoop &L); + /// Compute the set of blocks that are unavoidable within a function. + void computeUnavoidableBlocks(); + /// See if Succ can tail-duplicate into all un-placed, un-filtered + /// predecessors. Excludes predecessors in TailDupDelayBlocks. + bool canTailDuplicateAllPreds(MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, + const BlockFilterSet *BlockFilter); + /// Add all un-filtered unplaced blocks that will be duplicated into to the + /// delay set. 
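+  /// For example, once canTailDuplicateAllPreds has established that Succ can
+  /// be duplicated into its remaining unplaced predecessors P1 and P2 (besides
+  /// BB), P1 and P2 are inserted into TailDupDelayBlocks so the duplicated
+  /// second spine can keep growing through them.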
+ void delayTailDuplicatedBlocks(MachineBasicBlock *BB, MachineBasicBlock *Succ, + BlockChain &Chain, + const BlockFilterSet *BlockFilter); -public: + public: static char ID; // Pass identification, replacement for typeid MachineBlockPlacement() : MachineFunctionPass(ID) { initializeMachineBlockPlacementPass(*PassRegistry::getPassRegistry()); @@ -551,6 +575,85 @@ return SuccProb; } +static bool hasSameSuccessors( + MachineBasicBlock &BB, SmallPtrSetImpl &Successors) { + if (BB.succ_size() != Successors.size()) + return false; + // We don't want to count self-loops + if (Successors.count(&BB)) + return false; + for (MachineBasicBlock *Succ : BB.successors()) + if (!Successors.count(Succ)) + return false; + return true; +} + +/// When the option TailDupPlacement is on, this method checks if the +/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated +/// into all of its unplaced, unfiltered predecessors, that are not BB. In +/// addition we keep a set of blocks that have been tail-duplicated into and +/// allow those blocks to be unplaced as well. This allows the creation of a +/// second (larger) spine and a short fallthrough spine. +/// We also identify blocks with the CFG that would have been produced by +/// tail-duplication and lay them out in the same manner. +bool MachineBlockPlacement::canTailDuplicateAllPreds( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + DEBUG(dbgs() << "Checking to see if block " << getBlockName(Succ) + << " can tail duplicate into all its predecessors.\n"); + bool IsSimple = TailDup.isSimpleBB(Succ); + + if (!TailDup.shouldTailDuplicate(*Succ->getParent(), IsSimple, *Succ)) { + DEBUG(dbgs() << "Skipping because it is " + << "not a candidate for duplication.\n"); + return false; + } + // For CFG checking. + SmallPtrSet Successors(BB->succ_begin(), BB->succ_end()); + for (MachineBasicBlock *Pred : Succ->predecessors()) { + // Make sure all unplaced and unfiltered predecessors are either part + // of the second spine, or can be tail-duplicated into. + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain) + continue; + // If Pred is part of the growing second spine, we don't need to be + // able to copy succ onto the end of it. + if (TailDupDelayBlocks.count(Pred) > 0) + continue; + if (!TailDup.canTailDuplicate(Succ, Pred)) { + DEBUG(dbgs() << "Possibly skipping because it can't be duplicated into block " + << getBlockName(Pred) << ".\n"); + // Check for #Successors > 1 to make sure we aren't just outlining in the + // triangle case. + if (Successors.size() > 1 + && hasSameSuccessors(*Pred, Successors)) { + DEBUG(dbgs() << "Not skipping because it looks like a tail-duplicated block.\n"); + continue; + } else { + DEBUG(dbgs() << "Skipping because it can't be duplicated into block " + << getBlockName(Pred) << ".\n"); + } + return false; + } + } + return true; +} + +/// Add all un-filtered unplaced blocks that will be duplicated into to the +/// delay set. 
+void MachineBlockPlacement::delayTailDuplicatedBlocks( + MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, + const BlockFilterSet *BlockFilter) { + for (MachineBasicBlock *Pred : Succ->predecessors()) { + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) + || BlockToChain[Pred] == &Chain + || TailDupDelayBlocks.count(Pred) > 0) + continue; + DEBUG(dbgs() << "Delaying block: " << getBlockName(Pred) << ".\n"); + TailDupDelayBlocks.insert(Pred); + } +} + /// When the option OutlineOptionalBranches is on, this method /// checks if the fallthrough candidate block \p Succ (of block /// \p BB) also has other unscheduled predecessor blocks which @@ -564,12 +667,20 @@ MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain, const BlockFilterSet *BlockFilter, BranchProbability SuccProb, BranchProbability HotProb) { - if (!OutlineOptionalBranches) + if (!OutlineOptionalBranches && !TailDupPlacement) return false; // If we outline optional branches, look whether Succ is unavoidable, i.e. // dominates all terminators of the MachineFunction. If it does, other // successors must be optional. Don't do this for cold branches. if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) { + bool TailDupDelay; + if (OutlineOptionalBranches) + TailDupDelay = false; + else if (TailDupPlacement + && canTailDuplicateAllPreds(BB, Succ, Chain, BlockFilter)) + TailDupDelay = true; + else + return false; for (MachineBasicBlock *Pred : Succ->predecessors()) { // Check whether there is an unplaced optional branch. if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) || @@ -582,9 +693,11 @@ if (Pred->size() < OutlineOptionalThreshold) return false; } + if (TailDupDelay) + delayTailDuplicatedBlocks(BB, Succ, Chain, BlockFilter); return true; - } else - return false; + } + return false; } // When profile is not present, return the StaticLikelyProb. @@ -808,7 +921,9 @@ BranchProbability SuccProb = getAdjustedProbability(RealSuccProb, AdjustedSumProb); - // This heuristic is off by default. + // Full outlinining is off by default. + // Tail-duplication during layout, and outlining blocks that are + // tail-duplicated into is on by default. if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb, HotProb)) return Succ; @@ -1046,6 +1161,7 @@ // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; + TailDupDelayBlocks.erase(BestSucc); // Zero out UnscheduledPredecessors for the successor we're about to merge in case // we selected a successor that didn't fit naturally into the CFG. SuccChain.UnscheduledPredecessors = 0; @@ -1465,6 +1581,80 @@ return LoopBlockSet; } + +/// \brief Finds unavoidable blocks within a loop. +/// +/// These blocks form the loop spine, and knowing which blocks they are allow +/// the loop-optional blocks to be outlined to the end of the loop, +/// unconditionally or if they can form a second tail-duped spine. +void MachineBlockPlacement::computeLoopUnavoidableBlocks(MachineLoop &L) { + SmallVector Exits; + L.getLoopLatches(Exits); + // Find the nearest common dominator of all of L's latches. 
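+  // The computation below is a left fold of findNearestCommonDominator,
+  // first over the latches and then (further down) over the non-header
+  // exiting blocks. A block dominates the resulting Dominator if and only
+  // if it dominates every latch and exit considered, so the final filter
+  // marks exactly the blocks executed on every iteration (each iteration
+  // must reach a latch) and on every exit from the loop.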
+ MachineBasicBlock *Dominator = nullptr; + for (MachineBasicBlock *MBB : Exits) { + DEBUG(dbgs() << "Block: " << getBlockName(MBB) + << " is a latch.\n"); + if (Dominator == nullptr) + Dominator = MBB; + else + Dominator = MDT->findNearestCommonDominator(Dominator, MBB); + } + + Exits.clear(); + L.getExitingBlocks(Exits); + for (MachineBasicBlock *MBB : Exits) { + DEBUG(dbgs() << "Block: " << getBlockName(MBB) + << " is a loop exit.\n"); + if (MBB == L.getHeader()) + continue; + if (Dominator == nullptr) + Dominator = MBB; + else + Dominator = MDT->findNearestCommonDominator(Dominator, MBB); + } + + // MBBs dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + for (MachineBasicBlock *MBB : L.getBlocks()) + if (MDT->dominates(MBB, Dominator)) { + DEBUG(dbgs() << "Block: " << getBlockName(MBB) + << " is loop un-avoidable.\n"); + UnavoidableBlocks.insert(MBB); + } +} + + +/// \brief Finds unavoidable blocks for the entire function +/// +/// These blocks form the spine, and knowing which blocks they are allow +/// the optional blocks to be outlined to the end of the function +/// unconditionally or if they can form a second tail-duped spine. +void MachineBlockPlacement::computeUnavoidableBlocks() { + MachineBasicBlock * Terminator = nullptr; + for (MachineBasicBlock &MBB : *F) { + if (MBB.succ_size() == 0) { + if (Terminator == nullptr) + Terminator = &MBB; + else + Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); + } + } + + // MBBs dominating this common dominator are unavoidable. + UnavoidableBlocks.clear(); + // If there are no exit blocks from the function, punt and assume that there + // are no unavoidable blocks. This will result in a linear layout. + if (Terminator == nullptr) + return; + for (MachineBasicBlock &MBB : *F) + if (MDT->dominates(&MBB, Terminator)) { + DEBUG(dbgs() << "Block: " << getBlockName(&MBB) + << " is un-avoidable.\n"); + UnavoidableBlocks.insert(&MBB); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -1481,6 +1671,13 @@ assert(EHPadWorkList.empty()); BlockFilterSet LoopBlockSet = collectLoopBlockSet(L); + // Find the unavoidable blocks within this loop. This allows partial outlining + // with tail duplication within a loop. + if (TailDupPlacement) { + computeLoopUnavoidableBlocks(L); + TailDupDelayBlocks.clear(); + } + // Check if we have profile data for this function. If yes, we will rotate // this loop by modeling costs more precisely which requires the profile data // for better layout. @@ -1559,31 +1756,6 @@ EHPadWorkList.clear(); } -/// When OutlineOpitonalBranches is on, this method collects BBs that -/// dominates all terminator blocks of the function \p F. -void MachineBlockPlacement::collectMustExecuteBBs() { - if (OutlineOptionalBranches) { - // Find the nearest common dominator of all of F's terminators. - MachineBasicBlock *Terminator = nullptr; - for (MachineBasicBlock &MBB : *F) { - if (MBB.succ_size() == 0) { - if (Terminator == nullptr) - Terminator = &MBB; - else - Terminator = MDT->findNearestCommonDominator(Terminator, &MBB); - } - } - - // MBBs dominating this common dominator are unavoidable. 
- UnavoidableBlocks.clear(); - for (MachineBasicBlock &MBB : *F) { - if (MDT->dominates(&MBB, Terminator)) { - UnavoidableBlocks.insert(&MBB); - } - } - } -} - void MachineBlockPlacement::buildCFGChains() { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. @@ -1615,9 +1787,6 @@ } } - // Turned on with OutlineOptionalBranches option - collectMustExecuteBBs(); - // Build any loop-based chains. for (MachineLoop *L : *MLI) buildLoopChains(*L); @@ -1625,6 +1794,13 @@ assert(BlockWorkList.empty()); assert(EHPadWorkList.empty()); + // This must go after the loop chains, because the loop chains compute their + // own loop-relative UnavoidableBlocks + if (OutlineOptionalBranches || TailDupPlacement) { + computeUnavoidableBlocks(); + TailDupDelayBlocks.clear(); + } + SmallPtrSet UpdatedPreds; for (MachineBasicBlock &MBB : *F) fillWorkLists(&MBB, UpdatedPreds); @@ -1963,15 +2139,14 @@ /*CommonHoist=*/false, *MBFI, *MBPI); - DEBUG(MF.dump()); if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), getAnalysisIfAvailable(), MLI, /*AfterBlockPlacement=*/true)) { // Redo the layout if tail merging creates/removes/moves blocks. - DEBUG(MF.dump()); BlockToChain.clear(); // Must redo the dominator tree if blocks were changed. MDT->runOnMachineFunction(MF); + BlockToChain.clear(); ChainAllocator.DestroyAll(); buildCFGChains(); } Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll =================================================================== --- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll +++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll @@ -664,12 +664,12 @@ ; No realignment in the prologue. ; CHECK-NOT: and ; CHECK-NOT: 0xffffffffffffffe0 -; CHECK: tbz {{.*}} .[[LABEL:.*]] +; CHECK: tbnz {{.*}} .[[LABEL:.*]] +; CHECK: ret +; CHECK: .[[LABEL]]: ; Stack is realigned in a non-entry BB. ; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 -; CHECK: .[[LABEL]]: -; CHECK: ret define void @realign_conditional2(i1 %b) { @@ -687,15 +687,15 @@ ; CHECK-LABEL: realign_conditional2 ; Extra realignment in the prologue (performance issue). -; CHECK: tbz {{.*}} .[[LABEL:.*]] +; CHECK: tbnz {{.*}} .[[LABEL:.*]] +; CHECK: ret +; CHECK: .[[LABEL]]: ; CHECK: sub x9, sp, #32 // =32 ; CHECK: and sp, x9, #0xffffffffffffffe0 ; CHECK: mov x19, sp ; Stack is realigned in a non-entry BB. 
; CHECK: sub [[REG:x[01-9]+]], sp, #64 ; CHECK: and sp, [[REG]], #0xffffffffffffffe0 -; CHECK: .[[LABEL]]: -; CHECK: ret attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } Index: test/CodeGen/AArch64/arm64-atomic.ll =================================================================== --- test/CodeGen/AArch64/arm64-atomic.ll +++ test/CodeGen/AArch64/arm64-atomic.ll @@ -9,10 +9,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -27,10 +27,12 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: mov x0, x[[ADDR]] +; CHECK-NEXT: ret %new = load i32, i32* %pnew %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire %val = extractvalue { i32, i1 } %pair, 0 @@ -41,15 +43,15 @@ ; CHECK-LABEL: val_compare_and_swap_rel: ; CHECK-NEXT: mov x[[ADDR:[0-9]+]], x0 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]: -; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]] +; CHECK-NEXT: ldaxr [[RESULT:w[0-9]+]], [x[[ADDR]]] ; CHECK-NEXT: cmp [[RESULT]], w1 ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] -; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]] +; CHECK-NEXT: stlxr [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic %val = extractvalue { i32, i1 } %pair, 0 ret i32 %val @@ -64,10 +66,10 @@ ; CHECK-NEXT: b.ne [[FAILBB:.?LBB[0-9_]+]] ; CHECK-NEXT: stxr [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]] ; CHECK-NEXT: cbnz [[SCRATCH_REG]], [[TRYBB]] -; CHECK-NEXT: b [[EXITBB:.?LBB[0-9_]+]] +; CHECK-NEXT: ret ; CHECK-NEXT: [[FAILBB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[EXITBB]]: +; CHECK-NEXT: ret %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic %val = extractvalue { i64, i1 } %pair, 0 ret i64 %val Index: test/CodeGen/AArch64/arm64-ccmp.ll =================================================================== --- test/CodeGen/AArch64/arm64-ccmp.ll +++ test/CodeGen/AArch64/arm64-ccmp.ll @@ -51,7 +51,7 @@ ; CHECK: cmp ; CHECK: b.eq ; CHECK: cmp -; CHECK: b.gt +; CHECK: b.le define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp eq i32 %a, 5 @@ -78,7 +78,7 @@ ; CHECK: cmp ; CHECK: b.eq ; CHECK: cmp -; CHECK: tbz +; CHECK: tbnz define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp { entry: %cmp = icmp eq i32 %a, 5 Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll =================================================================== --- 
test/CodeGen/AArch64/arm64-shrink-wrapping.ll +++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll @@ -10,9 +10,11 @@ ; Compare the arguments and jump to exit. ; No prologue needed. ; ENABLE: cmp w0, w1 -; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: b.lt [[PROLOGUE_LABEL:LBB[0-9_]+]] +; ENABLE: ret ; ; Prologue code. +; ENABLE: [[PROLOGUE_LABEL]]: ; CHECK: sub sp, sp, #32 ; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16] ; CHECK-NEXT: add [[SAVE_SP]], sp, #16 @@ -37,7 +39,6 @@ ; CHECK-NEXT: add sp, sp, #32 ; ; With shrink-wrapping, exit block is a simple return. -; ENABLE: [[EXIT_LABEL]]: ; CHECK-NEXT: ret define i32 @foo(i32 %a, i32 %b) { %tmp = alloca i32, align 4 Index: test/CodeGen/AArch64/branch-relax-bcc.ll =================================================================== --- test/CodeGen/AArch64/branch-relax-bcc.ll +++ test/CodeGen/AArch64/branch-relax-bcc.ll @@ -35,22 +35,20 @@ ; CHECK-LABEL: _block_split: ; CHECK: cmp w0, #5 -; CHECK-NEXT: b.eq [[LONG_BR_BB:LBB[0-9]+_[0-9]+]] -; CHECK-NEXT: b [[LOR_LHS_FALSE_BB:LBB[0-9]+_[0-9]+]] - -; CHECK: [[LONG_BR_BB]]: +; CHECK-NEXT: b.ne [[LOR_LHS_FALSE_BB:LBB[0-9]+_[0-9]+]] ; CHECK-NEXT: b [[IF_THEN_BB:LBB[0-9]+_[0-9]+]] ; CHECK: [[LOR_LHS_FALSE_BB]]: ; CHECK: cmp w{{[0-9]+}}, #16 ; CHECK-NEXT: b.le [[IF_THEN_BB]] -; CHECK-NEXT: b [[IF_END_BB:LBB[0-9]+_[0-9]+]] -; CHECK: [[IF_THEN_BB]]: +; CHECK: ; %if.end +; CHECK: #0x7 +; CHECK: ret + +; CHECK: [[IF_THEN_BB]] ; CHECK: bl _foo ; CHECK-NOT: b L - -; CHECK: [[IF_END_BB]]: ; CHECK: #0x7 ; CHECK: ret define i32 @block_split(i32 %a, i32 %b) #0 { Index: test/CodeGen/AArch64/combine-comparisons-by-cse.ll =================================================================== --- test/CodeGen/AArch64/combine-comparisons-by-cse.ll +++ test/CodeGen/AArch64/combine-comparisons-by-cse.ll @@ -264,9 +264,9 @@ define i32 @do_nothing_if_resultant_opcodes_would_differ() #0 { ; CHECK-LABEL: do_nothing_if_resultant_opcodes_would_differ ; CHECK: cmn -; CHECK: b.gt +; CHECK-NEXT: b.le ; CHECK: cmp -; CHECK: b.gt +; CHECK-NEXT: b.le entry: %0 = load i32, i32* @a, align 4 %cmp4 = icmp slt i32 %0, -1 Index: test/CodeGen/AArch64/fcmp.ll =================================================================== --- test/CodeGen/AArch64/fcmp.ll +++ test/CodeGen/AArch64/fcmp.ll @@ -31,7 +31,7 @@ %tst4 = fcmp uge float %a, -0.0 br i1 %tst4, label %t5, label %end ; CHECK-NOT: fcmp {{s[0-9]+}}, #0.0 -; CHECK: b.mi .LBB +; CHECK: b.pl .LBB t5: call void @bar(i32 0) @@ -70,7 +70,7 @@ %tst4 = fcmp uge double %a, -0.0 br i1 %tst4, label %t5, label %end ; CHECK-NOT: fcmp {{d[0-9]+}}, #0.0 -; CHECK: b.mi .LBB +; CHECK: b.pl .LBB t5: call void @bar(i32 0) Index: test/CodeGen/AArch64/rm_redundant_cmp.ll =================================================================== --- test/CodeGen/AArch64/rm_redundant_cmp.ll +++ test/CodeGen/AArch64/rm_redundant_cmp.ll @@ -13,7 +13,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.gt ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2 %1 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2 @@ -69,7 +69,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.hi ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i16, i16* getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2 %1 = load i16, i16* 
getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2 @@ -134,7 +134,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.gt ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2 %1 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2 @@ -190,7 +190,7 @@ ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}} ; CHECK-NEXT: b.hi ; CHECK-NOT: cmp -; CHECK: b.ne +; CHECK: b.eq entry: %0 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2 %1 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2 Index: test/CodeGen/AArch64/tbz-tbnz.ll =================================================================== --- test/CodeGen/AArch64/tbz-tbnz.ll +++ test/CodeGen/AArch64/tbz-tbnz.ll @@ -10,7 +10,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -28,7 +28,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -46,7 +46,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbnz [[CMP]], #31 +; CHECK: tbz [[CMP]], #31 if.then: call void @t() @@ -64,7 +64,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbnz [[CMP]], #63 +; CHECK: tbz [[CMP]], #63 if.then: call void @t() @@ -82,7 +82,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbnz [[CMP]], #31 +; CHECK: tbz [[CMP]], #31 if.then: call void @t() @@ -100,7 +100,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:x[0-9]+]], x0, #12 -; CHECK: tbnz [[CMP]], #63 +; CHECK: tbz [[CMP]], #63 if.then: call void @t() @@ -118,7 +118,7 @@ br i1 %cmp, label %if.then, label %if.end ; CHECK: sub [[CMP:w[0-9]+]], w0, #12 -; CHECK: tbz [[CMP]], #31 +; CHECK: tbnz [[CMP]], #31 if.then: call void @t() @@ -162,7 +162,7 @@ br i1 %tst4, label %if.then4, label %if.end ; CHECK: tst x0, x1, lsl #62 -; CHECK: b.lt +; CHECK: b.ge if.then4: call void @t() @@ -178,7 +178,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -194,7 +194,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -209,7 +209,7 @@ ; CHECK: ldr [[CMP:x[0-9]+]], [x1] ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 %val = load i64, i64* %ptr %tst = icmp slt i64 %val, 0 @@ -229,7 +229,7 @@ br i1 %tst, label %if.then, label %if.end ; CHECK-NOT: cmp -; CHECK: tbz x0, #63 +; CHECK: tbnz x0, #63 if.then: call void @t() @@ -247,7 +247,7 @@ ; CHECK: orr [[CMP:x[0-9]+]], x0, x1 ; CHECK-NOT: cmp -; CHECK: tbz [[CMP]], #63 +; CHECK: tbnz [[CMP]], #63 if.then: call void @t() @@ -262,7 +262,7 @@ br i1 %cond, label %if.end, label %if.then ; CHECK-NOT: and -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() @@ -278,7 +278,7 @@ br i1 %cond1, label %if.then, label %if.end ; CHECK-NOT: movn -; CHECK: tbnz w0, #0 +; CHECK: tbz w0, #0 if.then: call void @t() @@ -296,7 +296,7 @@ br i1 %cond, label %then, label %end ; 
CHECK-NOT: lsl -; CHECK: tbnz w0, #2 +; CHECK: tbz w0, #2 then: call void @t() @@ -314,7 +314,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: lsr -; CHECK: tbnz w0, #3 +; CHECK: tbz w0, #3 then: call void @t() @@ -331,7 +331,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: asr -; CHECK: tbnz w0, #31 +; CHECK: tbz w0, #31 then: call void @t() @@ -350,7 +350,7 @@ br i1 %cond, label %then, label %end ; CHECK-NOT: ubfx -; CHECK: tbnz w0, #3 +; CHECK: tbz w0, #3 then: call void @t() Index: test/CodeGen/AMDGPU/cf-loop-on-constant.ll =================================================================== --- test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -2,11 +2,11 @@ ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s ; GCN-LABEL: {{^}}test_loop: -; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: +; GCN: s_endpgm +; GCN: [[LABEL:BB[0-9+]_[0-9]+]]: ; %for.body{{$}} ; GCN: ds_read_b32 ; GCN: ds_write_b32 ; GCN: s_branch [[LABEL]] -; GCN: s_endpgm define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind { entry: %cmp = icmp eq i32 %n, -1 Index: test/CodeGen/AMDGPU/convergent-inlineasm.ll =================================================================== --- test/CodeGen/AMDGPU/convergent-inlineasm.ll +++ test/CodeGen/AMDGPU/convergent-inlineasm.ll @@ -6,6 +6,8 @@ ; GCN: v_cmp_ne_i32_e64 ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: +; GCN: BB{{[0-9]+_[0-9]+}}: +; GCN: s_endpgm define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() @@ -26,9 +28,12 @@ ; GCN: ; mask branch ; GCN: BB{{[0-9]+_[0-9]+}}: -; GCN: v_cmp_ne_i32_e64 +; GCN: s_endpgm ; GCN: BB{{[0-9]+_[0-9]+}}: +; GCN: v_cmp_ne_i32_e64 +; GCN: s_endpgm + define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) { bb: %tmp = call i32 @llvm.amdgcn.workitem.id.x() Index: test/CodeGen/AMDGPU/salu-to-valu.ll =================================================================== --- test/CodeGen/AMDGPU/salu-to-valu.ll +++ test/CodeGen/AMDGPU/salu-to-valu.ll @@ -437,11 +437,12 @@ ; GCN: s_load_dword [[SGPR:s[0-9]+]] ; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}} ; GCN: s_and_b64 vcc, exec, vcc -; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]] +; GCN: s_cbranch_vccz [[SUCCESS:[A-Z0-9_]+]] +; GCN: s_endpgm +; GCN: {{^}}[[SUCCESS]]: ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; GCN-NOHSA: buffer_store_dword [[ONE]] ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]] -; GCN; {{^}}[[EXIT]]: ; GCN: s_endpgm define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) { bb3: ; preds = %bb2 Index: test/CodeGen/AMDGPU/si-annotate-cf.ll =================================================================== --- test/CodeGen/AMDGPU/si-annotate-cf.ll +++ test/CodeGen/AMDGPU/si-annotate-cf.ll @@ -93,13 +93,13 @@ ; SI-NEXT: s_cbranch_scc0 [[ENDPGM:BB[0-9]+_[0-9]+]] ; SI: s_cmp_gt_i32 -; SI-NEXT: s_cbranch_scc1 [[ENDPGM]] - -; SI: [[INFLOOP:BB[0-9]+_[0-9]+]] -; SI: s_branch [[INFLOOP]] +; SI-NEXT: s_cbranch_scc0 [[INFLOOP:BB[0-9]+_[0-9]+]] ; SI: [[ENDPGM]]: ; SI: s_endpgm + +; SI: [[INFLOOP]] +; SI: s_branch [[INFLOOP]] define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind { entry: %cmp = icmp sgt i32 %c0, 0 Index: test/CodeGen/AMDGPU/skip-if-dead.ll =================================================================== --- test/CodeGen/AMDGPU/skip-if-dead.ll +++ test/CodeGen/AMDGPU/skip-if-dead.ll @@ -268,14 +268,16 @@ ; CHECK: [[PHIBB]]: ; CHECK: 
v_cmp_eq_f32_e32 vcc, 0, [[PHIREG]] ; CHECK: s_and_b64 vcc, exec, vcc -; CHECK: s_cbranch_vccz [[ENDBB:BB[0-9]+_[0-9]+]] +; CHECK: s_cbranch_vccnz [[BB3:BB[0-9]+_[0-9]+]] -; CHECK: ; BB#3: ; %bb10 +; CHECK: ; %end +; CHECK-NEXT: s_endpgm + +; CHECK: [[BB3]]: ; %bb10 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 9 ; CHECK: buffer_store_dword - -; CHECK: [[ENDBB]]: ; CHECK-NEXT: s_endpgm + define amdgpu_ps void @phi_use_def_before_kill() #0 { bb: %tmp = fadd float undef, 1.000000e+00 Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll =================================================================== --- test/CodeGen/AMDGPU/smrd-vccz-bug.ll +++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll @@ -9,9 +9,10 @@ ; GCN: s_waitcnt lgkmcnt(0) ; VCCZ-BUG: s_mov_b64 vcc, vcc ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc -; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]] +; GCN: s_cbranch_vccz [[SUCCESS:[0-9A-Za-z_]+]] +; GCN: s_endpgm +; GCN: [[SUCCESS]]: ; GCN: buffer_store_dword -; GCN: [[EXIT]]: ; GCN: s_endpgm define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) { entry: Index: test/CodeGen/AMDGPU/uniform-cfg.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-cfg.ll +++ test/CodeGen/AMDGPU/uniform-cfg.ll @@ -121,9 +121,10 @@ ; be selected for the SALU and then later moved to the VALU. ; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]] ; SI: s_and_b64 vcc, exec, [[COND]] -; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: buffer_store_dword -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) { entry: @@ -146,9 +147,10 @@ ; be selected for the SALU and then later moved to the VALU. 
; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]] ; SI: s_and_b64 vcc, exec, [[COND]] -; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: buffer_store_dword -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) { entry: @@ -231,9 +233,10 @@ ; SI-LABEL: {{^}}icmp_2_users: ; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]] +; SI: s_cbranch_scc0 [[SUCCESS:[a-zA-Z0-9_]+]] +; SI: s_endpgm +; SI: [[SUCCESS]]: ; SI: buffer_store_dword -; SI: [[LABEL]]: ; SI: s_endpgm define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: @@ -255,9 +258,10 @@ ; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]] ; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]] ; SI: s_and_b64 vcc, exec, [[MASK]] -; SI: s_cbranch_vccnz [[EXIT]] +; SI: s_cbranch_vccz [[SUCCESS:[a-zA-Z0-9_]+]] +; SI: s_endpgm +; SI: {{^}}[[SUCCESS]]: ; SI: buffer_store -; SI: {{^}}[[EXIT]]: ; SI: s_endpgm define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) { bb: @@ -334,13 +338,14 @@ ; SI-LABEL: {{^}}divergent_inside_uniform: ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 -; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]] +; SI: s_cbranch_scc0 [[SUCCESS_LABEL:[0-9_A-Za-z]+]] +; SI: s_endpgm +; SI: [[SUCCESS_LABEL]]: ; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}} ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]] ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1 ; SI: buffer_store_dword [[ONE]] -; SI: [[ENDIF_LABEL]]: ; SI: s_endpgm define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) { entry: @@ -369,10 +374,11 @@ ; SI: buffer_store_dword [[ONE]] ; SI: s_or_b64 exec, exec, [[MASK]] ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0 -; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]] +; SI: s_cbranch_scc0 [[THREE:[A-Z0-9_]+]] +; SI: s_endpgm +; SI: [[THREE]]: ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2 ; SI: buffer_store_dword [[TWO]] -; SI: [[EXIT]]: ; SI: s_endpgm define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) { entry: Index: test/CodeGen/AMDGPU/uniform-crash.ll =================================================================== --- test/CodeGen/AMDGPU/uniform-crash.ll +++ test/CodeGen/AMDGPU/uniform-crash.ll @@ -3,9 +3,10 @@ ; GCN-LABEL: {{^}}icmp_2_users: ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1 -; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]] +; GCN: s_cbranch_scc0 [[LABEL:BB[0-9_A-Z]+]] +; GCN: s_endpgm ; GCN: [[LABEL]]: -; GCN-NEXT: s_endpgm +; GCN: s_endpgm define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) { main_body: %0 = icmp sgt i32 %cond, 0 Index: test/CodeGen/AMDGPU/valu-i1.ll =================================================================== --- test/CodeGen/AMDGPU/valu-i1.ll +++ test/CodeGen/AMDGPU/valu-i1.ll @@ -116,9 +116,13 @@ ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]] ; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]] +; SI: s_branch [[LABEL_PREHEADER:BB[0-9]+_[0-9]+]] + +; SI: [[LABEL_EXIT]]: +; SI: s_endpgm ; Initialize inner condition to false -; SI: BB{{[0-9]+_[0-9]+}}: ; %bb10.preheader +; SI: [[LABEL_PREHEADER]]: ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}} ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]] @@ -146,8 +150,6 @@ ; SI: BB#5 ; SI: s_or_b64 exec, exec, [[COND_STATE]] - -; SI: 
[[LABEL_EXIT]]: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm Index: test/CodeGen/ARM/2013-05-05-IfConvertBug.ll =================================================================== --- test/CodeGen/ARM/2013-05-05-IfConvertBug.ll +++ test/CodeGen/ARM/2013-05-05-IfConvertBug.ll @@ -112,15 +112,17 @@ ; CHECK-NEXT: subs [[REG:r[0-9]+]], #120 ; CHECK-NEXT: cmp [[REG]], r1 ; CHECK-NOT: it lt -; CHECK-NEXT: bge [[LABEL:.+]] +; CHECK-NEXT: blt [[LABEL:.+]] ; Next BB +; CHECK: subs r0, r1, r0 +; CHECK-NEXT: bx lr +; Next BB +; CHECK: [[LABEL]]: ; CHECK-NOT: cmplt ; CHECK: cmp r0, #119 ; CHECK-NEXT: itt le ; CHECK-NEXT: addle r0, r1, #1 ; CHECK-NEXT: bxle lr -; Next BB -; CHECK: [[LABEL]]: ; CHECK-NEXT: subs r0, r1, r0 ; CHECK-NEXT: bx lr Index: test/CodeGen/ARM/arm-shrink-wrapping.ll =================================================================== --- test/CodeGen/ARM/arm-shrink-wrapping.ll +++ test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -23,9 +23,11 @@ ; Compare the arguments and jump to exit. ; No prologue needed. ; ENABLE: cmp r0, r1 -; ENABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]] +; ENABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]] +; ENABLE: bx lr ; ; Prologue code. +; ENABLE: [[SUCCESS_LABEL]]: ; CHECK: push {r7, lr} ; CHECK-NEXT: mov r7, sp ;; @@ -33,8 +35,12 @@ ; After the prologue is set. ; DISABLE: sub sp ; DISABLE: cmp r0, r1 -; DISABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]] +; DISABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]] +; ARM-DISABLE: mov sp, r7 +; THUMB-DISABLE: add sp, +; DISABLE-NEXT: pop {r7, pc} ; +; DISABLE: [[SUCCESS_LABEL]]: ; Store %a in the alloca. ; ARM-ENABLE: push {r0} ; THUMB-ENABLE: str r0, [sp, #-4] @@ -50,9 +56,8 @@ ; THUMB-ENABLE-NEXT: add sp, #4 ; ENABLE-NEXT: pop{{(\.w)?}} {r7, lr} ; -; CHECK: [[EXIT_LABEL]]: -; -; Without shrink-wrapping, epilogue is in the exit block. +; Late stage tail-duplication removes the exit label with shrink-wrapping. +; Without shrink-wrapping, epilogue is before the return. ; Epilogue code. (What we pop does not matter.) ; ARM-DISABLE: mov sp, r7 ; THUMB-DISABLE: add sp, @@ -388,9 +393,9 @@ ; ; Next BB. ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body -; ARM: subs [[IV]], [[IV]], #1 -; THUMB: subs [[IV]], #1 -; CHECK: add{{(\.w)?}} r4, r4, #1 +; ARM-DAG: subs [[IV]], [[IV]], #1 +; THUMB-DAG: subs [[IV]], #1 +; CHECK-DAG: add{{(\.w)?}} r4, r4, #1 ; CHECK: bne [[LOOP]] ; ; Next BB. 
Index: test/CodeGen/ARM/atomic-cmpxchg.ll =================================================================== --- test/CodeGen/ARM/atomic-cmpxchg.ll +++ test/CodeGen/ARM/atomic-cmpxchg.ll @@ -72,11 +72,11 @@ ; CHECK-ARMV7-NEXT: mov [[RES:r[0-9]+]], #1 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0 ; CHECK-ARMV7-NEXT: bne [[TRY]] -; CHECK-ARMV7-NEXT: b [[END:.LBB[0-9_]+]] +; CHECK-ARMV7-NEXT: mov r0, [[RES]] +; CHECK-ARMV7-NEXT: bx lr ; CHECK-ARMV7-NEXT: [[FAIL]]: ; CHECK-ARMV7-NEXT: clrex ; CHECK-ARMV7-NEXT: mov [[RES]], #0 -; CHECK-ARMV7-NEXT: [[END]]: ; CHECK-ARMV7-NEXT: mov r0, [[RES]] ; CHECK-ARMV7-NEXT: bx lr Index: test/CodeGen/ARM/atomic-op.ll =================================================================== --- test/CodeGen/ARM/atomic-op.ll +++ test/CodeGen/ARM/atomic-op.ll @@ -297,10 +297,10 @@ ; CHECK: strex [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]] ; CHECK: cmp [[SUCCESS]], #0 ; CHECK: bne [[LOOP_BB]] -; CHECK: b [[END_BB:\.?LBB[0-9]+_[0-9]+]] +; CHECK: dmb ish +; CHECK: bx lr ; CHECK: [[FAIL_BB]]: ; CHECK-NEXT: clrex -; CHECK-NEXT: [[END_BB]]: ; CHECK: dmb ish ; CHECK: bx lr Index: test/CodeGen/ARM/atomic-ops-v8.ll =================================================================== --- test/CodeGen/ARM/atomic-ops-v8.ll +++ test/CodeGen/ARM/atomic-ops-v8.ll @@ -1045,20 +1045,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i8 %old } @@ -1078,20 +1079,21 @@ ; function there. ; CHECK-ARM-NEXT: cmp r[[OLD]], r0 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK-ARM: mov r0, r[[OLD]] +; CHECK: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK-ARM: mov r0, r[[OLD]] +; CHECK-ARM-NEXT: bx lr ret i16 %old } @@ -1110,20 +1112,21 @@ ; r0 below is a reasonable guess but could change: it certainly comes into the ; function there. ; CHECK-NEXT: cmp r[[OLD]], r0 -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r1 is a reasonable guess. 
; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-NEXT: bx lr ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr ; CHECK: str{{(.w)?}} r[[OLD]], +; CHECK-ARM-NEXT: bx lr ret void } @@ -1148,16 +1151,16 @@ ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]] ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]] -; CHECK-NEXT: bne .LBB{{[0-9]+}}_3 +; CHECK-NEXT: bne .LBB{{[0-9]+}}_4 ; CHECK-NEXT: BB#2: ; As above, r2, r3 is a reasonable guess. ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]] ; CHECK-NEXT: cmp [[STATUS]], #0 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1 -; CHECK-NEXT: b .LBB{{[0-9]+}}_4 -; CHECK-NEXT: .LBB{{[0-9]+}}_3: -; CHECK-NEXT: clrex +; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]] +; CHECK-NEXT: pop ; CHECK-NEXT: .LBB{{[0-9]+}}_4: +; CHECK-NEXT: clrex ; CHECK-NOT: dmb ; CHECK-NOT: mcr Index: test/CodeGen/ARM/fold-stack-adjust.ll =================================================================== --- test/CodeGen/ARM/fold-stack-adjust.ll +++ test/CodeGen/ARM/fold-stack-adjust.ll @@ -135,7 +135,7 @@ ; Important to check for beginning of basic block, because if it gets ; if-converted the test is probably no longer checking what it should. -; CHECK: {{LBB[0-9]+_2}}: +; CHECK: %end ; CHECK-NEXT: vpop {d7, d8} ; CHECK-NEXT: pop {r4, pc} Index: test/CodeGen/ARM/machine-cse-cmp.ll =================================================================== --- test/CodeGen/ARM/machine-cse-cmp.ll +++ test/CodeGen/ARM/machine-cse-cmp.ll @@ -52,7 +52,7 @@ ; CHECK-LABEL: f3: ; CHECK-NOT: sub ; CHECK: cmp -; CHECK: blt +; CHECK: bge %0 = load i32, i32* %offset, align 4 %cmp = icmp slt i32 %0, %size %s = sub nsw i32 %0, %size Index: test/CodeGen/Mips/llvm-ir/ashr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/ashr.ll +++ test/CodeGen/Mips/llvm-ir/ashr.ll @@ -83,20 +83,23 @@ ; M2: srav $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: nop + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] + ; M2: beqz $[[T1]], $[[EXIT]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] - ; M2: nop - ; M2: sra $2, $4, 31 ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: sra $2, $4, 31 ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -167,20 +170,23 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrav $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], .[[BB0:LBB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: bnez $[[T3]], .[[BB1:LBB[0-9_]+]] + ; M3: nop + ; M3: .[[EXIT:LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: .[[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] + ; M3: beqz $[[T3]], .[[EXIT]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: [[BB0]]: - ; M3: 
beqz $[[T3]], [[BB1:.LBB[0-9_]+]] - ; M3: nop - ; M3: dsra $2, $4, 63 - ; M3: [[BB1]]: + ; M3: .[[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: dsra $2, $4, 63 ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/lshr.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/lshr.ll +++ test/CodeGen/Mips/llvm-ir/lshr.ll @@ -81,20 +81,24 @@ ; M2: srlv $[[T0:[0-9]+]], $4, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $3, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $2, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: srlv $[[T2:[0-9]+]], $5, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: sll $[[T4:[0-9]+]], $4, 1 ; M2: sllv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $3, $[[T3]], $[[T2]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $2, $zero, 0 - ; M2: move $2, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $2, $[[T0]] ; 32R1-R5: srlv $[[T0:[0-9]+]], $5, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -158,20 +162,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsrlv $[[T1:[0-9]+]], $4, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], .[[BB0:LBB[0-9_]+]] ; M3: move $3, $[[T1]] + ; M3: beqz $[[T3]], .[[BB1:LBB[0-9_]+]] + ; M3: daddiu $2, $zero, 0 + ; M3: .[[EXIT:LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: .[[BB0]]: ; M3: dsrlv $[[T4:[0-9]+]], $5, $7 ; M3: dsll $[[T5:[0-9]+]], $4, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsllv $[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $3, $[[T7]], $[[T4]] - ; M3: [[BB0]]: - ; M3: bnez $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: bnez $[[T3]], .[[EXIT]] ; M3: daddiu $2, $zero, 0 - ; M3: move $2, $[[T1]] ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $2, $[[T1]] ; GP64-NOT-R6: dsrlv $[[T0:[0-9]+]], $5, $7 ; GP64-NOT-R6: dsll $[[T1:[0-9]+]], $4, 1 Index: test/CodeGen/Mips/llvm-ir/shl.ll =================================================================== --- test/CodeGen/Mips/llvm-ir/shl.ll +++ test/CodeGen/Mips/llvm-ir/shl.ll @@ -97,20 +97,24 @@ ; M2: sllv $[[T0:[0-9]+]], $5, $7 ; M2: andi $[[T1:[0-9]+]], $7, 32 - ; M2: bnez $[[T1]], $[[BB0:BB[0-9_]+]] + ; M2: beqz $[[T1]], $[[BB0:BB[0-9_]+]] ; M2: move $2, $[[T0]] + ; M2: beqz $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: addiu $3, $zero, 0 + ; M2: $[[EXIT:BB[0-9_]+]]: + ; M2: jr $ra + ; M2: nop + ; M2: $[[BB0]]: ; M2: sllv $[[T2:[0-9]+]], $4, $7 ; M2: not $[[T3:[0-9]+]], $7 ; M2: srl $[[T4:[0-9]+]], $5, 1 ; M2: srlv $[[T5:[0-9]+]], $[[T4]], $[[T3]] ; M2: or $2, $[[T2]], $[[T3]] - ; M2: $[[BB0]]: - ; M2: bnez $[[T1]], $[[BB1:BB[0-9_]+]] + ; M2: bnez $[[T1]], $[[EXIT]] ; M2: addiu $3, $zero, 0 - ; M2: move $3, $[[T0]] ; M2: $[[BB1]]: ; M2: jr $ra - ; M2: nop + ; M2: move $3, $[[T0]] ; 32R1-R5: sllv $[[T0:[0-9]+]], $4, $7 ; 32R1-R5: not $[[T1:[0-9]+]], $7 @@ -174,20 +178,24 @@ ; M3: sll $[[T0:[0-9]+]], $7, 0 ; M3: dsllv $[[T1:[0-9]+]], $5, $7 ; M3: andi $[[T2:[0-9]+]], $[[T0]], 64 - ; M3: bnez $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]] + ; M3: beqz $[[T3:[0-9]+]], .[[BB0:LBB[0-9_]+]] ; M3: move $2, $[[T1]] + ; M3: beqz $[[T3]], .[[BB1:LBB[0-9_]+]] + ; M3: daddiu $3, $zero, 0 + ; M3: .[[EXIT:LBB[0-9_]+]]: + ; M3: jr $ra + ; M3: nop + ; M3: .[[BB0]]: ; M3: dsllv $[[T4:[0-9]+]], $4, $7 ; M3: dsrl $[[T5:[0-9]+]], $5, 1 ; M3: not $[[T6:[0-9]+]], $[[T0]] ; M3: dsrlv 
$[[T7:[0-9]+]], $[[T5]], $[[T6]] ; M3: or $2, $[[T4]], $[[T7]] - ; M3: [[BB0]]: - ; M3: bnez $[[T3]], [[BB1:\.LBB[0-9_]+]] + ; M3: bnez $[[T3]], .[[EXIT]] ; M3: daddiu $3, $zero, 0 - ; M3: move $3, $[[T1]] ; M3: [[BB1]]: ; M3: jr $ra - ; M3: nop + ; M3: move $3, $[[T1]] ; GP64-NOT-R6: dsllv $[[T0:[0-9]+]], $4, $7 ; GP64-NOT-R6: dsrl $[[T1:[0-9]+]], $5, 1 Index: test/CodeGen/Mips/longbranch.ll =================================================================== --- test/CodeGen/Mips/longbranch.ll +++ test/CodeGen/Mips/longbranch.ll @@ -84,7 +84,7 @@ ; Check the MIPS64 version. ; N64: lui $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test1))) -; N64: bnez $4, [[BB0:\.LBB[0-9_]+]] +; N64: beqz $4, .[[EXIT:LBB[0-9_]+]] ; N64: daddu $[[R1:[0-9]+]], $[[R0]], $25 ; Check for long branch expansion: @@ -100,14 +100,15 @@ ; N64-NEXT: jr $1 ; N64-NEXT: daddiu $sp, $sp, 16 -; N64: [[BB0]]: +; N64: [[EXIT]]: +; N64: jr $ra +; N64: nop +; N64: [[BB2]]: ; N64: daddiu $[[GP:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test1))) ; N64: ld $[[R2:[0-9]+]], %got_disp(x)($[[GP]]) ; N64: addiu $[[R3:[0-9]+]], $zero, 1 -; N64: sw $[[R3]], 0($[[R2]]) -; N64: [[BB2]]: ; N64: jr $ra -; N64: nop +; N64: sw $[[R3]], 0($[[R2]]) ; In MIPS64R6 JR is an alias to JALR with $rd=0. As everything else remains the ; same with the N64 prefix, we use -asm-show-inst in order to make sure that Index: test/CodeGen/PowerPC/bdzlr.ll =================================================================== --- test/CodeGen/PowerPC/bdzlr.ll +++ test/CodeGen/PowerPC/bdzlr.ll @@ -53,13 +53,15 @@ ; CHECK: @lua_xmove ; CHECK: bnelr -; CHECK: bnelr +; CHECK: beq +; CHECK: blr ; CHECK: bdzlr ; CHECK-NOT: blr ; CHECK-CRB: @lua_xmove ; CHECK-CRB: bclr 12, -; CHECK-CRB: bclr 12, +; CHECK-CRB: bc 4, +; CHECK-CRB: blr ; CHECK-CRB: bdzlr ; CHECK-CRB-NOT: blr } Index: test/CodeGen/PowerPC/tail-dup-layout.ll =================================================================== --- test/CodeGen/PowerPC/tail-dup-layout.ll +++ test/CodeGen/PowerPC/tail-dup-layout.ll @@ -1,25 +1,25 @@ -; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s +; RUN: llc -O2 < %s | FileCheck %s target datalayout = "e-m:e-i64:64-n32:64" target triple = "powerpc64le-grtev4-linux-gnu" ; Intended layout: -; The outlining flag produces the layout +; The chain-based outlining produces the layout ; test1 ; test2 ; test3 ; test4 -; exit ; optional1 ; optional2 ; optional3 ; optional4 +; exit ; Tail duplication puts test n+1 at the end of optional n ; so optional1 includes a copy of test2 at the end, and branches ; to test3 (at the top) or falls through to optional 2. -; The CHECK statements check for the whole string of tests and exit block, +; The CHECK statements check for the whole string of tests ; and then check that the correct test has been duplicated into the end of ; the optional blocks and that the optional blocks are in the correct order. -;CHECK-LABEL: f: +;CHECK-LABEL: straight_test: ; test1 may have been merged with entry ;CHECK: mr [[TAGREG:[0-9]+]], 3 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 @@ -33,8 +33,7 @@ ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 ;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]] -;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit -;CHECK: blr +;CHECK-NEXT: b [[EXITLABEL:[._0-9A-Za-z]+]] ;CHECK-NEXT: [[OPT1LABEL]] ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 ;CHECK-NEXT: beq 0, [[TEST3LABEL]] @@ -45,9 +44,10 @@ ;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 ;CHECK-NEXT: beq 0, [[EXITLABEL]] ;CHECK-NEXT: [[OPT4LABEL]] -;CHECK: b [[EXITLABEL]] +;CHECK: [[EXITLABEL]]: # %exit +;CHECK: blr -define void @f(i32 %tag) { +define void @straight_test(i32 %tag) { entry: br label %test1 test1: @@ -94,7 +94,115 @@ ret void } +; Intended layout: +; The chain-based outlining produces the layout +; entry +; --- Begin loop --- +; for.latch +; for.check +; test1 +; test2 +; test3 +; test4 +; optional1 +; optional2 +; optional3 +; optional4 +; --- End loop --- +; exit +; The CHECK statements check for the whole string of tests and exit block, +; and then check that the correct test has been duplicated into the end of +; the optional blocks and that the optional blocks are in the correct order. +;CHECK-LABEL: loop_test: +;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4 +;CHECK: [[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch +;CHECK: addi +;CHECK: [[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check +;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]]) +;CHECK: # %test1 +;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1 +;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: # %test2 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3 +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: bne 0, [[OPT3LABEL:[._0-9A-Za-z]+]] +;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}} +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[LATCHLABEL]] +;CHECK-NEXT: b [[OPT4LABEL:[._0-9A-Za-z]+]] +;CHECK: [[OPT1LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30 +;CHECK-NEXT: beq 0, [[TEST3LABEL]] +;CHECK-NEXT: [[OPT2LABEL]] +;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29 +;CHECK-NEXT: beq 0, [[TEST4LABEL]] +;CHECK-NEXT: [[OPT3LABEL]] +;CHECK: rlwinm. 
{{[0-9]+}}, [[TAGREG]], 0, 28, 28 +;CHECK-NEXT: beq 0, [[LATCHLABEL]] +;CHECK-NEXT: [[OPT4LABEL]] +;CHECK: b [[LATCHLABEL]] +define void @loop_test(i32* %tags, i32 %count) { +entry: + br label %for.check +for.check: + %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch] + %done.count = icmp ugt i32 %count.loop, 0 + %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count + %tag = load i32, i32* %tag_ptr + %done.tag = icmp eq i32 %tag, 0 + %done = and i1 %done.count, %done.tag + br i1 %done, label %test1, label %exit +test1: + %tagbit1 = and i32 %tag, 1 + %tagbit1eq0 = icmp eq i32 %tagbit1, 0 + br i1 %tagbit1eq0, label %test2, label %optional1 +optional1: + call void @a() + call void @a() + call void @a() + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 2 + %tagbit2eq0 = icmp eq i32 %tagbit2, 0 + br i1 %tagbit2eq0, label %test3, label %optional2 +optional2: + call void @b() + call void @b() + call void @b() + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 4 + %tagbit3eq0 = icmp eq i32 %tagbit3, 0 + br i1 %tagbit3eq0, label %test4, label %optional3 +optional3: + call void @c() + call void @c() + call void @c() + call void @c() + br label %test4 +test4: + %tagbit4 = and i32 %tag, 8 + %tagbit4eq0 = icmp eq i32 %tagbit4, 0 + br i1 %tagbit4eq0, label %for.latch, label %optional4 +optional4: + call void @d() + call void @d() + call void @d() + call void @d() + br label %for.latch +for.latch: + %count.sub = sub i32 %count.loop, 1 + br label %for.check +exit: + ret void +} + declare void @a() declare void @b() declare void @c() declare void @d() + Index: test/CodeGen/SPARC/sjlj.ll =================================================================== --- test/CodeGen/SPARC/sjlj.ll +++ test/CodeGen/SPARC/sjlj.ll @@ -66,14 +66,15 @@ ; CHECK: ba .LBB1_1 ; CHECK: nop ; CHECK:.LBB1_1: ! %entry -; CHECK: ba .LBB1_3 ; CHECK: mov %g0, %i0 +; CHECK: cmp %i0, 0 +; CHECK: bne .LBB1_4 +; CHECK: ba .LBB1_5 ; CHECK:.LBB1_2: ! Block address taken ; CHECK: mov 1, %i0 -; CHECK:.LBB1_3: ! 
%entry -; CHECK: cmp %i0, 0 ; CHECK: be .LBB1_5 -; CHECK: nop +; CHECK:.LBB1_4: +; CHECK: ba .LBB1_6 } declare i8* @llvm.frameaddress(i32) #2 Index: test/CodeGen/SystemZ/tdc-06.ll =================================================================== --- test/CodeGen/SystemZ/tdc-06.ll +++ test/CodeGen/SystemZ/tdc-06.ll @@ -26,25 +26,27 @@ nonzeroord: ; CHECK: lhi %r2, 2 ; CHECK: tcdb %f0, 48 -; CHECK: jl [[RET]] +; CHECK: je [[FINITE:.]] %abs = tail call double @llvm.fabs.f64(double %x) %testinf = fcmp oeq double %abs, 0x7FF0000000000000 br i1 %testinf, label %ret, label %finite, !prof !1 +ret: +; CHECK: [[RET]]: +; CHECK: br %r14 + %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ] + ret i32 %res + finite: ; CHECK: lhi %r2, 3 ; CHECK: tcdb %f0, 831 ; CHECK: blr %r14 ; CHECK: lhi %r2, 4 +; CHECK: br %r14 %testnormal = fcmp uge double %abs, 0x10000000000000 %finres = select i1 %testnormal, i32 3, i32 4 br label %ret -ret: -; CHECK: [[RET]]: -; CHECK: br %r14 - %res = phi i32 [ 5, %entry ], [ 1, %nonzero ], [ 2, %nonzeroord ], [ %finres, %finite ] - ret i32 %res } !1 = !{!"branch_weights", i32 1, i32 1} Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll =================================================================== --- test/CodeGen/Thumb/thumb-shrink-wrapping.ll +++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll @@ -1,11 +1,12 @@ -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T -; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \ +; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \ ; RUN: | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T + ; ; Note: Lots of tests use inline asm instead of regular calls. ; This allows to have a better control on what the allocation will do. @@ -15,6 +16,8 @@ ; edges. ; Also disable the late if-converter as it makes harder to reason on ; the diffs. +; Disable tail-duplication during placement, as v4t vs v5t get different +; results due to branches not being analyzable under v5 ; Initial motivating example: Simple diamond with a call just on one side. 
 ; CHECK-LABEL: foo:
Index: test/CodeGen/Thumb2/cbnz.ll
===================================================================
--- test/CodeGen/Thumb2/cbnz.ll
+++ test/CodeGen/Thumb2/cbnz.ll
@@ -26,7 +26,7 @@
   call void @x()
   call void @x()
   call void @x()
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %q = icmp eq i32 %y, 0
   br i1 %q, label %t2, label %f
Index: test/CodeGen/Thumb2/ifcvt-compare.ll
===================================================================
--- test/CodeGen/Thumb2/ifcvt-compare.ll
+++ test/CodeGen/Thumb2/ifcvt-compare.ll
@@ -4,7 +4,7 @@
 define void @f0(i32 %x) optsize {
   ; CHECK-LABEL: f0:
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %p = icmp eq i32 %x, 0
   br i1 %p, label %t, label %f
Index: test/CodeGen/WebAssembly/mem-intrinsics.ll
===================================================================
--- test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s

 ; Test memcpy, memmove, and memset intrinsics.
Index: test/CodeGen/X86/2012-08-17-legalizer-crash.ll
===================================================================
--- test/CodeGen/X86/2012-08-17-legalizer-crash.ll
+++ test/CodeGen/X86/2012-08-17-legalizer-crash.ll
@@ -26,5 +26,5 @@
   ret void

 ; CHECK-LABEL: fn1:
-; CHECK: jb
+; CHECK: jae
 }
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -62,8 +62,10 @@
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: ## implicit-def: %YMM0
 ; CHECK-NEXT: testb %al, %al
-; CHECK-NEXT: jne LBB4_2
-; CHECK-NEXT: ## BB#1: ## %load.i1247
+; CHECK-NEXT: je LBB4_1
+; CHECK-NEXT: ## BB#2: ## %__load_and_broadcast_32.exit1249
+; CHECK-NEXT: retq
+; CHECK-NEXT: LBB4_1: ## %load.i1247
 ; CHECK-NEXT: pushq %rbp
 ; CHECK-NEXT: movq %rsp, %rbp
 ; CHECK-NEXT: andq $-32, %rsp
@@ -71,7 +73,6 @@
 ; CHECK-NEXT: vbroadcastss {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT: movq %rbp, %rsp
 ; CHECK-NEXT: popq %rbp
-; CHECK-NEXT: LBB4_2: ## %__load_and_broadcast_32.exit1249
 ; CHECK-NEXT: retq
 allocas:
   %udx495 = alloca [18 x [18 x float]], align 32
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -69,13 +69,14 @@
 ; ALL-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; ALL-NEXT: vucomiss %xmm1, %xmm0
 ; ALL-NEXT: jne LBB3_1
-; ALL-NEXT: jnp LBB3_2
+; ALL-NEXT: jp LBB3_1
+; ALL-NEXT: ## BB#2: ## %return
+; ALL-NEXT: retq
 ; ALL-NEXT: LBB3_1: ## %if.end
 ; ALL-NEXT: seta %al
 ; ALL-NEXT: movzbl %al, %eax
 ; ALL-NEXT: leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT: LBB3_2: ## %return
 ; ALL-NEXT: retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -314,7 +314,7 @@
 define void @unnatural_cfg1() {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -352,17 +352,22 @@
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
-; CHECK: unnatural_cfg2
+; The tail-duplication outlining algorithm places
+; %loop.body3 and %loop.inner1.begin out-of-line at the end of the loop,
+; because %loop.body4 is unavoidable within the loop and short,
+; and %loop.inner1.begin has an alternate fallthrough of %loop.body3.
+; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
+; CHECK: %loop.body4
+; CHECK: %loop.inner2.begin
+; CHECK: %loop.inner2.begin
+; The loop.inner2.end block is folded
 ; CHECK: %loop.body3
 ; CHECK: %loop.inner1.begin
 ; The end block is folded with %loop.body3...
 ; CHECK-NOT: %loop.inner1.end
-; CHECK: %loop.body4
-; CHECK: %loop.inner2.begin
-; The loop.inner2.end block is folded
 ; CHECK: %loop.header
 ; CHECK: %bail
@@ -559,7 +564,7 @@
 ; didn't correctly locate the fallthrough successor, assuming blindly that the
 ; first one was the fallthrough successor. As a result, we would add an
 ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
 ; CHECK: %entry
 ; CHECK-NOT: jmp
 ; CHECK: %loop
@@ -587,7 +592,7 @@
 ; fallthrough simply won't occur. Make sure we don't crash trying to update
 ; terminators for such constructs.
 ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
 ; CHECK: %entry
 ; CHECK: %cleanup
@@ -609,7 +614,7 @@
 ; attempt to merge onto the wrong end of the inner loop just because we find it
 ; first. This was reduced from a crasher in GCC's single source.
 ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
 ; CHECK: %loop2b
 ; CHECK: %loop1
@@ -649,7 +654,7 @@
 ; fallthrough because that happens to always produce unanalyzable branches on
 ; x86.
 ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
 ; CHECK: %entry
 ; CHECK: %loop
 ; CHECK: %exit
@@ -673,7 +678,7 @@
 ; This branch is now analyzable and hence the destination block becomes the
 ; hotter one. The right order is entry->bar->exit->foo.
 ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
 ; CHECK: %entry
 ; CHECK: %bar
 ; CHECK: %exit
@@ -699,12 +704,13 @@
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
 ; CHECK: %entry
 ; CHECK: %a
 ; CHECK: %b
-; CHECK: %c
 ; CHECK: %exit
+; CHECK: %c
+; CHECK: retl

 entry:
   br i1 undef, label %a, label %b
@@ -729,7 +735,7 @@
 ; Ensure that we don't crash as we're building up many unanalyzable branches,
 ; blocks, and loops.
 ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
 ; CHECK: %entry
 ; CHECK: %exit
@@ -948,7 +954,7 @@
 ; strange layouts that are siginificantly less efficient, often times maing
 ; it discontiguous.
 ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
 ; CHECK: %entry
 ; First rotated loop top.
 ; CHECK: .p2align
Index: test/CodeGen/X86/cmovcmov.ll
===================================================================
--- test/CodeGen/X86/cmovcmov.ll
+++ test/CodeGen/X86/cmovcmov.ll
@@ -192,7 +192,7 @@
 ; CMOV-NEXT: retq

 ; NOCMOV: jne
-; NOCMOV-NEXT: jp
+; NOCMOV-NEXT: jnp
 define float @test_zext_fcmp_une(float %a, float %b) #0 {
 entry:
   %cmp = fcmp une float %a, %b
@@ -214,7 +214,7 @@
 ; CMOV-NEXT: retq

 ; NOCMOV: jne
-; NOCMOV-NEXT: jp
+; NOCMOV-NEXT: jnp
 define float @test_zext_fcmp_oeq(float %a, float %b) #0 {
 entry:
   %cmp = fcmp oeq float %a, %b
Index: test/CodeGen/X86/critical-edge-split-2.ll
===================================================================
--- test/CodeGen/X86/critical-edge-split-2.ll
+++ test/CodeGen/X86/critical-edge-split-2.ll
@@ -24,6 +24,7 @@
 ; CHECK-LABEL: test1:
 ; CHECK: testb %dil, %dil
-; CHECK: jne LBB0_2
+; CHECK: je LBB0_1
+; CHECK: retq
+; CHECK: LBB0_1:
 ; CHECK: divl
-; CHECK: LBB0_2:
Index: test/CodeGen/X86/shrink-wrap-chkstk.ll
===================================================================
--- test/CodeGen/X86/shrink-wrap-chkstk.ll
+++ test/CodeGen/X86/shrink-wrap-chkstk.ll
@@ -62,11 +62,12 @@
 ; CHECK-LABEL: @use_eax_before_prologue@8: # @use_eax_before_prologue
 ; CHECK: movl %ecx, %eax
 ; CHECK: cmpl %edx, %eax
-; CHECK: jge LBB1_2
+; CHECK: jl LBB1_1
+; CHECK: retl
+; CHECK: LBB1_1
 ; CHECK: pushl %eax
 ; CHECK: movl $4092, %eax
 ; CHECK: calll __chkstk
 ; CHECK: movl 4092(%esp), %eax
 ; CHECK: calll _doSomething
-; CHECK: LBB1_2:
 ; CHECK: retl
Index: test/CodeGen/X86/twoaddr-coalesce-3.ll
===================================================================
--- test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@
 ; Check that only one mov will be generated in the kernel loop.
 ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
 ; CHECK-NOT: mov
@@ -56,7 +56,7 @@
 ; Check that only two mov will be generated in the kernel loop.
 ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
 ; CHECK-NOT: mov
Index: test/CodeGen/X86/x86-shrink-wrap-unwind.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -24,7 +24,9 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: popq
+; CHECK-NEXT: retq
 ;
 ; Store %a in the alloca.
 ; CHECK: movl [[ARG0CPY]], 4(%rsp)
@@ -33,14 +35,9 @@
 ; Set the first argument to zero.
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq _doSomething
-;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
-; Epilogue code. (What we pop does not matter.)
 ; CHECK-NEXT: popq
-;
 ; CHECK-NEXT: retq
+;
 define i32 @framelessUnwind(i32 %a, i32 %b) #0 {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b
@@ -70,9 +67,11 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: retq
 ;
 ; Prologue code.
+; CHECK-NEXT: [[SUCCESS_LABEL]]
 ; CHECK: pushq %rbp
 ; CHECK: movq %rsp, %rbp
 ;
@@ -86,9 +85,8 @@
 ;
 ; Epilogue code. (What we pop does not matter.)
 ; CHECK: popq %rbp
-;
-; CHECK: [[EXIT_LABEL]]:
 ; CHECK-NEXT: retq
+;
 define i32 @frameUnwind(i32 %a, i32 %b) #1 {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b
@@ -116,10 +114,12 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: retq
 ;
 ; Prologue code.
 ; (What we push does not matter. It should be some random sratch register.)
+; CHECK-NEXT: [[SUCCESS_LABEL]]
 ; CHECK: pushq
 ;
 ; Store %a in the alloca.
@@ -132,8 +132,6 @@
 ;
 ; Epilogue code.
 ; CHECK-NEXT: addq
-;
-; CHECK: [[EXIT_LABEL]]:
 ; CHECK-NEXT: retq
 define i32 @framelessnoUnwind(i32 %a, i32 %b) #2 {
   %tmp = alloca i32, align 4
Index: test/CodeGen/X86/x86-shrink-wrapping.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrapping.ll
+++ test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -18,18 +18,24 @@
 ; No prologue needed.
 ; ENABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; ENABLE-NEXT: cmpl %esi, [[ARG0CPY]]
-; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; ENABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; ENABLE: retq
 ;
 ; Prologue code.
 ; (What we push does not matter. It should be some random sratch register.)
+; ENABLE: [[SUCCESS_LABEL]]:
 ; CHECK: pushq
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
 ; DISABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; DISABLE-NEXT: cmpl %esi, [[ARG0CPY]]
-; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; DISABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
 ;
+; DISABLE: popq
+; DISABLE-NEXT: retq
+
+; DISABLE: [[SUCCESS_LABEL]]:
 ; Store %a in the alloca.
 ; CHECK: movl [[ARG0CPY]], 4(%rsp)
 ; Set the alloca address in the second argument.
@@ -37,17 +43,11 @@
 ; Set the first argument to zero.
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq _doSomething
-;
 ; With shrink-wrapping, epilogue is just after the call.
 ; ENABLE-NEXT: addq $8, %rsp
-;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
-; Epilogue code. (What we pop does not matter.)
 ; DISABLE-NEXT: popq
-;
 ; CHECK-NEXT: retq
+
 define i32 @foo(i32 %a, i32 %b) {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b