Index: include/llvm/CodeGen/TailDuplicator.h
===================================================================
--- include/llvm/CodeGen/TailDuplicator.h
+++ include/llvm/CodeGen/TailDuplicator.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_CODEGEN_TAILDUPLICATOR_H
 #define LLVM_CODEGEN_TAILDUPLICATOR_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -52,9 +53,12 @@
   static bool isSimpleBB(MachineBasicBlock *TailBB);
   bool shouldTailDuplicate(const MachineFunction &MF, bool IsSimple,
                            MachineBasicBlock &TailBB, bool IgnoreFallthrough);
-  bool tailDuplicateAndUpdate(MachineFunction &MF, bool IsSimple,
-                              MachineBasicBlock *MBB,
-                              MachineBasicBlock *ForcedLayoutPred);
+  bool canTailDuplicate(MachineBasicBlock *TailBB, MachineBasicBlock *PredBB);
+  bool tailDuplicateAndUpdate(
+      MachineFunction &MF, bool IsSimple,
+      MachineBasicBlock *MBB,
+      MachineBasicBlock *ForcedLayoutPred,
+      llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback = nullptr);
 
 private:
   typedef TargetInstrInfo::RegSubRegPair RegSubRegPair;
@@ -87,7 +91,9 @@
                  SmallVectorImpl<std::pair<unsigned,RegSubRegPair>> &CopyInfos,
                  SmallVectorImpl<MachineInstr *> &Copies);
 
-  void removeDeadBlock(MachineBasicBlock *MBB);
+  void removeDeadBlock(
+      MachineBasicBlock *MBB,
+      llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback);
 };
 
 } // End llvm namespace
Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -179,6 +179,16 @@
   /// \brief End of blocks within the chain.
   iterator end() { return Blocks.end(); }
 
+  /// \brief Erase \p BB from this chain; \returns true if it was a member.
+  bool remove(MachineBasicBlock *BB) {
+    for (iterator I = begin(), E = end(); I != E; ++I)
+      if (*I == BB) {
+        Blocks.erase(I);
+        return true;
+      }
+    return false;
+  }
+
   /// \brief Merge a block chain into this one.
   ///
   /// This routine merges a block chain into this one. It takes care of forming
@@ -215,6 +225,16 @@
     for (MachineBasicBlock *MBB : *this)
       MBB->dump();
   }
+
+  /// \brief Print the numbers of the chain's blocks as "[#A, #B, ...]".
+  void dump_brief() {
+    dbgs() << "[";
+    // Test against end() before every step, so that an empty chain prints
+    // "[]" without ever advancing an iterator past end().
+    for (iterator I = begin(), E = end(); I != E; ++I)
+      dbgs() << (I == begin() ? "#" : ", #") << (*I)->getNumber();
+    dbgs() << "]\n";
+  }
 #endif // NDEBUG
 
   /// \brief Count of predecessors of any block within the chain which have not
@@ -242,7 +262,7 @@
   const MachineBlockFrequencyInfo *MBFI;
 
   /// \brief A handle to the loop info.
-  const MachineLoopInfo *MLI;
+  MachineLoopInfo *MLI;
 
   /// \brief A handle to the target's instruction info.
   const TargetInstrInfo *TII;
@@ -260,10 +280,20 @@
   /// must be done inline.
   TailDuplicator TailDup;
 
-  /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
-  /// all terminators of the MachineFunction.
+  /// \brief A set of blocks that are unavoidably executed.
+  ///
+  /// That is, blocks that dominate all terminators of the MachineFunction.
+  /// The same set is also used while laying out a loop, where it holds the
+  /// blocks that are unavoidable within that loop.
   SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
 
+  /// \brief A set of delayed blocks for tail-duplication.
+  ///
+  /// These blocks form a second spine through a loop/function, and so
+  /// predecessors within this set do not need to be able to be placed.
+  /// This allows the tail-duplicated spine to grow beyond 2 blocks.
+  SmallPtrSet<MachineBasicBlock *, 8> TailDupDelayBlocks;
+
   /// \brief Allocator and owner of BlockChain structures.
   ///
   /// We build BlockChains lazily while processing the loop structure of
@@ -309,7 +339,7 @@
   void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
                   SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
                   SmallVectorImpl<MachineBasicBlock *> &EHPadWorkList,
-                  const BlockFilterSet *BlockFilter = nullptr);
+                  BlockFilterSet *BlockFilter = nullptr);
   MachineBasicBlock *findBestLoopTop(MachineLoop &L,
                                      const BlockFilterSet &LoopBlockSet);
   MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L,
@@ -323,6 +353,8 @@
   void buildCFGChains(MachineFunction &F);
   void optimizeBranches(MachineFunction &F);
   void alignBlocks(MachineFunction &F);
+  void computeLoopUnavoidableBlocks(MachineLoop &L);
+  void computeUnavoidableBlocks(MachineFunction &F);
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -473,11 +505,39 @@
     else
       SuccProb = BranchProbability(SuccProbN, SuccProbD);
 
-    // If we outline optional branches, look whether Succ is unavoidable, i.e.
-    // dominates all terminators of the MachineFunction. If it does, other
-    // successors must be optional. Don't do this for cold branches.
-    if (OutlineOptionalBranches && SuccProb > HotProb.getCompl() &&
-        UnavoidableBlocks.count(Succ) > 0) {
+    // Check if Succ is unavoidable, for outlining with tail-duplication, in
+    // addition to straight outlining.
+    if (UnavoidableBlocks.count(Succ) > 0 && SuccProb > HotProb.getCompl()) {
+      auto CanTailDuplicateAllPreds = [&]() {
+        DEBUG(dbgs() << "Checking to see if block " << getBlockName(Succ)
+              << " can tail duplicate into all its predecessors.\n");
+        bool IsSimple = TailDup.isSimpleBB(Succ);
+
+        bool IgnoreFallthrough = true;
+        if (!TailDup.shouldTailDuplicate(*Succ->getParent(), IsSimple, *Succ,
+                                         IgnoreFallthrough)) {
+          DEBUG(dbgs() << "Skipping because it is "
+                << "not a candidate for duplication.\n");
+          return false;
+        }
+        for (MachineBasicBlock *Pred : Succ->predecessors()) {
+          // Make sure all unplaced and unfiltered predecessors are either part
+          // of the second spine, or can be tail-duplicated into.
+          if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+              || BlockToChain[Pred] == &Chain)
+            continue;
+          // If Pred is part of the growing second spine, we don't need to be
+          // able to copy succ onto the end of it.
+          if (TailDupDelayBlocks.count(Pred) > 0)
+            continue;
+          if (!TailDup.canTailDuplicate(Succ, Pred)) {
+            DEBUG(dbgs() << "Skipping because it can't be duplicated into block "
+                  << getBlockName(Pred) << ".\n");
+            return false;
+          }
+        }
+        return true;
+      };
       auto HasShortOptionalBranch = [&]() {
         for (MachineBasicBlock *Pred : Succ->predecessors()) {
           // Check whether there is an unplaced optional branch.
@@ -493,8 +553,25 @@
         }
         return false;
       };
-      if (!HasShortOptionalBranch())
-        return Succ;
+      if (OutlineOptionalBranches) {
+        // Don't outline a small single block branch.
+        if (!HasShortOptionalBranch())
+          return Succ;
+      }
+      if (TailDupPlacement && CanTailDuplicateAllPreds()) {
+        if (!HasShortOptionalBranch()) {
+          // Add blocks that were tail-duplicated into to the delay set so that
+          // the second spine can keep growing.
+          for (MachineBasicBlock *Pred : Succ->predecessors()) {
+            if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+                || BlockToChain[Pred] == &Chain
+                || TailDupDelayBlocks.count(Pred) > 0)
+              continue;
+            TailDupDelayBlocks.insert(Pred);
+          }
+          return Succ;
+        }
+      }
     }
 
     // Only consider successors which are either "hot", or wouldn't violate
@@ -646,6 +723,7 @@
     SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
     SmallVectorImpl<MachineBasicBlock *> &EHPadWorkList,
     const BlockFilterSet *BlockFilter = nullptr) {
+
   BlockChain &Chain = *BlockToChain[MBB];
   if (!UpdatedPreds.insert(&Chain).second)
     return;
@@ -676,9 +754,9 @@
     MachineBasicBlock *BB, BlockChain &Chain,
     SmallVectorImpl<MachineBasicBlock *> &BlockWorkList,
     SmallVectorImpl<MachineBasicBlock *> &EHPadWorkList,
-    const BlockFilterSet *BlockFilter) {
-  assert(BB);
-  assert(BlockToChain[BB] == &Chain);
+    BlockFilterSet *BlockFilter) {
+  assert(BB && "BB must not be null.\n");
+  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
   MachineFunction &F = *BB->getParent();
   MachineFunction::iterator PrevUnplacedBlockIt = F.begin();
 
@@ -687,32 +765,15 @@
                       BlockFilter);
   BB = *std::prev(Chain.end());
   for (;;) {
-    assert(BB);
-    assert(BlockToChain[BB] == &Chain);
-    assert(*std::prev(Chain.end()) == BB);
+    assert(BB && "null block found at end of chain in loop.");
+    assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
+    assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain.");
+
 
     // Look for the best viable successor if there is one to place immediately
     // after this block.
     MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
 
-    // Placing an actual successor may have changed tail duplication
-    // opportunities. Check for that now.
-    if (TailDupPlacement && BestSucc) {
-      DEBUG(dbgs() << "Redoing tail duplication for BestSucc#"
-                   << BestSucc->getNumber() << "\n");
-      bool IsSimple = TailDup.isSimpleBB(BestSucc);
-      bool IgnoreFallthrough = true;
-      // Simple blocks should just fallthrough, so only worry about non-simple
-      // ones.
-      if (!IsSimple && TailDup.shouldTailDuplicate(F, IsSimple,
-                                                   *BestSucc, IgnoreFallthrough)) {
-        SmallVector<MachineOperand, 4> Cond;
-        MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
-        if (!TII->AnalyzeBranch(*BestSucc, TBB, FBB, Cond))
-          TailDup.tailDuplicateAndUpdate(F, IsSimple, BestSucc, BB);
-      }
-    }
-
     // If an immediate successor isn't available, look for the best viable
     // block among those we've identified as not violating the loop's CFG at
     // this point. This won't be a fallthrough, but it will increase locality.
@@ -731,13 +792,93 @@
                       "layout successor until the CFG reduces\n");
     }
 
+    // Placement may have changed tail duplication opportunities.
+    // Check for that now.
+    if (TailDupPlacement && BestSucc) {
+      DEBUG(dbgs() << "Redoing tail duplication for BestSucc#"
+                   << BestSucc->getNumber() << "\n");
+      bool IsSimple = TailDup.isSimpleBB(BestSucc);
+      bool IgnoreFallthrough = true;
+      if (TailDup.shouldTailDuplicate(F, IsSimple,
+                                      *BestSucc, IgnoreFallthrough)) {
+        SmallVector<MachineOperand, 4> Cond;
+        MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+
+        // Update UnscheduledPredecessors to reflect tail-duplication.
+        for (MachineBasicBlock *Pred : BestSucc->predecessors()) {
+          // We're only looking for unscheduled predecessors that match the
+          // filter.
+          if (!TII->AnalyzeBranch(*BestSucc, TBB, FBB, Cond)) {
+            BlockChain* PredChain = BlockToChain[Pred];
+            if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+                || PredChain == &Chain)
+              continue;
+            if (TailDup.canTailDuplicate(BestSucc, Pred)) {
+              for (MachineBasicBlock *NewSucc : BestSucc->successors()) {
+                if (BlockFilter && !BlockFilter->count(NewSucc))
+                  continue;
+                BlockChain *NewChain = BlockToChain[NewSucc];
+                if (NewChain != &Chain && NewChain != PredChain)
+                  BlockToChain[NewSucc]->UnscheduledPredecessors++;
+              }
+            }
+          }
+        }
+        bool TailBlockRemoved = false;
+        // Purges a block deleted by TailDuplicator from all placement state.
+        auto RemovalCallback =
+            [&](MachineBasicBlock *RemBB) {
+              TailBlockRemoved = true;
+              // Detach the block from whichever chain currently holds it.
+              if (BlockToChain.count(RemBB)) {
+                BlockChain *RemChain = BlockToChain[RemBB];
+                RemChain->remove(RemBB);
+                BlockToChain.erase(RemBB);
+              }
+              // Keep the unplaced-block cursor off the block being deleted.
+              if (&(*PrevUnplacedBlockIt) == RemBB)
+                ++PrevUnplacedBlockIt;
+              // Pick the work list by binding the reference once; a reference
+              // cannot be reseated, so assigning to it later would copy
+              // EHPadWorkList over BlockWorkList instead of selecting it.
+              SmallVectorImpl<MachineBasicBlock *> &RemoveList =
+                  RemBB->isEHPad() ? EHPadWorkList : BlockWorkList;
+              for (auto It = RemoveList.begin(); It != RemoveList.end(); ++It) {
+                if (*It == RemBB) {
+                  RemoveList.erase(It);
+                  break;
+                }
+              }
+              // Drop it from the filter set, when one is in use.
+              if (BlockFilter)
+                BlockFilter->erase(RemBB);
+              // Remove it from loops.
+              MLI->removeBlock(RemBB);
+              DEBUG(dbgs() << "TailDuplicator deleted block: "
+                    << getBlockName(RemBB) << "\n");
+            };
+        auto RemovalCallbackRef =
+            llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
+        TailDup.tailDuplicateAndUpdate(F, IsSimple, BestSucc, BB,
+                                       &RemovalCallbackRef);
+        // If the chosen successor was duplicated into all its predecessors,
+        // don't bother laying it out, just go round the loop again with BB as
+        // the chain end.
+        if (TailBlockRemoved)
+          continue;
+      }
+    }
+
+
     // Place this block, updating the datastructures to reflect its placement.
     BlockChain &SuccChain = *BlockToChain[BestSucc];
+    TailDupDelayBlocks.erase(BestSucc);
     // Zero out UnscheduledPredecessors for the successor we're about to merge in case
     // we selected a successor that didn't fit naturally into the CFG.
     SuccChain.UnscheduledPredecessors = 0;
     DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to "
                  << getBlockName(BestSucc) << "\n");
+    DEBUG(dbgs() << "Loop header is " << getBlockName(LoopHeaderBB) << "\n");
     markChainSuccessors(SuccChain, LoopHeaderBB, BlockWorkList, EHPadWorkList,
                         BlockFilter);
     Chain.merge(BestSucc, &SuccChain);
@@ -1145,6 +1286,67 @@
   return LoopBlockSet;
 }
 
+
+/// \brief Finds unavoidable blocks within a loop.
+///
+/// These blocks form the loop spine, and knowing which blocks they are allows
+/// the loop-optional blocks to be outlined to the end of the loop,
+/// unconditionally or if they can form a second tail-duped spine.
+void MachineBlockPlacement::computeLoopUnavoidableBlocks(MachineLoop &L) {
+  // Start from an empty set. It stays empty when the loop has no exiting
+  // blocks, in which case nothing is assumed unavoidable and the layout
+  // ends up linear.
+  UnavoidableBlocks.clear();
+
+  SmallVector<MachineBasicBlock *, 4> ExitingBlocks;
+  L.getExitingBlocks(ExitingBlocks);
+
+  // Fold the loop's exiting blocks down to their nearest common dominator.
+  MachineBasicBlock *CommonDom = nullptr;
+  for (MachineBasicBlock *Exiting : ExitingBlocks)
+    CommonDom = CommonDom
+                    ? MDT->findNearestCommonDominator(CommonDom, Exiting)
+                    : Exiting;
+
+  if (!CommonDom)
+    return;
+
+  // Every loop block that dominates that common dominator is unavoidable.
+  for (MachineBasicBlock *LoopBB : L.getBlocks())
+    if (MDT->dominates(LoopBB, CommonDom))
+      UnavoidableBlocks.insert(LoopBB);
+}
+
+
+/// \brief Finds unavoidable blocks for the entire function.
+///
+/// These blocks form the spine, and knowing which blocks they are allows
+/// the optional blocks to be outlined to the end of the function
+/// unconditionally or if they can form a second tail-duped spine.
+void MachineBlockPlacement::computeUnavoidableBlocks(MachineFunction &F) {
+  // Start from an empty set. It stays empty when F has no block without
+  // successors, in which case nothing is assumed unavoidable and the layout
+  // ends up linear.
+  UnavoidableBlocks.clear();
+
+  // Fold all successor-less blocks of F into their nearest common dominator.
+  MachineBasicBlock *CommonDom = nullptr;
+  for (MachineBasicBlock &Candidate : F) {
+    if (!Candidate.succ_empty())
+      continue;
+    CommonDom = CommonDom
+                    ? MDT->findNearestCommonDominator(CommonDom, &Candidate)
+                    : &Candidate;
+  }
+  if (!CommonDom)
+    return;
+
+  // Every block of F that dominates that common dominator is unavoidable.
+  for (MachineBasicBlock &MBB : F)
+    if (MDT->dominates(&MBB, CommonDom))
+      UnavoidableBlocks.insert(&MBB);
+}
+
 /// \brief Forms basic block chains from the natural loop structures.
 ///
 /// These chains are designed to preserve the existing *structure* of the code
@@ -1162,6 +1364,13 @@
   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
   BlockFilterSet LoopBlockSet = collectLoopBlockSet(F, L);
 
+  // Find the unavoidable blocks within this loop. This allows partial outlining
+  // with tail duplication within a loop.
+  if (TailDupPlacement) {
+    computeLoopUnavoidableBlocks(L);
+    TailDupDelayBlocks.clear();
+  }
+
   // Check if we have profile data for this function. If yes, we will rotate
   // this loop by modeling costs more precisely which requires the profile data
   // for better layout.
@@ -1268,31 +1477,17 @@
     }
   }
 
-  if (OutlineOptionalBranches) {
-    // Find the nearest common dominator of all of F's terminators.
-    MachineBasicBlock *Terminator = nullptr;
-    for (MachineBasicBlock &MBB : F) {
-      if (MBB.succ_size() == 0) {
-        if (Terminator == nullptr)
-          Terminator = &MBB;
-        else
-          Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
-      }
-    }
-
-    // MBBs dominating this common dominator are unavoidable.
-    UnavoidableBlocks.clear();
-    for (MachineBasicBlock &MBB : F) {
-      if (MDT->dominates(&MBB, Terminator)) {
-        UnavoidableBlocks.insert(&MBB);
-      }
-    }
-  }
-
   // Build any loop-based chains.
   for (MachineLoop *L : *MLI)
     buildLoopChains(F, *L);
 
+  // This must go after the loop chains, because the loop chains compute their
+  // own loop-relative UnavoidableBlocks
+  if (OutlineOptionalBranches || TailDupPlacement) {
+    computeUnavoidableBlocks(F);
+    TailDupDelayBlocks.clear();
+  }
+
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
 
Index: lib/CodeGen/TailDuplicator.cpp
===================================================================
--- lib/CodeGen/TailDuplicator.cpp
+++ lib/CodeGen/TailDuplicator.cpp
@@ -122,7 +122,8 @@
 /// Tail duplicate the block and cleanup.
 bool TailDuplicator::tailDuplicateAndUpdate(
     MachineFunction &MF, bool IsSimple, MachineBasicBlock *MBB,
-    MachineBasicBlock *ForcedLayoutPred) {
+    MachineBasicBlock *ForcedLayoutPred,
+    llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
   // Save the successors list.
   SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
                                                MBB->succ_end());
@@ -147,7 +148,7 @@
   // If it is dead, remove it.
   if (isDead) {
     NumInstrDups -= MBB->size();
-    removeDeadBlock(MBB);
+    removeDeadBlock(MBB, RemovalCallback);
     ++NumDeadBlocks;
   }
 
@@ -512,7 +513,7 @@
                                          bool IgnoreFallthrough) {
   // IgnoreFallthrough is set when considering duplication during layout.
   // Because the ultimate layout may change, it is better to consider
-  // duplicating blocks that can't fall through.
+  // duplicating blocks that can fall through.
   if (TailBB.canFallThrough() && !IgnoreFallthrough)
     return false;
 
@@ -731,6 +732,27 @@
   return Changed;
 }
 
+/// Returns true when \p PredBB has at most one successor and its terminators
+/// analyze as unconditional. \p TailBB is not inspected by this check.
+bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB,
+                                      MachineBasicBlock *PredBB) {
+  // AnalyzeBranch ignores EH edges, so the successor count is checked first.
+  if (PredBB->succ_size() > 1) {
+    DEBUG(dbgs() << "Predecessor doesn't unconditionally jump to tail.\n");
+    return false;
+  }
+  SmallVector<MachineOperand, 4> BranchCond;
+  MachineBasicBlock *TrueDest = nullptr, *FalseDest = nullptr;
+  if (TII->AnalyzeBranch(*PredBB, TrueDest, FalseDest, BranchCond, true)) {
+    DEBUG(dbgs() << "Branch cannot be analyzed.\n");
+    return false;
+  }
+  if (BranchCond.empty())
+    return true;
+  DEBUG(dbgs() << "PredCond is not empty.\n");
+  return false;
+}
+
 /// If it is profitable, duplicate TailBB's contents in each
 /// of its predecessors.
 bool TailDuplicator::tailDuplicate(MachineFunction &MF, bool IsSimple,
@@ -756,19 +778,12 @@
                                                         PE = Preds.end();
        PI != PE; ++PI) {
     MachineBasicBlock *PredBB = *PI;
-
     assert(TailBB != PredBB &&
            "Single-block loop should have been rejected earlier!");
-    // EH edges are ignored by AnalyzeBranch.
-    if (PredBB->succ_size() > 1)
-      continue;
 
-    MachineBasicBlock *PredTBB, *PredFBB;
-    SmallVector<MachineOperand, 4> PredCond;
-    if (TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true))
-      continue;
-    if (!PredCond.empty())
+    if (!canTailDuplicate(TailBB, PredBB))
       continue;
+
     // Don't duplicate into a fall-through predecessor (at least for now).
     bool IsLayoutSuccessor = false;
     if (ForcedLayoutPred)
@@ -789,8 +804,9 @@
     if (RS && !TailBB->livein_empty()) {
       // Update PredBB livein.
       RS->enterBasicBlock(*PredBB);
-      if (!PredBB->empty())
+      if (!PredBB->empty()) {
         RS->forward(std::prev(PredBB->end()));
+      }
       for (const auto &LI : TailBB->liveins()) {
         if (!RS->isRegUsed(LI.PhysReg, false))
           // If a register is previously livein to the tail but it's not live
@@ -822,6 +838,8 @@
     appendCopies(PredBB, CopyInfos, Copies);
 
     // Simplify
+    MachineBasicBlock *PredTBB, *PredFBB;
+    SmallVector<MachineOperand, 4> PredCond;
     TII->AnalyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true);
 
     NumInstrDups += TailBB->size() - 1; // subtract one for removed branch
@@ -841,14 +859,16 @@
 
   // If TailBB was duplicated into all its predecessors except for the prior
   // block, which falls through unconditionally, move the contents of this
-  // block into the prior block. Don't do this when ForcedLayoutPred is
-  // non-null, as it can break layout to remove blocks.
-  MachineBasicBlock *PrevBB = &*std::prev(TailBB->getIterator());
+  // block into the prior block.
+  MachineBasicBlock *PrevBB = ForcedLayoutPred;
+  if (!PrevBB)
+      PrevBB = &*std::prev(TailBB->getIterator());
   MachineBasicBlock *PriorTBB = nullptr, *PriorFBB = nullptr;
   SmallVector<MachineOperand, 4> PriorCond;
   // This has to check PrevBB->succ_size() because EH edges are ignored by
   // AnalyzeBranch.
-  if (ForcedLayoutPred == nullptr && PrevBB->succ_size() == 1 &&
+  if (PrevBB->succ_size() == 1 &&
+      *PrevBB->succ_begin() == TailBB &&
       !TII->AnalyzeBranch(*PrevBB, PriorTBB, PriorFBB, PriorCond, true) &&
       PriorCond.empty() && !PriorTBB && TailBB->pred_size() == 1 &&
       !TailBB->hasAddressTaken()) {
@@ -952,10 +972,15 @@
 
 /// Remove the specified dead machine basic block from the function, updating
 /// the CFG.
-void TailDuplicator::removeDeadBlock(MachineBasicBlock *MBB) {
+void TailDuplicator::removeDeadBlock(
+    MachineBasicBlock *MBB,
+    llvm::function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
   assert(MBB->pred_empty() && "MBB must be dead!");
   DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
 
+  if (RemovalCallback)
+    (*RemovalCallback)(MBB);
+
   // Remove all successors.
   while (!MBB->succ_empty())
     MBB->removeSuccessor(MBB->succ_end() - 1);
Index: test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
===================================================================
--- test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
+++ test/CodeGen/AArch64/aarch64-dynamic-stack-layout.ll
@@ -664,12 +664,12 @@
 ; No realignment in the prologue.
 ; CHECK-NOT:  and
 ; CHECK-NOT:  0xffffffffffffffe0
-; CHECK:  tbz  {{.*}} .[[LABEL:.*]]
+; CHECK:  tbnz  {{.*}} .[[LABEL:.*]]
+; CHECK:  ret
+; CHECK:  .[[LABEL]]:
 ; Stack is realigned in a non-entry BB.
 ; CHECK:  sub  [[REG:x[01-9]+]], sp, #64
 ; CHECK:  and  sp, [[REG]], #0xffffffffffffffe0
-; CHECK:  .[[LABEL]]:
-; CHECK:  ret
 
 
 define void @realign_conditional2(i1 %b) {
@@ -687,15 +687,15 @@
 
 ; CHECK-LABEL: realign_conditional2
 ; Extra realignment in the prologue (performance issue).
-; CHECK:  tbz  {{.*}} .[[LABEL:.*]]
+; CHECK:  tbnz  {{.*}} .[[LABEL:.*]]
+; CHECK:  ret
+; CHECK:  .[[LABEL]]:
 ; CHECK:  sub  x9, sp, #32            // =32
 ; CHECK:  and  sp, x9, #0xffffffffffffffe0
 ; CHECK:  mov   x19, sp
 ; Stack is realigned in a non-entry BB.
 ; CHECK:  sub  [[REG:x[01-9]+]], sp, #64
 ; CHECK:  and  sp, [[REG]], #0xffffffffffffffe0
-; CHECK:  .[[LABEL]]:
-; CHECK:  ret
 
 attributes #0 = { "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind "less-precise-fpmad"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
Index: test/CodeGen/AArch64/arm64-atomic.ll
===================================================================
--- test/CodeGen/AArch64/arm64-atomic.ll
+++ test/CodeGen/AArch64/arm64-atomic.ll
@@ -9,10 +9,10 @@
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -27,10 +27,12 @@
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: mov    x0, x[[ADDR]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: mov    x0, x[[ADDR]]
+; CHECK-NEXT: ret
   %new = load i32, i32* %pnew
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
@@ -41,15 +43,15 @@
 ; CHECK-LABEL: val_compare_and_swap_rel:
 ; CHECK-NEXT: mov    x[[ADDR:[0-9]+]], x0
 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
-; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]
+; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp    [[RESULT]], w1
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
-; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
+; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -64,10 +66,10 @@
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
Index: test/CodeGen/AArch64/arm64-ccmp.ll
===================================================================
--- test/CodeGen/AArch64/arm64-ccmp.ll
+++ test/CodeGen/AArch64/arm64-ccmp.ll
@@ -51,7 +51,7 @@
 ; CHECK: cmp
 ; CHECK: b.eq
 ; CHECK: cmp
-; CHECK: b.gt
+; CHECK: b.le
 define i32 @single_flagclobber(i32 %a, i32 %b) nounwind ssp {
 entry:
   %cmp = icmp eq i32 %a, 5
@@ -78,7 +78,7 @@
 ; CHECK: cmp
 ; CHECK: b.eq
 ; CHECK: cmp
-; CHECK: tbz
+; CHECK: tbnz
 define i32 @single_flagclobber_tbz(i32 %a, i32 %b) nounwind ssp {
 entry:
   %cmp = icmp eq i32 %a, 5
Index: test/CodeGen/AArch64/arm64-extload-knownzero.ll
===================================================================
--- test/CodeGen/AArch64/arm64-extload-knownzero.ll
+++ test/CodeGen/AArch64/arm64-extload-knownzero.ll
@@ -12,7 +12,6 @@
   %tmp2 = load i16, i16* %ptr, align 2
   br label %bb2
 bb2:
-; CHECK: %bb2
 ; CHECK-NOT: and {{w[0-9]+}}, [[REG]], #0xffff
 ; CHECK: cmp [[REG]], #23
   %tmp3 = phi i16 [ 0, %entry ], [ %tmp2, %bb1 ]
Index: test/CodeGen/AArch64/arm64-shrink-wrapping.ll
===================================================================
--- test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -10,9 +10,11 @@
 ; Compare the arguments and jump to exit.
 ; No prologue needed.
 ; ENABLE: cmp w0, w1
-; ENABLE-NEXT: b.ge [[EXIT_LABEL:LBB[0-9_]+]]
+; ENABLE-NEXT: b.lt [[PROLOGUE_LABEL:LBB[0-9_]+]]
+; ENABLE: ret
 ;
 ; Prologue code.
+; ENABLE: [[PROLOGUE_LABEL]]:
 ; CHECK: sub sp, sp, #32
 ; CHECK-NEXT: stp [[SAVE_SP:x[0-9]+]], [[CSR:x[0-9]+]], [sp, #16]
 ; CHECK-NEXT: add [[SAVE_SP]], sp, #16
@@ -37,7 +39,6 @@
 ; CHECK-NEXT: add sp, sp, #32
 ;
 ; With shrink-wrapping, exit block is a simple return.
-; ENABLE: [[EXIT_LABEL]]:
 ; CHECK-NEXT: ret
 define i32 @foo(i32 %a, i32 %b) {
   %tmp = alloca i32, align 4
Index: test/CodeGen/AArch64/fcmp.ll
===================================================================
--- test/CodeGen/AArch64/fcmp.ll
+++ test/CodeGen/AArch64/fcmp.ll
@@ -31,7 +31,7 @@
   %tst4 = fcmp uge float %a, -0.0
   br i1 %tst4, label %t5, label %end
 ; CHECK-NOT: fcmp {{s[0-9]+}}, #0.0
-; CHECK: b.mi .LBB
+; CHECK: b.pl .LBB
 
 t5:
   call void @bar(i32 0)
@@ -70,7 +70,7 @@
   %tst4 = fcmp uge double %a, -0.0
   br i1 %tst4, label %t5, label %end
 ; CHECK-NOT: fcmp {{d[0-9]+}}, #0.0
-; CHECK: b.mi .LBB
+; CHECK: b.pl .LBB
 
 t5:
   call void @bar(i32 0)
Index: test/CodeGen/AArch64/rm_redundant_cmp.ll
===================================================================
--- test/CodeGen/AArch64/rm_redundant_cmp.ll
+++ test/CodeGen/AArch64/rm_redundant_cmp.ll
@@ -13,7 +13,7 @@
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
 ; CHECK-NEXT: b.gt
 ; CHECK-NOT: cmp
-; CHECK: b.ne
+; CHECK: b.eq
 entry:
   %0 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 1), align 2
   %1 = load i16, i16* getelementptr inbounds (%struct.s_signed_i16, %struct.s_signed_i16* @cost_s_i8_i16, i64 0, i32 2), align 2
@@ -69,7 +69,7 @@
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
 ; CHECK-NEXT: b.hi
 ; CHECK-NOT: cmp
-; CHECK: b.ne
+; CHECK: b.eq
 entry:
   %0 = load i16, i16* getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 1), align 2
   %1 = load i16, i16* getelementptr inbounds (%struct.s_unsigned_i16, %struct.s_unsigned_i16* @cost_u_i16, i64 0, i32 2), align 2
@@ -134,7 +134,7 @@
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
 ; CHECK-NEXT: b.gt
 ; CHECK-NOT: cmp
-; CHECK: b.ne
+; CHECK: b.eq
 entry:
   %0 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 1), align 2
   %1 = load i8, i8* getelementptr inbounds (%struct.s_signed_i8, %struct.s_signed_i8* @cost_s, i64 0, i32 2), align 2
@@ -190,7 +190,7 @@
 ; CHECK: cmp {{w[0-9]+}}, {{w[0-9]+}}
 ; CHECK-NEXT: b.hi
 ; CHECK-NOT: cmp
-; CHECK: b.ne
+; CHECK: b.eq
 entry:
   %0 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 1), align 2
   %1 = load i8, i8* getelementptr inbounds (%struct.s_unsigned_i8, %struct.s_unsigned_i8* @cost_u_i8, i64 0, i32 2), align 2
Index: test/CodeGen/AArch64/tbz-tbnz.ll
===================================================================
--- test/CodeGen/AArch64/tbz-tbnz.ll
+++ test/CodeGen/AArch64/tbz-tbnz.ll
@@ -10,7 +10,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbz [[CMP]], #31
+; CHECK: tbnz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -28,7 +28,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:x[0-9]+]], x0, #12
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -46,7 +46,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbnz [[CMP]], #31
+; CHECK: tbz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -64,7 +64,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:x[0-9]+]], x0, #12
-; CHECK: tbnz [[CMP]], #63
+; CHECK: tbz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -82,7 +82,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbnz [[CMP]], #31
+; CHECK: tbz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -100,7 +100,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:x[0-9]+]], x0, #12
-; CHECK: tbnz [[CMP]], #63
+; CHECK: tbz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -118,7 +118,7 @@
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbz [[CMP]], #31
+; CHECK: tbnz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -162,7 +162,7 @@
   br i1 %tst4, label %if.then4, label %if.end
 
 ; CHECK: tst x0, x1, lsl #62
-; CHECK: b.lt
+; CHECK: b.ge
 
 if.then4:
   call void @t()
@@ -178,7 +178,7 @@
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -194,7 +194,7 @@
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -209,7 +209,7 @@
 
 ; CHECK: ldr [[CMP:x[0-9]+]], [x1]
 ; CHECK-NOT: cmp
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
   %val = load i64, i64* %ptr
   %tst = icmp slt i64 %val, 0
@@ -229,7 +229,7 @@
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -247,7 +247,7 @@
 
 ; CHECK: orr [[CMP:x[0-9]+]], x0, x1
 ; CHECK-NOT: cmp
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -262,7 +262,7 @@
   br i1 %cond, label %if.end, label %if.then
 
 ; CHECK-NOT: and
-; CHECK: tbnz w0, #0
+; CHECK: tbz w0, #0
 
 if.then:
   call void @t()
@@ -278,7 +278,7 @@
   br i1 %cond1, label %if.then, label %if.end
 
 ; CHECK-NOT: movn
-; CHECK: tbnz w0, #0
+; CHECK: tbz w0, #0
 
 if.then:
   call void @t()
@@ -296,7 +296,7 @@
   br i1 %cond, label %then, label %end
 
 ; CHECK-NOT: lsl
-; CHECK: tbnz w0, #2
+; CHECK: tbz w0, #2
 
 then:
   call void @t()
@@ -314,7 +314,7 @@
   br i1 %cond, label %then, label %end
 
 ; CHECK-NOT: lsr
-; CHECK: tbnz w0, #3
+; CHECK: tbz w0, #3
 
 then:
   call void @t()
@@ -331,7 +331,7 @@
   br i1 %cond, label %then, label %end
 
 ; CHECK-NOT: asr
-; CHECK: tbnz w0, #31
+; CHECK: tbz w0, #31
 
 then:
   call void @t()
@@ -350,7 +350,7 @@
   br i1 %cond, label %then, label %end
 
 ; CHECK-NOT: ubfx
-; CHECK: tbnz w0, #3
+; CHECK: tbz w0, #3
 
 then:
   call void @t()
Index: test/CodeGen/AMDGPU/salu-to-valu.ll
===================================================================
--- test/CodeGen/AMDGPU/salu-to-valu.ll
+++ test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -437,11 +437,12 @@
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_cmp_le_u32_e32 vcc, [[SGPR]], v{{[0-9]+}}
 ; GCN: s_and_b64 vcc, exec, vcc
-; GCN: s_cbranch_vccnz [[EXIT:[A-Z0-9_]+]]
+; GCN: s_cbranch_vccz [[SUCCESS:[A-Z0-9_]+]]
+; GCN: s_endpgm
+; GCN: {{^}}[[SUCCESS]]:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN-NOHSA: buffer_store_dword [[ONE]]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
 ; GCN: s_endpgm
 define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 bb3:                                              ; preds = %bb2
Index: test/CodeGen/AMDGPU/smrd-vccz-bug.ll
===================================================================
--- test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -9,9 +9,10 @@
 ; GCN: s_waitcnt lgkmcnt(0)
 ; VCCZ-BUG: s_mov_b64 vcc, vcc
 ; NOVCCZ-BUG-NOT: s_mov_b64 vcc, vcc
-; GCN: s_cbranch_vccnz [[EXIT:[0-9A-Za-z_]+]]
+; GCN: s_cbranch_vccz [[SUCCESS:[0-9A-Za-z_]+]]
+; GCN: s_endpgm
+; GCN: [[SUCCESS]]:
 ; GCN: buffer_store_dword
-; GCN: [[EXIT]]:
 ; GCN: s_endpgm
 define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
 entry:
Index: test/CodeGen/AMDGPU/uniform-cfg.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-cfg.ll
+++ test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -121,9 +121,10 @@
 ; be selected for the SALU and then later moved to the VALU.
 ; SI: v_cmp_ne_i32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 5, [[CMP]]
 ; SI: s_and_b64 vcc, exec, [[COND]]
-; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]]
+; SI: s_endpgm
+; SI: [[SUCCESS_LABEL]]:
 ; SI: buffer_store_dword
-; SI: [[ENDIF_LABEL]]:
 ; SI: s_endpgm
 define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
 entry:
@@ -146,9 +147,10 @@
 ; be selected for the SALU and then later moved to the VALU.
 ; SI: v_cmp_gt_u32_e32 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], 6, [[CMP]]
 ; SI: s_and_b64 vcc, exec, [[COND]]
-; SI: s_cbranch_vccnz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_cbranch_vccz [[SUCCESS_LABEL:[0-9_A-Za-z]+]]
+; SI: s_endpgm
+; SI: [[SUCCESS_LABEL]]:
 ; SI: buffer_store_dword
-; SI: [[ENDIF_LABEL]]:
 ; SI: s_endpgm
 define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
 entry:
@@ -231,9 +233,10 @@
 
 ; SI-LABEL: {{^}}icmp_2_users:
 ; SI: s_cmp_lt_i32 s{{[0-9]+}}, 1
-; SI: s_cbranch_scc1 [[LABEL:[a-zA-Z0-9_]+]]
+; SI: s_cbranch_scc0 [[SUCCESS:[a-zA-Z0-9_]+]]
+; SI: s_endpgm
+; SI: [[SUCCESS]]:
 ; SI: buffer_store_dword
-; SI: [[LABEL]]:
 ; SI: s_endpgm
 define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
@@ -255,9 +258,10 @@
 ; SI: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
 ; SI: v_cmp_lt_i32_e64 [[MASK:s\[[0-9]+:[0-9]+\]]], 0, [[COND]]
 ; SI: s_and_b64 vcc, exec, [[MASK]]
-; SI: s_cbranch_vccnz [[EXIT]]
+; SI: s_cbranch_vccz [[SUCCESS:[a-zA-Z0-9_]+]]
+; SI: s_endpgm
+; SI: {{^}}[[SUCCESS]]:
 ; SI: buffer_store
-; SI: {{^}}[[EXIT]]:
 ; SI: s_endpgm
 define void @icmp_users_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
@@ -333,13 +337,14 @@
 
 ; SI-LABEL: {{^}}divergent_inside_uniform:
 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
-; SI: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; SI: s_cbranch_scc0 [[SUCCESS_LABEL:[0-9_A-Za-z]+]]
+; SI: s_endpgm
+; SI: [[SUCCESS_LABEL]]:
 ; SI: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; SI: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; SI: buffer_store_dword [[ONE]]
-; SI: [[ENDIF_LABEL]]:
 ; SI: s_endpgm
 define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
 entry:
@@ -368,10 +373,11 @@
 ; SI: buffer_store_dword [[ONE]]
 ; SI: s_or_b64 exec, exec, [[MASK]]
 ; SI: s_cmp_lg_i32 s{{[0-9]+}}, 0
-; SI: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
+; SI: s_cbranch_scc0 [[THREE:[A-Z0-9_]+]]
+; SI: s_endpgm
+; SI: [[THREE]]:
 ; SI: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; SI: buffer_store_dword [[TWO]]
-; SI: [[EXIT]]:
 ; SI: s_endpgm
 define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
 entry:
Index: test/CodeGen/AMDGPU/uniform-crash.ll
===================================================================
--- test/CodeGen/AMDGPU/uniform-crash.ll
+++ test/CodeGen/AMDGPU/uniform-crash.ll
@@ -3,9 +3,10 @@
 
 ; GCN-LABEL: {{^}}icmp_2_users:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 1
-; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
+; GCN: s_cbranch_scc0 [[LABEL:BB[0-9_A-Z]+]]
+; GCN: s_endpgm
 ; GCN: [[LABEL]]:
-; GCN-NEXT: s_endpgm
+; GCN: s_endpgm
 define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
Index: test/CodeGen/AMDGPU/valu-i1.ll
===================================================================
--- test/CodeGen/AMDGPU/valu-i1.ll
+++ test/CodeGen/AMDGPU/valu-i1.ll
@@ -116,9 +116,13 @@
 ; SI: s_and_saveexec_b64 [[OUTER_CMP_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[OUTER_CMP_SREG]], exec, [[OUTER_CMP_SREG]]
 ; SI: s_cbranch_execz [[LABEL_EXIT:BB[0-9]+_[0-9]+]]
+; SI: s_branch [[LABEL_PREHEADER:BB[0-9]+_[0-9]+]]
+
+; SI: [[LABEL_EXIT]]:
+; SI: s_endpgm
 
 ; Initialize inner condition to false
-; SI: ; BB#1:
+; SI: [[LABEL_PREHEADER]]:
 ; SI: s_mov_b64 [[ZERO:s\[[0-9]+:[0-9]+\]]], 0{{$}}
 ; SI: s_mov_b64 [[COND_STATE:s\[[0-9]+:[0-9]+\]]], [[ZERO]]
 
@@ -146,8 +150,6 @@
 
 ; SI: BB#5
 ; SI: s_or_b64 exec, exec, [[COND_STATE]]
-
-; SI: [[LABEL_EXIT]]:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
 
Index: test/CodeGen/AMDGPU/wqm.ll
===================================================================
--- test/CodeGen/AMDGPU/wqm.ll
+++ test/CodeGen/AMDGPU/wqm.ll
@@ -82,12 +82,12 @@
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_mov_b64 [[ORIG:s\[[0-9]+:[0-9]+\]]], exec
 ;CHECK-NEXT: s_wqm_b64 exec, exec
+;CHECK: %IF
+;CHECK: image_sample
 ;CHECK: %ELSE
 ;CHECK: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[ORIG]]
 ;CHECK: store
 ;CHECK: s_mov_b64 exec, [[SAVED]]
-;CHECK: %IF
-;CHECK: image_sample
 define amdgpu_ps float @test_control_flow_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, i32 %c, i32 %z, float %data) {
 main_body:
   %cmp = icmp eq i32 %z, 0
Index: test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
===================================================================
--- test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
+++ test/CodeGen/ARM/2013-05-05-IfConvertBug.ll
@@ -112,15 +112,17 @@
 ; CHECK-NEXT: subs [[REG:r[0-9]+]], #120
 ; CHECK-NEXT: cmp [[REG]], r1
 ; CHECK-NOT: it lt
-; CHECK-NEXT: bge [[LABEL:.+]]
+; CHECK-NEXT: blt [[LABEL:.+]]
 ; Next BB
+; CHECK: subs r0, r1, r0
+; CHECK-NEXT: bx lr
+; Next BB
+; CHECK: [[LABEL]]:
 ; CHECK-NOT: cmplt
 ; CHECK: cmp r0, #119
 ; CHECK-NEXT: itt le
 ; CHECK-NEXT: addle r0, r1, #1
 ; CHECK-NEXT: bxle lr
-; Next BB
-; CHECK: [[LABEL]]:
 ; CHECK-NEXT: subs r0, r1, r0
 ; CHECK-NEXT: bx lr
 
Index: test/CodeGen/ARM/arm-shrink-wrapping.ll
===================================================================
--- test/CodeGen/ARM/arm-shrink-wrapping.ll
+++ test/CodeGen/ARM/arm-shrink-wrapping.ll
@@ -23,9 +23,11 @@
 ; Compare the arguments and jump to exit.
 ; No prologue needed.
 ; ENABLE: cmp r0, r1
-; ENABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]]
+; ENABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]]
+; ENABLE: bx lr
 ;
 ; Prologue code.
+; ENABLE: [[SUCCESS_LABEL]]:
 ; CHECK: push {r7, lr}
 ; CHECK-NEXT: mov r7, sp
 ;;
@@ -33,8 +35,12 @@
 ; After the prologue is set.
 ; DISABLE: sub sp
 ; DISABLE: cmp r0, r1
-; DISABLE-NEXT: bge [[EXIT_LABEL:LBB[0-9_]+]]
+; DISABLE-NEXT: blt [[SUCCESS_LABEL:LBB[0-9_]+]]
+; ARM-DISABLE: mov sp, r7
+; THUMB-DISABLE: add sp, 
+; DISABLE-NEXT: pop {r7, pc}
 ;
+; DISABLE: [[SUCCESS_LABEL]]:
 ; Store %a in the alloca.
 ; ARM-ENABLE: push {r0}
 ; THUMB-ENABLE: str r0, [sp, #-4]
@@ -50,9 +56,8 @@
 ; THUMB-ENABLE-NEXT: add sp, #4
 ; ENABLE-NEXT: pop{{(\.w)?}} {r7, lr}
 ;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
+; Late-stage tail-duplication removes the exit label with shrink-wrapping.
+; Without shrink-wrapping, epilogue is before the return.
 ; Epilogue code. (What we pop does not matter.)
 ; ARM-DISABLE: mov sp, r7
 ; THUMB-DISABLE: add sp, 
@@ -388,9 +393,9 @@
 ;
 ; Next BB.
 ; CHECK: [[LOOP:LBB[0-9_]+]]: @ %for.body
-; ARM: subs [[IV]], [[IV]], #1
-; THUMB: subs [[IV]], #1
-; CHECK: add{{(\.w)?}} r4, r4, #1
+; ARM-DAG: subs [[IV]], [[IV]], #1
+; THUMB-DAG: subs [[IV]], #1
+; CHECK-DAG: add{{(\.w)?}} r4, r4, #1
 ; CHECK: bne [[LOOP]]
 ;
 ; Next BB.
Index: test/CodeGen/ARM/atomic-cmpxchg.ll
===================================================================
--- test/CodeGen/ARM/atomic-cmpxchg.ll
+++ test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -72,11 +72,11 @@
 ; CHECK-ARMV7-NEXT: mov [[RES:r[0-9]+]], #1
 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
 ; CHECK-ARMV7-NEXT: bne [[TRY]]
-; CHECK-ARMV7-NEXT: b [[END:.LBB[0-9_]+]]
+; CHECK-ARMV7-NEXT: mov r0, [[RES]]
+; CHECK-ARMV7-NEXT: bx lr
 ; CHECK-ARMV7-NEXT: [[FAIL]]:
 ; CHECK-ARMV7-NEXT: clrex
 ; CHECK-ARMV7-NEXT: mov [[RES]], #0
-; CHECK-ARMV7-NEXT: [[END]]:
 ; CHECK-ARMV7-NEXT: mov r0, [[RES]]
 ; CHECK-ARMV7-NEXT: bx lr
 
Index: test/CodeGen/ARM/atomic-op.ll
===================================================================
--- test/CodeGen/ARM/atomic-op.ll
+++ test/CodeGen/ARM/atomic-op.ll
@@ -297,10 +297,10 @@
 ; CHECK:     strex   [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
 ; CHECK:     cmp     [[SUCCESS]], #0
 ; CHECK:     bne     [[LOOP_BB]]
-; CHECK:     b       [[END_BB:\.?LBB[0-9]+_[0-9]+]]
+; CHECK:     dmb     ish
+; CHECK:     bx      lr
 ; CHECK: [[FAIL_BB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[END_BB]]:
 ; CHECK:     dmb     ish
 ; CHECK:     bx      lr
 
Index: test/CodeGen/ARM/atomic-ops-v8.ll
===================================================================
--- test/CodeGen/ARM/atomic-ops-v8.ll
+++ test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1045,20 +1045,21 @@
   ;  function there.
 ; CHECK-ARM-NEXT:   cmp r[[OLD]], r0
 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK-ARM-NEXT: bx lr
    ret i8 %old
 }
 
@@ -1078,20 +1079,21 @@
   ;  function there.
 ; CHECK-ARM-NEXT:   cmp r[[OLD]], r0
 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK-ARM-NEXT: bx lr
    ret i16 %old
 }
 
@@ -1110,20 +1112,21 @@
   ; r0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 ; CHECK-NEXT: cmp r[[OLD]], r0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK: str{{(.w)?}} r[[OLD]],
+; CHECK-NEXT: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK: str{{(.w)?}} r[[OLD]],
+; CHECK-ARM-NEXT: bx lr
    ret void
 }
 
@@ -1148,16 +1151,16 @@
 ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]]
 ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r2, r3 is a reasonable guess.
 ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+; CHECK-NEXT: pop
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
Index: test/CodeGen/ARM/fold-stack-adjust.ll
===================================================================
--- test/CodeGen/ARM/fold-stack-adjust.ll
+++ test/CodeGen/ARM/fold-stack-adjust.ll
@@ -135,7 +135,7 @@
 
   ; Important to check for beginning of basic block, because if it gets
   ; if-converted the test is probably no longer checking what it should.
-; CHECK: {{LBB[0-9]+_2}}:
+; CHECK: %end
 ; CHECK-NEXT: vpop {d7, d8}
 ; CHECK-NEXT: pop {r4, pc}
 
Index: test/CodeGen/ARM/machine-cse-cmp.ll
===================================================================
--- test/CodeGen/ARM/machine-cse-cmp.ll
+++ test/CodeGen/ARM/machine-cse-cmp.ll
@@ -52,7 +52,7 @@
 ; CHECK-LABEL: f3:
 ; CHECK-NOT: sub
 ; CHECK: cmp
-; CHECK: blt
+; CHECK: bge
 %0 = load i32, i32* %offset, align 4
 %cmp = icmp slt i32 %0, %size
 %s = sub nsw i32 %0, %size
Index: test/CodeGen/Mips/llvm-ir/ashr.ll
===================================================================
--- test/CodeGen/Mips/llvm-ir/ashr.ll
+++ test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -96,20 +96,23 @@
 
   ; M2:         srav      $[[T0:[0-9]+]], $4, $7
   ; M2:         andi      $[[T1:[0-9]+]], $7, 32
-  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         beqz      $[[T1]], $[[BB0:BB[0-9_]+]]
   ; M2:         move      $3, $[[T0]]
+  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         nop
+  ; M2:         $[[EXIT:BB[0-9_]+]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
   ; M2:         srlv      $[[T2:[0-9]+]], $5, $7
   ; M2:         not       $[[T3:[0-9]+]], $7
   ; M2:         sll       $[[T4:[0-9]+]], $4, 1
   ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
+  ; M2:         beqz      $[[T1]], $[[EXIT]]
   ; M2:         or        $3, $[[T3]], $[[T2]]
-  ; M2:         $[[BB0]]:
-  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
-  ; M2:         nop
-  ; M2:         sra       $2, $4, 31
   ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
-  ; M2:         nop
+  ; M2:         sra       $2, $4, 31
 
   ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -180,20 +183,23 @@
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsrav     $[[T1:[0-9]+]], $4, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             beqz      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
   ; M3:             move      $3, $[[T1]]
+  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             nop
+  ; M3:             $[[EXIT:BB[0-9_]+]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+  ; M3:             $[[BB0]]:
   ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $7
   ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+  ; M3:             beqz      $[[T3]], $[[EXIT]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
-  ; M3:             $[[BB0]]:
-  ; M3:             beqz      $[[T3]], $[[BB1:BB[0-9_]+]]
-  ; M3:             nop
-  ; M3:             dsra      $2, $4, 63
   ; M3:             $[[BB1]]:
   ; M3:             jr        $ra
-  ; M3:             nop
+  ; M3:             dsra      $2, $4, 63
 
   ; GP64-NOT-R6:    dsrlv     $[[T0:[0-9]+]], $5, $7
   ; GP64-NOT-R6:    dsll      $[[T1:[0-9]+]], $4, 1
Index: test/CodeGen/Mips/llvm-ir/lshr.ll
===================================================================
--- test/CodeGen/Mips/llvm-ir/lshr.ll
+++ test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -94,20 +94,24 @@
 
   ; M2:         srlv      $[[T0:[0-9]+]], $4, $7
   ; M2:         andi      $[[T1:[0-9]+]], $7, 32
-  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         beqz      $[[T1]], $[[BB0:BB[0-9_]+]]
   ; M2:         move      $3, $[[T0]]
+  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         addiu     $2, $zero, 0
+  ; M2:         $[[EXIT:BB[0-9_]+]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
   ; M2:         srlv      $[[T2:[0-9]+]], $5, $7
   ; M2:         not       $[[T3:[0-9]+]], $7
   ; M2:         sll       $[[T4:[0-9]+]], $4, 1
   ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
   ; M2:         or        $3, $[[T3]], $[[T2]]
-  ; M2:         $[[BB0]]:
-  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         bnez      $[[T1]], $[[EXIT]]
   ; M2:         addiu     $2, $zero, 0
-  ; M2:         move      $2, $[[T0]]
   ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
-  ; M2:         nop
+  ; M2:         move      $2, $[[T0]]
 
   ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -171,20 +175,24 @@
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsrlv     $[[T1:[0-9]+]], $4, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             beqz      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
   ; M3:             move      $3, $[[T1]]
+  ; M3:             beqz      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             daddiu    $2, $zero, 0
+  ; M3:             $[[EXIT:BB[0-9_]+]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+  ; M3:             $[[BB0]]:
   ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $7
   ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
-  ; M3:             $[[BB0]]:
-  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             bnez      $[[T3]], $[[EXIT]]
   ; M3:             daddiu    $2, $zero, 0
-  ; M3:             move      $2, $[[T1]]
   ; M3:             $[[BB1]]:
   ; M3:             jr        $ra
-  ; M3:             nop
+  ; M3:             move      $2, $[[T1]]
 
   ; GP64-NOT-R6:    dsrlv     $[[T0:[0-9]+]], $5, $7
   ; GP64-NOT-R6:    dsll      $[[T1:[0-9]+]], $4, 1
Index: test/CodeGen/Mips/llvm-ir/shl.ll
===================================================================
--- test/CodeGen/Mips/llvm-ir/shl.ll
+++ test/CodeGen/Mips/llvm-ir/shl.ll
@@ -110,20 +110,24 @@
 
   ; M2:         sllv      $[[T0:[0-9]+]], $5, $7
   ; M2:         andi      $[[T1:[0-9]+]], $7, 32
-  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         beqz      $[[T1]], $[[BB0:BB[0-9_]+]]
   ; M2:         move      $2, $[[T0]]
+  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         addiu     $3, $zero, 0
+  ; M2:         $[[EXIT:BB[0-9_]+]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
   ; M2:         sllv      $[[T2:[0-9]+]], $4, $7
   ; M2:         not       $[[T3:[0-9]+]], $7
   ; M2:         srl       $[[T4:[0-9]+]], $5, 1
   ; M2:         srlv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
   ; M2:         or        $2, $[[T2]], $[[T3]]
-  ; M2:         $[[BB0]]:
-  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         bnez      $[[T1]], $[[EXIT]]
   ; M2:         addiu     $3, $zero, 0
-  ; M2:         move      $3, $[[T0]]
   ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
-  ; M2:         nop
+  ; M2:         move      $3, $[[T0]]
 
   ; 32R1-R5:    sllv      $[[T0:[0-9]+]], $4, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -187,20 +191,24 @@
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsllv     $[[T1:[0-9]+]], $5, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
+  ; M3:             beqz      $[[T3:[0-9]+]], $[[BB0:BB[0-9_]+]]
   ; M3:             move      $2, $[[T1]]
+  ; M3:             beqz      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             daddiu    $3, $zero, 0
+  ; M3:             $[[EXIT:BB[0-9_]+]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+  ; M3:             $[[BB0]]:
   ; M3:             dsllv     $[[T4:[0-9]+]], $4, $7
   ; M3:             dsrl      $[[T5:[0-9]+]], $5, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsrlv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $2, $[[T4]], $[[T7]]
-  ; M3:             $[[BB0]]:
-  ; M3:             bnez      $[[T3]], $[[BB1:BB[0-9_]+]]
+  ; M3:             bnez      $[[T3]], $[[EXIT]]
   ; M3:             daddiu    $3, $zero, 0
-  ; M3:             move      $3, $[[T1]]
   ; M3:             $[[BB1]]:
   ; M3:             jr        $ra
-  ; M3:             nop
+  ; M3:             move      $3, $[[T1]]
 
   ; GP64-NOT-R6:    dsllv     $[[T0:[0-9]+]], $4, $7
   ; GP64-NOT-R6:    dsrl      $[[T1:[0-9]+]], $5, 1
Index: test/CodeGen/Mips/longbranch.ll
===================================================================
--- test/CodeGen/Mips/longbranch.ll
+++ test/CodeGen/Mips/longbranch.ll
@@ -76,7 +76,7 @@
 ; Check the MIPS64 version.
 
 ; N64:        lui     $[[R0:[0-9]+]], %hi(%neg(%gp_rel(test1)))
-; N64:        bnez    $4, $[[BB0:BB[0-9_]+]]
+; N64:        beqz    $4, $[[BB0:BB[0-9_]+]]
 ; N64:        daddu   $[[R1:[0-9]+]], $[[R0]], $25
 
 ; Check for long branch expansion:
@@ -93,13 +93,14 @@
 ; N64-NEXT:      daddiu  $sp, $sp, 16
 
 ; N64:   $[[BB0]]:
+; N64:        jr      $ra
+; N64:        nop
+; N64:   $[[BB2]]:
 ; N64:        daddiu  $[[GP:[0-9]+]], $[[R1]], %lo(%neg(%gp_rel(test1)))
 ; N64:        ld      $[[R2:[0-9]+]], %got_disp(x)($[[GP]])
 ; N64:        addiu   $[[R3:[0-9]+]], $zero, 1
-; N64:        sw      $[[R3]], 0($[[R2]])
-; N64:   $[[BB2]]:
 ; N64:        jr      $ra
-; N64:        nop
+; N64:        sw      $[[R3]], 0($[[R2]])
 
 
 ; Check the microMIPS version.
Index: test/CodeGen/PowerPC/bdzlr.ll
===================================================================
--- test/CodeGen/PowerPC/bdzlr.ll
+++ test/CodeGen/PowerPC/bdzlr.ll
@@ -53,13 +53,15 @@
 
 ; CHECK: @lua_xmove
 ; CHECK: bnelr
-; CHECK: bnelr
+; CHECK: beq
+; CHECK: blr
 ; CHECK: bdzlr
 ; CHECK-NOT: blr
 
 ; CHECK-CRB: @lua_xmove
 ; CHECK-CRB: bclr 12,
-; CHECK-CRB: bclr 12,
+; CHECK-CRB: bc 4,
+; CHECK-CRB: blr
 ; CHECK-CRB: bdzlr
 ; CHECK-CRB-NOT: blr
 }
Index: test/CodeGen/PowerPC/branch-opt.ll
===================================================================
--- test/CodeGen/PowerPC/branch-opt.ll
+++ test/CodeGen/PowerPC/branch-opt.ll
@@ -4,9 +4,9 @@
 target triple = "powerpc-apple-darwin8.7.0"
 
 ;CHECK-LABEL: foo:
-;CHECK: b LBB0_16
-;CHECK: b LBB0_14
-;CHECK: b LBB0_14
+;CHECK: b LBB0_15
+;CHECK: b LBB0_13
+;CHECK: b LBB0_13
 ;CHECK-NOT: b LBB
 
 define void @foo(i32 %W, i32 %X, i32 %Y, i32 %Z) {
Index: test/CodeGen/PowerPC/sjlj.ll
===================================================================
--- test/CodeGen/PowerPC/sjlj.ll
+++ test/CodeGen/PowerPC/sjlj.ll
@@ -74,24 +74,24 @@
 ; CHECK-DAG: std [[REGA]], [[OFF:[0-9]+]](31)                  # 8-byte Folded Spill
 ; CHECK-DAG: std 1, 16([[REGA]])
 ; CHECK-DAG: std 2, 24([[REGA]])
-; CHECK: bcl 20, 31, .LBB1_5
+; CHECK: bcl 20, 31, .LBB1_2
 ; CHECK: li 3, 1
-; CHECK: #EH_SjLj_Setup	.LBB1_5
+; CHECK: #EH_SjLj_Setup	.LBB1_2
 ; CHECK: b .LBB1_1
 
-; CHECK: .LBB1_4:
+; CHECK: .LBB1_2:
+; CHECK: mflr [[REGL:[0-9]+]]
+; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31)                   # 8-byte Folded Reload
+; CHECK: std [[REGL]], 8([[REG2]])
+; CHECK: li 3, 0
+
+; CHECK: .LBB1_5:
 
 ; CHECK: lfd
 ; CHECK: lvx
 ; CHECK: ld
 ; CHECK: blr
 
-; CHECK: .LBB1_5:
-; CHECK: mflr [[REGL:[0-9]+]]
-; CHECK: ld [[REG2:[0-9]+]], [[OFF]](31)                   # 8-byte Folded Reload
-; CHECK: std [[REGL]], 8([[REG2]])
-; CHECK: li 3, 0
-
 ; CHECK-NOAV: @main
 ; CHECK-NOAV-NOT: stvx
 ; CHECK-NOAV: bcl
Index: test/CodeGen/PowerPC/tail-dup-layout.ll
===================================================================
--- test/CodeGen/PowerPC/tail-dup-layout.ll
+++ test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,4 +1,4 @@
-; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+; RUN: llc -O2 < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
@@ -7,19 +7,16 @@
 ;CHECK: # %test1
 ;CHECK-NEXT: andi. {{[0-9]+}}, 4, 1
 ;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: # %test2
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 30, 30
 ;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
 ;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 29, 29
-;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: bne 0, [[OPT3LABEL:[._0-9A-Za-z]+]]
 ;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, 4, 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
-;CHECK-NEXT: std 5, 0(3)
-;CHECK-NEXT: std 6, 8(3)
-;CHECK-NEXT: blr
+;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: b [[OPT4LABEL:[._0-9A-Za-z]+]]
 ;CHECK-NEXT: [[OPT1LABEL]]
 ;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 30, 30
 ;CHECK-NEXT: beq 0, [[TEST3LABEL]]
@@ -30,7 +27,10 @@
 ;CHECK: rlwinm. {{[0-9]+}}, 4, 0, 28, 28
 ;CHECK-NEXT: beq 0, [[EXITLABEL]]
 ;CHECK-NEXT: [[OPT4LABEL]]
-;CHECK: b [[EXITLABEL]]
+;CHECK: [[EXITLABEL]]: # %exit
+;CHECK-NEXT: std 5, 0(3)
+;CHECK-NEXT: std 6, 8(3)
+;CHECK-NEXT: blr
 
 define void @f(%struct.ptrpair* noalias nocapture sret %result, i32 %tag, i8* %source1, i8* %sink1) {
 entry:
Index: test/CodeGen/SPARC/sjlj.ll
===================================================================
--- test/CodeGen/SPARC/sjlj.ll
+++ test/CodeGen/SPARC/sjlj.ll
@@ -66,14 +66,15 @@
 ; CHECK:  ba   .LBB1_1
 ; CHECK:  nop
 ; CHECK:.LBB1_1:                                ! %entry
-; CHECK:  ba   .LBB1_3
 ; CHECK:  mov  %g0, %i0
+; CHECK:  cmp %i0, 0
+; CHECK:  bne  .LBB1_4
+; CHECK:  ba   .LBB1_5
 ; CHECK:.LBB1_2:                                ! Block address taken
 ; CHECK:  mov  1, %i0
-; CHECK:.LBB1_3:                                ! %entry
-; CHECK:  cmp %i0, 0
 ; CHECK:  be   .LBB1_5
-; CHECK:  nop
+; CHECK:.LBB1_4:
+; CHECK:  ba   .LBB1_6
 }
 declare i8* @llvm.frameaddress(i32) #2
 
Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll
===================================================================
--- test/CodeGen/Thumb/thumb-shrink-wrapping.ll
+++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll
@@ -1,11 +1,12 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T
-; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T
-; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T
-; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T
+
 ;
 ; Note: Lots of tests use inline asm instead of regular calls.
 ; This allows to have a better control on what the allocation will do.
@@ -15,6 +16,8 @@
 ; edges.
 ; Also disable the late if-converter as it makes harder to reason on
 ; the diffs.
+; Disable tail-duplication during placement, as v4t vs v5t get different
+; results due to branches not being analyzable under v5t.
 
 ; Initial motivating example: Simple diamond with a call just on one side.
 ; CHECK-LABEL: foo:
Index: test/CodeGen/Thumb2/cbnz.ll
===================================================================
--- test/CodeGen/Thumb2/cbnz.ll
+++ test/CodeGen/Thumb2/cbnz.ll
@@ -26,7 +26,7 @@
   call void @x()
   call void @x()
   call void @x()
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %q = icmp eq i32 %y, 0
   br i1 %q, label %t2, label %f
 
Index: test/CodeGen/Thumb2/ifcvt-compare.ll
===================================================================
--- test/CodeGen/Thumb2/ifcvt-compare.ll
+++ test/CodeGen/Thumb2/ifcvt-compare.ll
@@ -4,7 +4,7 @@
 
 define void @f0(i32 %x) optsize {
   ; CHECK-LABEL: f0:
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %p = icmp eq i32 %x, 0
   br i1 %p, label %t, label %f
 
Index: test/CodeGen/WebAssembly/mem-intrinsics.ll
===================================================================
--- test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 | FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 
Index: test/CodeGen/X86/2012-08-17-legalizer-crash.ll
===================================================================
--- test/CodeGen/X86/2012-08-17-legalizer-crash.ll
+++ test/CodeGen/X86/2012-08-17-legalizer-crash.ll
@@ -26,5 +26,5 @@
   ret void
 
 ; CHECK-LABEL: fn1:
-; CHECK: jb
+; CHECK: jae
 }
Index: test/CodeGen/X86/atom-bypass-slow-division.ll
===================================================================
--- test/CodeGen/X86/atom-bypass-slow-division.ll
+++ test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -47,8 +47,8 @@
 ; CHECK-LABEL: Test_use_div_and_idiv:
 ; CHECK: idivl
 ; CHECK: divb
-; CHECK: divl
 ; CHECK: divb
+; CHECK: divl
 ; CHECK: addl
 ; CHECK: ret
   %resultidiv = sdiv i32 %a, %b
Index: test/CodeGen/X86/avx-splat.ll
===================================================================
--- test/CodeGen/X86/avx-splat.ll
+++ test/CodeGen/X86/avx-splat.ll
@@ -62,8 +62,10 @@
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    ## implicit-def: %YMM0
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    jne LBB4_2
-; CHECK-NEXT:  ## BB#1: ## %load.i1247
+; CHECK-NEXT:    je LBB4_1
+; CHECK-NEXT:  ## BB#2: ## %__load_and_broadcast_32.exit1249
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  LBB4_1: ## %load.i1247
 ; CHECK-NEXT:    pushq %rbp
 ; CHECK-NEXT:    movq %rsp, %rbp
 ; CHECK-NEXT:    andq $-32, %rsp
@@ -71,7 +73,6 @@
 ; CHECK-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %ymm0
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
-; CHECK-NEXT:  LBB4_2: ## %__load_and_broadcast_32.exit1249
 ; CHECK-NEXT:    retq
 allocas:
   %udx495 = alloca [18 x [18 x float]], align 32
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -69,13 +69,14 @@
 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vucomiss %xmm1, %xmm0
 ; ALL-NEXT:    jne LBB3_1
-; ALL-NEXT:    jnp LBB3_2
+; ALL-NEXT:    jp  LBB3_1
+; ALL-NEXT:  ## BB#2: ## %return
+; ALL-NEXT:    retq
 ; ALL-NEXT:  LBB3_1: ## %if.end
 ; ALL-NEXT:    seta %al
 ; ALL-NEXT:    movzbl %al, %eax
 ; ALL-NEXT:    leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:  LBB3_2: ## %return
 ; ALL-NEXT:    retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
Index: test/CodeGen/X86/block-placement.ll
===================================================================
--- test/CodeGen/X86/block-placement.ll
+++ test/CodeGen/X86/block-placement.ll
@@ -306,7 +306,7 @@
 define void @unnatural_cfg1() {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -344,7 +344,7 @@
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
-; CHECK: unnatural_cfg2
+; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -551,7 +551,7 @@
 ; didn't correctly locate the fallthrough successor, assuming blindly that the
 ; first one was the fallthrough successor. As a result, we would add an
 ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
 ; CHECK: %entry
 ; CHECK-NOT: jmp
 ; CHECK: %loop
@@ -579,7 +579,7 @@
 ; fallthrough simply won't occur. Make sure we don't crash trying to update
 ; terminators for such constructs.
 ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
 ; CHECK: %entry
 ; CHECK: %cleanup
 
@@ -601,7 +601,7 @@
 ; attempt to merge onto the wrong end of the inner loop just because we find it
 ; first. This was reduced from a crasher in GCC's single source.
 ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
 ; CHECK: %loop2b
 ; CHECK: %loop1
@@ -641,7 +641,7 @@
 ; fallthrough because that happens to always produce unanalyzable branches on
 ; x86.
 ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
 ; CHECK: %entry
 ; CHECK: %loop
 ; CHECK: %exit
@@ -665,7 +665,7 @@
 ; This branch is now analyzable and hence the destination block becomes the
 ; hotter one. The right order is entry->bar->exit->foo.
 ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
 ; CHECK: %entry
 ; CHECK: %bar
 ; CHECK: %exit
@@ -691,12 +691,13 @@
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
 ; CHECK: %entry
 ; CHECK: %a
 ; CHECK: %b
-; CHECK: %c
 ; CHECK: %exit
+; CHECK: %c
+; CHECK: retl
 
 entry:
   br i1 undef, label %a, label %b
@@ -721,7 +722,7 @@
 ; Ensure that we don't crash as we're building up many unanalyzable branches,
 ; blocks, and loops.
 ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
 ; CHECK: %entry
 ; CHECK: %exit
 
@@ -940,7 +941,7 @@
 ;    strange layouts that are siginificantly less efficient, often times maing
 ;    it discontiguous.
 ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
 ; CHECK: %entry
 ; First rotated loop top.
 ; CHECK: .p2align
Index: test/CodeGen/X86/cmovcmov.ll
===================================================================
--- test/CodeGen/X86/cmovcmov.ll
+++ test/CodeGen/X86/cmovcmov.ll
@@ -192,7 +192,7 @@
 ; CMOV-NEXT:   retq
 
 ; NOCMOV:        jne
-; NOCMOV-NEXT:   jp
+; NOCMOV-NEXT:   jnp
 define float @test_zext_fcmp_une(float %a, float %b) #0 {
 entry:
   %cmp = fcmp une float %a, %b
@@ -214,7 +214,7 @@
 ; CMOV-NEXT:   retq
 
 ; NOCMOV:        jne
-; NOCMOV-NEXT:   jp
+; NOCMOV-NEXT:   jnp
 define float @test_zext_fcmp_oeq(float %a, float %b) #0 {
 entry:
   %cmp = fcmp oeq float %a, %b
Index: test/CodeGen/X86/critical-edge-split-2.ll
===================================================================
--- test/CodeGen/X86/critical-edge-split-2.ll
+++ test/CodeGen/X86/critical-edge-split-2.ll
@@ -24,6 +24,7 @@
 
 ; CHECK-LABEL: test1:
 ; CHECK: testb %dil, %dil
-; CHECK: jne LBB0_2
+; CHECK: je LBB0_1
+; CHECK: retq
+; CHECK: LBB0_1:
 ; CHECK: divl
-; CHECK: LBB0_2:
Index: test/CodeGen/X86/fp-une-cmp.ll
===================================================================
--- test/CodeGen/X86/fp-une-cmp.ll
+++ test/CodeGen/X86/fp-une-cmp.ll
@@ -56,11 +56,11 @@
 ; CHECK-NEXT:    ucomisd %xmm1, %xmm0
 ; CHECK-NEXT:    jne .LBB1_1
 ; CHECK-NEXT:    jp .LBB1_1
-; CHECK-NEXT:  .LBB1_2: # %bb2
+; CHECK-NEXT:  # BB#2: # %bb2
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  .LBB1_1: # %bb1
 ; CHECK-NEXT:    addsd {{.*}}(%rip), %xmm0
-; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:    retq
 
 entry:
   %mul = fmul double %x, %y
Index: test/CodeGen/X86/ragreedy-bug.ll
===================================================================
--- test/CodeGen/X86/ragreedy-bug.ll
+++ test/CodeGen/X86/ragreedy-bug.ll
@@ -6,13 +6,13 @@
 ; CHECK: isupper.exit
 ; CHECK-NEXT: in Loop
 ; CHECK-NEXT: testl
-; CHECK-NEXT: jne
+; CHECK-NEXT: je
+; CHECK: maskrune
 ; CHECK: isupper.exit
 ; CHECK-NEXT: in Loop
 ; CHECK-NEXT: testl
 ; CHECK-NEXT: je
 ; CHECK: maskrune
-; CHECK: maskrune
 
 %struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
 %struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }
Index: test/CodeGen/X86/shrink-wrap-chkstk.ll
===================================================================
--- test/CodeGen/X86/shrink-wrap-chkstk.ll
+++ test/CodeGen/X86/shrink-wrap-chkstk.ll
@@ -62,11 +62,12 @@
 ; CHECK-LABEL: @use_eax_before_prologue@8: # @use_eax_before_prologue
 ; CHECK: movl %ecx, %eax
 ; CHECK: cmpl %edx, %eax
-; CHECK: jge LBB1_2
+; CHECK: jl LBB1_1
+; CHECK: retl
+; CHECK: LBB1_1
 ; CHECK: pushl %eax
 ; CHECK: movl $4092, %eax
 ; CHECK: calll __chkstk
 ; CHECK: movl 4092(%esp), %eax
 ; CHECK: calll _doSomething
-; CHECK: LBB1_2:
 ; CHECK: retl
Index: test/CodeGen/X86/statepoint-invoke.ll
===================================================================
--- test/CodeGen/X86/statepoint-invoke.ll
+++ test/CodeGen/X86/statepoint-invoke.ll
@@ -89,6 +89,7 @@
 left.relocs:
   ; CHECK: movq (%rsp),
   ; CHECK: movq 8(%rsp), [[REGVAL2:%[a-z]+]]
+  ; CHECK: cmoveq {{.*}}[[REGVAL2]]{{.*}}
   %val1.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 13, i32 13)
   %val2.relocated_left = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp1, i32 14, i32 14)
   br label %normal_return
@@ -104,13 +105,13 @@
 right.relocs:
   ; CHECK: movq (%rsp), [[REGVAL2]]
   ; CHECK: movq
+  ; CHECK: cmoveq {{.*}}[[REGVAL2]]{{.*}}
   %val2.relocated_right = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 13, i32 13)
   %val3.relocated = call coldcc i64 addrspace(1)* @llvm.experimental.gc.relocate.p1i64(token %sp2, i32 14, i32 14)
   br label %normal_return
 
 normal_return:
   ; CHECK-LABEL: %normal_return
-  ; CHECK: cmoveq {{.*}}[[REGVAL2]]{{.*}}
   ; CHECK: retq
   %a1 = phi i64 addrspace(1)* [%val1.relocated, %left.relocs], [%val3.relocated, %right.relocs]
   %a2 = phi i64 addrspace(1)* [%val2.relocated_left, %left.relocs], [%val2.relocated_right, %right.relocs]
Index: test/CodeGen/X86/twoaddr-coalesce-3.ll
===================================================================
--- test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@
 
 ; Check that only one mov will be generated in the kernel loop.
 ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
 ; CHECK-NOT: mov
@@ -56,7 +56,7 @@
 
 ; Check that only two mov will be generated in the kernel loop.
 ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
 ; CHECK-NOT: mov
Index: test/CodeGen/X86/x86-shrink-wrap-unwind.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrap-unwind.ll
+++ test/CodeGen/X86/x86-shrink-wrap-unwind.ll
@@ -24,7 +24,9 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: popq
+; CHECK-NEXT: retq
 ;
 ; Store %a in the alloca.
 ; CHECK: movl [[ARG0CPY]], 4(%rsp)
@@ -33,14 +35,9 @@
 ; Set the first argument to zero.
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq _doSomething
-;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
-; Epilogue code. (What we pop does not matter.)
 ; CHECK-NEXT: popq
-;
 ; CHECK-NEXT: retq
+;
 define i32 @framelessUnwind(i32 %a, i32 %b) #0 {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b
@@ -70,9 +67,11 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: retq
 ;
 ; Prologue code.
+; CHECK-NEXT: [[SUCCESS_LABEL]]
 ; CHECK: pushq %rbp
 ; CHECK: movq %rsp, %rbp
 ;
@@ -86,9 +85,8 @@
 ;
 ; Epilogue code. (What we pop does not matter.)
 ; CHECK: popq %rbp
-;
-; CHECK: [[EXIT_LABEL]]:
 ; CHECK-NEXT: retq
+;
 define i32 @frameUnwind(i32 %a, i32 %b) #1 {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b
@@ -116,10 +114,12 @@
 ; After the prologue is set.
 ; CHECK: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; CHECK-NEXT: cmpl %esi, [[ARG0CPY]]
-; CHECK-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; CHECK-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; CHECK: retq
 ;
 ; Prologue code.
 ; (What we push does not matter. It should be some random sratch register.)
+; CHECK-NEXT: [[SUCCESS_LABEL]]
 ; CHECK: pushq
 ;
 ; Store %a in the alloca.
@@ -132,8 +132,6 @@
 ;
 ; Epilogue code.
 ; CHECK-NEXT: addq
-;
-; CHECK: [[EXIT_LABEL]]:
 ; CHECK-NEXT: retq
 define i32 @framelessnoUnwind(i32 %a, i32 %b) #2 {
   %tmp = alloca i32, align 4
Index: test/CodeGen/X86/x86-shrink-wrapping.ll
===================================================================
--- test/CodeGen/X86/x86-shrink-wrapping.ll
+++ test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -18,18 +18,24 @@
 ; No prologue needed.
 ; ENABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; ENABLE-NEXT: cmpl %esi, [[ARG0CPY]]
-; ENABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; ENABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
+; ENABLE: retq
 ;
 ; Prologue code.
 ; (What we push does not matter. It should be some random sratch register.)
+; ENABLE: [[SUCCESS_LABEL]]:
 ; CHECK: pushq
 ;
 ; Compare the arguments and jump to exit.
 ; After the prologue is set.
 ; DISABLE: movl %edi, [[ARG0CPY:%e[a-z]+]]
 ; DISABLE-NEXT: cmpl %esi, [[ARG0CPY]]
-; DISABLE-NEXT: jge [[EXIT_LABEL:LBB[0-9_]+]]
+; DISABLE-NEXT: jl [[SUCCESS_LABEL:LBB[0-9_]+]]
 ;
+; DISABLE: popq
+; DISABLE-NEXT: retq
+
+; DISABLE: [[SUCCESS_LABEL]]:
 ; Store %a in the alloca.
 ; CHECK: movl [[ARG0CPY]], 4(%rsp)
 ; Set the alloca address in the second argument.
@@ -37,17 +43,11 @@
 ; Set the first argument to zero.
 ; CHECK-NEXT: xorl %edi, %edi
 ; CHECK-NEXT: callq _doSomething
-;
 ; With shrink-wrapping, epilogue is just after the call.
 ; ENABLE-NEXT: addq $8, %rsp
-;
-; CHECK: [[EXIT_LABEL]]:
-;
-; Without shrink-wrapping, epilogue is in the exit block.
-; Epilogue code. (What we pop does not matter.)
 ; DISABLE-NEXT: popq
-;
 ; CHECK-NEXT: retq
+
 define i32 @foo(i32 %a, i32 %b) {
   %tmp = alloca i32, align 4
   %tmp2 = icmp slt i32 %a, %b