Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -40,6 +40,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
@@ -293,9 +294,12 @@
   /// \brief A handle to the target's lowering info.
   const TargetLoweringBase *TLI;
 
-  /// \brief A handle to the post dominator tree.
+  /// \brief A handle to the dominator tree.
   MachineDominatorTree *MDT;
 
+  /// \brief A handle to the post dominator tree.
+  MachinePostDominatorTree *MPDT;
+
   /// \brief Duplicator used to duplicate tails during placement.
   ///
   /// Placement decisions can open up new tail duplication opportunities, but
@@ -403,6 +407,14 @@
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  bool shouldTailDuplicate(MachineBasicBlock *BB);
+  /// Check the edge probabilities to see if tail duplication will increase
+  /// fallthroughs.
+  bool probabilityJustifiesTailDuplicate(
+      MachineBasicBlock *BB, MachineBasicBlock *Succ);
+  bool canTailDuplicateUnplacedPreds(
+      MachineBasicBlock *BB, MachineBasicBlock *Succ,
+      BlockChain &Chain, const BlockFilterSet *BlockFilter);
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -416,6 +428,8 @@
     AU.addRequired<MachineBranchProbabilityInfo>();
     AU.addRequired<MachineBlockFrequencyInfo>();
     AU.addRequired<MachineDominatorTree>();
+    if (TailDupPlacement)
+      AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<TargetPassConfig>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -430,6 +444,7 @@
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
                     "Branch Probability Basic Block Placement", false, false)
@@ -561,6 +576,132 @@
   return SuccProb;
 }
 
+/// Check if a block should be tail duplicated.
+/// \p BB Block to check.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+  // Blocks with single successors don't create additional fallthrough
+  // opportunities. Don't duplicate them. TODO: When conditional exits are
+  // analyzable, allow them to be duplicated.
+  if (BB->succ_size() == 1)
+    return false;
+
+  bool IsSimple = TailDup.isSimpleBB(BB);
+  return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare edge probabilities to estimate whether tail-duplicating \p Succ
+/// (a successor of \p BB) reduces taken branches; returns true if it does.
+bool MachineBlockPlacement::probabilityJustifiesTailDuplicate(
+    MachineBasicBlock *BB, MachineBasicBlock *Succ) {
+  // We need to do a probability calculation to make sure this is profitable.
+  // First: does succ have a successor that post-dominates? This affects the
+  // calculation. The 2 relevant cases are:
+  //    BB         BB
+  //    | \        | \  Q
+  //   P|  \Q      |P \
+  //    =   C      =   C
+  //    |  /       |  /
+  //    | /        | /
+  //    Succ       Succ
+  //    / \        | \  V
+  //  U/   =V      |U \
+  //  /     \      =   D
+  //  D      E     |  /
+  //               | /
+  //               |/
+  //               Dom
+  // In the second case, placing Succ while duplicating it into C prevents the
+  // fallthrough of Succ into either D or Dom, because they now have C as an
+  // unplaced predecessor.
+  MachineBasicBlock *Dom = Succ;
+  auto BestSuccSucc = BranchProbability::getZero();
+  for (MachineBasicBlock *SuccSucc : Succ->successors()) {
+    auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+    if (Prob > BestSuccSucc)
+      BestSuccSucc = Prob;
+    if (Dom == nullptr) continue; // Keep scanning for the best probability.
+    Dom = MPDT->findNearestCommonDominator(Dom, SuccSucc);
+  }
+  // If it doesn't have a post-dominating successor, here is the calculation:
+  //    BB        BB
+  //    | \       |  \
+  //   P|  \Q     |   =
+  //    =   C     |    C
+  //    |  /      |     |
+  //    | /       |     |
+  //    Succ      Succ /|
+  //    / \       |  \/ |
+  //  U/   =V     =  /= =
+  //  /     \     | /  \|
+  //  D      E    D     E
+  //  Cost in the first case is: P + V
+  //  Cost in the second case is: Q + QV + PU + PV
+  if (Dom == nullptr || !Succ->isSuccessor(Dom)) {
+    BranchProbability P = (MBPI->getEdgeProbability(BB, Succ));
+    BranchProbability Q = P.getCompl();
+    BranchProbability U = BestSuccSucc;
+    BranchProbability V = U.getCompl();
+    BranchProbability QV = Q * V;
+    uint64_t BaseCost = static_cast<uint64_t>(P.getNumerator()) +
+        static_cast<uint64_t>(V.getNumerator());
+    uint64_t DupCost = static_cast<uint64_t>(Q.getNumerator()) +
+        static_cast<uint64_t>(QV.getNumerator()) +
+        static_cast<uint64_t>(P.getNumerator());
+    return (BaseCost > DupCost);
+  }
+  BranchProbability U = MBPI->getEdgeProbability(Succ, Dom);
+  BranchProbability Q = MBPI->getEdgeProbability(BB, Succ).getCompl();
+  // If there is a post-dominating successor, here is the calculation:
+  // BB         BB
+  // | \  Q     |  \  Q
+  // |P \       |   =
+  // =   C      |P   C (+Succ)
+  // |  /       |     |
+  // | /        |     |
+  // Succ       Succ /|
+  // | \  V     |  \/ |
+  // |U \       |U /\ |
+  // =   D      = =  =|
+  // |  /       |/    D
+  // | /        |    /
+  // |/         |   /
+  // Dom        Dom
+  // Branches have been marked with (=)
+  // The cost for taken branches in the first case is P + U
+  // The cost in the second case (assuming independence), given the layout:
+  // BB, Succ, (C+Succ), D, Dom
+  // is Q + P*U + P*V + Q*U == Q + P + Q*U. Subtracting P means we need to
+  // compare U vs Q + Q*U.
+  return (U > (Q + Q*U));
+}
+
+
+/// When the option TailDupPlacement is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+/// into all of its unplaced, unfiltered predecessors, that are not BB.
+/// \p Chain Predecessors whose BlockToChain entry is this chain are skipped.
+/// \p BlockFilter If non-null, predecessors not in this set are skipped.
+/// \returns true only if shouldTailDuplicate(Succ) holds and
+/// TailDup.canTailDuplicate(Succ, Pred) holds for every predecessor that was
+/// not skipped.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
+  if (!shouldTailDuplicate(Succ))
+    return false;
+
+  for (MachineBasicBlock *Pred : Succ->predecessors()) {
+    // Skip BB, filtered-out blocks and blocks already in Chain; every other
+    // predecessor must accept a copy of Succ.
+    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+        || BlockToChain[Pred] == &Chain)
+      continue;
+    if (!TailDup.canTailDuplicate(Succ, Pred))
+      return false;
+  }
+  return true;
+}
+
 /// When the option OutlineOptionalBranches is on, this method
 /// checks if the fallthrough candidate block \p Succ (of block
 /// \p BB) also has other unscheduled predecessor blocks which
@@ -634,6 +775,13 @@
   if (SuccChain.UnscheduledPredecessors == 0)
     return false;
 
+  // As a heuristic, if we can duplicate the block into all its unscheduled
+  // predecessors, we return false.
+  if (TailDupPlacement
+      && canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
+      && probabilityJustifiesTailDuplicate(BB, Succ))
+    return false;
+
   // There are two basic scenarios here:
   // -------------------------------------
   // Case 1: triangular shape CFG (if-then):
@@ -1908,13 +2056,8 @@
   DuplicatedToLPred = false;
   DEBUG(dbgs() << "Redoing tail duplication for Succ#"
         << BB->getNumber() << "\n");
-  bool IsSimple = TailDup.isSimpleBB(BB);
-  // Blocks with single successors don't create additional fallthrough
-  // opportunities. Don't duplicate them. TODO: When conditional exits are
-  // analyzable, allow them to be duplicated.
-  if (!IsSimple && BB->succ_size() == 1)
-    return false;
-  if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
+
+  if (!shouldTailDuplicate(BB))
     return false;
   // This has to be a callback because none of it can be done after
   // BB is deleted.
@@ -1967,6 +2110,7 @@
       llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
 
   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+  bool IsSimple = TailDup.isSimpleBB(BB);
   TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
                                  &DuplicatedPreds, &RemovalCallbackRef);
 
@@ -2007,12 +2151,14 @@
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
   MDT = &getAnalysis<MachineDominatorTree>();
+  MPDT = nullptr;
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
   PreferredLoopExit = nullptr;
 
   if (TailDupPlacement) {
+    MPDT = &getAnalysis<MachinePostDominatorTree>();
     unsigned TailDupSize = TailDuplicatePlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
@@ -2043,6 +2189,8 @@
       BlockToChain.clear();
       // Must redo the dominator tree if blocks were changed.
       MDT->runOnMachineFunction(MF);
+      if (MPDT)
+        MPDT->runOnMachineFunction(MF);
       ChainAllocator.DestroyAll();
       buildCFGChains();
     }
Index: test/CodeGen/AMDGPU/branch-relaxation.ll
===================================================================
--- test/CodeGen/AMDGPU/branch-relaxation.ll
+++ test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -335,6 +335,12 @@
 ; GCN-NEXT: ;;#ASMEND
 
 ; GCN-NEXT: [[BB3]]: ; %bb3
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_endpgm
 define void @expand_requires_expand(i32 %cond0) #0 {
 bb0:
@@ -356,6 +362,12 @@
   br label %bb3
 
 bb3:
+; These NOPs prevent tail-duplication-based outlining
+; from firing, which defeats the need to expand the branches and this test.
+  call void asm sideeffect
+   "v_nop_e64", ""() #0
+  call void asm sideeffect
+   "v_nop_e64", ""() #0
   ret void
 }
 
@@ -385,6 +397,7 @@
 
 ; GCN-NEXT: [[ENDIF]]: ; %endif
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
+; GCN-NEXT: s_sleep 5
 ; GCN-NEXT: s_endpgm
 define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
 entry:
@@ -402,6 +415,9 @@
   br label %endif
 
 endif:
+  ; layout can remove the split branch if it can copy the return block.
+  ; This call makes the return block long enough that it doesn't get copied.
+  call void @llvm.amdgcn.s.sleep(i32 5);
   ret void
 }
 
Index: test/CodeGen/ARM/cmpxchg-weak.ll
===================================================================
--- test/CodeGen/ARM/cmpxchg-weak.ll
+++ test/CodeGen/ARM/cmpxchg-weak.ll
@@ -13,14 +13,16 @@
 ; CHECK-NEXT:     dmb ish
 ; CHECK-NEXT:     strex   [[SUCCESS:r[0-9]+]], r2, [r0]
 ; CHECK-NEXT:     cmp     [[SUCCESS]], #0
-; CHECK-NEXT:     bne     [[FAILBB:LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT:     beq     [[SUCCESSBB:LBB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: BB#2:
-; CHECK-NEXT:     dmb     ish
 ; CHECK-NEXT:     str     r3, [r0]
 ; CHECK-NEXT:     bx      lr
 ; CHECK-NEXT: [[LDFAILBB]]:
 ; CHECK-NEXT:     clrex
-; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT:     str     r3, [r0]
+; CHECK-NEXT:     bx      lr
+; CHECK-NEXT: [[SUCCESSBB]]:
+; CHECK-NEXT:     dmb     ish
 ; CHECK-NEXT:     str     r3, [r0]
 ; CHECK-NEXT:     bx      lr
 
Index: test/CodeGen/Mips/brconeq.ll
===================================================================
--- test/CodeGen/Mips/brconeq.ll
+++ test/CodeGen/Mips/brconeq.ll
@@ -8,11 +8,11 @@
 entry:
   %0 = load i32, i32* @i, align 4
   %1 = load i32, i32* @j, align 4
-  %cmp = icmp eq i32 %0, %1
+  %cmp = icmp ne i32 %0, %1
 ; 16:	cmp	${{[0-9]+}}, ${{[0-9]+}}
 ; 16:	bteqz	$[[LABEL:[0-9A-Ba-b_]+]]
 ; 16: $[[LABEL]]:
-  br i1 %cmp, label %if.end, label %if.then
+  br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
   store i32 1, i32* @result, align 4
Index: test/CodeGen/Mips/brconeqk.ll
===================================================================
--- test/CodeGen/Mips/brconeqk.ll
+++ test/CodeGen/Mips/brconeqk.ll
@@ -6,8 +6,8 @@
 define void @test() nounwind {
 entry:
   %0 = load i32, i32* @i, align 4
-  %cmp = icmp eq i32 %0, 10
-  br i1 %cmp, label %if.end, label %if.then
+  %cmp = icmp ne i32 %0, 10
+  br i1 %cmp, label %if.then, label %if.end
 ; 16:	cmpi	${{[0-9]+}}, {{[0-9]+}}
 ; 16:	bteqz	$[[LABEL:[0-9A-Ba-b_]+]]
 ; 16: $[[LABEL]]:
Index: test/CodeGen/Mips/brcongt.ll
===================================================================
--- test/CodeGen/Mips/brcongt.ll
+++ test/CodeGen/Mips/brcongt.ll
@@ -9,8 +9,8 @@
 entry:
   %0 = load i32, i32* @i, align 4
   %1 = load i32, i32* @j, align 4
-  %cmp = icmp sgt i32 %0, %1
-  br i1 %cmp, label %if.end, label %if.then
+  %cmp = icmp sle i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.end
 ; 16:	slt	${{[0-9]+}}, ${{[0-9]+}}
 ; 16:	btnez	$[[LABEL:[0-9A-Ba-b_]+]]
 ; 16: $[[LABEL]]:
Index: test/CodeGen/Mips/brconlt.ll
===================================================================
--- test/CodeGen/Mips/brconlt.ll
+++ test/CodeGen/Mips/brconlt.ll
@@ -10,8 +10,8 @@
 entry:
   %0 = load i32, i32* @j, align 4
   %1 = load i32, i32* @i, align 4
-  %cmp = icmp slt i32 %0, %1
-  br i1 %cmp, label %if.end, label %if.then
+  %cmp = icmp sge i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.end
 
 ; 16:     slt   ${{[0-9]+}}, ${{[0-9]+}}
 ; MM32R6: slt   ${{[0-9]+}}, ${{[0-9]+}}
Index: test/CodeGen/Mips/brconnez.ll
===================================================================
--- test/CodeGen/Mips/brconnez.ll
+++ test/CodeGen/Mips/brconnez.ll
@@ -7,7 +7,7 @@
 entry:
   %0 = load i32, i32* @j, align 4
   %cmp = icmp eq i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.end
+  br i1 %cmp, label %if.then, label %if.end, !prof !1
 
 ; 16:	bnez	${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]]
 ; 16:   lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
@@ -21,4 +21,4 @@
   ret void
 }
 
-
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/Mips/micromips-compact-branches.ll
===================================================================
--- test/CodeGen/Mips/micromips-compact-branches.ll
+++ test/CodeGen/Mips/micromips-compact-branches.ll
@@ -6,7 +6,7 @@
   %x = alloca i32, align 4
   %0 = load i32, i32* %x, align 4
   %cmp = icmp eq i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.end
+  br i1 %cmp, label %if.then, label %if.end, !prof !1
 
 if.then:
   store i32 10, i32* %x, align 4
@@ -17,3 +17,4 @@
 }
 
 ; CHECK: bnezc
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/PowerPC/misched-inorder-latency.ll
===================================================================
--- test/CodeGen/PowerPC/misched-inorder-latency.ll
+++ test/CodeGen/PowerPC/misched-inorder-latency.ll
@@ -17,7 +17,7 @@
   %sum1 = add i32 %sumin, 1
   %val1 = load i32, i32* %ptr
   %p = icmp eq i32 %sumin, 0
-  br i1 %p, label %true, label %end
+  br i1 %p, label %true, label %end, !prof !1
 true:
   %sum2 = add i32 %sum1, 1
   %ptr2 = getelementptr i32, i32* %ptr, i32 1
@@ -53,3 +53,5 @@
   ret i32 %valmerge
 }
 declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/PowerPC/tail-dup-break-cfg.ll
===================================================================
--- /dev/null
+++ test/CodeGen/PowerPC/tail-dup-break-cfg.ll
@@ -0,0 +1,97 @@
+; RUN: llc -O2 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Intended layout:
+; The code for tail-duplication during layout will produce the layout:
+; test1
+; test2
+; body1 (with copy of test2)
+; body2
+; exit
+
+;CHECK-LABEL: tail_dup_break_cfg:
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: b [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: [[BODY1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[EXITLABEL]]
+;CHECK-NEXT: [[BODY2LABEL]]
+;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+define void @tail_dup_break_cfg(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely
+body1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %exit, label %body2, !prof !1 ; %exit more likely
+body2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %exit
+exit:
+  ret void
+}
+
+; The branch weights here hint that we shouldn't tail duplicate in this case.
+;CHECK-LABEL: tail_dup_dont_break_cfg:
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 4, 1, [[TEST2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %body1
+;CHECK: [[TEST2LABEL]]: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %body2
+;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+define void @tail_dup_dont_break_cfg(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely
+body1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp ne i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %body2, label %exit, !prof !1 ; %body2 more likely
+body2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %exit
+exit:
+  ret void
+}
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
+
+!1 = !{!"branch_weights", i32 5, i32 3}
Index: test/CodeGen/PowerPC/tail-dup-layout.ll
===================================================================
--- test/CodeGen/PowerPC/tail-dup-layout.ll
+++ test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -19,7 +19,7 @@
 ; The CHECK statements check for the whole string of tests and exit block,
 ; and then check that the correct test has been duplicated into the end of
 ; the optional blocks and that the optional blocks are in the correct order.
-;CHECK-LABEL: f:
+;CHECK-LABEL: straight_test:
 ; test1 may have been merged with entry
 ;CHECK: mr [[TAGREG:[0-9]+]], 3
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
@@ -47,7 +47,7 @@
 ;CHECK-NEXT: [[OPT4LABEL]]
 ;CHECK: b [[EXITLABEL]]
 
-define void @f(i32 %tag) {
+define void @straight_test(i32 %tag) {
 entry:
   br label %test1
 test1:
@@ -94,7 +94,57 @@
   ret void
 }
 
+; The block then2 is not unavoidable, but since it can be tail-duplicated, it
+; should be placed as a fallthrough from test2 and copied.
+; CHECK-LABEL: avoidable_test:
+; CHECK: # %entry
+; CHECK: andi.
+; CHECK: # %test2
+; Make sure then2 falls through from test2
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %then2
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %end2
+; CHECK: # %else1
+; CHECK: bl a
+; CHECK: bl a
+; Make sure then2 was copied into else1
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %else2
+; CHECK: bl c
+define void @avoidable_test(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
+else1:
+  call void @a()
+  call void @a()
+  br label %then2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
+then2:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
+else2:
+  call void @c()
+  br label %end2
+end2:
+  ret void
+end1:
+  call void @d()
+  ret void
+}
+
 declare void @a()
 declare void @b()
 declare void @c()
 declare void @d()
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/SPARC/sjlj.ll
===================================================================
--- test/CodeGen/SPARC/sjlj.ll
+++ test/CodeGen/SPARC/sjlj.ll
@@ -66,14 +66,15 @@
 ; CHECK:  ba   .LBB1_1
 ; CHECK:  nop
 ; CHECK:.LBB1_1:                                ! %entry
-; CHECK:  ba   .LBB1_3
 ; CHECK:  mov  %g0, %i0
+; CHECK:  cmp %i0, 0
+; CHECK:  bne  .LBB1_4
+; CHECK:  ba   .LBB1_5
 ; CHECK:.LBB1_2:                                ! Block address taken
 ; CHECK:  mov  1, %i0
-; CHECK:.LBB1_3:                                ! %entry
-; CHECK:  cmp %i0, 0
 ; CHECK:  be   .LBB1_5
-; CHECK:  nop
+; CHECK:.LBB1_4:
+; CHECK:  ba   .LBB1_6
 }
 declare i8* @llvm.frameaddress(i32) #2
 
Index: test/CodeGen/SystemZ/int-cmp-37.ll
===================================================================
--- test/CodeGen/SystemZ/int-cmp-37.ll
+++ test/CodeGen/SystemZ/int-cmp-37.ll
@@ -15,8 +15,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i32
-  %cond = icmp ult i32 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp uge i32 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i32 %src1, %src1
   br label %exit
@@ -34,8 +34,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i32
-  %cond = icmp slt i32 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp sge i32 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i32 %src1, %src1
   br label %exit
@@ -54,8 +54,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i32
-  %cond = icmp eq i32 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp ne i32 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i32 %src1, %src1
   br label %exit
@@ -74,8 +74,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i32
-  %cond = icmp ne i32 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp eq i32 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i32 %src1, %src1
   br label %exit
@@ -95,8 +95,8 @@
 entry:
   %val = load i16 , i16 *@h, align 1
   %src2 = zext i16 %val to i32
-  %cond = icmp ult i32 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp uge i32 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i32 %src1, %src1
   br label %exit
@@ -115,8 +115,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src1 = zext i16 %val to i32
-  %cond = icmp ult i32 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp uge i32 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i32 %src2, %src2
   br label %exit
Index: test/CodeGen/SystemZ/int-cmp-40.ll
===================================================================
--- test/CodeGen/SystemZ/int-cmp-40.ll
+++ test/CodeGen/SystemZ/int-cmp-40.ll
@@ -15,8 +15,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i64
-  %cond = icmp ult i64 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp uge i64 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i64 %src1, %src1
   br label %exit
@@ -54,8 +54,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i64
-  %cond = icmp eq i64 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp ne i64 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i64 %src1, %src1
   br label %exit
@@ -74,8 +74,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src2 = zext i16 %val to i64
-  %cond = icmp ne i64 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp eq i64 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i64 %src1, %src1
   br label %exit
@@ -95,8 +95,8 @@
 entry:
   %val = load i16 , i16 *@h, align 1
   %src2 = zext i16 %val to i64
-  %cond = icmp ult i64 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp uge i64 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i64 %src1, %src1
   br label %exit
@@ -115,8 +115,8 @@
 entry:
   %val = load i16 , i16 *@g
   %src1 = zext i16 %val to i64
-  %cond = icmp ult i64 %src1, %src2
-  br i1 %cond, label %exit, label %mulb
+  %cond = icmp uge i64 %src1, %src2
+  br i1 %cond, label %mulb, label %exit
 mulb:
   %mul = mul i64 %src2, %src2
   br label %exit
Index: test/CodeGen/SystemZ/int-cmp-44.ll
===================================================================
--- test/CodeGen/SystemZ/int-cmp-44.ll
+++ test/CodeGen/SystemZ/int-cmp-44.ll
@@ -473,8 +473,8 @@
   %xor = xor i32 %val, 1
   %add = add i32 %xor, 1000000
   call void @foo()
-  %cmp = icmp ne i32 %add, 0
-  br i1 %cmp, label %exit, label %store
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %store, label %exit, !prof !1
 
 store:
   store i32 %add, i32 *%ptr
@@ -888,3 +888,5 @@
 exit:
   ret i64 %res
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/Thumb/thumb-shrink-wrapping.ll
===================================================================
--- test/CodeGen/Thumb/thumb-shrink-wrapping.ll
+++ test/CodeGen/Thumb/thumb-shrink-wrapping.ll
@@ -1,11 +1,12 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T
-; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T
-; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T
-; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T
+
 ;
 ; Note: Lots of tests use inline asm instead of regular calls.
 ; This allows to have a better control on what the allocation will do.
@@ -15,6 +16,8 @@
 ; edges.
 ; Also disable the late if-converter as it makes harder to reason on
 ; the diffs.
+; Disable tail-duplication during placement, as v4t vs v5t get different
+; results due to branches not being analyzable under v5
 
 ; Initial motivating example: Simple diamond with a call just on one side.
 ; CHECK-LABEL: foo:
Index: test/CodeGen/Thumb2/cbnz.ll
===================================================================
--- test/CodeGen/Thumb2/cbnz.ll
+++ test/CodeGen/Thumb2/cbnz.ll
@@ -26,7 +26,7 @@
   call void @x()
   call void @x()
   call void @x()
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %q = icmp eq i32 %y, 0
   br i1 %q, label %t2, label %f
 
Index: test/CodeGen/Thumb2/ifcvt-compare.ll
===================================================================
--- test/CodeGen/Thumb2/ifcvt-compare.ll
+++ test/CodeGen/Thumb2/ifcvt-compare.ll
@@ -4,7 +4,7 @@
 
 define void @f0(i32 %x) optsize {
   ; CHECK-LABEL: f0:
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %p = icmp eq i32 %x, 0
   br i1 %p, label %t, label %f
 
Index: test/CodeGen/Thumb2/v8_IT_4.ll
===================================================================
--- test/CodeGen/Thumb2/v8_IT_4.ll
+++ test/CodeGen/Thumb2/v8_IT_4.ll
@@ -12,10 +12,11 @@
 
 define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) {
 ; CHECK-LABEL: _ZNKSs7compareERKSs:
-; CHECK:      cbnz	r0,
+; CHECK:      cbz	r0,
+; CHECK-NEXT: %bb1
+; CHECK-NEXT: pop.w
 ; CHECK-NEXT: %bb
 ; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: %bb1
 ; CHECK-NEXT: pop.w
 entry:
   %0 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
Index: test/CodeGen/WebAssembly/phi.ll
===================================================================
--- test/CodeGen/WebAssembly/phi.ll
+++ test/CodeGen/WebAssembly/phi.ll
@@ -8,8 +8,9 @@
 ; Basic phi triangle.
 
 ; CHECK-LABEL: test0:
-; CHECK: div_s $[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
-; CHECK: return $[[NUM0]]{{$}}
+; CHECK: return $0
+; CHECK: div_s $push[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
+; CHECK: return $pop[[NUM0]]{{$}}
 define i32 @test0(i32 %p) {
 entry:
   %t = icmp slt i32 %p, 0
Index: test/CodeGen/X86/2008-11-29-ULT-Sign.ll
===================================================================
--- test/CodeGen/X86/2008-11-29-ULT-Sign.ll
+++ test/CodeGen/X86/2008-11-29-ULT-Sign.ll
@@ -4,8 +4,8 @@
 
 define i32 @a(i32 %x) nounwind {
 entry:
-	%cmp = icmp ult i32 %x, -2147483648		; <i1> [#uses=1]
-	br i1 %cmp, label %if.end, label %if.then
+	%cmp = icmp uge i32 %x, -2147483648		; <i1> [#uses=1]
+	br i1 %cmp, label %if.then, label %if.end
 
 if.then:		; preds = %entry
 	%call = call i32 (...) @b()		; <i32> [#uses=0]
Index: test/CodeGen/X86/add.ll
===================================================================
--- test/CodeGen/X86/add.ll
+++ test/CodeGen/X86/add.ll
@@ -30,7 +30,8 @@
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %sum = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %normal
+  %notobit = xor i1 1, %obit
+  br i1 %notobit, label %normal, label %overflow
 
 normal:
   store i32 0, i32* %X
@@ -53,7 +54,8 @@
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %sum = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %carry, label %normal
+  %notobit = xor i1 1, %obit
+  br i1 %notobit, label %normal, label %carry
 
 normal:
   store i32 0, i32* %X
Index: test/CodeGen/X86/avx512-cmp.ll
===================================================================
--- test/CodeGen/X86/avx512-cmp.ll
+++ test/CodeGen/X86/avx512-cmp.ll
@@ -69,13 +69,14 @@
 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vucomiss %xmm1, %xmm0
 ; ALL-NEXT:    jne LBB3_1
-; ALL-NEXT:    jnp LBB3_2
+; ALL-NEXT:    jp  LBB3_1
+; ALL-NEXT:  ## BB#2: ## %return
+; ALL-NEXT:    retq
 ; ALL-NEXT:  LBB3_1: ## %if.end
 ; ALL-NEXT:    seta %al
 ; ALL-NEXT:    movzbl %al, %eax
 ; ALL-NEXT:    leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:  LBB3_2: ## %return
 ; ALL-NEXT:    retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
Index: test/CodeGen/X86/bt.ll
===================================================================
--- test/CodeGen/X86/bt.ll
+++ test/CodeGen/X86/bt.ll
@@ -49,7 +49,7 @@
   %tmp29 = lshr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1
 
 bb:
   call void @foo()
@@ -89,7 +89,7 @@
   %tmp29 = ashr i32 %x, %n
   %tmp3 = and i32 1, %tmp29
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1
 
 bb:
   call void @foo()
@@ -109,7 +109,7 @@
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %tmp29, %x
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1
 
 bb:
   call void @foo()
@@ -129,7 +129,7 @@
   %tmp29 = shl i32 1, %n
   %tmp3 = and i32 %x, %tmp29
   %tmp4 = icmp eq i32 %tmp3, 0
-  br i1 %tmp4, label %bb, label %UnifiedReturnBlock
+  br i1 %tmp4, label %bb, label %UnifiedReturnBlock, !prof !1
 
 bb:
   call void @foo()
@@ -608,3 +608,5 @@
   %tobool = icmp ne i64 %and1, 0
   ret i1 %tobool
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/X86/fp-une-cmp.ll
===================================================================
--- test/CodeGen/X86/fp-une-cmp.ll
+++ test/CodeGen/X86/fp-une-cmp.ll
@@ -36,8 +36,8 @@
 
 entry:
   %mul = fmul double %x, %y
-  %cmp = fcmp une double %mul, 0.000000e+00
-  br i1 %cmp, label %bb2, label %bb1
+  %cmp = fcmp oeq double %mul, 0.000000e+00
+  br i1 %cmp, label %bb1, label %bb2
 
 bb1:
   %add = fadd double %mul, -1.000000e+00
Index: test/CodeGen/X86/jump_sign.ll
===================================================================
--- test/CodeGen/X86/jump_sign.ll
+++ test/CodeGen/X86/jump_sign.ll
@@ -6,7 +6,7 @@
 ; CHECK: jns
 	%tmp1 = add i32 %X, 1		; <i32> [#uses=1]
 	%tmp = icmp slt i32 %tmp1, 0		; <i1> [#uses=1]
-	br i1 %tmp, label %cond_true, label %cond_next
+	br i1 %tmp, label %cond_true, label %cond_next, !prof !1
 
 cond_true:		; preds = %entry
 	%tmp2 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
@@ -303,3 +303,5 @@
 if.end:
   ret i32 undef
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
Index: test/CodeGen/X86/machine-cse.ll
===================================================================
--- test/CodeGen/X86/machine-cse.ll
+++ test/CodeGen/X86/machine-cse.ll
@@ -86,8 +86,8 @@
 ; CHECK-LABEL: cross_mbb_phys_cse:
 ; CHECK: cmpl
 ; CHECK: ja
-  %cmp = icmp ugt i32 %a, %b
-  br i1 %cmp, label %return, label %if.end
+  %cmp = icmp ule i32 %a, %b
+  br i1 %cmp, label %if.end, label %return
 
 if.end:                                           ; preds = %entry
 ; CHECK-NOT: cmpl
Index: test/CodeGen/X86/testb-je-fusion.ll
===================================================================
--- test/CodeGen/X86/testb-je-fusion.ll
+++ test/CodeGen/X86/testb-je-fusion.ll
@@ -9,7 +9,7 @@
 entry:
   %and = and i32 %flags, 512
   %tobool = icmp eq i32 %and, 0
-  br i1 %tobool, label %if.end, label %if.then
+  br i1 %tobool, label %if.end, label %if.then, !prof !1
 
 if.then:
   br label %if.end
@@ -18,3 +18,4 @@
   %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ]
   ret i32 %hasflag
 }
+!1 = !{!"branch_weights", i32 1, i32 2}