Index: llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp
+++ llvm/trunk/lib/Transforms/Scalar/JumpThreading.cpp
@@ -1289,6 +1289,33 @@
   if (PredToDestList.empty())
     return false;
 
+  // If all the predecessors go to a single known successor, we want to fold,
+  // not thread. By doing so, we do not need to duplicate the current block and
+  // also miss potential opportunities in case we don't/can't duplicate.
+  if (OnlyDest && OnlyDest != MultipleDestSentinel) {
+    if (PredToDestList.size() ==
+        (size_t)std::distance(pred_begin(BB), pred_end(BB))) {
+      for (BasicBlock *SuccBB : successors(BB)) {
+        if (SuccBB != OnlyDest)
+          SuccBB->removePredecessor(BB, true); // This is unreachable successor.
+      }
+
+      // Finally update the terminator.
+      TerminatorInst *Term = BB->getTerminator();
+      BranchInst::Create(OnlyDest, Term);
+      Term->eraseFromParent();
+
+      // If the condition is now dead due to the removal of the old terminator,
+      // erase it.
+      auto *CondInst = dyn_cast<Instruction>(Cond);
+      if (CondInst && CondInst->use_empty())
+        CondInst->eraseFromParent();
+      // FIXME: in case this instruction is defined in the current BB and it
+      // resolves to a single value from all predecessors, we can do RAUW.
+      return true;
+    }
+  }
+
   // Determine which is the most common successor.  If we have many inputs and
   // this block is a switch, we want to start by threading the batch that goes
   // to the most popular destination first.  If we only know about one
Index: llvm/trunk/test/Transforms/JumpThreading/basic.ll
===================================================================
--- llvm/trunk/test/Transforms/JumpThreading/basic.ll
+++ llvm/trunk/test/Transforms/JumpThreading/basic.ll
@@ -4,6 +4,95 @@
 declare i32 @f2()
 declare void @f3()
 
+; Make sure we can fold this branch ... We will not be able to thread it as
+; L0 is too big to duplicate. L2 is the unreachable block here.
+;
+; CHECK-LABEL: @test_br_folding_not_threading(
+; CHECK: L1:
+; CHECK: call i32 @f2()
+; CHECK: call void @f3()
+; CHECK-NEXT: ret void
+; CHECK-NOT: br
+; CHECK: L3:
+define void @test_br_folding_not_threading(i1 %cond) nounwind {
+entry:
+  br i1 %cond, label %L0, label %L3 
+L0:
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  br i1 %cond, label %L1, label %L2 
+
+L1:
+  call void @f3()
+  ret void
+L2:
+  call void @f3()
+  ret void
+L3:
+  call void @f3()
+  ret void
+}
+
+
+; Make sure we can fold this branch ... We will not be able to thread it as
+; L0 is too big to duplicate. L2 is the unreachable block here.
+; With more than 1 predecessors.
+;
+; CHECK-LABEL: @test_br_folding_not_threading_multiple_preds(
+; CHECK: L1:
+; CHECK: call i32 @f2()
+; CHECK: call void @f3()
+; CHECK-NEXT: ret void
+; CHECK-NOT: br
+; CHECK: L3:
+define void @test_br_folding_not_threading_multiple_preds(i1 %condx, i1 %cond) nounwind {
+entry:
+  br i1 %condx, label %X0, label %X1
+
+X0:
+  br i1 %cond, label %L0, label %L3 
+
+X1:
+  br i1 %cond, label %L0, label %L3 
+
+L0:
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  call i32 @f2()
+  br i1 %cond, label %L1, label %L2 
+
+L1:
+  call void @f3()
+  ret void
+L2:
+  call void @f3()
+  ret void
+L3:
+  call void @f3()
+  ret void
+}
+
 define i32 @test1(i1 %cond) {
 ; CHECK-LABEL: @test1(