Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -98,6 +98,10 @@
     SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
                cl::desc("Sink common instructions down to the end block"));
 
+static cl::opt<bool> DedupBB(
+    "simplifycfg-dedup-bb", cl::Hidden, cl::init(true),
+    cl::desc("Merge duplicated basic blocks into one"));
+
 static cl::opt<bool> HoistCondStores(
     "simplifycfg-hoist-cond-stores", cl::Hidden, cl::init(true),
     cl::desc("Hoist conditional stores if an unconditional store precedes"));
@@ -132,6 +136,7 @@
     NumLookupTablesHoles,
     "Number of switch instructions turned into lookup tables (holes checked)");
 STATISTIC(NumTableCmpReuses, "Number of reused switch table lookup compares");
+STATISTIC(NumDedupBBs, "Number of duplicated basic blocks merged");
 STATISTIC(NumSinkCommons,
           "Number of common instructions sunk down to the end block");
 STATISTIC(NumSpeculations, "Number of speculative executed instructions");
@@ -1676,6 +1681,146 @@
 
 } // end anonymous namespace
 
+static unsigned getNumNonDbgInstrInBB(BasicBlock *BB) {
+  unsigned count = 0;
+  for (Instruction &Instr : *BB)
+    if (!isa<DbgInfoIntrinsic>(Instr)) count++;
+  return count;
+}
+
+/// Given an unconditional branch that goes to BBSucc,
+/// check whether BBSucc has two exactly same predecessors.
+/// If it is true, merge these BBs into one BB.
+static bool MergeDuplicatedBlock(BranchInst *BI, const SmallPtrSetImpl<BasicBlock *> *LoopHeaders) {
+  assert(BI->isUnconditional());
+
+  // For this input CFG, we merge BB1 and BB2 if possible.
+  // This optimization is applied only in simple (but common) situations.
+  // e.g. we don't merge BBs if we need to add a new PHI node in BBsucc.
+  // Also, we don't optimize BBs around loops to avoid costly dominator analysis.
+  //
+  // input CFG:
+  //  [BB0] [other BBs]
+  //     \     |
+  //    [BB1][BB2] [other BBs]
+  //       \   |    /
+  //      [  BBsucc  ]
+  //       /   |   \
+  //
+  // optimized CFG:
+  //  [BB0] [other BBs]
+  //     \     |
+  //      ---[BB2] [other BBs]
+  //           |    /
+  //      [  BBsucc  ]
+  //       /   |   \
+  //
+  // For the edge(s) from (potentially multiple) BB0 to BB1,
+  // we support only conditional or unconditional branch instrcutions.
+  // For the edge from BB1 to BBsucc and BB2 to BBsucc,
+  // we support only unconditional branch instrcutions.
+
+  BasicBlock *BB1 = BI->getParent();
+  BasicBlock *BBSucc = BI->getSuccessor(0);
+  unsigned BB1NumInst = getNumNonDbgInstrInBB(BB1);
+
+  // We do not optimize the entry block
+  if (BB1 == &BB1->getParent()->getEntryBlock()) return false;
+
+  // An empty block will be eliminated anyway
+  if (BB1NumInst <= 1) return false;
+
+  // We avoid optimizing a method with a loop to avoid excissive analysis cost
+  // for other than blocks just before the return block (a common case).
+  if (!(LoopHeaders && LoopHeaders->empty()) &&
+     !isa<ReturnInst>(BBSucc->getTerminator())) return false;
+
+  BasicBlock::iterator II = BBSucc->begin();
+  const PHINode *PN = dyn_cast<PHINode>(II);
+  Value *InValBB1;
+  Instruction *InInstBB1;
+  if (PN) {
+    // We do not optimize if multiple PHI instructions exist
+    // in the successor for ease of analysis
+    if (++II != BBSucc->end() && isa<PHINode>(II)) return false;
+
+    InValBB1 = PN->getIncomingValueForBlock(BB1);
+    InInstBB1 = dyn_cast<Instruction>(InValBB1);
+  }
+
+  // We do not optimize non-branch CFG edge (e.g. switch)
+  for (auto *B : predecessors(BB1))
+    if (!isa<BranchInst>(B->getTerminator())) return false;
+
+  for (auto *BB2 : predecessors(BBSucc)) {
+    if (BB2 == BB1) continue;
+
+    // We do not optimize the entry block
+    if (BB2 == &BB2->getParent()->getEntryBlock()) continue;
+
+    // We only merge CFG edges of unconditional branch
+    BranchInst *BI = dyn_cast<BranchInst>(BB2->getTerminator());
+    if (!(BI && BI->isUnconditional())) continue;
+
+    // We do not optimize non-branch CFG edges
+    for (auto *B : predecessors(BB2))
+      if (!isa<BranchInst>(B->getTerminator())) continue;
+
+    // We can merge control flow if incomming values to the PHI node
+    // at the succesor are same values or both defined in the BBs to merge.
+    // For the latter case, canSinkInstructions executes further analysis
+    if (PN) {
+      Value *InValBB2 = PN->getIncomingValueForBlock(BB2);
+      Instruction *InInstBB2 = dyn_cast<Instruction>(InValBB2);
+      if (!(InValBB1 == InValBB2) &&
+          !(InInstBB1 && InInstBB1->getParent() == BB1 &&
+            InInstBB2 && InInstBB2->getParent() == BB2)) continue;
+    }
+
+    // If the sizes of BBs are different, we do not need to check in detail
+    if (BB1NumInst != getNumNonDbgInstrInBB(BB2)) continue;
+
+    // We check the instructions in two BBs are same
+    SmallVector<BasicBlock*,2> UnconditionalPreds({BB1, BB2});
+    DenseMap<Instruction*, SmallVector<Value*,4>> PHIOperands;
+    LockstepReverseIterator LRI(UnconditionalPreds);
+    while (LRI.isValid() &&
+          canSinkInstructions(*LRI, PHIOperands)) {
+      // we do not optimize if additional PHI node is required
+      if (PHIOperands.size() > 0) break;
+      --LRI;
+    }
+    // if iterator is still valid, it means a mismatch found in middle of BB
+    if (LRI.isValid()) continue;
+
+    BasicBlock *BBToErase = BB1;
+    BasicBlock *BBToRetain = BB2;
+    DEBUG(dbgs() << "DEDUP BB: merging duplicated blocks (" << BBToErase->getName()
+                 << " into " << BBToRetain->getName() << ")\n");
+
+    // We replace the destination of incomming edges of BBToErase by BBToRetain
+    SmallVector<BasicBlock *, 8> BBToUpdate(predecessors(BBToErase));
+    for (BasicBlock *BB0 : BBToUpdate) {
+      TerminatorInst *T = BB0->getTerminator();
+      bool updated = false;
+      for (unsigned OI = 0, OE = T->getNumOperands(); OI != OE; ++OI) {
+        if (T->getOperand(OI) == BBToErase) {
+          T->setOperand(OI, BBToRetain);
+          updated = true;
+        }
+      }
+      assert(updated && "CFG and branch instruction are inconsistent");
+    }
+
+    // Now we can remove the BB, which has no incomming edge
+    DeleteDeadBlock(BBToErase);
+    NumDedupBBs++;
+    return true;
+  }
+
+  return false;
+}
+
 /// Given an unconditional branch that goes to BBEnd,
 /// check whether BBEnd has only two predecessors and the other predecessor
 /// ends with an unconditional branch. If it is true, sink any common code
@@ -5691,6 +5836,9 @@
                                           IRBuilder<> &Builder) {
   BasicBlock *BB = BI->getParent();
 
+  if (DedupBB && MergeDuplicatedBlock(BI, LoopHeaders))
+    return true;
+
   if (SinkCommon && SinkThenElseCodeToEnd(BI))
     return true;
 
Index: test/Transforms/SimplifyCFG/dedup-bb.ll
===================================================================
--- /dev/null
+++ test/Transforms/SimplifyCFG/dedup-bb.ll
@@ -0,0 +1,115 @@
+; RUN: opt < %s -simplifycfg -S | FileCheck %s
+
+define i64 @test(i64 %a) {
+; in this test, if.then1, if.then2 and if.then3 must be merged into one block
+; CHECK-LABEL: test
+; CHECK: call i64 @foo
+; CHECK-NOT: call i64 @foo
+entry:
+  %cmp1 = icmp eq i64 %a, 1
+  br i1 %cmp1, label %if.then1, label %if.else1
+
+if.else1:
+  store i64 1, i64* @g
+  %cmp2 = icmp eq i64 %a, 2
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.else2:
+  store i64 2, i64* @g
+  %cmp3 = icmp eq i64 %a, 3
+  br i1 %cmp3, label %if.then3, label %return
+
+return:
+  %retval = phi i64 [ %call1, %if.then1 ], [ %call2, %if.then2 ], [ %call3, %if.then3 ], [ 0, %if.else2 ]
+  ret i64 %retval
+
+if.then1:
+  %call1 = tail call i64 @foo(i64 %a) #2
+  br label %return
+
+if.then2:
+  %call2 = tail call i64 @foo(i64 %a) #2
+  br label %return
+
+if.then3:
+  %call3 = tail call i64 @foo(i64 %a) #2
+  br label %return
+}
+
+define i64 @test2(i64 %a) {
+; in this test, if.then3 should not be merged due to difference in instruction
+; CHECK-LABEL: test2
+; CHECK-DAG: call i64 @foo
+; CHECK-DAG: call i64 @bar
+; CHECK-NOT: call i64 @foo
+entry:
+  %cmp1 = icmp eq i64 %a, 1
+  br i1 %cmp1, label %if.then1, label %if.else1
+
+if.else1:
+  store i64 1, i64* @g
+  %cmp2 = icmp eq i64 %a, 2
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.else2:
+  store i64 2, i64* @g
+  %cmp3 = icmp eq i64 %a, 3
+  br i1 %cmp3, label %if.then3, label %return
+
+return:
+  %retval = phi i64 [ %call1, %if.then1 ], [ %call2, %if.then2 ], [ %call3, %if.then3 ], [ 0, %if.else2 ]
+  ret i64 %retval
+
+if.then1:
+  %call1 = tail call i64 @foo(i64 %a) #2
+  br label %return
+
+if.then2:
+  %call2 = tail call i64 @foo(i64 %a) #2
+  br label %return
+
+if.then3:
+  %call3 = tail call i64 @bar(i64 %a) #2
+  br label %return
+}
+
+define i64 @test3(i64 %a) {
+; in this test, if.then2 should not be merged due to difference in PHI node at the return block
+; CHECK-LABEL: test3
+; CHECK: call i64 @foo
+; CHECK: call i64 @foo
+; CHECK-NOT: call i64 @foo
+entry:
+  %cmp1 = icmp eq i64 %a, 1
+  br i1 %cmp1, label %if.then1, label %if.else1
+
+if.else1:
+  store i64 1, i64* @g
+  %cmp2 = icmp eq i64 %a, 2
+  br i1 %cmp2, label %if.then2, label %if.else2
+
+if.else2:
+  store i64 2, i64* @g
+  %cmp3 = icmp eq i64 %a, 3
+  br i1 %cmp3, label %if.then3, label %return
+
+return:
+  %retval = phi i64 [ %call1, %if.then1 ], [ 0, %if.then2 ], [ %call3, %if.then3 ], [ 0, %if.else2 ]
+  ret i64 %retval
+
+if.then1:
+  %call1 = tail call i64 @foo(i64 %a) #2
+  br label %return
+
+if.then2:
+  %call2 = tail call i64 @foo(i64 %a) #2
+  br label %return
+
+if.then3:
+  %call3 = tail call i64 @foo(i64 %a) #2
+  br label %return
+}
+
+declare i64 @foo(i64)
+declare i64 @bar(i64)
+@g = global i64 0