Index: lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- lib/Transforms/Utils/SimplifyCFG.cpp
+++ lib/Transforms/Utils/SimplifyCFG.cpp
@@ -1242,11 +1242,8 @@
 /// guarantees that BI's block dominates BB1 and BB2.
 static bool HoistThenElseCodeToIf(BranchInst *BI,
                                   const TargetTransformInfo &TTI) {
-  // This does very trivial matching, with limited scanning, to find identical
-  // instructions in the two blocks.  In particular, we don't want to get into
-  // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
-  // such, we currently just scan for obviously identical instructions in an
-  // identical order.
+  // This does matching to find identical instructions in the two blocks.
+
   BasicBlock *BB1 = BI->getSuccessor(0);  // The true destination.
   BasicBlock *BB2 = BI->getSuccessor(1);  // The false destination
 
@@ -1263,7 +1260,57 @@
     while (isa<DbgInfoIntrinsic>(I2))
       I2 = &*BB2_Itr++;
   }
-  if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2) ||
+
+  // We try to match instructions in the two basic blocks.
+  // To avoid excessive cost for matching, we limit the range to scan
+  // to at most SCAN_LIMIT instructions.
+  const unsigned SCAN_LIMIT = 10;
+
+  // If we hoist instructions other than the first one in the block,
+  // we remember what instructions we skipped for the later check.
+  bool SkippedInst = false, SkippedStore = false, SkippedLoad = false;
+  bool Found = false;
+  BasicBlock::iterator TmpItr1(I1);
+  for (unsigned Count1 = 0;
+       TmpItr1 != BB1->end() && Count1 < SCAN_LIMIT; Count1++) {
+    Instruction *TmpI1 = &*TmpItr1++;
+    BasicBlock::iterator TmpItr2(I2);
+    for (unsigned Count2 = 0;
+         TmpItr2 != BB2->end() && Count2 < SCAN_LIMIT; Count2++) {
+      Instruction *TmpI2 = &*TmpItr2++;
+      if (TmpI1->isIdenticalToWhenDefined(TmpI2)) {
+        Found = true;
+        if (Count1 || Count2) {
+          SkippedInst = true;
+
+          // Advance the iterators over the skipped instructions.
+          // We also check memory accesses by the skipped instructions.
+          for (unsigned I = 0; I < Count1; I++) {
+            if (I1->mayWriteToMemory()) SkippedStore = true;
+            if (I1->mayReadFromMemory()) SkippedLoad = true;
+            I1 = &*BB1_Itr++;
+          }
+          for (unsigned I = 0; I < Count2; I++) {
+            if (I2->mayWriteToMemory()) SkippedStore = true;
+            if (I2->mayReadFromMemory()) SkippedLoad = true;
+            I2 = &*BB2_Itr++;
+          }
+        }
+        break;
+      }
+      // We do not move instructions over an atomic op (e.g. fence) or a call.
+      if (TmpI2->isAtomic() || isa<CallInst>(TmpI2) || isa<InvokeInst>(TmpI2))
+        break;
+    }
+    if (Found ||
+        TmpI1->isAtomic() || isa<CallInst>(TmpI1) || isa<InvokeInst>(TmpI1))
+      break;
+  }
+
+  if (!Found)
+    return false;
+
+  if (isa<PHINode>(I1) ||
       (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2)))
     return false;
 
@@ -1276,6 +1323,12 @@
   if (isa<TerminatorInst>(I1))
     goto HoistTerminator;
 
+  // We don't hoist a load or store instruction over potentially conflicting
+  // memory accesses.  TODO: refine based on alias analysis.
+  if ((I1->mayReadFromMemory() && SkippedStore) ||
+      (I1->mayWriteToMemory() && (SkippedStore || SkippedLoad)))
+    return Changed;
+
   // If we're going to hoist a call, make sure that the two instructions we're
   // commoning/hoisting are both marked with musttail, or neither of them is
   // marked as such. Otherwise, we might end up in a situation where we hoist
@@ -1336,6 +1389,11 @@
   if (isa<InvokeInst>(I1) && !isSafeToHoistInvoke(BB1, BB2, I1, I2))
     return Changed;
 
+  // If we have remaining instructions in a BB due to skipping,
+  // we cannot hoist the branch instruction.
+  if (SkippedInst)
+    return Changed;
+
   for (BasicBlock *Succ : successors(BB1)) {
     for (PHINode &PN : Succ->phis()) {
       Value *BB1V = PN.getIncomingValueForBlock(BB1);
Index: test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
===================================================================
--- test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
+++ test/Transforms/SimplifyCFG/phi-undef-loadstore.ll
@@ -1,5 +1,6 @@
 ; RUN: opt -simplifycfg -S < %s | FileCheck %s
 
+declare void @foo() nounwind
 declare void @bar() nounwind
 
 define i32 @test1(i32* %a, i32 %b, i32* %c, i32 %d) nounwind {
@@ -8,7 +9,7 @@
   br i1 %tobool, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void @bar() nounwind
+  tail call void @foo() nounwind
   br label %if.end7
 
 if.else:                                          ; preds = %entry
@@ -37,7 +38,7 @@
   br i1 %tobool, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void @bar() nounwind
+  tail call void @foo() nounwind
   br label %if.end7
 
 if.else:                                          ; preds = %entry
@@ -65,7 +66,7 @@
   br i1 %tobool, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void @bar() nounwind
+  tail call void @foo() nounwind
   br label %if.end7
 
 if.else:                                          ; preds = %entry
@@ -92,7 +93,7 @@
   br i1 %tobool, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
-  tail call void @bar() nounwind
+  tail call void @foo() nounwind
   br label %if.end7
 
 if.else:                                          ; preds = %entry
Index: test/Transforms/SimplifyCFG/skip.ll
===================================================================
--- /dev/null
+++ test/Transforms/SimplifyCFG/skip.ll
@@ -0,0 +1,190 @@
+; RUN: opt -simplifycfg -S < %s | FileCheck %s
+
+declare void @foo() nounwind
+
+define i32 @test1(i32 %a, i32 %b, i32* %c) {
+; We expect load i32, i32* %c is hoisted into %entry from %if.then and %if.else
+; CHECK-LABEL: test1
+; CHECK: load i32, i32* %c
+; CHECK-LABEL: if.then
+; CHECK-NOT: load i32, i32* %c
+
+entry:
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %0 = add nsw i32 %a, 1
+  %1 = load i32, i32* %c
+  br label %if.end
+
+if.else:
+  %2 = mul nsw i32 %a, 5
+  %3 = load i32, i32* %c
+  call void @foo() ; to avoid further simplification
+  br label %if.end
+
+if.end:
+  %4 = phi i32 [ %0, %if.then ], [ %2, %if.else ]
+  %5 = phi i32 [ %1, %if.then ], [ %3, %if.else ]
+  %6 = add nsw i32 %4, %5
+  ret i32 %6
+}
+
+define i32 @test2(i32 %a, i32 %b, i32* %c) {
+; We expect load i32, i32* %c is hoisted into %entry from %if.then and %if.else.
+; Then the branch is further optimized using select.
+; CHECK-LABEL: test2
+; CHECK: select
+; CHECK-NOT: if.else
+; CHECK-NOT: if.then
+; CHECK: ret
+
+entry:
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %0 = add nsw i32 %a, 1
+  %1 = load i32, i32* %c
+  br label %if.end
+
+if.else:
+  %2 = mul nsw i32 %a, 5
+  %3 = load i32, i32* %c
+  br label %if.end
+
+if.end:
+  %4 = phi i32 [ %0, %if.then ], [ %2, %if.else ]
+  %5 = phi i32 [ %1, %if.then ], [ %3, %if.else ]
+  %6 = add nsw i32 %4, %5
+  ret i32 %6
+}
+
+define i32 @test3(i32 %a, i32 %b, i32* %c) {
+; We expect load i32, i32* %c is NOT hoisted due to a call instruction
+; CHECK-LABEL: test3
+; CHECK-NOT: load i32, i32* %c
+; CHECK-LABEL: if.then
+; CHECK: load i32, i32* %c
+
+entry:
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %0 = add nsw i32 %a, 1
+  %1 = load i32, i32* %c
+  br label %if.end
+
+if.else:
+  %2 = mul nsw i32 %a, 5
+  call void @foo()
+  %3 = load i32, i32* %c
+  br label %if.end
+
+if.end:
+  %4 = phi i32 [ %0, %if.then ], [ %2, %if.else ]
+  %5 = phi i32 [ %1, %if.then ], [ %3, %if.else ]
+  %6 = add nsw i32 %4, %5
+  ret i32 %6
+}
+
+define i32 @test4(i32 %a, i32 %b, i32* %c) {
+; We expect load i32, i32* %c is NOT hoisted due to a fence instruction
+; CHECK-LABEL: test4
+; CHECK-NOT: load i32, i32* %c
+; CHECK-LABEL: if.then
+; CHECK: load i32, i32* %c
+
+entry:
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  fence acquire
+  %0 = add nsw i32 %a, 1
+  %1 = load i32, i32* %c
+  br label %if.end
+
+if.else:
+  %2 = mul nsw i32 %a, 5
+  %3 = load i32, i32* %c
+  br label %if.end
+
+if.end:
+  %4 = phi i32 [ %0, %if.then ], [ %2, %if.else ]
+  %5 = phi i32 [ %1, %if.then ], [ %3, %if.else ]
+  %6 = add nsw i32 %4, %5
+  ret i32 %6
+}
+
+define i32 @test5(i32 %a, i32 %b, i32* %c) {
+; We expect load i32, i32* %c is NOT hoisted due to a store instruction
+; CHECK-LABEL: test5
+; CHECK-NOT: load i32, i32* %c
+; CHECK-LABEL: if.then
+; CHECK: load i32, i32* %c
+
+entry:
+  %tobool = icmp eq i32 %b, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %0 = add nsw i32 %a, 1
+  store i32 %0, i32* %c
+  %1 = load i32, i32* %c
+  br label %if.end
+
+if.else:
+  %2 = mul nsw i32 %a, 5
+  %3 = load i32, i32* %c
+  br label %if.end
+
+if.end:
+  %4 = phi i32 [ %0, %if.then ], [ %2, %if.else ]
+  %5 = phi i32 [ %1, %if.then ], [ %3, %if.else ]
+  %6 = add nsw i32 %4, %5
+  ret i32 %6
+}
+
+%struct.s = type { i32, i32 }
+
+define signext i32 @func(%struct.s* nocapture readonly %p, i32 signext %f) {
+; Equivalent C code
+; int func(struct s *p, int f) {
+;   int v = 0;
+;   if (f) v = p->a + p->b;
+;   else v = p->b - p->a;
+;   return v;
+; }
+
+; CHECK-LABEL: func
+; CHECK: select
+; CHECK-NOT: if.else
+; CHECK-NOT: if.then
+; CHECK: ret
+entry:
+  %tobool = icmp eq i32 %f, 0
+  br i1 %tobool, label %if.else, label %if.then
+
+if.then:
+  %a = getelementptr inbounds %struct.s, %struct.s* %p, i64 0, i32 0
+  %0 = load i32, i32* %a, align 4
+  %b = getelementptr inbounds %struct.s, %struct.s* %p, i64 0, i32 1
+  %1 = load i32, i32* %b, align 4
+  %add = add nsw i32 %1, %0
+  br label %if.end
+
+if.else:
+  %b1 = getelementptr inbounds %struct.s, %struct.s* %p, i64 0, i32 1
+  %2 = load i32, i32* %b1, align 4
+  %a2 = getelementptr inbounds %struct.s, %struct.s* %p, i64 0, i32 0
+  %3 = load i32, i32* %a2, align 4
+  %sub = sub nsw i32 %2, %3
+  br label %if.end
+
+if.end:
+  %v.0 = phi i32 [ %add, %if.then ], [ %sub, %if.else ]
+  ret i32 %v.0
+}