Index: llvm/lib/Transforms/Utils/SimplifyCFG.cpp
===================================================================
--- llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -115,6 +115,12 @@
     HoistCommon("simplifycfg-hoist-common", cl::Hidden, cl::init(true),
                 cl::desc("Hoist common instructions up to the parent block"));
 
+static cl::opt<bool>
+    HoistCommonSkipLimit("simplifycfg-hoist-common-skip-limit", cl::Hidden,
+                         cl::init(20),
+                         cl::desc("Allow reordering across at most this many "
+                                  "instructions when hoisting"));
+
 static cl::opt<bool>
     SinkCommon("simplifycfg-sink-common", cl::Hidden, cl::init(true),
                cl::desc("Sink common instructions down to the end block"));
@@ -1430,6 +1436,32 @@
   return true;
 }
 
+// Returns true if it is safe to reorder an instruction across preceding
+// instructions in a basic block.
+static bool isSafeToHoistInstr(Instruction *I, bool ForceNoSideEffects,
+                               bool ForceNoSpeculation) {
+  // If we have seen an instruction with side effects, it's unsafe to reorder an
+  // instruction which reads memory or itself has side effects.
+  if (ForceNoSideEffects && (I->mayReadFromMemory() || I->mayHaveSideEffects()))
+    return false;
+
+  // Reordering across an instruction which does not necessarily transfer
+  // control to the next instruction is speculation.
+  if (ForceNoSpeculation && !isSafeToSpeculativelyExecute(I))
+    return false;
+
+  // It's also unsafe/illegal to hoist an instruction above its instruction
+  // operands
+  BasicBlock *BB = I->getParent();
+  for (Value *Op : I->operands()) {
+    if (auto *J = dyn_cast<Instruction>(Op))
+      if (J->getParent() == BB)
+        return false;
+  }
+
+  return true;
+}
+
 static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I, bool PtrValueMayBeModified = false);
 
 /// Given a conditional branch that goes to BB1 and BB2, hoist any common code
@@ -1444,7 +1476,8 @@
   // instructions in the two blocks.  In particular, we don't want to get into
   // O(M*N) situations here where M and N are the sizes of BB1 and BB2.  As
   // such, we currently just scan for obviously identical instructions in an
-  // identical order.
+  // identical order, possibly separated by the same number of non-identical
+  // instructions.
   BasicBlock *BB1 = BI->getSuccessor(0); // The true destination.
   BasicBlock *BB2 = BI->getSuccessor(1); // The false destination
 
@@ -1467,7 +1500,7 @@
     while (isa<DbgInfoIntrinsic>(I2))
       I2 = &*BB2_Itr++;
   }
-  if (isa<PHINode>(I1) || !I1->isIdenticalToWhenDefined(I2))
+  if (isa<PHINode>(I1))
     return false;
 
   BasicBlock *BIParent = BI->getParent();
@@ -1493,79 +1526,122 @@
     // terminator. Let the loop below handle those 2 cases.
   }
 
-  do {
+  // Count how many instructions were not hoisted so far. There's a limit on how
+  // many instructions we skip, serving as a compilation time control as well as
+  // preventing excessive increase of life ranges.
+  unsigned NumSkipped = 0;
+  
+  // Record if any non-hoisted instruction contains side-effects, as it could
+  // make it illegal to reorder some instructions across.
+  bool ForceNoReadMemOrSideEffectsBB1 = false;
+  bool ForceNoReadMemOrSideEffectsBB2 = false;
+
+  // Record if any non-hoisted instructions does not necessarily transfer
+  // control to the next instruction. Then we can hoist only instrucions which
+  // are safe to speculate.
+  bool ForceNoSpeculationBB1 = false;
+  bool ForceNoSpeculationBB2 = false;
+
+  for (;;) {
     // If we are hoisting the terminator instruction, don't move one (making a
     // broken BB), instead clone it, and remove BI.
-    if (I1->isTerminator())
+    if (I1->isTerminator() || I2->isTerminator()) {
+      // If any instructions remain in the block, we cannot hoist terminators.
+      if (NumSkipped || !I1->isIdenticalToWhenDefined(I2))
+        return Changed;
       goto HoistTerminator;
+    }
 
-    // Hoisting token-returning instructions would obscure the origin.
-    if (I1->getType()->isTokenTy())
-      return Changed;
-
-    // If we're going to hoist a call, make sure that the two instructions we're
-    // commoning/hoisting are both marked with musttail, or neither of them is
-    // marked as such. Otherwise, we might end up in a situation where we hoist
-    // from a block where the terminator is a `ret` to a block where the terminator
-    // is a `br`, and `musttail` calls expect to be followed by a return.
-    auto *C1 = dyn_cast<CallInst>(I1);
-    auto *C2 = dyn_cast<CallInst>(I2);
-    if (C1 && C2)
-      if (C1->isMustTailCall() != C2->isMustTailCall())
+    if (I1->isIdenticalToWhenDefined(I2)) {
+      // Hoisting token-returning instructions would obscure the origin.
+      if (I1->getType()->isTokenTy())
         return Changed;
 
-    if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
-      return Changed;
-
-    // If any of the two call sites has nomerge attribute, stop hoisting.
-    if (const auto *CB1 = dyn_cast<CallBase>(I1))
-      if (CB1->cannotMerge())
+      // Even if the instructions are identical, it may not be safe to hoist
+      // them if we have skipped over instructions with side effects or their
+      // operands weren't hoisted.
+      if (!isSafeToHoistInstr(I1, ForceNoReadMemOrSideEffectsBB1, ForceNoSpeculationBB1) ||
+          !isSafeToHoistInstr(I2, ForceNoReadMemOrSideEffectsBB2, ForceNoSpeculationBB2))
         return Changed;
-    if (const auto *CB2 = dyn_cast<CallBase>(I2))
-      if (CB2->cannotMerge())
+
+      // If we're going to hoist a call, make sure that the two instructions
+      // we're commoning/hoisting are both marked with musttail, or neither of
+      // them is marked as such. Otherwise, we might end up in a situation where
+      // we hoist from a block where the terminator is a `ret` to a block where
+      // the terminator is a `br`, and `musttail` calls expect to be followed by
+      // a return.
+      auto *C1 = dyn_cast<CallInst>(I1);
+      auto *C2 = dyn_cast<CallInst>(I2);
+      if (C1 && C2)
+        if (C1->isMustTailCall() != C2->isMustTailCall())
+          return Changed;
+
+      if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
         return Changed;
 
-    if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
-      assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
-      // The debug location is an integral part of a debug info intrinsic
-      // and can't be separated from it or replaced.  Instead of attempting
-      // to merge locations, simply hoist both copies of the intrinsic.
-      BIParent->getInstList().splice(BI->getIterator(),
-                                     BB1->getInstList(), I1);
-      BIParent->getInstList().splice(BI->getIterator(),
-                                     BB2->getInstList(), I2);
+      // If any of the two call sites has nomerge attribute, stop hoisting.
+      if (const auto *CB1 = dyn_cast<CallBase>(I1))
+        if (CB1->cannotMerge())
+          return Changed;
+      if (const auto *CB2 = dyn_cast<CallBase>(I2))
+        if (CB2->cannotMerge())
+          return Changed;
+
+      if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
+        assert(isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
+        // The debug location is an integral part of a debug info intrinsic
+        // and can't be separated from it or replaced.  Instead of attempting
+        // to merge locations, simply hoist both copies of the intrinsic.
+        BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(),
+                                       I1);
+        BIParent->getInstList().splice(BI->getIterator(), BB2->getInstList(),
+                                       I2);
+      } else {
+        // For a normal instruction, we just move one to right before the
+        // branch, then replace all uses of the other with the first.  Finally,
+        // we remove the now redundant second instruction.
+        BIParent->getInstList().splice(BI->getIterator(), BB1->getInstList(),
+                                       I1);
+        if (!I2->use_empty())
+          I2->replaceAllUsesWith(I1);
+        I1->andIRFlags(I2);
+        unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
+                               LLVMContext::MD_range,
+                               LLVMContext::MD_fpmath,
+                               LLVMContext::MD_invariant_load,
+                               LLVMContext::MD_nonnull,
+                               LLVMContext::MD_invariant_group,
+                               LLVMContext::MD_align,
+                               LLVMContext::MD_dereferenceable,
+                               LLVMContext::MD_dereferenceable_or_null,
+                               LLVMContext::MD_mem_parallel_loop_access,
+                               LLVMContext::MD_access_group,
+                               LLVMContext::MD_preserve_access_index};
+        combineMetadata(I1, I2, KnownIDs, true);
+
+        // I1 and I2 are being combined into a single instruction.  Its debug
+        // location is the merged locations of the original instructions.
+        I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
+
+        I2->eraseFromParent();
+      }
       Changed = true;
+      ++NumHoistCommonInstrs;
     } else {
-      // For a normal instruction, we just move one to right before the branch,
-      // then replace all uses of the other with the first.  Finally, we remove
-      // the now redundant second instruction.
-      BIParent->getInstList().splice(BI->getIterator(),
-                                     BB1->getInstList(), I1);
-      if (!I2->use_empty())
-        I2->replaceAllUsesWith(I1);
-      I1->andIRFlags(I2);
-      unsigned KnownIDs[] = {LLVMContext::MD_tbaa,
-                             LLVMContext::MD_range,
-                             LLVMContext::MD_fpmath,
-                             LLVMContext::MD_invariant_load,
-                             LLVMContext::MD_nonnull,
-                             LLVMContext::MD_invariant_group,
-                             LLVMContext::MD_align,
-                             LLVMContext::MD_dereferenceable,
-                             LLVMContext::MD_dereferenceable_or_null,
-                             LLVMContext::MD_mem_parallel_loop_access,
-                             LLVMContext::MD_access_group,
-                             LLVMContext::MD_preserve_access_index};
-      combineMetadata(I1, I2, KnownIDs, true);
-
-      // I1 and I2 are being combined into a single instruction.  Its debug
-      // location is the merged locations of the original instructions.
-      I1->applyMergedLocation(I1->getDebugLoc(), I2->getDebugLoc());
-
-      I2->eraseFromParent();
-      Changed = true;
+      if (NumSkipped >= HoistCommonSkipLimit)
+        return Changed;
+      // We are about to skip over a pair of non-identical instructions. Record
+      // if any of them has side effects.
+      if (I1->mayHaveSideEffects())
+        ForceNoReadMemOrSideEffectsBB1 = true;
+      if (I2->mayHaveSideEffects())
+        ForceNoReadMemOrSideEffectsBB2 = true;
+      if (!I1->willReturn())
+        ForceNoSpeculationBB1 = true;
+      if (!I2->willReturn())
+        ForceNoSpeculationBB2 = true;
+      ++NumSkipped;
     }
-    ++NumHoistCommonInstrs;
 
     I1 = &*BB1_Itr++;
     I2 = &*BB2_Itr++;
@@ -1578,9 +1654,9 @@
       while (isa<DbgInfoIntrinsic>(I2))
         I2 = &*BB2_Itr++;
     }
-  } while (I1->isIdenticalToWhenDefined(I2));
+  }
 
-  return true;
+  return Changed;
 
 HoistTerminator:
   // It may not be possible to hoist an invoke.
Index: llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/SimplifyCFG/hoist-common-skip.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S --passes='simplifycfg<hoist-common-insts>' %s | FileCheck %s
+; RUN: opt -S --passes='simplifycfg<hoist-common-insts>' -simplifycfg-hoist-common-skip-limit=0 %s | FileCheck %s --check-prefix=LIMIT
+
+;; Check that the two loads are hoisted to the common predecessor, skipping
+;; over the add/sub instructions.
+;; Second run line, check that only one of the loads is hoisted, due to
+;; the skip limit.
+
+define i32 @f(ptr nocapture noundef %d, ptr nocapture noundef readonly %m, ptr nocapture noundef readonly %b) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i16, ptr [[D:%.*]], align 2
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[TMP0]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B:%.*]], align 2
+; CHECK-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = load i16, ptr [[M:%.*]], align 2
+; CHECK-NEXT:    [[CONV4:%.*]] = zext i16 [[TMP2]] to i32
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV2]], [[CONV]]
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[ADD]], [[CONV4]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i32 [[MUL]], 16
+; CHECK-NEXT:    [[CONV5:%.*]] = trunc i32 [[TMP3]] to i16
+; CHECK-NEXT:    br label [[IF_END:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV2]], [[CONV]]
+; CHECK-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], [[CONV4]]
+; CHECK-NEXT:    [[TMP4:%.*]] = lshr i32 [[MUL9]], 16
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i32 [[TMP4]] to i16
+; CHECK-NEXT:    [[CONV12:%.*]] = sub i16 0, [[TMP5]]
+; CHECK-NEXT:    br label [[IF_END]]
+; CHECK:       if.end:
+; CHECK-NEXT:    [[STOREMERGE:%.*]] = phi i16 [ [[CONV12]], [[IF_ELSE]] ], [ [[CONV5]], [[IF_THEN]] ]
+; CHECK-NEXT:    store i16 [[STOREMERGE]], ptr [[D]], align 2
+; CHECK-NEXT:    [[TOBOOL:%.*]] = icmp ne i16 [[STOREMERGE]], 0
+; CHECK-NEXT:    [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL]] to i32
+; CHECK-NEXT:    ret i32 [[LNOT_EXT]]
+;
+; LIMIT-LABEL: @f(
+; LIMIT-NEXT:  entry:
+; LIMIT-NEXT:    [[TMP0:%.*]] = load i16, ptr [[D:%.*]], align 2
+; LIMIT-NEXT:    [[CONV:%.*]] = sext i16 [[TMP0]] to i32
+; LIMIT-NEXT:    [[CMP:%.*]] = icmp sgt i16 [[TMP0]], 0
+; LIMIT-NEXT:    [[TMP1:%.*]] = load i16, ptr [[B:%.*]], align 2
+; LIMIT-NEXT:    [[CONV2:%.*]] = zext i16 [[TMP1]] to i32
+; LIMIT-NEXT:    br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; LIMIT:       if.then:
+; LIMIT-NEXT:    [[ADD:%.*]] = add nsw i32 [[CONV2]], [[CONV]]
+; LIMIT-NEXT:    [[TMP2:%.*]] = load i16, ptr [[M:%.*]], align 2
+; LIMIT-NEXT:    [[CONV4:%.*]] = zext i16 [[TMP2]] to i32
+; LIMIT-NEXT:    [[MUL:%.*]] = mul nsw i32 [[ADD]], [[CONV4]]
+; LIMIT-NEXT:    [[TMP3:%.*]] = lshr i32 [[MUL]], 16
+; LIMIT-NEXT:    [[CONV5:%.*]] = trunc i32 [[TMP3]] to i16
+; LIMIT-NEXT:    br label [[IF_END:%.*]]
+; LIMIT:       if.else:
+; LIMIT-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV2]], [[CONV]]
+; LIMIT-NEXT:    [[TMP4:%.*]] = load i16, ptr [[M]], align 2
+; LIMIT-NEXT:    [[CONV8:%.*]] = zext i16 [[TMP4]] to i32
+; LIMIT-NEXT:    [[MUL9:%.*]] = mul nsw i32 [[SUB]], [[CONV8]]
+; LIMIT-NEXT:    [[TMP5:%.*]] = lshr i32 [[MUL9]], 16
+; LIMIT-NEXT:    [[TMP6:%.*]] = trunc i32 [[TMP5]] to i16
+; LIMIT-NEXT:    [[CONV12:%.*]] = sub i16 0, [[TMP6]]
+; LIMIT-NEXT:    br label [[IF_END]]
+; LIMIT:       if.end:
+; LIMIT-NEXT:    [[STOREMERGE:%.*]] = phi i16 [ [[CONV12]], [[IF_ELSE]] ], [ [[CONV5]], [[IF_THEN]] ]
+; LIMIT-NEXT:    store i16 [[STOREMERGE]], ptr [[D]], align 2
+; LIMIT-NEXT:    [[TOBOOL:%.*]] = icmp ne i16 [[STOREMERGE]], 0
+; LIMIT-NEXT:    [[LNOT_EXT:%.*]] = zext i1 [[TOBOOL]] to i32
+; LIMIT-NEXT:    ret i32 [[LNOT_EXT]]
+;
+
+entry:
+  %0 = load i16, ptr %d, align 2
+  %conv = sext i16 %0 to i32
+  %cmp = icmp sgt i16 %0, 0
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  %1 = load i16, ptr %b, align 2
+  %conv2 = zext i16 %1 to i32
+  %add = add nsw i32 %conv2, %conv
+  %2 = load i16, ptr %m, align 2
+  %conv4 = zext i16 %2 to i32
+  %mul = mul nsw i32 %add, %conv4
+  %3 = lshr i32 %mul, 16
+  %conv5 = trunc i32 %3 to i16
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %4 = load i16, ptr %b, align 2
+  %conv6 = zext i16 %4 to i32
+  %sub = sub nsw i32 %conv6, %conv
+  %5 = load i16, ptr %m, align 2
+  %conv8 = zext i16 %5 to i32
+  %mul9 = mul nsw i32 %sub, %conv8
+  %6 = lshr i32 %mul9, 16
+  %7 = trunc i32 %6 to i16
+  %conv12 = sub i16 0, %7
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %storemerge = phi i16 [ %conv12, %if.else ], [ %conv5, %if.then ]
+  store i16 %storemerge, ptr %d, align 2
+  %tobool = icmp ne i16 %storemerge, 0
+  %lnot.ext = zext i1 %tobool to i32
+  ret i32 %lnot.ext
+}