Index: lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- lib/Transforms/Scalar/MergeICmps.cpp
+++ lib/Transforms/Scalar/MergeICmps.cpp
@@ -110,6 +110,10 @@
 }
 
 // A basic block with a comparison between two BCE atoms.
+// The block might do extra work besides the atom comparison, in which case
+// doesOtherWork() returns true. Under some conditions, the block can be
+// split into the atom comparison part and the "other work" part
+// (see canSplit()).
 // Note: the terminology is misleading: the comparison is symmetric, so there
 // is no real {l/r}hs. What we want though is to have the same base on the
 // left (resp. right), so that we can detect consecutive loads. To ensure this
@@ -144,19 +148,83 @@
   // Returns true if the block does other works besides comparison.
   bool doesOtherWork() const;
 
+  // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
+  // instructions in the block.
+  bool canSplit() const;
+
+  // Return true if this all the relevant instructions in the BCE-cmp-block can
+  // be sunk below this instruction. By doing this, we know we can separate the
+  // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
+  // block.
+  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+
+  // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
+  // instructions. Split the old block and move all non-BCE-cmp-insts into the
+  // new parent block.
+  void split(BasicBlock *NewParent) const;
+
   // The basic block where this comparison happens.
   BasicBlock *BB = nullptr;
   // The ICMP for this comparison.
   ICmpInst *CmpI = nullptr;
   // The terminating branch.
   BranchInst *BranchI = nullptr;
+  // The block requires splitting.
+  bool RequireSplit = false;
 
- private:
+private:
   BCEAtom Lhs_;
   BCEAtom Rhs_;
   int SizeBits_ = 0;
 };
 
+bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
+                                    DenseSet<Instruction *> &BlockInsts) const {
+  // If this instruction has side effects and its in middle of the BCE cmp block
+  // instructions, then bail for now.
+  // TODO: use alias analysis to tell whether there is real interference.
+  if (Inst->mayHaveSideEffects())
+    return false;
+  // Make sure this instruction does not use any of the BCE cmp block
+  // instructions as operand.
+  for (auto BI : BlockInsts) {
+    if (is_contained(Inst->operands(), BI))
+      return false;
+  }
+  return true;
+}
+
+void BCECmpBlock::split(BasicBlock *NewParent) const {
+  DenseSet<Instruction *> BlockInsts(
+      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+  llvm::SmallVector<Instruction *, 4> OtherInsts;
+  for (Instruction &Inst : *BB) {
+    if (BlockInsts.count(&Inst))
+      continue;
+    assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+    // This is a non-BCE-cmp-block instruction. And it can be separated
+    // from the BCE-cmp-block instruction.
+    OtherInsts.push_back(&Inst);
+  }
+
+  // Do the actual spliting.
+  for (Instruction *Inst : reverse(OtherInsts)) {
+    Inst->moveBefore(&*NewParent->begin());
+  }
+}
+
+bool BCECmpBlock::canSplit() const {
+  DenseSet<Instruction *> BlockInsts(
+      {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
+  for (Instruction &Inst : *BB) {
+    if (!BlockInsts.count(&Inst)) {
+      if (!canSinkBCECmpInst(&Inst, BlockInsts))
+        return false;
+    }
+  }
+  return true;
+}
+
 bool BCECmpBlock::doesOtherWork() const {
   AssertConsistent();
   // All the instructions we care about in the BCE cmp block.
@@ -241,6 +309,17 @@
   return {};
 }
 
+static inline void enqueueBlock(std::vector<BCECmpBlock> &Comparisons,
+                                BCECmpBlock &Comparison) {
+  DEBUG(dbgs() << "Block '" << Comparison.BB->getName() << "': Found cmp of "
+               << Comparison.SizeBits() << " bits between "
+               << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset
+               << " and " << Comparison.Rhs().Base() << " + "
+               << Comparison.Rhs().Offset << "\n");
+  DEBUG(dbgs() << "\n");
+  Comparisons.push_back(Comparison);
+}
+
 // A chain of comparisons.
 class BCECmpChain {
  public:
@@ -266,9 +345,9 @@
   // Merges the given comparison blocks into one memcmp block and update
   // branches. Comparisons are assumed to be continguous. If NextBBInChain is
   // null, the merged block will link to the phi block.
-  static void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
-                               BasicBlock *const NextBBInChain, PHINode &Phi,
-                               const TargetLibraryInfo *const TLI);
+  void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
+                        BasicBlock *const NextBBInChain, PHINode &Phi,
+                        const TargetLibraryInfo *const TLI);
 
   PHINode &Phi_;
   std::vector<BCECmpBlock> Comparisons_;
@@ -295,12 +374,28 @@
       DEBUG(dbgs() << "block '" << Comparison.BB->getName()
                    << "' does extra work besides compare\n");
       if (Comparisons.empty()) {
-        // TODO(courbet): The initial block can do other things, and we should
-        // split them apart in a separate block before the comparison chain.
-        // Right now we just discard it and make the chain shorter.
-        DEBUG(dbgs()
-              << "ignoring initial block '" << Comparison.BB->getName()
-              << "' that does extra work besides compare\n");
+        // This is the initial block in the chain, in case this block does other
+        // work, we can try to split the block and move the irrelevant
+        // instructions to the predecessor.
+        //
+        // If this is not the initial block in the chain, splitting it wont
+        // work.
+        //
+        // As once split, there will still be instructions before the BCE cmp
+        // instructions that do other work in program order, i.e. within the
+        // chain before sorting. Unless we can abort the chain at this point
+        // and start anew.
+        //
+        // NOTE: we only handle block with single predecessor for now.
+        if (Comparison.canSplit()) {
+          DEBUG(dbgs() << "Split initial block '" << Comparison.BB->getName()
+                       << "' that does extra work besides compare\n");
+          Comparison.RequireSplit = true;
+          enqueueBlock(Comparisons, Comparison);
+        } else {
+          DEBUG(dbgs() << "ignoring initial block '" << Comparison.BB->getName()
+                       << "' that does extra work besides compare\n");
+        }
         continue;
       }
       // TODO(courbet): Right now we abort the whole chain. We could be
@@ -328,13 +423,7 @@
       // We could still merge bb1 and bb2 though.
       return;
     }
-    DEBUG(dbgs() << "Block '" << Comparison.BB->getName()<< "': Found cmp of "
-                 << Comparison.SizeBits() << " bits between "
-                 << Comparison.Lhs().Base() << " + " << Comparison.Lhs().Offset
-                 << " and " << Comparison.Rhs().Base() << " + "
-                 << Comparison.Rhs().Offset << "\n");
-    DEBUG(dbgs() << "\n");
-    Comparisons.push_back(Comparison);
+    enqueueBlock(Comparisons, Comparison);
   }
 
   // It is possible we have no suitable comparison to merge.
@@ -416,9 +505,11 @@
   }
 
   // Point the predecessors of the chain to the first comparison block (which is
-  // the new entry point).
-  if (EntryBlock_ != Comparisons_[0].BB)
+  // the new entry point) and update the entry block of the chain.
+  if (EntryBlock_ != Comparisons_[0].BB) {
     EntryBlock_->replaceAllUsesWith(Comparisons_[0].BB);
+    EntryBlock_ = Comparisons_[0].BB;
+  }
 
   // Effectively merge blocks.
   int NumMerged = 1;
@@ -450,6 +541,14 @@
   LLVMContext &Context = BB->getContext();
 
   if (Comparisons.size() >= 2) {
+    // If there is one block that requires splitting, we do it now, i.e.
+    // just before we know we will collapse the chain. The instructions
+    // can be executed before any of the instructions in the chain.
+    auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
+                          [](const BCECmpBlock &B) { return B.RequireSplit; });
+    if (C != Comparisons.end())
+      C->split(EntryBlock_);
+
     DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
     const auto TotalSize =
         std::accumulate(Comparisons.begin(), Comparisons.end(), 0,
Index: test/Transforms/MergeICmps/X86/split-block-does-work.ll
===================================================================
--- /dev/null
+++ test/Transforms/MergeICmps/X86/split-block-does-work.ll
@@ -0,0 +1,113 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mergeicmps -mtriple=x86_64-unknown-unknown -S | FileCheck %s --check-prefix=X86
+
+%"struct.std::pair" = type { i32, i32, i32, i32 }
+
+declare void @foo(...)  nounwind readnone
+
+; We can split %entry and create a memcmp(16 bytes).
+define zeroext i1 @opeq1(
+; X86-LABEL: @opeq1(
+;
+; Make sure this call is moved to the beginning of the entry block.
+; X86:      entry:
+; X86-NEXT:    call void (...) @foo()
+; X86-NEXT:    [[THIRD_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0
+; X86-NEXT:    [[THIRD1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; X86-NEXT:    [[CSTR:%.*]] = bitcast i32* [[THIRD_I]] to i8*
+; X86-NEXT:    [[CSTR1:%.*]] = bitcast i32* [[THIRD1_I]] to i8*
+; X86-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16)
+; X86-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0
+; X86-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+;
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  ; Does other work.
+  call void (...) @foo()
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp2.i = icmp eq i32 %2, %3
+  br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+
+land.rhs.i.2:
+  %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+  %4 = load i32, i32* %third.i, align 4
+  %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 2
+  %5 = load i32, i32* %third2.i, align 4
+  %cmp3.i = icmp eq i32 %4, %5
+  br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+
+land.rhs.i.3:
+  %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 3
+  %6 = load i32, i32* %fourth.i, align 4
+  %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+  %7 = load i32, i32* %fourth2.i, align 4
+  %cmp4.i = icmp eq i32 %6, %7
+  br label %opeq1.exit
+
+opeq1.exit:
+  %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ]
+  ret i1 %8
+}
+
+
+; We will not be able to merge anything, make sure the call is not moved out.
+define zeroext i1 @opeq1_discontiguous(
+; X86-LABEL: @opeq1_discontiguous(
+;
+; Make sure this call is moved in the entry block.
+; X86:      entry:
+; X86:        [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 1 
+; X86:        [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; X86:        call void (...) @foo()
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1 
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0 
+  %1 = load i32, i32* %first1.i, align 4
+  ; Does other work.
+  call void (...) @foo()
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2 
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp2.i = icmp eq i32 %2, %3
+  br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+
+land.rhs.i.2:
+  %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+  %4 = load i32, i32* %third.i, align 4
+  %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+  %5 = load i32, i32* %third2.i, align 4
+  %cmp3.i = icmp eq i32 %4, %5
+  br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+
+land.rhs.i.3:
+  %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %6 = load i32, i32* %fourth.i, align 4
+  %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3 
+  %7 = load i32, i32* %fourth2.i, align 4
+  %cmp4.i = icmp eq i32 %6, %7
+  br label %opeq1.exit
+
+opeq1.exit:
+  %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ]
+  ret i1 %8
+}
Index: test/Transforms/MergeICmps/X86/tuple-four-int8.ll
===================================================================
--- test/Transforms/MergeICmps/X86/tuple-four-int8.ll
+++ test/Transforms/MergeICmps/X86/tuple-four-int8.ll
@@ -19,28 +19,24 @@
 
 define zeroext i1 @opeq(
 ; CHECK-LABEL: @opeq(
-; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
-; CHECK-NEXT:    [[A_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 3
-; CHECK-NEXT:    [[TMP0:%.*]] = load i8, i8* [[A_ELEM3_ADDR]], align 1
-; CHECK-NEXT:    [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
-; CHECK-NEXT:    [[B_ELEM3_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = load i8, i8* [[B_ELEM3_ADDR]], align 1
-; CHECK-NEXT:    [[CMP_ELEM3:%.*]] = icmp eq i8 [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    br i1 [[CMP_ELEM3]], label [[LAND_ELEM0:%.*]], label [[OPEQ_EXIT:%.*]]
+;
+; These 2 instructions are split. Then we can merge 3 bytes, instead of 2.
+; CHECK:         br label [[LAND_ELEM0:%.*]]
 ; CHECK:       land.elem1:
-; CHECK-NEXT:    [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[A_BASE]], i64 1
-; CHECK-NEXT:    [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* [[B_BASE]], i64 1
-; CHECK-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 2)
+; CHECK-NEXT:    [[A_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* %a.base, i64 1
+; CHECK-NEXT:    [[B_ELEM1_ADDR:%.*]] = getelementptr inbounds i8, i8* %b.base, i64 1
+; CHECK-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[A_ELEM1_ADDR]], i8* [[B_ELEM1_ADDR]], i64 3)
 ; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i32 [[MEMCMP]], 0
-; CHECK-NEXT:    br label [[OPEQ_EXIT]]
+; CHECK-NEXT:    br label [[OPEQ_EXIT:%.*]]
 ; CHECK:       land.elem0:
+; CHECK:         [[A_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[A:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0
+; CHECK:         [[B_BASE:%.*]] = getelementptr inbounds %"class.std::tuple", %"class.std::tuple"* [[B:%.*]], i64 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0 
 ; CHECK-NEXT:    [[TMP3:%.*]] = load i8, i8* [[A_BASE]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = load i8, i8* [[B_BASE]], align 1
 ; CHECK-NEXT:    [[CMP_ELEM0:%.*]] = icmp eq i8 [[TMP3]], [[TMP4]]
 ; CHECK-NEXT:    br i1 [[CMP_ELEM0]], label [[LAND_ELEM1:%.*]], label [[OPEQ_EXIT]]
 ; CHECK:       opeq.exit:
-; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ]
+; CHECK-NEXT:    [[TMP5:%.*]] = phi i1 [ [[CMP_ELEM0]], [[LAND_ELEM0]] ], [ [[TMP2]], [[LAND_ELEM1]] ]
 ; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
   %"class.std::tuple"* nocapture readonly dereferenceable(4) %a,