Index: lib/Transforms/Scalar/MergeICmps.cpp
===================================================================
--- lib/Transforms/Scalar/MergeICmps.cpp
+++ lib/Transforms/Scalar/MergeICmps.cpp
@@ -41,6 +41,15 @@
 
 #define DEBUG_TYPE "mergeicmps"
 
+// Returns true if the instruction is a simple load or a simple store
+static bool isSimpleLoadOrStore(const Instruction *I) {
+  if (const LoadInst *LI = dyn_cast<LoadInst>(I))
+    return LI->isSimple();
+  if (const StoreInst *SI = dyn_cast<StoreInst>(I))
+    return SI->isSimple();
+  return false;
+}
+
 // A BCE atom.
 struct BCEAtom {
   BCEAtom() : GEP(nullptr), LoadI(nullptr), Offset() {}
@@ -150,18 +159,19 @@
 
   // Returns true if the non-BCE-cmp instructions can be separated from BCE-cmp
   // instructions in the block.
-  bool canSplit() const;
+  bool canSplit(AliasAnalysis *AA) const;
 
   // Return true if this all the relevant instructions in the BCE-cmp-block can
   // be sunk below this instruction. By doing this, we know we can separate the
   // BCE-cmp-block instructions from the non-BCE-cmp-block instructions in the
   // block.
-  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &) const;
+  bool canSinkBCECmpInst(const Instruction *, DenseSet<Instruction *> &,
+                         AliasAnalysis *AA) const;
 
   // We can separate the BCE-cmp-block instructions and the non-BCE-cmp-block
   // instructions. Split the old block and move all non-BCE-cmp-insts into the
   // new parent block.
-  void split(BasicBlock *NewParent) const;
+  void split(BasicBlock *NewParent, AliasAnalysis *AA) const;
 
   // The basic block where this comparison happens.
   BasicBlock *BB = nullptr;
@@ -179,12 +189,21 @@
 };
 
 bool BCECmpBlock::canSinkBCECmpInst(const Instruction *Inst,
-                                    DenseSet<Instruction *> &BlockInsts) const {
+                                    DenseSet<Instruction *> &BlockInsts,
+                                    AliasAnalysis *AA) const {
   // If this instruction has side effects and its in middle of the BCE cmp block
   // instructions, then bail for now.
-  // TODO: use alias analysis to tell whether there is real interference.
-  if (Inst->mayHaveSideEffects())
-    return false;
+  if (Inst->mayHaveSideEffects()) {
+    // Bail if this is not a simple load or store
+    if (!isSimpleLoadOrStore(Inst))
+      return false;
+
+    MemoryLocation LLoc = MemoryLocation::get(Lhs_.LoadI);
+    MemoryLocation RLoc = MemoryLocation::get(Rhs_.LoadI);
+    if (isModSet(AA->getModRefInfo(Inst, LLoc)) ||
+        isModSet(AA->getModRefInfo(Inst, RLoc)))
+      return false;
+  }
   // Make sure this instruction does not use any of the BCE cmp block
   // instructions as operand.
   for (auto BI : BlockInsts) {
@@ -194,14 +213,15 @@
   return true;
 }
 
-void BCECmpBlock::split(BasicBlock *NewParent) const {
+void BCECmpBlock::split(BasicBlock *NewParent, AliasAnalysis *AA) const {
   DenseSet<Instruction *> BlockInsts(
       {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
   llvm::SmallVector<Instruction *, 4> OtherInsts;
   for (Instruction &Inst : *BB) {
     if (BlockInsts.count(&Inst))
       continue;
-    assert(canSinkBCECmpInst(&Inst, BlockInsts) && "Split unsplittable block");
+    assert(canSinkBCECmpInst(&Inst, BlockInsts, AA) &&
+           "Split unsplittable block");
     // This is a non-BCE-cmp-block instruction. And it can be separated
     // from the BCE-cmp-block instruction.
     OtherInsts.push_back(&Inst);
@@ -213,12 +233,12 @@
   }
 }
 
-bool BCECmpBlock::canSplit() const {
+bool BCECmpBlock::canSplit(AliasAnalysis *AA) const {
   DenseSet<Instruction *> BlockInsts(
       {Lhs_.GEP, Rhs_.GEP, Lhs_.LoadI, Rhs_.LoadI, CmpI, BranchI});
   for (Instruction &Inst : *BB) {
     if (!BlockInsts.count(&Inst)) {
-      if (!canSinkBCECmpInst(&Inst, BlockInsts))
+      if (!canSinkBCECmpInst(&Inst, BlockInsts, AA))
         return false;
     }
   }
@@ -324,17 +344,18 @@
 // A chain of comparisons.
 class BCECmpChain {
  public:
-  BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi);
+   BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+               AliasAnalysis *AA);
 
-  int size() const { return Comparisons_.size(); }
+   int size() const { return Comparisons_.size(); }
 
 #ifdef MERGEICMPS_DOT_ON
   void dump() const;
 #endif  // MERGEICMPS_DOT_ON
 
-  bool simplify(const TargetLibraryInfo *const TLI);
+  bool simplify(const TargetLibraryInfo *const TLI, AliasAnalysis *AA);
 
- private:
+private:
   static bool IsContiguous(const BCECmpBlock &First,
                            const BCECmpBlock &Second) {
     return First.Lhs().Base() == Second.Lhs().Base() &&
@@ -348,7 +369,7 @@
   // null, the merged block will link to the phi block.
   void mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
                         BasicBlock *const NextBBInChain, PHINode &Phi,
-                        const TargetLibraryInfo *const TLI);
+                        const TargetLibraryInfo *const TLI, AliasAnalysis *AA);
 
   PHINode &Phi_;
   std::vector<BCECmpBlock> Comparisons_;
@@ -356,7 +377,8 @@
   BasicBlock *EntryBlock_;
 };
 
-BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi)
+BCECmpChain::BCECmpChain(const std::vector<BasicBlock *> &Blocks, PHINode &Phi,
+                         AliasAnalysis *AA)
     : Phi_(Phi) {
   assert(!Blocks.empty() && "a chain should have at least one block");
   // Now look inside blocks to check for BCE comparisons.
@@ -388,7 +410,7 @@
         // and start anew.
         //
         // NOTE: we only handle block with single predecessor for now.
-        if (Comparison.canSplit()) {
+        if (Comparison.canSplit(AA)) {
           LLVM_DEBUG(dbgs()
                      << "Split initial block '" << Comparison.BB->getName()
                      << "' that does extra work besides compare\n");
@@ -475,7 +497,8 @@
 }
 #endif  // MERGEICMPS_DOT_ON
 
-bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI) {
+bool BCECmpChain::simplify(const TargetLibraryInfo *const TLI,
+                           AliasAnalysis *AA) {
   // First pass to check if there is at least one merge. If not, we don't do
   // anything and we keep analysis passes intact.
   {
@@ -523,13 +546,13 @@
       // Merge all previous comparisons and start a new merge block.
       mergeComparisons(
           makeArrayRef(Comparisons_).slice(I - NumMerged, NumMerged),
-          Comparisons_[I].BB, Phi_, TLI);
+          Comparisons_[I].BB, Phi_, TLI, AA);
       NumMerged = 1;
     }
   }
   mergeComparisons(makeArrayRef(Comparisons_)
                        .slice(Comparisons_.size() - NumMerged, NumMerged),
-                   nullptr, Phi_, TLI);
+                   nullptr, Phi_, TLI, AA);
 
   return true;
 }
@@ -537,7 +560,8 @@
 void BCECmpChain::mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
                                    BasicBlock *const NextBBInChain,
                                    PHINode &Phi,
-                                   const TargetLibraryInfo *const TLI) {
+                                   const TargetLibraryInfo *const TLI,
+                                   AliasAnalysis *AA) {
   assert(!Comparisons.empty());
   const auto &FirstComparison = *Comparisons.begin();
   BasicBlock *const BB = FirstComparison.BB;
@@ -550,7 +574,7 @@
     auto C = std::find_if(Comparisons.begin(), Comparisons.end(),
                           [](const BCECmpBlock &B) { return B.RequireSplit; });
     if (C != Comparisons.end())
-      C->split(EntryBlock_);
+      C->split(EntryBlock_, AA);
 
     LLVM_DEBUG(dbgs() << "Merging " << Comparisons.size() << " comparisons\n");
     const auto TotalSize =
@@ -666,7 +690,8 @@
   return Blocks;
 }
 
-bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI) {
+bool processPhi(PHINode &Phi, const TargetLibraryInfo *const TLI,
+                AliasAnalysis *AA) {
   LLVM_DEBUG(dbgs() << "processPhi()\n");
   if (Phi.getNumIncomingValues() <= 1) {
     LLVM_DEBUG(dbgs() << "skip: only one incoming value in phi\n");
@@ -724,14 +749,14 @@
   const auto Blocks =
       getOrderedBlocks(Phi, LastBlock, Phi.getNumIncomingValues());
   if (Blocks.empty()) return false;
-  BCECmpChain CmpChain(Blocks, Phi);
+  BCECmpChain CmpChain(Blocks, Phi, AA);
 
   if (CmpChain.size() < 2) {
     LLVM_DEBUG(dbgs() << "skip: only one compare block\n");
     return false;
   }
 
-  return CmpChain.simplify(TLI);
+  return CmpChain.simplify(TLI, AA);
 }
 
 class MergeICmps : public FunctionPass {
@@ -746,7 +771,8 @@
     if (skipFunction(F)) return false;
     const auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
     const auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    auto PA = runImpl(F, &TLI, &TTI);
+    AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+    auto PA = runImpl(F, &TLI, &TTI, AA);
     return !PA.areAllPreserved();
   }
 
@@ -754,14 +780,16 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<TargetLibraryInfoWrapperPass>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
   }
 
   PreservedAnalyses runImpl(Function &F, const TargetLibraryInfo *TLI,
-                            const TargetTransformInfo *TTI);
+                            const TargetTransformInfo *TTI, AliasAnalysis *AA);
 };
 
 PreservedAnalyses MergeICmps::runImpl(Function &F, const TargetLibraryInfo *TLI,
-                                      const TargetTransformInfo *TTI) {
+                                      const TargetTransformInfo *TTI,
+                                      AliasAnalysis *AA) {
   LLVM_DEBUG(dbgs() << "MergeICmpsPass: " << F.getName() << "\n");
 
   // We only try merging comparisons if the target wants to expand memcmp later.
@@ -777,7 +805,7 @@
   for (auto BBIt = ++F.begin(); BBIt != F.end(); ++BBIt) {
     // A Phi operation is always first in a basic block.
     if (auto *const Phi = dyn_cast<PHINode>(&*BBIt->begin()))
-      MadeChange |= processPhi(*Phi, TLI);
+      MadeChange |= processPhi(*Phi, TLI, AA);
   }
 
   if (MadeChange) return PreservedAnalyses::none();
@@ -791,6 +819,7 @@
                       "Merge contiguous icmps into a memcmp", false, false)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(MergeICmps, "mergeicmps",
                     "Merge contiguous icmps into a memcmp", false, false)
 
Index: test/CodeGen/AArch64/O3-pipeline.ll
===================================================================
--- test/CodeGen/AArch64/O3-pipeline.ll
+++ test/CodeGen/AArch64/O3-pipeline.ll
@@ -32,6 +32,8 @@
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
Index: test/CodeGen/Generic/llc-start-stop.ll
===================================================================
--- test/CodeGen/Generic/llc-start-stop.ll
+++ test/CodeGen/Generic/llc-start-stop.ll
@@ -13,15 +13,15 @@
 ; STOP-BEFORE-NOT: Loop Strength Reduction
 
 ; RUN: llc < %s -debug-pass=Structure -start-after=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-AFTER
-; START-AFTER: -machine-branch-prob -mergeicmps
+; START-AFTER: -aa -mergeicmps
 ; START-AFTER: FunctionPass Manager
-; START-AFTER-NEXT: Merge contiguous icmps into a memcmp
+; START-AFTER-NEXT: Dominator Tree Construction
 
 ; RUN: llc < %s -debug-pass=Structure -start-before=loop-reduce -o /dev/null 2>&1 | FileCheck %s -check-prefix=START-BEFORE
 ; START-BEFORE: -machine-branch-prob -domtree
 ; START-BEFORE: FunctionPass Manager
 ; START-BEFORE: Loop Strength Reduction
-; START-BEFORE-NEXT: Merge contiguous icmps into a memcmp
+; START-BEFORE-NEXT: Basic Alias Analysis (stateless AA impl)
 
 ; RUN: not llc < %s -start-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-START-BEFORE
 ; RUN: not llc < %s -stop-before=nonexistent -o /dev/null 2>&1 | FileCheck %s -check-prefix=NONEXISTENT-STOP-BEFORE
Index: test/CodeGen/X86/O3-pipeline.ll
===================================================================
--- test/CodeGen/X86/O3-pipeline.ll
+++ test/CodeGen/X86/O3-pipeline.ll
@@ -26,6 +26,8 @@
 ; CHECK-NEXT:       Loop Pass Manager
 ; CHECK-NEXT:         Induction Variable Users
 ; CHECK-NEXT:         Loop Strength Reduction
+; CHECK-NEXT:       Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT:         Function Alias Analysis Results
 ; CHECK-NEXT:       Merge contiguous icmps into a memcmp
 ; CHECK-NEXT:       Expand memcmp() to load/stores
 ; CHECK-NEXT:       Lower Garbage Collection Instructions
Index: test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
===================================================================
--- /dev/null
+++ test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
@@ -0,0 +1,83 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -mtriple=x86_64-unknown-unknown -mergeicmps -S | FileCheck %s --check-prefix=X86
+
+%"struct.std::pair" = type { i32, i32, i32, i32 }
+
+; Before the pass
+define zeroext i1 @opeq1(
+; CHECK-LABEL: @opeq1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[PTR:%.*]] = alloca i32
+; CHECK-NEXT:    [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[FIRST_I]], align 4
+; CHECK-NEXT:    [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[FIRST1_I]], align 4
+; CHECK-NEXT:    store i32 42, i32* [[PTR]]
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[CMP_I]], label [[LAND_RHS_I:%.*]], label [[OPEQ1_EXIT:%.*]]
+; CHECK:       land.rhs.i:
+; CHECK-NEXT:    [[SECOND_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* [[SECOND_I]], align 4
+; CHECK-NEXT:    [[SECOND2_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B]], i64 0, i32 1
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* [[SECOND2_I]], align 4
+; CHECK-NEXT:    [[CMP2_I:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    br i1 [[CMP2_I]], label [[LAND_RHS_I_2:%.*]], label [[OPEQ1_EXIT]]
+
+; After the pass
+; We can split %entry and create a memcmp(16 bytes)
+; X86-LABEL: @opeq1(
+; X86-NEXT:  entry:
+; X86-NEXT:    [[PTR:%.*]] = alloca i32
+; X86-NEXT:    store i32 42, i32* [[PTR]]
+; X86-NEXT:    [[FIRST_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[A:%.*]], i64 0, i32 0
+; X86-NEXT:    [[FIRST1_I:%.*]] = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* [[B:%.*]], i64 0, i32 0
+; X86-NEXT:    [[CSTR:%.*]] = bitcast i32* [[FIRST_I]] to i8*
+; X86-NEXT:    [[CSTR1:%.*]] = bitcast i32* [[FIRST1_I]] to i8*
+; X86-NEXT:    [[MEMCMP:%.*]] = call i32 @memcmp(i8* [[CSTR]], i8* [[CSTR1]], i64 16)
+; X86-NEXT:    [[TMP0:%.*]] = icmp eq i32 [[MEMCMP]], 0
+; X86-NEXT:    br label [[OPEQ1_EXIT:%.*]]
+; X86:       opeq1.exit:
+; X86-NEXT:    [[TMP1:%.*]] = phi i1 [ [[TMP0]], [[ENTRY:%.*]] ]
+; X86-NEXT:    ret i1 [[TMP1]]
+;
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %a,
+  %"struct.std::pair"* nocapture readonly dereferenceable(16) %b) local_unnamed_addr #0 {
+entry:
+  %ptr = alloca i32
+  %first.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 0
+  %0 = load i32, i32* %first.i, align 4
+  %first1.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 0
+  %1 = load i32, i32* %first1.i, align 4
+  ; Does other work, has no interference, merge block
+  store i32 42, i32* %ptr
+  %cmp.i = icmp eq i32 %0, %1
+  br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
+
+land.rhs.i:
+  %second.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 1
+  %2 = load i32, i32* %second.i, align 4
+  %second2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 1
+  %3 = load i32, i32* %second2.i, align 4
+  %cmp2.i = icmp eq i32 %2, %3
+  br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
+
+land.rhs.i.2:
+  %third.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 2
+  %4 = load i32, i32* %third.i, align 4
+  %third2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 2
+  %5 = load i32, i32* %third2.i, align 4
+  %cmp3.i = icmp eq i32 %4, %5
+  br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
+
+land.rhs.i.3:
+  %fourth.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %a, i64 0, i32 3
+  %6 = load i32, i32* %fourth.i, align 4
+  %fourth2.i = getelementptr inbounds %"struct.std::pair", %"struct.std::pair"* %b, i64 0, i32 3
+  %7 = load i32, i32* %fourth2.i, align 4
+  %cmp4.i = icmp eq i32 %6, %7
+  br label %opeq1.exit
+
+opeq1.exit:
+  %8 = phi i1 [ false, %entry ], [ false, %land.rhs.i] , [ false, %land.rhs.i.2 ], [ %cmp4.i, %land.rhs.i.3 ]
+  ret i1 %8
+}