Index: include/llvm/Transforms/Scalar/MemCpyOptimizer.h
===================================================================
--- include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -50,7 +50,7 @@
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M);
+  bool processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI);
   bool processMemMove(MemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
                             uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
@@ -60,6 +60,8 @@
   bool processByValArgument(CallSite CS, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
+  Instruction *tryMergingIntoMemcpy(Instruction *StartInst, Value *StartDstPtr,
+                                    Value *StartSrcPtr);
 
   bool iterateOnFunction(Function &F);
 };
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -31,6 +31,7 @@
 #define DEBUG_TYPE "memcpyopt"
 
 STATISTIC(NumMemCpyInstr, "Number of memcpy instructions deleted");
+STATISTIC(NumMemCpyInfer, "Number of memcpys inferred");
 STATISTIC(NumMemSetInfer, "Number of memsets inferred");
 STATISTIC(NumMoveToCpy,   "Number of memmoves converted to memcpy");
 STATISTIC(NumCpyToSet,    "Number of memcpys converted to memset");
@@ -119,6 +120,18 @@
   return true;
 }
 
+static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
+                                    const LoadInst *LI) {
+  unsigned StoreAlign = SI->getAlignment();
+  if (!StoreAlign)
+    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
+  unsigned LoadAlign = LI->getAlignment();
+  if (!LoadAlign)
+    LoadAlign = DL.getABITypeAlignment(LI->getType());
+
+  return std::min(StoreAlign, LoadAlign);
+}
+
 
 /// Represents a range of memset'd bytes with the ByteVal value.
 /// This allows us to analyze stores like:
@@ -131,14 +144,18 @@
 /// to [0, 2). The third makes a new range [2, 3). The fourth store joins the
 /// two ranges into [0, 3) which is memset'able.
 namespace {
-struct MemsetRange {
+struct MemIntrinsicRange {
   // Start/End - A semi range that describes the span that this range covers.
   // The range is closed at the start and open at the end: [Start, End).
   int64_t Start, End;
 
-  /// StartPtr - The getelementptr instruction that points to the start of the
-  /// range.
-  Value *StartPtr;
+  /// DestStartPtr - The getelementptr instruction that points to the start of
+  /// the range being stored to.
+  Value *DestStartPtr;
+
+  /// SrcStartPtr - The getelementptr instruction that points to the start of
+  /// the range being read from.
+  Value *SrcStartPtr;
 
   /// Alignment - The known alignment of the first store.
   unsigned Alignment;
@@ -146,21 +163,22 @@
   /// TheStores - The actual stores that make up this range.
   SmallVector<Instruction*, 16> TheStores;
 
-  bool isProfitableToUseMemset(const DataLayout &DL) const;
+  bool isProfitableToUseMemIntrinsic(const DataLayout &DL) const;
 };
 } // end anon namespace
 
-bool MemsetRange::isProfitableToUseMemset(const DataLayout &DL) const {
-  // If we found more than 4 stores to merge or 16 bytes, use memset.
+bool MemIntrinsicRange::isProfitableToUseMemIntrinsic(
+    const DataLayout &DL) const {
+  // If we found more than 4 stores to merge or 16 bytes, use a mem intrinsic.
   if (TheStores.size() >= 4 || End-Start >= 16) return true;
 
   // If there is nothing to merge, don't do anything.
   if (TheStores.size() < 2) return false;
 
-  // If any of the stores are a memset, then it is always good to extend the
-  // memset.
+  // If any of the stores are already a mem intrinsic, then it is always good
+  // to extend it.
   for (Instruction *SI : TheStores)
-    if (!isa<StoreInst>(SI))
+    if (isa<MemIntrinsic>(SI))
       return true;
 
   // Assume that the code generator is capable of merging pairs of stores
@@ -194,15 +212,15 @@
 namespace {
-class MemsetRanges {
+class MemIntrinsicRanges {
   /// A sorted list of the memset ranges.
-  SmallVector<MemsetRange, 8> Ranges;
-  typedef SmallVectorImpl<MemsetRange>::iterator range_iterator;
+  SmallVector<MemIntrinsicRange, 8> Ranges;
+  typedef SmallVectorImpl<MemIntrinsicRange>::iterator range_iterator;
   const DataLayout &DL;
 
 public:
-  MemsetRanges(const DataLayout &DL) : DL(DL) {}
+  MemIntrinsicRanges(const DataLayout &DL) : DL(DL) {}
 
-  typedef SmallVectorImpl<MemsetRange>::const_iterator const_iterator;
+  typedef SmallVectorImpl<MemIntrinsicRange>::const_iterator const_iterator;
   const_iterator begin() const { return Ranges.begin(); }
   const_iterator end() const { return Ranges.end(); }
   bool empty() const { return Ranges.empty(); }
@@ -216,17 +234,35 @@
   void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
     int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+    unsigned Alignment =
+        SI->getAlignment()
+            ? SI->getAlignment()
+            : DL.getABITypeAlignment(SI->getValueOperand()->getType());
 
     addRange(OffsetFromFirst, StoreSize,
-             SI->getPointerOperand(), SI->getAlignment(), SI);
+             SI->getPointerOperand(), nullptr, Alignment, SI);
+  }
+
+  void addLoadStore(int64_t OffsetFromFirst, LoadInst *LI, StoreInst *SI) {
+    int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
+
+    addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+             LI->getPointerOperand(), findCommonAlignment(DL, SI, LI), SI);
   }
 
   void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
     int64_t Size = cast<ConstantInt>(MSI->getLength())->getZExtValue();
-    addRange(OffsetFromFirst, Size, MSI->getDest(), MSI->getAlignment(), MSI);
+    addRange(OffsetFromFirst, Size, MSI->getDest(), nullptr,
+             MSI->getAlignment(), MSI);
   }
 
-  void addRange(int64_t Start, int64_t Size, Value *Ptr,
+  void addMemTransfer(int64_t OffsetFromFirst, MemTransferInst *MTI) {
+    int64_t Size = cast<ConstantInt>(MTI->getLength())->getZExtValue();
+    addRange(OffsetFromFirst, Size, MTI->getDest(), MTI->getSource(),
+             MTI->getAlignment(), MTI);
+  }
+
+  void addRange(int64_t Start, int64_t Size, Value *DestPtr, Value *SrcPtr,
                 unsigned Alignment, Instruction *Inst);
 };
 
@@ -234,24 +270,26 @@
 } // end anon namespace
 
-/// Add a new store to the MemsetRanges data structure. This adds a
+/// Add a new store to the MemIntrinsicRanges data structure. This adds a
 /// new range for the specified store at the specified offset, merging into
 /// existing ranges as appropriate.
-void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
-                            unsigned Alignment, Instruction *Inst) {
+void MemIntrinsicRanges::addRange(int64_t Start, int64_t Size, Value *DestPtr,
+                                  Value *SrcPtr, unsigned Alignment,
+                                  Instruction *Inst) {
   int64_t End = Start+Size;
 
   range_iterator I = std::lower_bound(Ranges.begin(), Ranges.end(), Start,
-    [](const MemsetRange &LHS, int64_t RHS) { return LHS.End < RHS; });
+    [](const MemIntrinsicRange &LHS, int64_t RHS) { return LHS.End < RHS; });
 
   // We now know that I == E, in which case we didn't find anything to merge
   // with, or that Start <= I->End. If End < I->Start or I == E, then we need
   // to insert a new range. Handle this now.
   if (I == Ranges.end() || End < I->Start) {
-    MemsetRange &R = *Ranges.insert(I, MemsetRange());
+    MemIntrinsicRange &R = *Ranges.insert(I, MemIntrinsicRange());
     R.Start = Start;
     R.End = End;
-    R.StartPtr = Ptr;
+    R.DestStartPtr = DestPtr;
+    R.SrcStartPtr = SrcPtr;
     R.Alignment = Alignment;
     R.TheStores.push_back(Inst);
     return;
@@ -273,7 +311,8 @@
   // stopped on *it*.
   if (Start < I->Start) {
     I->Start = Start;
-    I->StartPtr = Ptr;
+    I->DestStartPtr = DestPtr;
+    I->SrcStartPtr = SrcPtr;
     I->Alignment = Alignment;
   }
 
@@ -369,7 +408,7 @@
   // all subsequent stores of the same value to offset from the same pointer.
   // Join these together into ranges, so we can decide whether contiguous blocks
   // are stored.
-  MemsetRanges Ranges(DL);
+  MemIntrinsicRanges Ranges(DL);
 
   BasicBlock::iterator BI(StartInst);
   for (++BI; !isa<TerminatorInst>(BI); ++BI) {
@@ -431,28 +470,22 @@
   // Now that we have full information about ranges, loop over the ranges and
   // emit memset's for anything big enough to be worthwhile.
   Instruction *AMemSet = nullptr;
-  for (const MemsetRange &Range : Ranges) {
+  for (const MemIntrinsicRange &Range : Ranges) {
 
     if (Range.TheStores.size() == 1) continue;
 
     // If it is profitable to lower this range to memset, do so now.
-    if (!Range.isProfitableToUseMemset(DL))
+    if (!Range.isProfitableToUseMemIntrinsic(DL))
       continue;
 
     // Otherwise, we do want to transform this! Create a new memset.
     // Get the starting pointer of the block.
-    StartPtr = Range.StartPtr;
-
-    // Determine alignment
+    StartPtr = Range.DestStartPtr;
     unsigned Alignment = Range.Alignment;
-    if (Alignment == 0) {
-      Type *EltType =
-        cast<PointerType>(StartPtr->getType())->getElementType();
-      Alignment = DL.getABITypeAlignment(EltType);
-    }
+    assert(!Range.SrcStartPtr && "memset containing transfer instruction?");
 
-    AMemSet =
-      Builder.CreateMemSet(StartPtr, ByteVal, Range.End-Range.Start, Alignment);
+    AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
+                                   Alignment);
 
     DEBUG(dbgs() << "Replace stores:\n";
           for (Instruction *SI : Range.TheStores)
@@ -473,16 +506,149 @@
   return AMemSet;
 }
 
-static unsigned findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
-                                    const LoadInst *LI) {
-  unsigned StoreAlign = SI->getAlignment();
-  if (!StoreAlign)
-    StoreAlign = DL.getABITypeAlignment(SI->getOperand(0)->getType());
-  unsigned LoadAlign = LI->getAlignment();
-  if (!LoadAlign)
-    LoadAlign = DL.getABITypeAlignment(LI->getType());
+/// When scanning forward over instructions, we look for some other patterns to
+/// fold away. In particular, this looks for loads and stores that copy
+/// neighboring locations of memory. If it sees enough consecutive ones, it
+/// attempts to merge them together into a memcpy.
+Instruction *MemCpyOptPass::tryMergingIntoMemcpy(Instruction *StartInst,
+                                                 Value *StartDestPtr,
+                                                 Value *StartSrcPtr) {
+  const DataLayout &DL = StartInst->getModule()->getDataLayout();
+  AliasAnalysis &AA = LookupAliasAnalysis();
 
-  return std::min(StoreAlign, LoadAlign);
+  // We have a candidate load/store pair (or memcpy). Scan forward to find
+  // subsequent loads and stores that copy adjacent offsets from the same
+  // source pointer to the same destination pointer. Join these together into
+  // ranges, so we can decide whether contiguous blocks are copied.
+  MemIntrinsicRanges Ranges(DL);
+
+  BasicBlock::iterator BI(StartInst);
+  LoadInst *NextLoad = nullptr;
+  for (; !isa<TerminatorInst>(BI); ++BI) {
+    if (!isa<LoadInst>(BI) && !isa<StoreInst>(BI) &&
+        !isa<MemTransferInst>(BI)) {
+      // If the instruction is readnone, ignore it, otherwise bail out.  We
+      // don't even allow readonly here because we don't want something like:
+      // A[1] = 2; strlen(A); A[2] = 2; -> memcpy(A, ...); strlen(A).
+      if (BI->mayWriteToMemory() || BI->mayReadFromMemory())
+        break;
+      continue;
+    }
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(BI)) {
+      if (NextLoad || !LI->isSimple() || !LI->hasOneUse())
+        break;
+      NextLoad = LI;
+    } else if (StoreInst *NextStore = dyn_cast<StoreInst>(BI)) {
+      // If this is a store, see if we can merge it in.
+      if (!NextLoad || NextLoad != NextStore->getValueOperand() ||
+          !NextStore->isSimple())
+        break;
+
+      // Check to see if this store is to a constant offset from the start ptr.
+      int64_t DestOffset;
+      if (!IsPointerOffset(StartDestPtr, NextStore->getPointerOperand(),
+                           DestOffset, DL))
+        break;
+
+      int64_t SrcOffset;
+      if (!IsPointerOffset(StartSrcPtr, NextLoad->getPointerOperand(),
+                           SrcOffset, DL))
+        break;
+
+      if (DestOffset != SrcOffset)
+        break;
+
+      Ranges.addLoadStore(DestOffset, NextLoad, NextStore);
+      NextLoad = nullptr;
+    } else {
+      MemTransferInst *MTI = cast<MemTransferInst>(BI);
+
+      if (NextLoad || MTI->isVolatile() || !isa<ConstantInt>(MTI->getLength()))
+        break;
+
+      // Check to see if this transfer is at a constant offset from the start ptr.
+      int64_t DestOffset;
+      if (!IsPointerOffset(StartDestPtr, MTI->getDest(), DestOffset, DL))
+        break;
+
+      int64_t SrcOffset;
+      if (!IsPointerOffset(StartSrcPtr, MTI->getSource(), SrcOffset, DL))
+        break;
+
+      if (SrcOffset != DestOffset)
+        break;
+
+      Ranges.addMemTransfer(SrcOffset, MTI);
+    }
+  }
+
+  // If we have no ranges, then we just had a single store with nothing that
+  // could be merged in. This is a very common case of course.
+  if (Ranges.empty())
+    return nullptr;
+
+  // If we create any memcpys, we put them right before the first instruction
+  // that isn't part of the merged block. This ensures that the memcpy is
+  // dominated by any addressing instruction needed by the start of the block.
+  IRBuilder<> Builder(&*BI);
+
+  // Now that we have full information about ranges, loop over the ranges and
+  // emit memcpys for anything big enough to be worthwhile.
+  Instruction *AMemCpy = nullptr;
+  for (const MemIntrinsicRange &Range : Ranges) {
+
+    if (Range.TheStores.size() == 1) continue;
+
+    // If it is profitable to lower this range to memcpy, do so now.
+    if (!Range.isProfitableToUseMemIntrinsic(DL))
+      continue;
+
+    // Otherwise, we do want to transform this! Create a new memcpy.
+    // Get the starting pointers of the block.
+    Value *DestStartPtr = Range.DestStartPtr;
+    Value *SrcStartPtr = Range.SrcStartPtr;
+    unsigned Alignment = Range.Alignment;
+
+    // We don't keep track of load/store pairs well enough to determine whether
+    // a memmove is permitted for possibly-aliasing addresses (both order and
+    // duplicates matter in that case, possibly in ways only determined
+    // dynamically).
+    uint64_t Size = Range.End - Range.Start;
+    if (!AA.isNoAlias(MemoryLocation(DestStartPtr, Size),
+                      MemoryLocation(SrcStartPtr, Size)))
+      continue;
+
+    AMemCpy = Builder.CreateMemCpy(DestStartPtr, SrcStartPtr, Size, Alignment);
+
+    DEBUG(dbgs() << "Replace load/stores:\n";
+          for (Instruction *I : Range.TheStores) {
+            if (StoreInst *SI = dyn_cast<StoreInst>(I))
+              dbgs() << *SI->getValueOperand() << '\n';
+            dbgs() << *I << '\n';
+          }
+          dbgs() << "With: " << *AMemCpy << '\n');
+
+    if (!Range.TheStores.empty())
+      AMemCpy->setDebugLoc(Range.TheStores[0]->getDebugLoc());
+
+    // Zap all the excess operations.
+    for (Instruction *I : Range.TheStores) {
+      if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+        auto LI = cast<LoadInst>(SI->getValueOperand());
+        MD->removeInstruction(LI);
+        MD->removeInstruction(SI);
+        SI->eraseFromParent();
+        LI->eraseFromParent();
+      } else {
+        MD->removeInstruction(I);
+        I->eraseFromParent();
+      }
+    }
+    ++NumMemCpyInfer;
+  }
+
+  return AMemCpy;
 }
 
 // This method try to lift a store instruction before position P.
@@ -653,6 +819,10 @@
           BBI = M->getIterator();
           return true;
         }
+      } else if (Instruction *I = tryMergingIntoMemcpy(
+                     LI, SI->getPointerOperand(), LI->getPointerOperand())) {
+        BBI = I->getIterator();
+        return true;
       }
 
       // Detect cases where we're performing call slot forwarding, but
@@ -1132,7 +1302,7 @@
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
   // We can only optimize non-volatile memcpy's.
   if (M->isVolatile()) return false;
 
@@ -1225,6 +1395,9 @@
     return true;
   }
 
+  if (auto I = tryMergingIntoMemcpy(M, M->getDest(), M->getSource()))
+    BBI = I->getIterator();
+
   return false;
 }
 
@@ -1346,7 +1519,7 @@
       else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
         RepeatInstruction = processMemSet(M, BI);
       else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
-        RepeatInstruction = processMemCpy(M);
+        RepeatInstruction = processMemCpy(M, BI);
      else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
        RepeatInstruction = processMemMove(M);
      else if (auto CS = CallSite(I)) {
Index: test/Transforms/MemCpyOpt/form-memcpy.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/form-memcpy.ll
@@ -0,0 +1,326 @@
+; RUN: opt < %s -memcpyopt -S | FileCheck %s
+
+define void @test_simple_memcpy(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_simple_memcpy
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_simple_memmove(i32* %dst, i32* %src) {
+; CHECK-LABEL: @test_simple_memmove
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+; Make sure we can handle calculating bases & offsets from a real memcpy.
+define void @test_initial_memcpy(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_initial_memcpy
+; CHECK: {{%.*}} = bitcast i32* %dst to i8*
+; CHECK: {{%.*}} = bitcast i32* %src to i8*
+; CHECK: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+  %dst.0 = bitcast i32* %dst to i8*
+  %src.0 = bitcast i32* %src to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.0, i8* %src.0, i64 4, i32 4, i1 false)
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_volatile_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_volatile_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load volatile i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_atomic_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_atomic_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store atomic i32 %val.1, i32* %dst.1 unordered, align 4
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define i32 @test_multi_use_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_multi_use_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret i32 %val.1
+}
+
+define void @test_side_effect_skipped(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_side_effect_skipped
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  call void asm sideeffect "", ""()
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_holes_handled(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_holes_handled
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+; CHECK-DAG: [[DST:%.*]] = bitcast i32* %dst.7 to i8*
+; CHECK-DAG: [[SRC:%.*]] = bitcast i32* %src.7 to i8*
+; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[DST]], i8* [[SRC]], i64 16, i32 4, i1 false)
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+
+  %src.7 = getelementptr i32, i32* %src, i32 7
+  %dst.7 = getelementptr i32, i32* %dst, i32 7
+  %val.7 = load i32, i32* %src.7
+  store i32 %val.7, i32* %dst.7
+
+  %src.9 = getelementptr i32, i32* %src, i32 9
+  %dst.9 = getelementptr i32, i32* %dst, i32 9
+  %val.9 = load i32, i32* %src.9
+  store i32 %val.9, i32* %dst.9
+
+  %src.10 = getelementptr i32, i32* %src, i32 10
+  %dst.10 = getelementptr i32, i32* %dst, i32 10
+  %val.10 = load i32, i32* %src.10
+  store i32 %val.10, i32* %dst.10
+
+  %src.8 = getelementptr i32, i32* %src, i32 8
+  %dst.8 = getelementptr i32, i32* %dst, i32 8
+  %val.8 = load i32, i32* %src.8
+  store i32 %val.8, i32* %dst.8
+
+  ret void
+}
+
+define void @test_offset_mismatch(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_offset_mismatch
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %dst.2 = getelementptr i32, i32* %dst, i32 1
+  %val.2 = load i32, i32* %src.2
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 2
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+define void @test_non_idempotent_ops(i8* %dst, i8* %src) {
+; CHECK-LABEL: @test_non_idempotent_ops
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK-NOT: call void @llvm.memmove
+
+  %val.0 = load i8, i8* %src
+  store i8 %val.0, i8* %dst
+
+  %src.2 = getelementptr i8, i8* %src, i8 2
+  %dst.2 = getelementptr i8, i8* %dst, i8 2
+  %val.2 = load i8, i8* %src.2
+  store i8 %val.2, i8* %dst.2
+
+  %val.0.dup = load i8, i8* %src
+  store i8 %val.0.dup, i8* %dst
+
+  %src.1 = getelementptr i8, i8* %src, i8 1
+  %dst.1 = getelementptr i8, i8* %dst, i8 1
+  %val.1 = load i8, i8* %src.1
+  store i8 %val.1, i8* %dst.1
+
+  %src.3 = getelementptr i8, i8* %src, i8 3
+  %dst.3 = getelementptr i8, i8* %dst, i8 3
+  %val.3 = load i8, i8* %src.3
+  store i8 %val.3, i8* %dst.3
+
+  ret void
+}
+
+define void @test_intervening_op(i32* noalias %dst, i32* noalias %src) {
+; CHECK-LABEL: @test_intervening_op
+; CHECK-NOT: call void @llvm.memcpy
+
+  %val.0 = load i32, i32* %src
+  store i32 %val.0, i32* %dst
+
+  %src.2 = getelementptr i32, i32* %src, i32 2
+  %src16.2 = bitcast i32* %src.2 to i16*
+  %dst.2 = getelementptr i32, i32* %dst, i32 2
+  %val16.2 = load i16, i16* %src16.2
+  %val.2 = sext i16 %val16.2 to i32
+  store i32 %val.2, i32* %dst.2
+
+  %src.1 = getelementptr i32, i32* %src, i32 1
+  %dst.1 = getelementptr i32, i32* %dst, i32 1
+  %val.1 = load i32, i32* %src.1
+  store i32 %val.1, i32* %dst.1
+
+  %src.3 = getelementptr i32, i32* %src, i32 3
+  %dst.3 = getelementptr i32, i32* %dst, i32 3
+  %val.3 = load i32, i32* %src.3
+  store i32 %val.3, i32* %dst.3
+
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8*, i8*, i64, i32, i1)
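Illustration (editor-added, not part of the patch; the %dst.i8/%src.i8 names below are hypothetical stand-ins for the bitcasts the pass inserts). The new tryMergingIntoMemcpy walk is intended to take a run of adjacent load/store pairs like the one in @test_simple_memcpy above:

  %val.0 = load i32, i32* %src
  store i32 %val.0, i32* %dst
  %src.1 = getelementptr i32, i32* %src, i32 1
  %dst.1 = getelementptr i32, i32* %dst, i32 1
  %val.1 = load i32, i32* %src.1
  store i32 %val.1, i32* %dst.1
  ; ... likewise for offsets 2 and 3 ...

and, once the range is profitable and alias analysis proves the 16-byte source and destination spans cannot overlap (hence the noalias arguments in the positive tests), replace it with a single call:

  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst.i8, i8* %src.i8, i64 16, i32 4, i1 false)

The new test file can be exercised on its own using the RUN line it carries, e.g.:

  opt < test/Transforms/MemCpyOpt/form-memcpy.ll -memcpyopt -S | FileCheck test/Transforms/MemCpyOpt/form-memcpy.ll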