Index: llvm/trunk/lib/Transforms/Scalar/DeadStoreElimination.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ llvm/trunk/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -510,8 +510,8 @@
 /// memory region into an identical pointer) then it doesn't actually make its
 /// input dead in the traditional sense. Consider this case:
 ///
-///   memcpy(A <- B)
-///   memcpy(A <- A)
+///   memmove(A <- B)
+///   memmove(A <- A)
 ///
 /// In this case, the second store to A does not make the first store to A dead.
 /// The usual situation isn't an explicit A<-A store like this (which can be
@@ -527,23 +527,34 @@
   // Self reads can only happen for instructions that read memory. Get the
   // location read.
   MemoryLocation InstReadLoc = getLocForRead(Inst, TLI);
-  if (!InstReadLoc.Ptr) return false; // Not a reading instruction.
+  if (!InstReadLoc.Ptr)
+    return false; // Not a reading instruction.
 
   // If the read and written loc obviously don't alias, it isn't a read.
-  if (AA.isNoAlias(InstReadLoc, InstStoreLoc)) return false;
+  if (AA.isNoAlias(InstReadLoc, InstStoreLoc))
+    return false;
 
-  // Okay, 'Inst' may copy over itself. However, we can still remove a the
-  // DepWrite instruction if we can prove that it reads from the same location
-  // as Inst. This handles useful cases like:
-  //   memcpy(A <- B)
-  //   memcpy(A <- B)
-  // Here we don't know if A/B may alias, but we do know that B/B are must
-  // aliases, so removing the first memcpy is safe (assuming it writes <= #
-  // bytes as the second one.
-  MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
+  if (isa<MemCpyInst>(Inst)) {
+    // LLVM's memcpy overlap semantics are not fully fleshed out (see PR11763)
+    // but in practice memcpy(A <- B) either means that A and B are disjoint or
+    // are equal (i.e. there are not partial overlaps). Given that, if we have:
+    //
+    //   memcpy/memmove(A <- B)  // DepWrite
+    //   memcpy(A <- B)  // Inst
+    //
+    // with Inst reading/writing a >= size than DepWrite, we can reason as
+    // follows:
+    //
+    //   - If A == B then both the copies are no-ops, so the DepWrite can be
+    //     removed.
+    //   - If A != B then A and B are disjoint locations in Inst. Since
+    //     Inst.size >= DepWrite.size A and B are disjoint in DepWrite too.
+    //     Therefore DepWrite can be removed.
+    MemoryLocation DepReadLoc = getLocForRead(DepWrite, TLI);
 
-  if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
-    return false;
+    if (DepReadLoc.Ptr && AA.isMustAlias(InstReadLoc.Ptr, DepReadLoc.Ptr))
+      return false;
+  }
 
   // If DepWrite doesn't read memory or if we can't prove it is a must alias,
   // then it can't be considered dead.
Index: llvm/trunk/test/Transforms/DeadStoreElimination/simple.ll
===================================================================
--- llvm/trunk/test/Transforms/DeadStoreElimination/simple.ll
+++ llvm/trunk/test/Transforms/DeadStoreElimination/simple.ll
@@ -252,6 +252,9 @@
 ; Do not delete instruction where possible situation is:
 ; A = B
 ; A = A
+;
+; NB! See PR11763 - currently LLVM allows memcpy's source and destination to be
+; equal (but not inequal and overlapping).
 define void @test18(i8* %P, i8* %Q, i8* %R) nounwind ssp {
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false)
@@ -521,3 +524,51 @@
   store i32 0, i32* %p
   ret void
 }
+
+; We cannot optimize away the first memmove since %P could overlap with %Q.
+define void @test36(i8* %P, i8* %Q) {
+; CHECK-LABEL: @test36(
+; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+; CHECK-NEXT: ret
+
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  ret void
+}
+
+define void @test37(i8* %P, i8* %Q, i8* %R) {
+; CHECK-LABEL: @test37(
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false)
+; CHECK-NEXT: ret
+
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false)
+  ret void
+}
+
+; Same caveat about memcpy as in @test18 applies here.
+define void @test38(i8* %P, i8* %Q, i8* %R) {
+; CHECK-LABEL: @test38(
+; CHECK-NEXT: tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false)
+; CHECK-NEXT: ret
+
+  tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 12, i1 false)
+  ret void
+}
+
+define void @test39(i8* %P, i8* %Q, i8* %R) {
+; CHECK-LABEL: @test39(
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 8, i1 false)
+; CHECK-NEXT: ret
+
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i1 false)
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %R, i64 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1)