Index: include/llvm/Transforms/Scalar/MemCpyOptimizer.h
===================================================================
--- include/llvm/Transforms/Scalar/MemCpyOptimizer.h
+++ include/llvm/Transforms/Scalar/MemCpyOptimizer.h
@@ -30,9 +30,11 @@
 class Function;
 class Instruction;
 class MemCpyInst;
-class MemMoveInst;
+class AnyMemCpyInst;
+class AnyMemMoveInst;
 class MemoryDependenceResults;
 class MemSetInst;
+class AnyMemSetInst;
 class StoreInst;
 class TargetLibraryInfo;
 class Value;
@@ -60,13 +62,13 @@
   // Helper functions
   bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
   bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-  bool processMemCpy(MemCpyInst *M);
-  bool processMemMove(MemMoveInst *M);
+  bool processAnyMemCpy(AnyMemCpyInst *M);
+  bool processAnyMemMove(AnyMemMoveInst *M);
   bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
                             uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
-  bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
+  bool processMemCpyMemCpyDependence(AnyMemCpyInst *M, AnyMemCpyInst *MDep);
   bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
-  bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
+  bool performMemCpyToMemSetOptzn(AnyMemCpyInst *M, AnyMemSetInst *MDep);
   bool processByValArgument(CallSite CS, unsigned ArgNo);
   Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
                                     Value *ByteVal);
Index: lib/Transforms/Scalar/MemCpyOptimizer.cpp
===================================================================
--- lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -1005,11 +1005,22 @@
 
 /// We've found that the (upward scanning) memory dependence of memcpy 'M' is
 /// the memcpy 'MDep'. Try to simplify M to copy from MDep's input if we can.
-bool MemCpyOptPass::processMemCpyMemCpyDependence(MemCpyInst *M,
-                                                  MemCpyInst *MDep) {
+bool MemCpyOptPass::processMemCpyMemCpyDependence(AnyMemCpyInst *M,
+                                                  AnyMemCpyInst *MDep) {
+  auto *MI = dyn_cast<MemCpyInst>(M);
+  auto *AMI = dyn_cast<AtomicMemCpyInst>(M);
+  auto *MIDep = dyn_cast<MemCpyInst>(MDep);
+  auto *AMIDep = dyn_cast<AtomicMemCpyInst>(MDep);
+
+  // Atomicity of the memcpy & dependent memcpy must be the same.
+  // FIXME: It is probably okay to do this replacement if M is atomic, and
+  // MDep is not.
+  if (!((MI && MIDep) || (AMI && AMIDep)))
+    return false;
+
   // We can only transforms memcpy's where the dest of one is the source of the
   // other.
-  if (M->getSource() != MDep->getDest() || MDep->isVolatile())
+  if (M->getSource() != MDep->getDest() || (MIDep && MIDep->isVolatile()))
     return false;
 
   // If dep instruction is reading from our current input, then it is a noop
@@ -1027,6 +1038,12 @@
   if (!MDepLen || !MLen || MDepLen->getZExtValue() < MLen->getZExtValue())
     return false;
 
+  // If we have an atomic memcpy, then the element sizes of M and MDep
+  // must be the same.
+  // FIXME: It may be possible to relax this restriction in some way.
+  if (AMI && AMIDep->getElementSizeInBytes() != AMI->getElementSizeInBytes())
+    return false;
+
   AliasAnalysis &AA = LookupAliasAnalysis();
 
   // Verify that the copied-from memory doesn't change in between the two
@@ -1060,14 +1077,27 @@
   // TODO: Is this worth it if we're creating a less aligned memcpy? For
   // example we could be moving from movaps -> movq on x86.
   IRBuilder<> Builder(M);
-  if (UseMemMove)
-    Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
-                          MDep->getRawSource(), MDep->getSourceAlignment(),
-                          M->getLength(), M->isVolatile());
-  else
-    Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
-                         MDep->getRawSource(), MDep->getSourceAlignment(),
-                         M->getLength(), M->isVolatile());
+  if (MI) {
+    // Non-atomic operation
+    if (UseMemMove)
+      Builder.CreateMemMove(M->getRawDest(), M->getDestAlignment(),
+                            MDep->getRawSource(), MDep->getSourceAlignment(),
+                            M->getLength(), M->isVolatile());
+    else
+      Builder.CreateMemCpy(M->getRawDest(), M->getDestAlignment(),
+                           MDep->getRawSource(), MDep->getSourceAlignment(),
+                           M->getLength(), M->isVolatile());
+  } else {
+    // Atomic operation
+    if (UseMemMove)
+      Builder.CreateElementUnorderedAtomicMemMove(M->getRawDest(), M->getDestAlignment(),
+                                                  MDep->getRawSource(), MDep->getSourceAlignment(),
+                                                  M->getLength(), AMI->getElementSizeInBytes());
+    else
+      Builder.CreateElementUnorderedAtomicMemCpy(M->getRawDest(), M->getDestAlignment(),
+                                                 MDep->getRawSource(), MDep->getSourceAlignment(),
+                                                 M->getLength(), AMI->getElementSizeInBytes());
+  }
 
   // Remove the instruction we're replacing.
   MD->removeInstruction(M);
@@ -1155,8 +1185,8 @@
 /// When dst2_size <= dst1_size.
 ///
 /// The \p MemCpy must have a Constant length.
-bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
-                                               MemSetInst *MemSet) {
+bool MemCpyOptPass::performMemCpyToMemSetOptzn(AnyMemCpyInst *MemCpy,
+                                               AnyMemSetInst *MemSet) {
   AliasAnalysis &AA = LookupAliasAnalysis();
 
   // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and
@@ -1172,8 +1202,13 @@
     return false;
 
   IRBuilder<> Builder(MemCpy);
-  Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
-                       CopySize, MemCpy->getDestAlignment());
+  if (auto *AMC = dyn_cast<AtomicMemCpyInst>(MemCpy))
+    Builder.CreateElementUnorderedAtomicMemSet(MemCpy->getRawDest(), MemSet->getValue(),
+                                               CopySize, MemCpy->getDestAlignment(),
+                                               AMC->getElementSizeInBytes());
+  else
+    Builder.CreateMemSet(MemCpy->getRawDest(), MemSet->getOperand(1),
+                         CopySize, MemCpy->getDestAlignment());
 
   return true;
 }
@@ -1182,9 +1217,12 @@
 /// B to be a memcpy from X to Z (or potentially a memmove, depending on
 /// circumstances). This allows later passes to remove the first memcpy
 /// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+bool MemCpyOptPass::processAnyMemCpy(AnyMemCpyInst *M) {
+  auto *MI = dyn_cast<MemCpyInst>(M);
+  auto *AMI = dyn_cast<AtomicMemCpyInst>(M);
+
   // We can only optimize non-volatile memcpy's.
-  if (M->isVolatile()) return false;
+  if (MI && MI->isVolatile()) return false;
 
   // If the source and destination of the memcpy are the same, then zap it.
   if (M->getSource() == M->getDest()) {
@@ -1198,8 +1236,14 @@
     if (GV->isConstant() && GV->hasDefinitiveInitializer())
       if (Value *ByteVal = isBytewiseValue(GV->getInitializer())) {
         IRBuilder<> Builder(M);
-        Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
-                             M->getDestAlignment(), false);
+        if (MI)
+          Builder.CreateMemSet(M->getRawDest(), ByteVal, M->getLength(),
+                               M->getDestAlignment(), false);
+        else
+          Builder.CreateElementUnorderedAtomicMemSet(M->getRawDest(), ByteVal,
+                                                     M->getLength(),
+                                                     M->getDestAlignment(),
+                                                     AMI->getElementSizeInBytes());
         MD->removeInstruction(M);
         M->eraseFromParent();
         ++NumCpyToSet;
@@ -1210,9 +1254,9 @@
 
   // Try to turn a partially redundant memset + memcpy into
   // memcpy + smaller memset.  We don't need the memcpy size for this.
-  if (DepInfo.isClobber())
+  if (MI && DepInfo.isClobber())
     if (MemSetInst *MDep = dyn_cast<MemSetInst>(DepInfo.getInst()))
-      if (processMemSetMemCpyDependence(M, MDep))
+      if (processMemSetMemCpyDependence(MI, MDep))
         return true;
 
   // The optimizations after this point require the memcpy size.
@@ -1226,12 +1270,12 @@
   //   lifetime copies undefined data, and we can therefore eliminate the
   //   memcpy in favor of the data that was already at the destination.
   // d) memcpy from a just-memset'd source can be turned into memset.
-  if (DepInfo.isClobber()) {
+  if (MI && DepInfo.isClobber()) {
     if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
       // FIXME: Can we pass in either of dest/src alignment here instead
       // of conservatively taking the minimum?
       unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
-      if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
+      if (performCallSlotOptzn(MI, M->getDest(), M->getSource(),
                                CopySize->getZExtValue(), Align, C)) {
         MD->removeInstruction(M);
@@ -1246,7 +1290,7 @@
       SrcLoc, true, M->getIterator(), M->getParent());
 
   if (SrcDepInfo.isClobber()) {
-    if (MemCpyInst *MDep = dyn_cast<MemCpyInst>(SrcDepInfo.getInst()))
+    if (AnyMemCpyInst *MDep = dyn_cast<AnyMemCpyInst>(SrcDepInfo.getInst()))
       return processMemCpyMemCpyDependence(M, MDep);
   } else if (SrcDepInfo.isDef()) {
     Instruction *I = SrcDepInfo.getInst();
@@ -1270,7 +1314,7 @@
   }
 
   if (SrcDepInfo.isClobber())
-    if (MemSetInst *MDep = dyn_cast<MemSetInst>(SrcDepInfo.getInst()))
+    if (AnyMemSetInst *MDep = dyn_cast<AnyMemSetInst>(SrcDepInfo.getInst()))
       if (performMemCpyToMemSetOptzn(M, MDep)) {
         MD->removeInstruction(M);
         M->eraseFromParent();
@@ -1283,7 +1327,7 @@
 
 /// Transforms memmove calls to memcpy calls when the src/dst are guaranteed
 /// not to alias.
-bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
+bool MemCpyOptPass::processAnyMemMove(AnyMemMoveInst *M) {
   AliasAnalysis &AA = LookupAliasAnalysis();
 
   if (!TLI->has(LibFunc_memmove))
@@ -1301,8 +1345,9 @@
   Type *ArgTys[3] = { M->getRawDest()->getType(),
                       M->getRawSource()->getType(),
                       M->getLength()->getType() };
+  auto MemCpy = isa<MemMoveInst>(M) ? Intrinsic::memcpy : Intrinsic::memcpy_element_unordered_atomic;
   M->setCalledFunction(Intrinsic::getDeclaration(M->getModule(),
-                                                 Intrinsic::memcpy, ArgTys));
+                                                 MemCpy, ArgTys));
 
   // MemDep may have over conservative information about this instruction, just
   // conservatively flush it from the cache.
@@ -1412,10 +1457,10 @@
       MadeChange |= processStore(SI, BI);
     else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
       RepeatInstruction = processMemSet(M, BI);
-    else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
-      RepeatInstruction = processMemCpy(M);
-    else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
-      RepeatInstruction = processMemMove(M);
+    else if (AnyMemCpyInst *M = dyn_cast<AnyMemCpyInst>(I))
+      RepeatInstruction = processAnyMemCpy(M);
+    else if (AnyMemMoveInst *M = dyn_cast<AnyMemMoveInst>(I))
+      RepeatInstruction = processAnyMemMove(M);
     else if (auto CS = CallSite(I)) {
       for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
         if (CS.isByValArgument(i))
Index: test/Transforms/MemCpyOpt/atomic-memcpy.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/atomic-memcpy.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -memcpyopt -dse -S | FileCheck -enable-var-scope %s
+
+; A copy of the memcpy.ll tests, but modified to test the atomic memcpy intrinsic
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
+target triple = "i686-apple-darwin9"
+
+%0 = type { x86_fp80, x86_fp80 }
+%1 = type { i32, i32 }
+
+; Check that the first memcpy is removed
+define void @test1(%0* sret %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[TMP5:%.*]] = fsub x86_fp80 0xK80000000000000000000, [[Z_1:%.*]]
+; CHECK-NEXT:    call void @ccoshl(%0* sret [[MEMTMP]], x86_fp80 [[TMP5]], x86_fp80 [[Z_0:%.*]]) #0
+; CHECK-NEXT:    [[MEMTMP20:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; CHECK-NEXT:    [[AGG_RESULT21:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT21]], i8* align 16 [[MEMTMP20]], i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+entry:
+  %tmp2 = alloca %0
+  %memtmp = alloca %0, align 16
+  %tmp5 = fsub x86_fp80 0xK80000000000000000000, %z.1
+  call void @ccoshl(%0* sret %memtmp, x86_fp80 %tmp5, x86_fp80 %z.0) nounwind
+  %tmp219 = bitcast %0* %tmp2 to i8*
+  %memtmp20 = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %tmp219, i8* align 16 %memtmp20, i32 32, i32 1)
+  %agg.result21 = bitcast %0* %agg.result to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %agg.result21, i8* align 16 %tmp219, i32 32, i32 1)
+  ret void
+}
+
+declare void @ccoshl(%0* nocapture sret, x86_fp80, x86_fp80) nounwind
+
+
+; The intermediate alloca and one of the memcpy's should be eliminated, the
+; other should be replaced with a memmove.
+define void @test2(i8* %P, i8* %Q) nounwind {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i32 1)
+  ret void
+}
+
+; The element sizes differ between the two memcpy's. We leave these alone.
+define void @test2b(i8* %P, i8* %Q) nounwind {
+; CHECK-LABEL: @test2b(
+; CHECK-NEXT:    [[MEMTMP:%.*]] = alloca [[TMP0:%.*]], align 16
+; CHECK-NEXT:    [[R:%.*]] = bitcast %0* [[MEMTMP]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[R]], i8* align 16 [[P:%.*]], i32 32, i32 1)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[R]], i32 32, i32 4)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i32 4)
+  ret void
+}
+
+; The intermediate alloca and one of the memcpy's should be eliminated, the
+; other should be replaced with a memcpy.element.unordered.atomic.
+define void @test2_memcpy(i8* noalias %P, i8* noalias %Q) nounwind {
+; CHECK-LABEL: @test2_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  %memtmp = alloca %0, align 16
+  %R = bitcast %0* %memtmp to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i32 1)
+  ret void
+}
+
+@x = external global %0
+
+define void @test3(%0* noalias sret %agg.result) nounwind {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[AGG_RESULT2:%.*]] = bitcast %0* [[AGG_RESULT:%.*]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[AGG_RESULT2]], i8* align 16 bitcast (%0* @x to i8*), i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  %x.0 = alloca %0
+  %x.01 = bitcast %0* %x.0 to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %x.01, i8* align 16 bitcast (%0* @x to i8*), i32 32, i32 1)
+  %agg.result2 = bitcast %0* %agg.result to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %agg.result2, i8* align 16 %x.01, i32 32, i32 1)
+  ret void
+}
+
+
+; PR8644
+define void @test4(i8 *%P) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[TMP1:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = bitcast %1* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[B]], i8* align 4 [[P:%.*]], i64 8, i32 1)
+; CHECK-NEXT:    call void @test4a(i8* byval align 1 [[B]])
+; CHECK-NEXT:    ret void
+;
+  %A = alloca %1
+  %B = bitcast %1* %A to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %B, i8* align 4 %P, i64 8, i32 1)
+  call void @test4a(i8* align 1 byval %B)
+  ret void
+}
+
+; Make sure we don't remove the memcpy if the source address space doesn't match the byval argument
+define void @test4_addrspace(i8 addrspace(1)* %P) {
+; CHECK-LABEL: @test4_addrspace(
+; CHECK-NEXT:    [[A:%.*]] = alloca [[TMP1:%.*]]
+; CHECK-NEXT:    [[B:%.*]] = bitcast %1* [[A]] to i8*
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p1i8.i64(i8* align 4 [[B]], i8 addrspace(1)* align 4 [[P:%.*]], i64 8, i32 1)
+; CHECK-NEXT:    call void @test4a(i8* byval align 1 [[B]])
+; CHECK-NEXT:    ret void
+;
+  %A = alloca %1
+  %B = bitcast %1* %A to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p1i8.i64(i8* align 4 %B, i8 addrspace(1)* align 4 %P, i64 8, i32 1)
+  call void @test4a(i8* align 1 byval %B)
+  ret void
+}
+
+declare void @test4a(i8* align 1 byval)
+
+;; Noop memcpy should be zapped.
+define void @test6(i8 *%P) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %P, i8* align 4 %P, i64 8, i32 1)
+  ret void
+}
+
+
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32) nounwind
Index: test/Transforms/MemCpyOpt/memcpy-to-memset.ll
===================================================================
--- test/Transforms/MemCpyOpt/memcpy-to-memset.ll
+++ test/Transforms/MemCpyOpt/memcpy-to-memset.ll
@@ -3,6 +3,7 @@
 @cst = internal constant [3 x i32] [i32 -1, i32 -1, i32 -1], align 4
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 declare void @foo(i32*) nounwind
 
 define void @test1() nounwind {
@@ -17,3 +18,16 @@
 ; CHECK-NOT: call void @llvm.memcpy
 ; CHECK: ret void
 }
+
+define void @test1_atomic() nounwind {
+  %arr = alloca [3 x i32], align 4
+  %arr_i8 = bitcast [3 x i32]* %arr to i8*
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %arr_i8, i8* align 4 bitcast ([3 x i32]* @cst to i8*), i64 12, i32 1)
+  %arraydecay = getelementptr inbounds [3 x i32], [3 x i32]* %arr, i64 0, i64 0
+  call void @foo(i32* %arraydecay) nounwind
+  ret void
+; CHECK-LABEL: @test1_atomic(
+; CHECK: call void @llvm.memset.element.unordered.atomic.{{.*}}, i32 1)
+; CHECK-NOT: call void @llvm.memcpy
+; CHECK: ret void
+}
Index: test/Transforms/MemCpyOpt/memmove.ll
===================================================================
--- test/Transforms/MemCpyOpt/memmove.ll
+++ test/Transforms/MemCpyOpt/memmove.ll
@@ -5,6 +5,7 @@
 target triple = "x86_64-apple-darwin9.0"
 
 declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32) nounwind
 
 define i8* @test1(i8* nocapture %src) nounwind {
 entry:
@@ -19,6 +20,17 @@
 }
 declare noalias i8* @malloc(i32)
 
+define i8* @test1_atomic(i8* nocapture %src) nounwind {
+entry:
+; CHECK-LABEL: @test1_atomic(
+; CHECK: tail call void @llvm.memcpy.element.unordered.atomic{{.*}}, i32 1)
+
+  %malloccall = tail call i8* @malloc(i32 trunc (i64 mul nuw (i64 ptrtoint (i8* getelementptr (i8, i8* null, i32 1) to i64), i64 13) to i32))
+  %call3 = bitcast i8* %malloccall to [13 x i8]*
+  %call3.sub = getelementptr inbounds [13 x i8], [13 x i8]* %call3, i64 0, i64 0
+  tail call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %call3.sub, i8* align 1 %src, i64 13, i32 1)
+  ret i8* %call3.sub
+}
 
 define void @test2(i8* %P) nounwind {
 entry:
Index: test/Transforms/MemCpyOpt/memset-atomicmemcpy-to-2x-memset.ll
===================================================================
--- /dev/null
+++ test/Transforms/MemCpyOpt/memset-atomicmemcpy-to-2x-memset.ll
@@ -0,0 +1,123 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -memcpyopt -S %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @test(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i32 8)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i32 8)
+  ret void
+}
+
+define void @test2(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i32 8)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i32 4)
+  ret void
+}
+
+define void @test_smaller_memcpy(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_smaller_memcpy(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 2 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 2 [[DST2:%.*]], i8 [[C]], i64 100, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 2 %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 2 %dst2, i8* align 2 %dst1, i64 100, i32 2)
+  ret void
+}
+
+define void @test_smaller_memset(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_smaller_memset(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[DST1:%.*]], i8 [[C:%.*]], i64 100, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[DST2:%.*]], i8* align 4 [[DST1]], i64 128, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 4 %dst1, i8 %c, i64 100, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %dst2, i8* align 4 %dst1, i64 128, i32 4)
+  ret void
+}
+
+define void @test_align_memset(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_align_memset(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 128, i32 8)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 128, i32 8)
+  ret void
+}
+
+define void @test_different_types(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_types(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 8 [[DST2:%.*]], i8 [[C]], i32 100, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* align 8 %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dst2, i8* align 8 %dst1, i32 100, i32 2)
+  ret void
+}
+
+define void @test_different_types_2(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_types_2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i32(i8* align 8 [[DST1:%.*]], i8 [[C:%.*]], i32 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* align 8 [[DST2:%.*]], i8 [[C]], i64 100, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i32(i8* align 8 %dst1, i8 %c, i32 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 8 %dst2, i8* align 8 %dst1, i64 100, i32 2)
+  ret void
+}
+
+define void @test_different_source_gep(i8* %dst1, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_different_source_gep(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    [[P:%.*]] = getelementptr i8, i8* [[DST1]], i64 64
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 [[DST2:%.*]], i8* align 4 [[P]], i64 64, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
+  ; FIXME: We could optimize this as well.
+  %p = getelementptr i8, i8* %dst1, i64 64
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 4 %dst2, i8* align 4 %p, i64 64, i32 4)
+  ret void
+}
+
+define void @test_variable_size_1(i8* %dst1, i64 %dst1_size, i8* %dst2, i8 %c) {
+; CHECK-LABEL: @test_variable_size_1(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 [[DST1_SIZE:%.*]], i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[DST2:%.*]], i8* align 1 [[DST1]], i64 128, i32 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 %dst1_size, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %dst2, i8* align 1 %dst1, i64 128, i32 1)
+  ret void
+}
+
+define void @test_variable_size_2(i8* %dst1, i8* %dst2, i64 %dst2_size, i8 %c) {
+; CHECK-LABEL: @test_variable_size_2(
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false)
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 [[DST2:%.*]], i8* align 1 [[DST1]], i64 [[DST2_SIZE:%.*]], i32 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memset.p0i8.i64(i8* %dst1, i8 %c, i64 128, i1 false)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* align 1 %dst2, i8* align 1 %dst1, i64 %dst2_size, i32 1)
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1)
+declare void @llvm.memset.element.unordered.atomic.p0i8.i64(i8* nocapture, i8, i64, i32)
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32)
+declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i1)
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32)
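
For reference, a condensed sketch of the central rewrite this patch enables, restating what @test2 and @test2_memcpy in atomic-memcpy.ll exercise (the %src/%dst/%tmp names below are illustrative only, not taken from the tests):

  ; before: the element-wise unordered-atomic copy goes through a temporary buffer
  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %tmp, i8* align 16 %src, i32 32, i32 1)
  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dst, i8* align 16 %tmp, i32 32, i32 1)

  ; after: the second copy reads directly from %src; when %src and %dst may alias,
  ; the pass emits the memmove form instead (as in @test2), and DSE can then delete
  ; the now-dead copy into %tmp.
  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dst, i8* align 16 %src, i32 32, i32 1)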