Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -73,19 +73,12 @@
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
-static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
-    "unfold-element-atomic-memcpy-max-elements",
-    cl::init(16),
-    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
-             "allowed to unfold"));
-
 static cl::opt<unsigned> GuardWideningWindow(
     "instcombine-guard-widening-window",
     cl::init(3),
     cl::desc("How wide an instruction window to bypass looking for "
              "another guard"));
 
-
 /// Return the specified type promoted as it would be to pass through a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@@ -113,84 +106,7 @@
   return ConstantVector::get(BoolVec);
 }
 
-Instruction *
-InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
-  // Try to unfold this intrinsic into sequence of explicit atomic loads and
-  // stores.
-  // First check that number of elements is compile time constant.
-  auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
-  if (!LengthCI)
-    return nullptr;
-
-  // Check that there are not too many elements.
-  uint64_t LengthInBytes = LengthCI->getZExtValue();
-  uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
-  uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
-  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
-    return nullptr;
-
-  // Only expand if there are elements to copy.
-  if (NumElements > 0) {
-    // Don't unfold into illegal integers
-    uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
-    if (!getDataLayout().isLegalInteger(ElementSizeInBits))
-      return nullptr;
-
-    // Cast source and destination to the correct type. Intrinsic input
-    // arguments are usually represented as i8*. Often operands will be
-    // explicitly casted to i8* and we can just strip those casts instead of
-    // inserting new ones. However it's easier to rely on other InstCombine
-    // rules which will cover trivial cases anyway.
-    Value *Src = AMI->getRawSource();
-    Value *Dst = AMI->getRawDest();
-    Type *ElementPointerType =
-        Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
-                           Src->getType()->getPointerAddressSpace());
-
-    Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
-                                                 "memcpy_unfold.src_casted");
-    Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
-                                                 "memcpy_unfold.dst_casted");
-
-    for (uint64_t i = 0; i < NumElements; ++i) {
-      // Get current element addresses
-      ConstantInt *ElementIdxCI =
-          ConstantInt::get(AMI->getContext(), APInt(64, i));
-      Value *SrcElementAddr =
-          Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
-      Value *DstElementAddr =
-          Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
-
-      // Load from the source. Transfer alignment information and mark load as
-      // unordered atomic.
-      LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
-      Load->setOrdering(AtomicOrdering::Unordered);
-      // We know alignment of the first element. It is also guaranteed by the
-      // verifier that element size is less or equal than first element
-      // alignment and both of this values are powers of two. This means that
-      // all subsequent accesses are at least element size aligned.
-      // TODO: We can infer better alignment but there is no evidence that this
-      // will matter.
-      Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
-                                : ElementSizeInBytes);
-      Load->setDebugLoc(AMI->getDebugLoc());
-
-      // Store loaded value via unordered atomic store.
-      StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
-      Store->setOrdering(AtomicOrdering::Unordered);
-      Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
-                                 : ElementSizeInBytes);
-      Store->setDebugLoc(AMI->getDebugLoc());
-    }
-  }
-
-  // Set the number of elements of the copy to 0, it will be deleted on the
-  // next iteration.
-  AMI->setLength(Constant::getNullValue(LengthCI->getType()));
-  return AMI;
-}
-
-Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
+Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
   unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
   unsigned CopyDstAlign = MI->getDestAlignment();
   if (CopyDstAlign < DstAlign){
@@ -198,17 +114,16 @@
     return MI;
   }
 
-  auto* MTI = cast<MemTransferInst>(MI);
-  unsigned SrcAlign = getKnownAlignment(MTI->getRawSource(), DL, MI, &AC, &DT);
-  unsigned CopySrcAlign = MTI->getSourceAlignment();
+  unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+  unsigned CopySrcAlign = MI->getSourceAlignment();
   if (CopySrcAlign < SrcAlign) {
-    MTI->setSourceAlignment(SrcAlign);
+    MI->setSourceAlignment(SrcAlign);
     return MI;
   }
 
   // If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
   // load/store.
-  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));
+  ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
   if (!MemOpLength) return nullptr;
 
   // Source and destination pointer types are always "i8*" for intrinsic.  See
@@ -250,7 +165,7 @@
 
   Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
   Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
-  LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());
+  LoadInst *L = Builder.CreateLoad(Src);
   // Alignment from the mem intrinsic will be better, so use it.
   L->setAlignment(CopySrcAlign);
   if (CopyMD)
@@ -260,7 +175,7 @@
   if (LoopMemParallelMD)
     L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
 
-  StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());
+  StoreInst *S = Builder.CreateStore(L, Dest);
   // Alignment from the mem intrinsic will be better, so use it.
   S->setAlignment(CopyDstAlign);
   if (CopyMD)
@@ -268,8 +183,19 @@
   if (LoopMemParallelMD)
     S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);
 
+  if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
+    // Non-atomics can be volatile.
+    L->setVolatile(MT->isVolatile());
+    S->setVolatile(MT->isVolatile());
+  }
+  if (isa<AtomicMemTransferInst>(MI)) {
+    // Atomics have to be unordered.
+    L->setOrdering(AtomicOrdering::Unordered);
+    S->setOrdering(AtomicOrdering::Unordered);
+  }
+
   // Set the size of the copy to 0, it will be deleted on the next iteration.
-  MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));
+  MI->setLength(Constant::getNullValue(MemOpLength->getType()));
   return MI;
 }
 
@@ -1781,7 +1707,7 @@
 
   // Intrinsics cannot occur in an invoke, so handle them here instead of in
   // visitCallSite.
-  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+  if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
     bool Changed = false;
 
     // memmove/cpy/set of zero bytes is a noop.
@@ -1798,17 +1724,21 @@
     }
 
     // No other transformations apply to volatile transfers.
-    if (MI->isVolatile())
-      return nullptr;
+    if (auto *M = dyn_cast<MemIntrinsic>(MI))
+      if (M->isVolatile())
+        return nullptr;
 
     // If we have a memmove and the source operation is a constant global,
     // then the source and dest pointers can't alias, so we can change this
     // into a call to memcpy.
-    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+    if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
       if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
         if (GVSrc->isConstant()) {
           Module *M = CI.getModule();
-          Intrinsic::ID MemCpyID = Intrinsic::memcpy;
+          Intrinsic::ID MemCpyID =
+              isa<AtomicMemMoveInst>(MMI)
+                  ? Intrinsic::memcpy_element_unordered_atomic
+                  : Intrinsic::memcpy;
           Type *Tys[3] = { CI.getArgOperand(0)->getType(),
                            CI.getArgOperand(1)->getType(),
                            CI.getArgOperand(2)->getType() };
@@ -1817,7 +1747,7 @@
         }
     }
 
-    if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
+    if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
       // memmove(x,x,size) -> noop.
       if (MTI->getSource() == MTI->getDest())
         return eraseInstFromFunction(CI);
@@ -1825,8 +1755,8 @@
 
     // If we can determine a pointer alignment that is bigger than currently
     // set, update the alignment.
-    if (isa<MemTransferInst>(MI)) {
-      if (Instruction *I = SimplifyMemTransfer(MI))
+    if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
+      if (Instruction *I = SimplifyAnyMemTransfer(MTI))
         return I;
     } else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
       if (Instruction *I = SimplifyMemSet(MSI))
@@ -1836,15 +1766,6 @@
     if (Changed) return II;
   }
 
-  if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
-    if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
-      if (C->isNullValue())
-        return eraseInstFromFunction(*AMI);
-
-    if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
-      return I;
-  }
-
   if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
     return I;
 
Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -824,8 +824,7 @@
   Instruction *MatchBSwap(BinaryOperator &I);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
 
-  Instruction *SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI);
-  Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
+  Instruction *SimplifyAnyMemTransfer(AnyMemTransferInst *MI);
   Instruction *SimplifyMemSet(MemSetInst *MI);
 
   Value *EvaluateInDifferentType(Value *V, Type *Ty, bool isSigned);
Index: llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
+++ llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
@@ -1,94 +0,0 @@
-; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
-; Temporarily an expected failure until inst combine is updated in the next patch
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-
-; Test basic unfolding -- unordered load & store
-define void @test1a(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test1a
-; CHECK-NOT: llvm.memcpy.element.unordered.atomic
-
-; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
-; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
-
-; CHECK-DAG: [[VAL1:%[^\s]+]] =  load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8
-
-; CHECK-DAG: [[VAL2:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
-
-; CHECK-DAG: [[VAL3:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
-
-; CHECK-DAG: [[VAL4:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
-; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 16, i32 4)
-  ret void
-}
-
-; Test that we don't unfold too much
-define void @test2(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test2
-
-; CHECK-NOT: load
-; CHECK-NOT: store
-; CHECK: llvm.memcpy.element.unordered.atomic
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 256, i32 4)
-  ret void
-}
-
-; Test that we will not unfold into non native integers
-define void @test3(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test3
-
-; CHECK-NOT: load
-; CHECK-NOT: store
-; CHECK: llvm.memcpy.element.unordered.atomic
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 64, i32 64)
-  ret void
-}
-
-; Test that we will eliminate redundant bitcasts
-define void @test4(i64* %Src, i64* %Dst) {
-; CHECK-LABEL: test4
-; CHECK-NOT: llvm.memcpy.element.unordered.atomic
-
-; CHECK-NOT: bitcast
-
-; CHECK-DAG: [[VAL1:%[^\s]+]] =  load atomic i64, i64* %Src unordered, align 16
-; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
-
-; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
-; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
-; CHECK-DAG: [[VAL2:%[^\s]+]] =  load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
-; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
-
-; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
-; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
-; CHECK-DAG: [[VAL3:%[^ ]+]] =  load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
-; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
-
-; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
-; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
-; CHECK-DAG: [[VAL4:%[^ ]+]] =  load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
-; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
-entry:
-  %Src.casted = bitcast i64* %Src to i8*
-  %Dst.casted = bitcast i64* %Dst to i8*
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i32 32, i32 8)
-  ret void
-}
-
-; Test that 0-length unordered atomic memcpy gets removed.
-define void @test5(i8* %Src, i8* %Dst) {
-; CHECK-LABEL: test5
-
-; CHECK-NOT: llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8)
-entry:
-  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8)
-  ret void
-}
-
-declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
Index: llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll
+++ llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll
@@ -1,33 +1,32 @@
-;; Placeholder tests that will fail once element atomic @llvm.mem[move|set] instrinsics have
-;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to
-;; verify that inst combine handles these intrinsics properly once they have been
-;; added to that class hierarchy.
-
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
 ;; ---- memset -----
 
-; Ensure 0-length memset isn't removed
+; Ensure 0-length memset is removed
 define void @test_memset_zero_length(i8* %dest) {
-  ; CHECK-LABEL: test_memset_zero_length
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memset_zero_length(
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
   ret void
 }
 
-; Ensure that small-sized memsets don't convert to stores
+; Placeholder test. This will change once support for lowering atomic memsets is added to instcombine.
 define void @test_memset_to_store(i8* %dest) {
-  ; CHECK-LABEL: test_memset_to_store
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
-  ; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memset_to_store(
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST:%.*]], i8 1, i32 1, i32 1)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 2, i32 1)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 4, i32 1)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 8, i32 1)
+; CHECK-NEXT:    call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 16, i32 1)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
   call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
+  call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 16, i32 1)
   ret void
 }
 
@@ -37,41 +36,35 @@
 ;; =========================================
 ;; ----- memmove ------
 
-; memmove from a global constant source does not become memcpy
-@gconst = constant [8 x i8] c"0123456\00"
+
+@gconst = constant [32 x i8] c"0123456789012345678901234567890\00"
+; Check that a memmove from a global constant is converted into a memcpy
 define void @test_memmove_to_memcpy(i8* %dest) {
-  ; CHECK-LABEL: test_memmove_to_memcpy
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)
-  ; CHECK-NEXT: ret void
-  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)
+; CHECK-LABEL: @test_memmove_to_memcpy(
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST:%.*]], i8* align 16 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
   ret void
 }
 
 define void @test_memmove_zero_length(i8* %dest, i8* %src) {
-  ; CHECK-LABEL: test_memmove_zero_length
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_zero_length(
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
-  ret void  
+  ret void
 }
 
 ; memmove with src==dest is removed
 define void @test_memmove_removed(i8* %srcdest, i32 %sz) {
-  ; CHECK-LABEL: test_memmove_removed
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_removed(
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
@@ -82,17 +75,220 @@
 
 ; memmove with a small constant length is converted to a load/store pair
 define void @test_memmove_loadstore(i8* %dest, i8* %src) {
-  ; CHECK-LABEL: test_memmove_loadstore
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
-  ; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
-  ; CHECK-NEXT: ret void
+; CHECK-LABEL: @test_memmove_loadstore(
+; CHECK-NEXT:    [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
+; CHECK-NEXT:    store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
+; CHECK-NEXT:    [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
+; CHECK-NEXT:    store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT:    [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
+; CHECK-NEXT:    store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
+; CHECK-NEXT:    store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT:    ret void
+;
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
   call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
+  ret void
+}
+
+define void @test_memmove_loadstore_2(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
+; CHECK-NEXT:    store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT:    [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
+; CHECK-NEXT:    store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
+; CHECK-NEXT:    store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
+  ret void
+}
+
+define void @test_memmove_loadstore_4(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
+; CHECK-NEXT:    store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
+  ret void
+}
+
+define void @test_memmove_loadstore_8(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
+; CHECK-NEXT:    store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
+  ret void
+}
+
+define void @test_memmove_loadstore_16(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memmove_loadstore_16(
+; CHECK-NEXT:    call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
   ret void
 }
 
 declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly
+
+;; =========================================
+;; ----- memcpy ------
+
+define void @test_memcpy_zero_length(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_zero_length(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
+  ret void
+}
+
+; memcpy with src==dest is removed
+define void @test_memcpy_removed(i8* %srcdest, i32 %sz) {
+; CHECK-LABEL: @test_memcpy_removed(
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
+  ret void
+}
+
+; memcpy with a small constant length is converted to a load/store pair
+define void @test_memcpy_loadstore(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore(
+; CHECK-NEXT:    [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
+; CHECK-NEXT:    store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
+; CHECK-NEXT:    [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
+; CHECK-NEXT:    store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT:    [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
+; CHECK-NEXT:    store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
+; CHECK-NEXT:    store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
+  ret void
+}
+
+define void @test_memcpy_loadstore_2(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_2(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
+; CHECK-NEXT:    store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
+; CHECK-NEXT:    [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
+; CHECK-NEXT:    store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
+; CHECK-NEXT:    [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
+; CHECK-NEXT:    store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
+  ret void
+}
+
+define void @test_memcpy_loadstore_4(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_4(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
+; CHECK-NEXT:    store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
+; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
+; CHECK-NEXT:    [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
+; CHECK-NEXT:    [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
+; CHECK-NEXT:    store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
+  ret void
+}
+
+define void @test_memcpy_loadstore_8(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
+; CHECK-NEXT:    [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
+; CHECK-NEXT:    store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
+  ret void
+}
+
+define void @test_memcpy_loadstore_16(i8* %dest, i8* %src) {
+; CHECK-LABEL: @test_memcpy_loadstore_16(
+; CHECK-NEXT:    call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
+; CHECK-NEXT:    ret void
+;
+  call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
+  ret void
+}
+
+declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly