Index: llvm/trunk/docs/LangRef.rst
===================================================================
--- llvm/trunk/docs/LangRef.rst
+++ llvm/trunk/docs/LangRef.rst
@@ -12661,3 +12661,79 @@
 LLVM provides experimental intrinsics to support runtime patching mechanisms
 commonly desired in dynamic language JITs. These intrinsics are described in
 :doc:`StackMaps`.
+
+Element Wise Atomic Memory Intrinsics
+-------------------------------------
+
+These intrinsics are similar to the standard library memory intrinsics except
+that they perform memory transfer as a sequence of atomic memory accesses.
+
+.. _int_memcpy_element_atomic:
+
+'``llvm.memcpy.element.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memcpy.element.atomic`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* <dest>, i8* <src>,
+                                                         i64 <num_elements>, i32 <element_size>)
+
+Overview:
+"""""""""
+
+The '``llvm.memcpy.element.atomic.*``' intrinsic performs copy of a block of
+memory from the source location to the destination location as a sequence of
+unordered atomic memory accesses where each access is a multiple of
+``element_size`` bytes wide and aligned at an element size boundary. In other
+words, each element is accessed atomically in the source and destination buffers.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination, the second is a
+pointer to the source. The third argument is an integer argument
+specifying the number of elements to copy, the fourth argument is the size of
+a single element in bytes.
+
+``element_size`` should be a power of two, greater than zero and less than
+a target-specific atomic access size limit.
+
+For each of the input pointers the ``align`` parameter attribute must be
+specified. It must be a power of two and greater than or equal to the
+``element_size``.
+Caller guarantees that both the source and destination pointers are aligned to
+that boundary.
+
+Semantics:
+""""""""""
+
+The '``llvm.memcpy.element.atomic.*``' intrinsic copies
+'``num_elements`` * ``element_size``' bytes of memory from the source location to
+the destination location. These locations are not allowed to overlap. Memory copy
+is performed as a sequence of unordered atomic memory accesses where each access
+is guaranteed to be a multiple of ``element_size`` bytes wide and aligned at an
+element size boundary.
+
+The order of the copy is unspecified. The same value may be read from the source
+buffer many times, but only one write is issued to the destination buffer per
+element. It is well defined to have concurrent reads and writes to both source
+and destination provided those reads and writes are at least unordered atomic.
+
+This intrinsic does not provide any additional ordering guarantees over those
+provided by a set of unordered loads from the source location and stores to the
+destination.
+
+Lowering:
+""""""""""
+
+In the most general case, a call to the '``llvm.memcpy.element.atomic.*``' is
+lowered to a call to the symbol ``__llvm_memcpy_element_atomic_*``, where '*'
+is replaced with the actual element size.
+
+The optimizer is allowed to inline the memory copy when it's profitable to do so.
Index: llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
+++ llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -333,6 +333,13 @@
     MEMSET,
     MEMMOVE,
 
+    // ELEMENT-WISE ATOMIC MEMORY
+    MEMCPY_ELEMENT_ATOMIC_1,
+    MEMCPY_ELEMENT_ATOMIC_2,
+    MEMCPY_ELEMENT_ATOMIC_4,
+    MEMCPY_ELEMENT_ATOMIC_8,
+    MEMCPY_ELEMENT_ATOMIC_16,
+
     // EXCEPTION HANDLING
     UNWIND_RESUME,
 
@@ -503,6 +510,10 @@
   /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
   /// UNKNOWN_LIBCALL if there is none.
   Libcall getSYNC(unsigned Opc, MVT VT);
+
+  /// getMEMCPY_ELEMENT_ATOMIC - Return MEMCPY_ELEMENT_ATOMIC_* value for the
+  /// given element size or UNKNOWN_LIBCALL if there is none.
+  Libcall getMEMCPY_ELEMENT_ATOMIC(uint64_t ElementSize);
 }
 }
Index: llvm/trunk/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td
+++ llvm/trunk/include/llvm/IR/Intrinsics.td
@@ -759,6 +759,15 @@
 def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
                                  [IntrReadMem, IntrArgMemOnly]>;
 
+//===------ Memory intrinsics with element-wise atomicity guarantees ------===//
+//
+
+def int_memcpy_element_atomic : Intrinsic<[],
+                                          [llvm_anyptr_ty, llvm_anyptr_ty,
+                                           llvm_i64_ty, llvm_i32_ty],
+                                          [IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
+                                           WriteOnly<0>, ReadOnly<1>]>;
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4896,6 +4896,51 @@
     updateDAGForMaybeTailCall(MM);
     return nullptr;
   }
+  case Intrinsic::memcpy_element_atomic: {
+    SDValue Dst = getValue(I.getArgOperand(0));
+    SDValue Src = getValue(I.getArgOperand(1));
+    SDValue NumElements = getValue(I.getArgOperand(2));
+    SDValue ElementSize = getValue(I.getArgOperand(3));
+
+    // Emit a library call.
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst;
+    Args.push_back(Entry);
+
+    Entry.Node = Src;
+    Args.push_back(Entry);
+
+    Entry.Ty = I.getArgOperand(2)->getType();
+    Entry.Node = NumElements;
+    Args.push_back(Entry);
+
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Entry.Node = ElementSize;
+    Args.push_back(Entry);
+
+    uint64_t ElementSizeConstant =
+        cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    RTLIB::Libcall LibraryCall =
+        RTLIB::getMEMCPY_ELEMENT_ATOMIC(ElementSizeConstant);
+    if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+      report_fatal_error("Unsupported element size");
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(sdl)
+        .setChain(getRoot())
+        .setCallee(TLI.getLibcallCallingConv(LibraryCall),
+                   Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(
+                       TLI.getLibcallName(LibraryCall),
+                       TLI.getPointerTy(DAG.getDataLayout())),
+                   std::move(Args));
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    DAG.setRoot(CallResult.second);
+    return nullptr;
+  }
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
     DILocalVariable *Variable = DI.getVariable();
Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
@@ -361,6 +361,11 @@
   Names[RTLIB::MEMCPY] = "memcpy";
   Names[RTLIB::MEMMOVE] = "memmove";
   Names[RTLIB::MEMSET] = "memset";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_1] = "__llvm_memcpy_element_atomic_1";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_2] = "__llvm_memcpy_element_atomic_2";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_4] = "__llvm_memcpy_element_atomic_4";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_8] = "__llvm_memcpy_element_atomic_8";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_16] = "__llvm_memcpy_element_atomic_16";
   Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
@@ -763,6 +768,24 @@
   return UNKNOWN_LIBCALL;
 }
 
+RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_ATOMIC(uint64_t ElementSize) {
+  switch (ElementSize) {
+  case 1:
+    return MEMCPY_ELEMENT_ATOMIC_1;
+  case 2:
+    return MEMCPY_ELEMENT_ATOMIC_2;
+  case 4:
+    return MEMCPY_ELEMENT_ATOMIC_4;
+  case 8:
+    return MEMCPY_ELEMENT_ATOMIC_8;
+  case 16:
+    return MEMCPY_ELEMENT_ATOMIC_16;
+  default:
+    return UNKNOWN_LIBCALL;
+  }
+
+}
+
 /// InitCmpLibcallCCs - Set default comparison libcall CC.
 ///
 static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
Index: llvm/trunk/lib/IR/Verifier.cpp
===================================================================
--- llvm/trunk/lib/IR/Verifier.cpp
+++ llvm/trunk/lib/IR/Verifier.cpp
@@ -3952,6 +3952,32 @@
            CS);
     break;
   }
+  case Intrinsic::memcpy_element_atomic: {
+    ConstantInt *ElementSizeCI = dyn_cast<ConstantInt>(CS.getArgOperand(3));
+    Assert(ElementSizeCI, "element size of the element-wise atomic memory "
+                          "intrinsic must be a constant int",
+           CS);
+    const APInt &ElementSizeVal = ElementSizeCI->getValue();
+    Assert(ElementSizeVal.isPowerOf2(),
+           "element size of the element-wise atomic memory intrinsic "
+           "must be a power of 2",
+           CS);
+
+    auto IsValidAlignment = [&](uint64_t Alignment) {
+      return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
+    };
+
+    uint64_t DstAlignment = CS.getParamAlignment(1),
+             SrcAlignment = CS.getParamAlignment(2);
+
+    Assert(IsValidAlignment(DstAlignment),
+           "incorrect alignment of the destination argument",
+           CS);
+    Assert(IsValidAlignment(SrcAlignment),
+           "incorrect alignment of the source argument",
+           CS);
+    break;
+  }
   case Intrinsic::gcroot:
   case Intrinsic::gcwrite:
   case Intrinsic::gcread:
Index: llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i8* @test_memcpy1(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy1
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 1, i32 1)
+  ret i8* %P
+  ; CHECK-DAG: movl $1, %edx
+  ; CHECK-DAG: movl $1, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_1
+}
+
+define i8* @test_memcpy2(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 2, i32 2)
+  ret i8* %P
+  ; CHECK-DAG: movl $2, %edx
+  ; CHECK-DAG: movl $2, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_2
+}
+
+define i8* @test_memcpy4(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy4
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 4, i32 4)
+  ret i8* %P
+  ; CHECK-DAG: movl $4, %edx
+  ; CHECK-DAG: movl $4, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_4
+}
+
+define i8* @test_memcpy8(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy8
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 8 %P, i8* align 8 %Q, i64 8, i32 8)
+  ret i8* %P
+  ; CHECK-DAG: movl $8, %edx
+  ; CHECK-DAG: movl $8, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_8
+}
+
+define i8* @test_memcpy16(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy16
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %P, i8* align 16 %Q, i64 16, i32 16)
+  ret i8* %P
+  ; CHECK-DAG: movl $16, %edx
+  ; CHECK-DAG: movl $16, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_16
+}
+
+define void @test_memcpy_args(i8** %Storage) {
+  ; CHECK: test_memcpy_args
+  %Dst = load i8*, i8** %Storage
+  %Src.addr = getelementptr i8*, i8** %Storage, i64 1
+  %Src = load i8*, i8** %Src.addr
+
+  ; First argument
+  ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
+  ; CHECK-DAG: movq [[REG1]], %rdi
+  ; Second argument
+  ; CHECK-DAG: movq 8(%rdi), %rsi
+  ; Third argument
+  ; CHECK-DAG: movl $4, %edx
+  ; Fourth argument
+  ; CHECK-DAG: movl $4, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_4
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 4, i32 4)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) nounwind
Index: llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
+++ llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
@@ -0,0 +1,17 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+define void @test_memcpy(i8* %P, i8* %Q) {
+  ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 2 %P, i8* align 2 %Q, i64 4, i32 3)
+
+  ; CHECK: incorrect alignment of the destination argument
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 2 %P, i8* align 4 %Q, i64 4, i32 4)
+
+  ; CHECK: incorrect alignment of the source argument
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 2 %Q, i64 4, i32 4)
+
+  ret void
+}
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+; CHECK: input module is broken!