Index: docs/LangRef.rst
===================================================================
--- docs/LangRef.rst
+++ docs/LangRef.rst
@@ -12639,3 +12639,84 @@
 LLVM provides experimental intrinsics to support runtime patching mechanisms
 commonly desired in dynamic language JITs. These intrinsics are described in
 :doc:`StackMaps`.
+
+Element Wise Atomic Memory Intrinsics
+-------------------------------------
+
+These intrinsics are similar to the standard library memory intrinsics. The
+only difference is that they perform the memory transfer as a sequence of
+atomic memory accesses.
+
+.. _int_memcpy_element_atomic:
+
+'``llvm.memcpy.element.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memcpy.element.atomic`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths, however.
+
+::
+
+      declare void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* <dest>, i8* <src>,
+                                                             i32 <num_elements>, i32 <element_size>,
+                                                             i32 <align>, i1 <isvolatile>)
+      declare void @llvm.memcpy.element.atomic.p0i8.p0i8.i64(i8* <dest>, i8* <src>,
+                                                             i64 <num_elements>, i32 <element_size>,
+                                                             i32 <align>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memcpy.element.atomic.*``' intrinsic performs a copy of a block of
+memory from the source location to the destination location as a sequence of
+unordered atomic memory accesses, where each access is a multiple of
+``element_size`` bytes and aligned at an element size boundary, i.e. each
+element is accessed atomically in the source and destination buffers.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination, the second is a
+pointer to the source. The third argument is an integer argument
+specifying the number of elements to copy, the fourth argument is the size of
+a single element in bytes, the fifth argument is the alignment of the source
+and destination locations, and the sixth is a boolean indicating a volatile
+access.
+
+``element_size`` should be a power of two, greater than zero, and less than the
+target-specific atomic access size limit.
+
+The ``align`` argument must be a power of two and greater than or equal to the
+``element_size``. The caller guarantees that both the source and destination
+pointers are aligned to that boundary.
+
+If the ``isvolatile`` parameter is ``true``, this call is
+a :ref:`volatile operation <volatile>`.
+
+Semantics:
+""""""""""
+
+The '``llvm.memcpy.element.atomic.*``' intrinsic copies
+'``num_elements`` * ``element_size``' bytes of memory from the source location
+to the destination location. These locations are not allowed to overlap. The
+memory copy is performed as a sequence of unordered atomic memory accesses,
+where each access is guaranteed to be a multiple of ``element_size`` bytes wide
+and aligned at an element size boundary.
+
+The order of the copy is unspecified. The same value may be read from the
+source buffer many times, but only one write is issued to the destination
+buffer per element. It is well defined to have concurrent reads and writes to
+both the source and destination buffers provided those reads and writes are at
+least unordered atomic.
+
+Lowering:
+"""""""""
+
+In the most general case a call to the '``llvm.memcpy.element.atomic.*``'
+intrinsic is lowered to a call to the symbol ``__llvm_memcpy_element_atomic``.
+Only the first four arguments are passed to this call.
+
+The optimizer is allowed to inline the memory copy when it is profitable to
+do so.
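As a quick illustration of the documented signature (a sketch only; the function name ``@copy_elements`` and the constant operands are hypothetical and not part of the patch), the following IR copies 1024 four-byte elements between two buffers that are assumed to be 16-byte aligned:

::

      define void @copy_elements(i8* %dst, i8* %src) {
        ; 1024 elements of 4 bytes each; every element is transferred as a single
        ; unordered atomic access. element_size (4) and align (16) are powers of
        ; two and align >= element_size, as the Arguments section above requires.
        call void @llvm.memcpy.element.atomic.p0i8.p0i8.i64(i8* %dst, i8* %src,
                                                             i64 1024, i32 4,
                                                             i32 16, i1 false)
        ret void
      }

      declare void @llvm.memcpy.element.atomic.p0i8.p0i8.i64(i8*, i8*, i64, i32, i32, i1)
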
Index: include/llvm/CodeGen/RuntimeLibcalls.h
===================================================================
--- include/llvm/CodeGen/RuntimeLibcalls.h
+++ include/llvm/CodeGen/RuntimeLibcalls.h
@@ -333,6 +333,9 @@
     MEMSET,
     MEMMOVE,
 
+    // ELEMENT-WISE ATOMIC MEMORY
+    MEMCPY_ELEMENT_ATOMIC,
+
     // EXCEPTION HANDLING
     UNWIND_RESUME,
Index: include/llvm/IR/Intrinsics.td
===================================================================
--- include/llvm/IR/Intrinsics.td
+++ include/llvm/IR/Intrinsics.td
@@ -759,6 +759,16 @@
 def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
                                  [IntrReadMem, IntrArgMemOnly]>;
 
+//===------ Memory intrinsics with element-wise atomicity guarantees ------===//
+//
+
+def int_memcpy_element_atomic : Intrinsic<[],
+                                          [llvm_anyptr_ty, llvm_anyptr_ty,
+                                           llvm_anyint_ty, llvm_i32_ty,
+                                           llvm_i32_ty, llvm_i1_ty],
+                                          [IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
+                                           WriteOnly<0>, ReadOnly<1>]>;
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4898,6 +4898,44 @@
     updateDAGForMaybeTailCall(MM);
     return nullptr;
   }
+  case Intrinsic::memcpy_element_atomic: {
+    SDValue Dst = getValue(I.getArgOperand(0));
+    SDValue Src = getValue(I.getArgOperand(1));
+    SDValue NumElements = getValue(I.getArgOperand(2));
+    SDValue ElementSize = getValue(I.getArgOperand(3));
+
+    // Emit a library call.
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst;
+    Args.push_back(Entry);
+
+    Entry.Node = Src;
+    Args.push_back(Entry);
+
+    Entry.Ty = I.getArgOperand(2)->getType();
+    Entry.Node = NumElements;
+    Args.push_back(Entry);
+
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Entry.Node = ElementSize;
+    Args.push_back(Entry);
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(sdl)
+        .setChain(getRoot())
+        .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY_ELEMENT_ATOMIC),
+                   Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(
+                       TLI.getLibcallName(RTLIB::MEMCPY_ELEMENT_ATOMIC),
+                       TLI.getPointerTy(DAG.getDataLayout())),
+                   std::move(Args));
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    DAG.setRoot(CallResult.second);
+    return nullptr;
+  }
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
     DILocalVariable *Variable = DI.getVariable();
Index: lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- lib/CodeGen/TargetLoweringBase.cpp
+++ lib/CodeGen/TargetLoweringBase.cpp
@@ -361,6 +361,7 @@
   Names[RTLIB::MEMCPY] = "memcpy";
   Names[RTLIB::MEMMOVE] = "memmove";
   Names[RTLIB::MEMSET] = "memset";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC] = "__llvm_memcpy_element_atomic";
   Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
Index: lib/IR/Verifier.cpp
===================================================================
--- lib/IR/Verifier.cpp
+++ lib/IR/Verifier.cpp
@@ -3930,6 +3930,37 @@
            CS);
     break;
   }
+  case Intrinsic::memcpy_element_atomic: {
+    ConstantInt *ElementSizeCI =
+        dyn_cast<ConstantInt>(CS.getArgOperand(3));
+    Assert(ElementSizeCI, "element size of the element-wise atomic memory "
+                          "intrinsic must be a constant int",
+           CS);
+    const APInt &ElementSizeVal = ElementSizeCI->getValue();
+    Assert(ElementSizeVal.isPowerOf2(),
+           "element size of the element-wise atomic memory intrinsic "
+           "must be a power of 2",
+           CS);
+
+    ConstantInt *AlignCI = dyn_cast<ConstantInt>(CS.getArgOperand(4));
+    Assert(AlignCI, "alignment of the element-wise atomic memory intrinsic "
+                    "must be a constant int",
+           CS);
+    const APInt &AlignVal = AlignCI->getValue();
+    Assert(AlignVal.isPowerOf2(),
+           "alignment of the element-wise atomic memory intrinsic "
+           "must be a power of 2",
+           CS);
+
+    Assert(AlignVal.uge(ElementSizeVal),
+           "alignment of the element-wise atomic memory intrinsic "
+           "should be at least the element size",
+           CS);
+
+    Assert(isa<ConstantInt>(CS.getArgOperand(5)),
+           "isvolatile argument of memory intrinsic must be a constant int",
+           CS);
+    break;
+  }
   case Intrinsic::gcroot:
   case Intrinsic::gcwrite:
   case Intrinsic::gcread:
Index: test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i8* @test_memcpy(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %P, i8* %Q, i32 4, i32 4, i32 4, i1 false)
+  ret i8* %P
+  ; CHECK-DAG: movl $4, %edx
+  ; CHECK-DAG: movl $4, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic
+}
+
+define void @test_memcpy2(i8** %Storage) {
+  ; CHECK: test_memcpy2
+  %Dst = load i8*, i8** %Storage
+  %Src.addr = getelementptr i8*, i8** %Storage, i64 1
+  %Src = load i8*, i8** %Src.addr
+
+  ; First argument
+  ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
+  ; CHECK-DAG: movq [[REG1]], %rdi
+  ; Second argument
+  ; CHECK-DAG: movq 8(%rdi), %rsi
+  ; Third argument
+  ; CHECK-DAG: movl $4, %edx
+  ; Fourth argument
+  ; CHECK-DAG: movl $4, %ecx
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %Dst, i8* %Src, i32 4, i32 4, i32 4, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i32, i1) nounwind
Index: test/Verifier/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- /dev/null
+++ test/Verifier/element-wise-atomic-memory-intrinsics.ll
@@ -0,0 +1,19 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+define void @test_memcpy(i8* %P, i8* %Q) {
+  ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %P, i8* %Q, i32 4, i32 3, i32 2, i1 false)
+
+  ; CHECK: alignment of the element-wise atomic memory intrinsic must be a power of 2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %P, i8* %Q, i32 4, i32 2, i32 0, i1 false)
+  ; CHECK: alignment of the element-wise atomic memory intrinsic must be a power of 2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %P, i8* %Q, i32 4, i32 2, i32 3, i1 false)
+
+  ; CHECK: alignment of the element-wise atomic memory intrinsic should be at least the element size
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %P, i8* %Q, i32 4, i32 4, i32 2, i1 false)
+
+  ret void
+}
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i32, i1) nounwind
+
+; CHECK: input module is broken!
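
For contrast with the rejected calls above, a call shaped like the following (a sketch; the function name and constants are illustrative and not part of the patch) satisfies every check added to the verifier: the element size and the alignment are powers of two, the alignment is at least the element size, and the isvolatile argument is a constant.

::

      define void @test_memcpy_valid(i8* %P, i8* %Q) {
        ; element_size = 2 and align = 8 are both powers of two, and
        ; align >= element_size, so none of the new verifier checks fire.
        call void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8* %P, i8* %Q, i32 4, i32 2, i32 8, i1 false)
        ret void
      }

      declare void @llvm.memcpy.element.atomic.p0i8.p0i8.i32(i8*, i8*, i32, i32, i32, i1)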