Index: llvm/trunk/docs/LangRef.rst
===================================================================
--- llvm/trunk/docs/LangRef.rst
+++ llvm/trunk/docs/LangRef.rst
@@ -12661,3 +12661,79 @@
 LLVM provides experimental intrinsics to support runtime patching mechanisms
 commonly desired in dynamic language JITs. These intrinsics are described in
 :doc:`StackMaps`.
+
+Element Wise Atomic Memory Intrinsics
+-------------------------------------
+
+These intrinsics are similar to the standard library memory intrinsics except
+that they perform memory transfer as a sequence of atomic memory accesses.
+
+.. _int_memcpy_element_atomic:
+
+'``llvm.memcpy.element.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memcpy.element.atomic`` on
+any integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* <dest>, i8* <src>,
+                                                         i64 <num_elements>, i32 <element_size>)
+
+Overview:
+"""""""""
+
+The '``llvm.memcpy.element.atomic.*``' intrinsic performs copy of a block of
+memory from the source location to the destination location as a sequence of
+unordered atomic memory accesses where each access is a multiple of
+``element_size`` bytes wide and aligned at an element size boundary. In other
+words, each element is accessed atomically in the source and destination buffers.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination, the second is a
+pointer to the source. The third argument is an integer argument
+specifying the number of elements to copy, the fourth argument is the size of
+a single element in bytes.
+
+``element_size`` should be a power of two, greater than zero and less than
+a target-specific atomic access size limit.
+
+For each of the input pointers the ``align`` parameter attribute must be
+specified. It must be a power of two and greater than or equal to the
+``element_size``.
+Caller guarantees that both the source and destination pointers are aligned to
+that boundary.
+
+Semantics:
+""""""""""
+
+The '``llvm.memcpy.element.atomic.*``' intrinsic copies
+'``num_elements`` * ``element_size``' bytes of memory from the source location to
+the destination location. These locations are not allowed to overlap. Memory copy
+is performed as a sequence of unordered atomic memory accesses where each access
+is guaranteed to be a multiple of ``element_size`` bytes wide and aligned at an
+element size boundary.
+
+The order of the copy is unspecified. The same value may be read from the source
+buffer many times, but only one write is issued to the destination buffer per
+element. It is well defined to have concurrent reads and writes to both source
+and destination provided those reads and writes are at least unordered atomic.
+
+This intrinsic does not provide any additional ordering guarantees over those
+provided by a set of unordered loads from the source location and stores to the
+destination.
+
+Lowering:
+""""""""""
+
+In the most general case, a call to the '``llvm.memcpy.element.atomic.*``' is
+lowered to a call to the symbol ``__llvm_memcpy_element_atomic_*``, where '*'
+is replaced with the actual element size.
+
+The optimizer is allowed to inline the memory copy when it's profitable to do so.
Index: llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
+++ llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -333,6 +333,13 @@
     MEMSET,
     MEMMOVE,
 
+    // ELEMENT-WISE ATOMIC MEMORY
+    MEMCPY_ELEMENT_ATOMIC_1,
+    MEMCPY_ELEMENT_ATOMIC_2,
+    MEMCPY_ELEMENT_ATOMIC_4,
+    MEMCPY_ELEMENT_ATOMIC_8,
+    MEMCPY_ELEMENT_ATOMIC_16,
+
     // EXCEPTION HANDLING
     UNWIND_RESUME,
 
@@ -503,6 +510,10 @@
   /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or
   /// UNKNOWN_LIBCALL if there is none.
   Libcall getSYNC(unsigned Opc, MVT VT);
+
+  /// getMEMCPY_ELEMENT_ATOMIC - Return MEMCPY_ELEMENT_ATOMIC_* value for the
+  /// given element size or UNKNOWN_LIBCALL if there is none.
+  Libcall getMEMCPY_ELEMENT_ATOMIC(uint64_t ElementSize);
 }
 }
Index: llvm/trunk/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td
+++ llvm/trunk/include/llvm/IR/Intrinsics.td
@@ -759,6 +759,15 @@
 def int_load_relative: Intrinsic<[llvm_ptr_ty], [llvm_ptr_ty, llvm_anyint_ty],
                                  [IntrReadMem, IntrArgMemOnly]>;
 
+//===------ Memory intrinsics with element-wise atomicity guarantees ------===//
+//
+
+def int_memcpy_element_atomic : Intrinsic<[],
+                                          [llvm_anyptr_ty, llvm_anyptr_ty,
+                                           llvm_i64_ty, llvm_i32_ty],
+                                          [IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
+                                           WriteOnly<0>, ReadOnly<1>]>;
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4896,6 +4896,51 @@
     updateDAGForMaybeTailCall(MM);
     return nullptr;
   }
+  case Intrinsic::memcpy_element_atomic: {
+    SDValue Dst = getValue(I.getArgOperand(0));
+    SDValue Src = getValue(I.getArgOperand(1));
+    SDValue NumElements = getValue(I.getArgOperand(2));
+    SDValue ElementSize = getValue(I.getArgOperand(3));
+
+    // Emit a library call.
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst;
+    Args.push_back(Entry);
+
+    Entry.Node = Src;
+    Args.push_back(Entry);
+
+    Entry.Ty = I.getArgOperand(2)->getType();
+    Entry.Node = NumElements;
+    Args.push_back(Entry);
+
+    Entry.Ty = Type::getInt32Ty(*DAG.getContext());
+    Entry.Node = ElementSize;
+    Args.push_back(Entry);
+
+    uint64_t ElementSizeConstant =
+        cast<ConstantInt>(I.getArgOperand(3))->getZExtValue();
+    RTLIB::Libcall LibraryCall =
+        RTLIB::getMEMCPY_ELEMENT_ATOMIC(ElementSizeConstant);
+    if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+      report_fatal_error("Unsupported element size");
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(sdl)
+        .setChain(getRoot())
+        .setCallee(TLI.getLibcallCallingConv(LibraryCall),
+                   Type::getVoidTy(*DAG.getContext()),
+                   DAG.getExternalSymbol(
+                       TLI.getLibcallName(LibraryCall),
+                       TLI.getPointerTy(DAG.getDataLayout())),
+                   std::move(Args));
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    DAG.setRoot(CallResult.second);
+    return nullptr;
+  }
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
     DILocalVariable *Variable = DI.getVariable();
Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
@@ -361,6 +361,11 @@
   Names[RTLIB::MEMCPY] = "memcpy";
   Names[RTLIB::MEMMOVE] = "memmove";
   Names[RTLIB::MEMSET] = "memset";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_1] = "__llvm_memcpy_element_atomic_1";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_2] = "__llvm_memcpy_element_atomic_2";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_4] = "__llvm_memcpy_element_atomic_4";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_8] = "__llvm_memcpy_element_atomic_8";
+  Names[RTLIB::MEMCPY_ELEMENT_ATOMIC_16] = "__llvm_memcpy_element_atomic_16";
   Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
@@ -763,6 +768,24 @@
   return UNKNOWN_LIBCALL;
 }
 
+RTLIB::Libcall RTLIB::getMEMCPY_ELEMENT_ATOMIC(uint64_t ElementSize) {
+  switch (ElementSize) {
+  case 1:
+    return MEMCPY_ELEMENT_ATOMIC_1;
+  case 2:
+    return MEMCPY_ELEMENT_ATOMIC_2;
+  case 4:
+    return MEMCPY_ELEMENT_ATOMIC_4;
+  case 8:
+    return MEMCPY_ELEMENT_ATOMIC_8;
+  case 16:
+    return MEMCPY_ELEMENT_ATOMIC_16;
+  default:
+    return UNKNOWN_LIBCALL;
+  }
+
+}
+
 /// InitCmpLibcallCCs - Set default comparison libcall CC.
 ///
 static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
Index: llvm/trunk/lib/IR/Verifier.cpp
===================================================================
--- llvm/trunk/lib/IR/Verifier.cpp
+++ llvm/trunk/lib/IR/Verifier.cpp
@@ -3952,6 +3952,32 @@
            CS);
     break;
   }
+  case Intrinsic::memcpy_element_atomic: {
+    ConstantInt *ElementSizeCI = dyn_cast<ConstantInt>(CS.getArgOperand(3));
+    Assert(ElementSizeCI, "element size of the element-wise atomic memory "
+                          "intrinsic must be a constant int",
+           CS);
+    const APInt &ElementSizeVal = ElementSizeCI->getValue();
+    Assert(ElementSizeVal.isPowerOf2(),
+           "element size of the element-wise atomic memory intrinsic "
+           "must be a power of 2",
+           CS);
+
+    auto IsValidAlignment = [&](uint64_t Alignment) {
+      return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
+    };
+
+    uint64_t DstAlignment = CS.getParamAlignment(1),
+             SrcAlignment = CS.getParamAlignment(2);
+
+    Assert(IsValidAlignment(DstAlignment),
+           "incorrect alignment of the destination argument",
+           CS);
+    Assert(IsValidAlignment(SrcAlignment),
+           "incorrect alignment of the source argument",
+           CS);
+    break;
+  }
   case Intrinsic::gcroot:
   case Intrinsic::gcwrite:
   case Intrinsic::gcread:
Index: llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -0,0 +1,68 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define i8* @test_memcpy1(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy1
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 1, i32 1)
+  ret i8* %P
+  ; CHECK-DAG: movl $1, %edx
+  ; CHECK-DAG: movl $1, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_1
+}
+
+define i8* @test_memcpy2(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 2, i32 2)
+  ret i8* %P
+  ; CHECK-DAG: movl $2, %edx
+  ; CHECK-DAG: movl $2, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_2
+}
+
+define i8* @test_memcpy4(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy4
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 4 %Q, i64 4, i32 4)
+  ret i8* %P
+  ; CHECK-DAG: movl $4, %edx
+  ; CHECK-DAG: movl $4, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_4
+}
+
+define i8* @test_memcpy8(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy8
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 8 %P, i8* align 8 %Q, i64 8, i32 8)
+  ret i8* %P
+  ; CHECK-DAG: movl $8, %edx
+  ; CHECK-DAG: movl $8, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_8
+}
+
+define i8* @test_memcpy16(i8* %P, i8* %Q) {
+  ; CHECK: test_memcpy16
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %P, i8* align 16 %Q, i64 16, i32 16)
+  ret i8* %P
+  ; CHECK-DAG: movl $16, %edx
+  ; CHECK-DAG: movl $16, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_16
+}
+
+define void @test_memcpy_args(i8** %Storage) {
+  ; CHECK: test_memcpy_args
+  %Dst = load i8*, i8** %Storage
+  %Src.addr = getelementptr i8*, i8** %Storage, i64 1
+  %Src = load i8*, i8** %Src.addr
+
+  ; First argument
+  ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
+  ; CHECK-DAG: movq [[REG1]], %rdi
+  ; Second argument
+  ; CHECK-DAG: movq 8(%rdi), %rsi
+  ; Third argument
+  ; CHECK-DAG: movl $4, %edx
+  ; Fourth argument
+  ; CHECK-DAG: movl $4, %ecx
+  ; CHECK: __llvm_memcpy_element_atomic_4
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 4, i32 4)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) nounwind
Index: llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
+++ llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
@@ -0,0 +1,17 @@
+; RUN: not opt -verify < %s 2>&1 | FileCheck %s
+
+define void @test_memcpy(i8* %P, i8* %Q) {
+  ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 2 %P, i8* align 2 %Q, i64 4, i32 3)
+
+  ; CHECK: incorrect alignment of the destination argument
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 2 %P, i8* align 4 %Q, i64 4, i32 4)
+
+  ; CHECK: incorrect alignment of the source argument
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %P, i8* align 2 %Q, i64 4, i32 4)
+
+  ret void
+}
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32) nounwind
+
+; CHECK: input module is broken!