Index: llvm/trunk/docs/LangRef.rst
===================================================================
--- llvm/trunk/docs/LangRef.rst
+++ llvm/trunk/docs/LangRef.rst
@@ -10282,6 +10282,8 @@
 to be aligned to some boundary, this can be specified as the fourth
 argument, otherwise it should be set to 0 or 1 (both meaning no alignment).
 
+.. _int_memmove:
+
 '``llvm.memmove``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -14178,4 +14180,81 @@
 lowered to a call to the symbol ``__llvm_memcpy_element_unordered_atomic_*``.
 Where '*' is replaced with an actual element size.
 
+The optimizer is allowed to inline the memory copy when it's profitable to do
+so.
+
+'``llvm.memmove.element.unordered.atomic``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use
+``llvm.memmove.element.unordered.atomic`` on any integer bit width and for
+different address spaces. Not all targets support all bit widths however.
+
+::
+
+      declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* <dest>,
+                                                                        i8* <src>,
+                                                                        i32 <len>,
+                                                                        i32 <element_size>)
+      declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i64(i8* <dest>,
+                                                                        i8* <src>,
+                                                                        i64 <len>,
+                                                                        i32 <element_size>)
+
+Overview:
+"""""""""
+
+The '``llvm.memmove.element.unordered.atomic.*``' intrinsic is a specialization
+of the '``llvm.memmove.*``' intrinsic. It differs in that the ``dest`` and
+``src`` are treated as arrays with elements that are exactly ``element_size``
+bytes, and the copy between buffers uses a sequence of
+:ref:`unordered atomic <ordering>` load/store operations that are a positive
+integer multiple of the ``element_size`` in size.
+
+Arguments:
+""""""""""
+
+The first three arguments are the same as they are in the
+:ref:`@llvm.memmove <int_memmove>` intrinsic, with the added constraint that
+``len`` is required to be a positive integer multiple of the ``element_size``.
+If ``len`` is not a positive integer multiple of ``element_size``, then the
+behaviour of the intrinsic is undefined.
+
+``element_size`` must be a compile-time constant positive power of two no
+greater than a target-specific atomic access size limit.
+
+For each of the input pointers the ``align`` parameter attribute must be
+specified. It must be a power of two no less than the ``element_size``. The
+caller guarantees that both the source and destination pointers are aligned to
+that boundary.
+
+Semantics:
+""""""""""
+
+The '``llvm.memmove.element.unordered.atomic.*``' intrinsic copies ``len`` bytes
+of memory from the source location to the destination location. These locations
+are allowed to overlap. The memory copy is performed as a sequence of load/store
+operations where each access is guaranteed to be a multiple of ``element_size``
+bytes wide and aligned at an ``element_size`` boundary.
+
+The order of the copy is unspecified. The same value may be read from the source
+buffer many times, but only one write is issued to the destination buffer per
+element. It is well defined to have concurrent reads and writes to both source
+and destination provided those reads and writes are unordered atomic when
+specified.
+
+This intrinsic does not provide any additional ordering guarantees over those
+provided by a set of unordered loads from the source location and stores to the
+destination.
+
+Lowering:
+"""""""""
+
+In the most general case, a call to the
+'``llvm.memmove.element.unordered.atomic.*``' intrinsic is lowered to a call to
+the symbol ``__llvm_memmove_element_unordered_atomic_*``, where '*' is replaced
+with the actual element size.
+
+The optimizer is allowed to inline the memory copy when it's profitable to do
+so.
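For intuition, the semantics above admit a simple reference implementation:
pick a copy direction that cannot clobber elements that are still to be read,
then move one ``element_size`` unit per unordered atomic load/store pair. The
following C++ sketch of the element size 4 libcall is illustrative only; the
actual runtime implementation is outside this patch, and C++
``memory_order_relaxed`` is used here as the closest analogue of LLVM's
``unordered`` ordering::

    #include <atomic>
    #include <cstddef>
    #include <cstdint>

    // Illustrative sketch, not the shipped runtime: one possible shape for
    // __llvm_memmove_element_unordered_atomic_4. Len is a byte count assumed
    // to be a multiple of 4; both pointers are assumed 4-byte aligned.
    extern "C" void __llvm_memmove_element_unordered_atomic_4(void *Dst,
                                                              void *Src,
                                                              size_t Len) {
      auto *D = reinterpret_cast<std::atomic<uint32_t> *>(Dst);
      auto *S = reinterpret_cast<std::atomic<uint32_t> *>(Src);
      size_t N = Len / 4;
      if (D == S || N == 0)
        return;
      if (reinterpret_cast<uintptr_t>(Dst) < reinterpret_cast<uintptr_t>(Src)) {
        // Dst starts below Src: a forward copy never overwrites an element
        // that has not been read yet.
        for (size_t I = 0; I != N; ++I)
          D[I].store(S[I].load(std::memory_order_relaxed),
                     std::memory_order_relaxed);
      } else {
        // Dst starts above Src: copy backward so overlapping elements are
        // read before they are overwritten.
        for (size_t I = N; I-- != 0;)
          D[I].store(S[I].load(std::memory_order_relaxed),
                     std::memory_order_relaxed);
      }
    }

Note that this issues exactly one write per destination element and leaves the
overall copy order unspecified, matching the Semantics section above.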
Index: llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
+++ llvm/trunk/include/llvm/CodeGen/RuntimeLibcalls.h
@@ -340,6 +340,12 @@
     MEMCPY_ELEMENT_UNORDERED_ATOMIC_8,
     MEMCPY_ELEMENT_UNORDERED_ATOMIC_16,
 
+    MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1,
+    MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2,
+    MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4,
+    MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8,
+    MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16,
+
     // EXCEPTION HANDLING
     UNWIND_RESUME,
 
@@ -515,6 +521,11 @@
   /// MEMCPY_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
   /// UNKNOWN_LIBCALL if there is none.
   Libcall getMEMCPY_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
+
+  /// getMEMMOVE_ELEMENT_UNORDERED_ATOMIC - Return
+  /// MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* value for the given element size or
+  /// UNKNOWN_LIBCALL if there is none.
+  Libcall getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize);
 }
 }
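The lookup declared here is the only bridge lowering code needs; the contract
is one libcall per supported power-of-two element size and ``UNKNOWN_LIBCALL``
for everything else. A minimal sketch of the intended call pattern, where
``getLibcallName`` is the existing ``TargetLoweringBase`` accessor and the
helper name is hypothetical::

    #include "llvm/CodeGen/RuntimeLibcalls.h"
    #include "llvm/Target/TargetLowering.h"

    using namespace llvm;

    // Hypothetical helper: map an element size to the runtime symbol name,
    // or nullptr when no MEMMOVE_ELEMENT_UNORDERED_ATOMIC_* libcall exists.
    static const char *getAtomicMemMoveSymbol(const TargetLowering &TLI,
                                              uint64_t ElementSize) {
      RTLIB::Libcall LC =
          RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElementSize);
      if (LC == RTLIB::UNKNOWN_LIBCALL)
        return nullptr; // e.g. ElementSize == 3, or anything above 16
      return TLI.getLibcallName(LC);
    }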
Index: llvm/trunk/include/llvm/IR/IntrinsicInst.h
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicInst.h
+++ llvm/trunk/include/llvm/IR/IntrinsicInst.h
@@ -296,6 +296,95 @@
     }
   };
 
+  class ElementUnorderedAtomicMemMoveInst : public IntrinsicInst {
+  private:
+    enum { ARG_DEST = 0, ARG_SOURCE = 1, ARG_LENGTH = 2, ARG_ELEMENTSIZE = 3 };
+
+  public:
+    Value *getRawDest() const {
+      return const_cast<Value *>(getArgOperand(ARG_DEST));
+    }
+    const Use &getRawDestUse() const { return getArgOperandUse(ARG_DEST); }
+    Use &getRawDestUse() { return getArgOperandUse(ARG_DEST); }
+
+    /// Return the arguments to the instruction.
+    Value *getRawSource() const {
+      return const_cast<Value *>(getArgOperand(ARG_SOURCE));
+    }
+    const Use &getRawSourceUse() const { return getArgOperandUse(ARG_SOURCE); }
+    Use &getRawSourceUse() { return getArgOperandUse(ARG_SOURCE); }
+
+    Value *getLength() const {
+      return const_cast<Value *>(getArgOperand(ARG_LENGTH));
+    }
+    const Use &getLengthUse() const { return getArgOperandUse(ARG_LENGTH); }
+    Use &getLengthUse() { return getArgOperandUse(ARG_LENGTH); }
+
+    bool isVolatile() const { return false; }
+
+    Value *getRawElementSizeInBytes() const {
+      return const_cast<Value *>(getArgOperand(ARG_ELEMENTSIZE));
+    }
+
+    ConstantInt *getElementSizeInBytesCst() const {
+      return cast<ConstantInt>(getRawElementSizeInBytes());
+    }
+
+    uint32_t getElementSizeInBytes() const {
+      return getElementSizeInBytesCst()->getZExtValue();
+    }
+
+    /// This is just like getRawDest, but it strips off any cast
+    /// instructions that feed it, giving the original input. The returned
+    /// value is guaranteed to be a pointer.
+    Value *getDest() const { return getRawDest()->stripPointerCasts(); }
+
+    /// This is just like getRawSource, but it strips off any cast
+    /// instructions that feed it, giving the original input. The returned
+    /// value is guaranteed to be a pointer.
+    Value *getSource() const { return getRawSource()->stripPointerCasts(); }
+
+    unsigned getDestAddressSpace() const {
+      return cast<PointerType>(getRawDest()->getType())->getAddressSpace();
+    }
+
+    unsigned getSourceAddressSpace() const {
+      return cast<PointerType>(getRawSource()->getType())->getAddressSpace();
+    }
+
+    /// Set the specified arguments of the instruction.
+    void setDest(Value *Ptr) {
+      assert(getRawDest()->getType() == Ptr->getType() &&
+             "setDest called with pointer of wrong type!");
+      setArgOperand(ARG_DEST, Ptr);
+    }
+
+    void setSource(Value *Ptr) {
+      assert(getRawSource()->getType() == Ptr->getType() &&
+             "setSource called with pointer of wrong type!");
+      setArgOperand(ARG_SOURCE, Ptr);
+    }
+
+    void setLength(Value *L) {
+      assert(getLength()->getType() == L->getType() &&
+             "setLength called with value of wrong type!");
+      setArgOperand(ARG_LENGTH, L);
+    }
+
+    void setElementSizeInBytes(Constant *V) {
+      assert(V->getType() == Type::getInt8Ty(getContext()) &&
+             "setElementSizeInBytes called with value of wrong type!");
+      setArgOperand(ARG_ELEMENTSIZE, V);
+    }
+
+    static inline bool classof(const IntrinsicInst *I) {
+      return I->getIntrinsicID() == Intrinsic::memmove_element_unordered_atomic;
+    }
+    static inline bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+  };
+
   /// This is the common base class for memset/memcpy/memmove.
   class MemIntrinsic : public IntrinsicInst {
   public:
Index: llvm/trunk/include/llvm/IR/Intrinsics.td
===================================================================
--- llvm/trunk/include/llvm/IR/Intrinsics.td
+++ llvm/trunk/include/llvm/IR/Intrinsics.td
@@ -873,6 +873,18 @@
                       ReadOnly<1>
                     ]>;
 
+// @llvm.memmove.element.unordered.atomic.*(dest, src, length, elementsize)
+def int_memmove_element_unordered_atomic
+    : Intrinsic<[],
+                [
+                  llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i32_ty
+                ],
+                [
+                  IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>,
+                  ReadOnly<1>
+                ]>;
+
+
 //===------------------------ Reduction Intrinsics ------------------------===//
 //
 def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty],
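With the intrinsic defined, a frontend or transform can materialize calls
through ``Intrinsic::getDeclaration``; this patch adds no dedicated
``IRBuilder`` helper. A sketch follows, with a hypothetical helper name. The
``align`` parameter attributes that the verifier changes below demand still
have to be attached by the caller; that step is omitted here because the
attribute API spelling varies across LLVM versions::

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Intrinsics.h"
    #include "llvm/IR/Module.h"

    using namespace llvm;

    // Hypothetical emission helper: builds
    //   call void @llvm.memmove.element.unordered.atomic.pX.pY.iN(Dst, Src,
    //                                                             Len, ES)
    // The intrinsic is overloaded on both pointer types and the length type.
    static CallInst *emitElementAtomicMemMove(IRBuilder<> &B, Module &M,
                                              Value *Dst, Value *Src,
                                              Value *Len,
                                              uint32_t ElementSize) {
      Type *Tys[] = {Dst->getType(), Src->getType(), Len->getType()};
      Function *F = Intrinsic::getDeclaration(
          &M, Intrinsic::memmove_element_unordered_atomic, Tys);
      Value *Args[] = {Dst, Src, Len, B.getInt32(ElementSize)};
      // NB: callers must also add align parameter attributes (>= ElementSize)
      // on Dst and Src to satisfy the verifier rules added in this patch.
      return B.CreateCall(F, Args);
    }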
Index: llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -4994,6 +4994,44 @@
     DAG.setRoot(CallResult.second);
     return nullptr;
   }
+  case Intrinsic::memmove_element_unordered_atomic: {
+    auto &MI = cast<ElementUnorderedAtomicMemMoveInst>(I);
+    SDValue Dst = getValue(MI.getRawDest());
+    SDValue Src = getValue(MI.getRawSource());
+    SDValue Length = getValue(MI.getLength());
+
+    // Emit a library call.
+    TargetLowering::ArgListTy Args;
+    TargetLowering::ArgListEntry Entry;
+    Entry.Ty = DAG.getDataLayout().getIntPtrType(*DAG.getContext());
+    Entry.Node = Dst;
+    Args.push_back(Entry);
+
+    Entry.Node = Src;
+    Args.push_back(Entry);
+
+    Entry.Ty = MI.getLength()->getType();
+    Entry.Node = Length;
+    Args.push_back(Entry);
+
+    uint64_t ElementSizeConstant = MI.getElementSizeInBytes();
+    RTLIB::Libcall LibraryCall =
+        RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(ElementSizeConstant);
+    if (LibraryCall == RTLIB::UNKNOWN_LIBCALL)
+      report_fatal_error("Unsupported element size");
+
+    TargetLowering::CallLoweringInfo CLI(DAG);
+    CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+        TLI.getLibcallCallingConv(LibraryCall),
+        Type::getVoidTy(*DAG.getContext()),
+        DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
+                              TLI.getPointerTy(DAG.getDataLayout())),
+        std::move(Args));
+
+    std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
+    DAG.setRoot(CallResult.second);
+    return nullptr;
+  }
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(I);
     DILocalVariable *Variable = DI.getVariable();
Index: llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
+++ llvm/trunk/lib/CodeGen/TargetLoweringBase.cpp
@@ -384,6 +384,16 @@
   Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_8] =
       "__llvm_memcpy_element_unordered_atomic_8";
   Names[RTLIB::MEMCPY_ELEMENT_UNORDERED_ATOMIC_16] =
       "__llvm_memcpy_element_unordered_atomic_16";
+  Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1] =
+      "__llvm_memmove_element_unordered_atomic_1";
+  Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2] =
+      "__llvm_memmove_element_unordered_atomic_2";
+  Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4] =
+      "__llvm_memmove_element_unordered_atomic_4";
+  Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8] =
+      "__llvm_memmove_element_unordered_atomic_8";
+  Names[RTLIB::MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16] =
+      "__llvm_memmove_element_unordered_atomic_16";
   Names[RTLIB::UNWIND_RESUME] = "_Unwind_Resume";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_1] = "__sync_val_compare_and_swap_1";
   Names[RTLIB::SYNC_VAL_COMPARE_AND_SWAP_2] = "__sync_val_compare_and_swap_2";
@@ -803,6 +813,23 @@
   }
 }
 
+RTLIB::Libcall RTLIB::getMEMMOVE_ELEMENT_UNORDERED_ATOMIC(uint64_t ElementSize) {
+  switch (ElementSize) {
+  case 1:
+    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_1;
+  case 2:
+    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_2;
+  case 4:
+    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_4;
+  case 8:
+    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_8;
+  case 16:
+    return MEMMOVE_ELEMENT_UNORDERED_ATOMIC_16;
+  default:
+    return UNKNOWN_LIBCALL;
+  }
+}
+
 /// InitCmpLibcallCCs - Set default comparison libcall CC.
 ///
 static void InitCmpLibcallCCs(ISD::CondCode *CCs) {
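Given the marshalling above (both pointers passed as ``intptr``-typed values,
the length passed with its IR type preserved), the emitted call resolves to a
symbol whose C-level shape is roughly the following. These prototypes are
inferred for illustration and are not shipped by this patch; in particular, the
width of the length parameter follows the IR ``len`` type, so ``size_t`` is an
approximation::

    #include <cstddef>

    // Inferred prototypes for the runtime entry points named in
    // TargetLoweringBase.cpp above. Len is a byte count that must be a
    // multiple of the trailing element size.
    extern "C" {
    void __llvm_memmove_element_unordered_atomic_1(void *Dst, void *Src,
                                                   size_t Len);
    void __llvm_memmove_element_unordered_atomic_2(void *Dst, void *Src,
                                                   size_t Len);
    void __llvm_memmove_element_unordered_atomic_4(void *Dst, void *Src,
                                                   size_t Len);
    void __llvm_memmove_element_unordered_atomic_8(void *Dst, void *Src,
                                                   size_t Len);
    void __llvm_memmove_element_unordered_atomic_16(void *Dst, void *Src,
                                                    size_t Len);
    }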
Index: llvm/trunk/lib/IR/Verifier.cpp
===================================================================
--- llvm/trunk/lib/IR/Verifier.cpp
+++ llvm/trunk/lib/IR/Verifier.cpp
@@ -4044,6 +4044,42 @@
            "incorrect alignment of the source argument", CS);
     break;
   }
+  case Intrinsic::memmove_element_unordered_atomic: {
+    auto *MI = cast<ElementUnorderedAtomicMemMoveInst>(CS.getInstruction());
+
+    ConstantInt *ElementSizeCI =
+        dyn_cast<ConstantInt>(MI->getRawElementSizeInBytes());
+    Assert(ElementSizeCI,
+           "element size of the element-wise unordered atomic memory "
+           "intrinsic must be a constant int",
+           CS);
+    const APInt &ElementSizeVal = ElementSizeCI->getValue();
+    Assert(ElementSizeVal.isPowerOf2(),
+           "element size of the element-wise atomic memory intrinsic "
+           "must be a power of 2",
+           CS);
+
+    if (auto *LengthCI = dyn_cast<ConstantInt>(MI->getLength())) {
+      uint64_t Length = LengthCI->getZExtValue();
+      uint64_t ElementSize = MI->getElementSizeInBytes();
+      Assert((Length % ElementSize) == 0,
+             "constant length must be a multiple of the element size in the "
+             "element-wise atomic memory intrinsic",
+             CS);
+    }
+
+    auto IsValidAlignment = [&](uint64_t Alignment) {
+      return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
+    };
+    uint64_t DstAlignment = CS.getParamAlignment(0),
+             SrcAlignment = CS.getParamAlignment(1);
+    Assert(IsValidAlignment(DstAlignment),
+           "incorrect alignment of the destination argument", CS);
+    Assert(IsValidAlignment(SrcAlignment),
+           "incorrect alignment of the source argument", CS);
+
+    break;
+  }
   case Intrinsic::gcroot:
   case Intrinsic::gcwrite:
   case Intrinsic::gcread:
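Because the verifier now enforces these invariants, downstream code can rely
on them when pattern-matching the intrinsic through the new ``IntrinsicInst``
subclass. A sketch of a hypothetical analysis fragment::

    #include "llvm/IR/IntrinsicInst.h"

    using namespace llvm;

    // Sketch: the classof added above lets dyn_cast recognize the intrinsic,
    // and the verifier rules make the exact division below safe whenever the
    // length is a constant.
    static void inspect(Instruction &I) {
      auto *AMM = dyn_cast<ElementUnorderedAtomicMemMoveInst>(&I);
      if (!AMM)
        return;
      // Verifier: constant, power of two, no larger than the align attrs.
      uint32_t ElSz = AMM->getElementSizeInBytes();
      if (auto *Len = dyn_cast<ConstantInt>(AMM->getLength())) {
        uint64_t NumElements = Len->getZExtValue() / ElSz; // divides evenly
        (void)NumElements;
      }
    }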
Index: llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
+++ llvm/trunk/test/CodeGen/X86/element-wise-atomic-memory-intrinsics.ll
@@ -62,4 +62,67 @@
   call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4)
   ret void
 }
+
+define i8* @test_memmove1(i8* %P, i8* %Q) {
+  ; CHECK: test_memmove1
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 1)
+  ret i8* %P
+  ; 3rd arg (%edx) -- length
+  ; CHECK-DAG: movl $1, %edx
+  ; CHECK: __llvm_memmove_element_unordered_atomic_1
+}
+
+define i8* @test_memmove2(i8* %P, i8* %Q) {
+  ; CHECK: test_memmove2
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 2, i32 2)
+  ret i8* %P
+  ; 3rd arg (%edx) -- length
+  ; CHECK-DAG: movl $2, %edx
+  ; CHECK: __llvm_memmove_element_unordered_atomic_2
+}
+
+define i8* @test_memmove4(i8* %P, i8* %Q) {
+  ; CHECK: test_memmove4
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 4, i32 4)
+  ret i8* %P
+  ; 3rd arg (%edx) -- length
+  ; CHECK-DAG: movl $4, %edx
+  ; CHECK: __llvm_memmove_element_unordered_atomic_4
+}
+
+define i8* @test_memmove8(i8* %P, i8* %Q) {
+  ; CHECK: test_memmove8
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %P, i8* align 8 %Q, i32 8, i32 8)
+  ret i8* %P
+  ; 3rd arg (%edx) -- length
+  ; CHECK-DAG: movl $8, %edx
+  ; CHECK: __llvm_memmove_element_unordered_atomic_8
+}
+
+define i8* @test_memmove16(i8* %P, i8* %Q) {
+  ; CHECK: test_memmove16
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %P, i8* align 16 %Q, i32 16, i32 16)
+  ret i8* %P
+  ; 3rd arg (%edx) -- length
+  ; CHECK-DAG: movl $16, %edx
+  ; CHECK: __llvm_memmove_element_unordered_atomic_16
+}
+
+define void @test_memmove_args(i8** %Storage) {
+  ; CHECK: test_memmove_args
+  %Dst = load i8*, i8** %Storage
+  %Src.addr = getelementptr i8*, i8** %Storage, i64 1
+  %Src = load i8*, i8** %Src.addr
+
+  ; 1st arg (%rdi)
+  ; CHECK-DAG: movq (%rdi), [[REG1:%r.+]]
+  ; CHECK-DAG: movq [[REG1]], %rdi
+  ; 2nd arg (%rsi)
+  ; CHECK-DAG: movq 8(%rdi), %rsi
+  ; 3rd arg (%edx) -- length
+  ; CHECK-DAG: movl $4, %edx
+  ; CHECK: __llvm_memmove_element_unordered_atomic_4
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %Dst, i8* align 4 %Src, i32 4, i32 4)
+  ret void
+}
+
 declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
Index: llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
===================================================================
--- llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
+++ llvm/trunk/test/Verifier/element-wise-atomic-memory-intrinsics.ll
@@ -22,4 +22,28 @@
   ret void
 }
 declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+
+define void @test_memmove(i8* %P, i8* %Q, i32 %A, i32 %E) {
+  ; CHECK: element size of the element-wise unordered atomic memory intrinsic must be a constant int
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 %E)
+  ; CHECK: element size of the element-wise atomic memory intrinsic must be a power of 2
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 1, i32 3)
+
+  ; CHECK: constant length must be a multiple of the element size in the element-wise atomic memory intrinsic
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 4 %Q, i32 7, i32 4)
+
+  ; CHECK: incorrect alignment of the destination argument
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* %P, i8* align 4 %Q, i32 1, i32 1)
+  ; CHECK: incorrect alignment of the destination argument
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %P, i8* align 4 %Q, i32 4, i32 4)
+
+  ; CHECK: incorrect alignment of the source argument
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* %Q, i32 1, i32 1)
+  ; CHECK: incorrect alignment of the source argument
+  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %P, i8* align 1 %Q, i32 4, i32 4)
+
+  ret void
+}
+declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind
+
 ; CHECK: input module is broken!
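The lit test above drives these diagnostics through the verifier via the usual
``not``/FileCheck harness; the same checks can also be exercised directly
against the verifier API, for example in a unit test. A sketch, using a
hypothetical reduced module whose destination pointer is missing its ``align``
attribute::

    #include "llvm/AsmParser/Parser.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Module.h"
    #include "llvm/IR/Verifier.h"
    #include "llvm/Support/SourceMgr.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;

    // Sketch: trip one of the new diagnostics programmatically.
    int main() {
      LLVMContext Ctx;
      SMDiagnostic Err;
      std::unique_ptr<Module> M = parseAssemblyString(
          "declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32("
          "i8* nocapture, i8* nocapture, i32, i32)\n"
          "define void @f(i8* %P, i8* %Q) {\n"
          "  call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32("
          "i8* %P, i8* align 4 %Q, i32 1, i32 1)\n"
          "  ret void\n"
          "}\n",
          Err, Ctx);
      if (!M) {
        Err.print("sketch", errs());
        return 1;
      }
      // Expect "incorrect alignment of the destination argument" on stderr.
      return verifyModule(*M, &errs()) ? 0 : 1;
    }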