diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3218,6 +3218,26 @@
 Note that this intrinsic cannot yet be called in a ``constexpr`` context.
 
+Guaranteed inlined memset
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  void __builtin_memset_inline(void *dst, int value, size_t size);
+
+
+``__builtin_memset_inline`` has been designed as a building block for efficient
+``memset`` implementations. It is identical to ``__builtin_memset`` but also
+guarantees not to call any external functions. See LLVM IR `llvm.memset.inline
+<https://llvm.org/docs/LangRef.html#llvm-memset-inline-intrinsic>`_ intrinsic
+for more information.
+
+This is useful to implement a custom version of ``memset``, to implement a
+``libc`` ``memset``, or to work around the absence of a ``libc``.
+
+Note that the ``size`` argument must be a compile-time constant.
+
+Note that this intrinsic cannot yet be called in a ``constexpr`` context.
 
 Atomic Min/Max builtins with memory ordering
 --------------------------------------------
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -559,6 +559,7 @@
 BUILTIN(__builtin_memmove, "v*v*vC*z", "nF")
 BUILTIN(__builtin_mempcpy, "v*v*vC*z", "nF")
 BUILTIN(__builtin_memset, "v*v*iz", "nF")
+BUILTIN(__builtin_memset_inline, "vv*iIz", "n")
 BUILTIN(__builtin_printf, "icC*.", "Fp:0:")
 BUILTIN(__builtin_stpcpy, "c*c*cC*", "nF")
 BUILTIN(__builtin_stpncpy, "c*c*cC*z", "nF")
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -344,6 +344,14 @@
                          Dest.getAlignment().getAsAlign(), IsVolatile);
   }
 
+  using CGBuilderBaseTy::CreateMemSetInline;
+  llvm::CallInst *CreateMemSetInline(Address Dest, llvm::Value *Value,
+                                     uint64_t Size) {
+    return CreateMemSetInline(Dest.getPointer(),
+                              Dest.getAlignment().getAsAlign(), Value,
+                              getInt64(Size));
+  }
+
   using CGBuilderBaseTy::CreatePreserveStructAccessIndex;
   Address CreatePreserveStructAccessIndex(Address Addr, unsigned Index,
                                           unsigned FieldIndex,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3508,6 +3508,17 @@
     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
     return RValue::get(Dest.getPointer());
   }
+  case Builtin::BI__builtin_memset_inline: {
+    Address Dest = EmitPointerWithAlignment(E->getArg(0));
+    Value *ByteVal =
+        Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
+    uint64_t Size =
+        E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
+    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
+                        E->getArg(0)->getExprLoc(), FD, 0);
+    Builder.CreateMemSetInline(Dest, ByteVal, Size);
+    return RValue::get(nullptr);
+  }
   case Builtin::BI__builtin___memset_chk: {
     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
     Expr::EvalResult SizeResult, DstSizeResult;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2277,6 +2277,17 @@
     }
     break;
   }
+  case Builtin::BI__builtin_memset_inline: {
+    clang::Expr *SizeOp = TheCall->getArg(2);
+    // We warn about filling a null pointer when `size` is greater than 0.
+    // When `size` is value-dependent we cannot evaluate its value, so we
+    // bail out.
+    if (SizeOp->isValueDependent())
+      break;
+    if (!SizeOp->EvaluateKnownConstInt(Context).isZero())
+      CheckNonNullArgument(*this, TheCall->getArg(0), TheCall->getExprLoc());
+    break;
+  }
 #define BUILTIN(ID, TYPE, ATTRS)
 #define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \
   case Builtin::BI##ID: \
diff --git a/clang/test/CodeGen/builtins-memset-inline.c b/clang/test/CodeGen/builtins-memset-inline.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/builtins-memset-inline.c
@@ -0,0 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_0(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_0(void *dst, char value) {
+  // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 0, i1 false)
+  __builtin_memset_inline(dst, value, 0);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_1(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_1(void *dst, char value) {
+  // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 1, i1 false)
+  __builtin_memset_inline(dst, value, 1);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_4(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_4(void *dst, char value) {
+  // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 4, i1 false)
+  __builtin_memset_inline(dst, value, 4);
+}
diff --git a/clang/test/Sema/builtins-memset-inline.cpp b/clang/test/Sema/builtins-memset-inline.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/builtins-memset-inline.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define NULL ((char *)0)
+
+#if __has_builtin(__builtin_memset_inline)
+#warning defined as expected
+// expected-warning@-1 {{defined as expected}}
+#endif
+
+void test_memset_inline_invalid_arg_types() {
+  __builtin_memset_inline(1, 2, 3); // expected-error {{cannot initialize a parameter of type 'void *' with an rvalue of type 'int'}}
+}
+
+void test_memset_inline_null_dst(void *ptr) {
+  __builtin_memset_inline(NULL, 1, 4); // expected-warning {{null passed to a callee that requires a non-null argument}}
+}
+
+void test_memset_inline_null_buffer_is_ok_if_size_is_zero(void *ptr, char value) {
+  __builtin_memset_inline(NULL, value, /*size */ 0);
+}
+
+void test_memset_inline_non_constant_size(void *dst, char value, unsigned size) {
+  __builtin_memset_inline(dst, value, size); // expected-error {{argument to '__builtin_memset_inline' must be a constant integer}}
+}
+
+template <unsigned size>
+void test_memset_inline_template(void *dst, char value) {
+  // We do not try to evaluate size in non-instantiated templates.
+  __builtin_memset_inline(dst, value, size);
+}
+
+void test_memset_inline_implicit_conversion(void *ptr, char value) {
+  char a[5];
+  __builtin_memset_inline(a, value, 5);
+}
+
+void test_memset_inline_num_args(void *dst, char value) {
+  __builtin_memset_inline();                    // expected-error {{too few arguments to function call}}
+  __builtin_memset_inline(dst, value, 4, NULL); // expected-error {{too many arguments to function call}}
+}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -13867,6 +13867,71 @@
 If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
 behavior is undefined.
 
+.. _int_memset_inline:
+
+'``llvm.memset.inline``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset.inline`` on any
+integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memset.inline.p0i8.i32(i8* <dest>, i8 <val>,
+                                                i32 <len>,
+                                                i1 <isvolatile>)
+      declare void @llvm.memset.inline.p0i8.i64(i8* <dest>, i8 <val>,
+                                                i64 <len>,
+                                                i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill a block of memory with a
+particular byte value and guarantee that no external functions are called.
+
+Note that, unlike the standard libc function, the ``llvm.memset.inline.*``
+intrinsics do not return a value, take an extra ``isvolatile`` argument, and
+allow the pointer to be in a specified address space.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the byte value with which to fill it, the third argument is a constant
+integer specifying the number of bytes to fill, and the fourth
+is a boolean indicating a volatile access.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the ``llvm.memset.inline`` call is
+a :ref:`volatile operation <volatile>`. The detailed access behavior is not
+very cleanly specified and it is unwise to depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill "len" bytes of memory starting
+at the destination location. If the argument is known to be
+aligned to some boundary, this can be specified as an attribute on
+the argument.
+
+``len`` must be a constant expression.
+If ``<len>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
+behavior is undefined.
+
+The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
+'``llvm.memset.*``', but the generated code is guaranteed not to call any
+external functions.
+
 '``llvm.sqrt.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1052,7 +1052,8 @@
                     const AAMDNodes &AAInfo = AAMDNodes());
 
   SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
-                    SDValue Size, Align Alignment, bool isVol, bool isTailCall,
+                    SDValue Size, Align Alignment, bool isVol,
+                    bool AlwaysInline, bool isTailCall,
                     MachinePointerInfo DstPtrInfo,
                     const AAMDNodes &AAInfo = AAMDNodes());
 
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -76,11 +76,13 @@
   /// that don't fit the target's parameters for simple stores and can be more
   /// efficient than using a library call. This function can return a null
   /// SDValue if the target declines to use custom code and a different
-  /// lowering strategy should be used.
+  /// lowering strategy should be used. Note that if AlwaysInline is true the
+  /// function has to return a valid SDValue.
   virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                           SDValue Chain, SDValue Op1,
                                           SDValue Op2, SDValue Op3,
                                           Align Alignment, bool isVolatile,
+                                          bool AlwaysInline,
                                           MachinePointerInfo DstPtrInfo) const {
     return SDValue();
   }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3539,6 +3539,7 @@
   /// Determines the optimal series of memory ops to replace the memset / memcpy.
   /// Return true if the number of memory ops is below the threshold (Limit).
+  /// Note that this is always the case when Limit is ~0.
   /// It returns the types of the sequence of memory ops to perform
   /// memset / memcpy by reference.
   virtual bool
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -578,6 +578,12 @@
                          MDNode *ScopeTag = nullptr,
                          MDNode *NoAliasTag = nullptr);
 
+  CallInst *CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val,
+                               Value *Size, bool IsVolatile = false,
+                               MDNode *TBAATag = nullptr,
+                               MDNode *ScopeTag = nullptr,
+                               MDNode *NoAliasTag = nullptr);
+
   /// Create and insert an element unordered-atomic memset of the region of
   /// memory starting at the given pointer to the given value.
   ///
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -973,6 +973,7 @@
     case Intrinsic::memcpy:
     case Intrinsic::memmove:
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memcpy_inline:
       return true;
     default:
@@ -984,12 +985,33 @@
   }
 };
 
-/// This class wraps the llvm.memset intrinsic.
+/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
 class MemSetInst : public MemSetBase<MemIntrinsic> {
 public:
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const IntrinsicInst *I) {
-    return I->getIntrinsicID() == Intrinsic::memset;
+    switch (I->getIntrinsicID()) {
+    case Intrinsic::memset:
+    case Intrinsic::memset_inline:
+      return true;
+    default:
+      return false;
+    }
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This class wraps the llvm.memset.inline intrinsic.
+class MemSetInlineInst : public MemSetInst {
+public:
+  ConstantInt *getLength() const {
+    return cast<ConstantInt>(MemSetInst::getLength());
+  }
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_inline;
   }
   static bool classof(const Value *V) {
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
@@ -1074,6 +1096,7 @@
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
+  case Intrinsic::memset_inline:
   case Intrinsic::memcpy_element_unordered_atomic:
   case Intrinsic::memmove_element_unordered_atomic:
   case Intrinsic::memset_element_unordered_atomic:
@@ -1095,6 +1118,7 @@
   static bool classof(const IntrinsicInst *I) {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -651,6 +651,17 @@
       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>, ImmArg<ArgIndex<3>>]>;
 
+// Memset version that is guaranteed to be inlined.
+// In particular this means that the generated code is not allowed to call any
+// external function.
+// The third argument (specifying the size) must be a constant.
+def int_memset_inline
+    : Intrinsic<[],
+      [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -335,6 +335,12 @@
                          MSI->getDestAlign(), nullptr, MemRef::Write);
     break;
   }
+  case Intrinsic::memset_inline: {
+    MemSetInlineInst *MSII = cast<MemSetInlineInst>(&I);
+    visitMemoryReference(I, MemoryLocation::getForDest(MSII),
+                         MSII->getDestAlign(), nullptr, MemRef::Write);
+    break;
+  }
 
   case Intrinsic::vastart:
     Check(I.getParent()->getParent()->isVarArg(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6987,17 +6987,18 @@
 /// \param Size Number of bytes to write.
 /// \param Alignment Alignment of the destination in bytes.
 /// \param isVol True if destination is volatile.
+/// \param AlwaysInline Makes sure no function call is generated.
 /// \param DstPtrInfo IR information on the memory pointer.
 /// \returns New head in the control flow, if lowering was successful, empty
 /// SDValue otherwise.
 ///
 /// The function tries to replace 'llvm.memset' intrinsic with several store
 /// operations and value calculation code. This is usually profitable for small
-/// memory size.
+/// memory size or when the semantics require inlining.
 static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
                                SDValue Chain, SDValue Dst, SDValue Src,
                                uint64_t Size, Align Alignment, bool isVol,
-                               MachinePointerInfo DstPtrInfo,
+                               bool AlwaysInline, MachinePointerInfo DstPtrInfo,
                                const AAMDNodes &AAInfo) {
   // Turn a memset of undef to nop.
   // FIXME: We need to honor volatile even if Src is undef.
@@ -7017,8 +7018,10 @@
     DstAlignCanChange = true;
   bool IsZeroVal =
       isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero();
+  unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize);
+
   if (!TLI.findOptimalMemOpLowering(
-          MemOps, TLI.getMaxStoresPerMemset(OptSize),
+          MemOps, Limit,
           MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
           DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
     return SDValue();
@@ -7314,7 +7317,7 @@
 SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
                                 SDValue Src, SDValue Size, Align Alignment,
-                                bool isVol, bool isTailCall,
+                                bool isVol, bool AlwaysInline, bool isTailCall,
                                 MachinePointerInfo DstPtrInfo,
                                 const AAMDNodes &AAInfo) {
   // Check to see if we should lower the memset to stores first.
@@ -7327,7 +7330,7 @@
     SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
                                      ConstantSize->getZExtValue(), Alignment,
-                                     isVol, DstPtrInfo, AAInfo);
+                                     isVol, false, DstPtrInfo, AAInfo);
 
     if (Result.getNode())
       return Result;
@@ -7337,11 +7340,23 @@
   // code. If the target chooses to do this, this is the next best.
   if (TSI) {
     SDValue Result = TSI->EmitTargetCodeForMemset(
-        *this, dl, Chain, Dst, Src, Size, Alignment, isVol, DstPtrInfo);
+        *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline, DstPtrInfo);
     if (Result.getNode())
       return Result;
   }
 
+  // If we really need inline code and the target declined to provide it,
+  // use a (potentially long) sequence of loads and stores.
+  if (AlwaysInline) {
+    assert(ConstantSize && "AlwaysInline requires a constant size!");
+    SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+                                     ConstantSize->getZExtValue(), Alignment,
+                                     isVol, true, DstPtrInfo, AAInfo);
+    assert(Result &&
+           "getMemsetStores must return a valid sequence when AlwaysInline");
+    return Result;
+  }
+
   checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
 
   // Emit a library call.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5909,10 +5909,28 @@
     bool isVol = MSI.isVolatile();
     bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
     SDValue Root = isVol ? getRoot() : getMemoryRoot();
-    SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC,
+    SDValue MS = DAG.getMemset(
+        Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false,
+        isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata());
+    updateDAGForMaybeTailCall(MS);
+    return;
+  }
+  case Intrinsic::memset_inline: {
+    const auto &MSII = cast<MemSetInlineInst>(I);
+    SDValue Dst = getValue(I.getArgOperand(0));
+    SDValue Value = getValue(I.getArgOperand(1));
+    SDValue Size = getValue(I.getArgOperand(2));
+    assert(isa<ConstantSDNode>(Size) && "memset_inline needs constant size");
+    // @llvm.memset defines 0 and 1 to both mean no alignment.
+    Align DstAlign = MSII.getDestAlign().valueOrOne();
+    bool isVol = MSII.isVolatile();
+    bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+    SDValue Root = isVol ? getRoot() : getMemoryRoot();
+    SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol,
+                               /* AlwaysInline */ true, isTC,
                                MachinePointerInfo(I.getArgOperand(0)),
                                I.getAAMetadata());
-    updateDAGForMaybeTailCall(MS);
+    updateDAGForMaybeTailCall(MC);
     return;
   }
   case Intrinsic::memmove: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -196,7 +196,8 @@
 bool TargetLowering::findOptimalMemOpLowering(
     std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
-  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+  if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() &&
+      Op.getSrcAlign() < Op.getDstAlign())
     return false;
 
   EVT VT = getOptimalMemOpType(Op, FuncAttributes);
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -164,6 +164,35 @@
   return CI;
 }
 
+CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign,
+                                            Value *Val, Value *Size,
+                                            bool IsVolatile, MDNode *TBAATag,
+                                            MDNode *ScopeTag,
+                                            MDNode *NoAliasTag) {
+  Dst = getCastedInt8PtrValue(Dst);
+  Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)};
+  Type *Tys[] = {Dst->getType(), Size->getType()};
+  Module *M = BB->getParent()->getParent();
+  Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys);
+
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+  if (DstAlign)
+    cast<MemSetInlineInst>(CI)->setDestAlignment(*DstAlign);
+
+  // Set the TBAA info if present.
+  if (TBAATag)
+    CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+  if (ScopeTag)
+    CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+  if (NoAliasTag)
+    CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+  return CI;
+}
+
 CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
     Value *Ptr, Value *Val, Value *Size, Align Alignment, uint32_t ElementSize,
     MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4917,7 +4917,8 @@
   case Intrinsic::memcpy:
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
-  case Intrinsic::memset: {
+  case Intrinsic::memset:
+  case Intrinsic::memset_inline: {
     const auto *MI = cast<MemIntrinsic>(&Call);
     auto IsValidAlignment = [&](unsigned Alignment) -> bool {
       return Alignment == 0 || isPowerOf2_32(Alignment);
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -34,7 +34,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
   SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl,
                                    SDValue Chain,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -91,7 +91,7 @@
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -55,6 +55,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool isVolatile,
+                                  bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -296,7 +296,7 @@
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   const ARMSubtarget &Subtarget =
@@ -314,6 +314,9 @@
                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
   }
 
-  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
-                                Alignment.value(), RTLIB::MEMSET);
+  if (!AlwaysInline)
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                  Alignment.value(), RTLIB::MEMSET);
+
+  return SDValue();
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1000,13 +1000,15 @@
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
   const int MVCFastLen = 16;
 
-  // Don't expand Op into scalar loads/stores in these cases:
-  if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
-    return false; // Small memcpy: Use MVC
-  if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
-    return false; // Small memset (first byte with STC/MVI): Use MVC
-  if (Op.isZeroMemset())
-    return false; // Memset zero: Use XC
+  if (Limit != ~unsigned(0)) {
+    // Don't expand Op into scalar loads/stores in these cases:
+    if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
+      return false; // Small memcpy: Use MVC
+    if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
+      return false; // Small memset (first byte with STC/MVI): Use MVC
+    if (Op.isZeroMemset())
+      return false; // Memset zero: Use XC
+  }
 
   return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
                                                   SrcAS, FuncAttributes);
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -31,7 +31,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Dst, SDValue Byte,
                                   SDValue Size, Align Alignment,
-                                  bool IsVolatile,
+                                  bool IsVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   std::pair<SDValue, SDValue>
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -88,7 +88,7 @@
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
     SDValue Byte, SDValue Size, Align Alignment, bool IsVolatile,
-    MachinePointerInfo DstPtrInfo) const {
+    bool AlwaysInline, MachinePointerInfo DstPtrInfo) const {
   EVT PtrVT = Dst.getValueType();
 
   if (IsVolatile)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -37,6 +37,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool IsVolatile,
+                                  bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 };
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -44,7 +44,7 @@
 SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool IsVolatile,
+    SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>();
   if (!ST.hasBulkMemory())
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -29,7 +29,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,7 +46,7 @@
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   const X86Subtarget &Subtarget =
@@ -143,7 +143,8 @@
         DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                     DAG.getConstant(Offset, dl, AddrVT)),
         Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-        isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+        isVolatile, AlwaysInline,
+        /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
   }
 
   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
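For reference, here is a minimal standalone sketch of how the ``CreateMemSetInline`` helper added to ``IRBuilder`` above can be used to emit the new intrinsic from C++. It assumes this patch is applied; the function name ``fill16`` and the fixed 16-byte size are illustrative assumptions, not part of the patch.

.. code-block:: cpp

  // Sketch only: emit @llvm.memset.inline via the CreateMemSetInline helper
  // introduced in this patch. "fill16" is a hypothetical function name.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("memset-inline-demo", Ctx);
    IRBuilder<> B(Ctx);

    // Build: define void @fill16(i8* %dst, i8 %value)
    auto *FnTy = FunctionType::get(
        B.getVoidTy(), {B.getInt8PtrTy(), B.getInt8Ty()}, /*isVarArg=*/false);
    Function *F =
        Function::Create(FnTy, Function::ExternalLinkage, "fill16", &M);
    B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

    // The size operand must be a constant integer; with a fixed 16 bytes the
    // backend is required to lower this to stores rather than a memset call.
    B.CreateMemSetInline(F->getArg(0), MaybeAlign(1), F->getArg(1),
                         B.getInt64(16), /*IsVolatile=*/false);
    B.CreateRetVoid();

    M.print(outs(), nullptr); // Shows a call to the llvm.memset.inline intrinsic.
    return 0;
  }

The frontend change in ``CGBuiltin.cpp`` above does essentially the same thing for ``__builtin_memset_inline``, after evaluating the size argument as a known constant.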
diff --git a/llvm/test/CodeGen/AArch64/memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/memset-inline.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=-neon | FileCheck %s --check-prefixes=ALL,GPR +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=neon | FileCheck %s --check-prefixes=ALL,NEON + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; ///////////////////////////////////////////////////////////////////////////// + +define void @memset_1(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_1: +; ALL: // %bb.0: +; ALL-NEXT: strb w1, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0) + ret void +} + +define void @memset_2(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_2: +; ALL: // %bb.0: +; ALL-NEXT: bfi w1, w1, #8, #24 +; ALL-NEXT: strh w1, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0) + ret void +} + +define void @memset_4(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_4: +; ALL: // %bb.0: +; ALL-NEXT: mov w8, #16843009 +; ALL-NEXT: and w9, w1, #0xff +; ALL-NEXT: mul w8, w9, w8 +; ALL-NEXT: str w8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0) + ret void +} + +define void @memset_8(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_8: +; ALL: // %bb.0: +; ALL-NEXT: // kill: def $w1 killed $w1 def $x1 +; ALL-NEXT: mov x8, #72340172838076673 +; ALL-NEXT: and x9, x1, #0xff +; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: str x8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @memset_16(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_16: +; ALL: // %bb.0: +; ALL-NEXT: // kill: def $w1 killed $w1 def $x1 +; ALL-NEXT: mov x8, #72340172838076673 +; ALL-NEXT: and x9, x1, #0xff +; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: stp x8, x8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @memset_32(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_32: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: memset_32: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @memset_64(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_64: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #48] +; GPR-NEXT: stp x8, x8, [x0, #32] +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: memset_64: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0) + ret void +} + +; 
///////////////////////////////////////////////////////////////////////////// + +define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind { +; ALL-LABEL: aligned_memset_16: +; ALL: // %bb.0: +; ALL-NEXT: // kill: def $w1 killed $w1 def $x1 +; ALL-NEXT: mov x8, #72340172838076673 +; ALL-NEXT: and x9, x1, #0xff +; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: stp x8, x8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind { +; GPR-LABEL: aligned_memset_32: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_memset_32: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind { +; GPR-LABEL: aligned_memset_64: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #48] +; GPR-NEXT: stp x8, x8, [x0, #32] +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_memset_64: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @bzero_1(i8* %a) nounwind { +; ALL-LABEL: bzero_1: +; ALL: // %bb.0: +; ALL-NEXT: strb wzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0) + ret void +} + +define void @bzero_2(i8* %a) nounwind { +; ALL-LABEL: bzero_2: +; ALL: // %bb.0: +; ALL-NEXT: strh wzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0) + ret void +} + +define void @bzero_4(i8* %a) nounwind { +; ALL-LABEL: bzero_4: +; ALL: // %bb.0: +; ALL-NEXT: str wzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0) + ret void +} + +define void @bzero_8(i8* %a) nounwind { +; ALL-LABEL: bzero_8: +; ALL: // %bb.0: +; ALL-NEXT: str xzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0) + ret void +} + +define void @bzero_16(i8* %a) nounwind { +; ALL-LABEL: bzero_16: +; ALL: // %bb.0: +; ALL-NEXT: stp xzr, xzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0) + ret void +} + +define void @bzero_32(i8* %a) nounwind { +; GPR-LABEL: bzero_32: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI15_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI15_0] +; GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: bzero_32: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0) + ret void +} + +define void @bzero_64(i8* %a) nounwind { +; GPR-LABEL: bzero_64: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI16_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI16_0] +; 
GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: stp q0, q0, [x0, #32] +; GPR-NEXT: ret +; +; NEON-LABEL: bzero_64: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @aligned_bzero_16(i8* %a) nounwind { +; ALL-LABEL: aligned_bzero_16: +; ALL: // %bb.0: +; ALL-NEXT: stp xzr, xzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0) + ret void +} + +define void @aligned_bzero_32(i8* %a) nounwind { +; GPR-LABEL: aligned_bzero_32: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI18_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI18_0] +; GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_bzero_32: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0) + ret void +} + +define void @aligned_bzero_64(i8* %a) nounwind { +; GPR-LABEL: aligned_bzero_64: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI19_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: stp q0, q0, [x0, #32] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_bzero_64: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +define void @test1(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: test1: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: mov x8, #72340172838076673 +; CHECK-NEXT: and x9, x1, #0xff +; CHECK-NEXT: mul x8, x9, x8 +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: regular_memset_calls_external_function: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w2, #1024 +; CHECK-NEXT: b memset + tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} + +define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: inlined_set_doesnt_call_external_function: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w1 +; CHECK-NEXT: stp q0, q0, [x0] +; CHECK-NEXT: stp q0, q0, [x0, #32] + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/X86/memset-inline.ll b/llvm/test/CodeGen/X86/memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memset-inline.ll @@ -0,0 +1,548 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < 
%s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2,-sse4.2 | FileCheck %s --check-prefixes=GPR,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.2,-avx | FileCheck %s --check-prefixes=GPR,SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx,-avx512f | FileCheck %s --check-prefixes=GPR,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=GPR,AVX512 + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; ///////////////////////////////////////////////////////////////////////////// + +define void @memset_1(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_1: +; GPR: # %bb.0: +; GPR-NEXT: movb %sil, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0) + ret void +} + +define void @memset_2(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_2: +; GPR: # %bb.0: +; GPR-NEXT: movzbl %sil, %eax +; GPR-NEXT: shll $8, %esi +; GPR-NEXT: orl %esi, %eax +; GPR-NEXT: movw %ax, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0) + ret void +} + +define void @memset_4(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_4: +; GPR: # %bb.0: +; GPR-NEXT: movzbl %sil, %eax +; GPR-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; GPR-NEXT: movl %eax, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0) + ret void +} + +define void @memset_8(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_8: +; GPR: # %bb.0: +; GPR-NEXT: # kill: def $esi killed $esi def $rsi +; GPR-NEXT: movzbl %sil, %eax +; GPR-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; GPR-NEXT: imulq %rax, %rcx +; GPR-NEXT: movq %rcx, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @memset_16(i8* %a, i8 %value) nounwind { +; SSE2-LABEL: memset_16: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; SSE2-NEXT: imulq %rax, %rcx +; SSE2-NEXT: movq %rcx, 8(%rdi) +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: memset_16: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqu %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: memset_16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: memset_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @memset_32(i8* %a, i8 %value) nounwind { +; SSE2-LABEL: memset_32: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; SSE2-NEXT: imulq %rax, %rcx +; SSE2-NEXT: movq %rcx, 24(%rdi) +; SSE2-NEXT: movq %rcx, 16(%rdi) +; SSE2-NEXT: movq %rcx, 8(%rdi) +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: memset_32: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; 
SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqu %xmm0, 16(%rdi) +; SSE4-NEXT: movdqu %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: memset_32: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqu %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: memset_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @memset_64(i8* %a, i8 %value) nounwind { +; SSE2-LABEL: memset_64: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; SSE2-NEXT: imulq %rax, %rcx +; SSE2-NEXT: movq %rcx, 56(%rdi) +; SSE2-NEXT: movq %rcx, 48(%rdi) +; SSE2-NEXT: movq %rcx, 40(%rdi) +; SSE2-NEXT: movq %rcx, 32(%rdi) +; SSE2-NEXT: movq %rcx, 24(%rdi) +; SSE2-NEXT: movq %rcx, 16(%rdi) +; SSE2-NEXT: movq %rcx, 8(%rdi) +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: memset_64: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqu %xmm0, 48(%rdi) +; SSE4-NEXT: movdqu %xmm0, 32(%rdi) +; SSE4-NEXT: movdqu %xmm0, 16(%rdi) +; SSE4-NEXT: movdqu %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: memset_64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, 32(%rdi) +; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: memset_64: +; AVX512: # %bb.0: +; AVX512-NEXT: movzbl %sil, %eax +; AVX512-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind { +; SSE2-LABEL: aligned_memset_16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_memset_16: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_memset_16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_memset_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind { 
+; SSE2-LABEL: aligned_memset_32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_memset_32: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, 16(%rdi) +; SSE4-NEXT: movdqa %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_memset_32: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_memset_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind { +; SSE2-LABEL: aligned_memset_64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm0, 32(%rdi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_memset_64: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, 48(%rdi) +; SSE4-NEXT: movdqa %xmm0, 32(%rdi) +; SSE4-NEXT: movdqa %xmm0, 16(%rdi) +; SSE4-NEXT: movdqa %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_memset_64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_memset_64: +; AVX512: # %bb.0: +; AVX512-NEXT: movzbl %sil, %eax +; AVX512-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @bzero_1(i8* %a) nounwind { +; GPR-LABEL: bzero_1: +; GPR: # %bb.0: +; GPR-NEXT: movb $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0) + ret void +} + +define void @bzero_2(i8* %a) nounwind { +; GPR-LABEL: bzero_2: +; GPR: # %bb.0: +; GPR-NEXT: movw $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0) + ret void +} + +define void @bzero_4(i8* %a) nounwind { +; GPR-LABEL: bzero_4: +; GPR: # %bb.0: +; GPR-NEXT: movl $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0) + ret void +} + +define void @bzero_8(i8* %a) nounwind { +; GPR-LABEL: bzero_8: +; GPR: # %bb.0: +; GPR-NEXT: 
movq $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0) + ret void +} + +define void @bzero_16(i8* %a) nounwind { +; SSE2-LABEL: bzero_16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq $0, 8(%rdi) +; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: bzero_16: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movups %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: bzero_16: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: bzero_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0) + ret void +} + +define void @bzero_32(i8* %a) nounwind { +; SSE2-LABEL: bzero_32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq $0, 24(%rdi) +; SSE2-NEXT: movq $0, 16(%rdi) +; SSE2-NEXT: movq $0, 8(%rdi) +; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: bzero_32: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movups %xmm0, 16(%rdi) +; SSE4-NEXT: movups %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: bzero_32: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bzero_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0) + ret void +} + +define void @bzero_64(i8* %a) nounwind { +; SSE2-LABEL: bzero_64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq $0, 56(%rdi) +; SSE2-NEXT: movq $0, 48(%rdi) +; SSE2-NEXT: movq $0, 40(%rdi) +; SSE2-NEXT: movq $0, 32(%rdi) +; SSE2-NEXT: movq $0, 24(%rdi) +; SSE2-NEXT: movq $0, 16(%rdi) +; SSE2-NEXT: movq $0, 8(%rdi) +; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: bzero_64: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movups %xmm0, 48(%rdi) +; SSE4-NEXT: movups %xmm0, 32(%rdi) +; SSE4-NEXT: movups %xmm0, 16(%rdi) +; SSE4-NEXT: movups %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: bzero_64: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovups %ymm0, 32(%rdi) +; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bzero_64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @aligned_bzero_16(i8* %a) nounwind { +; SSE2-LABEL: aligned_bzero_16: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_bzero_16: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movaps %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_bzero_16: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_bzero_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0) + ret void +} + +define void @aligned_bzero_32(i8* %a) 
nounwind { +; SSE2-LABEL: aligned_bzero_32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 16(%rdi) +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_bzero_32: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movaps %xmm0, 16(%rdi) +; SSE4-NEXT: movaps %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_bzero_32: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_bzero_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0) + ret void +} + +define void @aligned_bzero_64(i8* %a) nounwind { +; SSE2-LABEL: aligned_bzero_64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 48(%rdi) +; SSE2-NEXT: movaps %xmm0, 32(%rdi) +; SSE2-NEXT: movaps %xmm0, 16(%rdi) +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_bzero_64: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movaps %xmm0, 48(%rdi) +; SSE4-NEXT: movaps %xmm0, 32(%rdi) +; SSE4-NEXT: movaps %xmm0, 16(%rdi) +; SSE4-NEXT: movaps %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_bzero_64: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_bzero_64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +define void @test1(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; CHECK-NEXT: imulq %rax, %rcx +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: regular_memset_calls_external_function: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $1024, %edx # imm = 0x400 +; CHECK-NEXT: jmp memset@PLT # TAILCALL + tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} + +define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: inlined_set_doesnt_call_external_function: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: movzbl %sil, %ecx +; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movq %rax, 1016(%rdi) 
+; CHECK-NEXT: movq %rax, 1008(%rdi) + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll --- a/llvm/test/Other/lint.ll +++ b/llvm/test/Other/lint.ll @@ -6,6 +6,8 @@ declare void @llvm.stackrestore(i8*) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memset.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind declare void @has_sret(i8* sret(i8) %p) declare void @has_noaliases(i32* noalias %p, i32* %q) declare void @one_arg(i32) @@ -87,6 +89,11 @@ ; CHECK: Unusual: noalias argument aliases another argument call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i1 0) +; CHECK: Write to read-only memory +call void @llvm.memset.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0) +; CHECK: Write to read-only memory +call void @llvm.memset.inline.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0) + ; CHECK: Undefined behavior: Buffer overflow %wider = bitcast i8* %buf to i16* store i16 0, i16* %wider diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll --- a/llvm/test/Verifier/intrinsic-immarg.ll +++ b/llvm/test/Verifier/intrinsic-immarg.ll @@ -62,6 +62,23 @@ ret void } +declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1) +define void @memset_inline_is_volatile(i8* %dest, i8 %value, i1 %is.volatile) { + ; CHECK: immarg operand has non-immediate parameter + ; CHECK-NEXT: i1 %is.volatile + ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile) + call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile) + ret void +} + +define void @memset_inline_variable_size(i8* %dest, i8 %value, i32 %size) { + ; CHECK: immarg operand has non-immediate parameter + ; CHECK-NEXT: i32 %size + ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true) + call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true) + ret void +} + declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1) define void @objectsize(i8* %ptr, i1 %a, i1 %b, i1 %c) { diff --git a/llvm/test/Verifier/memset-inline.ll b/llvm/test/Verifier/memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Verifier/memset-inline.ll @@ -0,0 +1,9 @@ +; RUN: not opt -verify < %s 2>&1 | FileCheck %s + +; CHECK: alignment is not a power of two + +define void @foo(i8* %P, i8 %value) { + call void @llvm.memset.inline.p0i8.i32(i8* align 3 %P, i8 %value, i32 4, i1 false) + ret void +} +declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1) nounwind
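As a usage note for the clang builtin documented at the top of this patch: because the ``size`` argument must be a compile-time constant, a runtime size has to be dispatched onto constant cases. The sketch below shows one way a freestanding environment might build a small, call-free fill helper on top of ``__builtin_memset_inline``; the function name ``fill_small`` and the dispatch scheme are illustrative assumptions, not part of the patch.

.. code-block:: cpp

  // Sketch only: a tiny fill helper for a freestanding build (-ffreestanding
  // or -fno-builtin assumed, so the fallback loop is not turned back into a
  // memset libcall). "fill_small" is a hypothetical name.
  #include <stddef.h>

  void fill_small(void *dst, int value, size_t size) {
    switch (size) {
    case 1:  __builtin_memset_inline(dst, value, 1);  return;
    case 2:  __builtin_memset_inline(dst, value, 2);  return;
    case 4:  __builtin_memset_inline(dst, value, 4);  return;
    case 8:  __builtin_memset_inline(dst, value, 8);  return;
    case 16: __builtin_memset_inline(dst, value, 16); return;
    default:
      // Plain byte loop for any other size.
      for (size_t i = 0; i != size; ++i)
        ((char *)dst)[i] = (char)value;
      return;
    }
  }

Each ``case`` passes a constant size, so every call satisfies the constraint checked in ``SemaChecking.cpp`` and is lowered to inline stores by the backends changed above.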