diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -304,9 +304,6 @@
 def ObjCNonFragileRuntime
     : LangOpt<"ObjCNonFragileRuntime", "LangOpts.ObjCRuntime.allowsClassStubs()">;
 
-// Language option for CMSE extensions
-def Cmse : LangOpt<"Cmse">;
-
 // Defines targets for target-specific attributes. Empty lists are unchecked.
 class TargetSpec {
   // Specifies Architectures for which the target applies, based off the
@@ -3252,3 +3249,10 @@
   let Subjects = SubjectList<[NonParmVar, Function, Block, ObjCMethod]>;
   let Documentation = [ObjCExternallyRetainedDocs];
 }
+
+def NoBuiltin : InheritableAttr {
+  let Spellings = [Clang<"no_builtin">];
+  let Args = [VariadicStringArgument<"FunctionNames">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [NoBuiltinDocs];
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3757,7 +3757,7 @@
 def WebAssemblyImportModuleDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
-Clang supports the ``__attribute__((import_module(<module_name>)))`` 
+Clang supports the ``__attribute__((import_module(<module_name>)))``
 attribute for the WebAssembly target. This attribute may be attached to a
 function declaration, where it modifies how the symbol is to be imported
 within the WebAssembly linking environment.
@@ -3774,7 +3774,7 @@
 def WebAssemblyImportNameDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
-Clang supports the ``__attribute__((import_name(<import_name>)))`` 
+Clang supports the ``__attribute__((import_name(<import_name>)))``
 attribute for the WebAssembly target. This attribute may be attached to a
 function declaration, where it modifies how the symbol is to be imported
 within the WebAssembly linking environment.
@@ -3978,7 +3978,7 @@
 (`start_routine`) is called zero or more times by the `pthread_create` function,
 and that the fourth parameter (`arg`) is passed along. Note that the callback
 behavior of `pthread_create` is automatically recognized by Clang. In addition,
-the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for 
+the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for
 `#pragma omp target teams` and `#pragma omp parallel`, respectively, are also
 automatically recognized as broker functions. Further functions might be added
 in the future.
@@ -4157,3 +4157,29 @@
 ``__attribute__((malloc))``.
 }];
 }
+
+def NoBuiltinDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``__attribute__((no_builtin("function_name")))`` attribute is similar to
+the ``-fno-builtin`` flag, but is specific to the annotated function. It takes
+one or more library function names and prevents the compiler from treating
+calls to those functions inside this function as builtins; each listed name
+is lowered to a ``"no-builtin-<function_name>"`` attribute on the generated
+IR function.
+
+For example, this keeps the compiler from turning the copy loop below back
+into a call to the ``memcpy`` library function:
+
+.. code-block:: c
+
+  __attribute__((no_builtin("memcpy")))
+  void *my_memcpy(void *dst, const void *src, unsigned long count) {
+    char *d = (char *)dst;
+    const char *s = (const char *)src;
+    for (unsigned long i = 0; i < count; ++i)
+      d[i] = s[i];
+    return dst;
+  }
+  }];
+}
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -504,6 +504,7 @@
 BUILTIN(__builtin_vsnprintf, "ic*zcC*a", "nFP:2:")
 BUILTIN(__builtin_thread_pointer, "v*", "nc")
 BUILTIN(__builtin_launder, "v*v*", "nt")
+BUILTIN(__builtin_memcpy_inline, "vv*vC*z", "n")
 LANGBUILTIN(__builtin_is_constant_evaluated, "b", "n", CXX_LANG)
 
 // GCC exception builtins
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -284,6 +284,14 @@
                                  Size, IsVolatile);
   }
 
+  using CGBuilderBaseTy::CreateMemCpyInline;
+  llvm::CallInst *CreateMemCpyInline(Address Dest, Address Src,
+                                     uint64_t Size) {
+    return CreateMemCpyInline(
+        Dest.getPointer(), Dest.getAlignment().getQuantity(), Src.getPointer(),
+        Src.getAlignment().getQuantity(), getInt64(Size));
+  }
+
   using CGBuilderBaseTy::CreateMemMove;
   llvm::CallInst *CreateMemMove(Address Dest, Address Src, llvm::Value *Size,
                                 bool IsVolatile = false) {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2362,7 +2362,18 @@
     Builder.CreateMemCpy(Dest, Src, SizeVal, false);
     return RValue::get(Dest.getPointer());
   }
-
+  case Builtin::BI__builtin_memcpy_inline: {
+    Address Dest = EmitPointerWithAlignment(E->getArg(0));
+    Address Src = EmitPointerWithAlignment(E->getArg(1));
+    uint64_t Size =
+        E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
+    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
+                        E->getArg(0)->getExprLoc(), FD, 0);
+    EmitNonNullArgCheck(RValue::get(Src.getPointer()), E->getArg(1)->getType(),
+                        E->getArg(1)->getExprLoc(), FD, 1);
+    Builder.CreateMemCpyInline(Dest, Src, Size);
+    return RValue::get(nullptr);
+  }
   case Builtin::BI__builtin_char_memchr:
     BuiltinID = Builtin::BI__builtin_memchr;
     break;
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1846,6 +1846,18 @@
       FuncAttrs.addAttribute(llvm::Attribute::NoDuplicate);
     if (TargetDecl->hasAttr<ConvergentAttr>())
      FuncAttrs.addAttribute(llvm::Attribute::Convergent);
+    if (const auto *Attr = TargetDecl->getAttr<NoBuiltinAttr>()) {
+      // TODO: check that function names are valid for the TargetLibraryInfo.
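+      // Each listed name is emitted as a string function attribute of the
+      // form "no-builtin-<name>" (e.g. "no-builtin-memcpy"), which IRBuilder
+      // and the backends can then key off of when lowering calls.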
+      for (const auto &FunctionName : Attr->functionNames()) {
+        SmallString<32> AttributeName;
+        AttributeName += "no-builtin-";
+        AttributeName += FunctionName;
+        FuncAttrs.addAttribute(AttributeName);
+      }
+    }
 
     if (const FunctionDecl *Fn = dyn_cast<FunctionDecl>(TargetDecl)) {
       AddAttributesFromFunctionProtoType(
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1104,6 +1104,30 @@
       cast<NamedDecl>(D), AL.getAttributeSpellingListIndex()));
 }
 
+static void handleNoBuiltin(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (D->hasAttr<NoBuiltinAttr>()) {
+    S.Diag(D->getBeginLoc(), diag::err_attribute_only_once_per_parameter) << AL;
+    return;
+  }
+
+  if (!checkAttributeAtLeastNumArgs(S, AL, 1))
+    return;
+
+  std::vector<StringRef> FunctionNames;
+  for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) {
+    StringRef FunctionName;
+    SourceLocation LiteralLoc;
+    if (!S.checkStringLiteralArgumentAttr(AL, I, FunctionName, &LiteralLoc))
+      return;
+    // TODO: check that FunctionName names a known library function.
+    FunctionNames.push_back(FunctionName);
+  }
+
+  D->addAttr(::new (S.Context) NoBuiltinAttr(
+      AL.getRange(), S.Context, FunctionNames.data(), FunctionNames.size(),
+      AL.getAttributeSpellingListIndex()));
+}
+
 static void handlePassObjectSizeAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   if (D->hasAttr<PassObjectSizeAttr>()) {
     S.Diag(D->getBeginLoc(), diag::err_attribute_only_once_per_parameter) << AL;
@@ -6746,6 +6770,9 @@
   case ParsedAttr::AT_DiagnoseIf:
     handleDiagnoseIfAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_NoBuiltin:
+    handleNoBuiltin(S, D, AL);
+    break;
   case ParsedAttr::AT_ExtVectorType:
     handleExtVectorTypeAttr(S, D, AL);
     break;
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -11347,6 +11347,27 @@
 other operations necessary to locate the TLS area. Not all targets
 support this intrinsic.
 
+'``llvm.memcpy.inline``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* <dest>, i8* <src>,
+                                                     i32 <len>, i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memcpy.inline``' intrinsic copies a block of memory from the
+source location to the destination location. It differs from
+'``llvm.memcpy``' in that ``len`` must be a constant expression and in
+that the lowering is guaranteed not to call an external function.
+
+Semantics:
+""""""""""
+
+The '``llvm.memcpy.inline``' intrinsic copies ``len`` bytes from ``src``
+to ``dest``, which must not overlap, with the same semantics as
+'``llvm.memcpy``'. The copy is always expanded inline: no call to a
+library ``memcpy`` or any other external function is ever emitted.
+
 Standard C Library Intrinsics
 -----------------------------
 
@@ -15154,7 +15175,7 @@
 <t_vector>` of floating point values. This argument must be larger in size
 than the result.
 
-The second and third arguments specify the rounding mode and exception 
+The second and third arguments specify the rounding mode and exception
 behavior as described above.
 
 Semantics:
@@ -15178,7 +15199,7 @@
 Overview:
 """""""""
 
-The '``llvm.experimental.constrained.fpext``' intrinsic extends a 
+The '``llvm.experimental.constrained.fpext``' intrinsic extends a
 floating-point ``value`` to a larger floating-point value.
 
 Arguments:
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -460,6 +460,9 @@
                          MDNode *ScopeTag = nullptr,
                          MDNode *NoAliasTag = nullptr);
 
+  CallInst *CreateMemCpyInline(Value *Dst, unsigned DstAlign, Value *Src,
+                               unsigned SrcAlign, Value *Size);
+
   /// Create and insert an element unordered-atomic memcpy between the
   /// specified pointers.
   ///
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -578,6 +578,7 @@
     static bool classof(const IntrinsicInst *I) {
       switch (I->getIntrinsicID()) {
       case Intrinsic::memcpy:
+      case Intrinsic::memcpy_inline:
       case Intrinsic::memmove:
       case Intrinsic::memset:
         return true;
@@ -606,8 +607,14 @@
   public:
     // Methods for support type inquiry through isa, cast, and dyn_cast:
     static bool classof(const IntrinsicInst *I) {
-      return I->getIntrinsicID() == Intrinsic::memcpy ||
-             I->getIntrinsicID() == Intrinsic::memmove;
+      switch (I->getIntrinsicID()) {
+      case Intrinsic::memcpy:
+      case Intrinsic::memcpy_inline:
+      case Intrinsic::memmove:
+        return true;
+      default:
+        return false;
+      }
     }
     static bool classof(const Value *V) {
       return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
@@ -626,6 +633,18 @@
     }
   };
 
+  /// This class wraps the llvm.memcpy.inline intrinsic.
+  class MemCpyInlineInst : public MemTransferInst {
+  public:
+    // Methods for support type inquiry through isa, cast, and dyn_cast:
+    static bool classof(const IntrinsicInst *I) {
+      return I->getIntrinsicID() == Intrinsic::memcpy_inline;
+    }
+    static bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+  };
+
   /// This class wraps the llvm.memmove intrinsic.
   class MemMoveInst : public MemTransferInst {
   public:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -483,6 +483,13 @@
                                 llvm_i32_ty],
                                []>;
 
+// Memcpy semantics, with a guarantee that the copy is always expanded inline.
+def int_memcpy_inline
+    : Intrinsic<[],
+      [ llvm_anyptr_ty, llvm_anyptr_ty, llvm_anyint_ty, llvm_i1_ty ],
+      [ IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>,
+        ImmArg<2>, ImmArg<3> ]>;
+
 //===------------------- Standard C Library Intrinsics --------------------===//
 //
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5615,8 +5615,11 @@
   case Intrinsic::longjmp:
     lowerCallToExternalSymbol(I, &"_longjmp"[!TLI.usesUnderscoreLongJmp()]);
     return;
+  case Intrinsic::memcpy_inline:
   case Intrinsic::memcpy: {
-    const auto &MCI = cast<MemCpyInst>(I);
+    const auto &MCI = cast<MemTransferInst>(I);
+    assert((isa<MemCpyInst>(I) || isa<MemCpyInlineInst>(I)) &&
+           "must be a memcpy");
     SDValue Op1 = getValue(I.getArgOperand(0));
     SDValue Op2 = getValue(I.getArgOperand(1));
     SDValue Op3 = getValue(I.getArgOperand(2));
@@ -5628,8 +5631,12 @@
     bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
     // FIXME: Support passing different dest/src alignments to the memcpy DAG
    // node.
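+    // A memcpy.inline call must always be expanded inline; the same holds
+    // when the caller disables the memcpy builtin via "no-builtin-memcpy".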
+    bool isAlwaysInline =
+        isa<MemCpyInlineInst>(I) || I.hasFnAttr("no-builtin-memcpy");
     SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
-                               false, isTC,
+                               isAlwaysInline, isTC,
                                MachinePointerInfo(I.getArgOperand(0)),
                                MachinePointerInfo(I.getArgOperand(1)));
     updateDAGForMaybeTailCall(MC);
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -96,6 +96,16 @@
   return II;
 }
 
+/// If the parent function carries the given string attribute (e.g.
+/// "no-builtin-memcpy"), forward it to the newly created call site.
+static void ForwardAttribute(const Function *F, StringRef Attribute,
+                             CallInst *CI) {
+  if (F->hasFnAttribute(Attribute)) {
+    CI->addAttribute(AttributeList::FunctionIndex,
+                     F->getFnAttribute(Attribute));
+  }
+}
+
 CallInst *IRBuilderBase::
 CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
              bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
@@ -103,7 +113,8 @@
   Ptr = getCastedInt8PtrValue(Ptr);
   Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)};
   Type *Tys[] = { Ptr->getType(), Size->getType() };
-  Module *M = BB->getParent()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
   Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
@@ -121,6 +132,8 @@
   if (NoAliasTag)
     CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
 
+  ForwardAttribute(F, "no-builtin-memset", CI);
+
   return CI;
 }
 
@@ -165,7 +178,8 @@
 
   Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
   Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
-  Module *M = BB->getParent()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
   Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
@@ -190,6 +204,36 @@
   if (NoAliasTag)
     CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
 
+  ForwardAttribute(F, "no-builtin-memcpy", CI);
+
+  return CI;
+}
+
+CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, unsigned DstAlign,
+                                            Value *Src, unsigned SrcAlign,
+                                            Value *Size) {
+  assert((DstAlign == 0 || isPowerOf2_32(DstAlign)) &&
+         "Must be 0 or a power of 2");
+  assert((SrcAlign == 0 || isPowerOf2_32(SrcAlign)) &&
+         "Must be 0 or a power of 2");
+  Dst = getCastedInt8PtrValue(Dst);
+  Src = getCastedInt8PtrValue(Src);
+  Value *IsVolatile = getInt1(false);
+
+  Value *Ops[] = {Dst, Src, Size, IsVolatile};
+  Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
+  Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy_inline, Tys);
+
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+  auto *MCI = cast<MemCpyInlineInst>(CI);
+  if (DstAlign > 0)
+    MCI->setDestAlignment(DstAlign);
+  if (SrcAlign > 0)
+    MCI->setSourceAlignment(SrcAlign);
+
   return CI;
 }
 
@@ -245,7 +289,8 @@
 
   Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
   Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
-  Module *M = BB->getParent()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
   Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memmove, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
@@ -266,6 +311,8 @@
   if (NoAliasTag)
     CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
 
+  ForwardAttribute(F, "no-builtin-memmove", CI);
+
   return CI;
 }
 
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -314,5 +314,9 @@
                                 Size.getValueType(), Align,
                                isVolatile, AlwaysInline, DstPtrInfo, SrcPtrInfo);
+
+  // When inlining is mandatory, lower runtime-sized copies with "rep movsb".
+  if (AlwaysInline)
+    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
 
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/X86/memcpy-inline.ll b/llvm/test/CodeGen/X86/memcpy-inline.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/memcpy-inline.ll
@@ -0,0 +1,14 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s
+
+declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind
+
+define void @test1(i8* %a, i8* %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movq (%rsi), %rax
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    retq
+  tail call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* %a, i8* %b, i64 8, i1 0)
+  ret void
+}
diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
--- a/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -7,7 +7,7 @@
 
 ; Variable memcpy's should lower to calls.
-define i8* @test1(i8* %a, i8* %b, i64 %n) nounwind {
+define void @test1(i8* %a, i8* %b, i64 %n) nounwind {
 ; LINUX-LABEL: test1:
 ; LINUX:       # %bb.0: # %entry
 ; LINUX-NEXT:    jmp memcpy # TAILCALL
@@ -17,11 +17,11 @@
 ; DARWIN-NEXT:    jmp _memcpy ## TAILCALL
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %n, i1 0)
-  ret i8* %a
+  ret void
 }
 
 ; Variable memcpy's should lower to calls.
-define i8* @test2(i64* %a, i64* %b, i64 %n) nounwind {
+define void @test2(i64* %a, i64* %b, i64 %n) nounwind {
 ; LINUX-LABEL: test2:
 ; LINUX:       # %bb.0: # %entry
 ; LINUX-NEXT:    jmp memcpy # TAILCALL
@@ -33,7 +33,25 @@
   %tmp14 = bitcast i64* %a to i8*
   %tmp25 = bitcast i64* %b to i8*
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp14, i8* align 8 %tmp25, i64 %n, i1 0)
-  ret i8* %tmp14
+  ret void
+}
+
+; Variable-length memcpys with "no-builtin-memcpy" should lower to "rep movsb".
+define void @memcpy_no_runtime(i8* %a, i8* %b, i64 %n) nounwind {
+; LINUX-LABEL: memcpy_no_runtime:
+; LINUX:       # %bb.0: # %entry
+; LINUX-NEXT:    movq %rdx, %rcx
+; LINUX-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; LINUX-NEXT:    retq
+;
+; DARWIN-LABEL: memcpy_no_runtime:
+; DARWIN:       ## %bb.0: ## %entry
+; DARWIN-NEXT:    movq %rdx, %rcx
+; DARWIN-NEXT:    rep;movsb (%rsi), %es:(%rdi)
+; DARWIN-NEXT:    retq
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %n, i1 0) "no-builtin-memcpy"
+  ret void
 }
 
 ; Large constant memcpy's should lower to a call when optimizing for size.
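diff --git a/clang/test/CodeGen/builtin-memcpy-inline.c b/clang/test/CodeGen/builtin-memcpy-inline.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/builtin-memcpy-inline.c
@@ -0,0 +1,11 @@
+// Illustrative sketch (not part of the original patch): how the new
+// __builtin_memcpy_inline surfaces in IR. The file name and CHECK lines are
+// assumptions; the builtin requires a constant size and lowers to the
+// llvm.memcpy.inline intrinsic exercised by the llc test above.
+// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: @copy8(
+void copy8(char *dst, const char *src) {
+  // CHECK: call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* {{.*}}, i8* {{.*}}, i64 8, i1 false)
+  __builtin_memcpy_inline(dst, src, 8);
+}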