diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -3237,3 +3237,10 @@
   let Subjects = SubjectList<[NonParmVar, Function, Block, ObjCMethod]>;
   let Documentation = [ObjCExternallyRetainedDocs];
 }
+
+def NoRuntimeFor : InheritableAttr {
+  let Spellings = [Clang<"no_runtime_for">];
+  let Args = [VariadicStringArgument<"FunctionNames">];
+  let Subjects = SubjectList<[Function]>;
+  let Documentation = [NoRuntimeForDocs];
+}
diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td
--- a/clang/include/clang/Basic/AttrDocs.td
+++ b/clang/include/clang/Basic/AttrDocs.td
@@ -3738,7 +3738,7 @@
 def WebAssemblyImportModuleDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
-Clang supports the ``__attribute__((import_module(<module_name>)))`` 
+Clang supports the ``__attribute__((import_module(<module_name>)))``
 attribute for the WebAssembly target. This attribute may be attached to a
 function declaration, where it modifies how the symbol is to be imported
 within the WebAssembly linking environment.
@@ -3755,7 +3755,7 @@
 def WebAssemblyImportNameDocs : Documentation {
   let Category = DocCatFunction;
   let Content = [{
-Clang supports the ``__attribute__((import_name(<name>)))`` 
+Clang supports the ``__attribute__((import_name(<name>)))``
 attribute for the WebAssembly target. This attribute may be attached to a
 function declaration, where it modifies how the symbol is to be imported
 within the WebAssembly linking environment.
@@ -3926,7 +3926,7 @@
 (`start_routine`) is called zero or more times by the `pthread_create` function,
 and that the fourth parameter (`arg`) is passed along. Note that the callback
 behavior of `pthread_create` is automatically recognized by Clang. In addition,
-the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for 
+the declarations of `__kmpc_fork_teams` and `__kmpc_fork_call`, generated for
 `#pragma omp target teams` and `#pragma omp parallel`, respectively, are also
 automatically recognized as broker functions. Further functions might be added
 in the future.
@@ -4105,3 +4105,31 @@
 ``__attribute__((malloc))``.
   }];
 }
+
+def NoRuntimeForDocs : Documentation {
+  let Category = DocCatFunction;
+  let Content = [{
+The ``no_runtime_for`` attribute prevents the compiler from synthesizing calls
+to the listed runtime functions.
+It is useful when implementing runtime functions such as ``memcpy`` or the
+Objective-C runtime, where the compiler could otherwise replace the
+implementation with a call into the runtime library: a chicken-and-egg problem.
+
+.. code-block:: c++
+
+  extern "C" void *memcpy(char *dst, const char *src, size_t count)
+      __attribute__((no_runtime_for("memcpy"))) {
+    char *const ret = dst; // memcpy must return the original destination.
+  #pragma clang loop vectorize(enable) interleave(enable) unroll(disable)
+    for (; count >= 4; count -= 4, dst += 4, src += 4)
+      __builtin_memcpy(dst, src, 4);
+    switch (count) {
+    case 1: __builtin_memcpy(dst, src, 1); break;
+    case 2: __builtin_memcpy(dst, src, 2); break;
+    case 3: __builtin_memcpy(dst, src, 3); break;
+    default: break;
+    }
+    return ret;
+  }
+  }];
+}
diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp
--- a/clang/lib/CodeGen/CGCall.cpp
+++ b/clang/lib/CodeGen/CGCall.cpp
@@ -1846,6 +1846,17 @@
       FuncAttrs.addAttribute(llvm::Attribute::NoDuplicate);
     if (TargetDecl->hasAttr<ConvergentAttr>())
       FuncAttrs.addAttribute(llvm::Attribute::Convergent);
+    if (const auto *Attr = TargetDecl->getAttr<NoRuntimeForAttr>()) {
+      // Sort and deduplicate the names before joining them with ','.
+      llvm::SmallVector<StringRef, 4> FunctionNames(Attr->functionNames_begin(),
+                                                    Attr->functionNames_end());
+      llvm::sort(FunctionNames);
+      FunctionNames.erase(
+          std::unique(FunctionNames.begin(), FunctionNames.end()),
+          FunctionNames.end());
+      FuncAttrs.addAttribute("no_runtime_for",
+                             llvm::join(FunctionNames, ","));
+    }
 
     if (const FunctionDecl *Fn = dyn_cast<FunctionDecl>(TargetDecl)) {
       AddAttributesFromFunctionProtoType(
diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp
--- a/clang/lib/Sema/SemaDeclAttr.cpp
+++ b/clang/lib/Sema/SemaDeclAttr.cpp
@@ -1104,6 +1104,34 @@
       cast<NamedDecl>(D), AL.getAttributeSpellingListIndex()));
 }
 
+/// Handle the 'no_runtime_for' attribute: collect its string-literal arguments
+/// and attach them to \p D as a NoRuntimeForAttr.
+static void handleNoRuntimeFor(Sema &S, Decl *D, const ParsedAttr &AL) {
+  if (D->hasAttr<NoRuntimeForAttr>()) {
+    // NOTE(review): this diagnostic's name refers to parameters; confirm it is
+    // the intended one for an attribute whose subject is a function.
+    S.Diag(D->getBeginLoc(), diag::err_attribute_only_once_per_parameter) << AL;
+    return;
+  }
+
+  if (!checkAttributeAtLeastNumArgs(S, AL, 1))
+    return;
+
+  std::vector<StringRef> FunctionNames;
+  for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) {
+    StringRef FunctionName;
+    SourceLocation LiteralLoc;
+    if (!S.checkStringLiteralArgumentAttr(AL, I, FunctionName, &LiteralLoc))
+      return;
+    // TODO: validate FunctionName; it is currently accepted unchecked.
+    FunctionNames.push_back(FunctionName);
+  }
+
+  D->addAttr(::new (S.Context) NoRuntimeForAttr(
+      AL.getRange(), S.Context, FunctionNames.data(), FunctionNames.size(),
+      AL.getAttributeSpellingListIndex()));
+}
+
 static void handlePassObjectSizeAttr(Sema &S, Decl *D, const ParsedAttr &AL) {
   if (D->hasAttr<PassObjectSizeAttr>()) {
     S.Diag(D->getBeginLoc(), diag::err_attribute_only_once_per_parameter) << AL;
@@ -6734,6 +6762,9 @@
   case ParsedAttr::AT_DiagnoseIf:
     handleDiagnoseIfAttr(S, D, AL);
     break;
+  case ParsedAttr::AT_NoRuntimeFor:
+    handleNoRuntimeFor(S, D, AL);
+    break;
   case ParsedAttr::AT_ExtVectorType:
     handleExtVectorTypeAttr(S, D, AL);
     break;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5547,6 +5547,18 @@
   }
 }
 
+/// Return true if the call \p I carries a "no_runtime_for" attribute whose
+/// comma-separated list of names contains \p FunctionName.
+static bool HasNoRuntimeAttribute(const CallInst &I, StringRef FunctionName) {
+  if (!I.hasFnAttr("no_runtime_for"))
+    return false;
+  SmallVector<StringRef, 4> Names;
+  I.getAttribute(AttributeList::FunctionIndex, "no_runtime_for")
+      .getValueAsString()
+      .split(Names, ",");
+  return is_contained(Names, FunctionName);
+}
+
 /// Lower the call to the specified intrinsic function. If we want to emit this
 /// as a call to a named external function, return the name. Otherwise, lower it
 /// and return null.
@@ -5622,8 +5634,9 @@
     bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
     // FIXME: Support passing different dest/src alignments to the memcpy DAG
     // node.
+    bool isAlwaysInline = HasNoRuntimeAttribute(I, "memcpy");
     SDValue MC = DAG.getMemcpy(getRoot(), sdl, Op1, Op2, Op3, Align, isVol,
-                               false, isTC,
+                               isAlwaysInline, isTC,
                                MachinePointerInfo(I.getArgOperand(0)),
                                MachinePointerInfo(I.getArgOperand(1)));
     updateDAGForMaybeTailCall(MC);
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -96,6 +96,27 @@
   return II;
 }
 
+/// If \p F carries a "no_runtime_for" attribute whose comma-separated list
+/// contains exactly \p FunctionName, copy that attribute onto the call \p CI.
+static void ForwardNoRuntimeAttribute(const Function *F,
+                                      StringRef FunctionName,
+                                      CallInst *CI) {
+  if (!F->hasFnAttribute("no_runtime_for"))
+    return;
+  const Attribute A = F->getFnAttribute("no_runtime_for");
+  // Compare whole list entries; a substring test would let "memcpy" match
+  // an unrelated entry such as "wmemcpy".
+  StringRef Remaining = A.getValueAsString();
+  while (!Remaining.empty()) {
+    const auto Split = Remaining.split(',');
+    if (Split.first == FunctionName) {
+      CI->addAttribute(AttributeList::FunctionIndex, A);
+      return;
+    }
+    Remaining = Split.second;
+  }
+}
+
 CallInst *IRBuilderBase::
 CreateMemSet(Value *Ptr, Value *Val, Value *Size, unsigned Align,
              bool isVolatile, MDNode *TBAATag, MDNode *ScopeTag,
@@ -103,7 +124,8 @@
   Ptr = getCastedInt8PtrValue(Ptr);
   Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)};
   Type *Tys[] = { Ptr->getType(), Size->getType() };
-  Module *M = BB->getParent()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
   Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
@@ -121,6 +143,8 @@
   if (NoAliasTag)
     CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
 
+  ForwardNoRuntimeAttribute(F, "memset", CI);
+
   return CI;
 }
 
@@ -165,7 +189,8 @@
 
   Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
   Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
-  Module *M = BB->getParent()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
   Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
@@ -190,6 +215,8 @@
   if (NoAliasTag)
     CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
 
+  ForwardNoRuntimeAttribute(F, "memcpy", CI);
+
   return CI;
 }
 
@@ -245,7 +272,8 @@
 
   Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
   Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
-  Module *M = BB->getParent()->getParent();
+  Function *F = BB->getParent();
+  Module *M = F->getParent();
   Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memmove, Tys);
 
   CallInst *CI = createCallHelper(TheFn, Ops, this);
@@ -266,6 +294,8 @@
   if (NoAliasTag)
     CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
 
+  ForwardNoRuntimeAttribute(F, "memmove", CI);
+
   return CI;
 }
 
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -314,5 +314,9 @@
                             Size.getValueType(), Align, isVolatile,
                             AlwaysInline, DstPtrInfo, SrcPtrInfo);
 
+  // Handle runtime sizes through rep movsb when AlwaysInline is requested.
+  if (AlwaysInline)
+    return emitRepmovs(Subtarget, DAG, dl, Chain, Dst, Src, Size, MVT::i8);
+
   return SDValue();
 }
diff --git a/llvm/test/CodeGen/X86/memcpy.ll b/llvm/test/CodeGen/X86/memcpy.ll
--- a/llvm/test/CodeGen/X86/memcpy.ll
+++ b/llvm/test/CodeGen/X86/memcpy.ll
@@ -7,7 +7,7 @@
 
 
 ; Variable memcpy's should lower to calls.
-define i8* @test1(i8* %a, i8* %b, i64 %n) nounwind {
+define void @test1(i8* %a, i8* %b, i64 %n) nounwind {
 ; LINUX-LABEL: test1:
 ; LINUX: # %bb.0: # %entry
 ; LINUX-NEXT: jmp memcpy # TAILCALL
@@ -17,11 +17,11 @@
 ; DARWIN-NEXT: jmp _memcpy ## TAILCALL
 entry:
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %n, i1 0 )
-  ret i8* %a
+  ret void
 }
 
 ; Variable memcpy's should lower to calls.
-define i8* @test2(i64* %a, i64* %b, i64 %n) nounwind {
+define void @test2(i64* %a, i64* %b, i64 %n) nounwind {
 ; LINUX-LABEL: test2:
 ; LINUX: # %bb.0: # %entry
 ; LINUX-NEXT: jmp memcpy # TAILCALL
@@ -33,7 +33,25 @@
   %tmp14 = bitcast i64* %a to i8*
   %tmp25 = bitcast i64* %b to i8*
   tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 %tmp14, i8* align 8 %tmp25, i64 %n, i1 0 )
-  ret i8* %tmp14
+  ret void
+}
+
+; Variable length memcpy's with disabled runtime should lower to rep movsb.
+define void @memcpy_no_runtime(i8* %a, i8* %b, i64 %n) nounwind {
+; LINUX-LABEL: memcpy_no_runtime:
+; LINUX: # %bb.0: # %entry
+; LINUX-NEXT: movq %rdx, %rcx
+; LINUX-NEXT: rep;movsb (%rsi), %es:(%rdi)
+; LINUX-NEXT: retq
+;
+; DARWIN-LABEL: memcpy_no_runtime:
+; DARWIN: ## %bb.0: ## %entry
+; DARWIN-NEXT: movq %rdx, %rcx
+; DARWIN-NEXT: rep;movsb (%rsi), %es:(%rdi)
+; DARWIN-NEXT: retq
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* %b, i64 %n, i1 0 ) "no_runtime_for" = "memcpy"
+  ret void
 }
 
 ; Large constant memcpy's should lower to a call when optimizing for size.