diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -408,6 +408,7 @@
 def TargetX86 : TargetArch<["x86"]>;
 def TargetAnyX86 : TargetArch<["x86", "x86_64"]>;
 def TargetWebAssembly : TargetArch<["wasm32", "wasm64"]>;
+def TargetNVPTX : TargetArch<["nvptx", "nvptx64"]>;
 def TargetWindows : TargetSpec {
   let OSes = ["Win32"];
 }
@@ -1194,10 +1195,9 @@
 def : MutualExclusions<[CUDADeviceBuiltinSurfaceType,
                         CUDADeviceBuiltinTextureType]>;
 
-def CUDAGlobal : InheritableAttr {
-  let Spellings = [GNU<"global">, Declspec<"__global__">];
+def CUDAGlobal : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
+  let Spellings = [GNU<"global">, Declspec<"__global__">, Clang<"nvptx_kernel">];
   let Subjects = SubjectList<[Function]>;
-  let LangOpts = [CUDA];
   let Documentation = [Undocumented];
 }
 def : MutualExclusions<[CUDADevice, CUDAGlobal]>;
@@ -1225,10 +1225,9 @@
   let Documentation = [InternalOnly];
 }
 
-def CUDALaunchBounds : InheritableAttr {
-  let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">];
+def CUDALaunchBounds : InheritableAttr, TargetSpecificAttr<TargetNVPTX> {
+  let Spellings = [GNU<"launch_bounds">, Declspec<"__launch_bounds__">, Clang<"nvptx_launch_bounds">];
   let Args = [ExprArgument<"MaxThreads">, ExprArgument<"MinBlocks", 1>];
-  let LangOpts = [CUDA];
   let Subjects = SubjectList<[ObjCMethod, FunctionLike]>;
   // An AST node is created for this attribute, but is not used by other parts
   // of the compiler. However, this node needs to exist in the AST because
diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1411,7 +1411,7 @@
     if (FD &&
         FD->getType()->castAs<FunctionType>()->getCallConv() == CC_X86RegCall) {
       Out << "__regcall3__" << II->getName();
-    } else if (FD && FD->hasAttr<CUDAGlobalAttr>() &&
+    } else if (FD && CGM.getLangOpts().CUDA && FD->hasAttr<CUDAGlobalAttr>() &&
                GD.getKernelReferenceKind() == KernelReferenceKind::Stub) {
       Out << "__device_stub__" << II->getName();
     } else {
@@ -1547,6 +1547,7 @@
   // device-mangling in host compilation could help catching certain ones.
   assert(!isa<FunctionDecl>(ND) || !ND->hasAttr<CUDAGlobalAttr>() ||
          getContext().shouldExternalize(ND) || getLangOpts().CUDAIsDevice ||
+         !getLangOpts().CUDA ||
          (getContext().getAuxTargetInfo() &&
           (getContext().getAuxTargetInfo()->getCXXABI() !=
            getContext().getTargetInfo().getCXXABI())) ||
diff --git a/clang/lib/CodeGen/TargetInfo.cpp b/clang/lib/CodeGen/TargetInfo.cpp
--- a/clang/lib/CodeGen/TargetInfo.cpp
+++ b/clang/lib/CodeGen/TargetInfo.cpp
@@ -7328,32 +7328,29 @@
     }
   }
 
-  // Perform special handling in CUDA mode.
-  if (M.getLangOpts().CUDA) {
-    // CUDA __global__ functions get a kernel metadata entry. Since
-    // __global__ functions cannot be called from the device, we do not
-    // need to set the noinline attribute.
-    if (FD->hasAttr<CUDAGlobalAttr>()) {
-      // Create !{<func-ref>, metadata !"kernel", i32 1} node
-      addNVVMMetadata(F, "kernel", 1);
-    }
-    if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>()) {
-      // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
-      llvm::APSInt MaxThreads(32);
-      MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(M.getContext());
-      if (MaxThreads > 0)
-        addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue());
-
-      // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
-      // not specified in __launch_bounds__ or if the user specified a 0 value,
-      // we don't have to add a PTX directive.
-      if (Attr->getMinBlocks()) {
-        llvm::APSInt MinBlocks(32);
-        MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(M.getContext());
-        if (MinBlocks > 0)
-          // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
-          addNVVMMetadata(F, "minctasm", MinBlocks.getExtValue());
-      }
+  // CUDA __global__ functions get a kernel metadata entry. Since
+  // __global__ functions cannot be called from the device, we do not
+  // need to set the noinline attribute.
+  if (FD->hasAttr<CUDAGlobalAttr>()) {
+    // Create !{<func-ref>, metadata !"kernel", i32 1} node
+    addNVVMMetadata(F, "kernel", 1);
+  }
+  if (CUDALaunchBoundsAttr *Attr = FD->getAttr<CUDALaunchBoundsAttr>()) {
+    // Create !{<func-ref>, metadata !"maxntidx", i32 <val>} node
+    llvm::APSInt MaxThreads(32);
+    MaxThreads = Attr->getMaxThreads()->EvaluateKnownConstInt(M.getContext());
+    if (MaxThreads > 0)
+      addNVVMMetadata(F, "maxntidx", MaxThreads.getExtValue());
+
+    // min blocks is an optional argument for CUDALaunchBoundsAttr. If it was
+    // not specified in __launch_bounds__ or if the user specified a 0 value,
+    // we don't have to add a PTX directive.
+    if (Attr->getMinBlocks()) {
+      llvm::APSInt MinBlocks(32);
+      MinBlocks = Attr->getMinBlocks()->EvaluateKnownConstInt(M.getContext());
+      if (MinBlocks > 0)
+        // Create !{<func-ref>, metadata !"minctasm", i32 <val>} node
+        addNVVMMetadata(F, "minctasm", MinBlocks.getExtValue());
     }
   }
 }
diff --git a/clang/test/CodeGen/nvptx_attributes.c b/clang/test/CodeGen/nvptx_attributes.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/nvptx_attributes.c
@@ -0,0 +1,51 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals
+// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -target-cpu sm_61 -emit-llvm %s -o - | FileCheck %s
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@device
+// CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    ret i32 1
+//
+int device() {return 1;};
+
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@foo
+// CHECK-SAME: (ptr noundef [[RET:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RET_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[RET]], ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @device()
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[CALL]], ptr [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+__attribute__((nvptx_kernel)) void foo(int *ret) {
+  *ret = device();
+}
+
+// CHECK: Function Attrs: noinline nounwind optnone
+// CHECK-LABEL: define {{[^@]+}}@bar
+// CHECK-SAME: (ptr noundef [[RET:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[RET_ADDR:%.*]] = alloca ptr, align 8
+// CHECK-NEXT:    store ptr [[RET]], ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    [[CALL:%.*]] = call i32 @device()
+// CHECK-NEXT:    [[TMP0:%.*]] = load ptr, ptr [[RET_ADDR]], align 8
+// CHECK-NEXT:    store i32 [[CALL]], ptr [[TMP0]], align 4
+// CHECK-NEXT:    ret void
+//
+__attribute__((nvptx_kernel, nvptx_launch_bounds(1, 128))) void bar(int *ret) {
+  *ret = device();
+}
+
+
+//.
+// CHECK: attributes #0 = { noinline nounwind optnone "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="sm_61" "target-features"="+ptx32,+sm_61" }
+//.
+// CHECK: !0 = !{ptr @foo, !"kernel", i32 1}
+// CHECK: !1 = !{ptr @bar, !"kernel", i32 1}
+// CHECK: !2 = !{ptr @bar, !"maxntidx", i32 1}
+// CHECK: !3 = !{ptr @bar, !"minctasm", i32 128}
+// CHECK: !4 = !{i32 1, !"wchar_size", i32 4}
+// CHECK: !5 = !{!"clang version 16.0.0"}
+//.