Index: clang/include/clang/Sema/Sema.h =================================================================== --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -4746,8 +4746,13 @@ bool isValidPointerAttrType(QualType T, bool RefOkay = false); bool CheckRegparmAttr(const ParsedAttr &attr, unsigned &value); + + /// Check validaty of calling convention attribute \p attr. If \p FD + /// is not null pointer, use \p FD to determine the host/device target. + /// Otherwise, use \p D to determiine the host/device target. bool CheckCallingConvAttr(const ParsedAttr &attr, CallingConv &CC, - const FunctionDecl *FD = nullptr); + const FunctionDecl *FD = nullptr, + const Declarator *D = nullptr); bool CheckAttrTarget(const ParsedAttr &CurrAttr); bool CheckAttrNoArgs(const ParsedAttr &CurrAttr); bool checkStringLiteralArgumentAttr(const AttributeCommonInfo &CI, @@ -13252,6 +13257,27 @@ /// Determines whether the given variable is emitted on host or device side. CUDAVariableTarget IdentifyCUDATarget(const VarDecl *D); + /// Defines different CUDA host/device contexts where a function may be + /// called. + enum CUDATargetContextKind { + CTCK_Unknown, /// Unknown context + CTCK_InitGlobalVar, /// Function called during global variable + /// initialization + }; + + struct CUDATargetContext { + CUDAFunctionTarget Target = CFT_HostDevice; + CUDATargetContextKind Kind = CTCK_Unknown; + Decl *D = nullptr; + } CurCUDATargetCtx; + + struct CUDATargetContextRAII { + Sema &S; + CUDATargetContext SavedCtx; + CUDATargetContextRAII(Sema &S_, CUDATargetContextKind K, Decl *D); + ~CUDATargetContextRAII() { S.CurCUDATargetCtx = SavedCtx; } + }; + /// Gets the CUDA target for the current context. CUDAFunctionTarget CurrentCUDATarget() { return IdentifyCUDATarget(dyn_cast(CurContext)); Index: clang/lib/Parse/ParseDecl.cpp =================================================================== --- clang/lib/Parse/ParseDecl.cpp +++ clang/lib/Parse/ParseDecl.cpp @@ -2498,6 +2498,7 @@ } } + Sema::CUDATargetContextRAII X(Actions, Sema::CTCK_InitGlobalVar, ThisDecl); switch (TheInitKind) { // Parse declarator '=' initializer. case InitKind::Equal: { Index: clang/lib/Sema/SemaCUDA.cpp =================================================================== --- clang/lib/Sema/SemaCUDA.cpp +++ clang/lib/Sema/SemaCUDA.cpp @@ -105,19 +105,38 @@ } template -static bool hasAttr(const FunctionDecl *D, bool IgnoreImplicitAttr) { +static bool hasAttr(const Decl *D, bool IgnoreImplicitAttr) { return D->hasAttrs() && llvm::any_of(D->getAttrs(), [&](Attr *Attribute) { return isa(Attribute) && !(IgnoreImplicitAttr && Attribute->isImplicit()); }); } +Sema::CUDATargetContextRAII::CUDATargetContextRAII(Sema &S_, + CUDATargetContextKind K, + Decl *D) + : S(S_) { + SavedCtx = S.CurCUDATargetCtx; + assert(K == CTCK_InitGlobalVar); + auto *VD = dyn_cast_or_null(D); + if (VD && VD->hasGlobalStorage() && !VD->isStaticLocal()) { + auto Target = CFT_Host; + if ((hasAttr(VD, /*IgnoreImplicit=*/true) && + !hasAttr(VD, /*IgnoreImplicit=*/true)) || + hasAttr(VD, /*IgnoreImplicit=*/true) || + hasAttr(VD, /*IgnoreImplicit=*/true)) + Target = CFT_Device; + S.CurCUDATargetCtx = {Target, K, VD}; + } +} + /// IdentifyCUDATarget - Determine the CUDA compilation target for this function Sema::CUDAFunctionTarget Sema::IdentifyCUDATarget(const FunctionDecl *D, bool IgnoreImplicitHDAttr) { - // Code that lives outside a function is run on the host. - if (D == nullptr) - return CFT_Host; + // Code that lives outside a function gets the target from CurCUDATargetCtx. + if (D == nullptr) { + return CurCUDATargetCtx.Target; + } if (D->hasAttr()) return CFT_InvalidTarget; Index: clang/lib/Sema/SemaDeclAttr.cpp =================================================================== --- clang/lib/Sema/SemaDeclAttr.cpp +++ clang/lib/Sema/SemaDeclAttr.cpp @@ -5307,7 +5307,7 @@ } bool Sema::CheckCallingConvAttr(const ParsedAttr &Attrs, CallingConv &CC, - const FunctionDecl *FD) { + const FunctionDecl *FD, const Declarator *D) { if (Attrs.isInvalid()) return true; @@ -5406,7 +5406,9 @@ // on their host/device attributes. if (LangOpts.CUDA) { auto *Aux = Context.getAuxTargetInfo(); - auto CudaTarget = IdentifyCUDATarget(FD); + assert(FD || D); + auto CudaTarget = + FD ? IdentifyCUDATarget(FD) : IdentifyCUDATarget(D->getAttributes()); bool CheckHost = false, CheckDevice = false; switch (CudaTarget) { case CFT_HostDevice: Index: clang/lib/Sema/SemaOverload.cpp =================================================================== --- clang/lib/Sema/SemaOverload.cpp +++ clang/lib/Sema/SemaOverload.cpp @@ -6699,17 +6699,19 @@ } // (CUDA B.1): Check for invalid calls between targets. - if (getLangOpts().CUDA) - if (const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true)) - // Skip the check for callers that are implicit members, because in this - // case we may not yet know what the member's target is; the target is - // inferred for the member automatically, based on the bases and fields of - // the class. - if (!Caller->isImplicit() && !IsAllowedCUDACall(Caller, Function)) { - Candidate.Viable = false; - Candidate.FailureKind = ovl_fail_bad_target; - return; - } + if (getLangOpts().CUDA) { + const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true); + // Skip the check for callers that are implicit members, because in this + // case we may not yet know what the member's target is; the target is + // inferred for the member automatically, based on the bases and fields of + // the class. + if (!(Caller && Caller->isImplicit()) && + !IsAllowedCUDACall(Caller, Function)) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_bad_target; + return; + } + } if (Function->getTrailingRequiresClause()) { ConstraintSatisfaction Satisfaction; @@ -7221,12 +7223,11 @@ // (CUDA B.1): Check for invalid calls between targets. if (getLangOpts().CUDA) - if (const FunctionDecl *Caller = getCurFunctionDecl(/*AllowLambda=*/true)) - if (!IsAllowedCUDACall(Caller, Method)) { - Candidate.Viable = false; - Candidate.FailureKind = ovl_fail_bad_target; - return; - } + if (!IsAllowedCUDACall(getCurFunctionDecl(/*AllowLambda=*/true), Method)) { + Candidate.Viable = false; + Candidate.FailureKind = ovl_fail_bad_target; + return; + } if (Method->getTrailingRequiresClause()) { ConstraintSatisfaction Satisfaction; @@ -12497,10 +12498,12 @@ return false; if (FunctionDecl *FunDecl = dyn_cast(Fn)) { - if (S.getLangOpts().CUDA) - if (FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true)) - if (!Caller->isImplicit() && !S.IsAllowedCUDACall(Caller, FunDecl)) - return false; + if (S.getLangOpts().CUDA) { + FunctionDecl *Caller = S.getCurFunctionDecl(/*AllowLambda=*/true); + if (!(Caller && Caller->isImplicit()) && + !S.IsAllowedCUDACall(Caller, FunDecl)) + return false; + } if (FunDecl->isMultiVersion()) { const auto *TA = FunDecl->getAttr(); if (TA && !TA->isDefaultVersion()) Index: clang/lib/Sema/SemaType.cpp =================================================================== --- clang/lib/Sema/SemaType.cpp +++ clang/lib/Sema/SemaType.cpp @@ -4055,7 +4055,7 @@ // function type. We'll diagnose the failure to apply them in // handleFunctionTypeAttr. CallingConv CC; - if (!S.CheckCallingConvAttr(AL, CC) && + if (!S.CheckCallingConvAttr(AL, CC, /*FunctionDecl=*/nullptr, &D) && (!FTI.isVariadic || supportsVariadicCall(CC))) { return CC; } Index: clang/test/CodeGenCUDA/global-initializers.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/global-initializers.cu @@ -0,0 +1,55 @@ +// RUN: %clang_cc1 %s -triple x86_64-linux-unknown -emit-llvm -o - \ +// RUN: | FileCheck -check-prefix=HOST %s +// RUN: %clang_cc1 %s -fcuda-is-device \ +// RUN: -emit-llvm -o - -triple nvptx64 \ +// RUN: -aux-triple x86_64-unknown-linux-gnu | FileCheck \ +// RUN: -check-prefix=DEV %s + +#include "Inputs/cuda.h" + +// Check host/device-based overloding resolution in global variable initializer. +template +T pow(T, U) { return 1.0; } + +__device__ double pow(double, int) { return 2.0; } + +// HOST-DAG: call {{.*}}double @_Z3powIdiET_S0_T0_(double noundef 1.000000e+00, i32 noundef 1) +double X = pow(1.0, 1); + +template +constexpr T cpow(T, U) { return 11.0; } + +constexpr __device__ double cpow(double, int) { return 12.0; } + +// HOST-DAG: @CX = global double 1.100000e+01 +double CX = cpow(11.0, 1); + +// DEV-DAG: @CY = addrspace(1) externally_initialized global double 1.200000e+01 +__device__ double CY = cpow(12.0, 1); + +struct A { + template + T pow(T, U) { return 3.0; } + + __device__ double pow(double, int) { return 4.0; } +}; + +A a; + +// HOST-DAG: call {{.*}}double @_ZN1A3powIdiEET_S1_T0_(ptr {{.*}}@a, double noundef 3.000000e+00, i32 noundef 1) +double AX = a.pow(3.0, 1); + +struct CA { + template + constexpr T cpow(T, U) const { return 13.0; } + + constexpr __device__ double cpow(double, int) const { return 14.0; } +}; + +const CA ca; + +// HOST-DAG: @CAX = global double 1.300000e+01 +double CAX = ca.cpow(13.0, 1); + +// DEV-DAG: @CAY = addrspace(1) externally_initialized global double 1.400000e+01 +__device__ double CAY = ca.cpow(14.0, 1); Index: clang/test/SemaCUDA/amdgpu-windows-vectorcall.cu =================================================================== --- clang/test/SemaCUDA/amdgpu-windows-vectorcall.cu +++ clang/test/SemaCUDA/amdgpu-windows-vectorcall.cu @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-pc-windows-msvc -fms-compatibility -fcuda-is-device -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-pc-windows-msvc -fms-compatibility -fsyntax-only -verify %s __cdecl void hostf1(); __vectorcall void (*hostf2)() = hostf1; // expected-error {{cannot initialize a variable of type 'void ((*))() __attribute__((vectorcall))' with an lvalue of type 'void () __attribute__((cdecl))'}} Index: clang/test/SemaCUDA/function-overload.cu =================================================================== --- clang/test/SemaCUDA/function-overload.cu +++ clang/test/SemaCUDA/function-overload.cu @@ -222,7 +222,13 @@ // Test for address of overloaded function resolution in the global context. HostFnPtr fp_h = h; HostFnPtr fp_ch = ch; +#if defined (__CUDA_ARCH__) +__device__ +#endif CurrentFnPtr fp_dh = dh; +#if defined (__CUDA_ARCH__) +__device__ +#endif CurrentFnPtr fp_cdh = cdh; GlobalFnPtr fp_g = g; Index: clang/test/SemaCUDA/global-initializers-host.cu =================================================================== --- clang/test/SemaCUDA/global-initializers-host.cu +++ /dev/null @@ -1,32 +0,0 @@ -// RUN: %clang_cc1 %s --std=c++11 -triple x86_64-linux-unknown -fsyntax-only -o - -verify - -#include "Inputs/cuda.h" - -// Check that we get an error if we try to call a __device__ function from a -// module initializer. - -struct S { - __device__ S() {} - // expected-note@-1 {{'S' declared here}} -}; - -S s; -// expected-error@-1 {{reference to __device__ function 'S' in global initializer}} - -struct T { - __host__ __device__ T() {} -}; -T t; // No error, this is OK. - -struct U { - __host__ U() {} - __device__ U(int) {} - // expected-note@-1 {{'U' declared here}} -}; -U u(42); -// expected-error@-1 {{reference to __device__ function 'U' in global initializer}} - -__device__ int device_fn() { return 42; } -// expected-note@-1 {{'device_fn' declared here}} -int n = device_fn(); -// expected-error@-1 {{reference to __device__ function 'device_fn' in global initializer}} Index: clang/test/SemaCUDA/global-initializers.cu =================================================================== --- /dev/null +++ clang/test/SemaCUDA/global-initializers.cu @@ -0,0 +1,76 @@ +// RUN: %clang_cc1 %s -triple x86_64-linux-unknown -fsyntax-only -o - -verify +// RUN: %clang_cc1 %s -fcuda-is-device -triple nvptx -fsyntax-only -o - -verify + +#include "Inputs/cuda.h" + +// Check that we get an error if we try to call a __device__ function from a +// module initializer. + +struct S { + // expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: requires 1 argument, but 0 were provided}} + // expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: requires 1 argument, but 0 were provided}} + __device__ S() {} + // expected-note@-1 {{candidate constructor not viable: call to __device__ function from __host__ function}} +}; + +S s; +// expected-error@-1 {{no matching constructor for initialization of 'S'}} + +struct T { + __host__ __device__ T() {} +}; +T t; // No error, this is OK. + +struct U { + // expected-note@-1 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'int' to 'const U' for 1st argument}} + // expected-note@-2 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'int' to 'U' for 1st argument}} + __host__ U() {} + // expected-note@-1 {{candidate constructor not viable: requires 0 arguments, but 1 was provided}} + __device__ U(int) {} + // expected-note@-1 {{candidate constructor not viable: call to __device__ function from __host__ function}} +}; +U u(42); +// expected-error@-1 {{no matching constructor for initialization of 'U'}} + +__device__ int device_fn() { return 42; } +// expected-note@-1 {{candidate function not viable: call to __device__ function from __host__ function}} +int n = device_fn(); +// expected-error@-1 {{no matching function for call to 'device_fn'}} + +// Check host/device-based overloding resolution in global variable initializer. +template +T pow(T, U); + +__device__ double pow(double, int); + +double X = pow(1.0, 1); +__device__ double Y = pow(2.0, 2); // expected-error{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}} + +template +constexpr T cpow(T, U) { return 1.0; } + +constexpr __device__ double cpow(double, int) { return 2.0; } + +const double CX = cpow(1.0, 1); +const __device__ double CY = cpow(2.0, 2); + +struct A { + template + T pow(T, U); + + __device__ double pow(double, int); + + template + constexpr T cpow(T, U) const { return 1.0; } + + constexpr __device__ double cpow(double, int) const { return 1.0; } + +}; + +A a; +double AX = a.pow(1.0, 1); +__device__ double AY = a.pow(2.0, 2); // expected-error{{dynamic initialization is not supported for __device__, __constant__, __shared__, and __managed__ variables}} + +const A ca; +const double CAX = ca.cpow(1.0, 1); +const __device__ double CAY = ca.cpow(2.0, 2);