diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11686,6 +11686,8 @@ return IdentifyCUDATarget(dyn_cast(CurContext)); } + static bool IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D); + // CUDA function call preference. Must be ordered numerically from // worst to best. enum CUDAFunctionPreference { diff --git a/clang/lib/Sema/SemaCUDA.cpp b/clang/lib/Sema/SemaCUDA.cpp --- a/clang/lib/Sema/SemaCUDA.cpp +++ b/clang/lib/Sema/SemaCUDA.cpp @@ -211,6 +211,20 @@ llvm_unreachable("All cases should've been handled by now."); } +template static bool hasImplicitAttr(const FunctionDecl *D) { + if (!D) + return false; + if (auto *A = D->getAttr()) + return A->isImplicit(); + return D->isImplicit(); +} + +bool Sema::IsCUDAImplicitHostDeviceFunction(const FunctionDecl *D) { + bool IsImplicitDevAttr = hasImplicitAttr(D); + bool IsImplicitHostAttr = hasImplicitAttr(D); + return IsImplicitDevAttr && IsImplicitHostAttr; +} + void Sema::EraseUnwantedCUDAMatches( const FunctionDecl *Caller, SmallVectorImpl> &Matches) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -9387,16 +9387,22 @@ return Comparison::Equal; } -static bool isBetterMultiversionCandidate(const OverloadCandidate &Cand1, - const OverloadCandidate &Cand2) { +static Comparison +isBetterMultiversionCandidate(const OverloadCandidate &Cand1, + const OverloadCandidate &Cand2) { if (!Cand1.Function || !Cand1.Function->isMultiVersion() || !Cand2.Function || !Cand2.Function->isMultiVersion()) - return false; + return Comparison::Equal; - // If Cand1 is invalid, it cannot be a better match, if Cand2 is invalid, this - // is obviously better. - if (Cand1.Function->isInvalidDecl()) return false; - if (Cand2.Function->isInvalidDecl()) return true; + // If both are invalid, they are equal. If one of them is invalid, the other + // is better. + if (Cand1.Function->isInvalidDecl()) { + if (Cand2.Function->isInvalidDecl()) + return Comparison::Equal; + return Comparison::Worse; + } + if (Cand2.Function->isInvalidDecl()) + return Comparison::Better; // If this is a cpu_dispatch/cpu_specific multiversion situation, prefer // cpu_dispatch, else arbitrarily based on the identifiers. @@ -9406,16 +9412,18 @@ const auto *Cand2CPUSpec = Cand2.Function->getAttr(); if (!Cand1CPUDisp && !Cand2CPUDisp && !Cand1CPUSpec && !Cand2CPUSpec) - return false; + return Comparison::Equal; if (Cand1CPUDisp && !Cand2CPUDisp) - return true; + return Comparison::Better; if (Cand2CPUDisp && !Cand1CPUDisp) - return false; + return Comparison::Worse; if (Cand1CPUSpec && Cand2CPUSpec) { if (Cand1CPUSpec->cpus_size() != Cand2CPUSpec->cpus_size()) - return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size(); + return Cand1CPUSpec->cpus_size() < Cand2CPUSpec->cpus_size() + ? Comparison::Better + : Comparison::Worse; std::pair FirstDiff = std::mismatch( @@ -9428,7 +9436,9 @@ assert(FirstDiff.first != Cand1CPUSpec->cpus_end() && "Two different cpu-specific versions should not have the same " "identifier list, otherwise they'd be the same decl!"); - return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName(); + return (*FirstDiff.first)->getName() < (*FirstDiff.second)->getName() + ? Comparison::Better + : Comparison::Worse; } llvm_unreachable("No way to get here unless both had cpu_dispatch"); } @@ -9488,6 +9498,66 @@ else if (!Cand1.Viable) return false; + // [CUDA] A function with 'never' preference is marked not viable, therefore + // is never shown up here. The worst preference shown up here is 'wrong side', + // e.g. a host function called by a device host function in device + // compilation. This is valid AST as long as the host device function is not + // emitted, e.g. it is an inline function which is called only by a host + // function. A deferred diagnostic will be triggered if it is emitted. + // However a wrong-sided function is still a viable candidate here. + // + // If Cand1 can be emitted and Cand2 cannot be emitted in the current + // context, Cand1 is better than Cand2. If Cand1 can not be emitted and Cand2 + // can be emitted, Cand1 is not better than Cand2. This rule should have + // precedence over other rules. + // + // If both Cand1 and Cand2 can be emitted, or neither can be emitted, then + // other rules should be used to determine which is better. This is because + // host/device based overloading resolution is mostly for determining + // viability of a function. If two functions are both viable, other factors + // should take precedence in preference, e.g. the standard-defined preferences + // like argument conversion ranks or enable_if partial-ordering. The + // preference for pass-object-size parameters is probably most similar to a + // type-based-overloading decision and so should take priority. + // + // If other rules cannot determine which is better, CUDA preference will be + // used again to determine which is better. + // + // TODO: Currently IdentifyCUDAPreference does not return correct values + // for functions called in global variable initializers due to missing + // correct context about device/host. Therefore we can only enforce this + // rule when there is a caller. We should enforce this rule for functions + // in global variable initializers once proper context is added. + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + if (FunctionDecl *Caller = dyn_cast(S.CurContext)) { + bool IsCallerImplicitHD = Sema::IsCUDAImplicitHostDeviceFunction(Caller); + bool IsCand1ImplicitHD = + Sema::IsCUDAImplicitHostDeviceFunction(Cand1.Function); + bool IsCand2ImplicitHD = + Sema::IsCUDAImplicitHostDeviceFunction(Cand2.Function); + auto P1 = S.IdentifyCUDAPreference(Caller, Cand1.Function); + auto P2 = S.IdentifyCUDAPreference(Caller, Cand2.Function); + assert(P1 != Sema::CFP_Never && P2 != Sema::CFP_Never); + // The implicit HD function may be a function in a system header which + // is forced by pragma. In device compilation, if we prefer HD candidates + // over wrong-sided candidates, overloading resolution may change, which + // may result in non-deferrable diagnostics. As a workaround, we let + // implicit HD candidates take equal preference as wrong-sided candidates. + // This will preserve the overloading resolution. + auto EmitThreshold = + (S.getLangOpts().CUDAIsDevice && IsCallerImplicitHD && + (IsCand1ImplicitHD || IsCand2ImplicitHD)) + ? Sema::CFP_Never + : Sema::CFP_WrongSide; + auto Cand1Emittable = P1 > EmitThreshold; + auto Cand2Emittable = P2 > EmitThreshold; + if (Cand1Emittable && !Cand2Emittable) + return true; + if (!Cand1Emittable && Cand2Emittable) + return false; + } + } + // C++ [over.match.best]p1: // // -- if F is a static member function, ICS1(F) is defined such @@ -9722,12 +9792,6 @@ return Cmp == Comparison::Better; } - if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { - FunctionDecl *Caller = dyn_cast(S.CurContext); - return S.IdentifyCUDAPreference(Caller, Cand1.Function) > - S.IdentifyCUDAPreference(Caller, Cand2.Function); - } - bool HasPS1 = Cand1.Function != nullptr && functionHasPassObjectSizeParams(Cand1.Function); bool HasPS2 = Cand2.Function != nullptr && @@ -9735,7 +9799,21 @@ if (HasPS1 != HasPS2 && HasPS1) return true; - return isBetterMultiversionCandidate(Cand1, Cand2); + auto MV = isBetterMultiversionCandidate(Cand1, Cand2); + if (MV == Comparison::Better) + return true; + if (MV == Comparison::Worse) + return false; + + // If other rules cannot determine which is better, CUDA preference is used + // to determine which is better. + if (S.getLangOpts().CUDA && Cand1.Function && Cand2.Function) { + FunctionDecl *Caller = dyn_cast(S.CurContext); + return S.IdentifyCUDAPreference(Caller, Cand1.Function) > + S.IdentifyCUDAPreference(Caller, Cand2.Function); + } + + return false; } /// Determine whether two declarations are "equivalent" for the purposes of @@ -9821,33 +9899,6 @@ std::transform(begin(), end(), std::back_inserter(Candidates), [](OverloadCandidate &Cand) { return &Cand; }); - // [CUDA] HD->H or HD->D calls are technically not allowed by CUDA but - // are accepted by both clang and NVCC. However, during a particular - // compilation mode only one call variant is viable. We need to - // exclude non-viable overload candidates from consideration based - // only on their host/device attributes. Specifically, if one - // candidate call is WrongSide and the other is SameSide, we ignore - // the WrongSide candidate. - if (S.getLangOpts().CUDA) { - const FunctionDecl *Caller = dyn_cast(S.CurContext); - bool ContainsSameSideCandidate = - llvm::any_of(Candidates, [&](OverloadCandidate *Cand) { - // Check viable function only. - return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == - Sema::CFP_SameSide; - }); - if (ContainsSameSideCandidate) { - auto IsWrongSideCandidate = [&](OverloadCandidate *Cand) { - // Check viable function only to avoid unnecessary data copying/moving. - return Cand->Viable && Cand->Function && - S.IdentifyCUDAPreference(Caller, Cand->Function) == - Sema::CFP_WrongSide; - }; - llvm::erase_if(Candidates, IsWrongSideCandidate); - } - } - // Find the best viable function. Best = end(); for (auto *Cand : Candidates) { diff --git a/clang/test/SemaCUDA/function-overload.cu b/clang/test/SemaCUDA/function-overload.cu --- a/clang/test/SemaCUDA/function-overload.cu +++ b/clang/test/SemaCUDA/function-overload.cu @@ -1,8 +1,8 @@ // REQUIRES: x86-registered-target // REQUIRES: nvptx-registered-target -// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s -// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-linux-gnu -fsyntax-only -verify %s +// RUN: %clang_cc1 -std=c++14 -triple nvptx64-nvidia-cuda -fsyntax-only -fcuda-is-device -verify %s #include "Inputs/cuda.h" @@ -14,6 +14,13 @@ struct HostDeviceReturnTy {}; struct TemplateReturnTy {}; +struct CorrectOverloadRetTy{}; +#if __CUDA_ARCH__ +// expected-note@-2 {{candidate constructor (the implicit copy constructor) not viable: no known conversion from 'IncorrectOverloadRetTy' to 'const CorrectOverloadRetTy &' for 1st argument}} +// expected-note@-3 {{candidate constructor (the implicit move constructor) not viable: no known conversion from 'IncorrectOverloadRetTy' to 'CorrectOverloadRetTy &&' for 1st argument}} +#endif +struct IncorrectOverloadRetTy{}; + typedef HostReturnTy (*HostFnPtr)(); typedef DeviceReturnTy (*DeviceFnPtr)(); typedef HostDeviceReturnTy (*HostDeviceFnPtr)(); @@ -331,9 +338,6 @@ // If we have a mix of HD and H-only or D-only candidates in the overload set, // normal C++ overload resolution rules apply first. template TemplateReturnTy template_vs_hd_function(T arg) -#ifdef __CUDA_ARCH__ -//expected-note@-2 {{declared here}} -#endif { return TemplateReturnTy(); } @@ -342,11 +346,13 @@ } __host__ __device__ void test_host_device_calls_hd_template() { - HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); - TemplateReturnTy ret2 = template_vs_hd_function(1); #ifdef __CUDA_ARCH__ - // expected-error@-2 {{reference to __host__ function 'template_vs_hd_function' in __host__ __device__ function}} + typedef HostDeviceReturnTy ExpectedReturnTy; +#else + typedef TemplateReturnTy ExpectedReturnTy; #endif + HostDeviceReturnTy ret1 = template_vs_hd_function(1.0f); + ExpectedReturnTy ret2 = template_vs_hd_function(1); } __host__ void test_host_calls_hd_template() { @@ -367,14 +373,14 @@ __device__ DeviceReturnTy device_only_function(int arg) { return DeviceReturnTy(); } __device__ DeviceReturnTy2 device_only_function(float arg) { return DeviceReturnTy2(); } #ifndef __CUDA_ARCH__ - // expected-note@-3 {{'device_only_function' declared here}} - // expected-note@-3 {{'device_only_function' declared here}} + // expected-note@-3 2{{'device_only_function' declared here}} + // expected-note@-3 2{{'device_only_function' declared here}} #endif __host__ HostReturnTy host_only_function(int arg) { return HostReturnTy(); } __host__ HostReturnTy2 host_only_function(float arg) { return HostReturnTy2(); } #ifdef __CUDA_ARCH__ - // expected-note@-3 {{'host_only_function' declared here}} - // expected-note@-3 {{'host_only_function' declared here}} + // expected-note@-3 2{{'host_only_function' declared here}} + // expected-note@-3 2{{'host_only_function' declared here}} #endif __host__ __device__ void test_host_device_single_side_overloading() { @@ -392,6 +398,37 @@ #endif } +// wrong-sided overloading should not cause diagnostic unless it is emitted. +// This inline function is not emitted. +inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_no_diag() { + DeviceReturnTy ret1 = device_only_function(1); + DeviceReturnTy2 ret2 = device_only_function(1.0f); + HostReturnTy ret3 = host_only_function(1); + HostReturnTy2 ret4 = host_only_function(1.0f); +} + +// wrong-sided overloading should cause diagnostic if it is emitted. +// This inline function is emitted since it is called by an emitted function. +inline __host__ __device__ void test_host_device_wrong_side_overloading_inline_diag() { + DeviceReturnTy ret1 = device_only_function(1); + DeviceReturnTy2 ret2 = device_only_function(1.0f); +#ifndef __CUDA_ARCH__ + // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} + // expected-error@-3 {{reference to __device__ function 'device_only_function' in __host__ __device__ function}} +#endif + HostReturnTy ret3 = host_only_function(1); + HostReturnTy2 ret4 = host_only_function(1.0f); +#ifdef __CUDA_ARCH__ + // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} + // expected-error@-3 {{reference to __host__ function 'host_only_function' in __host__ __device__ function}} +#endif +} + +__host__ __device__ void test_host_device_wrong_side_overloading_inline_diag_caller() { + test_host_device_wrong_side_overloading_inline_diag(); + // expected-note@-1 {{called by 'test_host_device_wrong_side_overloading_inline_diag_caller'}} +} + // Verify that we allow overloading function templates. template __host__ T template_overload(const T &a) { return a; }; template __device__ T template_overload(const T &a) { return a; }; @@ -419,3 +456,136 @@ int test_constexpr_overload(C2 &x, C2 &y) { return constexpr_overload(x, y); } + +// Verify no ambiguity for new operator. +void *a = new int; +__device__ void *b = new int; +// expected-error@-1{{dynamic initialization is not supported for __device__, __constant__, and __shared__ variables.}} + +// Verify no ambiguity for new operator. +template _Tp&& f(); +template()))> +void __test(); + +void foo() { + __test(); +} + +// Test resolving implicit host device candidate vs wrong-sided candidate. +// In device compilation, implicit host device caller choose implicit host +// device candidate and wrong-sided candidate with equal preference. +// Resolution result should not change with/without pragma. +namespace ImplicitHostDeviceVsWrongSided { +CorrectOverloadRetTy callee(double x); +#pragma clang force_cuda_host_device begin +IncorrectOverloadRetTy callee(int x); +inline CorrectOverloadRetTy implicit_hd_caller() { + return callee(1.0); +} +#pragma clang force_cuda_host_device end +} + +// Test resolving implicit host device candidate vs same-sided candidate. +// In host compilation, implicit host device caller choose implicit host +// device candidate and same-sided candidate with equal preference. +// Resolution result should not change with/without pragma. +namespace ImplicitHostDeviceVsSameSide { +IncorrectOverloadRetTy callee(int x); +#pragma clang force_cuda_host_device begin +CorrectOverloadRetTy callee(double x); +inline CorrectOverloadRetTy implicit_hd_caller() { + return callee(1.0); +} +#pragma clang force_cuda_host_device end +} + +// Test resolving explicit host device candidate vs. wrong-sided candidate. +// Explicit host device caller favors host device candidate against wrong-sided +// candidate. +namespace ExplicitHostDeviceVsWrongSided { +CorrectOverloadRetTy callee(double x); +__host__ __device__ IncorrectOverloadRetTy callee(int x); +inline __host__ __device__ CorrectOverloadRetTy explicit_hd_caller() { + return callee(1.0); +#if __CUDA_ARCH__ + // expected-error@-2 {{no viable conversion from returned value of type 'IncorrectOverloadRetTy' to function return type 'CorrectOverloadRetTy'}} +#endif +} +} + +// In the implicit host device function 'caller', the second 'callee' should be +// chosen since it has better match, even though it is an implicit host device +// function whereas the first 'callee' is a host function. A diagnostic will be +// emitted if the first 'callee' is chosen since deduced return type cannot be +// used before it is defined. +namespace ImplicitHostDeviceByConstExpr { +template a b; +auto callee(...); +template constexpr auto callee(d) -> decltype(0); +struct e { + template static auto g(ad, f...) { + return h)...>; + } + struct i { + template static constexpr auto caller(f... k) { + return callee(k...); + } + }; + template static auto h() { + return i::caller; + } +}; +class l { + l() { + e::g([] {}, this); + } +}; +} + +// Implicit HD candidate competes with device candidate. +// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved. +// copy ctor of a should win over a(short), otherwise there will be ambiguity +// due to conversion operator. +namespace TestImplicitHDWithD { + struct a { + __device__ a(short); + __device__ operator unsigned() const; + __device__ operator int() const; + }; + struct b { + a d; + }; + void f(b g) { b e = g; } +} + +// Implicit HD candidate competes with host candidate. +// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved. +// copy ctor of a should win over a(short), otherwise there will be ambiguity +// due to conversion operator. +namespace TestImplicitHDWithH { + struct a { + a(short); + __device__ operator unsigned() const; + __device__ operator int() const; + }; + struct b { + a d; + }; + void f(b g) { b e = g; } +} + +// Implicit HD candidate comptes with HD candidate. +// a and b have implicit HD copy ctor. In copy ctor of b, ctor of a is resolved. +// copy ctor of a should win over a(short), otherwise there will be ambiguity +// due to conversion operator. +namespace TestImplicitHDWithHD { + struct a { + __host__ __device__ a(short); + __device__ operator unsigned() const; + __device__ operator int() const; + }; + struct b { + a d; + }; + void f(b g) { b e = g; } +}