Index: include/clang/Sema/Sema.h =================================================================== --- include/clang/Sema/Sema.h +++ include/clang/Sema/Sema.h @@ -8794,10 +8794,14 @@ enum CUDAFunctionPreference { CFP_Never, // Invalid caller/callee combination. - CFP_LastResort, // Lowest priority. Only in effect if + CFP_WrongSide, // Calls from host-device to host or device + // function that do not match current compilation + // mode. Only in effect if // LangOpts.CUDADisableTargetCallChecks is true. - CFP_Fallback, // Low priority caller/callee combination - CFP_Best, // Preferred caller/callee combination + CFP_SameSide, // Calls from host-device to host or device + // function matching current compilation mode. + CFP_HostDevice, // Any calls to host/device functions. + CFP_Native, // host-to-host or device-to-device calls. }; /// Identifies relative preference of a given Caller/Callee Index: lib/Sema/SemaCUDA.cpp =================================================================== --- lib/Sema/SemaCUDA.cpp +++ lib/Sema/SemaCUDA.cpp @@ -68,26 +68,26 @@ // Ph - preference in host mode // Pd - preference in device mode // H - handled in (x) -// Preferences: b-best, f-fallback, l-last resort, n-never. +// Preferences: '+'-native, h-host-device, s-same side, w-wrong side, '-'-never. // // | F | T | Ph | Pd | H | // |----+----+----+----+-----+ -// | d | d | b | b | (b) | -// | d | g | n | n | (a) | -// | d | h | l | l | (e) | -// | d | hd | f | f | (c) | -// | g | d | b | b | (b) | -// | g | g | n | n | (a) | -// | g | h | l | l | (e) | -// | g | hd | f | f | (c) | -// | h | d | l | l | (e) | -// | h | g | b | b | (b) | -// | h | h | b | b | (b) | -// | h | hd | f | f | (c) | -// | hd | d | l | f | (d) | -// | hd | g | f | n |(d/a)| -// | hd | h | f | l | (d) | -// | hd | hd | b | b | (b) | +// | d | d | + | + | (c) | +// | d | g | - | - | (a) | +// | d | h | - | - | (e) | +// | d | hd | h | h | (b) | +// | g | d | + | + | (c) | +// | g | g | - | - | (a) | +// | g | h | - | - | (e) | +// | g | hd | h | h | (b) | +// | h | d | - | - | (e) | +// | h | g | + | + | (c) | +// | h | h | + | + | (c) | +// | h | hd | h | h | (b) | +// | hd | d | w | s | (d) | +// | hd | g | s | - |(d/a)| +// | hd | h | s | w | (d) | +// | hd | hd | h | h | (b) | Sema::CUDAFunctionPreference Sema::IdentifyCUDAPreference(const FunctionDecl *Caller, @@ -112,31 +112,31 @@ (CallerTarget == CFT_HostDevice && getLangOpts().CUDAIsDevice))) return CFP_Never; - // (b) Best case scenarios + // (b) Calling HostDevice is OK as a fallback that works for everyone. + if (CalleeTarget == CFT_HostDevice) + return CFP_HostDevice; + + // (c) Best case scenarios if (CalleeTarget == CallerTarget || (CallerTarget == CFT_Host && CalleeTarget == CFT_Global) || (CallerTarget == CFT_Global && CalleeTarget == CFT_Device)) - return CFP_Best; - - // (c) Calling HostDevice is OK as a fallback that works for everyone. - if (CalleeTarget == CFT_HostDevice) - return CFP_Fallback; + return CFP_Native; // Figure out what should be returned 'last resort' cases. Normally // those would not be allowed, but we'll consider them if // CUDADisableTargetCallChecks is true. CUDAFunctionPreference QuestionableResult = - getLangOpts().CUDADisableTargetCallChecks ? CFP_LastResort : CFP_Never; + getLangOpts().CUDADisableTargetCallChecks ? CFP_WrongSide : CFP_Never; // (d) HostDevice behavior depends on compilation mode. if (CallerTarget == CFT_HostDevice) { // Calling a function that matches compilation mode is OK. // Calling a function from the other side is frowned upon. if (getLangOpts().CUDAIsDevice) - return CalleeTarget == CFT_Device ? CFP_Fallback : QuestionableResult; + return CalleeTarget == CFT_Device ? CFP_SameSide : QuestionableResult; else return (CalleeTarget == CFT_Host || CalleeTarget == CFT_Global) - ? CFP_Fallback + ? CFP_SameSide : QuestionableResult; } @@ -144,7 +144,7 @@ if ((CallerTarget == CFT_Host && CalleeTarget == CFT_Device) || (CallerTarget == CFT_Device && CalleeTarget == CFT_Host) || (CallerTarget == CFT_Global && CalleeTarget == CFT_Host)) - return QuestionableResult; + return CFP_Never; llvm_unreachable("All cases should've been handled by now."); } Index: lib/Sema/SemaOverload.cpp =================================================================== --- lib/Sema/SemaOverload.cpp +++ lib/Sema/SemaOverload.cpp @@ -8527,6 +8527,27 @@ else if (!Cand1.Viable) return false; + // [CUDA] If HD function calls a function which has host-only and + // device-only variants, nvcc sees only host function during host + // compilation and device function only during device-side + // compilation. It appears to be a side effect of nvcc's splitting + // of host and device code into separate TUs. Alas we need to be + // compatible with existing code that relies on this. If we see such + // a case, return better variant right away. + if (S.getLangOpts().CUDA && S.getLangOpts().CUDATargetOverloads && + Cand1.Function && Cand2.Function) { + const FunctionDecl *Caller = dyn_cast(S.CurContext); + const Sema::CUDAFunctionPreference CFP1 = + S.IdentifyCUDAPreference(Caller, Cand1.Function); + const Sema::CUDAFunctionPreference CFP2 = + S.IdentifyCUDAPreference(Caller, Cand2.Function); + if (((CFP1 == Sema::CFP_SameSide || CFP1 == Sema::CFP_Native) && + (CFP2 <= Sema::CFP_WrongSide)) || + ((CFP1 <= Sema::CFP_WrongSide) && + (CFP2 == Sema::CFP_SameSide || CFP2 == Sema::CFP_Native))) + return CFP1 > CFP2; + } + // C++ [over.match.best]p1: // // -- if F is a static member function, ICS1(F) is defined such Index: test/CodeGenCUDA/function-overload.cu =================================================================== --- test/CodeGenCUDA/function-overload.cu +++ test/CodeGenCUDA/function-overload.cu @@ -7,7 +7,8 @@ // RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-HOST %s // RUN: %clang_cc1 -triple nvptx64-nvidia-cuda -fcuda-is-device \ // RUN: -fcuda-target-overloads -emit-llvm -o - %s \ -// RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE %s +// RUN: | FileCheck -check-prefix=CHECK-BOTH -check-prefix=CHECK-DEVICE \ +// RUN: -check-prefix=CHECK-DEVICE-STRICT %s // Check target overloads handling with disabled call target checks. // RUN: %clang_cc1 -DNOCHECKS -triple x86_64-unknown-linux-gnu -emit-llvm \ @@ -77,6 +78,91 @@ extern "C" __host__ __device__ int chd(void) {return 14;} // CHECK-BOTH: ret i32 14 +// NOTE: this is an artefact of split-mode CUDA compilation that we +// need to mimic. HD functions are sometimes allowed to call H or D +// functions. Due to split compilation mode device-side compilation +// will not see host-only function and thus they will not be +// considered at all. For clang both H and D variants will become +// function overloads. Normally target attribute is considered only if +// C++ rules can not determine which function is better. However in +// this case we need to discard functions that would not be present +// during current compilation phase before we apply normal overload +// resolution rules. + +// Large enough difference in calling preferences should have +// precedence over standard C++ overloading rules. +template T template_vs_function(T arg) { return 15; } +__device__ float template_vs_function(float arg) { return 16; } + +// In this case during host compilation we expect to cal function +// template even if __device__ function may be available and allowed +// by -fcuda-disable-target-call-checks and, according to C++ overload +// resolution rules, would be prefered over function template. +// CHECK-BOTH-LABEL: define void @_Z5hd_tfv() +__host__ __device__ void hd_tf(void) { + template_vs_function(1.0f); + // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float + // CHECK-DEVICE: call float @_Z20template_vs_functionf(float + template_vs_function(2.0); + // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double + // CHECK-DEVICE: call float @_Z20template_vs_functionf(float +} + +// Calls from __host__ and __device__ functions should always call +// overloaded function that matches their mode. +// CHECK-HOST-LABEL: define void @_Z4h_tfv() +__host__ void h_tf() { + template_vs_function(1.0f); + // CHECK-HOST: call float @_Z20template_vs_functionIfET_S0_(float + template_vs_function(2.0); + // CHECK-HOST: call double @_Z20template_vs_functionIdET_S0_(double +} + +// CHECK-DEVICE-LABEL: define void @_Z4d_tfv() +__device__ void d_tf() { + template_vs_function(1.0f); + // CHECK-DEVICE: call float @_Z20template_vs_functionf(float + template_vs_function(2.0); + // CHECK-DEVICE: call float @_Z20template_vs_functionf(float +} + +// In case of smaller difference between calling preferences +// (HD->{HD,H} call), C++ rules take precedence. So, when we need to pick +// between (host or device) function template and HD function, C++ +// rules will have precedence. + +template T template_vs_hd_function(T arg) { return 15; } +__host__ __device__ float template_vs_hd_function(float arg) { return 16; } + +// CHECK-BOTH-LABEL: define void @_Z7hd_thdfv() +__host__ __device__ void hd_thdf() { + template_vs_hd_function(1.0f); + // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float + template_vs_hd_function(1.0); + // CHECK-HOST: call double @_Z23template_vs_hd_functionIdET_S0_(double +} + +// CHECK-HOST-LABEL: define void @_Z6h_thdfv() +__host__ void h_thdf() { + template_vs_hd_function(1.0f); + // CHECK-HOST: call float @_Z23template_vs_hd_functionf(float + template_vs_hd_function(1.0); + // CHECK-HOST: call double @_Z23template_vs_hd_functionIdET_S0_(double +} + +// CHECK-DEVICE-LABEL: define void @_Z6d_thdfv() +__device__ void d_thdf() { + template_vs_hd_function(1.0f); + // CHECK-DEVICE: call float @_Z23template_vs_hd_functionf(float + template_vs_hd_function(1.0); + // Host-only function template is not callable with strict call checks, + // so for device side HD function will be the only choice. + // CHECK-DEVICE-STRICT: call float @_Z23template_vs_hd_functionf(float + + // With target checks disabled we'll attempt to use host function template. + // CHECK-DEVICE-NC: call double @_Z23template_vs_hd_functionIdET_S0_(double +} + // CHECK-HOST-LABEL: define void @_Z5hostfv() __host__ void hostf(void) { #if defined (NOCHECKS)