Index: clang/include/clang/Basic/DiagnosticSemaKinds.td =================================================================== --- clang/include/clang/Basic/DiagnosticSemaKinds.td +++ clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -4223,7 +4223,9 @@ "candidate %sub{select_ovl_candidate_kind}0,1,2 not viable: " "call to " "%select{__device__|__global__|__host__|__host__ __device__|invalid}3 function from" - " %select{__device__|__global__|__host__|__host__ __device__|invalid}4 function">; + " %select{__device__|__global__|__host__|__host__ __device__|invalid}4 function" + "%select{|. Consider adding proper host/device attribute to the lambda function" + " or making it non-capturing}5">; def note_ovl_candidate_constraints_not_satisfied : Note< "candidate %sub{select_ovl_candidate_kind}0,1,2 not viable: constraints " "not satisfied">; Index: clang/include/clang/Sema/Sema.h =================================================================== --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -11776,7 +11776,7 @@ /// CUDA lambdas declared inside __device__ or __global__ functions inherit /// the __device__ attribute. Similarly, lambdas inside __host__ __device__ /// functions become __host__ __device__ themselves. - void CUDASetLambdaAttrs(CXXMethodDecl *Method); + void CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI); /// Finds a function in \p Matches with highest calling priority /// from \p Caller context and erases all functions with lower Index: clang/lib/Sema/SemaCUDA.cpp =================================================================== --- clang/lib/Sema/SemaCUDA.cpp +++ clang/lib/Sema/SemaCUDA.cpp @@ -746,10 +746,15 @@ DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack; } -void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) { +void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI) { assert(getLangOpts().CUDA && "Should only be called during CUDA compilation"); if (Method->hasAttr() || Method->hasAttr()) return; + if (LI.Default == LCD_None && LI.Captures.size() == 0) { + Method->addAttr(CUDADeviceAttr::CreateImplicit(Context)); + Method->addAttr(CUDAHostAttr::CreateImplicit(Context)); + return; + } FunctionDecl *CurFn = dyn_cast(CurContext); if (!CurFn) return; Index: clang/lib/Sema/SemaLambda.cpp =================================================================== --- clang/lib/Sema/SemaLambda.cpp +++ clang/lib/Sema/SemaLambda.cpp @@ -993,7 +993,7 @@ // CUDA lambdas get implicit attributes based on the scope in which they're // declared. if (getLangOpts().CUDA) - CUDASetLambdaAttrs(Method); + CUDASetLambdaAttrs(Method, Intro); // Number the lambda for linkage purposes if necessary. handleLambdaNumbering(Class, Method); Index: clang/lib/Sema/SemaOverload.cpp =================================================================== --- clang/lib/Sema/SemaOverload.cpp +++ clang/lib/Sema/SemaOverload.cpp @@ -10863,7 +10863,8 @@ S.Diag(Callee->getLocation(), diag::note_ovl_candidate_bad_target) << (unsigned)FnKindPair.first << (unsigned)ocs_non_template << FnDesc /* Ignored */ - << CalleeTarget << CallerTarget; + << CalleeTarget << CallerTarget << (isa(Callee) && + cast(Callee)->getParent()->isLambda()); // This could be an implicit constructor for which we could not infer the // target due to a collsion. Diagnose that case. Index: clang/test/CodeGenCUDA/lambda.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/lambda.cu @@ -0,0 +1,85 @@ +// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \ +// RUN: -triple x86_64-linux-gnu \ +// RUN: | FileCheck -check-prefix=HOST %s +// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \ +// RUN: -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: | FileCheck -check-prefix=DEV %s + +#include "Inputs/cuda.h" + +// Device side kernel name. +// HOST: @[[KERN_CAPTURE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_capturevEUlvE_EvT_\00" +// HOST: @[[KERN_RESOLVE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_resolvevEUlvE_EvT_\00" + +// Check functions emitted for test_capture in host compilation. +// Check lambda is not emitted in host compilation. +// HOST-LABEL: define void @_Z12test_capturev +// HOST: call void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_ +// HOST-LABEL: define internal void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_ +// HOST: call void @_Z16__device_stub__gIZ12test_capturevEUlvE_EvT_ +// HOST-NOT: define{{.*}}@_ZZ4mainENKUlvE_clEv + +// Check functions emitted for test_resolve in host compilation. +// Check host version of template function 'overloaded' is emitted and called +// by the lambda function. +// HOST-LABEL: define void @_Z12test_resolvev +// HOST: call void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_() +// HOST-LABEL: define internal void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_ +// HOST: call void @_Z16__device_stub__gIZ12test_resolvevEUlvE_EvT_ +// HOST: call void @_ZZ12test_resolvevENKUlvE_clEv +// HOST-LABEL: define internal void @_ZZ12test_resolvevENKUlvE_clEv +// HOST: call i32 @_Z10overloadedIiET_v +// HOST-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v +// HOST: ret i32 2 + +// Check kernel is registered with correct device side kernel name. +// HOST: @__hipRegisterFunction({{.*}}@[[KERN_CAPTURE]] +// HOST: @__hipRegisterFunction({{.*}}@[[KERN_RESOLVE]] + +// DEV: @a = addrspace(1) externally_initialized global i32 0 + +// Check functions emitted for test_capture in device compilation. +// Check lambda is emitted in device compilation and accessing device variable. +// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_capturevEUlvE_EvT_ +// DEV: call void @_ZZ12test_capturevENKUlvE_clEv +// DEV-LABEL: define internal void @_ZZ12test_capturevENKUlvE_clEv +// DEV: store i32 1, i32* addrspacecast (i32 addrspace(1)* @a to i32*) + +// Check functions emitted for test_resolve in device compilation. +// Check device version of template function 'overloaded' is emitted and called +// by the lambda function. +// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_resolvevEUlvE_EvT_ +// DEV: call void @_ZZ12test_resolvevENKUlvE_clEv +// DEV-LABE: define internal void @_ZZ12test_resolvevENKUlvE_clEv +// DEV: call i32 @_Z10overloadedIiET_v +// DEV-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v +// DEV: ret i32 1 + +__device__ int a; + +template +__device__ T overloaded() { return 1; } + +template +__host__ T overloaded() { return 2; } + +template +__global__ void g(F f) { f(); } + +template +void test_capture_helper(F f) { g<<<1,1>>>(f); } + +template +void test_resolve_helper(F f) { g<<<1,1>>>(f); f(); } + +// Test capture of device variable in lambda function. +void test_capture(void) { + test_capture_helper([](){ a = 1;}); +} + +// Test resolving host/device function in lambda function. +// Callee should resolve to correct host/device function based on where +// the lambda function is called, not where it is defined. +void test_resolve(void) { + test_resolve_helper([](){ overloaded();}); +} Index: clang/test/SemaCUDA/Inputs/cuda.h =================================================================== --- clang/test/SemaCUDA/Inputs/cuda.h +++ clang/test/SemaCUDA/Inputs/cuda.h @@ -17,6 +17,19 @@ __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; +#ifdef __HIP__ +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#else typedef struct cudaStream *cudaStream_t; typedef enum cudaError {} cudaError_t; @@ -29,6 +42,7 @@ extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); +#endif // Host- and device-side placement new overloads. void *operator new(__SIZE_TYPE__, void *p) { return p; } Index: clang/test/SemaCUDA/lambda.cu =================================================================== --- /dev/null +++ clang/test/SemaCUDA/lambda.cu @@ -0,0 +1,61 @@ +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -verify %s + +#include "Inputs/cuda.h" + +__device__ int gvar; + +// non-capturing lambda is implicitly host device function. +auto global_lambda = [] () { return 123; }; + +auto global_lambda2 = [gvar=gvar] () { return gvar; }; +// expected-note@-1{{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + +template +__global__ void kernel(F f) { f(); } +// expected-error@-1 7{{no matching function for call to object of type}} + +constexpr __host__ __device__ void hd(); + +int main(void) { + auto lambda_kernel = [&]__global__(){}; + // expected-error@-1 {{kernel function 'operator()' must be a free function or static member function}} + + int b; + kernel<<<1,1>>>(global_lambda); + + kernel<<<1,1>>>(global_lambda2); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + + kernel<<<1,1>>>([](){ hd(); }); + + kernel<<<1,1>>>([=](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + + kernel<<<1,1>>>([b](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + + kernel<<<1,1>>>([&]()constexpr{ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + + kernel<<<1,1>>>([&](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + + kernel<<<1,1>>>([=, &b](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + + kernel<<<1,1>>>([&, b](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function. Consider adding proper host/device attribute to the lambda function or making it non-capturing}} + + kernel<<<1,1>>>([](){ + auto f = [&]{ hd(); }; + f(); + }); + + return 0; +}