Index: clang/include/clang/Basic/LangOptions.def =================================================================== --- clang/include/clang/Basic/LangOptions.def +++ clang/include/clang/Basic/LangOptions.def @@ -238,6 +238,7 @@ LANGOPT(SYCLVersion , 32, 0, "Version of the SYCL standard used") LANGOPT(HIPUseNewLaunchAPI, 1, 0, "Use new kernel launching API for HIP") +LANGOPT(HIPLambdaHostDevice, 1, 0, "Let non-reference-capturing lambda be host device for HIP") LANGOPT(SizedDeallocation , 1, 0, "sized deallocation") LANGOPT(AlignedAllocation , 1, 0, "aligned allocation") Index: clang/include/clang/Driver/Options.td =================================================================== --- clang/include/clang/Driver/Options.td +++ clang/include/clang/Driver/Options.td @@ -625,6 +625,11 @@ def gpu_max_threads_per_block_EQ : Joined<["--"], "gpu-max-threads-per-block=">, Flags<[CC1Option]>, HelpText<"Default max threads per block for kernel launch bounds for HIP">; +def fhip_lambda_host_device : Flag<["-"], "fhip-lambda-host-device">, + Flags<[CC1Option]>, + HelpText<"Let a lambda function without host/device attributes be a host " + "device function if it does not capture by reference (HIP only)">; +def fno_hip_lambda_host_device : Flag<["-"], "fno-hip-lambda-host-device">; def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group, HelpText<"Path to libomptarget-nvptx libraries">; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, Index: clang/include/clang/Sema/Sema.h =================================================================== --- clang/include/clang/Sema/Sema.h +++ clang/include/clang/Sema/Sema.h @@ -11677,7 +11677,7 @@ /// CUDA lambdas declared inside __device__ or __global__ functions inherit /// the __device__ attribute. Similarly, lambdas inside __host__ __device__ /// functions become __host__ __device__ themselves. - void CUDASetLambdaAttrs(CXXMethodDecl *Method); + void CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI); /// Finds a function in \p Matches with highest calling priority /// from \p Caller context and erases all functions with lower Index: clang/lib/Driver/ToolChains/Clang.cpp =================================================================== --- clang/lib/Driver/ToolChains/Clang.cpp +++ clang/lib/Driver/ToolChains/Clang.cpp @@ -5404,6 +5404,10 @@ options::OPT_fno_hip_new_launch_api, false)) CmdArgs.push_back("-fhip-new-launch-api"); + if (Args.hasFlag(options::OPT_fhip_lambda_host_device, + options::OPT_fhip_lambda_host_device, false)) + CmdArgs.push_back("-fhip-lambda-host-device"); + if (Arg *A = Args.getLastArg(options::OPT_fcf_protection_EQ)) { CmdArgs.push_back( Args.MakeArgString(Twine("-fcf-protection=") + A->getValue())); Index: clang/lib/Frontend/CompilerInvocation.cpp =================================================================== --- clang/lib/Frontend/CompilerInvocation.cpp +++ clang/lib/Frontend/CompilerInvocation.cpp @@ -2578,6 +2578,7 @@ << Args.getLastArg(OPT_fgpu_allow_device_init)->getAsString(Args); } Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api); + Opts.HIPLambdaHostDevice = Args.hasArg(OPT_fhip_lambda_host_device); if (Opts.HIP) Opts.GPUMaxThreadsPerBlock = getLastArgIntValue( Args, OPT_gpu_max_threads_per_block_EQ, Opts.GPUMaxThreadsPerBlock); Index: clang/lib/Sema/SemaCUDA.cpp =================================================================== --- clang/lib/Sema/SemaCUDA.cpp +++ clang/lib/Sema/SemaCUDA.cpp @@ -711,13 +711,30 @@ DiagKind != DeviceDiagBuilder::K_ImmediateWithCallStack; } -void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method) { +static bool LambdaHasRefCaptures(LambdaIntroducer &LI) { + if (LI.Default == LCD_ByRef) + return true; + for (auto &C : LI.Captures) { + if (C.Kind == LCK_ByRef || C.Kind == LCK_This) + return true; + } + return false; +} + +void Sema::CUDASetLambdaAttrs(CXXMethodDecl *Method, LambdaIntroducer &LI) { assert(getLangOpts().CUDA && "Should only be called during CUDA compilation"); if (Method->hasAttr() || Method->hasAttr()) return; FunctionDecl *CurFn = dyn_cast(CurContext); if (!CurFn) return; + if (getLangOpts().HIP && + (Method->isConstexpr() || + (getLangOpts().HIPLambdaHostDevice && !LambdaHasRefCaptures(LI)))) { + Method->addAttr(CUDADeviceAttr::CreateImplicit(Context)); + Method->addAttr(CUDAHostAttr::CreateImplicit(Context)); + return; + } CUDAFunctionTarget Target = IdentifyCUDATarget(CurFn); if (Target == CFT_Global || Target == CFT_Device) { Method->addAttr(CUDADeviceAttr::CreateImplicit(Context)); Index: clang/lib/Sema/SemaLambda.cpp =================================================================== --- clang/lib/Sema/SemaLambda.cpp +++ clang/lib/Sema/SemaLambda.cpp @@ -996,7 +996,7 @@ // CUDA lambdas get implicit attributes based on the scope in which they're // declared. if (getLangOpts().CUDA) - CUDASetLambdaAttrs(Method); + CUDASetLambdaAttrs(Method, Intro); // Number the lambda for linkage purposes if necessary. handleLambdaNumbering(Class, Method); Index: clang/test/CodeGenCUDA/lambda.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/lambda.cu @@ -0,0 +1,85 @@ +// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \ +// RUN: -triple x86_64-linux-gnu -fhip-lambda-host-device \ +// RUN: | FileCheck -check-prefix=HOST %s +// RUN: %clang_cc1 -x hip -emit-llvm -std=c++11 %s -o - \ +// RUN: -fhip-lambda-host-device -triple amdgcn-amd-amdhsa -fcuda-is-device \ +// RUN: | FileCheck -check-prefix=DEV %s + +#include "Inputs/cuda.h" + +// Device side kernel name. +// HOST: @[[KERN_CAPTURE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_capturevEUlvE_EvT_\00" +// HOST: @[[KERN_RESOLVE:[0-9]+]] = {{.*}} c"_Z1gIZ12test_resolvevEUlvE_EvT_\00" + +// Check functions emitted for test_capture in host compilation. +// Check lambda is not emitted in host compilation. +// HOST-LABEL: define void @_Z12test_capturev +// HOST: call void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_ +// HOST-LABEL: define internal void @_Z19test_capture_helperIZ12test_capturevEUlvE_EvT_ +// HOST: call void @_Z16__device_stub__gIZ12test_capturevEUlvE_EvT_ +// HOST-NOT: define{{.*}}@_ZZ4mainENKUlvE_clEv + +// Check functions emitted for test_resolve in host compilation. +// Check host version of template function 'overloaded' is emitted and called +// by the lambda function. +// HOST-LABEL: define void @_Z12test_resolvev +// HOST: call void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_() +// HOST-LABEL: define internal void @_Z19test_resolve_helperIZ12test_resolvevEUlvE_EvT_ +// HOST: call void @_Z16__device_stub__gIZ12test_resolvevEUlvE_EvT_ +// HOST: call void @_ZZ12test_resolvevENKUlvE_clEv +// HOST-LABEL: define internal void @_ZZ12test_resolvevENKUlvE_clEv +// HOST: call i32 @_Z10overloadedIiET_v +// HOST-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v +// HOST: ret i32 2 + +// Check kernel is registered with correct device side kernel name. +// HOST: @__hipRegisterFunction({{.*}}@[[KERN_CAPTURE]] +// HOST: @__hipRegisterFunction({{.*}}@[[KERN_RESOLVE]] + +// DEV: @a = addrspace(1) externally_initialized global i32 0 + +// Check functions emitted for test_capture in device compilation. +// Check lambda is emitted in device compilation and accessing device variable. +// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_capturevEUlvE_EvT_ +// DEV: call void @_ZZ12test_capturevENKUlvE_clEv +// DEV-LABEL: define internal void @_ZZ12test_capturevENKUlvE_clEv +// DEV: store i32 1, i32* addrspacecast (i32 addrspace(1)* @a to i32*) + +// Check functions emitted for test_resolve in device compilation. +// Check device version of template function 'overloaded' is emitted and called +// by the lambda function. +// DEV-LABEL: define amdgpu_kernel void @_Z1gIZ12test_resolvevEUlvE_EvT_ +// DEV: call void @_ZZ12test_resolvevENKUlvE_clEv +// DEV-LABE: define internal void @_ZZ12test_resolvevENKUlvE_clEv +// DEV: call i32 @_Z10overloadedIiET_v +// DEV-LABEL: define linkonce_odr i32 @_Z10overloadedIiET_v +// DEV: ret i32 1 + +__device__ int a; + +template +__device__ T overloaded() { return 1; } + +template +__host__ T overloaded() { return 2; } + +template +__global__ void g(F f) { f(); } + +template +void test_capture_helper(F f) { g<<<1,1>>>(f); } + +template +void test_resolve_helper(F f) { g<<<1,1>>>(f); f(); } + +// Test capture of device variable in lambda function. +void test_capture(void) { + test_capture_helper([](){ a = 1;}); +} + +// Test resolving host/device function in lambda function. +// Callee should resolve to correct host/device function based on where +// the lambda function is called, not where it is defined. +void test_resolve(void) { + test_resolve_helper([](){ overloaded();}); +} Index: clang/test/SemaCUDA/Inputs/cuda.h =================================================================== --- clang/test/SemaCUDA/Inputs/cuda.h +++ clang/test/SemaCUDA/Inputs/cuda.h @@ -17,6 +17,19 @@ __host__ __device__ dim3(unsigned x, unsigned y = 1, unsigned z = 1) : x(x), y(y), z(z) {} }; +#ifdef __HIP__ +typedef struct hipStream *hipStream_t; +typedef enum hipError {} hipError_t; +int hipConfigureCall(dim3 gridSize, dim3 blockSize, size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t __hipPushCallConfiguration(dim3 gridSize, dim3 blockSize, + size_t sharedSize = 0, + hipStream_t stream = 0); +extern "C" hipError_t hipLaunchKernel(const void *func, dim3 gridDim, + dim3 blockDim, void **args, + size_t sharedMem, + hipStream_t stream); +#else typedef struct cudaStream *cudaStream_t; typedef enum cudaError {} cudaError_t; @@ -29,6 +42,7 @@ extern "C" cudaError_t cudaLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim, void **args, size_t sharedMem, cudaStream_t stream); +#endif // Host- and device-side placement new overloads. void *operator new(__SIZE_TYPE__, void *p) { return p; } Index: clang/test/SemaCUDA/lambda.cu =================================================================== --- /dev/null +++ clang/test/SemaCUDA/lambda.cu @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -std=c++17 -fsyntax-only -x hip -verify %s -fhip-lambda-host-device + +#include "Inputs/cuda.h" + +template +__global__ void kernel(F f) { f(); } +// expected-error@-1 3{{no matching function for call to object of type}} + +constexpr __host__ __device__ void hd(); + +int main(void) { + auto lambda_kernel = [&]__global__(){}; + // expected-error@-1 {{kernel function 'operator()' must be a free function or static member function}} + + int b; + + kernel<<<1,1>>>([](){ hd(); }); + + kernel<<<1,1>>>([=](){ hd(); }); + + kernel<<<1,1>>>([b](){ hd(); }); + + kernel<<<1,1>>>([&]()constexpr{ hd(); }); + + kernel<<<1,1>>>([&](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}} + + kernel<<<1,1>>>([=, &b](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}} + + kernel<<<1,1>>>([&, b](){ hd(); }); + // expected-note@-1 {{in instantiation of function template specialization 'kernel<(lambda at}} + // expected-note@-2 {{candidate function not viable: call to __host__ function from __global__ function}} + + return 0; +}