Index: include/clang/Basic/LangOptions.def =================================================================== --- include/clang/Basic/LangOptions.def +++ include/clang/Basic/LangOptions.def @@ -224,6 +224,7 @@ LANGOPT(CUDAHostDeviceConstexpr, 1, 1, "treating unattributed constexpr functions as __host__ __device__") LANGOPT(CUDADeviceApproxTranscendentals, 1, 0, "using approximate transcendental functions") LANGOPT(GPURelocatableDeviceCode, 1, 0, "generate relocatable device code") +LANGOPT(GPUAllowDeviceInit, 1, 0, "allowing device side global init functions for HIP") LANGOPT(SYCLIsDevice , 1, 0, "Generate code for SYCL device") Index: include/clang/Driver/Options.td =================================================================== --- include/clang/Driver/Options.td +++ include/clang/Driver/Options.td @@ -602,6 +602,9 @@ def fhip_new_launch_api : Flag<["-"], "fhip-new-launch-api">, Flags<[CC1Option]>, HelpText<"Use new kernel launching API for HIP.">; def fno_hip_new_launch_api : Flag<["-"], "fno-hip-new-launch-api">; +def fgpu_allow_device_init : Flag<["-"], "fgpu-allow-device-init">, + Flags<[CC1Option]>, HelpText<"Allow device side init function in HIP">; +def fno_gpu_allow_device_init : Flag<["-"], "fno-gpu-allow-device-init">; def libomptarget_nvptx_path_EQ : Joined<["--"], "libomptarget-nvptx-path=">, Group, HelpText<"Path to libomptarget-nvptx libraries">; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, Index: lib/CodeGen/CGDeclCXX.cpp =================================================================== --- lib/CodeGen/CGDeclCXX.cpp +++ lib/CodeGen/CGDeclCXX.cpp @@ -437,7 +437,7 @@ // that are of class type, cannot have a non-empty constructor. All // the checks have been done in Sema by now. Whatever initializers // are allowed are empty and we just need to ignore them here. - if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice && + if (getLangOpts().CUDAIsDevice && !getLangOpts().GPUAllowDeviceInit && (D->hasAttr() || D->hasAttr() || D->hasAttr())) return; @@ -608,6 +608,11 @@ Fn->setCallingConv(llvm::CallingConv::SPIR_KERNEL); } + if (getLangOpts().HIP) { + Fn->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + Fn->addFnAttr("device-init"); + } + CXXGlobalInits.clear(); } Index: lib/Driver/ToolChains/HIP.cpp =================================================================== --- lib/Driver/ToolChains/HIP.cpp +++ lib/Driver/ToolChains/HIP.cpp @@ -292,6 +292,10 @@ false)) CC1Args.push_back("-fgpu-rdc"); + if (DriverArgs.hasFlag(options::OPT_fgpu_allow_device_init, + options::OPT_fno_gpu_allow_device_init, false)) + CC1Args.push_back("-fgpu-allow-device-init"); + // Default to "hidden" visibility, as object level linking will not be // supported for the foreseeable future. if (!DriverArgs.hasArg(options::OPT_fvisibility_EQ, Index: lib/Frontend/CompilerInvocation.cpp =================================================================== --- lib/Frontend/CompilerInvocation.cpp +++ lib/Frontend/CompilerInvocation.cpp @@ -2527,6 +2527,7 @@ Opts.CUDADeviceApproxTranscendentals = 1; Opts.GPURelocatableDeviceCode = Args.hasArg(OPT_fgpu_rdc); + Opts.GPUAllowDeviceInit = Args.hasArg(OPT_fgpu_allow_device_init); Opts.HIPUseNewLaunchAPI = Args.hasArg(OPT_fhip_new_launch_api); if (Opts.ObjC) { Index: lib/Sema/SemaCUDA.cpp =================================================================== --- lib/Sema/SemaCUDA.cpp +++ lib/Sema/SemaCUDA.cpp @@ -492,6 +492,8 @@ const Expr *Init = VD->getInit(); if (VD->hasAttr() || VD->hasAttr() || VD->hasAttr()) { + if (LangOpts.GPUAllowDeviceInit) + return; assert(!VD->isStaticLocal() || VD->hasAttr()); bool AllowedInit = false; if (const CXXConstructExpr *CE = dyn_cast(Init)) Index: test/CodeGenCUDA/device-init-fun.cu =================================================================== --- /dev/null +++ test/CodeGenCUDA/device-init-fun.cu @@ -0,0 +1,19 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn -fcuda-is-device -std=c++11 \ +// RUN: -fgpu-allow-device-init -x hip \ +// RUN: -fno-threadsafe-statics -emit-llvm -o - %s \ +// RUN: | FileCheck %s + +#include "Inputs/cuda.h" + +// CHECK: define internal amdgpu_kernel void @_GLOBAL__sub_I_device_init_fun.cu() #[[ATTR:[0-9]*]] +// CHECK: attributes #[[ATTR]] = {{.*}}"device-init" + +__device__ void f(); + +struct A { + __device__ A() { f(); } +}; + +__device__ A a;