Index: clang/include/clang/AST/ASTContext.h =================================================================== --- clang/include/clang/AST/ASTContext.h +++ clang/include/clang/AST/ASTContext.h @@ -1058,8 +1058,8 @@ // Implicitly-declared type 'struct _GUID'. mutable TagDecl *MSGuidTagDecl = nullptr; - /// Keep track of CUDA/HIP static device variables referenced by host code. - llvm::DenseSet CUDAStaticDeviceVarReferencedByHost; + /// Keep track of CUDA/HIP device-side variables ODR-used by host code. + llvm::DenseSet CUDADeviceVarODRUsedByHost; ASTContext(LangOptions &LOpts, SourceManager &SM, IdentifierTable &idents, SelectorTable &sels, Builtin::Context &builtins); Index: clang/lib/AST/ASTContext.cpp =================================================================== --- clang/lib/AST/ASTContext.cpp +++ clang/lib/AST/ASTContext.cpp @@ -11630,7 +11630,7 @@ bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const { return mayExternalizeStaticVar(D) && (D->hasAttr() || - CUDAStaticDeviceVarReferencedByHost.count(cast(D))); + CUDADeviceVarODRUsedByHost.count(cast(D))); } StringRef ASTContext::getCUIDHash() const { Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -1015,10 +1015,14 @@ // Don't register a C++17 inline variable. The local symbol can be // discarded and referencing a discarded local symbol from outside the // comdat (__cuda_register_globals) is disallowed by the ELF spec. - // TODO: Reject __device__ constexpr and __device__ inline in Sema. + // // HIP managed variables need to be always recorded in device and host // compilations for transformation. + // + // HIP managed variables and variables in CUDADeviceVarODRUsedByHost are + // added to llvm.compiler-used, therefore they are safe to be registered. if ((!D->hasExternalStorage() && !D->isInline()) || + CGM.getContext().CUDADeviceVarODRUsedByHost.contains(D) || D->hasAttr()) { registerDeviceVar(D, GV, !D->hasDefinition(), D->hasAttr()); Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -2362,8 +2362,8 @@ } // Emit CUDA/HIP static device variables referenced by host code only. - if (getLangOpts().CUDA) - for (auto V : getContext().CUDAStaticDeviceVarReferencedByHost) + if (getLangOpts().CUDA && getLangOpts().CUDAIsDevice) + for (auto V : getContext().CUDADeviceVarODRUsedByHost) DeferredDeclsToEmit.push_back(V); // Stop if we're out of both deferred vtables and deferred declarations. Index: clang/lib/Sema/SemaExpr.cpp =================================================================== --- clang/lib/Sema/SemaExpr.cpp +++ clang/lib/Sema/SemaExpr.cpp @@ -17143,26 +17143,26 @@ return false; }; if (Var && Var->hasGlobalStorage()) { - if (SemaRef.LangOpts.CUDAIsDevice && !IsEmittedOnDeviceSide(Var)) { + if (!IsEmittedOnDeviceSide(Var)) { // Diagnose ODR-use of host global variables in device functions. // Reference of device global variables in host functions is allowed // through shadow variables therefore it is not diagnosed. - SemaRef.targetDiag(Loc, diag::err_ref_bad_target) - << /*host*/ 2 << /*variable*/ 1 << Var << Target; - } else if (Target == Sema::CFT_Host || Target == Sema::CFT_HostDevice) { - // Record a CUDA/HIP static device/constant variable if it is referenced + if (SemaRef.LangOpts.CUDAIsDevice) + SemaRef.targetDiag(Loc, diag::err_ref_bad_target) + << /*host*/ 2 << /*variable*/ 1 << Var << Target; + } else if ((Target == Sema::CFT_Host || Target == Sema::CFT_HostDevice) && + !Var->hasExternalStorage()) { + // Record a CUDA/HIP device side variable if it is ODR-used // by host code. This is done conservatively, when the variable is // referenced in any of the following contexts: // - a non-function context // - a host function // - a host device function - // This also requires the reference of the static device/constant - // variable by host code to be visible in the device compilation for the - // compiler to be able to externalize the static device/constant - // variable. - if (SemaRef.getASTContext().mayExternalizeStaticVar(Var)) - SemaRef.getASTContext().CUDAStaticDeviceVarReferencedByHost.insert( - Var); + // This makes the ODR-use of the device side variable by host code to + // be visible in the device compilation for the compiler to be able to + // emit template variables instantiated by host code only and to + // externalize the static device side variable ODR-used by host code. + SemaRef.getASTContext().CUDADeviceVarODRUsedByHost.insert(Var); } } } Index: clang/test/CodeGenCUDA/device-stub.cu =================================================================== --- clang/test/CodeGenCUDA/device-stub.cu +++ clang/test/CodeGenCUDA/device-stub.cu @@ -107,9 +107,14 @@ #if __cplusplus > 201402L // NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}} // RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}} +// NORDC17-NOT: @inline_var2 = +// RDC17-NOT: @inline_var2 = // NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}} // RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}} +// Check inline variable ODR-used by host is emitted on host and registered. __device__ inline int inline_var = 3; +// Check inline variable not ODR-used by host is not emitted on host or registered. +__device__ inline int inline_var2 = 5; struct C { __device__ static constexpr int member_inline_var = 4; }; @@ -126,10 +131,17 @@ p = &ext_host_var; #if __cplusplus > 201402L p = &inline_var; + decltype(inline_var2) tmp; p = &C::member_inline_var; #endif } +__device__ void device_use() { +#if __cplusplus > 201402L + const int *p = &inline_var2; +#endif +} + // Make sure that all parts of GPU code init/cleanup are there: // * constant unnamed string with the device-side kernel name to be passed to // __hipRegisterFunction/__cudaRegisterFunction. @@ -212,7 +224,8 @@ // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{[^,]*}}, {{[^@]*}}@2, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var_def{{[^,]*}}, {{[^@]*}}@3, {{.*}}i32 0, {{i32|i64}} 4, i32 0, i32 0 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var_def{{[^,]*}}, {{[^@]*}}@4, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0 -// LNX_17-NOT: [[PREFIX]]RegisterVar(i8** %0, {{.*}}inline_var +// LNX_17-DAG: [[PREFIX]]RegisterVar(i8** %0, {{.*}}inline_var +// LNX_17-NOT: [[PREFIX]]RegisterVar(i8** %0, {{.*}}inline_var2 // ALL: ret void // Test that we've built a constructor. Index: clang/test/CodeGenCUDA/host-used-device-var.cu =================================================================== --- clang/test/CodeGenCUDA/host-used-device-var.cu +++ clang/test/CodeGenCUDA/host-used-device-var.cu @@ -1,47 +1,95 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ -// RUN: -std=c++11 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ -// RUN: | FileCheck %s +// RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ +// RUN: | FileCheck -check-prefix=DEV %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -x hip %s \ +// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST %s + +// Negative tests. + +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -x hip %s \ +// RUN: -std=c++17 -O3 -mllvm -amdgpu-internalize-symbols -emit-llvm -o - \ +// RUN: | FileCheck -check-prefix=DEV-NEG %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -x hip %s \ +// RUN: -std=c++17 -O3 -emit-llvm -o - | FileCheck -check-prefix=HOST-NEG %s #include "Inputs/cuda.h" // Check device variables used by neither host nor device functioins are not kept. -// CHECK-NOT: @v1 +// DEV-NEG-NOT: @v1 __device__ int v1; -// CHECK-NOT: @v2 +// DEV-NEG-NOT: @v2 __constant__ int v2; -// CHECK-NOT: @_ZL2v3 +// DEV-NEG-NOT: @_ZL2v3 static __device__ int v3; // Check device variables used by host functions are kept. -// CHECK-DAG: @u1 +// DEV-DAG: @u1 __device__ int u1; -// CHECK-DAG: @u2 +// DEV-DAG: @u2 __constant__ int u2; // Check host-used static device var is in llvm.compiler.used. -// CHECK-DAG: @_ZL2u3 +// DEV-DAG: @_ZL2u3 static __device__ int u3; // Check device-used static device var is emitted but is not in llvm.compiler.used. -// CHECK-DAG: @_ZL2u4 +// DEV-DAG: @_ZL2u4 static __device__ int u4; // Check device variables with used attribute are always kept. -// CHECK-DAG: @u5 +// DEV-DAG: @u5 __device__ __attribute__((used)) int u5; -int fun1() { - return u1 + u2 + u3; +// Test external device variable ODR-used by host code is not emitted or registered. +// DEV-NEG-NOT: @ext_var +extern __device__ int ext_var; + +// DEV-DAG: @inline_var = linkonce_odr addrspace(1) externally_initialized global i32 0 +__device__ inline int inline_var; + +template +using func_t = T (*) (T, T); + +template +__device__ T add_func (T x, T y) +{ + return x + y; +} + +// DEV-DAG: @_Z10p_add_funcIiE = linkonce_odr addrspace(1) externally_initialized global i32 (i32, i32)* @_Z8add_funcIiET_S0_S0_ +template +__device__ func_t p_add_func = add_func; + +void use(func_t p); +void use(int *p); + +void fun1() { + use(&u1); + use(&u2); + use(&u3); + use(&ext_var); + use(&inline_var); + use(p_add_func); } __global__ void kern1(int **x) { *x = &u4; } + // Check the exact list of variables to ensure @_ZL2u4 is not among them. -// CHECK: @llvm.compiler.used = {{[^@]*}} @_ZL2u3 {{[^@]*}} @u1 {{[^@]*}} @u2 {{[^@]*}} @u5 +// DEV: @llvm.compiler.used = {{[^@]*}} @_Z10p_add_funcIiE {{[^@]*}} @_ZL2u3 {{[^@]*}} @inline_var {{[^@]*}} @u1 {{[^@]*}} @u2 {{[^@]*}} @u5 + +// HOST-DAG: hipRegisterVar{{.*}}@u1 +// HOST-DAG: hipRegisterVar{{.*}}@u2 +// HOST-DAG: hipRegisterVar{{.*}}@_ZL2u3 +// HOST-DAG: hipRegisterVar{{.*}}@u5 +// HOST-DAG: hipRegisterVar{{.*}}@inline_var +// HOST-DAG: hipRegisterVar{{.*}}@_Z10p_add_funcIiE +// HOST-NEG-NOT: hipRegisterVar{{.*}}@ext_var +// HOST-NEG-NOT: hipRegisterVar{{.*}}@_ZL2u4