diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -4114,6 +4114,7 @@ // Is accessible from all the threads within the grid and from the host // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize() // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())." + bool CudaModuleCtorReferenced = false; if (GV && LangOpts.CUDA) { if (LangOpts.CUDAIsDevice) { if (Linkage != llvm::GlobalValue::InternalLinkage && @@ -4128,10 +4129,16 @@ Linkage = llvm::GlobalValue::InternalLinkage; // Shadow variables and their properties must be registered with CUDA // runtime. Skip Extern global variables, which will be registered in - // the TU where they are defined. - if (!D->hasExternalStorage()) + // the TU where they are defined. The variable cannot be placed in a + // comdat, because the copy in this translation unit can be discarded + // and referencing a discarded local symbol from outside the comdat + // (__cuda_module_ctor is in a different section) is disallowed by the + // ELF spec. + if (!D->hasExternalStorage()) { getCUDARuntime().registerDeviceVar(D, *GV, !D->hasDefinition(), D->hasAttr()); + CudaModuleCtorReferenced = true; + } } else if (D->hasAttr()) { // __shared__ variables are odd. Shadows do get created, but // they are not registered with the CUDA runtime, so they @@ -4235,7 +4242,8 @@ setTLSMode(GV, *D); } - maybeSetTrivialComdat(*D, *GV); + if (!CudaModuleCtorReferenced) + maybeSetTrivialComdat(*D, *GV); // Emit the initializer function if necessary. if (NeedsGlobalCtor || NeedsGlobalDtor) diff --git a/clang/test/CodeGenCUDA/device-stub.cu b/clang/test/CodeGenCUDA/device-stub.cu --- a/clang/test/CodeGenCUDA/device-stub.cu +++ b/clang/test/CodeGenCUDA/device-stub.cu @@ -29,6 +29,10 @@ // RUN: -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \ // RUN: | FileCheck %s -allow-deprecated-dag-overlap \ // RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \ +// RUN: -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \ +// RUN: | FileCheck %s -allow-deprecated-dag-overlap \ +// RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA_NEW,LNX_17 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -target-sdk-version=9.2 -o - \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN @@ -91,9 +95,17 @@ // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef __constant__ int ext_constant_var_def = 2; +#if __cplusplus > 201402L +/// The local symbol _ZN1C10inline_varE cannot have a comdat, because it is +/// referenced from a section (__cuda_module_ctor's section) outside the section group. +// LNX_17: @_ZN1C10inline_varE = internal constant i32 undef, align 4{{$}} +struct C { + __device__ static constexpr int inline_var = 17; +}; +#endif void use_pointers() { - int *p; + const int *p; p = &device_var; p = &constant_var; p = &shared_var; @@ -101,6 +113,9 @@ p = &ext_device_var; p = &ext_constant_var; p = &ext_host_var; +#if __cplusplus > 201402L + p = &C::inline_var; +#endif } // Make sure that all parts of GPU code init/cleanup are there: @@ -185,6 +200,7 @@ // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}constant_var{{[^,]*}}, {{[^@]*}}@2, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_device_var_def{{[^,]*}}, {{[^@]*}}@3, {{.*}}i32 0, {{i32|i64}} 4, i32 0, i32 0 // ALL-DAG: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}ext_constant_var_def{{[^,]*}}, {{[^@]*}}@4, {{.*}}i32 0, {{i32|i64}} 4, i32 1, i32 0 +// LNX_17: call void {{.*}}[[PREFIX]]RegisterVar(i8** %0, {{.*}}@_ZN1C10inline_varE{{[^,]*}}, {{[^@]*}}@5, {{.*}}i32 0, {{i32|i64}} 4, i32 0, i32 0 // ALL: ret void // Test that we've built a constructor.