Index: clang/lib/AST/ASTContext.cpp =================================================================== --- clang/lib/AST/ASTContext.cpp +++ clang/lib/AST/ASTContext.cpp @@ -11435,16 +11435,22 @@ } bool ASTContext::mayExternalizeStaticVar(const Decl *D) const { - return !getLangOpts().GPURelocatableDeviceCode && - ((D->hasAttr() && - !D->getAttr()->isImplicit()) || - (D->hasAttr() && - !D->getAttr()->isImplicit())) && - isa(D) && cast(D)->isFileVarDecl() && - cast(D)->getStorageClass() == SC_Static; + bool IsStaticVar = + isa(D) && cast(D)->getStorageClass() == SC_Static; + bool IsExplicitDeviceVar = (D->hasAttr() && + !D->getAttr()->isImplicit()) || + (D->hasAttr() && + !D->getAttr()->isImplicit()); + // CUDA/HIP: static managed variables need to be externalized since it is + // a declaration in IR, therefore cannot have internal linkage. + // ToDo: externalize static variables for -fgpu-rdc. + return IsStaticVar && + (D->hasAttr() || + (!getLangOpts().GPURelocatableDeviceCode && IsExplicitDeviceVar)); } bool ASTContext::shouldExternalizeStaticVar(const Decl *D) const { return mayExternalizeStaticVar(D) && - CUDAStaticDeviceVarReferencedByHost.count(cast(D)); + (D->hasAttr() || + CUDAStaticDeviceVarReferencedByHost.count(cast(D))); } Index: clang/lib/CodeGen/CGCUDANV.cpp =================================================================== --- clang/lib/CodeGen/CGCUDANV.cpp +++ clang/lib/CodeGen/CGCUDANV.cpp @@ -538,6 +538,8 @@ /*Init=*/llvm::ConstantPointerNull::get(Var->getType()), Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal); + ManagedVar->setDSOLocal(Var->isDSOLocal()); + ManagedVar->setVisibility(Var->getVisibility()); replaceManagedVar(Var, ManagedVar); llvm::Value *Args[] = { &GpuBinaryHandlePtr, @@ -924,11 +926,16 @@ void CGNVCUDARuntime::internalizeDeviceSideVar( const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) { - // Host-side shadows of external declarations of device-side - // global variables become internal definitions. These have to - // be internal in order to prevent name conflicts with global - // host variables with the same name in a different TUs. + // For -fno-gpu-rdc, host-side shadows of external declarations of device-side + // global variables become internal definitions. These have to be internal in + // order to prevent name conflicts with global host variables with the same + // name in a different TUs. // + // For -fgpu-rdc, the shadow variables should not be internalized because + // they may be accessed by different TU. + if (CGM.getLangOpts().GPURelocatableDeviceCode) + return; + // __shared__ variables are odd. Shadows do get created, but // they are not registered with the CUDA runtime, so they // can't really be used to access their device-side Index: clang/lib/CodeGen/CodeGenModule.cpp =================================================================== --- clang/lib/CodeGen/CodeGenModule.cpp +++ clang/lib/CodeGen/CodeGenModule.cpp @@ -4169,8 +4169,12 @@ bool NeedsGlobalDtor = D->needsDestruction(getContext()) == QualType::DK_cxx_destructor; + bool IsHIPManagedVarOnDevice = + getLangOpts().CUDAIsDevice && D->hasAttr(); + const VarDecl *InitDecl; - const Expr *InitExpr = D->getAnyInitializer(InitDecl); + const Expr *InitExpr = + IsHIPManagedVarOnDevice ? nullptr : D->getAnyInitializer(InitDecl); Optional emitter; @@ -4190,8 +4194,6 @@ (D->getType()->isCUDADeviceBuiltinSurfaceType() || D->getType()->isCUDADeviceBuiltinTextureType() || D->hasAttr()); - // HIP pinned shadow of initialized host-side global variables are also - // left undefined. if (getLangOpts().CUDA && (IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar)) Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy)); @@ -4302,7 +4304,10 @@ } } - GV->setInitializer(Init); + // HIP managed variables need to be emitted as declarations in device + // compilation. + if (!IsHIPManagedVarOnDevice) + GV->setInitializer(Init); if (emitter) emitter->finalize(GV); Index: clang/test/CodeGenCUDA/device-stub.cu =================================================================== --- clang/test/CodeGenCUDA/device-stub.cu +++ clang/test/CodeGenCUDA/device-stub.cu @@ -30,9 +30,13 @@ // RUN: | FileCheck %s -allow-deprecated-dag-overlap \ // RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \ +// RUN: -target-sdk-version=9.2 -fcuda-include-gpubinary %t -o - \ +// RUN: | FileCheck %s -allow-deprecated-dag-overlap \ +// RUN: --check-prefixes=ALL,LNX,NORDC,CUDA,CUDANORDC,CUDA-NEW,LNX_17,NORDC17 +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -std=c++17 \ // RUN: -target-sdk-version=9.2 -fgpu-rdc -fcuda-include-gpubinary %t -o - \ // RUN: | FileCheck %s -allow-deprecated-dag-overlap \ -// RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17 +// RUN: --check-prefixes=ALL,LNX,RDC,CUDA,CUDARDC,CUDA-NEW,LNX_17,RDC17 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -target-sdk-version=9.2 -o - \ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN @@ -45,7 +49,7 @@ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fgpu-rdc -fcuda-include-gpubinary %t -o - -x hip \ -// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,NORDC,HIP,HIPEF +// RUN: | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,LNX,RDC,HIP,HIPEF // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ // RUN: | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=ALL,LNX,NORDC,HIP,HIPNEF @@ -56,15 +60,18 @@ #include "Inputs/cuda.h" #ifndef NOGLOBALS -// LNX-DAG: @device_var = internal global i32 +// NORDC-DAG: @device_var = internal global i32 +// RDC-DAG: @device_var = dso_local global i32 // WIN-DAG: @"?device_var@@3HA" = internal global i32 __device__ int device_var; -// LNX-DAG: @constant_var = internal global i32 +// NORDC-DAG: @constant_var = internal global i32 +// RDC-DAG: @constant_var = dso_local global i32 // WIN-DAG: @"?constant_var@@3HA" = internal global i32 __constant__ int constant_var; -// LNX-DAG: @shared_var = internal global i32 +// NORDC-DAG: @shared_var = internal global i32 +// RDC-DAG: @shared_var = dso_local global i32 // WIN-DAG: @"?shared_var@@3HA" = internal global i32 __shared__ int shared_var; @@ -87,18 +94,21 @@ // external device-side variables with definitions should generate // definitions for the shadows. -// LNX-DAG: @ext_device_var_def = internal global i32 undef, +// NORDC-DAG: @ext_device_var_def = internal global i32 undef, +// RDC-DAG: @ext_device_var_def = dso_local global i32 undef, // WIN-DAG: @"?ext_device_var_def@@3HA" = internal global i32 undef extern __device__ int ext_device_var_def; __device__ int ext_device_var_def = 1; -// LNX-DAG: @ext_device_var_def = internal global i32 undef, +// NORDC-DAG: @ext_device_var_def = internal global i32 undef, +// RDC-DAG: @ext_device_var_def = dso_local global i32 undef, // WIN-DAG: @"?ext_constant_var_def@@3HA" = internal global i32 undef __constant__ int ext_constant_var_def = 2; #if __cplusplus > 201402L -/// FIXME: Reject __device__ constexpr and inline variables in Sema. -// LNX_17: @inline_var = internal global i32 undef, comdat, align 4{{$}} -// LNX_17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}} +// NORDC17: @inline_var = internal global i32 undef, comdat, align 4{{$}} +// RDC17: @inline_var = linkonce_odr global i32 undef, comdat, align 4{{$}} +// NORDC17: @_ZN1C17member_inline_varE = internal constant i32 undef, comdat, align 4{{$}} +// RDC17: @_ZN1C17member_inline_varE = linkonce_odr constant i32 undef, comdat, align 4{{$}} __device__ inline int inline_var = 3; struct C { __device__ static constexpr int member_inline_var = 4; @@ -151,13 +161,13 @@ // CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null // HIPNEF: @__[[PREFIX]]_gpubin_handle = linkonce hidden global i8** null // * constant unnamed string with NVModuleID -// RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant +// CUDARDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 // * Make sure our constructor was added to global ctor list. // LNX: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor // * Alias to global symbol containing the NVModuleID. -// RDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* } -// RDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper +// CUDARDC: @__fatbinwrap[[MODULE_ID]] ={{.*}} alias { i32, i32, i8*, i8* } +// CUDARDC-SAME: { i32, i32, i8*, i8* }* @__[[PREFIX]]_fatbin_wrapper // Test that we build the correct number of calls to cudaSetupArgument followed // by a call to cudaLaunch. @@ -214,25 +224,33 @@ // HIP-NEXT: icmp eq i8** {{.*}}, null // HIP-NEXT: br i1 {{.*}}, label %if, label %exit // HIP: if: -// NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper +// CUDANORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper +// .. stores return value in __[[PREFIX]]_gpubin_handle +// CUDANORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle +// .. and then calls __[[PREFIX]]_register_globals +// HIP: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper // .. stores return value in __[[PREFIX]]_gpubin_handle -// NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle +// HIP-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle // .. and then calls __[[PREFIX]]_register_globals // HIP-NEXT: br label %exit // HIP: exit: // HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle -// NORDC-NEXT: call void @__[[PREFIX]]_register_globals +// CUDANORDC-NEXT: call void @__[[PREFIX]]_register_globals +// HIP-NEXT: call void @__[[PREFIX]]_register_globals // * In separate mode we also register a destructor. -// NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) +// CUDANORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) +// HIP-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) // With relocatable device code we call __[[PREFIX]]RegisterLinkedBinary%NVModuleID% -// RDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]]( -// RDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper -// RDC-SAME: [[MODULE_ID_GLOBAL]] +// CUDARDC: call{{.*}}__[[PREFIX]]RegisterLinkedBinary[[MODULE_ID]]( +// CUDARDC-SAME: __[[PREFIX]]_register_globals, {{.*}}__[[PREFIX]]_fatbin_wrapper +// CUDARDC-SAME: [[MODULE_ID_GLOBAL]] // Test that we've created destructor. -// NORDC: define internal void @__[[PREFIX]]_module_dtor -// NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle +// CUDANORDC: define internal void @__[[PREFIX]]_module_dtor +// HIP: define internal void @__[[PREFIX]]_module_dtor +// CUDANORDC: load{{.*}}__[[PREFIX]]_gpubin_handle +// HIP: load{{.*}}__[[PREFIX]]_gpubin_handle // CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary // HIP-NEXT: icmp ne i8** {{.*}}, null // HIP-NEXT: br i1 {{.*}}, label %if, label %exit Index: clang/test/CodeGenCUDA/device-var-linkage.cu =================================================================== --- /dev/null +++ clang/test/CodeGenCUDA/device-var-linkage.cu @@ -0,0 +1,65 @@ +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=DEV,NORDC %s +// RUN: %clang_cc1 -triple nvptx -fcuda-is-device \ +// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=DEV,RDC %s +// RUN: %clang_cc1 -triple nvptx \ +// RUN: -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=HOST,NORDC-H %s +// RUN: %clang_cc1 -triple nvptx \ +// RUN: -fgpu-rdc -emit-llvm -o - -x hip %s \ +// RUN: | FileCheck -check-prefixes=HOST,RDC-H %s + +#include "Inputs/cuda.h" + +// DEV-DAG: @v1 = dso_local addrspace(1) externally_initialized global i32 0 +// NORDC-H-DAG: @v1 = internal global i32 undef +// RDC-H-DAG: @v1 = dso_local global i32 undef +__device__ int v1; +// DEV-DAG: @v2 = dso_local addrspace(4) externally_initialized global i32 0 +// NORDC-H-DAG: @v2 = internal global i32 undef +// RDC-H-DAG: @v2 = dso_local global i32 undef +__constant__ int v2; +// DEV-DAG: @v3 = external addrspace(1) externally_initialized global i32 +// NORDC-H-DAG: @v3 = internal global i32 0 +// RDC-H-DAG: @v3 = dso_local global i32 0 +__managed__ int v3; + +// DEV-DAG: @ev1 = external addrspace(1) global i32 +// HOST-DAG: @ev1 = external global i32 +extern __device__ int ev1; +// DEV-DAG: @ev2 = external addrspace(4) global i32 +// HOST-DAG: @ev2 = external global i32 +extern __constant__ int ev2; +// DEV-DAG: @ev3 = external addrspace(1) global i32 +// HOST-DAG: @ev3 = external global i32 +extern __managed__ int ev3; + +// NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0 +// RDC-DAG: @_ZL3sv1 = internal addrspace(1) global i32 0 +// HOST-DAG: @_ZL3sv1 = internal global i32 undef +static __device__ int sv1; +// NORDC-DAG: @_ZL3sv2 = dso_local addrspace(4) externally_initialized global i32 0 +// RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0 +// HOST-DAG: @_ZL3sv2 = internal global i32 undef +static __constant__ int sv2; +// DEV-DAG: @_ZL3sv3 = external addrspace(1) externally_initialized global i32 +// HOST-DAG: @_ZL3sv3 = internal global i32 0 +static __managed__ int sv3; + +__device__ __host__ int work(int *x); + +__device__ __host__ int fun1() { + return work(&ev1) + work(&ev2) + work(&ev3) + work(&sv1) + work(&sv2) + work(&sv3); +} + +// HOST: hipRegisterVar({{.*}}@v1 +// HOST: hipRegisterVar({{.*}}@v2 +// HOST: hipRegisterManagedVar({{.*}}@v3 +// HOST-NOT: hipRegisterVar({{.*}}@ev1 +// HOST-NOT: hipRegisterVar({{.*}}@ev2 +// HOST-NOT: hipRegisterManagedVar({{.*}}@ev3 +// HOST: hipRegisterVar({{.*}}@_ZL3sv1 +// HOST: hipRegisterVar({{.*}}@_ZL3sv2 +// HOST: hipRegisterManagedVar({{.*}}@_ZL3sv3 Index: clang/test/CodeGenCUDA/managed-var.cu =================================================================== --- clang/test/CodeGenCUDA/managed-var.cu +++ clang/test/CodeGenCUDA/managed-var.cu @@ -10,17 +10,19 @@ // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ // RUN: -emit-llvm -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=HOST %s +// RUN: -check-prefixes=HOST,NORDC %s // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ // RUN: -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=HOST %s +// RUN: -check-prefixes=HOST,RDC %s #include "Inputs/cuda.h" -// DEV-DAG: @x = {{.*}}addrspace(1) externally_initialized global i32 undef -// HOST-DAG: @x = internal global i32 1 -// HOST-DAG: @x.managed = internal global i32* null +// DEV-DAG: @x = external addrspace(1) externally_initialized global i32 +// NORDC-DAG: @x = internal global i32 1 +// RDC-DAG: @x = dso_local global i32 1 +// NORDC-DAG: @x.managed = internal global i32* null +// RDC-DAG: @x.managed = dso_local global i32* null // HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00" struct vec { @@ -31,11 +33,28 @@ __managed__ vec v[100]; __managed__ vec v2[100] = {{1, 1, 1}}; +// DEV-DAG: @ex = external addrspace(1) global i32 +// HOST-DAG: @ex = external global i32 +extern __managed__ int ex; + +// DEV-DAG: @_ZL2sx = external addrspace(1) externally_initialized global i32 +// HOST-DAG: @_ZL2sx = internal global i32 1 +// HOST-DAG: @_ZL2sx.managed = internal global i32* null +static __managed__ int sx = 1; + +// HOST-NOT: @ex.managed + +// Force ex and sx mitted in device compilation. __global__ void foo(int *z) { - *z = x; + *z = x + ex + sx; v[1].x = 2; } +// Force ex and sx emitted in host compilatioin. +int foo2() { + return ex + sx; +} + // HOST-LABEL: define {{.*}}@_Z4loadv() // HOST: %ld.managed = load i32*, i32** @x.managed, align 4 // HOST: %0 = load i32, i32* %ld.managed, align 4 @@ -97,4 +116,6 @@ } // HOST-DAG: __hipRegisterManagedVar({{.*}}@x.managed {{.*}}@x {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4) +// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx.managed {{.*}}@_ZL2sx +// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex.managed {{.*}}@ex // HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32)