diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -144,20 +144,24 @@ /*Managed*/ false, Normalized, Type}}); } + /// Creates module constructor function + llvm::Function *makeModuleCtorFunction(); + /// Creates module destructor function + llvm::Function *makeModuleDtorFunction(); + /// Transform managed variables for device compilation. + void transformManagedVars(); + public: CGNVCUDARuntime(CodeGenModule &CGM); void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var) override; - - /// Creates module constructor function - llvm::Function *makeModuleCtorFunction() override; - /// Creates module destructor function - llvm::Function *makeModuleDtorFunction() override; void internalizeDeviceSideVar(const VarDecl *D, llvm::GlobalValue::LinkageTypes &Linkage) override; + + llvm::Function *finalizeModule() override; }; } @@ -534,6 +538,9 @@ addUnderscoredPrefixToName("RegisterTexture")); for (auto &&Info : DeviceVars) { llvm::GlobalVariable *Var = Info.Var; + assert((!Var->isDeclaration() || Info.Flags.isManaged()) && + "External variables should not show up here, except HIP managed " + "variables"); llvm::Constant *VarName = makeConstantString(getDeviceSideName(Info.D)); switch (Info.Flags.getKind()) { case DeviceVarFlags::Variable: { @@ -543,11 +550,16 @@ auto ManagedVar = new llvm::GlobalVariable( CGM.getModule(), Var->getType(), /*isConstant=*/false, Var->getLinkage(), - /*Init=*/llvm::ConstantPointerNull::get(Var->getType()), - Twine(Var->getName() + ".managed"), /*InsertBefore=*/nullptr, + /*Init=*/Var->isDeclaration() + ? nullptr + : llvm::ConstantPointerNull::get(Var->getType()), + /*Name=*/"", /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal); ManagedVar->setDSOLocal(Var->isDSOLocal()); ManagedVar->setVisibility(Var->getVisibility()); + ManagedVar->setExternallyInitialized(true); + ManagedVar->takeName(Var); + Var->setName(Twine(ManagedVar->getName() + ".managed")); replaceManagedVar(Var, ManagedVar); llvm::Value *Args[] = { &GpuBinaryHandlePtr, @@ -556,7 +568,8 @@ VarName, llvm::ConstantInt::get(VarSizeTy, VarSize), llvm::ConstantInt::get(IntTy, Var->getAlignment())}; - Builder.CreateCall(RegisterManagedVar, Args); + if (!Var->isDeclaration()) + Builder.CreateCall(RegisterManagedVar, Args); } else { llvm::Value *Args[] = { &GpuBinaryHandlePtr, @@ -968,9 +981,13 @@ // discarded and referencing a discarded local symbol from outside the // comdat (__cuda_register_globals) is disallowed by the ELF spec. // TODO: Reject __device__ constexpr and __device__ inline in Sema. - if (!D->hasExternalStorage() && !D->isInline()) + // HIP managed variables need to be always recorded in device and host + // compilations for transformation. + if ((!D->hasExternalStorage() && !D->isInline()) || + D->hasAttr()) { registerDeviceVar(D, GV, !D->hasDefinition(), D->hasAttr()); + } } else if (D->getType()->isCUDADeviceBuiltinSurfaceType() || D->getType()->isCUDADeviceBuiltinTextureType()) { // Builtin surfaces and textures and their template arguments are @@ -998,3 +1015,47 @@ } } } + +// Transform managed variables to pointers to managed variables in device code. +// Each use of the original managed variable is replaced by a load from the +// transformed managed variable. The transformed managed variable contains +// the address of managed memory which will be allocated by the runtime. +void CGNVCUDARuntime::transformManagedVars() { + for (auto &&Info : DeviceVars) { + llvm::GlobalVariable *Var = Info.Var; + if (Info.Flags.getKind() == DeviceVarFlags::Variable && + Info.Flags.isManaged()) { + auto ManagedVar = new llvm::GlobalVariable( + CGM.getModule(), Var->getType(), + /*isConstant=*/false, Var->getLinkage(), + /*Init=*/Var->isDeclaration() + ? nullptr + : llvm::ConstantPointerNull::get(Var->getType()), + /*Name=*/"", /*InsertBefore=*/nullptr, + llvm::GlobalVariable::NotThreadLocal, + CGM.getContext().getTargetAddressSpace(LangAS::cuda_device)); + ManagedVar->setDSOLocal(Var->isDSOLocal()); + ManagedVar->setVisibility(Var->getVisibility()); + ManagedVar->setExternallyInitialized(true); + replaceManagedVar(Var, ManagedVar); + ManagedVar->takeName(Var); + Var->setName(Twine(ManagedVar->getName()) + ".managed"); + // Keep managed variables even if they are not used in device code since + // they need to be allocated by the runtime. + if (!Var->isDeclaration()) { + assert(!ManagedVar->isDeclaration()); + CGM.addCompilerUsedGlobal(Var); + CGM.addCompilerUsedGlobal(ManagedVar); + } + } + } +} + +// Returns module constructor to be added. +llvm::Function *CGNVCUDARuntime::finalizeModule() { + if (CGM.getLangOpts().CUDAIsDevice) { + transformManagedVars(); + return nullptr; + } + return makeModuleCtorFunction(); +} diff --git a/clang/lib/CodeGen/CGCUDARuntime.h b/clang/lib/CodeGen/CGCUDARuntime.h --- a/clang/lib/CodeGen/CGCUDARuntime.h +++ b/clang/lib/CodeGen/CGCUDARuntime.h @@ -86,13 +86,9 @@ virtual void handleVarRegistration(const VarDecl *VD, llvm::GlobalVariable &Var) = 0; - /// Constructs and returns a module initialization function or nullptr if it's - /// not needed. Must be called after all kernels have been emitted. - virtual llvm::Function *makeModuleCtorFunction() = 0; - - /// Returns a module cleanup function or nullptr if it's not needed. - /// Must be called after ModuleCtorFunction - virtual llvm::Function *makeModuleDtorFunction() = 0; + /// Finalize generated LLVM module. Returns a module constructor function + /// to be added or a null pointer. + virtual llvm::Function *finalizeModule() = 0; /// Returns function or variable name on device side even if the current /// compilation is for host. diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -459,10 +459,8 @@ if (ObjCRuntime) if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction()) AddGlobalCtor(ObjCInitFunction); - if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice && - CUDARuntime) { - if (llvm::Function *CudaCtorFunction = - CUDARuntime->makeModuleCtorFunction()) + if (Context.getLangOpts().CUDA && CUDARuntime) { + if (llvm::Function *CudaCtorFunction = CUDARuntime->finalizeModule()) AddGlobalCtor(CudaCtorFunction); } if (OpenMPRuntime) { @@ -3833,8 +3831,14 @@ } } - if (GV->isDeclaration()) + if (GV->isDeclaration()) { getTargetCodeGenInfo().setTargetAttributes(D, GV, *this); + // External HIP managed variables needed to be recorded for transformation + // in both device and host compilations. + if (getLangOpts().CUDA && D && D->hasAttr() && + D->hasExternalStorage()) + getCUDARuntime().handleVarRegistration(D, *GV); + } LangAS ExpectedAS = D ? D->getType().getAddressSpace() @@ -4142,12 +4146,8 @@ bool NeedsGlobalDtor = D->needsDestruction(getContext()) == QualType::DK_cxx_destructor; - bool IsHIPManagedVarOnDevice = - getLangOpts().CUDAIsDevice && D->hasAttr(); - const VarDecl *InitDecl; - const Expr *InitExpr = - IsHIPManagedVarOnDevice ? nullptr : D->getAnyInitializer(InitDecl); + const Expr *InitExpr = D->getAnyInitializer(InitDecl); Optional emitter; @@ -4158,15 +4158,15 @@ getLangOpts().CUDAIsDevice && D->hasAttr(); // Shadows of initialized device-side global variables are also left // undefined. + // Managed Variables should be initialized on both host side and device side. bool IsCUDAShadowVar = !getLangOpts().CUDAIsDevice && !D->hasAttr() && (D->hasAttr() || D->hasAttr() || D->hasAttr()); bool IsCUDADeviceShadowVar = - getLangOpts().CUDAIsDevice && + getLangOpts().CUDAIsDevice && !D->hasAttr() && (D->getType()->isCUDADeviceBuiltinSurfaceType() || - D->getType()->isCUDADeviceBuiltinTextureType() || - D->hasAttr()); + D->getType()->isCUDADeviceBuiltinTextureType()); if (getLangOpts().CUDA && (IsCUDASharedVar || IsCUDAShadowVar || IsCUDADeviceShadowVar)) Init = llvm::UndefValue::get(getTypes().ConvertType(ASTTy)); @@ -4273,14 +4273,11 @@ GV->setExternallyInitialized(true); } else { getCUDARuntime().internalizeDeviceSideVar(D, Linkage); - getCUDARuntime().handleVarRegistration(D, *GV); } + getCUDARuntime().handleVarRegistration(D, *GV); } - // HIP managed variables need to be emitted as declarations in device - // compilation. - if (!IsHIPManagedVarOnDevice) - GV->setInitializer(Init); + GV->setInitializer(Init); if (emitter) emitter->finalize(GV); diff --git a/clang/test/CodeGenCUDA/device-var-linkage.cu b/clang/test/CodeGenCUDA/device-var-linkage.cu --- a/clang/test/CodeGenCUDA/device-var-linkage.cu +++ b/clang/test/CodeGenCUDA/device-var-linkage.cu @@ -21,9 +21,9 @@ // NORDC-H-DAG: @v2 = internal global i32 undef // RDC-H-DAG: @v2 = dso_local global i32 undef __constant__ int v2; -// DEV-DAG: @v3 = external addrspace(1) externally_initialized global i32 -// NORDC-H-DAG: @v3 = internal global i32 0 -// RDC-H-DAG: @v3 = dso_local global i32 0 +// DEV-DAG: @v3 = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null +// NORDC-H-DAG: @v3 = internal externally_initialized global i32* null +// RDC-H-DAG: @v3 = dso_local externally_initialized global i32* null __managed__ int v3; // DEV-DAG: @ev1 = external addrspace(1) global i32 @@ -32,8 +32,8 @@ // DEV-DAG: @ev2 = external addrspace(4) global i32 // HOST-DAG: @ev2 = external global i32 extern __constant__ int ev2; -// DEV-DAG: @ev3 = external addrspace(1) global i32 -// HOST-DAG: @ev3 = external global i32 +// DEV-DAG: @ev3 = external addrspace(1) externally_initialized global i32 addrspace(1)* +// HOST-DAG: @ev3 = external externally_initialized global i32* extern __managed__ int ev3; // NORDC-DAG: @_ZL3sv1 = dso_local addrspace(1) externally_initialized global i32 0 @@ -44,8 +44,8 @@ // RDC-DAG: @_ZL3sv2 = internal addrspace(4) global i32 0 // HOST-DAG: @_ZL3sv2 = internal global i32 undef static __constant__ int sv2; -// DEV-DAG: @_ZL3sv3 = external addrspace(1) externally_initialized global i32 -// HOST-DAG: @_ZL3sv3 = internal global i32 0 +// DEV-DAG: @_ZL3sv3 = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null +// HOST-DAG: @_ZL3sv3 = internal externally_initialized global i32* null static __managed__ int sv3; __device__ __host__ int work(int *x); diff --git a/clang/test/CodeGenCUDA/managed-var.cu b/clang/test/CodeGenCUDA/managed-var.cu --- a/clang/test/CodeGenCUDA/managed-var.cu +++ b/clang/test/CodeGenCUDA/managed-var.cu @@ -2,47 +2,62 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \ // RUN: -emit-llvm -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=DEV %s +// RUN: -check-prefixes=COMMON,DEV %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fcuda-is-device -std=c++11 \ // RUN: -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=DEV %s +// RUN: -check-prefixes=COMMON,DEV %s // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ // RUN: -emit-llvm -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=HOST,NORDC %s +// RUN: -check-prefixes=COMMON,HOST,NORDC %s // RUN: %clang_cc1 -triple x86_64-gnu-linux -std=c++11 \ // RUN: -emit-llvm -fgpu-rdc -o - -x hip %s | FileCheck \ -// RUN: -check-prefixes=HOST,RDC %s +// RUN: -check-prefixes=COMMON,HOST,RDC %s #include "Inputs/cuda.h" -// DEV-DAG: @x = external addrspace(1) externally_initialized global i32 -// NORDC-DAG: @x = internal global i32 1 -// RDC-DAG: @x = dso_local global i32 1 -// NORDC-DAG: @x.managed = internal global i32* null -// RDC-DAG: @x.managed = dso_local global i32* null -// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00" - struct vec { float x,y,z; }; +// DEV-DAG: @x.managed = dso_local addrspace(1) externally_initialized global i32 1, align 4 +// DEV-DAG: @x = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null +// NORDC-DAG: @x.managed = internal global i32 1 +// RDC-DAG: @x.managed = dso_local global i32 1 +// NORDC-DAG: @x = internal externally_initialized global i32* null +// RDC-DAG: @x = dso_local externally_initialized global i32* null +// HOST-DAG: @[[DEVNAMEX:[0-9]+]] = {{.*}}c"x\00" __managed__ int x = 1; + +// DEV-DAG: @v.managed = dso_local addrspace(1) externally_initialized global [100 x %struct.vec] zeroinitializer, align 4 +// DEV-DAG: @v = dso_local addrspace(1) externally_initialized global [100 x %struct.vec] addrspace(1)* null __managed__ vec v[100]; + +// DEV-DAG: @v2.managed = dso_local addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> <{ %struct.vec { float 1.000000e+00, float 1.000000e+00, float 1.000000e+00 }, [99 x %struct.vec] zeroinitializer }>, align 4 +// DEV-DAG: @v2 = dso_local addrspace(1) externally_initialized global <{ %struct.vec, [99 x %struct.vec] }> addrspace(1)* null __managed__ vec v2[100] = {{1, 1, 1}}; -// DEV-DAG: @ex = external addrspace(1) global i32 -// HOST-DAG: @ex = external global i32 +// DEV-DAG: @ex.managed = external addrspace(1) global i32, align 4 +// DEV-DAG: @ex = external addrspace(1) externally_initialized global i32 addrspace(1)* +// HOST-DAG: @ex.managed = external global i32 +// HOST-DAG: @ex = external externally_initialized global i32* extern __managed__ int ex; -// DEV-DAG: @_ZL2sx = external addrspace(1) externally_initialized global i32 -// HOST-DAG: @_ZL2sx = internal global i32 1 -// HOST-DAG: @_ZL2sx.managed = internal global i32* null +// DEV-DAG: @_ZL2sx.managed = dso_local addrspace(1) externally_initialized global i32 1, align 4 +// DEV-DAG: @_ZL2sx = dso_local addrspace(1) externally_initialized global i32 addrspace(1)* null +// HOST-DAG: @_ZL2sx.managed = internal global i32 1 +// HOST-DAG: @_ZL2sx = internal externally_initialized global i32* null static __managed__ int sx = 1; -// HOST-NOT: @ex.managed +// DEV-DAG: @llvm.compiler.used +// DEV-SAME-DAG: @x.managed +// DEV-SAME-DAG: @x +// DEV-SAME-DAG: @v.managed +// DEV-SAME-DAG: @v +// DEV-SAME-DAG: @_ZL2sx.managed +// DEV-SAME-DAG: @_ZL2sx // Force ex and sx mitted in device compilation. __global__ void foo(int *z) { @@ -55,42 +70,53 @@ return ex + sx; } -// HOST-LABEL: define {{.*}}@_Z4loadv() -// HOST: %ld.managed = load i32*, i32** @x.managed, align 4 +// COMMON-LABEL: define {{.*}}@_Z4loadv() +// DEV: %ld.managed = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @x, align 4 +// DEV: %0 = addrspacecast i32 addrspace(1)* %ld.managed to i32* +// DEV: %1 = load i32, i32* %0, align 4 +// DEV: ret i32 %1 +// HOST: %ld.managed = load i32*, i32** @x, align 4 // HOST: %0 = load i32, i32* %ld.managed, align 4 // HOST: ret i32 %0 -int load() { +__device__ __host__ int load() { return x; } -// HOST-LABEL: define {{.*}}@_Z5storev() -// HOST: %ld.managed = load i32*, i32** @x.managed, align 4 +// COMMON-LABEL: define {{.*}}@_Z5storev() +// DEV: %ld.managed = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @x, align 4 +// DEV: %0 = addrspacecast i32 addrspace(1)* %ld.managed to i32* +// DEV: store i32 2, i32* %0, align 4 +// HOST: %ld.managed = load i32*, i32** @x, align 4 // HOST: store i32 2, i32* %ld.managed, align 4 -void store() { +__device__ __host__ void store() { x = 2; } -// HOST-LABEL: define {{.*}}@_Z10addr_takenv() -// HOST: %ld.managed = load i32*, i32** @x.managed, align 4 +// COMMON-LABEL: define {{.*}}@_Z10addr_takenv() +// DEV: %0 = addrspacecast i32 addrspace(1)* %ld.managed to i32* +// DEV: store i32* %0, i32** %p.ascast, align 8 +// DEV: %1 = load i32*, i32** %p.ascast, align 8 +// DEV: store i32 3, i32* %1, align 4 +// HOST: %ld.managed = load i32*, i32** @x, align 4 // HOST: store i32* %ld.managed, i32** %p, align 8 // HOST: %0 = load i32*, i32** %p, align 8 // HOST: store i32 3, i32* %0, align 4 -void addr_taken() { +__device__ __host__ void addr_taken() { int *p = &x; *p = 3; } // HOST-LABEL: define {{.*}}@_Z5load2v() -// HOST: %ld.managed = load [100 x %struct.vec]*, [100 x %struct.vec]** @v.managed, align 16 +// HOST: %ld.managed = load [100 x %struct.vec]*, [100 x %struct.vec]** @v, align 16 // HOST: %0 = getelementptr inbounds [100 x %struct.vec], [100 x %struct.vec]* %ld.managed, i64 0, i64 1, i32 0 // HOST: %1 = load float, float* %0, align 4 // HOST: ret float %1 -float load2() { +__device__ __host__ float load2() { return v[1].x; } // HOST-LABEL: define {{.*}}@_Z5load3v() -// HOST: %ld.managed = load <{ %struct.vec, [99 x %struct.vec] }>*, <{ %struct.vec, [99 x %struct.vec] }>** @v2.managed, align 16 +// HOST: %ld.managed = load <{ %struct.vec, [99 x %struct.vec] }>*, <{ %struct.vec, [99 x %struct.vec] }>** @v2, align 16 // HOST: %0 = bitcast <{ %struct.vec, [99 x %struct.vec] }>* %ld.managed to [100 x %struct.vec]* // HOST: %1 = getelementptr inbounds [100 x %struct.vec], [100 x %struct.vec]* %0, i64 0, i64 1, i32 1 // HOST: %2 = load float, float* %1, align 4 @@ -100,10 +126,10 @@ } // HOST-LABEL: define {{.*}}@_Z11addr_taken2v() -// HOST: %ld.managed = load [100 x %struct.vec]*, [100 x %struct.vec]** @v.managed, align 16 +// HOST: %ld.managed = load [100 x %struct.vec]*, [100 x %struct.vec]** @v, align 16 // HOST: %0 = getelementptr inbounds [100 x %struct.vec], [100 x %struct.vec]* %ld.managed, i64 0, i64 1, i32 0 // HOST: %1 = ptrtoint float* %0 to i64 -// HOST: %ld.managed1 = load <{ %struct.vec, [99 x %struct.vec] }>*, <{ %struct.vec, [99 x %struct.vec] }>** @v2.managed, align 16 +// HOST: %ld.managed1 = load <{ %struct.vec, [99 x %struct.vec] }>*, <{ %struct.vec, [99 x %struct.vec] }>** @v2, align 16 // HOST: %2 = bitcast <{ %struct.vec, [99 x %struct.vec] }>* %ld.managed1 to [100 x %struct.vec]* // HOST: %3 = getelementptr inbounds [100 x %struct.vec], [100 x %struct.vec]* %2, i64 0, i64 1, i32 1 // HOST: %4 = ptrtoint float* %3 to i64 @@ -115,7 +141,19 @@ return (float)reinterpret_cast(&(v2[1].y)-&(v[1].x)); } -// HOST-DAG: __hipRegisterManagedVar({{.*}}@x.managed {{.*}}@x {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4) -// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx.managed {{.*}}@_ZL2sx -// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex.managed {{.*}}@ex +// COMMON-LABEL: define {{.*}}@_Z5load4v() +// DEV: %ld.managed = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* @ex, align 4 +// DEV: %0 = addrspacecast i32 addrspace(1)* %ld.managed to i32* +// DEV: %1 = load i32, i32* %0, align 4 +// DEV: ret i32 %1 +// HOST: %ld.managed = load i32*, i32** @ex, align 4 +// HOST: %0 = load i32, i32* %ld.managed, align 4 +// HOST: ret i32 %0 +__device__ __host__ int load4() { + return ex; +} + +// HOST-DAG: __hipRegisterManagedVar({{.*}}@x {{.*}}@x.managed {{.*}}@[[DEVNAMEX]]{{.*}}, i64 4, i32 4) +// HOST-DAG: __hipRegisterManagedVar({{.*}}@_ZL2sx {{.*}}@_ZL2sx.managed +// HOST-NOT: __hipRegisterManagedVar({{.*}}@ex {{.*}}@ex.managed // HOST-DAG: declare void @__hipRegisterManagedVar(i8**, i8*, i8*, i8*, i64, i32) diff --git a/llvm/lib/IR/ReplaceConstant.cpp b/llvm/lib/IR/ReplaceConstant.cpp --- a/llvm/lib/IR/ReplaceConstant.cpp +++ b/llvm/lib/IR/ReplaceConstant.cpp @@ -60,6 +60,7 @@ case Instruction::PtrToInt: case Instruction::IntToPtr: case Instruction::BitCast: + case Instruction::AddrSpaceCast: return dyn_cast( Builder.CreateCast((Instruction::CastOps)OpCode, CE->getOperand(0), CE->getType(), CE->getName()));