Index: cfe/trunk/lib/CodeGen/CGCUDANV.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp @@ -38,6 +38,7 @@ llvm::Module &TheModule; /// Keeps track of kernel launch stubs emitted in this module llvm::SmallVector EmittedKernels; + llvm::SmallVector, 16> DeviceVars; /// Keeps track of variables containing handles of GPU binaries. Populated by /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() @@ -47,7 +48,7 @@ llvm::Constant *getLaunchFn() const; /// Creates a function to register all kernel stubs generated in this module. - llvm::Function *makeRegisterKernelsFn(); + llvm::Function *makeRegisterGlobalsFn(); /// Helper function that generates a constant string and returns a pointer to /// the start of the string. The result of this function can be used anywhere @@ -68,6 +69,10 @@ CGNVCUDARuntime(CodeGenModule &CGM); void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override; + void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) override { + DeviceVars.push_back(std::make_pair(&Var, Flags)); + } + /// Creates module constructor function llvm::Function *makeModuleCtorFunction() override; /// Creates module destructor function @@ -158,19 +163,24 @@ CGF.EmitBlock(EndBlock); } -/// Creates internal function to register all kernel stubs generated in this -/// module with the CUDA runtime. +/// Creates a function that sets up state on the host side for CUDA objects that +/// have a presence on both the host and device sides. Specifically, registers +/// the host side of kernel functions and device global variables with the CUDA +/// runtime. /// \code -/// void __cuda_register_kernels(void** GpuBinaryHandle) { +/// void __cuda_register_globals(void** GpuBinaryHandle) { /// __cudaRegisterFunction(GpuBinaryHandle,Kernel0,...); /// ... /// __cudaRegisterFunction(GpuBinaryHandle,KernelM,...); +/// __cudaRegisterVar(GpuBinaryHandle, GlobalVar0, ...); +/// ... +/// __cudaRegisterVar(GpuBinaryHandle, GlobalVarN, ...); /// } /// \endcode -llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() { +llvm::Function *CGNVCUDARuntime::makeRegisterGlobalsFn() { llvm::Function *RegisterKernelsFunc = llvm::Function::Create( llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), - llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule); + llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); llvm::BasicBlock *EntryBB = llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); CGBuilderTy Builder(CGM, Context); @@ -186,18 +196,44 @@ "__cudaRegisterFunction"); // Extract GpuBinaryHandle passed as the first argument passed to - // __cuda_register_kernels() and generate __cudaRegisterFunction() call for + // __cuda_register_globals() and generate __cudaRegisterFunction() call for // each emitted kernel. llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin(); for (llvm::Function *Kernel : EmittedKernels) { llvm::Constant *KernelName = makeConstantString(Kernel->getName()); llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy); - llvm::Value *args[] = { + llvm::Value *Args[] = { &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy), KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr, NullPtr, NullPtr, NullPtr, llvm::ConstantPointerNull::get(IntTy->getPointerTo())}; - Builder.CreateCall(RegisterFunc, args); + Builder.CreateCall(RegisterFunc, Args); + } + + // void __cudaRegisterVar(void **, char *, char *, const char *, + // int, int, int, int) + std::vector RegisterVarParams = { + VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, + IntTy, IntTy, IntTy, IntTy}; + llvm::Constant *RegisterVar = CGM.CreateRuntimeFunction( + llvm::FunctionType::get(IntTy, RegisterVarParams, false), + "__cudaRegisterVar"); + for (auto &Pair : DeviceVars) { + llvm::GlobalVariable *Var = Pair.first; + unsigned Flags = Pair.second; + llvm::Constant *VarName = makeConstantString(Var->getName()); + uint64_t VarSize = + CGM.getDataLayout().getTypeAllocSize(Var->getValueType()); + llvm::Value *Args[] = { + &GpuBinaryHandlePtr, + Builder.CreateBitCast(Var, VoidPtrTy), + VarName, + VarName, + llvm::ConstantInt::get(IntTy, (Flags & ExternDeviceVar) ? 1 : 0), + llvm::ConstantInt::get(IntTy, VarSize), + llvm::ConstantInt::get(IntTy, (Flags & ConstantDeviceVar) ? 1 : 0), + llvm::ConstantInt::get(IntTy, 0)}; + Builder.CreateCall(RegisterVar, Args); } Builder.CreateRetVoid(); @@ -208,15 +244,15 @@ /// \code /// void __cuda_module_ctor(void*) { /// Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0); -/// __cuda_register_kernels(Handle0); +/// __cuda_register_globals(Handle0); /// ... /// HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN); -/// __cuda_register_kernels(HandleN); +/// __cuda_register_globals(HandleN); /// } /// \endcode llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { - // void __cuda_register_kernels(void* handle); - llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn(); + // void __cuda_register_globals(void* handle); + llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); // void ** __cudaRegisterFatBinary(void *); llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), @@ -272,8 +308,8 @@ CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, CGM.getPointerAlign()); - // Call __cuda_register_kernels(GpuBinaryHandle); - CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall); + // Call __cuda_register_globals(GpuBinaryHandle); + CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); // Save GpuBinaryHandle so we can unregister it in destructor. GpuBinaryHandles.push_back(GpuBinaryHandle); Index: cfe/trunk/lib/CodeGen/CGCUDARuntime.h =================================================================== --- cfe/trunk/lib/CodeGen/CGCUDARuntime.h +++ cfe/trunk/lib/CodeGen/CGCUDARuntime.h @@ -18,6 +18,7 @@ namespace llvm { class Function; +class GlobalVariable; } namespace clang { @@ -37,6 +38,12 @@ CodeGenModule &CGM; public: + // Global variable properties that must be passed to CUDA runtime. + enum DeviceVarFlags { + ExternDeviceVar = 0x01, // extern + ConstantDeviceVar = 0x02, // __constant__ + }; + CGCUDARuntime(CodeGenModule &CGM) : CGM(CGM) {} virtual ~CGCUDARuntime(); @@ -46,6 +53,7 @@ /// Emits a kernel launch stub. virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0; + virtual void registerDeviceVar(llvm::GlobalVariable &Var, unsigned Flags) = 0; /// Constructs and returns a module initialization function or nullptr if it's /// not needed. Must be called after all kernels have been emitted. Index: cfe/trunk/lib/CodeGen/CodeGenModule.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CodeGenModule.cpp +++ cfe/trunk/lib/CodeGen/CodeGenModule.cpp @@ -1528,11 +1528,18 @@ !Global->hasAttr()) return; } else { - if (!Global->hasAttr() && ( - Global->hasAttr() || - Global->hasAttr() || - Global->hasAttr())) + // We need to emit host-side 'shadows' for all global + // device-side variables because the CUDA runtime needs their + // size and host-side address in order to provide access to + // their device-side incarnations. + + // So device-only functions are the only things we skip. + if (isa(Global) && !Global->hasAttr() && + Global->hasAttr()) return; + + assert((isa(Global) || isa(Global)) && + "Expected Variable or Function"); } } @@ -1561,8 +1568,15 @@ } else { const auto *VD = cast(Global); assert(VD->isFileVarDecl() && "Cannot emit local var decl as global."); - - if (VD->isThisDeclarationADefinition() != VarDecl::Definition && + // We need to emit device-side global CUDA variables even if a + // variable does not have a definition -- we still need to define + // host-side shadow for it. + bool MustEmitForCuda = LangOpts.CUDA && !LangOpts.CUDAIsDevice && + !VD->hasDefinition() && + (VD->hasAttr() || + VD->hasAttr()); + if (!MustEmitForCuda && + VD->isThisDeclarationADefinition() != VarDecl::Definition && !Context.isMSStaticDataMemberInlineDefinition(VD)) return; } @@ -2444,6 +2458,10 @@ if (D->hasAttr()) AddGlobalAnnotations(D, GV); + // Set the llvm linkage type as appropriate. + llvm::GlobalValue::LinkageTypes Linkage = + getLLVMLinkageVarDefinition(D, GV->isConstant()); + // CUDA B.2.1 "The __device__ qualifier declares a variable that resides on // the device. [...]" // CUDA B.2.2 "The __constant__ qualifier, optionally used together with @@ -2451,9 +2469,34 @@ // Is accessible from all the threads within the grid and from the host // through the runtime library (cudaGetSymbolAddress() / cudaGetSymbolSize() // / cudaMemcpyToSymbol() / cudaMemcpyFromSymbol())." - if (GV && LangOpts.CUDA && LangOpts.CUDAIsDevice && - (D->hasAttr() || D->hasAttr())) { - GV->setExternallyInitialized(true); + if (GV && LangOpts.CUDA) { + if (LangOpts.CUDAIsDevice) { + if (D->hasAttr() || D->hasAttr()) + GV->setExternallyInitialized(true); + } else { + // Host-side shadows of external declarations of device-side + // global variables become internal definitions. These have to + // be internal in order to prevent name conflicts with global + // host variables with the same name in a different TUs. + if (D->hasAttr() || D->hasAttr()) { + Linkage = llvm::GlobalValue::InternalLinkage; + + // Shadow variables and their properties must be registered + // with CUDA runtime. + unsigned Flags = 0; + if (!D->hasDefinition()) + Flags |= CGCUDARuntime::ExternDeviceVar; + if (D->hasAttr()) + Flags |= CGCUDARuntime::ConstantDeviceVar; + getCUDARuntime().registerDeviceVar(*GV, Flags); + } else if (D->hasAttr()) + // __shared__ variables are odd. Shadows do get created, but + // they are not registered with the CUDA runtime, so they + // can't really be used to access their device-side + // counterparts. It's not clear yet whether it's nvcc's bug or + // a feature, but we've got to do the same for compatibility. + Linkage = llvm::GlobalValue::InternalLinkage; + } } GV->setInitializer(Init); @@ -2470,9 +2513,6 @@ GV->setAlignment(getContext().getDeclAlign(D).getQuantity()); - // Set the llvm linkage type as appropriate. - llvm::GlobalValue::LinkageTypes Linkage = - getLLVMLinkageVarDefinition(D, GV->isConstant()); // On Darwin, if the normal linkage of a C++ thread_local variable is // LinkOnce or Weak, we keep the normal linkage to prevent multiple Index: cfe/trunk/test/CodeGenCUDA/device-stub.cu =================================================================== --- cfe/trunk/test/CodeGenCUDA/device-stub.cu +++ cfe/trunk/test/CodeGenCUDA/device-stub.cu @@ -2,6 +2,40 @@ #include "Inputs/cuda.h" +// CHECK-DAG: @device_var = internal global i32 +__device__ int device_var; + +// CHECK-DAG: @constant_var = internal global i32 +__constant__ int constant_var; + +// CHECK-DAG: @shared_var = internal global i32 +__shared__ int shared_var; + +// Make sure host globals don't get internalized... +// CHECK-DAG: @host_var = global i32 +int host_var; +// ... and that extern vars remain external. +// CHECK-DAG: @ext_host_var = external global i32 +extern int ext_host_var; + +// Shadows for external device-side variables are *definitions* of +// those variables. +// CHECK-DAG: @ext_device_var = internal global i32 +extern __device__ int ext_device_var; +// CHECK-DAG: @ext_device_var = internal global i32 +extern __constant__ int ext_constant_var; + +void use_pointers() { + int *p; + p = &device_var; + p = &constant_var; + p = &shared_var; + p = &host_var; + p = &ext_device_var; + p = &ext_constant_var; + p = &ext_host_var; +} + // Make sure that all parts of GPU code init/cleanup are there: // * constant unnamed string with the kernel name // CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" @@ -32,9 +66,14 @@ // CHECK: call{{.*}}kernelfunc void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } -// Test that we've built a function to register kernels -// CHECK: define internal void @__cuda_register_kernels +// Test that we've built a function to register kernels and global vars. +// CHECK: define internal void @__cuda_register_globals // CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 +// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// CHECK: ret void // Test that we've built contructor.. // CHECK: define internal void @__cuda_module_ctor @@ -42,8 +81,8 @@ // CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper // .. stores return value in __cuda_gpubin_handle // CHECK-NEXT: store{{.*}}__cuda_gpubin_handle -// .. and then calls __cuda_register_kernels -// CHECK-NEXT: call void @__cuda_register_kernels +// .. and then calls __cuda_register_globals +// CHECK-NEXT: call void @__cuda_register_globals // Test that we've created destructor. // CHECK: define internal void @__cuda_module_dtor Index: cfe/trunk/test/CodeGenCUDA/filter-decl.cu =================================================================== --- cfe/trunk/test/CodeGenCUDA/filter-decl.cu +++ cfe/trunk/test/CodeGenCUDA/filter-decl.cu @@ -9,15 +9,15 @@ // CHECK-DEVICE-NOT: module asm "file scope asm is host only" __asm__("file scope asm is host only"); -// CHECK-HOST-NOT: constantdata = externally_initialized global +// CHECK-HOST: constantdata = internal global // CHECK-DEVICE: constantdata = externally_initialized global __constant__ char constantdata[256]; -// CHECK-HOST-NOT: devicedata = externally_initialized global +// CHECK-HOST: devicedata = internal global // CHECK-DEVICE: devicedata = externally_initialized global __device__ char devicedata[256]; -// CHECK-HOST-NOT: shareddata = global +// CHECK-HOST: shareddata = internal global // CHECK-DEVICE: shareddata = global __shared__ char shareddata[256];