Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -15,12 +15,13 @@ #include "CGCUDARuntime.h" #include "CodeGenFunction.h" #include "CodeGenModule.h" -#include "clang/CodeGen/ConstantInitBuilder.h" #include "clang/AST/Decl.h" +#include "clang/CodeGen/ConstantInitBuilder.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/Support/Format.h" using namespace clang; using namespace CodeGen; @@ -45,10 +46,16 @@ /// ModuleCtorFunction() and used to create corresponding cleanup calls in /// ModuleDtorFunction() llvm::GlobalVariable *GpuBinaryHandle = nullptr; + /// Whether we generate relocatable device code. + bool RelocatableDeviceCode; llvm::Constant *getSetupArgumentFn() const; llvm::Constant *getLaunchFn() const; + llvm::FunctionType *getRegisterGlobalsFnTy() const; + llvm::FunctionType *getCallbackFnTy() const; + llvm::FunctionType *getRegisterLinkedBinaryFnTy() const; + /// Creates a function to register all kernel stubs generated in this module. llvm::Function *makeRegisterGlobalsFn(); @@ -71,7 +78,23 @@ return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(), ConstStr.getPointer(), Zeros); - } + } + + /// Helper function that generates an empty dummy function returning void. + llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) { + assert(FnTy->getReturnType()->isVoidTy() && + "Can only generate dummy functions returning void!"); + llvm::Function *DummyFunc = llvm::Function::Create( + FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule); + + llvm::BasicBlock *DummyBlock = + llvm::BasicBlock::Create(Context, "", DummyFunc); + CGBuilderTy FuncBuilder(CGM, Context); + FuncBuilder.SetInsertPoint(DummyBlock); + FuncBuilder.CreateRetVoid(); + + return DummyFunc; + } void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args); @@ -93,7 +116,8 @@ CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()), - TheModule(CGM.getModule()) { + TheModule(CGM.getModule()), + RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) { CodeGen::CodeGenTypes &Types = CGM.getTypes(); ASTContext &Ctx = CGM.getContext(); @@ -120,6 +144,22 @@ llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch"); } +llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const { + return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false); +} + +llvm::FunctionType *CGNVCUDARuntime::getCallbackFnTy() const { + return llvm::FunctionType::get(VoidTy, VoidPtrTy, false); +} + +llvm::FunctionType *CGNVCUDARuntime::getRegisterLinkedBinaryFnTy() const { + auto CallbackFnTy = getCallbackFnTy(); + auto RegisterGlobalsFnTy = getRegisterGlobalsFnTy(); + llvm::Type *Params[] = {RegisterGlobalsFnTy->getPointerTo(), VoidPtrTy, + VoidPtrTy, CallbackFnTy->getPointerTo()}; + return llvm::FunctionType::get(VoidTy, Params, false); +} + void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) { EmittedKernels.push_back(CGF.CurFn); @@ -181,8 +221,8 @@ return nullptr; llvm::Function *RegisterKernelsFunc = llvm::Function::Create( - llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false), - llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule); + getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage, + "__cuda_register_globals", &TheModule); llvm::BasicBlock *EntryBB = llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc); CGBuilderTy Builder(CGM, Context); @@ -257,6 +297,11 @@ // void __cuda_register_globals(void* handle); llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); + // We always need a function to pass in as callback. Create a dummy + // implementation if we don't need to register anything. + if (RelocatableDeviceCode && !RegisterGlobalsFunc) + RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); + // void ** __cudaRegisterFatBinary(void *); llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), @@ -286,11 +331,18 @@ CtorBuilder.SetInsertPoint(CtorEntryBB); - const char *FatbinConstantName = - CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; + const char *FatbinConstantName; + if (RelocatableDeviceCode) + // TODO: Figure out how this is called on mac OS! + FatbinConstantName = "__nv_relfatbin"; + else + FatbinConstantName = + CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; // NVIDIA's cuobjdump looks for fatbins in this section. const char *FatbinSectionName = CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; + // TODO: Figure out how this is called on mac OS! + const char *NVModuleIDSectionName = "__nv_module_id"; // Create initialized wrapper structure that points to the loaded GPU binary ConstantInitBuilder Builder(CGM); @@ -309,18 +361,49 @@ /*constant*/ true); FatbinWrapper->setSection(FatbinSectionName); - // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); - llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( - RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); - GpuBinaryHandle = new llvm::GlobalVariable( - TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, - llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); - CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, - CGM.getPointerAlign()); - - // Call __cuda_register_globals(GpuBinaryHandle); - if (RegisterGlobalsFunc) - CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); + // Register binary with CUDA runtime. This is substantially different in + // default mode vs. separate compilation! + if (!RelocatableDeviceCode) { + // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); + llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( + RegisterFatbinFunc, + CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); + GpuBinaryHandle = new llvm::GlobalVariable( + TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage, + llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle"); + CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, + CGM.getPointerAlign()); + + // Call __cuda_register_globals(GpuBinaryHandle); + if (RegisterGlobalsFunc) + CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); + } else { + // Generate a unique module ID. + SmallString<64> NVModuleID; + llvm::raw_svector_ostream OS(NVModuleID); + OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID()); + llvm::Constant *NVModuleIDConstant = + makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32); + + // Create an alias for the FatbinWrapper that nvcc will look for. + llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, + Twine("__fatbinwrap") + NVModuleID, + FatbinWrapper); + + // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *, + // void *, void (*)(void **)) + SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary"); + RegisterLinkedBinaryName += NVModuleID; + llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( + getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); + + assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); + llvm::Value *Args[] = {RegisterGlobalsFunc, + CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), + NVModuleIDConstant, + makeDummyFunction(getCallbackFnTy())}; + CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); + } CtorBuilder.CreateRetVoid(); return ModuleCtorFunc; Index: test/CodeGenCUDA/device-stub.cu =================================================================== --- test/CodeGenCUDA/device-stub.cu +++ test/CodeGenCUDA/device-stub.cu @@ -1,33 +1,40 @@ // RUN: echo "GPU binary would be here" > %t -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - | FileCheck %s -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -fcuda-include-gpubinary %t -o - -DNOGLOBALS \ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ +// RUN: -fcuda-include-gpubinary %t -o - \ +// RUN: | FileCheck %s --check-prefixes=ALL,NORDC +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ +// RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \ // RUN: | FileCheck %s -check-prefix=NOGLOBALS -// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=NOGPUBIN +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ +// RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \ +// RUN: | FileCheck %s --check-prefixes=ALL,RDC +// RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \ +// RUN: | FileCheck %s -check-prefix=NOGPUBIN #include "Inputs/cuda.h" #ifndef NOGLOBALS -// CHECK-DAG: @device_var = internal global i32 +// ALL-DAG: @device_var = internal global i32 __device__ int device_var; -// CHECK-DAG: @constant_var = internal global i32 +// ALL-DAG: @constant_var = internal global i32 __constant__ int constant_var; -// CHECK-DAG: @shared_var = internal global i32 +// ALL-DAG: @shared_var = internal global i32 __shared__ int shared_var; // Make sure host globals don't get internalized... -// CHECK-DAG: @host_var = global i32 +// ALL-DAG: @host_var = global i32 int host_var; // ... and that extern vars remain external. -// CHECK-DAG: @ext_host_var = external global i32 +// ALL-DAG: @ext_host_var = external global i32 extern int ext_host_var; // Shadows for external device-side variables are *definitions* of // those variables. -// CHECK-DAG: @ext_device_var = internal global i32 +// ALL-DAG: @ext_device_var = internal global i32 extern __device__ int ext_device_var; -// CHECK-DAG: @ext_device_var = internal global i32 +// ALL-DAG: @ext_device_var = internal global i32 extern __constant__ int ext_constant_var; void use_pointers() { @@ -43,59 +50,73 @@ // Make sure that all parts of GPU code init/cleanup are there: // * constant unnamed string with the kernel name -// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" +// ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" // * constant unnamed string with GPU binary -// CHECK: private unnamed_addr constant{{.*GPU binary would be here.*}}\00" -// CHECK-SAME: section ".nv_fatbin", align 8 +// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00" +// NORDC-SAME: section ".nv_fatbin", align 8 +// RDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary -// CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } -// CHECK-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null } -// CHECK-SAME: section ".nvFatBinSegment" +// ALL: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* } +// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null } +// ALL-SAME: section ".nvFatBinSegment" // * variable to save GPU binary handle after initialization -// CHECK: @__cuda_gpubin_handle = internal global i8** null -// * Make sure our constructor/destructor was added to global ctor/dtor list. -// CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor -// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor +// NORDC: @__cuda_gpubin_handle = internal global i8** null +// * constant unnamed string with NVModuleID +// RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant +// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 +// * Make sure our constructor was added to global ctor list. +// ALL: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor +// * In separate mode we also register a destructor. +// NORDC: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor +// * Alias to global symbol containing the NVModuleID. +// RDC: @__fatbinwrap[[MODULE_ID]] = alias { i32, i32, i8*, i8* } +// RDC-SAME: { i32, i32, i8*, i8* }* @__cuda_fatbin_wrapper // Test that we build the correct number of calls to cudaSetupArgument followed // by a call to cudaLaunch. -// CHECK: define{{.*}}kernelfunc -// CHECK: call{{.*}}cudaSetupArgument -// CHECK: call{{.*}}cudaSetupArgument -// CHECK: call{{.*}}cudaSetupArgument -// CHECK: call{{.*}}cudaLaunch +// ALL: define{{.*}}kernelfunc +// ALL: call{{.*}}cudaSetupArgument +// ALL: call{{.*}}cudaSetupArgument +// ALL: call{{.*}}cudaSetupArgument +// ALL: call{{.*}}cudaLaunch __global__ void kernelfunc(int i, int j, int k) {} // Test that we've built correct kernel launch sequence. -// CHECK: define{{.*}}hostfunc -// CHECK: call{{.*}}cudaConfigureCall -// CHECK: call{{.*}}kernelfunc +// ALL: define{{.*}}hostfunc +// ALL: call{{.*}}cudaConfigureCall +// ALL: call{{.*}}kernelfunc void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); } #endif // Test that we've built a function to register kernels and global vars. -// CHECK: define internal void @__cuda_register_globals -// CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc -// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 -// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 -// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 -// CHECK-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 -// CHECK: ret void - -// Test that we've built constructor.. -// CHECK: define internal void @__cuda_module_ctor -// .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper) -// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper +// ALL: define internal void @__cuda_register_globals +// ALL: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc +// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}device_var{{.*}}i32 0, i32 4, i32 0, i32 0 +// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}constant_var{{.*}}i32 0, i32 4, i32 1, i32 0 +// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_device_var{{.*}}i32 1, i32 4, i32 0, i32 0 +// ALL-DAG: call{{.*}}cudaRegisterVar(i8** %0, {{.*}}ext_constant_var{{.*}}i32 1, i32 4, i32 1, i32 0 +// ALL: ret void + +// Test that we've built a constructor. +// ALL: define internal void @__cuda_module_ctor + +// In separate mode it calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper) +// NORDC: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper // .. stores return value in __cuda_gpubin_handle -// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle +// NORDC-NEXT: store{{.*}}__cuda_gpubin_handle // .. and then calls __cuda_register_globals -// CHECK-NEXT: call void @__cuda_register_globals +// NORDC-NEXT: call void @__cuda_register_globals + +// With relocatable device code we call __cudaRegisterLinkedBinary%NVModuleID% +// RDC: call{{.*}}__cudaRegisterLinkedBinary[[MODULE_ID]]( +// RDC-SAME: __cuda_register_globals, {{.*}}__cuda_fatbin_wrapper +// RDC-SAME: [[MODULE_ID_GLOBAL]] // Test that we've created destructor. -// CHECK: define internal void @__cuda_module_dtor -// CHECK: load{{.*}}__cuda_gpubin_handle -// CHECK-NEXT: call void @__cudaUnregisterFatBinary +// NORDC: define internal void @__cuda_module_dtor +// NORDC: load{{.*}}__cuda_gpubin_handle +// NORDC-NEXT: call void @__cudaUnregisterFatBinary // There should be no __cuda_register_globals if we have no // device-side globals, but we still need to register GPU binary.