Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -15,12 +15,13 @@
 #include "CGCUDARuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
-#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "clang/AST/Decl.h"
+#include "clang/CodeGen/ConstantInitBuilder.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Format.h"
 
 using namespace clang;
 using namespace CodeGen;
@@ -45,9 +46,12 @@
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
   llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  /// Whether we generate relocatable device code.
+  bool RelocatableDeviceCode;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
+  llvm::FunctionType *getRegisterGlobalsFnTy() const;
 
   /// Creates a function to register all kernel stubs generated in this module.
   llvm::Function *makeRegisterGlobalsFn();
@@ -71,7 +75,23 @@
 
     return llvm::ConstantExpr::getGetElementPtr(ConstStr.getElementType(),
                                                 ConstStr.getPointer(), Zeros);
-  }
+  }
+
+  /// Helper function that generates an empty dummy function returning void.
+  llvm::Function *makeDummyFunction(llvm::FunctionType *FnTy) {
+    assert(FnTy->getReturnType()->isVoidTy() &&
+           "Can only generate dummy functions returning void!");
+    llvm::Function *DummyFunc = llvm::Function::Create(
+        FnTy, llvm::GlobalValue::InternalLinkage, "dummy", &TheModule);
+
+    llvm::BasicBlock *DummyBlock =
+        llvm::BasicBlock::Create(Context, "", DummyFunc);
+    CGBuilderTy FuncBuilder(CGM, Context);
+    FuncBuilder.SetInsertPoint(DummyBlock);
+    FuncBuilder.CreateRetVoid();
+
+    return DummyFunc;
+  }
 
   void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
 
@@ -93,7 +113,8 @@
 
 CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
     : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
-      TheModule(CGM.getModule()) {
+      TheModule(CGM.getModule()),
+      RelocatableDeviceCode(CGM.getLangOpts().CUDARelocatableDeviceCode) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
 
@@ -161,6 +182,10 @@
   CGF.EmitBlock(EndBlock);
 }
 
+llvm::FunctionType *CGNVCUDARuntime::getRegisterGlobalsFnTy() const {
+  return llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false);
+}
+
 /// Creates a function that sets up state on the host side for CUDA objects that
 /// have a presence on both the host and device sides. Specifically, registers
 /// the host side of kernel functions and device global variables with the CUDA
@@ -181,8 +206,8 @@
     return nullptr;
 
   llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
-      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
-      llvm::GlobalValue::InternalLinkage, "__cuda_register_globals", &TheModule);
+      getRegisterGlobalsFnTy(), llvm::GlobalValue::InternalLinkage,
+      "__cuda_register_globals", &TheModule);
   llvm::BasicBlock *EntryBB =
       llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
   CGBuilderTy Builder(CGM, Context);
@@ -257,8 +282,29 @@
   if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
     return nullptr;
 
+  llvm::FunctionType *RegisterGlobalsFnTy;
+  llvm::FunctionType *RegisterLinkedBinaryFnTy;
+  llvm::Function *DummyCallback;
+  if (RelocatableDeviceCode) {
+    RegisterGlobalsFnTy = getRegisterGlobalsFnTy();
+
+    auto CallbackFnTy = llvm::FunctionType::get(VoidTy, VoidPtrTy, false);
+    DummyCallback = makeDummyFunction(CallbackFnTy);
+
+    // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *,
+    // void *, void (*)(void **))
+    llvm::Type *Params[] = {RegisterGlobalsFnTy, VoidPtrTy, VoidPtrTy,
+                            CallbackFnTy};
+    RegisterLinkedBinaryFnTy = llvm::FunctionType::get(VoidTy, Params, false);
+  }
+
   // void __cuda_register_globals(void* handle);
   llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn();
+  // We always need a function to pass in as callback. Create a dummy
+  // implementation if we don't need to register anything.
+  if (RelocatableDeviceCode && !RegisterGlobalsFunc)
+    RegisterGlobalsFunc = makeDummyFunction(RegisterGlobalsFnTy);
+
   // void ** __cudaRegisterFatBinary(void *);
   llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
       llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
@@ -291,11 +337,18 @@
       continue;
     }
 
-    const char *FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+    const char *FatbinConstantName;
+    if (RelocatableDeviceCode)
+      // TODO: Figure out how this is called on mac OS!
+      FatbinConstantName = "__nv_relfatbin";
+    else
+      FatbinConstantName =
+          CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
     // NVIDIA's cuobjdump looks for fatbins in this section.
     const char *FatbinSectionName =
         CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+    // TODO: Figure out how this is called on mac OS!
+    const char *NVModuleIDSectionName = "__nv_module_id";
 
     // Create initialized wrapper structure that points to the loaded GPU binary
     ConstantInitBuilder Builder(CGM);
@@ -315,22 +368,52 @@
         /*constant*/ true);
     FatbinWrapper->setSection(FatbinSectionName);
 
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
-    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
-        RegisterFatbinFunc,
-        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
-    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
-        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
-    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
-                                   CGM.getPointerAlign());
-
-    // Call __cuda_register_globals(GpuBinaryHandle);
-    if (RegisterGlobalsFunc)
-      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
-
-    // Save GpuBinaryHandle so we can unregister it in destructor.
-    GpuBinaryHandles.push_back(GpuBinaryHandle);
+    llvm::Constant *NVModuleIDConstant;
+    SmallString<64> NVModuleID;
+    if (RelocatableDeviceCode) {
+      // Generate a unique module ID.
+      llvm::raw_svector_ostream OS(NVModuleID);
+      OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID());
+      NVModuleIDConstant =
+          makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32);
+
+      // Create an alias for the FatbinWrapper that nvcc will look for.
+      llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
+                                Twine("__fatbinwrap") + NVModuleID,
+                                FatbinWrapper);
+    }
+
+    // Register binary with CUDA runtime. This is substantially different in
+    // default mode vs. separate compilation!
+    if (!RelocatableDeviceCode) {
+      // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+          RegisterFatbinFunc,
+          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+      llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
+          TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+          llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+      CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+                                     CGM.getPointerAlign());
+
+      // Call __cuda_register_globals(GpuBinaryHandle);
+      if (RegisterGlobalsFunc)
+        CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
+
+      // Save GpuBinaryHandle so we can unregister it in destructor.
+      GpuBinaryHandles.push_back(GpuBinaryHandle);
+    } else {
+      SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
+      RegisterLinkedBinaryName += NVModuleID;
+      llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
+          RegisterLinkedBinaryFnTy, RegisterLinkedBinaryName);
+
+      assert(RegisterGlobalsFunc && "Expecting at least dummy function!");
+      assert(DummyCallback && "Expecting dummy function for second callback!");
+      llvm::Value *Args[] = {RegisterGlobalsFunc, FatbinWrapper,
+                             NVModuleIDConstant, DummyCallback};
+      CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args);
+    }
   }
 
   CtorBuilder.CreateRetVoid();