Index: include/clang/Driver/CC1Options.td
===================================================================
--- include/clang/Driver/CC1Options.td
+++ include/clang/Driver/CC1Options.td
@@ -617,6 +617,8 @@
 def fcuda_disable_target_call_checks : Flag<["-"],
     "fcuda-disable-target-call-checks">,
   HelpText<"Disable all cross-target (host, device, etc.) call checks in CUDA">;
+def fcuda_include_gpubinary : Separate<["-"], "fcuda-include-gpubinary">,
+  HelpText<"Incorporate CUDA device-side binary into host object file.">;
 
 } // let Flags = [CC1Option]
Index: include/clang/Frontend/CodeGenOptions.h
===================================================================
--- include/clang/Frontend/CodeGenOptions.h
+++ include/clang/Frontend/CodeGenOptions.h
@@ -163,6 +163,11 @@
   /// Name of the profile file to use as input for -fprofile-instr-use
   std::string InstrProfileInput;
 
+  /// A list of file names passed with -fcuda-include-gpubinary options to
+  /// forward to the CUDA runtime back-end so they are incorporated into the
+  /// host-side object file.
+  std::vector<std::string> CudaGpuBinaryFileNames;
+
   /// Regular expression to select optimizations for which we should enable
   /// optimization remarks. Transformation passes whose name matches this
   /// expression (and support this feature), will emit a diagnostic
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -20,7 +20,7 @@
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
-#include <vector>
+#include "llvm/IR/Verifier.h"
 
 using namespace clang;
 using namespace CodeGen;
 
@@ -30,29 +30,66 @@
 class CGNVCUDARuntime : public CGCUDARuntime {
 
 private:
-  llvm::Type *IntTy, *SizeTy;
-  llvm::PointerType *CharPtrTy, *VoidPtrTy;
+  llvm::Type *IntTy, *SizeTy, *VoidTy;
+  llvm::PointerType *CharPtrTy, *VoidPtrTy, *VoidPtrPtrTy;
+
+  /// Convenience reference to the LLVM context.
+  llvm::LLVMContext &Context;
+  /// Convenience reference to the current module.
+  llvm::Module &TheModule;
+  /// Keeps track of kernel launch stubs emitted in this module.
+  llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
+  /// Keeps track of variables containing handles of GPU binaries. Populated by
+  /// makeModuleCtorFunction() and used to create corresponding cleanup calls
+  /// in makeModuleDtorFunction().
+  llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
 
+  /// Creates a function to register all kernel stubs generated in this module.
+  llvm::Function *makeRegisterKernelsFn();
+
+  /// Helper function that generates a constant string and returns a pointer to
+  /// the start of the string. The result of this function can be used anywhere
+  /// the C code expects a const char*.
+  llvm::Constant *makeConstantString(const std::string &Str,
+                                     const std::string &Name = "",
+                                     unsigned Alignment = 0) {
+    llvm::Constant *Zeros[] = {llvm::ConstantInt::get(SizeTy, 0),
+                               llvm::ConstantInt::get(SizeTy, 0)};
+    auto *ConstStr = CGM.GetAddrOfConstantCString(Str, Name.c_str());
+    return llvm::ConstantExpr::getGetElementPtr(ConstStr->getValueType(),
+                                                ConstStr, Zeros);
+  }
+
+  void emitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args);
+
 public:
   CGNVCUDARuntime(CodeGenModule &CGM);
 
-  void EmitDeviceStubBody(CodeGenFunction &CGF, FunctionArgList &Args) override;
+  void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) override;
+  /// Creates the module constructor function.
+  llvm::Function *makeModuleCtorFunction() override;
+  /// Creates the module destructor function.
+  llvm::Function *makeModuleDtorFunction() override;
 };
 
 }
 
-CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM) : CGCUDARuntime(CGM) {
+CGNVCUDARuntime::CGNVCUDARuntime(CodeGenModule &CGM)
+    : CGCUDARuntime(CGM), Context(CGM.getLLVMContext()),
+      TheModule(CGM.getModule()) {
   CodeGen::CodeGenTypes &Types = CGM.getTypes();
   ASTContext &Ctx = CGM.getContext();
   IntTy = Types.ConvertType(Ctx.IntTy);
   SizeTy = Types.ConvertType(Ctx.getSizeType());
+  VoidTy = llvm::Type::getVoidTy(Context);
 
   CharPtrTy = llvm::PointerType::getUnqual(Types.ConvertType(Ctx.CharTy));
   VoidPtrTy = cast<llvm::PointerType>(Types.ConvertType(Ctx.VoidPtrTy));
+  VoidPtrPtrTy = VoidPtrTy->getPointerTo();
 }
 
 llvm::Constant *CGNVCUDARuntime::getSetupArgumentFn() const {
@@ -68,14 +105,17 @@
 
 llvm::Constant *CGNVCUDARuntime::getLaunchFn() const {
   // cudaError_t cudaLaunch(char *)
-  std::vector<llvm::Type*> Params;
-  Params.push_back(CharPtrTy);
-  return CGM.CreateRuntimeFunction(llvm::FunctionType::get(IntTy,
-                                                           Params, false),
-                                   "cudaLaunch");
+  return CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, CharPtrTy, false), "cudaLaunch");
 }
 
-void CGNVCUDARuntime::EmitDeviceStubBody(CodeGenFunction &CGF,
+void CGNVCUDARuntime::emitDeviceStub(CodeGenFunction &CGF,
+                                     FunctionArgList &Args) {
+  EmittedKernels.push_back(CGF.CurFn);
+  emitDeviceStubBody(CGF, Args);
+}
+
+void CGNVCUDARuntime::emitDeviceStubBody(CodeGenFunction &CGF,
                                          FunctionArgList &Args) {
   // Build the argument value list and the argument stack struct type.
   SmallVector<llvm::Value *, 16> ArgValues;
@@ -87,8 +127,7 @@
     assert(isa<llvm::PointerType>(V->getType()) && "Arg type not PointerType");
     ArgTypes.push_back(cast<llvm::PointerType>(V->getType())->getElementType());
   }
-  llvm::StructType *ArgStackTy = llvm::StructType::get(
-      CGF.getLLVMContext(), ArgTypes);
+  llvm::StructType *ArgStackTy = llvm::StructType::get(Context, ArgTypes);
 
   llvm::BasicBlock *EndBlock = CGF.createBasicBlock("setup.end");
 
@@ -120,6 +159,163 @@
   CGF.EmitBlock(EndBlock);
 }
 
+/// Creates an internal function to register all kernel stubs generated in
+/// this module with the CUDA runtime.
+/// \code
+/// void __cuda_register_kernels(void **GpuBinaryHandle) {
+///   __cudaRegisterFunction(GpuBinaryHandle, Kernel0, ...);
+///   ...
+///   __cudaRegisterFunction(GpuBinaryHandle, KernelM, ...);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::makeRegisterKernelsFn() {
+  llvm::Function *RegisterKernelsFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, "__cuda_register_kernels", &TheModule);
+  llvm::BasicBlock *EntryBB =
+      llvm::BasicBlock::Create(Context, "entry", RegisterKernelsFunc);
+  CGBuilderTy Builder(Context);
+  Builder.SetInsertPoint(EntryBB);
+
+  // void __cudaRegisterFunction(void **, const char *, char *, const char *,
+  //                             int, uint3*, uint3*, dim3*, dim3*, int*)
+  std::vector<llvm::Type *> RegisterFuncParams = {
+      VoidPtrPtrTy, CharPtrTy, CharPtrTy, CharPtrTy, IntTy,
+      VoidPtrTy,    VoidPtrTy, VoidPtrTy, VoidPtrTy, IntTy->getPointerTo()};
+  llvm::Constant *RegisterFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(IntTy, RegisterFuncParams, false),
+      "__cudaRegisterFunction");
+
+  // Extract the GpuBinaryHandle passed as the first argument to
+  // __cuda_register_kernels() and generate a __cudaRegisterFunction() call
+  // for each emitted kernel.
+  llvm::Argument &GpuBinaryHandlePtr = *RegisterKernelsFunc->arg_begin();
+  for (llvm::Function *Kernel : EmittedKernels) {
+    llvm::Constant *KernelName = makeConstantString(Kernel->getName());
+    llvm::Constant *NullPtr = llvm::ConstantPointerNull::get(VoidPtrTy);
+    llvm::Value *args[] = {
+        &GpuBinaryHandlePtr, Builder.CreateBitCast(Kernel, VoidPtrTy),
+        KernelName, KernelName, llvm::ConstantInt::get(IntTy, -1), NullPtr,
+        NullPtr, NullPtr, NullPtr,
+        llvm::ConstantPointerNull::get(IntTy->getPointerTo())};
+    Builder.CreateCall(RegisterFunc, args);
+  }
+
+  Builder.CreateRetVoid();
+  llvm::verifyFunction(*RegisterKernelsFunc);
+  return RegisterKernelsFunc;
+}
+
+/// Creates a global constructor function for the module:
+/// \code
+/// void __cuda_module_ctor(void*) {
+///   Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
+///   __cuda_register_kernels(Handle0);
+///   ...
+///   HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
+///   __cuda_register_kernels(HandleN);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
+  // void __cuda_register_kernels(void **GpuBinaryHandle);
+  llvm::Function *RegisterKernelsFunc = makeRegisterKernelsFn();
+  // void **__cudaRegisterFatBinary(void *);
+  llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false),
+      "__cudaRegisterFatBinary");
+  // struct { int magic; int version; void *gpu_binary; void *dont_care; };
+  llvm::StructType *FatbinWrapperTy =
+      llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy, nullptr);
+
+  llvm::Function *ModuleCtorFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
+  llvm::BasicBlock *CtorEntryBB =
+      llvm::BasicBlock::Create(Context, "entry", ModuleCtorFunc);
+  CGBuilderTy CtorBuilder(Context);
+
+  CtorBuilder.SetInsertPoint(CtorEntryBB);
+
+  // For each GPU binary, register it with the CUDA runtime, store the returned
+  // handle in a global variable, and save that variable in the GpuBinaryHandles
+  // vector so it can be cleaned up in the destructor on exit. Then associate
+  // all known kernels with the GPU binary handle so the CUDA runtime can
+  // figure out what to call on the GPU side.
+  for (const std::string &GpuBinaryFileName :
+       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
+    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
+        llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
+    if (std::error_code EC = GpuBinaryOrErr.getError()) {
+      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
+                                                        << EC.message();
+      continue;
+    }
+
+    // Create an initialized wrapper structure that points to the loaded GPU
+    // binary.
+    llvm::Constant *Values[] = {
+        llvm::ConstantInt::get(IntTy, 0x466243b1), // Fatbin wrapper magic.
+        llvm::ConstantInt::get(IntTy, 1),          // Fatbin version.
+        makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", 16), // Data.
+        llvm::ConstantPointerNull::get(VoidPtrTy)}; // Unused in fatbin v1.
+    llvm::GlobalVariable *FatbinWrapper = new llvm::GlobalVariable(
+        TheModule, FatbinWrapperTy, true, llvm::GlobalValue::InternalLinkage,
+        llvm::ConstantStruct::get(FatbinWrapperTy, Values),
+        "__cuda_fatbin_wrapper");
+
+    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+        RegisterFatbinFunc,
+        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
+        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+    CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryHandle, false);
+
+    // Call __cuda_register_kernels(GpuBinaryHandle).
+    CtorBuilder.CreateCall(RegisterKernelsFunc, RegisterFatbinCall);
+
+    // Save GpuBinaryHandle so we can unregister it in the destructor.
+    GpuBinaryHandles.push_back(GpuBinaryHandle);
+  }
+
+  CtorBuilder.CreateRetVoid();
+  llvm::verifyFunction(*ModuleCtorFunc);
+  return ModuleCtorFunc;
+}
+
+/// Creates a global destructor function that unregisters all GPU code blobs
+/// registered by the constructor.
+/// \code
+/// void __cuda_module_dtor(void*) {
+///   __cudaUnregisterFatBinary(Handle0);
+///   ...
+///   __cudaUnregisterFatBinary(HandleN);
+/// }
+/// \endcode
+llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
+  // void __cudaUnregisterFatBinary(void **handle);
+  llvm::Constant *UnregisterFatbinFunc = CGM.CreateRuntimeFunction(
+      llvm::FunctionType::get(VoidTy, VoidPtrPtrTy, false),
+      "__cudaUnregisterFatBinary");
+
+  llvm::Function *ModuleDtorFunc = llvm::Function::Create(
+      llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
+      llvm::GlobalValue::InternalLinkage, "__cuda_module_dtor", &TheModule);
+  llvm::BasicBlock *DtorEntryBB =
+      llvm::BasicBlock::Create(Context, "entry", ModuleDtorFunc);
+  CGBuilderTy DtorBuilder(Context);
+  DtorBuilder.SetInsertPoint(DtorEntryBB);
+
+  for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
+    DtorBuilder.CreateCall(UnregisterFatbinFunc,
+                           DtorBuilder.CreateLoad(GpuBinaryHandle, false));
+  }
+
+  DtorBuilder.CreateRetVoid();
+  llvm::verifyFunction(*ModuleDtorFunc);
+  return ModuleDtorFunc;
+}
+
 CGCUDARuntime *CodeGen::CreateNVCUDARuntime(CodeGenModule &CGM) {
   return new CGNVCUDARuntime(CGM);
 }
Index: lib/CodeGen/CGCUDARuntime.h
===================================================================
--- lib/CodeGen/CGCUDARuntime.h
+++ lib/CodeGen/CGCUDARuntime.h
@@ -16,6 +16,10 @@
 #ifndef LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 #define LLVM_CLANG_LIB_CODEGEN_CGCUDARUNTIME_H
 
+namespace llvm {
+class Function;
+}
+
 namespace clang {
 
 class CUDAKernelCallExpr;
@@ -39,10 +43,18 @@
   virtual RValue EmitCUDAKernelCallExpr(CodeGenFunction &CGF,
                                         const CUDAKernelCallExpr *E,
                                         ReturnValueSlot ReturnValue);
-
-  virtual void EmitDeviceStubBody(CodeGenFunction &CGF,
-                                  FunctionArgList &Args) = 0;
+  /// Emits a kernel launch stub for CGF.CurFn and records the kernel so it
+  /// can be registered with the CUDA runtime at module load time.
+  virtual void emitDeviceStub(CodeGenFunction &CGF, FunctionArgList &Args) = 0;
+
+  /// Constructs and returns a module initialization function or nullptr if it
+  /// isn't needed. Must be called after all kernels have been emitted.
+  virtual llvm::Function *makeModuleCtorFunction() = 0;
+
+  /// Returns a module cleanup function or nullptr if it isn't needed.
+  /// Must be called after makeModuleCtorFunction().
+  virtual llvm::Function *makeModuleDtorFunction() = 0;
 };
 
 /// Creates an instance of a CUDA runtime class.
Index: lib/CodeGen/CodeGenFunction.cpp
===================================================================
--- lib/CodeGen/CodeGenFunction.cpp
+++ lib/CodeGen/CodeGenFunction.cpp
@@ -878,7 +878,7 @@
   else if (getLangOpts().CUDA &&
            !getLangOpts().CUDAIsDevice &&
            FD->hasAttr<CUDAGlobalAttr>())
-    CGM.getCUDARuntime().EmitDeviceStubBody(*this, Args);
+    CGM.getCUDARuntime().emitDeviceStub(*this, Args);
   else if (isa<CXXConversionDecl>(FD) &&
            cast<CXXConversionDecl>(FD)->isLambdaToBlockPointerConversion()) {
     // The lambda conversion to block pointer is special; the semantics can't be
Index: lib/CodeGen/CodeGenModule.cpp
===================================================================
--- lib/CodeGen/CodeGenModule.cpp
+++ lib/CodeGen/CodeGenModule.cpp
@@ -350,6 +350,13 @@
   if (ObjCRuntime)
     if (llvm::Function *ObjCInitFunction = ObjCRuntime->ModuleInitFunction())
       AddGlobalCtor(ObjCInitFunction);
+  if (Context.getLangOpts().CUDA && !Context.getLangOpts().CUDAIsDevice &&
+      CUDARuntime) {
+    if (llvm::Function *CudaCtorFunction = CUDARuntime->makeModuleCtorFunction())
+      AddGlobalCtor(CudaCtorFunction);
+    if (llvm::Function *CudaDtorFunction = CUDARuntime->makeModuleDtorFunction())
+      AddGlobalDtor(CudaDtorFunction);
+  }
   if (PGOReader && PGOStats.hasDiagnostics())
     PGOStats.reportDiagnostics(getDiags(), getCodeGenOpts().MainFileName);
   EmitCtorList(GlobalCtors, "llvm.global_ctors");
@@ -3677,4 +3684,3 @@
     CXXGlobalInits.push_back(InitFunction);
   }
 }
-
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -645,6 +645,9 @@
       Args.getAllArgValues(OPT_fsanitize_recover_EQ), Diags,
       Opts.SanitizeRecover);
 
+  Opts.CudaGpuBinaryFileNames =
+      Args.getAllArgValues(OPT_fcuda_include_gpubinary);
+
   return Success;
 }
Index: test/CodeGenCUDA/device-stub.cu
===================================================================
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -1,7 +1,21 @@
-// RUN: %clang_cc1 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -emit-llvm %s -fcuda-include-gpubinary %s -o - | FileCheck %s
 
 #include "Inputs/cuda.h"
 
+// Make sure that all parts of GPU code init/cleanup are there:
+// * constant unnamed string with the kernel name
+// CHECK: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00", align 1
+// * constant unnamed string with GPU binary
+// CHECK: private unnamed_addr constant{{.*}}\00"
+// * constant struct that wraps GPU binary
+// CHECK: @__cuda_fatbin_wrapper = internal constant { i32, i32, i8*, i8* }
+// CHECK:       { i32 1180844977, i32 1, {{.*}}, i64 0, i64 0), i8* null }
+// * variable to save GPU binary handle after initialization
+// CHECK: @__cuda_gpubin_handle = internal global i8** null
+// * Make sure our constructor/destructor was added to global ctor/dtor list.
+// CHECK: @llvm.global_ctors = appending global {{.*}}@__cuda_module_ctor
+// CHECK: @llvm.global_dtors = appending global {{.*}}@__cuda_module_dtor
+
 // Test that we build the correct number of calls to cudaSetupArgument followed
 // by a call to cudaLaunch.
 
@@ -11,3 +25,28 @@
 // CHECK: call{{.*}}cudaSetupArgument
 // CHECK: call{{.*}}cudaLaunch
 __global__ void kernelfunc(int i, int j, int k) {}
+
+// Test that we've built the correct kernel launch sequence.
+// CHECK: define{{.*}}hostfunc
+// CHECK: call{{.*}}cudaConfigureCall
+// CHECK: call{{.*}}kernelfunc
+void hostfunc(void) { kernelfunc<<<1, 1>>>(1, 1, 1); }
+
+// Test that we've built a function to register kernels.
+// CHECK: define internal void @__cuda_register_kernels
+// CHECK: call{{.*}}cudaRegisterFunction(i8** %0, {{.*}}kernelfunc
+
+// Test that we've built the constructor..
+// CHECK: define internal void @__cuda_module_ctor
+// .. that calls __cudaRegisterFatBinary(&__cuda_fatbin_wrapper)
+// CHECK: call{{.*}}cudaRegisterFatBinary{{.*}}__cuda_fatbin_wrapper
+// .. stores the return value in __cuda_gpubin_handle
+// CHECK-NEXT: store{{.*}}__cuda_gpubin_handle
+// .. and then calls __cuda_register_kernels
+// CHECK-NEXT: call void @__cuda_register_kernels
+
+// Test that we've created the destructor.
+// CHECK: define internal void @__cuda_module_dtor
+// CHECK: load{{.*}}__cuda_gpubin_handle
+// CHECK-NEXT: call void @__cudaUnregisterFatBinary
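For readers unfamiliar with the fatbin registration protocol, the IR emitted by makeModuleCtorFunction()/makeModuleDtorFunction() above corresponds roughly to the host-side C++ below. This is a minimal sketch for a single -fcuda-include-gpubinary file and a single kernel; the __cudaRegister* prototypes and the wrapper layout mirror the ones used in this patch, while the blob contents, variable names, and hand-written calls are illustrative only and are not part of the change.

// Sketch only: approximate source-level equivalent of the generated
// __cuda_module_ctor / __cuda_register_kernels / __cuda_module_dtor.
extern "C" void **__cudaRegisterFatBinary(void *FatCubin);
extern "C" void __cudaUnregisterFatBinary(void **Handle);
extern "C" int __cudaRegisterFunction(void **Handle, const char *HostFun,
                                      char *DeviceFun, const char *DeviceName,
                                      int ThreadLimit, void *Tid, void *Bid,
                                      void *BlockDim, void *GridDim,
                                      int *WordSize);
void kernelfunc(int, int, int); // host-side launch stub emitted by clang

namespace {
// Matches FatbinWrapperTy: { int, int, void *, void * }.
struct FatbinWrapper {
  int Magic;        // 0x466243b1, fatbin wrapper magic
  int Version;      // 1
  const void *Data; // GPU binary loaded from the -fcuda-include-gpubinary file
  void *Unused;     // unused in fatbin v1
};

const char GpuBinaryBlob[] = "<contents of the GPU binary file>";
FatbinWrapper TheWrapper = {0x466243b1, 1, GpuBinaryBlob, nullptr};
void **GpubinHandle = nullptr;

// One __cudaRegisterFunction() call per kernel stub emitted in this module;
// the host-side stub address doubles as the kernel's handle.
void RegisterKernels(void **GpuBinaryHandle) {
  __cudaRegisterFunction(GpuBinaryHandle, (const char *)&kernelfunc,
                         const_cast<char *>("kernelfunc"), "kernelfunc",
                         /*ThreadLimit=*/-1, nullptr, nullptr, nullptr, nullptr,
                         nullptr);
}

// Runs from llvm.global_ctors: register the GPU binary, then the kernels.
void ModuleCtor(void *) {
  GpubinHandle = __cudaRegisterFatBinary(&TheWrapper);
  RegisterKernels(GpubinHandle);
}

// Runs from llvm.global_dtors: unregister the GPU binary on exit.
void ModuleDtor(void *) { __cudaUnregisterFatBinary(GpubinHandle); }
} // namespace

Note that the RUN line in the test above passes the source file itself as a stand-in GPU binary; any readable file is enough to exercise this registration path.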