Index: include/clang/Frontend/CodeGenOptions.h
===================================================================
--- include/clang/Frontend/CodeGenOptions.h
+++ include/clang/Frontend/CodeGenOptions.h
@@ -205,10 +205,9 @@
   /// the summary and module symbol table (and not, e.g. any debug metadata).
   std::string ThinLinkBitcodeFile;
 
-  /// A list of file names passed with -fcuda-include-gpubinary options to
-  /// forward to CUDA runtime back-end for incorporating them into host-side
-  /// object file.
-  std::vector<std::string> CudaGpuBinaryFileNames;
+  /// Name of the file passed with -fcuda-include-gpubinary option to forward
+  /// to CUDA runtime back-end for incorporating it into host-side object file.
+  std::string CudaGpuBinaryFileName;
 
   /// The name of the file to which the backend should save YAML optimization
   /// records.
Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -41,10 +41,10 @@
   /// Keeps track of kernel launch stubs emitted in this module
   llvm::SmallVector<llvm::Function *, 16> EmittedKernels;
   llvm::SmallVector<std::pair<llvm::GlobalVariable *, unsigned>, 16> DeviceVars;
-  /// Keeps track of variables containing handles of GPU binaries. Populated by
+  /// Keeps track of the variable holding the GPU binary handle. Populated by
   /// ModuleCtorFunction() and used to create corresponding cleanup calls in
   /// ModuleDtorFunction()
-  llvm::SmallVector<llvm::GlobalVariable *, 16> GpuBinaryHandles;
+  llvm::GlobalVariable *GpuBinaryHandle = nullptr;
 
   llvm::Constant *getSetupArgumentFn() const;
   llvm::Constant *getLaunchFn() const;
@@ -245,16 +245,14 @@
 /// Creates a global constructor function for the module:
 /// \code
 /// void __cuda_module_ctor(void*) {
-///     Handle0 = __cudaRegisterFatBinary(GpuBinaryBlob0);
-///     __cuda_register_globals(Handle0);
-///     ...
-///     HandleN = __cudaRegisterFatBinary(GpuBinaryBlobN);
-///     __cuda_register_globals(HandleN);
+///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
+///     __cuda_register_globals(Handle);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
-  // No need to generate ctors/dtors if there are no GPU binaries.
-  if (CGM.getCodeGenOpts().CudaGpuBinaryFileNames.empty())
+  // No need to generate ctors/dtors if there is no GPU binary.
+  std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName;
+  if (GpuBinaryFileName.empty())
     return nullptr;
 
   // void __cuda_register_globals(void* handle);
@@ -267,6 +265,18 @@
   llvm::StructType *FatbinWrapperTy =
       llvm::StructType::get(IntTy, IntTy, VoidPtrTy, VoidPtrTy);
 
+  // Register the GPU binary with the CUDA runtime, store the returned handle
+  // in a global variable and keep a reference to it in GpuBinaryHandle so it
+  // can be unregistered in the destructor on exit. Then associate all known
+  // kernels with the handle so the CUDA runtime knows what to call on the GPU.
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
+      llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
+  if (std::error_code EC = GpuBinaryOrErr.getError()) {
+    CGM.getDiags().Report(diag::err_cannot_open_file)
+        << GpuBinaryFileName << EC.message();
+    return nullptr;
+  }
+
   llvm::Function *ModuleCtorFunc = llvm::Function::Create(
       llvm::FunctionType::get(VoidTy, VoidPtrTy, false),
       llvm::GlobalValue::InternalLinkage, "__cuda_module_ctor", &TheModule);
@@ -276,79 +286,56 @@
 
   CtorBuilder.SetInsertPoint(CtorEntryBB);
 
-  // For each GPU binary, register it with the CUDA runtime and store returned
-  // handle in a global variable and save the handle in GpuBinaryHandles vector
-  // to be cleaned up in destructor on exit. Then associate all known kernels
-  // with the GPU binary handle so CUDA runtime can figure out what to call on
-  // the GPU side.
-  for (const std::string &GpuBinaryFileName :
-       CGM.getCodeGenOpts().CudaGpuBinaryFileNames) {
-    llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> GpuBinaryOrErr =
-        llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName);
-    if (std::error_code EC = GpuBinaryOrErr.getError()) {
-      CGM.getDiags().Report(diag::err_cannot_open_file) << GpuBinaryFileName
-                                                        << EC.message();
-      continue;
-    }
-
-    const char *FatbinConstantName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
-    // NVIDIA's cuobjdump looks for fatbins in this section.
-    const char *FatbinSectionName =
-        CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
-
-    // Create initialized wrapper structure that points to the loaded GPU binary
-    ConstantInitBuilder Builder(CGM);
-    auto Values = Builder.beginStruct(FatbinWrapperTy);
-    // Fatbin wrapper magic.
-    Values.addInt(IntTy, 0x466243b1);
-    // Fatbin version.
-    Values.addInt(IntTy, 1);
-    // Data.
-    Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(),
-                                  "", FatbinConstantName, 8));
-    // Unused in fatbin v1.
-    Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
-    llvm::GlobalVariable *FatbinWrapper =
-        Values.finishAndCreateGlobal("__cuda_fatbin_wrapper",
-                                     CGM.getPointerAlign(),
-                                     /*constant*/ true);
-    FatbinWrapper->setSection(FatbinSectionName);
-
-    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
-    llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
-        RegisterFatbinFunc,
-        CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
-    llvm::GlobalVariable *GpuBinaryHandle = new llvm::GlobalVariable(
-        TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
-    CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
-                                   CGM.getPointerAlign());
-
-    // Call __cuda_register_globals(GpuBinaryHandle);
-    if (RegisterGlobalsFunc)
-      CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
-
-    // Save GpuBinaryHandle so we can unregister it in destructor.
-    GpuBinaryHandles.push_back(GpuBinaryHandle);
-  }
+  const char *FatbinConstantName =
+      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin";
+  // NVIDIA's cuobjdump looks for fatbins in this section.
+  const char *FatbinSectionName =
+      CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment";
+
+  // Create initialized wrapper structure that points to the loaded GPU binary
+  ConstantInitBuilder Builder(CGM);
+  auto Values = Builder.beginStruct(FatbinWrapperTy);
+  // Fatbin wrapper magic.
+  Values.addInt(IntTy, 0x466243b1);
+  // Fatbin version.
+  Values.addInt(IntTy, 1);
+  // Data.
+  Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "",
+                                FatbinConstantName, 8));
+  // Unused in fatbin v1.
+  Values.add(llvm::ConstantPointerNull::get(VoidPtrTy));
+  llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal(
+      "__cuda_fatbin_wrapper", CGM.getPointerAlign(),
+      /*constant*/ true);
+  FatbinWrapper->setSection(FatbinSectionName);
+
+  // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
+  llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+      RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+  GpuBinaryHandle = new llvm::GlobalVariable(
+      TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
+      llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+  CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
+                                 CGM.getPointerAlign());
+
+  // Call __cuda_register_globals(GpuBinaryHandle);
+  if (RegisterGlobalsFunc)
+    CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
 
   CtorBuilder.CreateRetVoid();
   return ModuleCtorFunc;
 }
 
-/// Creates a global destructor function that unregisters all GPU code blobs
+/// Creates a global destructor function that unregisters the GPU code blob
 /// registered by constructor.
 /// \code
 /// void __cuda_module_dtor(void*) {
-///     __cudaUnregisterFatBinary(Handle0);
-///     ...
-///     __cudaUnregisterFatBinary(HandleN);
+///     __cudaUnregisterFatBinary(Handle);
 /// }
 /// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
-  // No need for destructor if we don't have handles to unregister.
-  if (GpuBinaryHandles.empty())
+  // No need for destructor if we don't have a handle to unregister.
+  if (!GpuBinaryHandle)
     return nullptr;
 
   // void __cudaUnregisterFatBinary(void ** handle);
@@ -364,11 +351,9 @@
   CGBuilderTy DtorBuilder(CGM, Context);
   DtorBuilder.SetInsertPoint(DtorEntryBB);
 
-  for (llvm::GlobalVariable *GpuBinaryHandle : GpuBinaryHandles) {
-    auto HandleValue =
+  auto HandleValue =
       DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
-    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
-  }
+  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
 
   DtorBuilder.CreateRetVoid();
   return ModuleDtorFunc;
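
Reviewer note, not part of the patch: for readers unfamiliar with the CUDA
registration machinery, the constructor/destructor pair that CGCUDANV.cpp emits
after this change corresponds roughly to the hand-written host code below. This
is only a sketch: the FatbinWrapper struct mirrors the {i32, i32, i8*, i8*}
FatbinWrapperTy built above, cudaModuleCtor/cudaModuleDtor and registerGlobals
are illustrative stand-ins for the generated __cuda_module_ctor,
__cuda_module_dtor and __cuda_register_globals, and only __cudaRegisterFatBinary
and __cudaUnregisterFatBinary are the actual CUDA runtime entry points the
generated code calls.

// Sketch only; approximates what makeModuleCtorFunction() and
// makeModuleDtorFunction() emit for a module with a single embedded GPU binary.
extern "C" void **__cudaRegisterFatBinary(void *Fatbin);
extern "C" void __cudaUnregisterFatBinary(void **Handle);

struct FatbinWrapper {      // Mirrors FatbinWrapperTy: {i32, i32, i8*, i8*}.
  int Magic;                // 0x466243b1, the fatbin wrapper magic.
  int Version;              // 1.
  const void *Data;         // The fatbin blob placed in .nv_fatbin.
  void *Unused;             // Unused in fatbin v1.
};

static FatbinWrapper Wrapper = {0x466243b1, 1, /*Data=*/nullptr, nullptr};
static void **GpuBinaryHandle;          // Counterpart of __cuda_gpubin_handle.

static void cudaModuleCtor(void *) {    // __cuda_module_ctor
  GpuBinaryHandle = __cudaRegisterFatBinary(&Wrapper);
  // __cuda_register_globals(GpuBinaryHandle) runs here: every kernel stub and
  // device variable in the module gets associated with this one handle.
}

static void cudaModuleDtor(void *) {    // __cuda_module_dtor
  __cudaUnregisterFatBinary(GpuBinaryHandle);
}
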
Index: lib/Driver/ToolChains/Clang.cpp
===================================================================
--- lib/Driver/ToolChains/Clang.cpp
+++ lib/Driver/ToolChains/Clang.cpp
@@ -4661,13 +4661,12 @@
   }
 
   if (IsCuda) {
-    // Host-side cuda compilation receives device-side outputs as Inputs[1...].
-    // Include them with -fcuda-include-gpubinary.
+    // Host-side cuda compilation receives all device-side outputs in a single
+    // fatbin as Inputs[1]. Include the binary with -fcuda-include-gpubinary.
     if (Inputs.size() > 1) {
-      for (auto I = std::next(Inputs.begin()), E = Inputs.end(); I != E; ++I) {
-        CmdArgs.push_back("-fcuda-include-gpubinary");
-        CmdArgs.push_back(I->getFilename());
-      }
+      assert(Inputs.size() == 2 && "More than one GPU binary!");
+      CmdArgs.push_back("-fcuda-include-gpubinary");
+      CmdArgs.push_back(Inputs[1].getFilename());
     }
 
     if (Args.hasFlag(options::OPT_fcuda_rdc, options::OPT_fno_cuda_rdc, false))
Index: lib/Frontend/CompilerInvocation.cpp
===================================================================
--- lib/Frontend/CompilerInvocation.cpp
+++ lib/Frontend/CompilerInvocation.cpp
@@ -1045,8 +1045,8 @@
                       Args.getAllArgValues(OPT_fsanitize_trap_EQ), Diags,
                       Opts.SanitizeTrap);
 
-  Opts.CudaGpuBinaryFileNames =
-      Args.getAllArgValues(OPT_fcuda_include_gpubinary);
+  Opts.CudaGpuBinaryFileName =
+      Args.getLastArgValue(OPT_fcuda_include_gpubinary);
 
   Opts.Backchain = Args.hasArg(OPT_mbackchain);
Index: test/Driver/cuda-options.cu
===================================================================
--- test/Driver/cuda-options.cu
+++ test/Driver/cuda-options.cu
@@ -73,11 +73,10 @@
 // and that all results are included on the host side.
 // RUN: %clang -### -target x86_64-linux-gnu \
 // RUN:   --cuda-gpu-arch=sm_35 --cuda-gpu-arch=sm_30 -c %s 2>&1 \
-// RUN: | FileCheck -check-prefix DEVICE -check-prefix DEVICE-NOSAVE \
-// RUN:    -check-prefix DEVICE2 -check-prefix DEVICE-SM30 \
-// RUN:    -check-prefix DEVICE2-SM35 -check-prefix HOST \
-// RUN:    -check-prefix HOST-NOSAVE -check-prefix INCLUDES-DEVICE \
-// RUN:    -check-prefix NOLINK %s
+// RUN: | FileCheck -check-prefixes DEVICE,DEVICE-NOSAVE,DEVICE2 \
+// RUN:    -check-prefixes DEVICE-SM30,DEVICE2-SM35 \
+// RUN:    -check-prefixes INCLUDES-DEVICE,INCLUDES-DEVICE2 \
+// RUN:    -check-prefixes HOST,HOST-NOSAVE,NOLINK %s
 
 // Verify that device-side results are passed to the correct tool when
 // -save-temps is used.
@@ -182,9 +181,15 @@
 // DEVICE2-SAME: "-aux-triple" "x86_64--linux-gnu"
 // DEVICE2-SAME: "-fcuda-is-device"
 // DEVICE2-SM35-SAME: "-target-cpu" "sm_35"
-// DEVICE2-SAME: "-o" "[[GPUBINARY2:[^"]*]]"
+// DEVICE2-SAME: "-o" "[[PTXFILE2:[^"]*]]"
 // DEVICE2-SAME: "-x" "cuda"
 
+// Match another call to ptxas.
+// DEVICE2: ptxas
+// DEVICE2-SM35-DAG: "--gpu-name" "sm_35"
+// DEVICE2-DAG: "--output-file" "[[CUBINFILE2:[^"]*]]"
+// DEVICE2-DAG: "[[PTXFILE2]]"
+
 // Match no device-side compilation.
 // NODEVICE-NOT: "-cc1" "-triple" "nvptx64-nvidia-cuda"
 // NODEVICE-NOT: "-fcuda-is-device"
@@ -193,6 +198,8 @@
 // INCLUDES-DEVICE-DAG: "--create" "[[FATBINARY:[^"]*]]"
 // INCLUDES-DEVICE-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE]]"
 // INCLUDES-DEVICE-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE]]"
+// INCLUDES-DEVICE2-DAG: "--image=profile=sm_{{[0-9]+}},file=[[CUBINFILE2]]"
+// INCLUDES-DEVICE2-DAG: "--image=profile=compute_{{[0-9]+}},file=[[PTXFILE2]]"
 
 // Match host-side preprocessor job with -save-temps.
 // HOST-SAVE: "-cc1" "-triple" "x86_64--linux-gnu"
@@ -207,7 +214,11 @@
 // HOST-SAME: "-o" "[[HOSTOUTPUT:[^"]*]]"
 // HOST-NOSAVE-SAME: "-x" "cuda"
 // HOST-SAVE-SAME: "-x" "cuda-cpp-output"
+// There is only one GPU binary after combining it with fatbinary!
+// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
 // INCLUDES-DEVICE-SAME: "-fcuda-include-gpubinary" "[[FATBINARY]]"
+// There is only one GPU binary after combining it with fatbinary.
+// INCLUDES-DEVICE2-NOT: "-fcuda-include-gpubinary"
 
 // Match external assembler that uses compilation output.
 // HOST-AS: "-o" "{{.*}}.o" "[[HOSTOUTPUT]]"
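
Reviewer note, not part of the patch: with the driver change above, the
host-side cc1 sees exactly one -fcuda-include-gpubinary, pointing at the fatbin
that fatbinary assembled from the per-arch ptxas and PTX outputs (the --image
entries checked by INCLUDES-DEVICE and INCLUDES-DEVICE2). Everything that
single handle is later used for goes through the generated
__cuda_register_globals; a rough hand-written approximation of that helper is
sketched below. The prototype of __cudaRegisterFunction is an assumption based
on the CUDA runtime's internal registration interface and is shown only to make
the flow concrete; registerGlobals, kernel_stub and kernel are illustrative
names.

// Sketch only; approximates the generated __cuda_register_globals(Handle).
extern "C" void __cudaRegisterFunction(void **FatCubinHandle,
                                       const char *HostFun, char *DeviceFun,
                                       const char *DeviceName, int ThreadLimit,
                                       void *Tid, void *Bid, void *BlockDim,
                                       void *GridDim, int *WarpSize);

static void registerGlobals(void **GpuBinaryHandle) {
  // In the generated code, HostFun is the address of the kernel's host-side
  // stub; string literals stand in here to keep the sketch self-contained.
  __cudaRegisterFunction(GpuBinaryHandle, "kernel_stub", (char *)"kernel",
                         "kernel", -1, nullptr, nullptr, nullptr, nullptr,
                         nullptr);
}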