Index: lib/CodeGen/CGCUDANV.cpp
===================================================================
--- lib/CodeGen/CGCUDANV.cpp
+++ lib/CodeGen/CGCUDANV.cpp
@@ -309,12 +309,24 @@
 }

 /// Creates a global constructor function for the module:
+///
+/// For CUDA:
 /// \code
 /// void __cuda_module_ctor(void*) {
 ///     Handle = __cudaRegisterFatBinary(GpuBinaryBlob);
 ///     __cuda_register_globals(Handle);
 /// }
 /// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_ctor(void*) {
+///     if (__hip_gpubin_handle == 0) {
+///         __hip_gpubin_handle = __hipRegisterFatBinary(GpuBinaryBlob);
+///         __hip_register_globals(__hip_gpubin_handle);
+///     }
+/// }
+/// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() {
   bool IsHIP = CGM.getLangOpts().HIP;
   // No need to generate ctors/dtors if there is no GPU binary.
@@ -427,22 +439,68 @@
                                        /*constant*/ true);
   FatbinWrapper->setSection(FatbinSectionName);

-  // Register binary with CUDA/HIP runtime. This is substantially different in
-  // default mode vs. separate compilation!
-  if (!RelocatableDeviceCode) {
-    // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper);
+  // There is only one HIP fat binary per linked module; however, there are
+  // multiple constructor functions. Make sure the fat binary is registered
+  // only once. The constructor functions are executed by the dynamic loader
+  // before the program gains control. The dynamic loader cannot execute the
+  // constructor functions concurrently, since doing so would not guarantee
+  // thread safety of the loaded program. Therefore we can assume sequential
+  // execution of the constructor functions here.
+  if (IsHIP) {
+    llvm::BasicBlock *IfBlock =
+        llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc);
+    llvm::BasicBlock *ExitBlock =
+        llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc);
+    // The name, size, and initialization pattern of this variable are part
+    // of the HIP ABI.
+    GpuBinaryHandle = new llvm::GlobalVariable(
+        TheModule, VoidPtrPtrTy, /*isConstant=*/false,
+        llvm::GlobalValue::LinkOnceAnyLinkage,
+        /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy),
+        "__hip_gpubin_handle");
+    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
+    Address GpuBinaryAddr(
+        GpuBinaryHandle,
+        CharUnits::fromQuantity(GpuBinaryHandle->getAlignment()));
+    {
+      auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+      llvm::Constant *Zero =
+          llvm::Constant::getNullValue(HandleValue->getType());
+      llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero);
+      CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock);
+    }
+    {
+      CtorBuilder.SetInsertPoint(IfBlock);
+      // GpuBinaryHandle = __hipRegisterFatBinary(&FatbinWrapper);
+      llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
+          RegisterFatbinFunc,
+          CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
+      CtorBuilder.CreateStore(RegisterFatbinCall, GpuBinaryAddr);
+      CtorBuilder.CreateBr(ExitBlock);
+    }
+    {
+      CtorBuilder.SetInsertPoint(ExitBlock);
+      // Call __hip_register_globals(GpuBinaryHandle);
+      if (RegisterGlobalsFunc) {
+        auto HandleValue = CtorBuilder.CreateLoad(GpuBinaryAddr);
+        CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue);
+      }
+    }
+  } else if (!RelocatableDeviceCode) {
+    // Register binary with CUDA runtime. This is substantially different in
+    // default mode vs. separate compilation!
+    // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper);
     llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall(
         RegisterFatbinFunc,
         CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy));
     GpuBinaryHandle = new llvm::GlobalVariable(
         TheModule, VoidPtrPtrTy, false, llvm::GlobalValue::InternalLinkage,
-        llvm::ConstantPointerNull::get(VoidPtrPtrTy),
-        addUnderscoredPrefixToName("_gpubin_handle"));
-
+        llvm::ConstantPointerNull::get(VoidPtrPtrTy), "__cuda_gpubin_handle");
+    GpuBinaryHandle->setAlignment(CGM.getPointerAlign().getQuantity());
     CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle,
                                    CGM.getPointerAlign());
-    // Call __{cuda|hip}_register_globals(GpuBinaryHandle);
+    // Call __cuda_register_globals(GpuBinaryHandle);
     if (RegisterGlobalsFunc)
       CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall);
   } else {
@@ -453,15 +511,13 @@
     llvm::Constant *ModuleIDConstant =
         makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32);

-    // Create an alias for the FatbinWrapper that nvcc or hip backend will
-    // look for.
+    // Create an alias for the FatbinWrapper that nvcc will look for.
     llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage,
                               Twine("__fatbinwrap") + ModuleID, FatbinWrapper);

-    // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
+    // void __cudaRegisterLinkedBinary%ModuleID%(void (*)(void *), void *,
     //                                           void *, void (*)(void **))
-    SmallString<128> RegisterLinkedBinaryName(
-        addUnderscoredPrefixToName("RegisterLinkedBinary"));
+    SmallString<128> RegisterLinkedBinaryName("__cudaRegisterLinkedBinary");
     RegisterLinkedBinaryName += ModuleID;
     llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction(
         getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName);
@@ -493,11 +549,23 @@

 /// Creates a global destructor function that unregisters the GPU code blob
 /// registered by constructor.
+///
+/// For CUDA:
 /// \code
 /// void __cuda_module_dtor(void*) {
 ///     __cudaUnregisterFatBinary(Handle);
 /// }
 /// \endcode
+///
+/// For HIP:
+/// \code
+/// void __hip_module_dtor(void*) {
+///     if (__hip_gpubin_handle) {
+///         __hipUnregisterFatBinary(__hip_gpubin_handle);
+///         __hip_gpubin_handle = 0;
+///     }
+/// }
+/// \endcode
 llvm::Function *CGNVCUDARuntime::makeModuleDtorFunction() {
   // No need for destructor if we don't have a handle to unregister.
   if (!GpuBinaryHandle)
@@ -518,10 +586,30 @@
   CGBuilderTy DtorBuilder(CGM, Context);
   DtorBuilder.SetInsertPoint(DtorEntryBB);

-  auto HandleValue =
-      DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign());
-  DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+  Address GpuBinaryAddr(GpuBinaryHandle, CharUnits::fromQuantity(
+                                             GpuBinaryHandle->getAlignment()));
+  auto HandleValue = DtorBuilder.CreateLoad(GpuBinaryAddr);
+  // There is only one HIP fat binary per linked module; however, there are
+  // multiple destructor functions. Make sure the fat binary is unregistered
+  // only once.
+  if (CGM.getLangOpts().HIP) {
+    llvm::BasicBlock *IfBlock =
+        llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc);
+    llvm::BasicBlock *ExitBlock =
+        llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc);
+    llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType());
+    llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero);
+    DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock);
+
+    DtorBuilder.SetInsertPoint(IfBlock);
+    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+    DtorBuilder.CreateStore(Zero, GpuBinaryAddr);
+    DtorBuilder.CreateBr(ExitBlock);
+
+    DtorBuilder.SetInsertPoint(ExitBlock);
+  } else {
+    DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue);
+  }
   DtorBuilder.CreateRetVoid();
   return ModuleDtorFunc;
 }
Index: test/CodeGenCUDA/device-stub.cu
===================================================================
--- test/CodeGenCUDA/device-stub.cu
+++ test/CodeGenCUDA/device-stub.cu
@@ -19,7 +19,7 @@
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \
 // RUN:     -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \
-// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,RDC,HIP,HIPRDC
+// RUN:   | FileCheck -allow-deprecated-dag-overlap %s --check-prefixes=ALL,NORDC,HIP
 // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\
 // RUN:   | FileCheck -allow-deprecated-dag-overlap %s -check-prefix=NOGPUBIN
@@ -79,11 +79,11 @@
 // CUDA-SAME: section ".nvFatBinSegment"
 // HIP-SAME: section ".hipFatBinSegment"
 // * variable to save GPU binary handle after initialization
-// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
+// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null
+// HIP: @__[[PREFIX]]_gpubin_handle = linkonce global i8** null
 // * constant unnamed string with NVModuleID
 // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant
 // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32
-// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32
 // * Make sure our constructor was added to global ctor list.
 // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor
 // * Alias to global symbol containing the NVModuleID.
@@ -120,10 +120,18 @@
 // ALL: define internal void @__[[PREFIX]]_module_ctor
 // In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper)
+// HIP registers the fat binary only once.
+// HIP: load i8**, i8*** @__hip_gpubin_handle
+// HIP-NEXT: icmp eq i8** {{.*}}, null
+// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
+// HIP: if:
 // NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper
 // .. stores return value in __[[PREFIX]]_gpubin_handle
 // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle
 // .. and then calls __[[PREFIX]]_register_globals
+// HIP-NEXT: br label %exit
+// HIP: exit:
+// HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle
 // NORDC-NEXT: call void @__[[PREFIX]]_register_globals
 // * In separate mode we also register a destructor.
 // NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor)
@@ -136,7 +144,14 @@

 // Test that we've created destructor.
 // NORDC: define internal void @__[[PREFIX]]_module_dtor
 // NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle
-// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// HIP-NEXT: icmp ne i8** {{.*}}, null
+// HIP-NEXT: br i1 {{.*}}, label %if, label %exit
+// HIP: if:
+// HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary
+// HIP-NEXT: store i8** null, i8*** @__hip_gpubin_handle
+// HIP-NEXT: br label %exit
+// HIP: exit:
 // There should be no __[[PREFIX]]_register_globals if we have no
 // device-side globals, but we still need to register GPU binary.
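
For readers who want the net effect of the emitted __hip_module_ctor/__hip_module_dtor pairs without tracing the IRBuilder calls, here is a minimal stand-alone C++ sketch (not part of the patch). The stub and helper names below are illustrative only; the real entry points are __hipRegisterFatBinary, __hip_register_globals, and __hipUnregisterFatBinary, and the shared global is the linkonce __hip_gpubin_handle generated above.

#include <cstdio>

// Stand-ins for the HIP runtime entry points; the generated code calls
// __hipRegisterFatBinary / __hipUnregisterFatBinary instead.
static void *FatbinSlot = nullptr;
static void **RegisterFatBinaryStub(void *Wrapper) {
  std::printf("register fat binary %p\n", Wrapper);
  FatbinSlot = Wrapper;
  return &FatbinSlot;
}
static void UnregisterFatBinaryStub(void **Handle) {
  std::printf("unregister fat binary handle %p\n", static_cast<void *>(Handle));
}
static void RegisterGlobalsStub(void **Handle) {
  std::printf("register this TU's globals with handle %p\n",
              static_cast<void *>(Handle));
}

// Models the linkonce global __hip_gpubin_handle shared by every translation
// unit's constructor and destructor; linkonce linkage collapses the copies
// into a single definition at link time.
static void **GpubinHandle = nullptr;

// What each emitted __hip_module_ctor does. The dynamic loader runs the
// constructors sequentially, so a plain null check suffices: only the first
// constructor registers the fat binary.
static void ModuleCtor(void *FatbinWrapper) {
  if (GpubinHandle == nullptr)
    GpubinHandle = RegisterFatBinaryStub(FatbinWrapper);
  // Every constructor still registers its own translation unit's globals
  // against the now non-null shared handle (__hip_register_globals).
  RegisterGlobalsStub(GpubinHandle);
}

// What each emitted __hip_module_dtor does: only the first destructor to run
// unregisters the binary and clears the handle; later ones see null and
// do nothing.
static void ModuleDtor() {
  if (GpubinHandle != nullptr) {
    UnregisterFatBinaryStub(GpubinHandle);
    GpubinHandle = nullptr;
  }
}

int main() {
  int FakeWrapper = 0;
  ModuleCtor(&FakeWrapper); // first TU: registers the fat binary
  ModuleCtor(&FakeWrapper); // second TU: skips registration, adds its globals
  ModuleDtor();             // first dtor: unregisters and clears the handle
  ModuleDtor();             // second dtor: no-op
}

This mirrors why no locking is needed in the generated code: constructors and destructors are executed sequentially by the dynamic loader, so the null check on the shared handle is enough to guarantee once-only registration and unregistration per linked module.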