Index: lib/CodeGen/CGCUDANV.cpp =================================================================== --- lib/CodeGen/CGCUDANV.cpp +++ lib/CodeGen/CGCUDANV.cpp @@ -427,9 +427,42 @@ /*constant*/ true); FatbinWrapper->setSection(FatbinSectionName); - // Register binary with CUDA/HIP runtime. This is substantially different in - // default mode vs. separate compilation! - if (!RelocatableDeviceCode) { + // There is only one HIP fat binary per linked module, however there are + // multiple constructor functions. Make sure the fat binary is registered + // only once. + if (IsHIP) { + llvm::BasicBlock *IfBlock = + llvm::BasicBlock::Create(Context, "if", ModuleCtorFunc); + llvm::BasicBlock *ExitBlock = + llvm::BasicBlock::Create(Context, "exit", ModuleCtorFunc); + GpuBinaryHandle = new llvm::GlobalVariable( + TheModule, VoidPtrPtrTy, /*isConstant=*/false, + llvm::GlobalValue::LinkOnceAnyLinkage, + /*Initializer=*/llvm::ConstantPointerNull::get(VoidPtrPtrTy), + addUnderscoredPrefixToName("_gpubin_handle")); + auto HandleValue = + CtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); + llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); + llvm::Value *EQZero = CtorBuilder.CreateICmpEQ(HandleValue, Zero); + CtorBuilder.CreateCondBr(EQZero, IfBlock, ExitBlock); + CtorBuilder.SetInsertPoint(IfBlock); + // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper); + llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( + RegisterFatbinFunc, + CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); + CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, + CGM.getPointerAlign()); + CtorBuilder.CreateBr(ExitBlock); + CtorBuilder.SetInsertPoint(ExitBlock); + // Call __{cuda|hip}_register_globals(GpuBinaryHandle); + if (RegisterGlobalsFunc) { + auto HandleValue = + CtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); + CtorBuilder.CreateCall(RegisterGlobalsFunc, HandleValue); + } + } else if 
(!RelocatableDeviceCode) { + // Register binary with CUDA/HIP runtime. This is substantially different in + // default mode vs. separate compilation! // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper); llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( RegisterFatbinFunc, @@ -520,8 +553,26 @@ auto HandleValue = DtorBuilder.CreateAlignedLoad(GpuBinaryHandle, CGM.getPointerAlign()); - DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); - + // There is only one HIP fat binary per linked module, however there are + // multiple destructor functions. Make sure the fat binary is unregistered + // only once. + if (CGM.getLangOpts().HIP) { + llvm::BasicBlock *IfBlock = + llvm::BasicBlock::Create(Context, "if", ModuleDtorFunc); + llvm::BasicBlock *ExitBlock = + llvm::BasicBlock::Create(Context, "exit", ModuleDtorFunc); + llvm::Constant *Zero = llvm::Constant::getNullValue(HandleValue->getType()); + llvm::Value *NEZero = DtorBuilder.CreateICmpNE(HandleValue, Zero); + DtorBuilder.CreateCondBr(NEZero, IfBlock, ExitBlock); + DtorBuilder.SetInsertPoint(IfBlock); + DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); + DtorBuilder.CreateAlignedStore(Zero, GpuBinaryHandle, + CGM.getPointerAlign()); + DtorBuilder.CreateBr(ExitBlock); + DtorBuilder.SetInsertPoint(ExitBlock); + } else { + DtorBuilder.CreateCall(UnregisterFatbinFunc, HandleValue); + } DtorBuilder.CreateRetVoid(); return ModuleDtorFunc; } Index: test/CodeGenCUDA/device-stub.cu =================================================================== --- test/CodeGenCUDA/device-stub.cu +++ test/CodeGenCUDA/device-stub.cu @@ -19,7 +19,7 @@ // RUN: | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \ -// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC +// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,HIP // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s 
-o - -x hip\ // RUN: | FileCheck %s -check-prefix=NOGPUBIN @@ -79,11 +79,11 @@ // CUDA-SAME: section ".nvFatBinSegment" // HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization -// NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null +// CUDANORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null +// HIP: @__[[PREFIX]]_gpubin_handle = linkonce global i8** null // * constant unnamed string with NVModuleID // RDC: [[MODULE_ID_GLOBAL:@.*]] = private constant // CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 -// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32 // * Make sure our constructor was added to global ctor list. // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor // * Alias to global symbol containing the NVModuleID. @@ -120,10 +120,18 @@ // ALL: define internal void @__[[PREFIX]]_module_ctor // In separate mode it calls __[[PREFIX]]RegisterFatBinary(&__[[PREFIX]]_fatbin_wrapper) +// HIP registers the fat binary only once. +// HIP: load i8**, i8*** @__hip_gpubin_handle +// HIP-NEXT: icmp eq i8** {{.*}}, null +// HIP-NEXT: br i1 {{.*}}, label %if, label %exit +// HIP: if: // NORDC: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper // .. stores return value in __[[PREFIX]]_gpubin_handle // NORDC-NEXT: store{{.*}}__[[PREFIX]]_gpubin_handle // .. and then calls __[[PREFIX]]_register_globals +// HIP-NEXT: br label %exit +// HIP: exit: +// HIP-NEXT: load i8**, i8*** @__hip_gpubin_handle // NORDC-NEXT: call void @__[[PREFIX]]_register_globals // * In separate mode we also register a destructor. // NORDC-NEXT: call i32 @atexit(void (i8*)* @__[[PREFIX]]_module_dtor) @@ -136,7 +144,14 @@ // Test that we've created destructor. 
// NORDC: define internal void @__[[PREFIX]]_module_dtor // NORDC: load{{.*}}__[[PREFIX]]_gpubin_handle -// NORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary +// CUDANORDC-NEXT: call void @__[[PREFIX]]UnregisterFatBinary +// HIP-NEXT: icmp ne i8** {{.*}}, null +// HIP-NEXT: br i1 {{.*}}, label %if, label %exit +// HIP: if: +// HIP-NEXT: call void @__[[PREFIX]]UnregisterFatBinary +// HIP-NEXT: store i8** null, i8*** @__hip_gpubin_handle +// HIP-NEXT: br label %exit +// HIP: exit: // There should be no __[[PREFIX]]_register_globals if we have no // device-side globals, but we still need to register GPU binary.