diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c --- a/clang/test/Driver/linker-wrapper-image.c +++ b/clang/test/Driver/linker-wrapper-image.c @@ -77,7 +77,6 @@ // CUDA-NEXT: %5 = icmp eq i64 %size, 0 // CUDA-NEXT: br i1 %5, label %if.then, label %if.else - // CUDA: if.then: // CUDA-NEXT: %6 = call i32 @__cudaRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) // CUDA-NEXT: br label %if.end @@ -111,3 +110,84 @@ // CUDA: while.end: // CUDA-NEXT: ret void // CUDA-NEXT: } + +// RUN: clang-offload-packager -o %t.out --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ +// RUN: -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --print-wrapped-module --dry-run --host-triple x86_64-unknown-linux-gnu \ +// RUN: -linker-path /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP + +// HIP: @.fatbin_image = internal constant [0 x i8] zeroinitializer, section ".hip_fatbin" +// HIP-NEXT: @.fatbin_wrapper = internal constant %fatbin_wrapper { i32 1212764230, i32 1, ptr @.fatbin_image, ptr null }, section ".hipFatBinSegment", align 8 +// HIP-NEXT: @__dummy.hip_offloading.entry = hidden constant [0 x %__tgt_offload_entry] zeroinitializer, section "hip_offloading_entries" +// HIP-NEXT: @.hip.binary_handle = internal global ptr null +// HIP-NEXT: @__start_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry] +// HIP-NEXT: @__stop_hip_offloading_entries = external hidden constant [0 x %__tgt_offload_entry] +// HIP-NEXT: @llvm.global_ctors = appending global [1 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 1, ptr @.hip.fatbin_reg, ptr null }] + +// HIP: define internal void @.hip.fatbin_reg() section ".text.startup" { +// HIP-NEXT: entry: +// HIP-NEXT: %0 = call ptr @__hipRegisterFatBinary(ptr @.fatbin_wrapper) +// HIP-NEXT: 
store ptr %0, ptr @.hip.binary_handle, align 8 +// HIP-NEXT: call void @.hip.globals_reg(ptr %0) +// HIP-NEXT: %1 = call i32 @atexit(ptr @.hip.fatbin_unreg) +// HIP-NEXT: ret void +// HIP-NEXT: } + +// HIP: define internal void @.hip.fatbin_unreg() section ".text.startup" { +// HIP-NEXT: entry: +// HIP-NEXT: %0 = load ptr, ptr @.hip.binary_handle, align 8 +// HIP-NEXT: call void @__hipUnregisterFatBinary(ptr %0) +// HIP-NEXT: ret void +// HIP-NEXT: } + +// HIP: define internal void @.hip.globals_reg(ptr %0) section ".text.startup" { +// HIP-NEXT: entry: +// HIP-NEXT: br i1 icmp ne (ptr @__start_hip_offloading_entries, ptr @__stop_hip_offloading_entries), label %while.entry, label %while.end + +// HIP: while.entry: +// HIP-NEXT: %entry1 = phi ptr [ @__start_hip_offloading_entries, %entry ], [ %7, %if.end ] +// HIP-NEXT: %1 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 0 +// HIP-NEXT: %addr = load ptr, ptr %1, align 8 +// HIP-NEXT: %2 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 1 +// HIP-NEXT: %name = load ptr, ptr %2, align 8 +// HIP-NEXT: %3 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 2 +// HIP-NEXT: %size = load i64, ptr %3, align 4 +// HIP-NEXT: %4 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 0, i32 3 +// HIP-NEXT: %flag = load i32, ptr %4, align 4 +// HIP-NEXT: %5 = icmp eq i64 %size, 0 +// HIP-NEXT: br i1 %5, label %if.then, label %if.else + +// HIP: if.then: +// HIP-NEXT: %6 = call i32 @__hipRegisterFunction(ptr %0, ptr %addr, ptr %name, ptr %name, i32 -1, ptr null, ptr null, ptr null, ptr null, ptr null) +// HIP-NEXT: br label %if.end + +// HIP: if.else: +// HIP-NEXT: switch i32 %flag, label %if.end [ +// HIP-NEXT: i32 0, label %sw.global +// HIP-NEXT: i32 1, label %sw.managed +// HIP-NEXT: i32 2, label %sw.surface +// HIP-NEXT: i32 3, label %sw.texture +// HIP-NEXT: ] + +// HIP: sw.global: +// HIP-NEXT: call void @__hipRegisterVar(ptr %0, ptr %addr, ptr %name, 
ptr %name, i32 0, i64 %size, i32 0, i32 0) +// HIP-NEXT: br label %if.end + +// HIP: sw.managed: +// HIP-NEXT: br label %if.end + +// HIP: sw.surface: +// HIP-NEXT: br label %if.end + +// HIP: sw.texture: +// HIP-NEXT: br label %if.end + +// HIP: if.end: +// HIP-NEXT: %7 = getelementptr inbounds %__tgt_offload_entry, ptr %entry1, i64 1 +// HIP-NEXT: %8 = icmp eq ptr %7, @__stop_hip_offloading_entries +// HIP-NEXT: br i1 %8, label %while.end, label %while.entry + +// HIP: while.end: +// HIP-NEXT: ret void +// HIP-NEXT: } diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -91,6 +91,19 @@ // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o // CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_52,file={{.*}}.out --image=profile=sm_70,file={{.*}}.out +// RUN: clang-offload-packager -o %t.out \ +// RUN: --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \ +// RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a \ +// RUN: --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx908 +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ +// RUN: -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --dry-run --host-triple x86_64-unknown-linux-gnu -linker-path \ +// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP + +// HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o +// HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o +// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}}.out 
-input={{.*}}out -output={{.*}}.hipfb + // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 @@ -103,6 +116,7 @@ // LINKER_ARGS: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o a // LINKER_ARGS: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o a b +/// Ensure that temp files aren't leftover from static libraries. // RUN: clang-offload-packager -o %t-lib.out \ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 \ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52 diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -587,6 +587,51 @@ return *TempFileOrErr; } + +Expected +fatbinary(ArrayRef> InputFiles, + const ArgList &Args) { + // AMDGPU uses the clang-offload-bundler to bundle the linked images. + Expected OffloadBundlerPath = findProgram( + "clang-offload-bundler", {getMainExecutable("clang-offload-bundler")}); + if (!OffloadBundlerPath) + return OffloadBundlerPath.takeError(); + + llvm::Triple Triple( + Args.getLastArgValue(OPT_host_triple_EQ, sys::getDefaultTargetTriple())); + + // Create a new file to write the linked device image to. 
+ auto TempFileOrErr = createOutputFile(sys::path::filename(ExecutableName) + + "-device-" + Triple.getArchName(), + "hipfb"); + if (!TempFileOrErr) + return TempFileOrErr.takeError(); + + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + + SmallVector CmdArgs; + CmdArgs.push_back(*OffloadBundlerPath); + CmdArgs.push_back("-type=o"); + CmdArgs.push_back("-bundle-align=4096"); + + SmallVector Targets = {"-targets=host-x86_64-unknown-linux"}; + for (const auto &FileAndArch : InputFiles) + Targets.push_back( + Saver.save("hipv4-amdgcn-amd-amdhsa--" + std::get<1>(FileAndArch))); + CmdArgs.push_back(Saver.save(llvm::join(Targets, ","))); + + CmdArgs.push_back("-input=/dev/null"); + for (const auto &FileAndArch : InputFiles) + CmdArgs.push_back(Saver.save("-input=" + std::get<0>(FileAndArch))); + + CmdArgs.push_back(Saver.save("-output=" + *TempFileOrErr)); + + if (Error Err = executeCommands(*OffloadBundlerPath, CmdArgs)) + return std::move(Err); + + return *TempFileOrErr; +} } // namespace amdgcn namespace generic { @@ -1093,6 +1138,10 @@ if (Error Err = wrapCudaBinary(M, BuffersToWrap.front())) return std::move(Err); break; + case OFK_HIP: + if (Error Err = wrapHIPBinary(M, BuffersToWrap.front())) + return std::move(Err); + break; default: return createStringError(inconvertibleErrorCode(), getOffloadKindName(Kind) + @@ -1120,20 +1169,43 @@ Expected>> bundleCuda(ArrayRef Images, const ArgList &Args) { + SmallVector, 4> InputFiles; + for (const OffloadingImage &Image : Images) + InputFiles.emplace_back(std::make_pair(Image.Image->getBufferIdentifier(), + Image.StringData.lookup("arch"))); + + Triple TheTriple = Triple(Images.front().StringData.lookup("triple")); + auto FileOrErr = nvptx::fatbinary(InputFiles, Args); + if (!FileOrErr) + return FileOrErr.takeError(); + + llvm::ErrorOr> ImageOrError = + llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr); + SmallVector> Buffers; + if (std::error_code EC = ImageOrError.getError()) + return createFileError(*FileOrErr, EC); + 
Buffers.emplace_back(std::move(*ImageOrError)); + + return std::move(Buffers); +} +Expected>> +bundleHIP(ArrayRef Images, const ArgList &Args) { SmallVector, 4> InputFiles; for (const OffloadingImage &Image : Images) InputFiles.emplace_back(std::make_pair(Image.Image->getBufferIdentifier(), Image.StringData.lookup("arch"))); Triple TheTriple = Triple(Images.front().StringData.lookup("triple")); - auto FileOrErr = nvptx::fatbinary(InputFiles, Args); + auto FileOrErr = amdgcn::fatbinary(InputFiles, Args); if (!FileOrErr) return FileOrErr.takeError(); llvm::ErrorOr> ImageOrError = llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr); + + SmallVector> Buffers; if (std::error_code EC = ImageOrError.getError()) return createFileError(*FileOrErr, EC); Buffers.emplace_back(std::move(*ImageOrError)); @@ -1151,6 +1223,8 @@ return bundleOpenMP(Images); case OFK_Cuda: return bundleCuda(Images, Args); + case OFK_HIP: + return bundleHIP(Images, Args); default: return createStringError(inconvertibleErrorCode(), getOffloadKindName(Kind) + diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h @@ -21,4 +21,8 @@ /// registers the images with the CUDA runtime. llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images); +/// Wraps the input bundled image into the module \p M as global symbols and +/// registers the images with the HIP runtime. +llvm::Error wrapHIPBinary(llvm::Module &M, llvm::ArrayRef Images); + #endif diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -22,6 +22,7 @@ namespace { /// Magic number that begins the section containing the CUDA fatbinary. 
constexpr unsigned CudaFatMagic = 0x466243b1; +constexpr unsigned HIPFatMagic = 0x48495046; /// Copied from clang/CGCudaRuntime.h. enum OffloadEntryKindFlag : uint32_t { @@ -288,14 +289,15 @@ /// Embed the image \p Image into the module \p M so it can be found by the /// runtime. -GlobalVariable *createFatbinDesc(Module &M, ArrayRef Image) { +GlobalVariable *createFatbinDesc(Module &M, ArrayRef Image, bool IsHIP) { LLVMContext &C = M.getContext(); llvm::Type *Int8PtrTy = Type::getInt8PtrTy(C); llvm::Triple Triple = llvm::Triple(M.getTargetTriple()); // Create the global string containing the fatbinary. StringRef FatbinConstantSection = - Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; + IsHIP ? ".hip_fatbin" + : (Triple.isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"); auto *Data = ConstantDataArray::get(C, Image); auto *Fatbin = new GlobalVariable(M, Data->getType(), /*isConstant*/ true, GlobalVariable::InternalLinkage, Data, @@ -303,10 +305,11 @@ Fatbin->setSection(FatbinConstantSection); // Create the fatbinary wrapper - StringRef FatbinWrapperSection = - Triple.isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; + StringRef FatbinWrapperSection = IsHIP ? ".hipFatBinSegment" + : Triple.isMacOSX() ? "__NV_CUDA,__fatbin" + : ".nvFatBinSegment"; Constant *FatbinWrapper[] = { - ConstantInt::get(Type::getInt32Ty(C), CudaFatMagic), + ConstantInt::get(Type::getInt32Ty(C), IsHIP ? HIPFatMagic : CudaFatMagic), ConstantInt::get(Type::getInt32Ty(C), 1), ConstantExpr::getPointerBitCastOrAddrSpaceCast(Fatbin, Int8PtrTy), ConstantPointerNull::get(Type::getInt8PtrTy(C))}; @@ -328,9 +331,10 @@ ConstantAggregateZero::get(ArrayType::get(getEntryTy(M), 0u)); auto *DummyEntry = new GlobalVariable( M, DummyInit->getType(), true, GlobalVariable::ExternalLinkage, DummyInit, - "__dummy.cuda_offloading.entry"); - DummyEntry->setSection("cuda_offloading_entries"); + IsHIP ? 
"__dummy.hip_offloading.entry" : "__dummy.cuda_offloading.entry"); DummyEntry->setVisibility(GlobalValue::HiddenVisibility); + DummyEntry->setSection(IsHIP ? "hip_offloading_entries" + : "cuda_offloading_entries"); return FatbinDesc; } @@ -358,7 +362,7 @@ /// 0, entry->size, 0, 0); /// } /// } -Function *createRegisterGlobalsFunction(Module &M) { +Function *createRegisterGlobalsFunction(Module &M, bool IsHIP) { LLVMContext &C = M.getContext(); // Get the __cudaRegisterFunction function declaration. auto *RegFuncTy = FunctionType::get( @@ -368,8 +372,8 @@ Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt32PtrTy(C)}, /*isVarArg*/ false); - FunctionCallee RegFunc = - M.getOrInsertFunction("__cudaRegisterFunction", RegFuncTy); + FunctionCallee RegFunc = M.getOrInsertFunction( + IsHIP ? "__hipRegisterFunction" : "__cudaRegisterFunction", RegFuncTy); // Get the __cudaRegisterVar function declaration. auto *RegVarTy = FunctionType::get( @@ -378,25 +382,31 @@ Type::getInt8PtrTy(C), Type::getInt8PtrTy(C), Type::getInt32Ty(C), getSizeTTy(M), Type::getInt32Ty(C), Type::getInt32Ty(C)}, /*isVarArg*/ false); - FunctionCallee RegVar = M.getOrInsertFunction("__cudaRegisterVar", RegVarTy); + FunctionCallee RegVar = M.getOrInsertFunction( + IsHIP ? "__hipRegisterVar" : "__cudaRegisterVar", RegVarTy); // Create the references to the start / stop symbols defined by the linker. - auto *EntriesB = new GlobalVariable( - M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true, - GlobalValue::ExternalLinkage, - /*Initializer*/ nullptr, "__start_cuda_offloading_entries"); + auto *EntriesB = + new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0), + /*isConstant*/ true, GlobalValue::ExternalLinkage, + /*Initializer*/ nullptr, + IsHIP ? 
"__start_hip_offloading_entries" + : "__start_cuda_offloading_entries"); EntriesB->setVisibility(GlobalValue::HiddenVisibility); - auto *EntriesE = new GlobalVariable( - M, ArrayType::get(getEntryTy(M), 0), /*isConstant*/ true, - GlobalValue::ExternalLinkage, - /*Initializer*/ nullptr, "__stop_cuda_offloading_entries"); + auto *EntriesE = + new GlobalVariable(M, ArrayType::get(getEntryTy(M), 0), + /*isConstant*/ true, GlobalValue::ExternalLinkage, + /*Initializer*/ nullptr, + IsHIP ? "__stop_hip_offloading_entries" + : "__stop_cuda_offloading_entries"); EntriesE->setVisibility(GlobalValue::HiddenVisibility); auto *RegGlobalsTy = FunctionType::get(Type::getVoidTy(C), Type::getInt8PtrTy(C)->getPointerTo(), /*isVarArg*/ false); - auto *RegGlobalsFn = Function::Create( - RegGlobalsTy, GlobalValue::InternalLinkage, ".cuda.globals_reg", &M); + auto *RegGlobalsFn = + Function::Create(RegGlobalsTy, GlobalValue::InternalLinkage, + IsHIP ? ".hip.globals_reg" : ".cuda.globals_reg", &M); RegGlobalsFn->setSection(".text.startup"); // Create the loop to register all the entries. @@ -502,24 +512,27 @@ // Create the constructor and destructor to register the fatbinary with the CUDA // runtime. -void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc) { +void createRegisterFatbinFunction(Module &M, GlobalVariable *FatbinDesc, + bool IsHIP) { LLVMContext &C = M.getContext(); auto *CtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *CtorFunc = Function::Create(CtorFuncTy, GlobalValue::InternalLinkage, - ".cuda.fatbin_reg", &M); + auto *CtorFunc = + Function::Create(CtorFuncTy, GlobalValue::InternalLinkage, + IsHIP ? 
".hip.fatbin_reg" : ".cuda.fatbin_reg", &M); CtorFunc->setSection(".text.startup"); auto *DtorFuncTy = FunctionType::get(Type::getVoidTy(C), /*isVarArg*/ false); - auto *DtorFunc = Function::Create(DtorFuncTy, GlobalValue::InternalLinkage, - ".cuda.fatbin_unreg", &M); + auto *DtorFunc = + Function::Create(DtorFuncTy, GlobalValue::InternalLinkage, + IsHIP ? ".hip.fatbin_unreg" : ".cuda.fatbin_unreg", &M); DtorFunc->setSection(".text.startup"); // Get the __cudaRegisterFatBinary function declaration. auto *RegFatTy = FunctionType::get(Type::getInt8PtrTy(C)->getPointerTo(), Type::getInt8PtrTy(C), /*isVarArg*/ false); - FunctionCallee RegFatbin = - M.getOrInsertFunction("__cudaRegisterFatBinary", RegFatTy); + FunctionCallee RegFatbin = M.getOrInsertFunction( + IsHIP ? "__hipRegisterFatBinary" : "__cudaRegisterFatBinary", RegFatTy); // Get the __cudaRegisterFatBinaryEnd function declaration. auto *RegFatEndTy = FunctionType::get(Type::getVoidTy(C), Type::getInt8PtrTy(C)->getPointerTo(), @@ -530,8 +543,9 @@ auto *UnregFatTy = FunctionType::get(Type::getVoidTy(C), Type::getInt8PtrTy(C)->getPointerTo(), /*isVarArg*/ false); - FunctionCallee UnregFatbin = - M.getOrInsertFunction("__cudaUnregisterFatBinary", UnregFatTy); + FunctionCallee UnregFatbin = M.getOrInsertFunction( + IsHIP ? "__hipUnregisterFatBinary" : "__cudaUnregisterFatBinary", + UnregFatTy); auto *AtExitTy = FunctionType::get(Type::getInt32Ty(C), DtorFuncTy->getPointerTo(), @@ -542,7 +556,7 @@ M, Type::getInt8PtrTy(C)->getPointerTo(), false, llvm::GlobalValue::InternalLinkage, llvm::ConstantPointerNull::get(Type::getInt8PtrTy(C)->getPointerTo()), - ".cuda.binary_handle"); + IsHIP ? ".hip.binary_handle" : ".cuda.binary_handle"); // Create the constructor to register this image with the runtime. 
IRBuilder<> CtorBuilder(BasicBlock::Create(C, "entry", CtorFunc)); @@ -552,8 +566,9 @@ CtorBuilder.CreateAlignedStore( Handle, BinaryHandleGlobal, Align(M.getDataLayout().getPointerTypeSize(Type::getInt8PtrTy(C)))); - CtorBuilder.CreateCall(createRegisterGlobalsFunction(M), Handle); - CtorBuilder.CreateCall(RegFatbinEnd, Handle); + CtorBuilder.CreateCall(createRegisterGlobalsFunction(M, IsHIP), Handle); + if (!IsHIP) + CtorBuilder.CreateCall(RegFatbinEnd, Handle); CtorBuilder.CreateCall(AtExit, DtorFunc); CtorBuilder.CreateRetVoid(); @@ -584,11 +599,21 @@ } Error wrapCudaBinary(Module &M, ArrayRef Image) { - GlobalVariable *Desc = createFatbinDesc(M, Image); + GlobalVariable *Desc = createFatbinDesc(M, Image, /* IsHIP */ false); + if (!Desc) + return createStringError(inconvertibleErrorCode(), + "No fatinbary section created."); + + createRegisterFatbinFunction(M, Desc, /* IsHIP */ false); + return Error::success(); +} + +Error wrapHIPBinary(Module &M, ArrayRef Image) { + GlobalVariable *Desc = createFatbinDesc(M, Image, /* IsHIP */ true); if (!Desc) return createStringError(inconvertibleErrorCode(), "No fatinbary section created."); - createRegisterFatbinFunction(M, Desc); + createRegisterFatbinFunction(M, Desc, /* IsHIP */ true); return Error::success(); }