Index: cfe/trunk/include/clang/Driver/Options.td =================================================================== --- cfe/trunk/include/clang/Driver/Options.td +++ cfe/trunk/include/clang/Driver/Options.td @@ -586,6 +586,8 @@ def fcuda_short_ptr : Flag<["-"], "fcuda-short-ptr">, Flags<[CC1Option]>, HelpText<"Use 32-bit pointers for accessing const/local/shared address spaces.">; def fno_cuda_short_ptr : Flag<["-"], "fno-cuda-short-ptr">; +def fhip_dump_offload_linker_script : Flag<["-"], "fhip-dump-offload-linker-script">, + Group, Flags<[NoArgumentUnused, HelpHidden]>; def dA : Flag<["-"], "dA">, Group; def dD : Flag<["-"], "dD">, Group, Flags<[CC1Option]>, HelpText<"Print macro definitions in -E mode in addition to normal output">; Index: cfe/trunk/lib/CodeGen/CGCUDANV.cpp =================================================================== --- cfe/trunk/lib/CodeGen/CGCUDANV.cpp +++ cfe/trunk/lib/CodeGen/CGCUDANV.cpp @@ -27,6 +27,8 @@ using namespace CodeGen; namespace { +constexpr unsigned CudaFatMagic = 0x466243b1; +constexpr unsigned HIPFatMagic = 0x48495046; // "HIPF" class CGNVCUDARuntime : public CGCUDARuntime { @@ -310,19 +312,20 @@ /// } /// \endcode llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { + bool IsHIP = CGM.getLangOpts().HIP; // No need to generate ctors/dtors if there is no GPU binary. - std::string GpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; - if (GpuBinaryFileName.empty()) + StringRef CudaGpuBinaryFileName = CGM.getCodeGenOpts().CudaGpuBinaryFileName; + if (CudaGpuBinaryFileName.empty() && !IsHIP) return nullptr; - // void __cuda_register_globals(void* handle); + // void __{cuda|hip}_register_globals(void* handle); llvm::Function *RegisterGlobalsFunc = makeRegisterGlobalsFn(); // We always need a function to pass in as callback. Create a dummy // implementation if we don't need to register anything. if (RelocatableDeviceCode && !RegisterGlobalsFunc) RegisterGlobalsFunc = makeDummyFunction(getRegisterGlobalsFnTy()); - // void ** __cudaRegisterFatBinary(void *); + // void ** __{cuda|hip}RegisterFatBinary(void *); llvm::Constant *RegisterFatbinFunc = CGM.CreateRuntimeFunction( llvm::FunctionType::get(VoidPtrPtrTy, VoidPtrTy, false), addUnderscoredPrefixToName("RegisterFatBinary")); @@ -334,12 +337,16 @@ // global variable and save a reference in GpuBinaryHandle to be cleaned up // in destructor on exit. Then associate all known kernels with the GPU binary // handle so CUDA runtime can figure out what to call on the GPU side. - llvm::ErrorOr> GpuBinaryOrErr = - llvm::MemoryBuffer::getFileOrSTDIN(GpuBinaryFileName); - if (std::error_code EC = GpuBinaryOrErr.getError()) { - CGM.getDiags().Report(diag::err_cannot_open_file) - << GpuBinaryFileName << EC.message(); - return nullptr; + std::unique_ptr CudaGpuBinary; + if (!IsHIP) { + llvm::ErrorOr> CudaGpuBinaryOrErr = + llvm::MemoryBuffer::getFileOrSTDIN(CudaGpuBinaryFileName); + if (std::error_code EC = CudaGpuBinaryOrErr.getError()) { + CGM.getDiags().Report(diag::err_cannot_open_file) + << CudaGpuBinaryFileName << EC.message(); + return nullptr; + } + CudaGpuBinary = std::move(CudaGpuBinaryOrErr.get()); } llvm::Function *ModuleCtorFunc = llvm::Function::Create( @@ -353,28 +360,60 @@ CtorBuilder.SetInsertPoint(CtorEntryBB); const char *FatbinConstantName; - if (RelocatableDeviceCode) + const char *FatbinSectionName; + const char *ModuleIDSectionName; + StringRef ModuleIDPrefix; + llvm::Constant *FatBinStr; + unsigned FatMagic; + if (IsHIP) { + FatbinConstantName = ".hip_fatbin"; + FatbinSectionName = ".hipFatBinSegment"; + + ModuleIDSectionName = "__hip_module_id"; + ModuleIDPrefix = "__hip_"; + + // For HIP, create an external symbol __hip_fatbin in section .hip_fatbin. + // The external symbol is supposed to contain the fat binary but will be + // populated somewhere else, e.g. by lld through link script. + FatBinStr = new llvm::GlobalVariable( + CGM.getModule(), CGM.Int8Ty, + /*isConstant=*/true, llvm::GlobalValue::ExternalLinkage, nullptr, + "__hip_fatbin", nullptr, + llvm::GlobalVariable::NotThreadLocal); + cast(FatBinStr)->setSection(FatbinConstantName); + + FatMagic = HIPFatMagic; + } else { + if (RelocatableDeviceCode) + // TODO: Figure out how this is called on mac OS! + FatbinConstantName = "__nv_relfatbin"; + else + FatbinConstantName = + CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; + // NVIDIA's cuobjdump looks for fatbins in this section. + FatbinSectionName = + CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; + // TODO: Figure out how this is called on mac OS! - FatbinConstantName = "__nv_relfatbin"; - else - FatbinConstantName = - CGM.getTriple().isMacOSX() ? "__NV_CUDA,__nv_fatbin" : ".nv_fatbin"; - // NVIDIA's cuobjdump looks for fatbins in this section. - const char *FatbinSectionName = - CGM.getTriple().isMacOSX() ? "__NV_CUDA,__fatbin" : ".nvFatBinSegment"; - // TODO: Figure out how this is called on mac OS! - const char *NVModuleIDSectionName = "__nv_module_id"; + ModuleIDSectionName = "__nv_module_id"; + ModuleIDPrefix = "__nv_"; + + // For CUDA, create a string literal containing the fat binary loaded from + // the given file. + FatBinStr = makeConstantString(CudaGpuBinary->getBuffer(), "", + FatbinConstantName, 8); + FatMagic = CudaFatMagic; + } // Create initialized wrapper structure that points to the loaded GPU binary ConstantInitBuilder Builder(CGM); auto Values = Builder.beginStruct(FatbinWrapperTy); // Fatbin wrapper magic. - Values.addInt(IntTy, 0x466243b1); + Values.addInt(IntTy, FatMagic); // Fatbin version. Values.addInt(IntTy, 1); // Data. - Values.add(makeConstantString(GpuBinaryOrErr.get()->getBuffer(), "", - FatbinConstantName, 8)); + Values.add(FatBinStr); // Unused in fatbin v1. Values.add(llvm::ConstantPointerNull::get(VoidPtrTy)); llvm::GlobalVariable *FatbinWrapper = Values.finishAndCreateGlobal( @@ -382,10 +421,10 @@ /*constant*/ true); FatbinWrapper->setSection(FatbinSectionName); - // Register binary with CUDA runtime. This is substantially different in + // Register binary with CUDA/HIP runtime. This is substantially different in // default mode vs. separate compilation! if (!RelocatableDeviceCode) { - // GpuBinaryHandle = __cudaRegisterFatBinary(&FatbinWrapper); + // GpuBinaryHandle = __{cuda|hip}RegisterFatBinary(&FatbinWrapper); llvm::CallInst *RegisterFatbinCall = CtorBuilder.CreateCall( RegisterFatbinFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy)); @@ -397,34 +436,34 @@ CtorBuilder.CreateAlignedStore(RegisterFatbinCall, GpuBinaryHandle, CGM.getPointerAlign()); - // Call __cuda_register_globals(GpuBinaryHandle); + // Call __{cuda|hip}_register_globals(GpuBinaryHandle); if (RegisterGlobalsFunc) CtorBuilder.CreateCall(RegisterGlobalsFunc, RegisterFatbinCall); } else { // Generate a unique module ID. - SmallString<64> NVModuleID; - llvm::raw_svector_ostream OS(NVModuleID); - OS << "__nv_" << llvm::format("%x", FatbinWrapper->getGUID()); - llvm::Constant *NVModuleIDConstant = - makeConstantString(NVModuleID.str(), "", NVModuleIDSectionName, 32); + SmallString<64> ModuleID; + llvm::raw_svector_ostream OS(ModuleID); + OS << ModuleIDPrefix << llvm::format("%x", FatbinWrapper->getGUID()); + llvm::Constant *ModuleIDConstant = + makeConstantString(ModuleID.str(), "", ModuleIDSectionName, 32); - // Create an alias for the FatbinWrapper that nvcc will look for. + // Create an alias for the FatbinWrapper that nvcc or hip backend will + // look for. llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, - Twine("__fatbinwrap") + NVModuleID, - FatbinWrapper); + Twine("__fatbinwrap") + ModuleID, FatbinWrapper); - // void __cudaRegisterLinkedBinary%NVModuleID%(void (*)(void *), void *, + // void __{cuda|hip}RegisterLinkedBinary%ModuleID%(void (*)(void *), void *, // void *, void (*)(void **)) SmallString<128> RegisterLinkedBinaryName( addUnderscoredPrefixToName("RegisterLinkedBinary")); - RegisterLinkedBinaryName += NVModuleID; + RegisterLinkedBinaryName += ModuleID; llvm::Constant *RegisterLinkedBinaryFunc = CGM.CreateRuntimeFunction( getRegisterLinkedBinaryFnTy(), RegisterLinkedBinaryName); assert(RegisterGlobalsFunc && "Expecting at least dummy function!"); llvm::Value *Args[] = {RegisterGlobalsFunc, CtorBuilder.CreateBitCast(FatbinWrapper, VoidPtrTy), - NVModuleIDConstant, + ModuleIDConstant, makeDummyFunction(getCallbackFnTy())}; CtorBuilder.CreateCall(RegisterLinkedBinaryFunc, Args); } Index: cfe/trunk/lib/Driver/ToolChains/CommonArgs.h =================================================================== --- cfe/trunk/lib/Driver/ToolChains/CommonArgs.h +++ cfe/trunk/lib/Driver/ToolChains/CommonArgs.h @@ -52,6 +52,12 @@ llvm::opt::ArgStringList &CmdArgs, const JobAction &JA); +void AddHIPLinkerScript(const ToolChain &TC, Compilation &C, + const InputInfo &Output, const InputInfoList &Inputs, + const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs, const JobAction &JA, + const Tool &T); + const char *SplitDebugName(const llvm::opt::ArgList &Args, const InputInfo &Input); Index: cfe/trunk/lib/Driver/ToolChains/CommonArgs.cpp =================================================================== --- cfe/trunk/lib/Driver/ToolChains/CommonArgs.cpp +++ cfe/trunk/lib/Driver/ToolChains/CommonArgs.cpp @@ -146,12 +146,14 @@ Args.AddAllArgValues(CmdArgs, options::OPT_Zlinker_input); for (const auto &II : Inputs) { - // If the current tool chain refers to an OpenMP offloading host, we should - // ignore inputs that refer to OpenMP offloading devices - they will be - // embedded according to a proper linker script. + // If the current tool chain refers to an OpenMP or HIP offloading host, we + // should ignore inputs that refer to OpenMP or HIP offloading devices - + // they will be embedded according to a proper linker script. if (auto *IA = II.getAction()) - if (JA.isHostOffloading(Action::OFK_OpenMP) && - IA->isDeviceOffloading(Action::OFK_OpenMP)) + if ((JA.isHostOffloading(Action::OFK_OpenMP) && + IA->isDeviceOffloading(Action::OFK_OpenMP)) || + (JA.isHostOffloading(Action::OFK_HIP) && + IA->isDeviceOffloading(Action::OFK_HIP))) continue; if (!TC.HasNativeLLVMSupport() && types::isLLVMIR(II.getType())) @@ -1288,6 +1290,124 @@ Lksf << LksBuffer; } +/// Add HIP linker script arguments at the end of the argument list so that +/// the fat binary is built by embedding the device images into the host. The +/// linker script also defines a symbol required by the code generation so that +/// the image can be retrieved at runtime. This should be used only in tool +/// chains that support linker scripts. +void tools::AddHIPLinkerScript(const ToolChain &TC, Compilation &C, + const InputInfo &Output, + const InputInfoList &Inputs, const ArgList &Args, + ArgStringList &CmdArgs, const JobAction &JA, + const Tool &T) { + + // If this is not a HIP host toolchain, we don't need to do anything. + if (!JA.isHostOffloading(Action::OFK_HIP)) + return; + + // Create temporary linker script. Keep it if save-temps is enabled. + const char *LKS; + SmallString<256> Name = llvm::sys::path::filename(Output.getFilename()); + if (C.getDriver().isSaveTempsEnabled()) { + llvm::sys::path::replace_extension(Name, "lk"); + LKS = C.getArgs().MakeArgString(Name.c_str()); + } else { + llvm::sys::path::replace_extension(Name, ""); + Name = C.getDriver().GetTemporaryPath(Name, "lk"); + LKS = C.addTempFile(C.getArgs().MakeArgString(Name.c_str())); + } + + // Add linker script option to the command. + CmdArgs.push_back("-T"); + CmdArgs.push_back(LKS); + + // Create a buffer to write the contents of the linker script. + std::string LksBuffer; + llvm::raw_string_ostream LksStream(LksBuffer); + + // Get the HIP offload tool chain. + auto *HIPTC = static_cast( + C.getSingleOffloadToolChain()); + assert(HIPTC->getTriple().getArch() == llvm::Triple::amdgcn && + "Wrong platform"); + + // Construct clang-offload-bundler command to bundle object files for + // for different GPU archs. + ArgStringList BundlerArgs; + BundlerArgs.push_back(Args.MakeArgString("-type=o")); + + // ToDo: Remove the dummy host binary entry which is required by + // clang-offload-bundler. + std::string BundlerTargetArg = "-targets=host-x86_64-unknown-linux"; + std::string BundlerInputArg = "-inputs=/dev/null"; + + for (const auto &II : Inputs) { + const Action *A = II.getAction(); + // Is this a device linking action? + if (A && isa(A) && A->isDeviceOffloading(Action::OFK_HIP)) { + BundlerTargetArg = BundlerTargetArg + ",hip-amdgcn-amd-amdhsa-" + + StringRef(A->getOffloadingArch()).str(); + BundlerInputArg = BundlerInputArg + "," + II.getFilename(); + } + } + BundlerArgs.push_back(Args.MakeArgString(BundlerTargetArg)); + BundlerArgs.push_back(Args.MakeArgString(BundlerInputArg)); + + std::string BundleFileName = C.getDriver().GetTemporaryPath("BUNDLE", "o"); + const char *BundleFile = + C.addTempFile(C.getArgs().MakeArgString(BundleFileName.c_str())); + auto BundlerOutputArg = + Args.MakeArgString(std::string("-outputs=").append(BundleFile)); + BundlerArgs.push_back(BundlerOutputArg); + + SmallString<128> BundlerPath(C.getDriver().Dir); + llvm::sys::path::append(BundlerPath, "clang-offload-bundler"); + const char *Bundler = Args.MakeArgString(BundlerPath); + C.addCommand(llvm::make_unique(JA, T, Bundler, BundlerArgs, Inputs)); + + // Add commands to embed target binaries. We ensure that each section and + // image is 16-byte aligned. This is not mandatory, but increases the + // likelihood of data to be aligned with a cache block in several main host + // machines. + LksStream << "/*\n"; + LksStream << " HIP Offload Linker Script\n"; + LksStream << " *** Automatically generated by Clang ***\n"; + LksStream << "*/\n"; + LksStream << "TARGET(binary)\n"; + LksStream << "INPUT(" << BundleFileName << ")\n"; + LksStream << "SECTIONS\n"; + LksStream << "{\n"; + LksStream << " .hip_fatbin :\n"; + LksStream << " ALIGN(0x10)\n"; + LksStream << " {\n"; + LksStream << " PROVIDE_HIDDEN(__hip_fatbin = .);\n"; + LksStream << " " << BundleFileName << "\n"; + LksStream << " }\n"; + LksStream << "}\n"; + LksStream << "INSERT BEFORE .data\n"; + LksStream.flush(); + + // Dump the contents of the linker script if the user requested that. We + // support this option to enable testing of behavior with -###. + if (C.getArgs().hasArg(options::OPT_fhip_dump_offload_linker_script)) + llvm::errs() << LksBuffer; + + // If this is a dry run, do not create the linker script file. + if (C.getArgs().hasArg(options::OPT__HASH_HASH_HASH)) + return; + + // Open script file and write the contents. + std::error_code EC; + llvm::raw_fd_ostream Lksf(LKS, EC, llvm::sys::fs::F_None); + + if (EC) { + C.getDriver().Diag(clang::diag::err_unable_to_make_temp) << EC.message(); + return; + } + + Lksf << LksBuffer; +} + SmallString<128> tools::getStatsFileName(const llvm::opt::ArgList &Args, const InputInfo &Output, const InputInfo &Input, Index: cfe/trunk/lib/Driver/ToolChains/Gnu.cpp =================================================================== --- cfe/trunk/lib/Driver/ToolChains/Gnu.cpp +++ cfe/trunk/lib/Driver/ToolChains/Gnu.cpp @@ -535,6 +535,10 @@ // Add OpenMP offloading linker script args if required. AddOpenMPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA); + // Add HIP offloading linker script args if required. + AddHIPLinkerScript(getToolChain(), C, Output, Inputs, Args, CmdArgs, JA, + *this); + C.addCommand(llvm::make_unique(JA, *this, Exec, CmdArgs, Inputs)); } Index: cfe/trunk/test/CodeGenCUDA/device-stub.cu =================================================================== --- cfe/trunk/test/CodeGenCUDA/device-stub.cu +++ cfe/trunk/test/CodeGenCUDA/device-stub.cu @@ -1,13 +1,13 @@ // RUN: echo "GPU binary would be here" > %t // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - \ -// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA +// RUN: | FileCheck %s --check-prefixes=ALL,NORDC,CUDA,CUDANORDC // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS \ -// RUN: | FileCheck %s -check-prefix=NOGLOBALS +// RUN: | FileCheck %s -check-prefixes=NOGLOBALS,CUDANOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - \ -// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA +// RUN: | FileCheck %s --check-prefixes=ALL,RDC,CUDA,CUDARDC // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - \ // RUN: | FileCheck %s -check-prefix=NOGPUBIN @@ -16,10 +16,10 @@ // RUN: | FileCheck %s --check-prefixes=ALL,NORDC,HIP // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-include-gpubinary %t -o - -DNOGLOBALS -x hip \ -// RUN: | FileCheck %s -check-prefix=NOGLOBALS +// RUN: | FileCheck %s -check-prefixes=NOGLOBALS,HIPNOGLOBALS // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s \ // RUN: -fcuda-rdc -fcuda-include-gpubinary %t -o - -x hip \ -// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP +// RUN: | FileCheck %s --check-prefixes=ALL,RDC,HIP,HIPRDC // RUN: %clang_cc1 -triple x86_64-linux-gnu -emit-llvm %s -o - -x hip\ // RUN: | FileCheck %s -check-prefix=NOGPUBIN @@ -64,21 +64,26 @@ // * constant unnamed string with the kernel name // ALL: private unnamed_addr constant{{.*}}kernelfunc{{.*}}\00" // * constant unnamed string with GPU binary -// ALL: private unnamed_addr constant{{.*GPU binary would be here.*}}\00" -// NORDC-SAME: section ".nv_fatbin", align 8 -// RDC-SAME: section "__nv_relfatbin", align 8 +// HIP: @[[FATBIN:__hip_fatbin]] = external constant i8, section ".hip_fatbin" +// CUDA: @[[FATBIN:.*]] = private unnamed_addr constant{{.*GPU binary would be here.*}}\00", +// CUDANORDC-SAME: section ".nv_fatbin", align 8 +// CUDARDC-SAME: section "__nv_relfatbin", align 8 // * constant struct that wraps GPU binary -// CUDA: @__[[PREFIX:cuda]]_fatbin_wrapper = internal constant -// CUDA-SAME: { i32, i32, i8*, i8* } -// HIP: @__[[PREFIX:hip]]_fatbin_wrapper = internal constant -// HIP-SAME: { i32, i32, i8*, i8* } -// ALL-SAME: { i32 1180844977, i32 1, {{.*}}, i8* null } -// ALL-SAME: section ".nvFatBinSegment" +// ALL: @__[[PREFIX:cuda|hip]]_fatbin_wrapper = internal constant +// ALL-SAME: { i32, i32, i8*, i8* } +// CUDA-SAME: { i32 1180844977, i32 1, +// HIP-SAME: { i32 1212764230, i32 1, +// CUDA-SAME: i8* getelementptr inbounds ({{.*}}@[[FATBIN]], i64 0, i64 0), +// HIP-SAME: i8* @[[FATBIN]], +// ALL-SAME: i8* null } +// CUDA-SAME: section ".nvFatBinSegment" +// HIP-SAME: section ".hipFatBinSegment" // * variable to save GPU binary handle after initialization // NORDC: @__[[PREFIX]]_gpubin_handle = internal global i8** null // * constant unnamed string with NVModuleID // RDC: [[MODULE_ID_GLOBAL:@.*]] = private unnamed_addr constant -// RDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 +// CUDARDC-SAME: c"[[MODULE_ID:.+]]\00", section "__nv_module_id", align 32 +// HIPRDC-SAME: c"[[MODULE_ID:.+]]\00", section "__hip_module_id", align 32 // * Make sure our constructor was added to global ctor list. // ALL: @llvm.global_ctors = appending global {{.*}}@__[[PREFIX]]_module_ctor // * In separate mode we also register a destructor. @@ -136,9 +141,10 @@ // There should be no __[[PREFIX]]_register_globals if we have no // device-side globals, but we still need to register GPU binary. // Skip GPU binary string first. -// NOGLOBALS: @0 = private unnamed_addr constant{{.*}} +// CUDANOGLOBALS: @{{.*}} = private unnamed_addr constant{{.*}} +// HIPNOGLOBALS: @{{.*}} = external constant{{.*}} // NOGLOBALS-NOT: define internal void @__{{.*}}_register_globals -// NOGLOBALS: define internal void @__[[PREFIX:.*]]_module_ctor +// NOGLOBALS: define internal void @__[[PREFIX:cuda|hip]]_module_ctor // NOGLOBALS: call{{.*}}[[PREFIX]]RegisterFatBinary{{.*}}__[[PREFIX]]_fatbin_wrapper // NOGLOBALS-NOT: call void @__[[PREFIX]]_register_globals // NOGLOBALS: define internal void @__[[PREFIX]]_module_dtor