diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -166,12 +166,12 @@ /// Information for a device offloading file extracted from the host. struct DeviceFile { - DeviceFile(StringRef Kind, StringRef TheTriple, StringRef Arch, + DeviceFile(OffloadKind Kind, StringRef TheTriple, StringRef Arch, StringRef Filename, bool IsLibrary = false) : Kind(Kind), TheTriple(TheTriple), Arch(Arch), Filename(Filename), IsLibrary(IsLibrary) {} - std::string Kind; + OffloadKind Kind; std::string TheTriple; std::string Arch; std::string Filename; @@ -183,15 +183,28 @@ /// assume device files with matching architectures and triples but different /// offloading kinds should be handlded together, this may not be true in the /// future. + +// Provide DenseMapInfo for OffloadKind. +template <> struct DenseMapInfo { + static inline OffloadKind getEmptyKey() { return OFK_LAST; } + static inline OffloadKind getTombstoneKey() { + return static_cast(OFK_LAST + 1); + } + static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; } + + static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) { + return LHS == RHS; + } +}; template <> struct DenseMapInfo { static DeviceFile getEmptyKey() { - return {DenseMapInfo::getEmptyKey(), + return {DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey(), DenseMapInfo::getEmptyKey()}; } static DeviceFile getTombstoneKey() { - return {DenseMapInfo::getTombstoneKey(), + return {DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey(), DenseMapInfo::getTombstoneKey()}; @@ -233,7 +246,7 @@ auto DeviceAndPath = StringRef(LibraryStr).split('='); auto StringAndArch = DeviceAndPath.first.rsplit('-'); auto KindAndTriple = StringAndArch.first.split('-'); - return 
DeviceFile(KindAndTriple.first, KindAndTriple.second, + return DeviceFile(getOffloadKind(KindAndTriple.first), KindAndTriple.second, StringAndArch.second, DeviceAndPath.second); } @@ -364,8 +377,8 @@ if (Error E = Output->commit()) return E; - DeviceFiles.emplace_back(Kind, Binary.getTriple(), Binary.getArch(), - TempFile, IsLibrary); + DeviceFiles.emplace_back(Binary.getOffloadKind(), Binary.getTriple(), + Binary.getArch(), TempFile, IsLibrary); Offset += Binary.getSize(); } @@ -689,6 +702,39 @@ return static_cast(TempFile); } + +Expected fatbinary(ArrayRef InputFiles, + Triple TheTriple, ArrayRef Archs) { + // NVPTX uses the fatbinary program to bundle the linked images. + Expected FatBinaryPath = + findProgram("fatbinary", {CudaBinaryPath}); + if (!FatBinaryPath) + return FatBinaryPath.takeError(); + + // Create a new file to write the linked device image to. + SmallString<128> TempFile; + if (Error Err = createOutputFile(sys::path::filename(ExecutableName) + + "-device-" + TheTriple.getArchName(), + "fatbin", TempFile)) + return std::move(Err); + + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + + SmallVector CmdArgs; + CmdArgs.push_back(*FatBinaryPath); + CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32"); + CmdArgs.push_back("--create"); + CmdArgs.push_back(TempFile); + for (const auto &FileAndArch : llvm::zip(InputFiles, Archs)) + CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) + + ",file=" + std::get<0>(FileAndArch))); + + if (Error Err = executeCommands(*FatBinaryPath, CmdArgs)) + return std::move(Err); + + return static_cast(TempFile); +} } // namespace nvptx namespace amdgcn { Expected link(ArrayRef InputFiles, Triple TheTriple, @@ -1133,15 +1179,18 @@ /// Runs the appropriate linking action on all the device files specified in \p /// DeviceFiles. The linked device images are returned in \p LinkedImages. 
Error linkDeviceFiles(ArrayRef DeviceFiles, - SmallVectorImpl &LinkedImages) { - // Get the list of inputs for a specific device. + SmallVectorImpl &LinkedImages) { + // Get the list of inputs and active offload kinds for a specific device. DenseMap> LinkerInputMap; + DenseMap> ActiveOffloadKinds; SmallVector LibraryFiles; for (auto &File : DeviceFiles) { - if (File.IsLibrary) + if (File.IsLibrary) { LibraryFiles.push_back(File); - else + } else { LinkerInputMap[File].push_back(File.Filename); + ActiveOffloadKinds[File].insert(File.Kind); + } } // Static libraries are loaded lazily as-needed, only add them if other files @@ -1157,33 +1206,42 @@ for (auto &LinkerInput : LinkerInputMap) { DeviceFile &File = LinkerInput.getFirst(); Triple TheTriple = Triple(File.TheTriple); + auto &LinkerInputFiles = LinkerInput.getSecond(); bool WholeProgram = false; // Run LTO on any bitcode files and replace the input with the result. - if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, - File.Arch, WholeProgram)) + if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch, + WholeProgram)) return Err; - // If we are embedding bitcode for JIT, skip the final device linking. if (EmbedBitcode) { - assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed"); - LinkedImages.push_back(LinkerInput.getSecond().front()); + // If we are embedding bitcode for JIT, skip the final device linking. + if (LinkerInputFiles.size() != 1 || !WholeProgram) + return createStringError(inconvertibleErrorCode(), + "Unable to embed bitcode image for JIT"); + LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), File.Arch, + LinkerInputFiles.front()); continue; - } - - // If we performed LTO on NVPTX and had whole program visibility, we can use - // CUDA in non-RDC mode. 
- if (WholeProgram && TheTriple.isNVPTX()) { - assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed"); - LinkedImages.push_back(LinkerInput.getSecond().front()); + } else if (WholeProgram && TheTriple.isNVPTX()) { + // If we performed LTO on NVPTX and had whole program visibility, we can + // use CUDA in non-RDC mode. + if (LinkerInputFiles.size() != 1) + return createStringError(inconvertibleErrorCode(), + "Invalid number of inputs for non-RDC mode"); + for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) + LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, + LinkerInputFiles.front()); continue; } - auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch); + auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch); if (!ImageOrErr) return ImageOrErr.takeError(); - LinkedImages.push_back(*ImageOrErr); + // Create separate images for all the active offload kinds. + for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()]) + LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch, + *ImageOrErr); } return Error::success(); } @@ -1227,32 +1285,95 @@ return static_cast(ObjectFile); } -/// Creates the object file containing the device image and runtime registration -/// code from the device images stored in \p Images. -Expected wrapDeviceImages(ArrayRef Images) { +/// Load all of the OpenMP images into a buffer and pass it to the binary +/// wrapping function to create the registration code in the module \p M. 
+Error wrapOpenMPImages(Module &M, ArrayRef Images) { SmallVector, 4> SavedBuffers; SmallVector, 4> ImagesToWrap; - - for (StringRef ImageFilename : Images) { + for (const DeviceFile &File : Images) { llvm::ErrorOr> ImageOrError = - llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename); + llvm::MemoryBuffer::getFileOrSTDIN(File.Filename); if (std::error_code EC = ImageOrError.getError()) - return createFileError(ImageFilename, EC); + return createFileError(File.Filename, EC); ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(), (*ImageOrError)->getBufferSize()); SavedBuffers.emplace_back(std::move(*ImageOrError)); } - LLVMContext Context; - Module M("offload.wrapper.module", Context); - M.setTargetTriple(HostTriple); - if (Error Err = wrapBinaries(M, ImagesToWrap)) - return std::move(Err); + if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap)) + return Err; + return Error::success(); +} + +/// Combine all of the CUDA images into a single fatbinary and pass it to the +/// binary wrapping function to create the registration code in the module \p M. +Error wrapCudaImages(Module &M, ArrayRef Images) { + SmallVector InputFiles; + SmallVector Architectures; + for (const DeviceFile &File : Images) { + InputFiles.push_back(File.Filename); + Architectures.push_back(File.Arch); + } + + // CUDA expects its embedded device images to be a fatbinary. 
+ Triple TheTriple = Triple(Images.front().TheTriple); + auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures); + if (!FileOrErr) + return FileOrErr.takeError(); + + llvm::ErrorOr> ImageOrError = + llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr); + if (std::error_code EC = ImageOrError.getError()) + return createFileError(*FileOrErr, EC); + + auto ImageToWrap = ArrayRef((*ImageOrError)->getBufferStart(), + (*ImageOrError)->getBufferSize()); - if (PrintWrappedModule) - llvm::errs() << M; + if (Error Err = wrapCudaBinary(M, ImageToWrap)) + return Err; + return Error::success(); +} + +/// Creates the object file containing the device image and runtime +/// registration code from the device images stored in \p Images. +Expected> +wrapDeviceImages(ArrayRef Images) { + DenseMap> ImagesForKind; + for (const DeviceFile &Image : Images) + ImagesForKind[Image.Kind].push_back(Image); + + SmallVector WrappedImages; + for (const auto &KindAndImages : ImagesForKind) { + LLVMContext Context; + Module M("offload.wrapper.module", Context); + M.setTargetTriple(HostTriple); + + // Create registration code for the given offload kinds in the Module. 
+ switch (KindAndImages.getFirst()) { + case OFK_OpenMP: + if (Error Err = wrapOpenMPImages(M, KindAndImages.getSecond())) + return std::move(Err); + break; + case OFK_Cuda: + if (Error Err = wrapCudaImages(M, KindAndImages.getSecond())) + return std::move(Err); + break; + default: + return createStringError(inconvertibleErrorCode(), + getOffloadKindName(KindAndImages.getFirst()) + + " wrapping is not supported"); + } + + if (PrintWrappedModule) + llvm::errs() << M; + + auto FileOrErr = compileModule(M); + if (!FileOrErr) + return FileOrErr.takeError(); + WrappedImages.push_back(*FileOrErr); + } - return compileModule(M); + return WrappedImages; } Optional findFile(StringRef Dir, const Twine &Name) { @@ -1383,7 +1504,7 @@ DeviceFiles.push_back(getBitcodeLibrary(LibraryStr)); // Link the device images extracted from the linker input. - SmallVector LinkedImages; + SmallVector LinkedImages; if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages)) return reportError(std::move(Err)); @@ -1392,7 +1513,7 @@ auto FileOrErr = wrapDeviceImages(LinkedImages); if (!FileOrErr) return reportError(FileOrErr.takeError()); - LinkerArgs.push_back(*FileOrErr); + LinkerArgs.append(*FileOrErr); // Run the host linking job. if (Error Err = runLinker(LinkerUserPath, LinkerArgs)) diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h @@ -1,4 +1,4 @@ -//===- OffloadWrapper.h -------------------------------------------*- C++ -*-===// +//===- OffloadWrapper.h -----------------------------------------*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. 
@@ -12,9 +12,13 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/IR/Module.h" -/// Wrap the input device images into the module \p M as global symbols and +/// Wraps the input device images into the module \p M as global symbols and /// registers the images with the OpenMP Offloading runtime libomptarget. -llvm::Error wrapBinaries(llvm::Module &M, - llvm::ArrayRef> Images); +llvm::Error wrapOpenMPBinaries(llvm::Module &M, + llvm::ArrayRef> Images); + +/// Wraps the input fatbinary image into the module \p M as global symbols and +/// registers the images with the CUDA runtime. +llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images); #endif diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp --- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp @@ -257,7 +257,7 @@ } // namespace -Error wrapBinaries(Module &M, ArrayRef> Images) { +Error wrapOpenMPBinaries(Module &M, ArrayRef> Images) { GlobalVariable *Desc = createBinDesc(M, Images); if (!Desc) return createStringError(inconvertibleErrorCode(), @@ -266,3 +266,8 @@ createUnregisterFunction(M, Desc); return Error::success(); } + +llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef Images) { + // TODO: Implement this. + return Error::success(); +} diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -31,6 +31,7 @@ OFK_OpenMP, OFK_Cuda, OFK_HIP, + OFK_LAST, }; /// The type of contents the offloading image contains. @@ -41,6 +42,7 @@ IMG_Cubin, IMG_Fatbinary, IMG_PTX, + IMG_LAST, }; /// A simple binary serialization of an offloading file. We use this format to