diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -166,11 +166,11 @@
 
 /// Information for a device offloading file extracted from the host.
 struct DeviceFile {
-  DeviceFile(StringRef Kind, StringRef TheTriple, StringRef Arch,
+  DeviceFile(OffloadKind Kind, StringRef TheTriple, StringRef Arch,
              StringRef Filename)
       : Kind(Kind), TheTriple(TheTriple), Arch(Arch), Filename(Filename) {}
 
-  std::string Kind;
+  OffloadKind Kind;
   std::string TheTriple;
   std::string Arch;
   std::string Filename;
@@ -181,15 +181,30 @@
 /// assume device files with matching architectures and triples but different
 /// offloading kinds should be handled together, this may not be true in the
 /// future.
+
+// Provide DenseMapInfo for OffloadKind.
+template <> struct DenseMapInfo<OffloadKind> {
+  static inline OffloadKind getEmptyKey() {
+    return static_cast<OffloadKind>(0xFFFF);
+  }
+  static inline OffloadKind getTombstoneKey() {
+    return static_cast<OffloadKind>(0xFFFF - 1);
+  }
+  static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; }
+
+  static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) {
+    return LHS == RHS;
+  }
+};
 template <> struct DenseMapInfo<DeviceFile> {
   static DeviceFile getEmptyKey() {
-    return {DenseMapInfo<StringRef>::getEmptyKey(),
+    return {static_cast<OffloadKind>(DenseMapInfo<unsigned>::getEmptyKey()),
             DenseMapInfo<StringRef>::getEmptyKey(),
             DenseMapInfo<StringRef>::getEmptyKey(),
             DenseMapInfo<StringRef>::getEmptyKey()};
   }
   static DeviceFile getTombstoneKey() {
-    return {DenseMapInfo<StringRef>::getTombstoneKey(),
+    return {static_cast<OffloadKind>(DenseMapInfo<unsigned>::getTombstoneKey()),
             DenseMapInfo<StringRef>::getTombstoneKey(),
             DenseMapInfo<StringRef>::getTombstoneKey(),
             DenseMapInfo<StringRef>::getTombstoneKey()};
@@ -230,7 +245,7 @@
   auto DeviceAndPath = StringRef(LibraryStr).split('=');
   auto StringAndArch = DeviceAndPath.first.rsplit('-');
   auto KindAndTriple = StringAndArch.first.split('-');
-  return DeviceFile(KindAndTriple.first, KindAndTriple.second,
+  return DeviceFile(getOffloadKind(KindAndTriple.first), KindAndTriple.second,
                     StringAndArch.second, DeviceAndPath.second);
 }
 
@@ -360,8 +375,8 @@
   if (Error E = Output->commit())
     return E;
 
-  DeviceFiles.emplace_back(Kind, Binary.getTriple(), Binary.getArch(),
-                           TempFile);
+  DeviceFiles.emplace_back(Binary.getOffloadKind(), Binary.getTriple(),
+                           Binary.getArch(), TempFile);
 
   Offset += Binary.getSize();
 }
@@ -680,6 +695,39 @@
   return static_cast<std::string>(TempFile);
 }
+
+Expected<std::string> fatbinary(ArrayRef<StringRef> InputFiles,
+                                Triple TheTriple, ArrayRef<StringRef> Archs) {
+  // NVPTX uses the fatbinary program to bundle the linked images.
+  Expected<std::string> FatBinaryPath =
+      findProgram("fatbinary", {CudaBinaryPath});
+  if (!FatBinaryPath)
+    return FatBinaryPath.takeError();
+
+  // Create a new file to write the linked device image to.
+  SmallString<128> TempFile;
+  if (Error Err = createOutputFile(sys::path::filename(ExecutableName) +
+                                       "-device-" + TheTriple.getArchName(),
+                                   "fatbin", TempFile))
+    return std::move(Err);
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 16> CmdArgs;
+  CmdArgs.push_back(*FatBinaryPath);
+  CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back("--create");
+  CmdArgs.push_back(TempFile);
+  for (const auto &FileAndArch : llvm::zip(InputFiles, Archs))
+    CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) +
+                                 ",file=" + std::get<0>(FileAndArch)));
+
+  if (Error Err = executeCommands(*FatBinaryPath, CmdArgs))
+    return std::move(Err);
+
+  return static_cast<std::string>(TempFile);
+}
 } // namespace nvptx
 
 namespace amdgcn {
 Expected<std::string> link(ArrayRef<std::string> InputFiles, Triple TheTriple,
@@ -1124,43 +1172,53 @@
 /// Runs the appropriate linking action on all the device files specified in \p
 /// DeviceFiles. The linked device images are returned in \p LinkedImages.
 Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
-                      SmallVectorImpl<std::string> &LinkedImages) {
-  // Get the list of inputs for a specific device.
+                      SmallVectorImpl<DeviceFile> &LinkedImages) {
+  // Get the list of inputs and active offload kinds for a specific device.
   DenseMap<DeviceFile, SmallVector<std::string, 4>> LinkerInputMap;
-  for (auto &File : DeviceFiles)
+  DenseMap<DeviceFile, DenseSet<OffloadKind>> ActiveOffloadKinds;
+  for (auto &File : DeviceFiles) {
+    ActiveOffloadKinds[File].insert(File.Kind);
     LinkerInputMap[File].push_back(File.Filename);
+  }
 
   // Try to link each device toolchain.
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    auto &LinkerInputFiles = LinkerInput.getSecond();
     bool WholeProgram = false;
 
     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err = linkBitcodeFiles(LinkerInput.getSecond(), TheTriple,
-                                     File.Arch, WholeProgram))
+    if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch,
+                                     WholeProgram))
       return Err;
 
     // If we are embedding bitcode for JIT, skip the final device linking.
     if (EmbedBitcode) {
-      assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed");
-      LinkedImages.push_back(LinkerInput.getSecond().front());
+      assert(!LinkerInputFiles.empty() && "No bitcode image to embed");
+      LinkedImages.emplace_back(OFK_OpenMP, TheTriple.getTriple(), File.Arch,
+                                LinkerInputFiles.front());
       continue;
     }
 
     // If we performed LTO on NVPTX and had whole program visibility, we can use
     // CUDA in non-RDC mode.
    if (WholeProgram && TheTriple.isNVPTX()) {
-      assert(!LinkerInput.getSecond().empty() && "No non-RDC image to embed");
-      LinkedImages.push_back(LinkerInput.getSecond().front());
+      assert(!LinkerInputFiles.empty() && "No non-RDC image to embed");
+      for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()])
+        LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch,
+                                  LinkerInputFiles.front());
       continue;
     }
 
-    auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
+    auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();
 
-    LinkedImages.push_back(*ImageOrErr);
+    // Create separate images for all the active offload kinds.
+    for (OffloadKind Kind : ActiveOffloadKinds[LinkerInput.getFirst()])
+      LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch,
+                                *ImageOrErr);
   }
 
   return Error::success();
 }
@@ -1205,32 +1263,94 @@
   return static_cast<std::string>(ObjectFile);
 }
 
-/// Creates the object file containing the device image and runtime registration
-/// code from the device images stored in \p Images.
-Expected<std::string> wrapDeviceImages(ArrayRef<std::string> Images) {
+/// Load all of the OpenMP images into a buffer and pass it to the binary
+/// wrapping function to create the registration code in the module \p M.
+Error wrapOpenMPImages(Module &M, ArrayRef<DeviceFile> Images) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<ArrayRef<char>, 4> ImagesToWrap;
-
-  for (StringRef ImageFilename : Images) {
+  for (const DeviceFile &File : Images) {
     llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrError =
-        llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename);
+        llvm::MemoryBuffer::getFileOrSTDIN(File.Filename);
     if (std::error_code EC = ImageOrError.getError())
-      return createFileError(ImageFilename, EC);
+      return createFileError(File.Filename, EC);
     ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(),
                               (*ImageOrError)->getBufferSize());
     SavedBuffers.emplace_back(std::move(*ImageOrError));
   }
 
-  LLVMContext Context;
-  Module M("offload.wrapper.module", Context);
-  M.setTargetTriple(HostTriple);
-  if (Error Err = wrapBinaries(M, ImagesToWrap))
-    return std::move(Err);
+  if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap))
+    return Err;
+  return Error::success();
+}
+
+/// Combine all of the CUDA images into a single fatbinary and pass it to the
+/// binary wrapping function to create the registration code in the module \p M.
+Error wrapCudaImages(Module &M, ArrayRef<DeviceFile> Images) {
+  SmallVector<StringRef, 4> InputFiles;
+  SmallVector<StringRef, 4> Architectures;
+  for (const DeviceFile &File : Images) {
+    InputFiles.push_back(File.Filename);
+    Architectures.push_back(File.Arch);
+  }
+
+  // CUDA expects its embedded device images to be a fatbinary.
+  Triple TheTriple = Triple(Images.front().TheTriple);
+  auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures);
+  if (!FileOrErr)
+    return FileOrErr.takeError();
+
+  llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> ImageOrError =
+      llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr);
+  if (std::error_code EC = ImageOrError.getError())
+    return createFileError(*FileOrErr, EC);
+
+  auto ImageToWrap = ArrayRef<char>((*ImageOrError)->getBufferStart(),
+                                    (*ImageOrError)->getBufferSize());
 
-  if (PrintWrappedModule)
-    llvm::errs() << M;
+  if (Error Err = wrapCudaBinary(M, ImageToWrap))
+    return Err;
+  return Error::success();
+}
+
+/// Creates the object file containing the device image and runtime
+/// registration code from the device images stored in \p Images.
+Expected<SmallVector<std::string>>
+wrapDeviceImages(ArrayRef<DeviceFile> Images) {
+  DenseMap<OffloadKind, SmallVector<DeviceFile, 4>> ImagesForKind;
+  for (const DeviceFile &Image : Images)
+    ImagesForKind[Image.Kind].push_back(Image);
+
+  SmallVector<std::string> WrappedImages;
+  for (const auto &KindAndImages : ImagesForKind) {
+    LLVMContext Context;
+    Module M("offload.wrapper.module", Context);
+    M.setTargetTriple(HostTriple);
+
+    switch (KindAndImages.getFirst()) {
+    case OFK_OpenMP:
+      if (Error Err = wrapOpenMPImages(M, KindAndImages.getSecond()))
+        return std::move(Err);
+      break;
+    case OFK_Cuda:
+      if (Error Err = wrapCudaImages(M, KindAndImages.getSecond()))
+        return std::move(Err);
+      break;
+    default:
+      return createStringError(inconvertibleErrorCode(),
+                               getOffloadKindName(KindAndImages.getFirst()) +
+                                   " wrapping is not supported");
+    }
+
+    if (PrintWrappedModule)
+      llvm::errs() << M;
+
+    auto FileOrErr = compileModule(M);
+    if (!FileOrErr)
+      return FileOrErr.takeError();
+    WrappedImages.push_back(*FileOrErr);
+  }
 
-  return compileModule(M);
+  return WrappedImages;
 }
 
 Optional<std::string> findFile(StringRef Dir, const Twine &Name) {
@@ -1361,7 +1481,7 @@
     DeviceFiles.push_back(getBitcodeLibrary(LibraryStr));
 
   // Link the device images extracted from the linker input.
-  SmallVector<std::string, 16> LinkedImages;
+  SmallVector<DeviceFile, 16> LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages))
     return reportError(std::move(Err));
 
@@ -1370,7 +1490,7 @@
   auto FileOrErr = wrapDeviceImages(LinkedImages);
   if (!FileOrErr)
     return reportError(FileOrErr.takeError());
-  LinkerArgs.push_back(*FileOrErr);
+  LinkerArgs.append(*FileOrErr);
 
   // Run the host linking job.
   if (Error Err = runLinker(LinkerUserPath, LinkerArgs))
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
@@ -1,4 +1,4 @@
-//===- OffloadWrapper.h ----------------------------------------------*- C++ -*-===//
+//===- OffloadWrapper.h -----------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@
 
 /// Wrap the input device images into the module \p M as global symbols and
 /// registers the images with the OpenMP Offloading runtime libomptarget.
-llvm::Error wrapBinaries(llvm::Module &M,
-                         llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+llvm::Error wrapOpenMPBinaries(llvm::Module &M,
+                               llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+
+/// Wrap the input fatbinary image into the module \p M as a global symbol and
+/// register the image with the CUDA runtime.
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images);
 
 #endif
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -257,7 +257,7 @@
 
 } // namespace
 
-Error wrapBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
+Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
   GlobalVariable *Desc = createBinDesc(M, Images);
   if (!Desc)
     return createStringError(inconvertibleErrorCode(),
@@ -266,3 +266,8 @@
   createUnregisterFunction(M, Desc);
   return Error::success();
 }
+
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images) {
+  return createStringError(inconvertibleErrorCode(),
+                           "Cuda wrapping is not yet supported.");
+}
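
Note (illustrative, not part of the patch): the core idea above is to make OffloadKind usable as a hashed key so images can be grouped per offload kind (the DenseSet<OffloadKind> in linkDeviceFiles, the per-kind map in wrapDeviceImages). The standalone sketch below shows that same DenseMapInfo pattern with a stand-in OffloadKind enum; it assumes only LLVM's ADT headers and Support library are available, and none of the names in it come from the patch itself.

// Sketch only: this OffloadKind enum is a local stand-in, not the enum the
// linker wrapper actually uses.
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>
#include <cstdio>

enum OffloadKind : uint16_t { OFK_None, OFK_OpenMP, OFK_Cuda };

namespace llvm {
// Same shape as the specialization added in the patch: reserve two values
// that no real OffloadKind takes as the empty and tombstone keys.
template <> struct DenseMapInfo<OffloadKind> {
  static inline OffloadKind getEmptyKey() {
    return static_cast<OffloadKind>(0xFFFF);
  }
  static inline OffloadKind getTombstoneKey() {
    return static_cast<OffloadKind>(0xFFFF - 1);
  }
  static unsigned getHashValue(const OffloadKind &Val) { return Val * 37U; }
  static bool isEqual(const OffloadKind &LHS, const OffloadKind &RHS) {
    return LHS == RHS;
  }
};
} // namespace llvm

int main() {
  // Group image filenames by offload kind, mirroring how wrapDeviceImages
  // emits one wrapper module per active kind.
  llvm::DenseMap<OffloadKind, llvm::SmallVector<const char *, 4>> ImagesForKind;
  ImagesForKind[OFK_OpenMP].push_back("openmp-device.img");
  ImagesForKind[OFK_Cuda].push_back("cuda-device.fatbin");

  for (const auto &KindAndImages : ImagesForKind)
    std::printf("kind %u: %zu image(s)\n",
                static_cast<unsigned>(KindAndImages.getFirst()),
                KindAndImages.getSecond().size());
  return 0;
}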