diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -685,6 +685,39 @@
   return static_cast<std::string>(TempFile);
 }
+
+Expected<std::string> fatbinary(ArrayRef<std::string> InputFiles,
+                                Triple TheTriple, ArrayRef<std::string> Archs) {
+  // NVPTX uses the fatbinary program to bundle the linked images.
+  Expected<std::string> FatBinaryPath =
+      findProgram("fatbinary", {CudaBinaryPath});
+  if (!FatBinaryPath)
+    return FatBinaryPath.takeError();
+
+  // Create a new file to write the linked device image to.
+  SmallString<128> TempFile;
+  if (Error Err = createOutputFile(sys::path::filename(ExecutableName) +
+                                       "-device-" + TheTriple.getArchName(),
+                                   "fatbin", TempFile))
+    return std::move(Err);
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver(Alloc);
+
+  SmallVector<StringRef, 16> CmdArgs;
+  CmdArgs.push_back(*FatBinaryPath);
+  CmdArgs.push_back(TheTriple.isArch64Bit() ? "-64" : "-32");
+  CmdArgs.push_back("--create");
+  CmdArgs.push_back(TempFile);
+  for (const auto &FileAndArch : llvm::zip(InputFiles, Archs))
+    CmdArgs.push_back(Saver.save("--image=profile=" + std::get<1>(FileAndArch) +
+                                 ",file=" + std::get<0>(FileAndArch)));
+
+  if (Error Err = executeCommands(*FatBinaryPath, CmdArgs))
+    return std::move(Err);
+
+  return static_cast<std::string>(TempFile);
+}
 } // namespace nvptx

 namespace amdgcn {
 Expected<std::string> link(ArrayRef<std::string> InputFiles, Triple TheTriple,
@@ -1133,34 +1166,42 @@
 /// Runs the appropriate linking action on all the device files specified in \p
 /// DeviceFiles. The linked device images are returned in \p LinkedImages.
 Error linkDeviceFiles(ArrayRef<DeviceFile> DeviceFiles,
-                      SmallVectorImpl<std::string> &LinkedImages) {
-  // Get the list of inputs for a specific device.
-  DenseMap<DeviceFile, SmallVector<std::string, 16>> LinkerInputMap;
-  for (auto &File : DeviceFiles)
-    LinkerInputMap[File].push_back(File.Filename);
+                      SmallVectorImpl<DeviceFile> &LinkedImages) {
+  // Get the list of inputs and active offload kinds for a specific device.
+  DenseMap<DeviceFile, std::pair<DenseSet<StringRef>, SmallVector<std::string>>>
+      LinkerInputMap;
+  for (auto &File : DeviceFiles) {
+    LinkerInputMap[File].first.insert(File.Kind);
+    LinkerInputMap[File].second.push_back(File.Filename);
+  }

   // Try to link each device toolchain.
   for (auto &LinkerInput : LinkerInputMap) {
     DeviceFile &File = LinkerInput.getFirst();
     Triple TheTriple = Triple(File.TheTriple);
+    auto &LinkerInputFiles = LinkerInput.getSecond().second;

     // Run LTO on any bitcode files and replace the input with the result.
-    if (Error Err =
-            linkBitcodeFiles(LinkerInput.getSecond(), TheTriple, File.Arch))
+    if (Error Err = linkBitcodeFiles(LinkerInputFiles, TheTriple, File.Arch))
       return Err;

     // If we are embedding bitcode for JIT, skip the final device linking.
     if (EmbedBitcode) {
-      assert(!LinkerInput.getSecond().empty() && "No bitcode image to embed");
-      LinkedImages.push_back(LinkerInput.getSecond().front());
+      assert(!LinkerInputFiles.empty() && "No bitcode image to embed");
+      LinkedImages.emplace_back("openmp", TheTriple.getTriple(), File.Arch,
+                                LinkerInputFiles.front());
       continue;
     }

-    auto ImageOrErr = linkDevice(LinkerInput.getSecond(), TheTriple, File.Arch);
+    auto ImageOrErr = linkDevice(LinkerInputFiles, TheTriple, File.Arch);
     if (!ImageOrErr)
       return ImageOrErr.takeError();

-    LinkedImages.push_back(*ImageOrErr);
+    // Create separate images for all the active offload kinds.
+    for (StringRef Kind : LinkerInput.getSecond().first)
+      LinkedImages.emplace_back(Kind, TheTriple.getTriple(), File.Arch,
+                                *ImageOrErr);
   }
   return Error::success();
 }
@@ -1205,32 +1246,87 @@
   return static_cast<std::string>(ObjectFile);
 }

-/// Creates the object file containing the device image and runtime registration
-/// code from the device images stored in \p Images.
-Expected<std::string> wrapDeviceImages(ArrayRef<StringRef> Images) {
+/// Load all of the OpenMP images into a buffer and pass it to the binary
+/// wrapping function to create the registration code in the module \p M.
+Error wrapOpenMPImages(Module &M, ArrayRef<DeviceFile> Images) {
   SmallVector<std::unique_ptr<MemoryBuffer>, 4> SavedBuffers;
   SmallVector<ArrayRef<char>, 4> ImagesToWrap;
-
-  for (StringRef ImageFilename : Images) {
+  for (const DeviceFile &File : Images) {
     llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
-        llvm::MemoryBuffer::getFileOrSTDIN(ImageFilename);
+        llvm::MemoryBuffer::getFileOrSTDIN(File.Filename);
     if (std::error_code EC = ImageOrError.getError())
-      return createFileError(ImageFilename, EC);
+      return createFileError(File.Filename, EC);
     ImagesToWrap.emplace_back((*ImageOrError)->getBufferStart(),
                               (*ImageOrError)->getBufferSize());
     SavedBuffers.emplace_back(std::move(*ImageOrError));
   }

-  LLVMContext Context;
-  Module M("offload.wrapper.module", Context);
-  M.setTargetTriple(HostTriple);
-  if (Error Err = wrapBinaries(M, ImagesToWrap))
-    return std::move(Err);
+  if (Error Err = wrapOpenMPBinaries(M, ImagesToWrap))
+    return Err;
+  return Error::success();
+}
+
+/// Combine all of the CUDA images into a single fatbinary and pass it to the
+/// binary wrapping function to create the registration code in the module \p M.
+Error wrapCudaImages(Module &M, ArrayRef<DeviceFile> Images) {
+  SmallVector<std::string, 4> InputFiles;
+  SmallVector<std::string, 4> Architectures;
+  for (const DeviceFile &File : Images) {
+    InputFiles.push_back(File.Filename);
+    Architectures.push_back(File.Arch);
+  }
+
+  // CUDA expects its embedded device images to be a fatbinary.
+  Triple TheTriple = Triple(Images.front().TheTriple);
+  auto FileOrErr = nvptx::fatbinary(InputFiles, TheTriple, Architectures);
+  if (!FileOrErr)
+    return FileOrErr.takeError();
+
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> ImageOrError =
+      llvm::MemoryBuffer::getFileOrSTDIN(*FileOrErr);
+  if (std::error_code EC = ImageOrError.getError())
+    return createFileError(*FileOrErr, EC);
+
+  auto ImageToWrap = ArrayRef<char>((*ImageOrError)->getBufferStart(),
+                                    (*ImageOrError)->getBufferSize());

-  if (PrintWrappedModule)
-    llvm::errs() << M;
+  if (Error Err = wrapCudaBinary(M, ImageToWrap))
+    return Err;
+  return Error::success();
+}
+
+/// Creates the object file containing the device image and runtime
+/// registration code from the device images stored in \p Images.
+Expected<SmallVector<std::string, 4>>
+wrapDeviceImages(ArrayRef<DeviceFile> Images) {
+  StringMap<SmallVector<DeviceFile, 4>> ImagesForKind;
+  for (const DeviceFile &Image : Images)
+    ImagesForKind[Image.Kind].push_back(Image);
+
+  SmallVector<std::string, 4> WrappedImages;
+  for (const auto &KindAndImages : ImagesForKind) {
+    LLVMContext Context;
+    Module M("offload.wrapper.module", Context);
+    M.setTargetTriple(HostTriple);
+
+    if (KindAndImages.getKey() == "openmp") {
+      if (Error Err = wrapOpenMPImages(M, KindAndImages.getValue()))
+        return std::move(Err);
+    } else if (KindAndImages.getKey() == "cuda") {
+      if (Error Err = wrapCudaImages(M, KindAndImages.getValue()))
+        return std::move(Err);
+    }
+
+    if (PrintWrappedModule)
+      llvm::errs() << M;
+
+    auto FileOrErr = compileModule(M);
+    if (!FileOrErr)
+      return FileOrErr.takeError();
+    WrappedImages.push_back(*FileOrErr);
+  }

-  return compileModule(M);
+  return WrappedImages;
 }

 Optional<std::string> findFile(StringRef Dir, const Twine &Name) {
@@ -1361,7 +1457,7 @@
     DeviceFiles.push_back(getBitcodeLibrary(LibraryStr));

   // Link the device images extracted from the linker input.
-  SmallVector<std::string, 16> LinkedImages;
+  SmallVector<DeviceFile, 16> LinkedImages;
   if (Error Err = linkDeviceFiles(DeviceFiles, LinkedImages))
     return reportError(std::move(Err));

@@ -1370,7 +1466,7 @@
   auto FileOrErr = wrapDeviceImages(LinkedImages);
   if (!FileOrErr)
     return reportError(FileOrErr.takeError());
-  LinkerArgs.push_back(*FileOrErr);
+  LinkerArgs.append(*FileOrErr);

   // Run the host linking job.
   if (Error Err = runLinker(LinkerUserPath, LinkerArgs))
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.h b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.h
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.h
@@ -1,4 +1,4 @@
-//===- OffloadWrapper.h -------------------------------------------*- C++ -*-===//
+//===- OffloadWrapper.h -----------------------------------------*- C++ -*-===//
 //
 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 // See https://llvm.org/LICENSE.txt for license information.
@@ -14,7 +14,11 @@

 /// Wrap the input device images into the module \p M as global symbols and
 /// registers the images with the OpenMP Offloading runtime libomptarget.
-llvm::Error wrapBinaries(llvm::Module &M,
-                         llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+llvm::Error wrapOpenMPBinaries(llvm::Module &M,
+                               llvm::ArrayRef<llvm::ArrayRef<char>> Images);
+
+/// Wrap the input fatbinary image into the module \p M as global symbols and
+/// registers the images with the CUDA runtime.
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images);

 #endif
diff --git a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/OffloadWrapper.cpp
@@ -257,7 +257,7 @@

 } // namespace

-Error wrapBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
+Error wrapOpenMPBinaries(Module &M, ArrayRef<ArrayRef<char>> Images) {
   GlobalVariable *Desc = createBinDesc(M, Images);
   if (!Desc)
     return createStringError(inconvertibleErrorCode(),
@@ -266,3 +266,8 @@
   createUnregisterFunction(M, Desc);
   return Error::success();
 }
+
+llvm::Error wrapCudaBinary(llvm::Module &M, llvm::ArrayRef<char> Images) {
+  return createStringError(inconvertibleErrorCode(),
+                           "Cuda wrapping is not yet supported.");
+}