diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -8447,12 +8447,10 @@ } } - // TODO: We need to pass in the full target-id and handle it properly in the - // linker wrapper. SmallVector Parts{ "file=" + File.str(), "triple=" + TC->getTripleString(), - "arch=" + getProcessorFromTargetID(TC->getTriple(), Arch).str(), + "arch=" + Arch.str(), "kind=" + Kind.str(), }; diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c --- a/clang/test/Driver/amdgpu-openmp-toolchain.c +++ b/clang/test/Driver/amdgpu-openmp-toolchain.c @@ -65,7 +65,7 @@ // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \ // RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID -// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a,kind=openmp,feature=-sramecc,feature=+xnack +// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp,feature=-sramecc,feature=+xnack // RUN: not %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a,gfx90a:xnack+ \ // RUN: -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID-ERROR diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -2,6 +2,9 @@ // REQUIRES: nvptx-registered-target // REQUIRES: amdgpu-registered-target +// An externally visible variable so static libraries extract. +__attribute__((visibility("protected"), used)) int x; + // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o // RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.bc // RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc @@ -36,6 +39,18 @@ // AMDGPU-LINK: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx908 -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o +// RUN: clang-offload-packager -o %t-lib.out \ +// RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out +// RUN: llvm-ar rcs %t.a %t.o +// RUN: clang-offload-packager -o %t.out \ +// RUN: --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack+ +// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t.out +// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \ +// RUN: --linker-path=/usr/bin/ld -- %t.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMDGPU-LINK-ID + +// AMDGPU-LINK-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o + // RUN: clang-offload-packager -o %t.out \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 \ // RUN: --image=file=%t.amdgpu.bc,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx1030 diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -969,9 +969,13 @@ for (Arg *A : Args) DAL.append(A); - // Set the subarchitecture and target triple for this compilation. + // Set the subarchitecture and target triple for this compilation. The input + // may be an AMDGPU target-id so we split off anything before the colon. const OptTable &Tbl = getOptTable(); DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_arch_EQ), + Args.MakeArgString( + Input.front().getBinary()->getArch().split(':').first)); + DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_full_arch_EQ), Args.MakeArgString(Input.front().getBinary()->getArch())); DAL.AddJoinedArg(nullptr, Tbl.getOption(OPT_triple_EQ), Args.MakeArgString(Input.front().getBinary()->getTriple())); @@ -1001,23 +1005,13 @@ /// Transforms all the extracted offloading input files into an image that can /// be registered by the runtime. Expected> -linkAndWrapDeviceFiles(SmallVectorImpl &LinkerInputFiles, +linkAndWrapDeviceFiles(SmallVector> &LinkerInputFiles, const InputArgList &Args, char **Argv, int Argc) { llvm::TimeTraceScope TimeScope("Handle all device input"); - DenseMap> InputMap; - for (auto &File : LinkerInputFiles) - InputMap[File].emplace_back(std::move(File)); - LinkerInputFiles.clear(); - - SmallVector> InputsForTarget; - for (auto &[ID, Input] : InputMap) - InputsForTarget.emplace_back(std::move(Input)); - InputMap.clear(); - std::mutex ImageMtx; DenseMap> Images; - auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error { + auto Err = parallelForEachError(LinkerInputFiles, [&](auto &Input) -> Error { llvm::TimeTraceScope TimeScope("Link device input"); // Each thread needs its own copy of the base arguments to maintain @@ -1075,7 +1069,7 @@ TheImage.StringData["triple"] = Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ)); TheImage.StringData["arch"] = - Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ)); + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_full_arch_EQ)); TheImage.Image = std::move(*FileOrErr); Images[Kind].emplace_back(std::move(TheImage)); @@ -1294,7 +1288,8 @@ /// and add it to the list of files to be linked. Files coming from static /// libraries are only added to the input if they are used by an existing /// input file. -Expected> getDeviceInput(const ArgList &Args) { +Expected>> +getDeviceInput(const ArgList &Args) { llvm::TimeTraceScope TimeScope("ExtractDeviceCode"); StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ); @@ -1306,7 +1301,7 @@ StringSaver Saver(Alloc); // Try to extract device code from the linker input files. - SmallVector InputFiles; + DenseMap> InputMap; DenseMap> Syms; bool WholeArchive = false; for (const opt::Arg *Arg : Args.filtered( @@ -1353,23 +1348,48 @@ if (!Binary.getBinary()) continue; - // If we don't have an object file for this architecture do not - // extract. - if (IsArchive && !WholeArchive && !Syms.count(Binary)) - continue; - - Expected ExtractOrErr = - getSymbols(Binary.getBinary()->getImage(), - Binary.getBinary()->getOffloadKind(), IsArchive, Saver, - Syms[Binary]); - if (!ExtractOrErr) - return ExtractOrErr.takeError(); - - Extracted = !WholeArchive && *ExtractOrErr; - - if (!IsArchive || WholeArchive || Extracted) - InputFiles.emplace_back(std::move(Binary)); - + // Initialize the map with an empty set of inputs. + OffloadFile::TargetID BinaryID = + OffloadFile::TargetID(Saver.save(Binary.getBinary()->getTriple()), + Saver.save(Binary.getBinary()->getArch())); + if (!InputMap.count(BinaryID)) + InputMap[BinaryID] = SmallVector(); + + // We need to compare this binary input with every input architecture + // and copy it in if it's compatible. This allows a single binary to + // participate in multiple link jobs. + DenseMap> NewInputMap; + for (const auto &[ID, Input] : InputMap) { + // If we don't have an object file for this architecture do not + // extract. + if (IsArchive && !WholeArchive && Input.empty()) + continue; + + // We only add the input if the binary is compatible with the slot. + if (!areTargetsCompatible(Binary, ID)) + continue; + + Expected ExtractOrErr = getSymbols( + Binary.getBinary()->getImage(), + Binary.getBinary()->getOffloadKind(), IsArchive, Saver, Syms[ID]); + if (!ExtractOrErr) + return ExtractOrErr.takeError(); + + Extracted = !WholeArchive && *ExtractOrErr; + + if (!IsArchive || WholeArchive || Extracted) { + auto NewBinaryOrErr = Binary.copy(); + if (!NewBinaryOrErr) + return NewBinaryOrErr.takeError(); + NewInputMap[ID].emplace_back(std::move(*NewBinaryOrErr)); + } + } + + for (auto &[NewID, NewInput] : NewInputMap) + InputMap[NewID].append(std::make_move_iterator(NewInput.begin()), + std::make_move_iterator(NewInput.end())); + + Binary.takeBinary(); // If we extracted any files we need to check all the symbols again. if (Extracted) break; @@ -1377,12 +1397,11 @@ } } - for (StringRef Library : Args.getAllArgValues(OPT_bitcode_library_EQ)) { - auto FileOrErr = getInputBitcodeLibrary(Library); - if (!FileOrErr) - return FileOrErr.takeError(); - InputFiles.push_back(std::move(*FileOrErr)); - } + SmallVector> InputFiles; + for (auto &[ID, Input] : InputMap) + if (!Input.empty()) + InputFiles.emplace_back(std::move(Input)); + InputMap.clear(); return std::move(InputFiles); } diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -67,6 +67,9 @@ def arch_EQ : Joined<["--"], "arch=">, Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">, HelpText<"The device subarchitecture">; +def full_arch_EQ : Joined<["--"], "full-arch=">, + Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">, + HelpText<"The fully qualifier device subarchitecture for AMD's target ID">; def triple_EQ : Joined<["--"], "triple=">, Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">, HelpText<"The device target triple">; diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h --- a/llvm/include/llvm/Object/OffloadBinary.h +++ b/llvm/include/llvm/Object/OffloadBinary.h @@ -17,6 +17,7 @@ #ifndef LLVM_OBJECT_OFFLOADBINARY_H #define LLVM_OBJECT_OFFLOADBINARY_H +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/Binary.h" @@ -155,12 +156,22 @@ /// owns its memory. class OffloadFile : public OwningBinary { public: + /// An ordered pair of the target triple and the architecture. using TargetID = std::pair; OffloadFile(std::unique_ptr Binary, std::unique_ptr Buffer) : OwningBinary(std::move(Binary), std::move(Buffer)) {} + Expected copy() const { + std::unique_ptr Buffer = MemoryBuffer::getMemBufferCopy( + getBinary()->getMemoryBufferRef().getBuffer()); + auto NewBinaryOrErr = OffloadBinary::create(*Buffer); + if (!NewBinaryOrErr) + return NewBinaryOrErr.takeError(); + return OffloadFile(std::move(*NewBinaryOrErr), std::move(Buffer)); + } + /// We use the Triple and Architecture pair to group linker inputs together. /// This conversion function lets us use these inputs in a hash-map. operator TargetID() const { @@ -168,6 +179,28 @@ } }; +/// Queries if the target \p LHS is compatible with \p RHS for linking purposes. +inline bool areTargetsCompatible(const OffloadFile::TargetID LHS, + const OffloadFile::TargetID RHS) { + if (LHS == RHS) + return true; + + // If the target is AMD we check the target IDs for compatibility. A target id + // is a string conforming to the folowing BNF syntax: + // + // target-id ::= ' ( : ( '+' | '-' ) )*' + // + // This is used to link mutually compatible architectures together. + llvm::Triple T(LHS.first); + if (!T.isAMDGPU()) + return false; + + // The targets are compatible if the architecture is a subset of the other. + if (RHS.second.contains(LHS.second)) + return true; + return false; +} + /// Extracts embedded device offloading code from a memory \p Buffer to a list /// of \p Binaries. Error extractOffloadBinaries(MemoryBufferRef Buffer,