diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -107,7 +107,19 @@
 
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_52 {{.*}}.o
 // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_70 {{.*}}.o {{.*}}.o
-// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_52,file={{.*}}.out --image=profile=sm_70,file={{.*}}.out
+// CUDA: fatbinary{{.*}}-64 --create {{.*}}.fatbin --image=profile=sm_70,file={{.*}}.out --image=profile=sm_52,file={{.*}}.out
+
+// RUN: clang-offload-packager -o %t.out \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_80 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_75 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_70 \
+// RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \
+// RUN:   -fembed-offload-object=%t.out
+// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=4 \
+// RUN:   --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA-PAR
+
+// CUDA-PAR: fatbinary{{.*}}-64 --create {{.*}}.fatbin
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=hip,triple=amdgcn-amd-amdhsa,arch=gfx90a \
@@ -120,7 +132,7 @@
 
 // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o
 // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o
-// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx908,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
+// HIP: clang-offload-bundler{{.*}}-type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux,hipv4-amdgcn-amd-amdhsa--gfx90a,hipv4-amdgcn-amd-amdhsa--gfx908 -input=/dev/null -input={{.*}}.out -input={{.*}}out -output={{.*}}.hipfb
 
 // RUN: clang-offload-packager -o %t.out \
 // RUN:   --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx908 \
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -42,6 +42,7 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Parallel.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
@@ -1119,19 +1120,34 @@
 /// be registered by the runtime.
 Expected>
 linkAndWrapDeviceFiles(SmallVectorImpl &LinkerInputFiles,
-                       const InputArgList &Args) {
+                       const InputArgList &Args, char **Argv, int Argc) {
   llvm::TimeTraceScope TimeScope("Handle all device input");
 
-  DenseMap> InputsForTarget;
+  DenseMap> InputMap;
   for (auto &File : LinkerInputFiles)
-    InputsForTarget[File].emplace_back(std::move(File));
+    InputMap[File].emplace_back(std::move(File));
   LinkerInputFiles.clear();
 
-  DenseMap> Images;
-  for (auto &[ID, Input] : InputsForTarget) {
+  SmallVector> InputsForTarget;
+  for (auto &[ID, Input] : InputMap)
+    InputsForTarget.emplace_back(std::move(Input));
+  InputMap.clear();
+
+  std::mutex ImageMtx;
+  DenseMap> Images;
+  auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
     llvm::TimeTraceScope TimeScope("Link device input");
 
-    auto LinkerArgs = getLinkerArgs(Input, Args);
+    // Each thread needs its own copy of the base arguments to maintain
+    // per-device argument storage of synthetic strings.
+    const OptTable &Tbl = getOptTable();
+    BumpPtrAllocator Alloc;
+    StringSaver Saver(Alloc);
+    auto BaseArgs =
+        Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [](StringRef Err) {
+          reportError(createStringError(inconvertibleErrorCode(), Err));
+        });
+    auto LinkerArgs = getLinkerArgs(Input, BaseArgs);
 
     DenseSet ActiveOffloadKinds;
     for (const auto &File : Input)
@@ -1142,7 +1158,7 @@
     if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs))
       return std::move(Err);
 
-    // Write any remaining device inputs to an output file for the linker job.
+    // Write any remaining device inputs to an output file for the linker.
     for (const OffloadFile &File : Input) {
       auto FileNameOrErr = writeOffloadFile(File);
       if (!FileNameOrErr)
@@ -1150,7 +1166,7 @@
       InputFiles.emplace_back(*FileNameOrErr);
     }
 
-    // Link the remaining device files, if necessary, using the device linker.
+    // Link the remaining device files using the device linker.
     llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ));
     bool RequiresLinking =
         !Args.hasArg(OPT_embed_bitcode) &&
@@ -1171,17 +1187,37 @@
       TheImage.TheImageKind = IMG_Object;
       TheImage.TheOffloadKind = Kind;
+
+      // Args is shared by every worker and MakeArgString appends to its
+      // string storage, so the lock must be held before it is called.
+      std::lock_guard Guard(ImageMtx);
       TheImage.StringData = {
-          {"triple", LinkerArgs.getLastArgValue(OPT_triple_EQ)},
-          {"arch", LinkerArgs.getLastArgValue(OPT_arch_EQ)}};
+          {"triple",
+           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ))},
+          {"arch",
+           Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ))}};
       TheImage.Image = std::move(*FileOrErr);
       Images[Kind].emplace_back(std::move(TheImage));
     }
-  }
+    return Error::success();
+  });
+  if (Err)
+    return std::move(Err);
 
   // Create a binary image of each offloading image and embed it into a new
   // object file.
   SmallVector WrappedOutput;
-  for (const auto &[Kind, Input] : Images) {
+  for (auto &[Kind, Input] : Images) {
+    // We sort the entries before bundling so they appear in a deterministic
+    // order in the final binary. Keys are compared one at a time (triple
+    // descending, then arch descending, then offload kind ascending) so the
+    // predicate is a strict weak ordering, which sorting requires.
+    llvm::sort(Input, [](OffloadingImage &A, OffloadingImage &B) {
+      if (int Cmp = A.StringData["triple"].compare(B.StringData["triple"]))
+        return Cmp > 0;
+      if (int Cmp = A.StringData["arch"].compare(B.StringData["arch"]))
+        return Cmp > 0;
+      return A.TheOffloadKind < B.TheOffloadKind;
+    });
     auto BundledImagesOrErr = bundleLinkedOutput(Input, Args, Kind);
     if (!BundledImagesOrErr)
       return BundledImagesOrErr.takeError();
@@ -1362,6 +1398,16 @@
   if (!CudaBinaryPath.empty())
     CudaBinaryPath = CudaBinaryPath + "/bin";
 
+  parallel::strategy = hardware_concurrency(1);
+  if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) {
+    unsigned Threads = 0;
+    if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0)
+      reportError(createStringError(
+          inconvertibleErrorCode(), "%s: expected a positive integer, got '%s'",
+          Arg->getSpelling().str().c_str(), Arg->getValue()));
+    parallel::strategy = hardware_concurrency(Threads);
+  }
+
   if (Args.hasArg(OPT_wrapper_time_trace_eq)) {
     unsigned Granularity;
     Args.getLastArgValue(OPT_wrapper_time_trace_granularity, "500")
@@ -1378,7 +1424,8 @@
     reportError(DeviceInputFiles.takeError());
 
   // Link and wrap the device images extracted from the linker input.
-  auto FilesOrErr = linkAndWrapDeviceFiles(*DeviceInputFiles, Args);
+  auto FilesOrErr =
+      linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, Argc);
   if (!FilesOrErr)
     reportError(FilesOrErr.takeError());
 
diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
--- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
+++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td
@@ -59,6 +59,10 @@
   Flags<[WrapperOnlyOption]>, MetaVarName<"">,
   HelpText<"Set the granularity of time-trace updates">;
 
+def wrapper_jobs : Joined<["--"], "wrapper-jobs=">,
+  Flags<[WrapperOnlyOption]>, MetaVarName<"">,
+  HelpText<"Sets the number of parallel jobs to use for device linking">;
+
 // Flags passed to the device linker.
 def arch_EQ : Joined<["--"], "arch=">,
   Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">,