diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -102,7 +102,7 @@ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=cuda,triple=nvptx64-nvidia-cuda,arch=sm_52 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ // RUN: -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \ +// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=1 \ // RUN: --linker-path=/usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=CUDA // CUDA: nvlink{{.*}}-m64 -o {{.*}}.out -arch sm_52 {{.*}}.o @@ -116,7 +116,7 @@ // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ // RUN: -fembed-offload-object=%t.out // RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu -linker-path \ -// RUN: /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP +// RUN: --wrapper-jobs=1 /usr/bin/ld -- %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=HIP // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx908 -o {{.*}}.out {{.*}}.o // HIP: lld{{.*}}-flavor gnu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -o {{.*}}.out {{.*}}.o @@ -127,7 +127,7 @@ // RUN: --image=file=%S/Inputs/dummy-elf.o,kind=openmp,triple=nvptx64-nvidia-cuda,arch=sm_70 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o \ // RUN: -fembed-offload-object=%t.out -// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu \ +// RUN: clang-linker-wrapper --dry-run --host-triple=x86_64-unknown-linux-gnu --wrapper-jobs=1 \ // RUN: --linker-path=/usr/bin/ld --device-linker=a --device-linker=nvptx64-nvidia-cuda=b -- \ // RUN: %t.o -o a.out 2>&1 | FileCheck %s --check-prefix=LINKER_ARGS diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -42,6 +42,7 @@ #include "llvm/Support/Host.h" #include "llvm/Support/InitLLVM.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Parallel.h" #include "llvm/Support/Path.h" #include "llvm/Support/Program.h" #include "llvm/Support/Signals.h" @@ -1082,6 +1083,7 @@ /// Returns a new ArgList containg arguments used for the device linking phase. DerivedArgList getLinkerArgs(ArrayRef Input, const InputArgList &Args) { + DerivedArgList DAL = DerivedArgList(DerivedArgList(Args)); for (Arg *A : Args) DAL.append(A); @@ -1119,19 +1121,34 @@ /// be registered by the runtime. Expected> linkAndWrapDeviceFiles(SmallVectorImpl &LinkerInputFiles, - const InputArgList &Args) { + const InputArgList &Args, char **Argv, int Argc) { llvm::TimeTraceScope TimeScope("Handle all device input"); - DenseMap> InputsForTarget; + DenseMap> InputMap; for (auto &File : LinkerInputFiles) - InputsForTarget[File].emplace_back(std::move(File)); + InputMap[File].emplace_back(std::move(File)); LinkerInputFiles.clear(); - DenseMap> Images; - for (auto &[ID, Input] : InputsForTarget) { + SmallVector> InputsForTarget; + for (auto &[ID, Input] : InputMap) + InputsForTarget.emplace_back(std::move(Input)); + InputMap.clear(); + + std::mutex ImageMtx; + DenseMap> Images; + auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error { llvm::TimeTraceScope TimeScope("Link device input"); - auto LinkerArgs = getLinkerArgs(Input, Args); + // Each thread needs its own copy of the base arguments to maintain + // per-device argument storage of synthetic strings. + const OptTable &Tbl = getOptTable(); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + auto BaseArgs = + Tbl.parseArgs(Argc, Argv, OPT_INVALID, Saver, [](StringRef Err) { + reportError(createStringError(inconvertibleErrorCode(), Err)); + }); + auto LinkerArgs = getLinkerArgs(Input, BaseArgs); DenseSet ActiveOffloadKinds; for (const auto &File : Input) @@ -1142,7 +1159,7 @@ if (Error Err = linkBitcodeFiles(Input, InputFiles, LinkerArgs)) return std::move(Err); - // Write any remaining device inputs to an output file for the linker job. + // Write any remaining device inputs to an output file for the linker. for (const OffloadFile &File : Input) { auto FileNameOrErr = writeOffloadFile(File); if (!FileNameOrErr) @@ -1150,7 +1167,7 @@ InputFiles.emplace_back(*FileNameOrErr); } - // Link the remaining device files, if necessary, using the device linker. + // Link the remaining device files using the device linker. llvm::Triple Triple(LinkerArgs.getLastArgValue(OPT_triple_EQ)); bool RequiresLinking = !Args.hasArg(OPT_embed_bitcode) && @@ -1171,12 +1188,19 @@ TheImage.TheImageKind = IMG_Object; TheImage.TheOffloadKind = Kind; TheImage.StringData = { - {"triple", LinkerArgs.getLastArgValue(OPT_triple_EQ)}, - {"arch", LinkerArgs.getLastArgValue(OPT_arch_EQ)}}; + {"triple", + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_triple_EQ))}, + {"arch", + Args.MakeArgString(LinkerArgs.getLastArgValue(OPT_arch_EQ))}}; TheImage.Image = std::move(*FileOrErr); + + std::lock_guard Guard(ImageMtx); Images[Kind].emplace_back(std::move(TheImage)); } - } + return Error::success(); + }); + if (Err) + return std::move(Err); // Create a binary image of each offloading image and embed it into a new // object file. @@ -1351,6 +1375,15 @@ if (!CudaBinaryPath.empty()) CudaBinaryPath = CudaBinaryPath + "/bin"; + if (auto *Arg = Args.getLastArg(OPT_wrapper_jobs)) { + unsigned Threads = 0; + if (!llvm::to_integer(Arg->getValue(), Threads) || Threads == 0) + reportError(createStringError( + inconvertibleErrorCode(), "%s: expected a positive integer, got '%s'", + Arg->getSpelling().data(), Arg->getValue())); + parallel::strategy = hardware_concurrency(Threads); + } + if (Args.hasArg(OPT_wrapper_time_trace_eq)) { unsigned Granularity; Args.getLastArgValue(OPT_wrapper_time_trace_granularity, "500") @@ -1367,7 +1400,8 @@ reportError(DeviceInputFiles.takeError()); // Link and wrap the device images extracted from the linker input. - auto FilesOrErr = linkAndWrapDeviceFiles(*DeviceInputFiles, Args); + auto FilesOrErr = + linkAndWrapDeviceFiles(*DeviceInputFiles, Args, Argv, Argc); if (!FilesOrErr) reportError(FilesOrErr.takeError()); diff --git a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td --- a/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td +++ b/clang/tools/clang-linker-wrapper/LinkerWrapperOpts.td @@ -59,6 +59,10 @@ Flags<[WrapperOnlyOption]>, MetaVarName<"">, HelpText<"Set the granularity of time-trace updates">; +def wrapper_jobs : Joined<["--"], "wrapper-jobs=">, + Flags<[WrapperOnlyOption]>, MetaVarName<"">, + HelpText<"Sets the number of parallel jobs to use for device linking">; + // Flags passed to the device linker. def arch_EQ : Joined<["--"], "arch=">, Flags<[DeviceOnlyOption, HelpHidden]>, MetaVarName<"">,