diff --git a/clang/lib/Driver/ToolChains/CommonArgs.h b/clang/lib/Driver/ToolChains/CommonArgs.h --- a/clang/lib/Driver/ToolChains/CommonArgs.h +++ b/clang/lib/Driver/ToolChains/CommonArgs.h @@ -88,7 +88,8 @@ bool isObjCAutoRefCount(const llvm::opt::ArgList &Args); -unsigned getLTOParallelism(const llvm::opt::ArgList &Args, const Driver &D); +llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args, + const Driver &D); bool areOptimizationsEnabled(const llvm::opt::ArgList &Args); diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/YAMLParser.h" @@ -338,14 +339,14 @@ } } -unsigned tools::getLTOParallelism(const ArgList &Args, const Driver &D) { - unsigned Parallelism = 0; +llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) { Arg *LtoJobsArg = Args.getLastArg(options::OPT_flto_jobs_EQ); - if (LtoJobsArg && - StringRef(LtoJobsArg->getValue()).getAsInteger(10, Parallelism)) - D.Diag(diag::err_drv_invalid_int_value) << LtoJobsArg->getAsString(Args) - << LtoJobsArg->getValue(); - return Parallelism; + if (!LtoJobsArg) + return {}; + if (!llvm::get_threadpool_strategy(LtoJobsArg->getValue())) + D.Diag(diag::err_drv_invalid_int_value) + << LtoJobsArg->getAsString(Args) << LtoJobsArg->getValue(); + return LtoJobsArg->getValue(); } // CloudABI uses -ffunction-sections and -fdata-sections by default. 
@@ -410,7 +411,8 @@ if (IsThinLTO) CmdArgs.push_back("-plugin-opt=thinlto"); - if (unsigned Parallelism = getLTOParallelism(Args, ToolChain.getDriver())) + StringRef Parallelism = getLTOParallelism(Args, ToolChain.getDriver()); + if (!Parallelism.empty()) CmdArgs.push_back( Args.MakeArgString("-plugin-opt=jobs=" + Twine(Parallelism))); diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include // ::getenv @@ -605,10 +606,12 @@ getMachOToolChain().addProfileRTLibs(Args, CmdArgs); - if (unsigned Parallelism = - getLTOParallelism(Args, getToolChain().getDriver())) { + StringRef Parallelism = getLTOParallelism(Args, getToolChain().getDriver()); + if (!Parallelism.empty()) { CmdArgs.push_back("-mllvm"); - CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(Parallelism))); + unsigned NumThreads = + llvm::get_threadpool_strategy(Parallelism)->compute_thread_count(); + CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(NumThreads))); } if (getToolChain().ShouldLinkCXXStdlib(Args)) diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -144,7 +144,7 @@ unsigned ltoo = 2; // Used for /opt:lldltojobs=N - unsigned thinLTOJobs = 0; + std::string thinLTOJobs; // Used for /opt:lldltopartitions=N unsigned ltoPartitions = 1; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -1417,9 +1417,9 @@ error("/opt:lldlto: invalid optimization level: " + optLevel); } else if (s.startswith("lldltojobs=")) { StringRef jobs = s.substr(11); - if (jobs.getAsInteger(10, config->thinLTOJobs) || - config->thinLTOJobs == 0) + if 
(!get_threadpool_strategy(jobs)) error("/opt:lldltojobs: invalid job count: " + jobs); + config->thinLTOJobs = jobs.str(); } else if (s.startswith("lldltopartitions=")) { StringRef n = s.substr(17); if (n.getAsInteger(10, config->ltoPartitions) || diff --git a/lld/COFF/LTO.cpp b/lld/COFF/LTO.cpp --- a/lld/COFF/LTO.cpp +++ b/lld/COFF/LTO.cpp @@ -101,8 +101,9 @@ std::string(config->thinLTOPrefixReplace.first), std::string(config->thinLTOPrefixReplace.second), config->thinLTOEmitImportsFiles, indexFile.get(), OnIndexWrite); - } else if (config->thinLTOJobs != 0) { - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + } else { + backend = lto::createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(config->thinLTOJobs)); } ltoObj = std::make_unique(createConfig(), backend, diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -246,7 +246,7 @@ unsigned ltoPartitions; unsigned ltoo; unsigned optimize; - unsigned thinLTOJobs; + StringRef thinLTOJobs; unsigned timeTraceGranularity; int32_t splitStackAdjustSize; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -974,7 +974,7 @@ config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); - config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u); + config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs); config->thinLTOObjectSuffixReplace = getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq); config->thinLTOPrefixReplace = @@ -1040,8 +1040,8 @@ error("invalid optimization level for LTO: " + Twine(config->ltoo)); if (config->ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (config->thinLTOJobs == 0) - error("--thinlto-jobs: number of threads must be > 0"); + if (!get_threadpool_strategy(config->thinLTOJobs)) + 
error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs); if (config->splitStackAdjustSize < 0) error("--split-stack-adjust-size: size must be >= 0"); diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -146,8 +146,9 @@ std::string(config->thinLTOPrefixReplace.first), std::string(config->thinLTOPrefixReplace.second), config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); - } else if (config->thinLTOJobs != -1U) { - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + } else { + backend = lto::createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(config->thinLTOJobs)); } ltoObj = std::make_unique(createConfig(), backend, diff --git a/lld/test/COFF/thinlto.ll b/lld/test/COFF/thinlto.ll --- a/lld/test/COFF/thinlto.ll +++ b/lld/test/COFF/thinlto.ll @@ -6,6 +6,16 @@ ; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj ; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; Test various possible options for /opt:lldltojobs +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1 +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=all +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1000 +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: not lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=foo 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; BAD-JOBS: error: /opt:lldltojobs: invalid job count: foo + ; This command will store 
full path to foo.obj in the archive %t.lib ; Check that /lldsavetemps is still usable in such case. ; RUN: lld-link /lib %T/thinlto/foo.obj /out:%t.lib diff --git a/lld/test/ELF/basic.s b/lld/test/ELF/basic.s --- a/lld/test/ELF/basic.s +++ b/lld/test/ELF/basic.s @@ -249,9 +249,21 @@ # RUN: not ld.lld %t --plugin-opt=lto-partitions=0 2>&1 | FileCheck --check-prefix=NOTHREADS %s # NOTHREADS: --lto-partitions: number of threads must be > 0 -# RUN: not ld.lld %t --thinlto-jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s -# RUN: not ld.lld %t --plugin-opt=jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s -# NOTHREADSTHIN: --thinlto-jobs: number of threads must be > 0 +# RUN: ld.lld %t --thinlto-jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# THREADSTHIN: basic.s.tmp +# RUN: not ld.lld %t --thinlto-jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s +# BADTHREADSTHIN: error: --thinlto-jobs: invalid job count: foo + +# RUN: ld.lld %t --plugin-opt=jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: not ld.lld %t --plugin-opt=jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s # RUN: not ld.lld %t -z ifunc-noplt -z text 2>&1 | FileCheck 
--check-prefix=NOIFUNCPLTNOTEXTREL %s # NOIFUNCPLTNOTEXTREL: -z text and -z ifunc-noplt may not be used together diff --git a/lld/test/ELF/lto/thinlto.ll b/lld/test/ELF/lto/thinlto.ll --- a/lld/test/ELF/lto/thinlto.ll +++ b/lld/test/ELF/lto/thinlto.ll @@ -16,8 +16,25 @@ ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 -; Then check without --thinlto-jobs (which currently default to hardware_concurrency) -; RUN: ld.lld -shared %t1.o %t2.o -o %t3 +; Test with all threads, on all cores, on all CPU sockets +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=all -shared %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with many more threads than the system has +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=1000 -shared %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with a bad value +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo + +; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT) +; RUN: ld.lld -shared -save-temps %t1.o %t2.o -o %t3 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 diff --git a/lld/test/wasm/lto/thinlto.ll b/lld/test/wasm/lto/thinlto.ll --- a/lld/test/wasm/lto/thinlto.ll +++ b/lld/test/wasm/lto/thinlto.ll @@ -14,8 +14,26 @@ ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 -; Check without --thinlto-jobs (which currently
default to hardware_concurrency) -; RUN: wasm-ld -r %t1.o %t2.o -o %t3 +; Test with all threads, on all cores, on all CPU sockets +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps --thinlto-jobs=all %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with many more threads than the system has +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps --thinlto-jobs=1000 %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with a bad value +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: not wasm-ld -r -save-temps --thinlto-jobs=foo %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo + +; Check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT) +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps %t1.o %t2.o -o %t3 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -53,7 +53,7 @@ unsigned ltoPartitions; unsigned ltoo; unsigned optimize; - unsigned thinLTOJobs; + llvm::StringRef thinLTOJobs; llvm::StringRef entry; llvm::StringRef outputFile; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -362,7 +362,7 @@ config->thinLTOCachePolicy = CHECK( parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)), "--thinlto-cache-policy: invalid cache policy"); - config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u); + config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs); errorHandler().verbose = args.hasArg(OPT_verbose);
LLVM_DEBUG(errorHandler().verbose = true); threadsEnabled = args.hasFlag(OPT_threads, OPT_no_threads, true); @@ -415,8 +415,8 @@ error("invalid optimization level for LTO: " + Twine(config->ltoo)); if (config->ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (config->thinLTOJobs == 0) - error("--thinlto-jobs: number of threads must be > 0"); + if (!get_threadpool_strategy(config->thinLTOJobs)) + error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs); if (config->pie && config->shared) error("-shared and -pie may not be used together"); diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -63,10 +63,8 @@ if (config->saveTemps) checkError(c.addSaveTemps(config->outputFile.str() + ".", /*UseInputModulePath*/ true)); - - lto::ThinBackend backend; - if (config->thinLTOJobs != -1U) - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + lto::ThinBackend backend = lto::createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(config->thinLTOJobs)); return std::make_unique(std::move(c), backend, config->ltoPartitions); } diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h --- a/llvm/include/llvm/LTO/LTO.h +++ b/llvm/include/llvm/LTO/LTO.h @@ -228,7 +228,7 @@ /// This ThinBackend runs the individual backend jobs in-process. /// The default value means to use one job per hardware core (not hyper-thread). -ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0); +ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism); /// This ThinBackend writes individual module indexes to files, instead of /// running the individual backend jobs. This backend is for distributed builds diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h --- a/llvm/include/llvm/Support/Threading.h +++ b/llvm/include/llvm/Support/Threading.h @@ -166,8 +166,20 @@ /// sockets. 
\p ThreadPoolNum represents a number bounded by [0, /// compute_thread_count()). void apply_thread_strategy(unsigned ThreadPoolNum) const; + + /// Finds the CPU socket where a thread should go. Returns 'None' if the + /// thread shall remain on the actual CPU socket. + Optional compute_cpu_socket(unsigned ThreadPoolNum) const; }; + /// Build a strategy from a number of threads as a string provided in \p Num. + /// When Num is above the max number of threads specified by the \p Default + /// strategy, we attempt to equally allocate the threads on all CPU sockets. + /// "0" or an empty string will return the \p Default strategy. + /// "all" for using all hardware threads. + Optional + get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {}); + /// Returns a thread strategy for tasks requiring significant memory or other /// resources. To be used for workloads where hardware_concurrency() proves to /// be less efficient. Avoid this strategy if doing lots of I/O. Currently @@ -182,6 +194,18 @@ return S; } + /// Like heavyweight_hardware_concurrency() above, but builds a strategy + /// based on the rules described for get_threadpool_strategy(). + /// If \p Num is invalid, returns a default strategy where one thread per + /// hardware core is used. + inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) { + Optional S = + get_threadpool_strategy(Num, heavyweight_hardware_concurrency()); + if (S) + return *S; + return heavyweight_hardware_concurrency(); + } + /// Returns a default thread strategy where all available hardware ressources /// are to be used, except for those initially excluded by an affinity mask. /// This function takes affinity into consideration. 
Returns 1 when LLVM is diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -477,7 +477,8 @@ LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { if (!Backend) - this->Backend = createInProcessThinBackend(); + this->Backend = + createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); } LTO::LTO(Config Conf, ThinBackend Backend, @@ -1090,13 +1091,12 @@ public: InProcessThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, - unsigned ThinLTOParallelismLevel, + ThreadPoolStrategy ThinLTOParallelism, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), - BackendThreadPool( - heavyweight_hardware_concurrency(ThinLTOParallelismLevel)), - AddStream(std::move(AddStream)), Cache(std::move(Cache)) { + BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)), + Cache(std::move(Cache)) { for (auto &Name : CombinedIndex.cfiFunctionDefs()) CfiFunctionDefs.insert( GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); @@ -1192,13 +1192,13 @@ }; } // end anonymous namespace -ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) { +ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) { return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) { return std::make_unique( - Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries, - AddStream, Cache); + Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream, + Cache); }; } diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp --- a/llvm/lib/Support/Threading.cpp +++ b/llvm/lib/Support/Threading.cpp @@ -84,16 +84,34 @@ int computeHostNumHardwareThreads(); unsigned 
llvm::ThreadPoolStrategy::compute_thread_count() const { + if (ThreadsRequested > 0) + return ThreadsRequested; + int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads() : sys::getHostNumPhysicalCores(); if (MaxThreadCount <= 0) MaxThreadCount = 1; + return MaxThreadCount; +} - // No need to create more threads than there are hardware threads, it would - // uselessly induce more context-switching and cache eviction. - if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount) - return MaxThreadCount; - return ThreadsRequested; +Optional +llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) { + if (Num == "all") + return llvm::hardware_concurrency(); + if (Num.empty()) + return Default; + unsigned V; + if (Num.getAsInteger(10, V)) + return None; // malformed 'Num' value + if (V == 0) + return Default; + + // Do not take the Default into account. This effectively disables + // heavyweight_hardware_concurrency() if the user asks for any number of + // threads on the cmd-line. + ThreadPoolStrategy S = llvm::hardware_concurrency(); + S.ThreadsRequested = V; + return S; } namespace { diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc --- a/llvm/lib/Support/Unix/Threading.inc +++ b/llvm/lib/Support/Unix/Threading.inc @@ -273,7 +273,7 @@ int computeHostNumHardwareThreads() { #if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT) cpu_set_t Set; - if (sched_getaffinity(0, sizeof(Set), &Set)) + if (sched_getaffinity(0, sizeof(Set), &Set) == 0) return CPU_COUNT(&Set); #endif // Guard against std::thread::hardware_concurrency() returning 0. 
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc --- a/llvm/lib/Support/Windows/Threading.inc +++ b/llvm/lib/Support/Windows/Threading.inc @@ -131,6 +131,10 @@ unsigned UsableThreads; unsigned ThreadsPerCore; uint64_t Affinity; + + unsigned useableCores() const { + return std::max(1U, UsableThreads / ThreadsPerCore); + } }; template @@ -232,33 +236,41 @@ return Threads; } -// Assign the current thread to a more appropriate CPU socket or CPU group -void llvm::ThreadPoolStrategy::apply_thread_strategy( - unsigned ThreadPoolNum) const { +// Finds the proper CPU socket where a thread number should go. Returns 'None' +// if the thread shall remain on the actual CPU socket. +Optional +llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const { ArrayRef Groups = getProcessorGroups(); + // Only one CPU socket in the system or process affinity was set, no need to + // move the thread(s) to another CPU socket. + if (Groups.size() <= 1) + return None; + + // We ask for less threads than there are hardware threads per CPU socket, no + // need to dispatch threads to other CPU sockets. + unsigned MaxThreadsPerSocket = + UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores(); + if (compute_thread_count() <= MaxThreadsPerSocket) + return None; assert(ThreadPoolNum < compute_thread_count() && "The thread index is not within thread strategy's range!"); - // In this mode, the ThreadNumber represents the core number, not the - // hyper-thread number. Assumes all NUMA groups have the same amount of - // hyper-threads. 
- if (!UseHyperThreads) - ThreadPoolNum *= Groups[0].ThreadsPerCore; - - unsigned ThreadRangeStart = 0; - for (unsigned I = 0; I < Groups.size(); ++I) { - const ProcessorGroup &G = Groups[I]; - if (ThreadPoolNum >= ThreadRangeStart && - ThreadPoolNum < ThreadRangeStart + G.UsableThreads) { - - GROUP_AFFINITY Affinity{}; - Affinity.Group = G.ID; - Affinity.Mask = G.Affinity; - SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr); - } - ThreadRangeStart += G.UsableThreads; - } + // Assumes the same number of hardware threads per CPU socket. + return (ThreadPoolNum * Groups.size()) / compute_thread_count(); +} + +// Assign the current thread to a more appropriate CPU socket or CPU group +void llvm::ThreadPoolStrategy::apply_thread_strategy( + unsigned ThreadPoolNum) const { + Optional Socket = compute_cpu_socket(ThreadPoolNum); + if (!Socket) + return; + ArrayRef Groups = getProcessorGroups(); + GROUP_AFFINITY Affinity{}; + Affinity.Group = Groups[*Socket].ID; + Affinity.Mask = Groups[*Socket].Affinity; + SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr); } llvm::BitVector llvm::get_thread_affinity_mask() { diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp --- a/llvm/tools/gold/gold-plugin.cpp +++ b/llvm/tools/gold/gold-plugin.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -134,11 +135,9 @@ }; static OutputType TheOutputType = OT_NORMAL; static unsigned OptLevel = 2; - // Default parallelism of 0 used to indicate that user did not specify. - // Actual parallelism default value depends on implementation. // Currently only affects ThinLTO, where the default is the max cores in the - // system. - static unsigned Parallelism = 0; + // system. See llvm::get_threadpool_strategy() for acceptable values. 
+ static std::string Parallelism; // Default regular LTO codegen parallelism (number of partitions). static unsigned ParallelCodeGenParallelismLevel = 1; #ifdef NDEBUG @@ -272,8 +271,10 @@ message(LDPL_FATAL, "Optimization level must be between 0 and 3"); OptLevel = opt[1] - '0'; } else if (opt.startswith("jobs=")) { - if (StringRef(opt_ + 5).getAsInteger(10, Parallelism)) - message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5); + StringRef Num(opt_ + 5); + if (!get_threadpool_strategy(Num)) + message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data()); + Parallelism = Num; } else if (opt.startswith("lto-partitions=")) { if (opt.substr(strlen("lto-partitions=")) .getAsInteger(10, ParallelCodeGenParallelismLevel)) @@ -877,14 +878,15 @@ Conf.PTO.LoopVectorization = options::OptLevel > 1; Conf.PTO.SLPVectorization = options::OptLevel > 1; - if (options::Parallelism) - Backend = createInProcessThinBackend(options::Parallelism); if (options::thinlto_index_only) { std::string OldPrefix, NewPrefix; getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix); Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix, options::thinlto_emit_imports_files, LinkedObjectsFile, OnIndexWrite); + } else { + Backend = createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(options::Parallelism)); } Conf.OverrideTriple = options::triple; diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp --- a/llvm/tools/llvm-lto2/llvm-lto2.cpp +++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -68,9 +68,10 @@ "distributed backend case")); // Default to using all available threads in the system, but using only one -// thread per core, as indicated by the usage of -// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor. -static cl::opt Threads("thinlto-threads", cl::init(0)); +// thread per core (no SMT). 
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means +// to use all hardware threads or cores in the system. +static cl::opt Threads("thinlto-threads"); static cl::list SymbolResolutions( "r", @@ -286,7 +287,8 @@ /* LinkedObjectsFile */ nullptr, /* OnWrite */ {}); else - Backend = createInProcessThinBackend(Threads); + Backend = createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(Threads)); LTO Lto(std::move(Conf), std::move(Backend)); bool HasErrors = false;