Index: clang/lib/Driver/ToolChains/CommonArgs.h =================================================================== --- clang/lib/Driver/ToolChains/CommonArgs.h +++ clang/lib/Driver/ToolChains/CommonArgs.h @@ -88,7 +88,8 @@ bool isObjCAutoRefCount(const llvm::opt::ArgList &Args); -unsigned getLTOParallelism(const llvm::opt::ArgList &Args, const Driver &D); +llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args, + const Driver &D); bool areOptimizationsEnabled(const llvm::opt::ArgList &Args); Index: clang/lib/Driver/ToolChains/CommonArgs.cpp =================================================================== --- clang/lib/Driver/ToolChains/CommonArgs.cpp +++ clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -50,6 +50,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include "llvm/Support/YAMLParser.h" @@ -338,14 +339,15 @@ } } -unsigned tools::getLTOParallelism(const ArgList &Args, const Driver &D) { +llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) { unsigned Parallelism = 0; Arg *LtoJobsArg = Args.getLastArg(options::OPT_flto_jobs_EQ); - if (LtoJobsArg && - StringRef(LtoJobsArg->getValue()).getAsInteger(10, Parallelism)) - D.Diag(diag::err_drv_invalid_int_value) << LtoJobsArg->getAsString(Args) - << LtoJobsArg->getValue(); - return Parallelism; + if (!LtoJobsArg) + return {}; + if (!llvm::get_threadpool_strategy(LtoJobsArg->getValue())) + D.Diag(diag::err_drv_invalid_int_value) + << LtoJobsArg->getAsString(Args) << LtoJobsArg->getValue(); + return LtoJobsArg->getValue(); } // CloudABI uses -ffunction-sections and -fdata-sections by default. 
@@ -410,7 +412,8 @@ if (IsThinLTO) CmdArgs.push_back("-plugin-opt=thinlto"); - if (unsigned Parallelism = getLTOParallelism(Args, ToolChain.getDriver())) + StringRef Parallelism = getLTOParallelism(Args, ToolChain.getDriver()); + if (!Parallelism.empty()) CmdArgs.push_back( Args.MakeArgString("-plugin-opt=jobs=" + Twine(Parallelism))); Index: clang/lib/Driver/ToolChains/Darwin.cpp =================================================================== --- clang/lib/Driver/ToolChains/Darwin.cpp +++ clang/lib/Driver/ToolChains/Darwin.cpp @@ -23,6 +23,7 @@ #include "llvm/Support/Path.h" #include "llvm/Support/ScopedPrinter.h" #include "llvm/Support/TargetParser.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" #include // ::getenv @@ -605,10 +606,12 @@ getMachOToolChain().addProfileRTLibs(Args, CmdArgs); - if (unsigned Parallelism = - getLTOParallelism(Args, getToolChain().getDriver())) { + StringRef Parallelism = getLTOParallelism(Args, getToolChain().getDriver()); + if (!Parallelism.empty()) { CmdArgs.push_back("-mllvm"); - CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(Parallelism))); + unsigned NumThreads = + llvm::get_threadpool_strategy(Parallelism)->compute_thread_count(); + CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(NumThreads))); } if (getToolChain().ShouldLinkCXXStdlib(Args)) Index: lld/COFF/Config.h =================================================================== --- lld/COFF/Config.h +++ lld/COFF/Config.h @@ -144,7 +144,7 @@ unsigned ltoo = 2; // Used for /opt:lldltojobs=N - unsigned thinLTOJobs = 0; + std::string thinLTOJobs; // Used for /opt:lldltopartitions=N unsigned ltoPartitions = 1; Index: lld/COFF/Driver.cpp =================================================================== --- lld/COFF/Driver.cpp +++ lld/COFF/Driver.cpp @@ -1416,9 +1416,9 @@ error("/opt:lldlto: invalid optimization level: " + optLevel); } else if (s.startswith("lldltojobs=")) { StringRef jobs = s.substr(11); - if 
(jobs.getAsInteger(10, config->thinLTOJobs) || - config->thinLTOJobs == 0) + if (!get_threadpool_strategy(jobs)) error("/opt:lldltojobs: invalid job count: " + jobs); + config->thinLTOJobs = jobs.str(); } else if (s.startswith("lldltopartitions=")) { StringRef n = s.substr(17); if (n.getAsInteger(10, config->ltoPartitions) || Index: lld/COFF/LTO.cpp =================================================================== --- lld/COFF/LTO.cpp +++ lld/COFF/LTO.cpp @@ -101,8 +101,9 @@ std::string(config->thinLTOPrefixReplace.first), std::string(config->thinLTOPrefixReplace.second), config->thinLTOEmitImportsFiles, indexFile.get(), OnIndexWrite); - } else if (config->thinLTOJobs != 0) { - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + } else { + backend = lto::createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(config->thinLTOJobs)); } ltoObj = std::make_unique(createConfig(), backend, Index: lld/ELF/Config.h =================================================================== --- lld/ELF/Config.h +++ lld/ELF/Config.h @@ -244,7 +244,7 @@ unsigned ltoPartitions; unsigned ltoo; unsigned optimize; - unsigned thinLTOJobs; + StringRef thinLTOJobs; unsigned timeTraceGranularity; int32_t splitStackAdjustSize; Index: lld/ELF/Driver.cpp =================================================================== --- lld/ELF/Driver.cpp +++ lld/ELF/Driver.cpp @@ -977,7 +977,7 @@ config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); - config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u); + config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs); config->thinLTOObjectSuffixReplace = getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq); config->thinLTOPrefixReplace = @@ -1042,8 +1042,8 @@ error("invalid optimization level for LTO: " + Twine(config->ltoo)); if (config->ltoPartitions == 0) 
error("--lto-partitions: number of threads must be > 0"); - if (config->thinLTOJobs == 0) - error("--thinlto-jobs: number of threads must be > 0"); + if (!get_threadpool_strategy(config->thinLTOJobs)) + error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs); if (config->splitStackAdjustSize < 0) error("--split-stack-adjust-size: size must be >= 0"); Index: lld/ELF/LTO.cpp =================================================================== --- lld/ELF/LTO.cpp +++ lld/ELF/LTO.cpp @@ -146,8 +146,9 @@ std::string(config->thinLTOPrefixReplace.first), std::string(config->thinLTOPrefixReplace.second), config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); - } else if (config->thinLTOJobs != -1U) { - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + } else { + backend = lto::createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(config->thinLTOJobs)); } ltoObj = std::make_unique(createConfig(), backend, Index: lld/test/COFF/thinlto.ll =================================================================== --- lld/test/COFF/thinlto.ll +++ lld/test/COFF/thinlto.ll @@ -6,6 +6,15 @@ ; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj ; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1 +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=all +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1000 +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: not lld-link /lldsavetemps /out:%T/thinlto/main.exe 
/entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=foo 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; BAD-JOBS: error: /opt:lldltojobs: invalid job count: foo + ; CHECK-NOT: U foo target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" Index: lld/test/ELF/basic.s =================================================================== --- lld/test/ELF/basic.s +++ lld/test/ELF/basic.s @@ -249,9 +249,21 @@ # RUN: not ld.lld %t --plugin-opt=lto-partitions=0 2>&1 | FileCheck --check-prefix=NOTHREADS %s # NOTHREADS: --lto-partitions: number of threads must be > 0 -# RUN: not ld.lld %t --thinlto-jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s -# RUN: not ld.lld %t --plugin-opt=jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s -# NOTHREADSTHIN: --thinlto-jobs: number of threads must be > 0 +# RUN: ld.lld %t --thinlto-jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=1000 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# THREADSTHIN: basic.s.tmp +# RUN: not ld.lld %t --thinlto-jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s +# BADTHREADSTHIN: error: --thinlto-jobs: invalid job count: foo + +# RUN: ld.lld %t --plugin-opt=jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=1000 -verbose 2>&1 | FileCheck 
--check-prefix=THREADSTHIN %s +# RUN: not ld.lld %t --plugin-opt=jobs=foo -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s # RUN: not ld.lld %t -z ifunc-noplt -z text 2>&1 | FileCheck --check-prefix=NOIFUNCPLTNOTEXTREL %s # NOIFUNCPLTNOTEXTREL: -z text and -z ifunc-noplt may not be used together Index: lld/test/ELF/lto/thinlto.ll =================================================================== --- lld/test/ELF/lto/thinlto.ll +++ lld/test/ELF/lto/thinlto.ll @@ -16,8 +16,25 @@ ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 -; Then check without --thinlto-jobs (which currently default to hardware_concurrency) -; RUN: ld.lld -shared %t1.o %t2.o -o %t3 +; Test with all threads, on all cores, on all CPU sockets +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=all -shared %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with many more threads than the system has +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=1000 -shared %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with a bad value +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo + +; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT) +; RUN: ld.lld -shared -save-temps %t1.o %t2.o -o %t3 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 Index: lld/test/wasm/lto/thinlto.ll =================================================================== ---
 lld/test/wasm/lto/thinlto.ll +++ lld/test/wasm/lto/thinlto.ll @@ -14,8 +14,26 @@ ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 -; Check without --thinlto-jobs (which currently default to hardware_concurrency) -; RUN: wasm-ld -r %t1.o %t2.o -o %t3 +; Test with all threads, on all cores, on all CPU sockets +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps --thinlto-jobs=all %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with many more threads than the system has +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps --thinlto-jobs=1000 %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Test with a bad value +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: not wasm-ld -r -save-temps --thinlto-jobs=foo %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo + +; Check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT) +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps %t1.o %t2.o -o %t3 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 Index: lld/wasm/Config.h =================================================================== --- lld/wasm/Config.h +++ lld/wasm/Config.h @@ -53,7 +53,7 @@ unsigned ltoPartitions; unsigned ltoo; unsigned optimize; - unsigned thinLTOJobs; + llvm::StringRef thinLTOJobs; llvm::StringRef entry; llvm::StringRef outputFile; Index: lld/wasm/Driver.cpp =================================================================== --- lld/wasm/Driver.cpp +++ lld/wasm/Driver.cpp @@ -342,7 +342,7 @@ config->thinLTOCachePolicy = CHECK(
parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)), "--thinlto-cache-policy: invalid cache policy"); - config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u); + config->thinLTOJobs = args.getLastArgValue(OPT_thinlto_jobs); errorHandler().verbose = args.hasArg(OPT_verbose); LLVM_DEBUG(errorHandler().verbose = true); threadsEnabled = args.hasFlag(OPT_threads, OPT_no_threads, true); @@ -395,8 +395,8 @@ error("invalid optimization level for LTO: " + Twine(config->ltoo)); if (config->ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (config->thinLTOJobs == 0) - error("--thinlto-jobs: number of threads must be > 0"); + if (!get_threadpool_strategy(config->thinLTOJobs)) + error("--thinlto-jobs: invalid job count: " + config->thinLTOJobs); if (config->pie && config->shared) error("-shared and -pie may not be used together"); Index: lld/wasm/LTO.cpp =================================================================== --- lld/wasm/LTO.cpp +++ lld/wasm/LTO.cpp @@ -63,10 +63,8 @@ if (config->saveTemps) checkError(c.addSaveTemps(config->outputFile.str() + ".", /*UseInputModulePath*/ true)); - - lto::ThinBackend backend; - if (config->thinLTOJobs != -1U) - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + lto::ThinBackend backend = lto::createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(config->thinLTOJobs)); return std::make_unique(std::move(c), backend, config->ltoPartitions); } Index: llvm/include/llvm/LTO/LTO.h =================================================================== --- llvm/include/llvm/LTO/LTO.h +++ llvm/include/llvm/LTO/LTO.h @@ -228,7 +228,7 @@ /// This ThinBackend runs the individual backend jobs in-process. /// The default value means to use one job per hardware core (not hyper-thread). 
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0); +ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism); /// This ThinBackend writes individual module indexes to files, instead of /// running the individual backend jobs. This backend is for distributed builds Index: llvm/include/llvm/Support/Threading.h =================================================================== --- llvm/include/llvm/Support/Threading.h +++ llvm/include/llvm/Support/Threading.h @@ -166,8 +166,20 @@ /// sockets. \p ThreadPoolNum represents a number bounded by [0, /// compute_thread_count()). void apply_thread_strategy(unsigned ThreadPoolNum) const; + + /// Finds the CPU socket where a thread should go. Returns 'None' if the + /// thread shall remain on the current CPU socket. + Optional compute_cpu_socket(unsigned ThreadPoolNum) const; }; + /// Build a strategy from a number of threads as a string provided in \p Num. + /// When Num is above the max number of threads specified by the \p Default + /// strategy, we attempt to equally allocate the threads on all CPU sockets. + /// "0" or an empty string will return the \p Default strategy. + /// "all" for using all hardware threads. + Optional + get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default = {}); + /// Returns a thread strategy for tasks requiring significant memory or other /// resources. To be used for workloads where hardware_concurrency() proves to /// be less efficient. Avoid this strategy if doing lots of I/O. Currently @@ -182,6 +194,18 @@ return S; } + /// Like heavyweight_hardware_concurrency() above, but builds a strategy + /// based on the rules described for get_threadpool_strategy(). + /// If \p Num is invalid, returns a default strategy where one thread per + /// hardware core is used.
+ inline ThreadPoolStrategy heavyweight_hardware_concurrency(StringRef Num) { + Optional S = + get_threadpool_strategy(Num, heavyweight_hardware_concurrency()); + if (S) + return *S; + return heavyweight_hardware_concurrency(); + } + /// Returns a default thread strategy where all available hardware ressources /// are to be used, except for those initially excluded by an affinity mask. /// This function takes affinity into consideration. Returns 1 when LLVM is Index: llvm/lib/LTO/LTO.cpp =================================================================== --- llvm/lib/LTO/LTO.cpp +++ llvm/lib/LTO/LTO.cpp @@ -477,7 +477,8 @@ LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { if (!Backend) - this->Backend = createInProcessThinBackend(); + this->Backend = + createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); } LTO::LTO(Config Conf, ThinBackend Backend, @@ -1090,13 +1091,12 @@ public: InProcessThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, - unsigned ThinLTOParallelismLevel, + ThreadPoolStrategy ThinLTOParallelism, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), - BackendThreadPool( - heavyweight_hardware_concurrency(ThinLTOParallelismLevel)), - AddStream(std::move(AddStream)), Cache(std::move(Cache)) { + BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)), + Cache(std::move(Cache)) { for (auto &Name : CombinedIndex.cfiFunctionDefs()) CfiFunctionDefs.insert( GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); @@ -1192,13 +1192,13 @@ }; } // end anonymous namespace -ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) { +ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) { return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap &ModuleToDefinedGVSummaries, 
AddStreamFn AddStream, NativeObjectCache Cache) { return std::make_unique( - Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries, - AddStream, Cache); + Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream, + Cache); }; } Index: llvm/lib/Support/Threading.cpp =================================================================== --- llvm/lib/Support/Threading.cpp +++ llvm/lib/Support/Threading.cpp @@ -84,16 +84,34 @@ int computeHostNumHardwareThreads(); unsigned llvm::ThreadPoolStrategy::compute_thread_count() const { + if (ThreadsRequested > 0) + return ThreadsRequested; + int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads() : sys::getHostNumPhysicalCores(); if (MaxThreadCount <= 0) MaxThreadCount = 1; + return MaxThreadCount; +} - // No need to create more threads than there are hardware threads, it would - // uselessly induce more context-switching and cache eviction. - if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount) - return MaxThreadCount; - return ThreadsRequested; +Optional +llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) { + if (Num == "all") + return llvm::hardware_concurrency(); + if (Num.empty()) + return Default; + unsigned V; + if (Num.getAsInteger(10, V)) + return None; // malformed 'Num' value + if (V == 0) + return Default; + + // Do not take the Default into account. This effectively disables + // heavyweight_hardware_concurrency() if the user asks for any number of + // threads on the cmd-line. 
+ ThreadPoolStrategy S = llvm::hardware_concurrency(); + S.ThreadsRequested = V; + return S; } namespace { Index: llvm/lib/Support/Windows/Threading.inc =================================================================== --- llvm/lib/Support/Windows/Threading.inc +++ llvm/lib/Support/Windows/Threading.inc @@ -131,6 +131,10 @@ unsigned UsableThreads; unsigned ThreadsPerCore; uint64_t Affinity; + + unsigned useableCores() const { + return std::max(1U, UsableThreads / ThreadsPerCore); + } }; template @@ -232,33 +236,41 @@ return Threads; } -// Assign the current thread to a more appropriate CPU socket or CPU group -void llvm::ThreadPoolStrategy::apply_thread_strategy( - unsigned ThreadPoolNum) const { +// Finds the proper CPU socket where a thread number should go. Returns 'None' +// if the thread shall remain on the current CPU socket. +Optional +llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const { ArrayRef Groups = getProcessorGroups(); + // Only one CPU socket in the system or process affinity was set, no need to + // move the thread(s) to another CPU socket. + if (Groups.size() <= 1) + return None; + + // We ask for fewer threads than there are hardware threads per CPU socket, no + // need to dispatch threads to other CPU sockets. + unsigned MaxThreadsPerSocket = + UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores(); + if (compute_thread_count() <= MaxThreadsPerSocket) + return None; assert(ThreadPoolNum < compute_thread_count() && "The thread index is not within thread strategy's range!"); - // In this mode, the ThreadNumber represents the core number, not the - // hyper-thread number. Assumes all NUMA groups have the same amount of - // hyper-threads.
- if (!UseHyperThreads) - ThreadPoolNum *= Groups[0].ThreadsPerCore; - - unsigned ThreadRangeStart = 0; - for (unsigned I = 0; I < Groups.size(); ++I) { - const ProcessorGroup &G = Groups[I]; - if (ThreadPoolNum >= ThreadRangeStart && - ThreadPoolNum < ThreadRangeStart + G.UsableThreads) { - - GROUP_AFFINITY Affinity{}; - Affinity.Group = G.ID; - Affinity.Mask = G.Affinity; - SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr); - } - ThreadRangeStart += G.UsableThreads; - } + // Assumes the same number of hardware threads per CPU socket. + return (ThreadPoolNum * Groups.size()) / compute_thread_count(); +} + +// Assign the current thread to a more appropriate CPU socket or CPU group +void llvm::ThreadPoolStrategy::apply_thread_strategy( + unsigned ThreadPoolNum) const { + Optional Socket = compute_cpu_socket(ThreadPoolNum); + if (!Socket) + return; + ArrayRef Groups = getProcessorGroups(); + GROUP_AFFINITY Affinity{}; + Affinity.Group = Groups[*Socket].ID; + Affinity.Mask = Groups[*Socket].Affinity; + SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr); } llvm::BitVector llvm::get_thread_affinity_mask() { Index: llvm/test/Transforms/PGOProfile/thinlto_samplepgo_icp3.ll =================================================================== --- llvm/test/Transforms/PGOProfile/thinlto_samplepgo_icp3.ll +++ llvm/test/Transforms/PGOProfile/thinlto_samplepgo_icp3.ll @@ -7,6 +7,16 @@ ; Test to make sure importing and dead stripping works in the ; case where the target is a local function that also indirectly calls itself. 
; RUN: llvm-lto2 run -thinlto-threads=1 -save-temps -o %t3 %t.bc %t2.bc -r %t.bc,fptr,plx -r %t.bc,main,plx -r %t2.bc,_Z6updatei,pl -r %t2.bc,fptr,l -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS + +; Also test with all threads on +; RUN: llvm-lto2 run -thinlto-threads=all -save-temps -o %t3 %t.bc %t2.bc -r %t.bc,fptr,plx -r %t.bc,main,plx -r %t2.bc,_Z6updatei,pl -r %t2.bc,fptr,l -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS + +; Run with more threads than there are in the system +; RUN: llvm-lto2 run -thinlto-threads=1000 -save-temps -o %t3 %t.bc %t2.bc -r %t.bc,fptr,plx -r %t.bc,main,plx -r %t2.bc,_Z6updatei,pl -r %t2.bc,fptr,l -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS + +; Provide a wrong thread count argument +; RUN: llvm-lto2 run -thinlto-threads=foo -save-temps -o %t3 %t.bc %t2.bc -r %t.bc,fptr,plx -r %t.bc,main,plx -r %t2.bc,_Z6updatei,pl -r %t2.bc,fptr,l -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS + ; Make sure we import the promted indirectly called target ; IMPORTS: Import _ZL3foov.llvm.0 Index: llvm/tools/gold/gold-plugin.cpp =================================================================== --- llvm/tools/gold/gold-plugin.cpp +++ llvm/tools/gold/gold-plugin.cpp @@ -28,6 +28,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/Threading.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -132,11 +133,9 @@ }; static OutputType TheOutputType = OT_NORMAL; static unsigned OptLevel = 2; - // Default parallelism of 0 used to indicate that user did not specify. - // Actual parallelism default value depends on implementation. // Currently only affects ThinLTO, where the default is the max cores in the - // system. - static unsigned Parallelism = 0; + // system. See llvm::get_threadpool_strategy() for acceptable values. 
+ static std::string Parallelism; // Default regular LTO codegen parallelism (number of partitions). static unsigned ParallelCodeGenParallelismLevel = 1; #ifdef NDEBUG @@ -270,8 +269,10 @@ message(LDPL_FATAL, "Optimization level must be between 0 and 3"); OptLevel = opt[1] - '0'; } else if (opt.startswith("jobs=")) { - if (StringRef(opt_ + 5).getAsInteger(10, Parallelism)) - message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5); + StringRef Num(opt_ + 5); + if (!get_threadpool_strategy(Num)) + message(LDPL_FATAL, "Invalid parallelism level: %s", Num.data()); + Parallelism = Num; } else if (opt.startswith("lto-partitions=")) { if (opt.substr(strlen("lto-partitions=")) .getAsInteger(10, ParallelCodeGenParallelismLevel)) @@ -875,14 +876,15 @@ Conf.PTO.LoopVectorization = options::OptLevel > 1; Conf.PTO.SLPVectorization = options::OptLevel > 1; - if (options::Parallelism) - Backend = createInProcessThinBackend(options::Parallelism); if (options::thinlto_index_only) { std::string OldPrefix, NewPrefix; getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix); Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix, options::thinlto_emit_imports_files, LinkedObjectsFile, OnIndexWrite); + } else { + Backend = createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(options::Parallelism)); } Conf.OverrideTriple = options::triple; Index: llvm/tools/llvm-lto2/llvm-lto2.cpp =================================================================== --- llvm/tools/llvm-lto2/llvm-lto2.cpp +++ llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -66,9 +66,10 @@ "distributed backend case")); // Default to using all available threads in the system, but using only one -// thread per core, as indicated by the usage of -// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor. -static cl::opt Threads("thinlto-threads", cl::init(0)); +// thread per core (no SMT). 
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means +// to use all hardware threads or cores in the system. +static cl::opt Threads("thinlto-threads"); static cl::list SymbolResolutions( "r", @@ -284,7 +285,8 @@ /* LinkedObjectsFile */ nullptr, /* OnWrite */ {}); else - Backend = createInProcessThinBackend(Threads); + Backend = createInProcessThinBackend( + llvm::heavyweight_hardware_concurrency(Threads)); LTO Lto(std::move(Conf), std::move(Backend)); bool HasErrors = false;