Index: clang/lib/Driver/ToolChains/CommonArgs.h =================================================================== --- clang/lib/Driver/ToolChains/CommonArgs.h +++ clang/lib/Driver/ToolChains/CommonArgs.h @@ -88,7 +88,8 @@ bool isObjCAutoRefCount(const llvm::opt::ArgList &Args); -unsigned getLTOParallelism(const llvm::opt::ArgList &Args, const Driver &D); +llvm::StringRef getLTOParallelism(const llvm::opt::ArgList &Args, + const Driver &D); bool areOptimizationsEnabled(const llvm::opt::ArgList &Args); Index: clang/lib/Driver/ToolChains/CommonArgs.cpp =================================================================== --- clang/lib/Driver/ToolChains/CommonArgs.cpp +++ clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -338,14 +338,17 @@ } } -unsigned tools::getLTOParallelism(const ArgList &Args, const Driver &D) { +llvm::StringRef tools::getLTOParallelism(const ArgList &Args, const Driver &D) { unsigned Parallelism = 0; Arg *LtoJobsArg = Args.getLastArg(options::OPT_flto_jobs_EQ); - if (LtoJobsArg && - StringRef(LtoJobsArg->getValue()).getAsInteger(10, Parallelism)) - D.Diag(diag::err_drv_invalid_int_value) << LtoJobsArg->getAsString(Args) - << LtoJobsArg->getValue(); - return Parallelism; + if (!LtoJobsArg) + return {}; + StringRef Value = LtoJobsArg->getValue(); + // "all" is forwarded verbatim; anything else must parse as an integer. + if (Value != "all" && Value.getAsInteger(10, Parallelism)) + D.Diag(diag::err_drv_invalid_int_value) + << LtoJobsArg->getAsString(Args) << Value; + return Value; } // CloudABI uses -ffunction-sections and -fdata-sections by default. 
@@ -410,7 +413,8 @@ if (IsThinLTO) CmdArgs.push_back("-plugin-opt=thinlto"); - if (unsigned Parallelism = getLTOParallelism(Args, ToolChain.getDriver())) + StringRef Parallelism = getLTOParallelism(Args, ToolChain.getDriver()); + if (!Parallelism.empty()) CmdArgs.push_back( Args.MakeArgString("-plugin-opt=jobs=" + Twine(Parallelism))); Index: clang/lib/Driver/ToolChains/Darwin.cpp =================================================================== --- clang/lib/Driver/ToolChains/Darwin.cpp +++ clang/lib/Driver/ToolChains/Darwin.cpp @@ -605,8 +605,8 @@ getMachOToolChain().addProfileRTLibs(Args, CmdArgs); - if (unsigned Parallelism = - getLTOParallelism(Args, getToolChain().getDriver())) { + StringRef Parallelism = getLTOParallelism(Args, getToolChain().getDriver()); + if (!Parallelism.empty()) { CmdArgs.push_back("-mllvm"); CmdArgs.push_back(Args.MakeArgString("-threads=" + Twine(Parallelism))); } Index: lld/COFF/Config.h =================================================================== --- lld/COFF/Config.h +++ lld/COFF/Config.h @@ -230,6 +230,8 @@ bool swaprunNet = false; bool thinLTOEmitImportsFiles; bool thinLTOIndexOnly; + // Used for /opt:lldltojobs=all + bool thinLTOJobsHeavyWeightThreads = true; }; extern Configuration *config; Index: lld/COFF/Driver.cpp =================================================================== --- lld/COFF/Driver.cpp +++ lld/COFF/Driver.cpp @@ -1416,8 +1416,13 @@ error("/opt:lldlto: invalid optimization level: " + optLevel); } else if (s.startswith("lldltojobs=")) { StringRef jobs = s.substr(11); - if (jobs.getAsInteger(10, config->thinLTOJobs) || - config->thinLTOJobs == 0) + // Last option wins: a numeric count undoes an earlier "all". + if (jobs == "all") + config->thinLTOJobsHeavyWeightThreads = false; + else if (jobs.getAsInteger(10, config->thinLTOJobs) || + config->thinLTOJobs == 0) error("/opt:lldltojobs: invalid job count: " + jobs); + else + config->thinLTOJobsHeavyWeightThreads = true; } else if (s.startswith("lldltopartitions=")) { StringRef n = s.substr(17); Index: lld/COFF/LTO.cpp 
=================================================================== --- lld/COFF/LTO.cpp +++ lld/COFF/LTO.cpp @@ -102,8 +102,12 @@ std::string(config->thinLTOPrefixReplace.first), std::string(config->thinLTOPrefixReplace.second), config->thinLTOEmitImportsFiles, indexFile.get(), OnIndexWrite); - } else if (config->thinLTOJobs != 0) { - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + } else { + ThreadPoolStrategy S = + config->thinLTOJobsHeavyWeightThreads + ? llvm::heavyweight_hardware_concurrency(config->thinLTOJobs) + : llvm::hardware_concurrency(); + backend = lto::createInProcessThinBackend(S); } ltoObj = std::make_unique(createConfig(), backend, Index: lld/ELF/Config.h =================================================================== --- lld/ELF/Config.h +++ lld/ELF/Config.h @@ -191,6 +191,7 @@ bool trace; bool thinLTOEmitImportsFiles; bool thinLTOIndexOnly; + bool thinLTOJobsHeavyWeightThreads = true; bool timeTraceEnabled; bool tocOptimize; bool undefinedVersion; @@ -244,7 +245,7 @@ unsigned ltoPartitions; unsigned ltoo; unsigned optimize; - unsigned thinLTOJobs; + unsigned thinLTOJobs = 0; unsigned timeTraceGranularity; int32_t splitStackAdjustSize; Index: lld/ELF/Driver.cpp =================================================================== --- lld/ELF/Driver.cpp +++ lld/ELF/Driver.cpp @@ -977,7 +977,6 @@ config->thinLTOIndexOnly = args.hasArg(OPT_thinlto_index_only) || args.hasArg(OPT_thinlto_index_only_eq); config->thinLTOIndexOnlyArg = args.getLastArgValue(OPT_thinlto_index_only_eq); - config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u); config->thinLTOObjectSuffixReplace = getOldNewOptions(args, OPT_thinlto_object_suffix_replace_eq); config->thinLTOPrefixReplace = @@ -1034,6 +1033,15 @@ for (auto *arg : args.filtered(OPT_plugin_opt)) parseClangOption(arg->getValue(), arg->getSpelling()); + if (auto *arg = args.getLastArgNoClaim(OPT_thinlto_jobs)) { + StringRef s = arg->getValue(); + if (s == "all") { + 
config->thinLTOJobsHeavyWeightThreads = false; + arg->claim(); + } else + config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, 0); + } + // Parse -mllvm options. for (auto *arg : args.filtered(OPT_mllvm)) parseClangOption(arg->getValue(), arg->getSpelling()); @@ -1042,8 +1050,6 @@ error("invalid optimization level for LTO: " + Twine(config->ltoo)); if (config->ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (config->thinLTOJobs == 0) - error("--thinlto-jobs: number of threads must be > 0"); if (config->splitStackAdjustSize < 0) error("--split-stack-adjust-size: size must be >= 0"); Index: lld/ELF/LTO.cpp =================================================================== --- lld/ELF/LTO.cpp +++ lld/ELF/LTO.cpp @@ -146,8 +146,12 @@ std::string(config->thinLTOPrefixReplace.first), std::string(config->thinLTOPrefixReplace.second), config->thinLTOEmitImportsFiles, indexFile.get(), onIndexWrite); - } else if (config->thinLTOJobs != -1U) { - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + } else { + ThreadPoolStrategy S = + config->thinLTOJobsHeavyWeightThreads + ? 
llvm::heavyweight_hardware_concurrency(config->thinLTOJobs) + : llvm::hardware_concurrency(); + backend = lto::createInProcessThinBackend(S); } ltoObj = std::make_unique(createConfig(), backend, Index: lld/test/COFF/thinlto.ll =================================================================== --- lld/test/COFF/thinlto.ll +++ lld/test/COFF/thinlto.ll @@ -6,6 +6,11 @@ ; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj ; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=1 +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s +; RUN: lld-link /lldsavetemps /out:%T/thinlto/main.exe /entry:main /subsystem:console %T/thinlto/main.obj %T/thinlto/foo.obj /opt:lldltojobs=all +; RUN: llvm-nm %T/thinlto/main.exe1.lto.obj | FileCheck %s + ; CHECK-NOT: U foo target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" Index: lld/test/ELF/basic.s =================================================================== --- lld/test/ELF/basic.s +++ lld/test/ELF/basic.s @@ -249,9 +249,19 @@ # RUN: not ld.lld %t --plugin-opt=lto-partitions=0 2>&1 | FileCheck --check-prefix=NOTHREADS %s # NOTHREADS: --lto-partitions: number of threads must be > 0 -# RUN: not ld.lld %t --thinlto-jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s -# RUN: not ld.lld %t --plugin-opt=jobs=0 2>&1 | FileCheck --check-prefix=NOTHREADSTHIN %s -# NOTHREADSTHIN: --thinlto-jobs: number of threads must be > 0 +# RUN: ld.lld %t --thinlto-jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --thinlto-jobs=all -verbose 2>&1 | FileCheck 
--check-prefix=THREADSTHIN %s +# THREADSTHIN: basic.s.tmp +# RUN: not ld.lld %t --thinlto-jobs=1- -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s +# BADTHREADSTHIN: error: --{{.*}}jobs=1-: number expected, but got '1-' + +# RUN: ld.lld %t --plugin-opt=jobs=0 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=1 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=2 -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: ld.lld %t --plugin-opt=jobs=all -verbose 2>&1 | FileCheck --check-prefix=THREADSTHIN %s +# RUN: not ld.lld %t --plugin-opt=jobs=1- -verbose 2>&1 | FileCheck --check-prefix=BADTHREADSTHIN %s # RUN: not ld.lld %t -z ifunc-noplt -z text 2>&1 | FileCheck --check-prefix=NOIFUNCPLTNOTEXTREL %s # NOIFUNCPLTNOTEXTREL: -z text and -z ifunc-noplt may not be used together Index: lld/test/ELF/lto/thinlto.ll =================================================================== --- lld/test/ELF/lto/thinlto.ll +++ lld/test/ELF/lto/thinlto.ll @@ -16,7 +16,13 @@ ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 -; Then check without --thinlto-jobs (which currently default to hardware_concurrency) +; Test with all threads, on all cores, on all CPU sockets +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=all -shared %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT) ; RUN: ld.lld -shared %t1.o %t2.o -o %t3 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 Index: lld/test/wasm/lto/thinlto.ll =================================================================== --- 
lld/test/wasm/lto/thinlto.ll +++ lld/test/wasm/lto/thinlto.ll @@ -14,7 +14,13 @@ ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 -; Check without --thinlto-jobs (which currently default to hardware_concurrency) +; Test with all threads, on all cores, on all CPU sockets +; RUN: rm -f %t31.lto.o %t32.lto.o +; RUN: wasm-ld -r -save-temps --thinlto-jobs=all %t1.o %t2.o -o %t3 +; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 + +; Check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meaning one thread per hardware core -- not SMT) ; RUN: wasm-ld -r %t1.o %t2.o -o %t3 ; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 Index: lld/wasm/Config.h =================================================================== --- lld/wasm/Config.h +++ lld/wasm/Config.h @@ -46,6 +46,7 @@ bool stripDebug; bool stackFirst; bool trace; + bool thinLTOJobsHeavyWeightThreads = true; uint32_t globalBase; uint32_t initialMemory; uint32_t maxMemory; @@ -53,7 +54,7 @@ unsigned ltoPartitions; unsigned ltoo; unsigned optimize; - unsigned thinLTOJobs; + unsigned thinLTOJobs = 0; llvm::StringRef entry; llvm::StringRef outputFile; Index: lld/wasm/Driver.cpp =================================================================== --- lld/wasm/Driver.cpp +++ lld/wasm/Driver.cpp @@ -342,7 +342,15 @@ config->thinLTOCachePolicy = CHECK( parseCachePruningPolicy(args.getLastArgValue(OPT_thinlto_cache_policy)), "--thinlto-cache-policy: invalid cache policy"); - config->thinLTOJobs = args::getInteger(args, OPT_thinlto_jobs, -1u); + + if (auto *arg = args.getLastArgNoClaim(OPT_thinlto_jobs)) { + StringRef s = arg->getValue(); + if (s == "all") { + config->thinLTOJobsHeavyWeightThreads = false; + arg->claim(); + } else + config->thinLTOJobs = args::getInteger(args, 
OPT_thinlto_jobs, 0); + } errorHandler().verbose = args.hasArg(OPT_verbose); LLVM_DEBUG(errorHandler().verbose = true); threadsEnabled = args.hasFlag(OPT_threads, OPT_no_threads, true); @@ -395,8 +403,6 @@ error("invalid optimization level for LTO: " + Twine(config->ltoo)); if (config->ltoPartitions == 0) error("--lto-partitions: number of threads must be > 0"); - if (config->thinLTOJobs == 0) - error("--thinlto-jobs: number of threads must be > 0"); if (config->pie && config->shared) error("-shared and -pie may not be used together"); Index: lld/wasm/LTO.cpp =================================================================== --- lld/wasm/LTO.cpp +++ lld/wasm/LTO.cpp @@ -63,10 +63,11 @@ if (config->saveTemps) checkError(c.addSaveTemps(config->outputFile.str() + ".", /*UseInputModulePath*/ true)); - - lto::ThinBackend backend; - if (config->thinLTOJobs != -1U) - backend = lto::createInProcessThinBackend(config->thinLTOJobs); + ThreadPoolStrategy S = + config->thinLTOJobsHeavyWeightThreads + ? llvm::heavyweight_hardware_concurrency(config->thinLTOJobs) + : llvm::hardware_concurrency(); + lto::ThinBackend backend = lto::createInProcessThinBackend(S); return std::make_unique(std::move(c), backend, config->ltoPartitions); } Index: llvm/include/llvm/LTO/LTO.h =================================================================== --- llvm/include/llvm/LTO/LTO.h +++ llvm/include/llvm/LTO/LTO.h @@ -228,7 +228,8 @@ /// This ThinBackend runs the individual backend jobs in-process. -/// The default value means to use one job per hardware core (not hyper-thread). -ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0); +/// Pass heavyweight_hardware_concurrency() to use one job per hardware core +/// (not hyper-thread), or hardware_concurrency() to use every hardware thread. +ThinBackend createInProcessThinBackend(ThreadPoolStrategy Parallelism); /// This ThinBackend writes individual module indexes to files, instead of /// running the individual backend jobs. 
This backend is for distributed builds Index: llvm/lib/LTO/LTO.cpp =================================================================== --- llvm/lib/LTO/LTO.cpp +++ llvm/lib/LTO/LTO.cpp @@ -477,7 +477,8 @@ LTO::ThinLTOState::ThinLTOState(ThinBackend Backend) : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) { if (!Backend) - this->Backend = createInProcessThinBackend(); + this->Backend = + createInProcessThinBackend(llvm::heavyweight_hardware_concurrency()); } LTO::LTO(Config Conf, ThinBackend Backend, @@ -1090,13 +1091,12 @@ public: InProcessThinBackend( const Config &Conf, ModuleSummaryIndex &CombinedIndex, - unsigned ThinLTOParallelismLevel, + ThreadPoolStrategy ThinLTOParallelism, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries), - BackendThreadPool( - heavyweight_hardware_concurrency(ThinLTOParallelismLevel)), - AddStream(std::move(AddStream)), Cache(std::move(Cache)) { + BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)), + Cache(std::move(Cache)) { for (auto &Name : CombinedIndex.cfiFunctionDefs()) CfiFunctionDefs.insert( GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name))); @@ -1192,13 +1192,13 @@ }; } // end anonymous namespace -ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) { +ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) { return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex, const StringMap &ModuleToDefinedGVSummaries, AddStreamFn AddStream, NativeObjectCache Cache) { return std::make_unique( - Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries, - AddStream, Cache); + Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream, + Cache); }; } Index: llvm/test/Transforms/PGOProfile/thinlto_samplepgo_icp3.ll =================================================================== --- 
llvm/test/Transforms/PGOProfile/thinlto_samplepgo_icp3.ll +++ llvm/test/Transforms/PGOProfile/thinlto_samplepgo_icp3.ll @@ -7,6 +7,10 @@ ; Test to make sure importing and dead stripping works in the ; case where the target is a local function that also indirectly calls itself. ; RUN: llvm-lto2 run -thinlto-threads=1 -save-temps -o %t3 %t.bc %t2.bc -r %t.bc,fptr,plx -r %t.bc,main,plx -r %t2.bc,_Z6updatei,pl -r %t2.bc,fptr,l -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS + +; Also test with all threads on +; RUN: llvm-lto2 run -thinlto-threads=all -save-temps -o %t3 %t.bc %t2.bc -r %t.bc,fptr,plx -r %t.bc,main,plx -r %t2.bc,_Z6updatei,pl -r %t2.bc,fptr,l -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS + ; Make sure we import the promted indirectly called target ; IMPORTS: Import _ZL3foov.llvm.0 Index: llvm/tools/gold/gold-plugin.cpp =================================================================== --- llvm/tools/gold/gold-plugin.cpp +++ llvm/tools/gold/gold-plugin.cpp @@ -139,6 +139,7 @@ static unsigned Parallelism = 0; // Default regular LTO codegen parallelism (number of partitions). 
static unsigned ParallelCodeGenParallelismLevel = 1; + static bool ParallelismHeavyWeight = true; #ifdef NDEBUG static bool DisableVerify = true; #else @@ -270,7 +271,13 @@ message(LDPL_FATAL, "Optimization level must be between 0 and 3"); OptLevel = opt[1] - '0'; } else if (opt.startswith("jobs=")) { - if (StringRef(opt_ + 5).getAsInteger(10, Parallelism)) + StringRef Num(opt_ + 5); + // Last option wins: a numeric count undoes an earlier "all". + if (Num == "all") + ParallelismHeavyWeight = false; + else if (Num.getAsInteger(10, Parallelism)) message(LDPL_FATAL, "Invalid parallelism level: %s", opt_ + 5); + else + ParallelismHeavyWeight = true; } else if (opt.startswith("lto-partitions=")) { if (opt.substr(strlen("lto-partitions=")) @@ -875,14 +882,18 @@ Conf.PTO.LoopVectorization = options::OptLevel > 1; Conf.PTO.SLPVectorization = options::OptLevel > 1; - if (options::Parallelism) - Backend = createInProcessThinBackend(options::Parallelism); if (options::thinlto_index_only) { std::string OldPrefix, NewPrefix; getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix); Backend = createWriteIndexesThinBackend(OldPrefix, NewPrefix, options::thinlto_emit_imports_files, LinkedObjectsFile, OnIndexWrite); + } else { + ThreadPoolStrategy S = + options::ParallelismHeavyWeight + ? llvm::heavyweight_hardware_concurrency(options::Parallelism) + : llvm::hardware_concurrency(); + Backend = createInProcessThinBackend(S); } Conf.OverrideTriple = options::triple; Index: llvm/tools/llvm-lto2/llvm-lto2.cpp =================================================================== --- llvm/tools/llvm-lto2/llvm-lto2.cpp +++ llvm/tools/llvm-lto2/llvm-lto2.cpp @@ -66,9 +66,10 @@ "distributed backend case")); // Default to using all available threads in the system, but using only one -// thread per core, as indicated by the usage of -// heavyweight_hardware_concurrency() in the InProcessThinBackend constructor. -static cl::opt Threads("thinlto-threads", cl::init(0)); +// thread per core (no SMT). 
+// Use -thinlto-threads=all to use hardware_concurrency() instead, which means +// to use all hardware threads or cores in the system. +static cl::opt Threads("thinlto-threads"); static cl::list SymbolResolutions( "r", @@ -276,6 +277,18 @@ Conf.PTO.LoopVectorization = Conf.OptLevel > 1; Conf.PTO.SLPVectorization = Conf.OptLevel > 1; + auto getStrategy = [](StringRef Num) { + if (Num == "all") + return llvm::hardware_concurrency(); + // No value keeps the documented default of one thread per core. + unsigned V = 0; + if (!Num.empty() && Num.getAsInteger(10, V)) { + llvm::errs() << "invalid -thinlto-threads value: " << Num << '\n'; + exit(1); + } + return llvm::heavyweight_hardware_concurrency(V); + }; + ThinBackend Backend; if (ThinLTODistributedIndexes) Backend = createWriteIndexesThinBackend(/* OldPrefix */ "", @@ -284,7 +297,7 @@ /* LinkedObjectsFile */ nullptr, /* OnWrite */ {}); else - Backend = createInProcessThinBackend(Threads); + Backend = createInProcessThinBackend(getStrategy(Threads)); LTO Lto(std::move(Conf), std::move(Backend)); bool HasErrors = false;