diff --git a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
--- a/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ b/clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -268,8 +268,7 @@
   Error = false;
   llvm::sys::Mutex IndexMutex;
   // ExecutorConcurrency is a flag exposed by AllTUsExecution.h
-  llvm::ThreadPool Pool(ExecutorConcurrency == 0 ? llvm::hardware_concurrency()
-                                                 : ExecutorConcurrency);
+  llvm::ThreadPool Pool(llvm::hardware_concurrency(ExecutorConcurrency));
   for (auto &Group : USRToBitcode) {
     Pool.async([&]() {
       std::vector<std::unique_ptr<doc::Info>> Infos;
diff --git a/clang-tools-extra/clangd/TUScheduler.cpp b/clang-tools-extra/clangd/TUScheduler.cpp
--- a/clang-tools-extra/clangd/TUScheduler.cpp
+++ b/clang-tools-extra/clangd/TUScheduler.cpp
@@ -842,13 +842,7 @@
 } // namespace

 unsigned getDefaultAsyncThreadsCount() {
-  unsigned HardwareConcurrency = llvm::heavyweight_hardware_concurrency();
-  // heavyweight_hardware_concurrency may fall back to hardware_concurrency.
-  // C++ standard says that hardware_concurrency() may return 0; fallback to 1
-  // worker thread in that case.
-  if (HardwareConcurrency == 0)
-    return 1;
-  return HardwareConcurrency;
+  return llvm::heavyweight_hardware_concurrency().compute_thread_count();
 }

 FileStatus TUStatus::render(PathRef File) const {
diff --git a/clang-tools-extra/clangd/index/Background.h b/clang-tools-extra/clangd/index/Background.h
--- a/clang-tools-extra/clangd/index/Background.h
+++ b/clang-tools-extra/clangd/index/Background.h
@@ -135,7 +135,7 @@
       Context BackgroundContext, const FileSystemProvider &,
       const GlobalCompilationDatabase &CDB,
       BackgroundIndexStorage::Factory IndexStorageFactory,
-      size_t ThreadPoolSize = llvm::heavyweight_hardware_concurrency(),
+      size_t ThreadPoolSize = 0, // 0 = use all hardware threads
       std::function<void(BackgroundQueue::Stats)> OnProgress = nullptr);

   ~BackgroundIndex(); // Blocks while the current task finishes.
diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp
--- a/clang-tools-extra/clangd/index/Background.cpp
+++ b/clang-tools-extra/clangd/index/Background.cpp
@@ -148,9 +148,10 @@
       CDB.watch([&](const std::vector<std::string> &ChangedFiles) {
         enqueue(ChangedFiles);
       })) {
-  assert(ThreadPoolSize > 0 && "Thread pool size can't be zero.");
+  assert(Rebuilder.TUsBeforeFirstBuild > 0 &&
+         "Thread pool size can't be zero.");
   assert(this->IndexStorageFactory && "Storage factory can not be null!");
-  for (unsigned I = 0; I < ThreadPoolSize; ++I) {
+  for (unsigned I = 0; I < Rebuilder.TUsBeforeFirstBuild; ++I) {
     ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this] {
       WithContext Ctx(this->BackgroundContext.clone());
       Queue.work([&] { Rebuilder.idle(); });
diff --git a/clang-tools-extra/clangd/index/BackgroundRebuild.h b/clang-tools-extra/clangd/index/BackgroundRebuild.h
--- a/clang-tools-extra/clangd/index/BackgroundRebuild.h
+++ b/clang-tools-extra/clangd/index/BackgroundRebuild.h
@@ -49,7 +49,9 @@
 public:
   BackgroundIndexRebuilder(SwapIndex *Target, FileSymbols *Source,
                            unsigned Threads)
-      : TUsBeforeFirstBuild(Threads), Target(Target), Source(Source) {}
+      : TUsBeforeFirstBuild(llvm::heavyweight_hardware_concurrency(Threads)
+                                .compute_thread_count()),
+        Target(Target), Source(Source) {}

   // Called to indicate a TU has been indexed.
   // May rebuild, if enough TUs have been indexed.
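Reviewer note: the pattern these call sites converge on is that a user-facing 0 no longer needs
special-casing; the strategy treats 0 as "all available threads" and clamps any explicit request
to the hardware maximum. A minimal sketch of the call-site change (a hypothetical `Concurrency`
flag stands in for the various per-tool options):

    // Before: every caller special-cased 0 itself.
    //   llvm::ThreadPool Pool(Concurrency == 0 ? llvm::hardware_concurrency()
    //                                          : Concurrency);
    // After: both 0 and over-subscription are handled inside the strategy.
    llvm::ThreadPool Pool(llvm::hardware_concurrency(Concurrency));
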
diff --git a/clang/lib/Tooling/AllTUsExecution.cpp b/clang/lib/Tooling/AllTUsExecution.cpp
--- a/clang/lib/Tooling/AllTUsExecution.cpp
+++ b/clang/lib/Tooling/AllTUsExecution.cpp
@@ -114,8 +114,7 @@
   auto &Action = Actions.front();

   {
-    llvm::ThreadPool Pool(ThreadCount == 0 ? llvm::hardware_concurrency()
-                                           : ThreadCount);
+    llvm::ThreadPool Pool(llvm::hardware_concurrency(ThreadCount));
     for (std::string File : Files) {
       Pool.async(
           [&](std::string Path) {
diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
--- a/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -106,7 +106,8 @@
   // sharding gives a performance edge by reducing the lock contention.
   // FIXME: A better heuristic might also consider the OS to account for
   // the different cost of lock contention on different OSes.
-  NumShards = std::max(2u, llvm::hardware_concurrency() / 4);
+  NumShards =
+      std::max(2u, llvm::hardware_concurrency().compute_thread_count() / 4);
   CacheShards = std::make_unique<CacheShard[]>(NumShards);
 }
diff --git a/clang/tools/clang-scan-deps/ClangScanDeps.cpp b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
--- a/clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ b/clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -485,15 +485,9 @@

   DependencyScanningService Service(ScanMode, Format, ReuseFileManager,
                                     SkipExcludedPPRanges);
-#if LLVM_ENABLE_THREADS
-  unsigned NumWorkers =
-      NumThreads == 0 ? llvm::hardware_concurrency() : NumThreads;
-#else
-  unsigned NumWorkers = 1;
-#endif
-  llvm::ThreadPool Pool(NumWorkers);
+  llvm::ThreadPool Pool(llvm::hardware_concurrency(NumThreads));
   std::vector<std::unique_ptr<DependencyScanningTool>> WorkerTools;
-  for (unsigned I = 0; I < NumWorkers; ++I)
+  for (unsigned I = 0; I < Pool.getThreadCount(); ++I)
     WorkerTools.push_back(std::make_unique<DependencyScanningTool>(Service));

   std::vector<SingleCommandCompilationDatabase> Inputs;
@@ -508,9 +502,9 @@

   if (Verbose) {
     llvm::outs() << "Running clang-scan-deps on " << Inputs.size()
-                 << " files using " << NumWorkers << " workers\n";
+                 << " files using " << Pool.getThreadCount() << " workers\n";
   }
-  for (unsigned I = 0; I < NumWorkers; ++I) {
+  for (unsigned I = 0; I < Pool.getThreadCount(); ++I) {
     Pool.async([I, &Lock, &Index, &Inputs, &HadErrors, &FD, &WorkerTools,
                 &DependencyOS, &Errs]() {
       llvm::StringSet<> AlreadySeenModules;
diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp
--- a/lld/ELF/SyntheticSections.cpp
+++ b/lld/ELF/SyntheticSections.cpp
@@ -2747,8 +2747,8 @@
   size_t numShards = 32;
   size_t concurrency = 1;
   if (threadsEnabled)
-    concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+    concurrency = std::min<size_t>(
+        hardware_concurrency().compute_thread_count(), numShards);

   // A sharded map to uniquify symbols by name.
   std::vector<DenseMap<CachedHashStringRef, size_t>> map(numShards);
@@ -3191,8 +3191,8 @@
   // operations in the following tight loop.
   size_t concurrency = 1;
   if (threadsEnabled)
-    concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+    concurrency = std::min<size_t>(
+        hardware_concurrency().compute_thread_count(), numShards);

   // Add section pieces to the builders.
   parallelForEachN(0, concurrency, [&](size_t threadId) {
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -227,7 +227,8 @@
                            AddStreamFn AddStream, NativeObjectCache Cache)>;

 /// This ThinBackend runs the individual backend jobs in-process.
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel);
+/// The default value means to use one job per hardware core (not hyper-thread).
+ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);

 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds
diff --git a/llvm/include/llvm/Support/ThreadPool.h b/llvm/include/llvm/Support/ThreadPool.h
--- a/llvm/include/llvm/Support/ThreadPool.h
+++ b/llvm/include/llvm/Support/ThreadPool.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_SUPPORT_THREAD_POOL_H
 #define LLVM_SUPPORT_THREAD_POOL_H

+#include "llvm/ADT/BitVector.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/thread.h"

 #include <future>
@@ -38,12 +40,11 @@
   using TaskTy = std::function<void()>;
   using PackagedTaskTy = std::packaged_task<void()>;

-  /// Construct a pool with the number of threads found by
-  /// hardware_concurrency().
-  ThreadPool();
-
-  /// Construct a pool of \p ThreadCount threads
-  ThreadPool(unsigned ThreadCount);
+  /// Construct a pool using the hardware strategy \p S for mapping hardware
+  /// execution resources (threads, cores, CPUs).
+  /// Defaults to using the maximum execution resources in the system, but
+  /// excluding any resources contained in the affinity mask.
+  ThreadPool(ThreadPoolStrategy S = hardware_concurrency());

   /// Blocking destructor: the pool will wait for all the threads to complete.
   ~ThreadPool();
@@ -68,6 +69,8 @@
   /// It is an error to try to add new tasks while blocking on this call.
   void wait();

+  unsigned getThreadCount() const { return ThreadCount; }
+
 private:
   /// Asynchronous submission of a task to the pool. The returned future can be
   /// used to wait for the task to finish and is *non-blocking* on destruction.
@@ -94,6 +97,8 @@
   /// Signal for the destruction of the pool, asking thread to exit.
   bool EnableFlag;
 #endif
+
+  unsigned ThreadCount;
 };
 }
diff --git a/llvm/include/llvm/Support/Threading.h b/llvm/include/llvm/Support/Threading.h
--- a/llvm/include/llvm/Support/Threading.h
+++ b/llvm/include/llvm/Support/Threading.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_THREADING_H
 #define LLVM_SUPPORT_THREADING_H

+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/FunctionExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
@@ -143,20 +144,52 @@
 #endif
 }

-/// Get the amount of currency to use for tasks requiring significant
-/// memory or other resources. Currently based on physical cores, if
-/// available for the host system, otherwise falls back to
-/// thread::hardware_concurrency().
-/// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF
-unsigned heavyweight_hardware_concurrency();
-
-/// Get the number of threads that the current program can execute
-/// concurrently. On some systems std::thread::hardware_concurrency() returns
-/// the total number of cores, without taking affinity into consideration.
-/// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF.
-/// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is
-/// not available.
-unsigned hardware_concurrency();
+/// This tells how a thread pool will be used
+class ThreadPoolStrategy {
+public:
+  // The default value (0) means all available threads should be used,
+  // excluding the affinity mask. If set, this value only represents a
+  // suggested high bound; the runtime might choose a lower value (not higher).
+  unsigned ThreadsRequested = 0;
+
+  // If SMT is active, use hyper threads. If false, there will be only one
+  // std::thread per core.
+  bool UseHyperThreads = true;
+
+  /// Retrieves the max available threads for the current strategy. This
+  /// accounts for affinity masks and takes advantage of all CPU sockets.
+  unsigned compute_thread_count() const;
+
+  /// Assign the current thread to an ideal hardware CPU or NUMA node. In a
+  /// multi-socket system, this ensures threads are assigned to all CPU
+  /// sockets. \p ThreadPoolNum represents a number bounded by [0,
+  /// compute_thread_count()).
+  void apply_thread_strategy(unsigned ThreadPoolNum) const;
+};
+
+/// Returns a thread strategy for tasks requiring significant memory or other
+/// resources. To be used for workloads where hardware_concurrency() proves to
+/// be less efficient. Avoid this strategy if doing lots of I/O. Currently
+/// based on physical cores, if available for the host system; otherwise falls
+/// back to hardware_concurrency(). Returns 1 when LLVM is configured with
+/// LLVM_ENABLE_THREADS=OFF.
+inline ThreadPoolStrategy
+heavyweight_hardware_concurrency(unsigned ThreadCount = 0) {
+  ThreadPoolStrategy S;
+  S.UseHyperThreads = false;
+  S.ThreadsRequested = ThreadCount;
+  return S;
+}
+
+/// Returns a default thread strategy where all available hardware resources
+/// are to be used, except for those initially excluded by an affinity mask.
+/// This function takes affinity into consideration. Returns 1 when LLVM is
+/// configured with LLVM_ENABLE_THREADS=OFF.
+inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) {
+  ThreadPoolStrategy S;
+  S.ThreadsRequested = ThreadCount;
+  return S;
+}

 /// Return the current thread id, as used in various OS system calls.
 /// Note that not all platforms guarantee that the value returned will be
@@ -184,6 +217,14 @@
 /// the operation succeeded or failed is returned.
 void get_thread_name(SmallVectorImpl<char> &Name);

+/// Returns a mask that represents on which hardware thread, core, CPU, NUMA
+/// group, the calling thread can be executed. On Windows, threads cannot
+/// cross CPU boundaries.
+llvm::BitVector get_thread_affinity_mask();
+
+/// Returns how many physical CPUs or NUMA groups the system has.
+unsigned get_cpus();
+
 enum class ThreadPriority {
   Background = 0,
   Default = 1,
diff --git a/llvm/lib/CodeGen/ParallelCG.cpp b/llvm/lib/CodeGen/ParallelCG.cpp
--- a/llvm/lib/CodeGen/ParallelCG.cpp
+++ b/llvm/lib/CodeGen/ParallelCG.cpp
@@ -51,7 +51,7 @@
   // Create ThreadPool in nested scope so that threads will be joined
   // on destruction.
   {
-    ThreadPool CodegenThreadPool(OSs.size());
+    ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
     int ThreadCount = 0;

     SplitModule(
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -2446,7 +2446,7 @@
     }
     EmitLambda();
   } else {
-    ThreadPool Pool(2);
+    ThreadPool Pool(hardware_concurrency(2));
     Pool.async(AnalyzeAll);
     Pool.async(CloneAll);
     Pool.wait();
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
--- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -445,7 +445,7 @@

     // Now parse all DIEs in case we have cross compile unit references in a
     // thread pool.
-    ThreadPool pool(NumThreads);
+    ThreadPool pool(hardware_concurrency(NumThreads));
     for (const auto &CU : DICtx.compile_units())
       pool.async([&CU]() { CU->getUnitDIE(false /*CUDieOnly*/); });
     pool.wait();
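Reviewer note: a self-contained sketch of the new construction API introduced above; the task
body and function name are placeholders:

    #include "llvm/Support/ThreadPool.h"
    #include "llvm/Support/Threading.h"
    using namespace llvm;

    void demo() {
      ThreadPool Default;                            // hardware_concurrency(): all
                                                     // threads in the affinity mask
      ThreadPool Capped(hardware_concurrency(2));    // at most 2; fewer on a
                                                     // smaller machine
      ThreadPool PerCore(heavyweight_hardware_concurrency()); // one per physical core
      Capped.async([] { /* work */ });
      Capped.wait();
    }
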
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -157,7 +157,8 @@

   if (S.NumCompileThreads > 0) {
     TransformLayer->setCloneToNewContextOnEmit(true);
-    CompileThreads = std::make_unique<ThreadPool>(S.NumCompileThreads);
+    CompileThreads = std::make_unique<ThreadPool>(
+        hardware_concurrency(S.NumCompileThreads));
     ES->setDispatchMaterialization(
         [this](JITDylib &JD, std::unique_ptr<MaterializationUnit> MU) {
           // FIXME: Switch to move capture once we have c++14.
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -477,8 +477,7 @@
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend =
-        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
+    this->Backend = createInProcessThinBackend();
 }

 LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1095,7 +1094,8 @@
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(ThinLTOParallelismLevel),
+        BackendThreadPool(
+            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
         AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -375,7 +375,8 @@
 void splitCodeGen(const Config &C, TargetMachine *TM, AddStreamFn AddStream,
                   unsigned ParallelCodeGenParallelismLevel,
                   std::unique_ptr<Module> Mod) {
-  ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel);
+  ThreadPool CodegenThreadPool(
+      heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel));
   unsigned ThreadCount = 0;
   const Target *T = &TM->getTarget();
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -80,8 +80,8 @@

 namespace {

-static cl::opt<unsigned>
-    ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
+// Default to using one job per hardware core in the system
+static cl::opt<unsigned> ThreadCount("threads", cl::init(0));

 // Simple helper to save temporary files for debug.
 static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
@@ -1042,7 +1042,7 @@

   // Parallel optimizer + codegen
   {
-    ThreadPool Pool(ThreadCount);
+    ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
     for (auto IndexCount : ModulesOrdering) {
       auto &Mod = Modules[IndexCount];
       Pool.async([&](int count) {
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -1266,7 +1266,7 @@
 // On Linux, the number of physical cores can be computed from /proc/cpuinfo,
 // using the number of unique physical/core id pairs. The following
 // implementation reads the /proc/cpuinfo format on an x86_64 system.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
   // Read /proc/cpuinfo as a stream (until EOF reached). It cannot be
   // mmapped because it appears to have 0 size.
   llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> Text =
@@ -1312,7 +1312,7 @@
 #include <sys/sysctl.h>

 // Gets the number of *physical cores* on the machine.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
   uint32_t count;
   size_t len = sizeof(count);
   sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0);
@@ -1326,6 +1326,9 @@
   }
   return count;
 }
+#elif defined(_WIN32)
+// Defined in llvm/lib/Support/Windows/Threading.inc
+int computeHostNumPhysicalCores();
 #else
 // On other systems, return -1 to indicate unknown.
 static int computeHostNumPhysicalCores() { return -1; }
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -39,20 +39,21 @@
 /// in filo order.
 class ThreadPoolExecutor : public Executor {
 public:
-  explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency()) {
+  explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency()) {
+    unsigned ThreadCount = S.compute_thread_count();
     // Spawn all but one of the threads in another thread as spawning threads
     // can take a while.
     Threads.reserve(ThreadCount);
     Threads.resize(1);
     std::lock_guard<std::mutex> Lock(Mutex);
-    Threads[0] = std::thread([&, ThreadCount] {
-      for (unsigned i = 1; i < ThreadCount; ++i) {
-        Threads.emplace_back([=] { work(); });
+    Threads[0] = std::thread([this, ThreadCount, S] {
+      for (unsigned I = 1; I < ThreadCount; ++I) {
+        Threads.emplace_back([=] { work(S, I); });
         if (Stop)
           break;
       }
       ThreadsCreated.set_value();
-      work();
+      work(S, 0);
     });
   }

@@ -90,7 +91,8 @@
   }

 private:
-  void work() {
+  void work(ThreadPoolStrategy S, unsigned ThreadID) {
+    S.apply_thread_strategy(ThreadID);
     while (true) {
       std::unique_lock<std::mutex> Lock(Mutex);
       Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp
--- a/llvm/lib/Support/ThreadPool.cpp
+++ b/llvm/lib/Support/ThreadPool.cpp
@@ -20,16 +20,15 @@

 #if LLVM_ENABLE_THREADS

-// Default to hardware_concurrency
-ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {}
-
-ThreadPool::ThreadPool(unsigned ThreadCount)
-    : ActiveThreads(0), EnableFlag(true) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+    : ActiveThreads(0), EnableFlag(true),
+      ThreadCount(S.compute_thread_count()) {
   // Create ThreadCount threads that will loop forever, wait on QueueCondition
   // for tasks to be queued or the Pool to be destroyed.
   Threads.reserve(ThreadCount);
   for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) {
-    Threads.emplace_back([&] {
+    Threads.emplace_back([S, ThreadID, this] {
+      S.apply_thread_strategy(ThreadID);
       while (true) {
         PackagedTaskTy Task;
         {
@@ -108,12 +107,10 @@

 #else // LLVM_ENABLE_THREADS Disabled

-ThreadPool::ThreadPool() : ThreadPool(0) {}
-
 // No threads are launched, issue a warning if ThreadCount is not 0
-ThreadPool::ThreadPool(unsigned ThreadCount)
-    : ActiveThreads(0) {
-  if (ThreadCount) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+    : ActiveThreads(0), ThreadCount(S.compute_thread_count()) {
+  if (ThreadCount != 1) {
     errs() << "Warning: request a ThreadPool with " << ThreadCount
            << " threads, but LLVM_ENABLE_THREADS has been turned off\n";
   }
@@ -138,8 +135,6 @@
   return Future;
 }

-ThreadPool::~ThreadPool() {
-  wait();
-}
+ThreadPool::~ThreadPool() { wait(); }

 #endif
diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp
--- a/llvm/lib/Support/Threading.cpp
+++ b/llvm/lib/Support/Threading.cpp
@@ -45,10 +45,6 @@
   Fn(UserData);
 }

-unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-
-unsigned llvm::hardware_concurrency() { return 1; }
-
 uint64_t llvm::get_threadid() { return 0; }

 uint32_t llvm::get_max_thread_name_length() { return 0; }
@@ -57,6 +53,13 @@

 void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }

+llvm::BitVector llvm::get_thread_affinity_mask() { return {}; }
+
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  // When threads are disabled, ensure clients will loop at least once.
+  return 1;
+}
+
 #if LLVM_ENABLE_THREADS == 0
 void llvm::llvm_execute_on_thread_async(
     llvm::unique_function<void()> Func,
@@ -78,30 +81,19 @@

 #else

-#include <thread>
-unsigned llvm::heavyweight_hardware_concurrency() {
-  // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
-  // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
-  // allows us to not define `thread` in the llvm namespace, which conflicts
-  // with some platforms such as FreeBSD whose headers also define a struct
-  // called `thread` in the global namespace which can cause ambiguity due to
-  // ADL.
-  int NumPhysical = sys::getHostNumPhysicalCores();
-  if (NumPhysical == -1)
-    return std::thread::hardware_concurrency();
-  return NumPhysical;
-}
+int computeHostNumHardwareThreads();

-unsigned llvm::hardware_concurrency() {
-#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
-  cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
-    return CPU_COUNT(&Set);
-#endif
-  // Guard against std::thread::hardware_concurrency() returning 0.
-  if (unsigned Val = std::thread::hardware_concurrency())
-    return Val;
-  return 1;
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
+                                       : sys::getHostNumPhysicalCores();
+  if (MaxThreadCount <= 0)
+    MaxThreadCount = 1;
+
+  // No need to create more threads than there are hardware threads; it would
+  // uselessly induce more context-switching and cache eviction.
+  if (!ThreadsRequested || ThreadsRequested > (unsigned)MaxThreadCount)
+    return MaxThreadCount;
+  return ThreadsRequested;
 }

 namespace {
diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -267,3 +267,27 @@
 #endif
   return SetThreadPriorityResult::FAILURE;
 }
+
+#include <thread>
+
+int computeHostNumHardwareThreads() {
+#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
+  cpu_set_t Set;
+  if (sched_getaffinity(0, sizeof(Set), &Set))
+    return CPU_COUNT(&Set);
+#endif
+  // Guard against std::thread::hardware_concurrency() returning 0.
+  if (unsigned Val = std::thread::hardware_concurrency())
+    return Val;
+  return 1;
+}
+
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+  // FIXME: Implement
+  llvm_unreachable("Not implemented!");
+}
+
+unsigned llvm::get_cpus() { return 1; }
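Reviewer note: to make the clamping in compute_thread_count() concrete, here is how it behaves
on a hypothetical 4-core/8-thread machine (values are illustrative, not asserted by the patch):

    ThreadPoolStrategy S = hardware_concurrency(16);
    S.compute_thread_count();                                   // 8: clamped to the maximum
    hardware_concurrency(0).compute_thread_count();             // 8: 0 means "use everything"
    heavyweight_hardware_concurrency().compute_thread_count();  // 4: one per physical core
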
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -16,6 +16,8 @@
 #include "WindowsSupport.h"
 #include <process.h>

+#include <bitset>
+
 // Windows will at times define MemoryFence.
 #ifdef MemoryFence
 #undef MemoryFence
@@ -122,3 +124,163 @@
              ? SetThreadPriorityResult::SUCCESS
              : SetThreadPriorityResult::FAILURE;
 }
+
+struct ProcessorGroup {
+  unsigned ID;
+  unsigned AllThreads;
+  unsigned UsableThreads;
+  unsigned ThreadsPerCore;
+  uint64_t Affinity;
+};
+
+template <typename F>
+static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
+  DWORD Len = 0;
+  BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
+  if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+    return false;
+  }
+  auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
+  R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
+  if (R) {
+    auto *End =
+        (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
+    for (auto *Curr = Info; Curr < End;
+         Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
+                                                            Curr->Size)) {
+      if (Curr->Relationship != Relationship)
+        continue;
+      Fn(Curr);
+    }
+  }
+  free(Info);
+  return true;
+}
+
+static ArrayRef<ProcessorGroup> getProcessorGroups() {
+  auto computeGroups = []() {
+    SmallVector<ProcessorGroup, 4> Groups;
+
+    auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+      GROUP_RELATIONSHIP &El = ProcInfo->Group;
+      for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
+        ProcessorGroup G;
+        G.ID = Groups.size();
+        G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
+        G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
+        assert(G.UsableThreads <= 64);
+        G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
+        Groups.push_back(G);
+      }
+    };
+
+    if (!IterateProcInfo(RelationGroup, HandleGroup))
+      return std::vector<ProcessorGroup>();
+
+    auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+      PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
+      assert(El.GroupCount == 1);
+      unsigned NumHyperThreads = 1;
+      // If the flag is set, each core supports more than one hyper-thread.
+      if (El.Flags & LTP_PC_SMT)
+        NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
+      unsigned I = El.GroupMask[0].Group;
+      Groups[I].ThreadsPerCore = NumHyperThreads;
+    };
+
+    if (!IterateProcInfo(RelationProcessorCore, HandleProc))
+      return std::vector<ProcessorGroup>();
+
+    // If there's an affinity mask set on one of the CPUs, then assume the user
+    // wants to constrain the current process to only a single CPU.
+    for (auto &G : Groups) {
+      if (G.UsableThreads != G.AllThreads) {
+        ProcessorGroup NewG{G};
+        Groups.clear();
+        Groups.push_back(NewG);
+        break;
+      }
+    }
+
+    return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
+  };
+  static auto Groups = computeGroups();
+  return ArrayRef<ProcessorGroup>(Groups);
+}
+
+template <typename R, typename UnaryPredicate>
+static unsigned aggregate(R &&Range, UnaryPredicate P) {
+  unsigned I{};
+  for (const auto &It : Range)
+    I += P(It);
+  return I;
+}
+
+// for sys::getHostNumPhysicalCores
+int computeHostNumPhysicalCores() {
+  static unsigned Cores =
+      aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
+        return G.UsableThreads / G.ThreadsPerCore;
+      });
+  return Cores;
+}
+
+int computeHostNumHardwareThreads() {
+  static unsigned Threads =
+      aggregate(getProcessorGroups(),
+                [](const ProcessorGroup &G) { return G.UsableThreads; });
+  return Threads;
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+
+  assert(ThreadPoolNum < compute_thread_count() &&
+         "The thread index is not within thread strategy's range!");
+
+  // In this mode, the ThreadNumber represents the core number, not the
+  // hyper-thread number. Assumes all NUMA groups have the same amount of
+  // hyper-threads.
+  if (!UseHyperThreads)
+    ThreadPoolNum *= Groups[0].ThreadsPerCore;
+
+  unsigned ThreadRangeStart = 0;
+  for (unsigned I = 0; I < Groups.size(); ++I) {
+    const ProcessorGroup &G = Groups[I];
+    if (ThreadPoolNum >= ThreadRangeStart &&
+        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {

+      GROUP_AFFINITY Affinity{};
+      Affinity.Group = G.ID;
+      Affinity.Mask = G.Affinity;
+      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+    }
+    ThreadRangeStart += G.UsableThreads;
+  }
+}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+  GROUP_AFFINITY Affinity{};
+  GetThreadGroupAffinity(GetCurrentThread(), &Affinity);
+
+  static unsigned All =
+      aggregate(getProcessorGroups(),
+                [](const ProcessorGroup &G) { return G.AllThreads; });
+
+  unsigned StartOffset =
+      aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
+        return G.ID < Affinity.Group ? G.AllThreads : 0;
+      });
+
+  llvm::BitVector V;
+  V.resize(All);
+  for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
+    if ((Affinity.Mask >> I) & 1)
+      V.set(StartOffset + I);
+  }
+  return V;
+}
+
+unsigned llvm::get_cpus() { return getProcessorGroups().size(); }
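Reviewer note: a hedged sketch of what apply_thread_strategy() accomplishes on a hypothetical
dual-socket Windows machine with two processor groups of 32 usable threads each. Pool threads
0-31 land on group 0 and threads 32-63 on group 1, so the pool spans both sockets instead of
inheriting the default single-group affinity:

    ThreadPool Pool(hardware_concurrency());  // 64 threads on this example machine
    // Worker I calls S.apply_thread_strategy(I) on startup, which pins it to
    // processor group I / 32 via SetThreadGroupAffinity().
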
diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp
--- a/llvm/tools/dsymutil/dsymutil.cpp
+++ b/llvm/tools/dsymutil/dsymutil.cpp
@@ -258,7 +258,7 @@
   if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads))
     Options.LinkOpts.Threads = atoi(NumThreads->getValue());
   else
-    Options.LinkOpts.Threads = thread::hardware_concurrency();
+    Options.LinkOpts.Threads = 0; // Use all available hardware threads

   if (Options.DumpDebugMap || Options.LinkOpts.Verbose)
     Options.LinkOpts.Threads = 1;
@@ -541,9 +541,10 @@
   // Shared a single binary holder for all the link steps.
   BinaryHolder BinHolder;

-  unsigned ThreadCount =
-      std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size());
-  ThreadPool Threads(ThreadCount);
+  unsigned ThreadCount = Options.LinkOpts.Threads;
+  if (!ThreadCount)
+    ThreadCount = DebugMapPtrsOrErr->size();
+  ThreadPool Threads(hardware_concurrency(ThreadCount));

   // If there is more than one link to execute, we need to generate
   // temporary files.
diff --git a/llvm/tools/gold/gold-plugin.cpp b/llvm/tools/gold/gold-plugin.cpp
--- a/llvm/tools/gold/gold-plugin.cpp
+++ b/llvm/tools/gold/gold-plugin.cpp
@@ -134,8 +134,8 @@
   static unsigned OptLevel = 2;
   // Default parallelism of 0 used to indicate that user did not specify.
   // Actual parallelism default value depends on implementation.
-  // Currently only affects ThinLTO, where the default is
-  // llvm::heavyweight_hardware_concurrency.
+  // Currently only affects ThinLTO, where the default is the max cores in the
+  // system.
   static unsigned Parallelism = 0;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
diff --git a/llvm/tools/llvm-cov/CodeCoverage.cpp b/llvm/tools/llvm-cov/CodeCoverage.cpp
--- a/llvm/tools/llvm-cov/CodeCoverage.cpp
+++ b/llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -947,9 +947,7 @@

   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                              unsigned(SourceFiles.size())));
+    NumThreads = SourceFiles.size();

   if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) {
     for (const std::string &SourceFile : SourceFiles)
@@ -957,7 +955,7 @@
                          ShowFilenames);
   } else {
     // In -output-dir mode, it's safe to use multiple threads to print files.
-    ThreadPool Pool(NumThreads);
+    ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
     for (const std::string &SourceFile : SourceFiles)
       Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile,
                  Coverage.get(), Printer.get(), ShowFilenames);
diff --git a/llvm/tools/llvm-cov/CoverageExporterJson.cpp b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
--- a/llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ b/llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -163,11 +163,9 @@
                               ArrayRef<FileCoverageSummary> FileReports,
                               const CoverageViewOptions &Options) {
   auto NumThreads = Options.NumThreads;
-  if (NumThreads == 0) {
-    NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                                       unsigned(SourceFiles.size())));
-  }
-  ThreadPool Pool(NumThreads);
+  if (NumThreads == 0)
+    NumThreads = SourceFiles.size();
+  ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
   json::Array FileArray;
   std::mutex FileArrayMutex;
diff --git a/llvm/tools/llvm-cov/CoverageReport.cpp b/llvm/tools/llvm-cov/CoverageReport.cpp
--- a/llvm/tools/llvm-cov/CoverageReport.cpp
+++ b/llvm/tools/llvm-cov/CoverageReport.cpp
@@ -356,11 +356,8 @@

   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                              unsigned(Files.size())));
-
-  ThreadPool Pool(NumThreads);
+    NumThreads = Files.size();
+  ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));

   std::vector<FileCoverageSummary> FileReports;
   FileReports.reserve(Files.size());
diff --git a/llvm/tools/llvm-lto2/llvm-lto2.cpp b/llvm/tools/llvm-lto2/llvm-lto2.cpp
--- a/llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ b/llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -65,8 +65,8 @@
                    "import files for the "
                    "distributed backend case"));

-static cl::opt<unsigned> Threads("thinlto-threads",
-                                 cl::init(llvm::heavyweight_hardware_concurrency()));
+// Default to using all hardware cores in the system.
+static cl::opt<unsigned> Threads("thinlto-threads", cl::init(0));

 static cl::list<std::string> SymbolResolutions(
     "r",
diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp
--- a/llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ b/llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -307,8 +307,11 @@

   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2));
+    NumThreads = std::min(hardware_concurrency().compute_thread_count(),
+                          unsigned((Inputs.size() + 1) / 2));
+  // FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails
+  // the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't
+  // merged, thus the emitted file ends up with a PF_Unknown kind.

   // Initialize the writer contexts.
   SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
@@ -320,7 +323,7 @@
     for (const auto &Input : Inputs)
       loadInput(Input, Remapper, Contexts[0].get());
   } else {
-    ThreadPool Pool(NumThreads);
+    ThreadPool Pool(hardware_concurrency(NumThreads));

     // Load the inputs in parallel (N/NumThreads serial steps).
     unsigned Ctx = 0;
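Reviewer note: the llvm-cov and llvm-profdata changes lean on the strategy's clamping, which is
why the old std::max/std::min dance can be deleted. For instance, with hypothetical values:

    // 1000 source files on an 8-thread machine:
    ThreadPool Pool(heavyweight_hardware_concurrency(/*NumThreads=*/1000));
    // Pool.getThreadCount() is the physical-core count, not 1000.
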
diff --git a/llvm/unittests/Support/Host.cpp b/llvm/unittests/Support/Host.cpp
--- a/llvm/unittests/Support/Host.cpp
+++ b/llvm/unittests/Support/Host.cpp
@@ -37,7 +37,8 @@
     // Initially this is only testing detection of the number of
     // physical cores, which is currently only supported/tested for
    // x86_64 Linux and Darwin.
-    return (Host.getArch() == Triple::x86_64 &&
+    return Host.isOSWindows() ||
+           (Host.getArch() == Triple::x86_64 &&
             (Host.isOSDarwin() || Host.getOS() == Triple::Linux));
   }
diff --git a/llvm/unittests/Support/TaskQueueTest.cpp b/llvm/unittests/Support/TaskQueueTest.cpp
--- a/llvm/unittests/Support/TaskQueueTest.cpp
+++ b/llvm/unittests/Support/TaskQueueTest.cpp
@@ -22,7 +22,7 @@
 };

 TEST_F(TaskQueueTest, OrderedFutures) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::atomic<int> X{ 0 };
   std::atomic<int> Y{ 0 };
@@ -66,7 +66,7 @@
 }

 TEST_F(TaskQueueTest, UnOrderedFutures) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::atomic<int> X{ 0 };
   std::atomic<int> Y{ 0 };
@@ -96,7 +96,7 @@
 }

 TEST_F(TaskQueueTest, FutureWithReturnValue) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::future<std::string> F1 = TQ.async([&] { return std::string("Hello"); });
   std::future<int> F2 = TQ.async([&] { return 42; });
diff --git a/llvm/unittests/Support/ThreadPool.cpp b/llvm/unittests/Support/ThreadPool.cpp
--- a/llvm/unittests/Support/ThreadPool.cpp
+++ b/llvm/unittests/Support/ThreadPool.cpp
@@ -8,11 +8,13 @@

 #include "llvm/Support/ThreadPool.h"

+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"

 #include "gtest/gtest.h"

@@ -69,6 +71,8 @@

   void SetUp() override { MainThreadReady = false; }

+  void TestAllThreads(ThreadPoolStrategy S);
+
   std::condition_variable WaitMainThread;
   std::mutex WaitMainThreadMutex;
   bool MainThreadReady = false;
@@ -131,7 +135,7 @@

 TEST_F(ThreadPoolTest, GetFuture) {
   CHECK_UNSUPPORTED();
-  ThreadPool Pool{2};
+  ThreadPool Pool(hardware_concurrency(2));
   std::atomic_int i{0};
   Pool.async([this, &i] {
     waitForMainThread();
@@ -162,3 +166,45 @@
   }
   ASSERT_EQ(5, checked_in);
 }
+
+#if LLVM_ENABLE_THREADS == 1
+
+void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) {
+  // FIXME: Skip these tests on non-Windows because multi-socket systems were
+  // not tested on Unix yet, and llvm::get_thread_affinity_mask() isn't
+  // implemented for Unix.
+  Triple Host(Triple::normalize(sys::getProcessTriple()));
+  if (!Host.isOSWindows())
+    return;
+
+  llvm::DenseSet<llvm::BitVector> ThreadsUsed;
+  std::mutex Lock;
+  unsigned Threads = 0;
+  {
+    ThreadPool Pool(S);
+    Threads = Pool.getThreadCount();
+    for (size_t I = 0; I < 10000; ++I) {
+      Pool.async([&] {
+        waitForMainThread();
+        std::lock_guard<std::mutex> Guard(Lock);
+        auto Mask = llvm::get_thread_affinity_mask();
+        ThreadsUsed.insert(Mask);
+      });
+    }
+    ASSERT_EQ(true, ThreadsUsed.empty());
+    setMainThreadReady();
+  }
+  ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());
+}
+
+TEST_F(ThreadPoolTest, AllThreads_UseAllResources) {
+  CHECK_UNSUPPORTED();
+  TestAllThreads({});
+}
+
+TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) {
+  CHECK_UNSUPPORTED();
+  TestAllThreads(llvm::heavyweight_hardware_concurrency());
+}
+
+#endif
diff --git a/llvm/unittests/Support/Threading.cpp b/llvm/unittests/Support/Threading.cpp
--- a/llvm/unittests/Support/Threading.cpp
+++ b/llvm/unittests/Support/Threading.cpp
@@ -21,7 +21,8 @@
   auto Num = heavyweight_hardware_concurrency();
   // Since Num is unsigned this will also catch us trying to
   // return -1.
-  ASSERT_LE(Num, thread::hardware_concurrency());
+  ASSERT_LE(Num.compute_thread_count(),
+            hardware_concurrency().compute_thread_count());
 }

 #if LLVM_ENABLE_THREADS
diff --git a/mlir/lib/Pass/Pass.cpp b/mlir/lib/Pass/Pass.cpp
--- a/mlir/lib/Pass/Pass.cpp
+++ b/mlir/lib/Pass/Pass.cpp
@@ -411,7 +411,8 @@
   // Create the async executors if they haven't been created, or if the main
   // pipeline has changed.
   if (asyncExecutors.empty() || hasSizeMismatch(asyncExecutors.front(), mgrs))
-    asyncExecutors.assign(llvm::hardware_concurrency(), mgrs);
+    asyncExecutors.assign(llvm::hardware_concurrency().compute_thread_count(),
+                          mgrs);

   // Run a prepass over the module to collect the operations to execute over.
   // This ensures that an analysis manager exists for each operation, as well as
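
Reviewer note: the clang-scan-deps and mlir call sites illustrate the companion pattern for the
new getThreadCount() accessor: when per-worker state must be sized up front, query the pool
rather than recomputing the strategy. A minimal sketch (WorkerState is a hypothetical type):

    ThreadPool Pool(hardware_concurrency(NumThreads));      // NumThreads may be 0
    std::vector<WorkerState> States(Pool.getThreadCount()); // one slot per worker
    for (unsigned I = 0; I < Pool.getThreadCount(); ++I)
      Pool.async([&, I] { States[I].run(); });
    Pool.wait();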