Index: clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
===================================================================
--- clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -268,8 +268,7 @@
   Error = false;
   llvm::sys::Mutex IndexMutex;
   // ExecutorConcurrency is a flag exposed by AllTUsExecution.h
-  llvm::ThreadPool Pool(ExecutorConcurrency == 0 ? llvm::hardware_concurrency()
-                                                 : ExecutorConcurrency);
+  llvm::ThreadPool Pool(llvm::hardware_concurrency(ExecutorConcurrency));
   for (auto &Group : USRToBitcode) {
     Pool.async([&]() {
       std::vector<std::unique_ptr<doc::Info>> Infos;
Index: clang-tools-extra/clangd/TUScheduler.cpp
===================================================================
--- clang-tools-extra/clangd/TUScheduler.cpp
+++ clang-tools-extra/clangd/TUScheduler.cpp
@@ -824,13 +824,7 @@
 } // namespace
 
 unsigned getDefaultAsyncThreadsCount() {
-  unsigned HardwareConcurrency = llvm::heavyweight_hardware_concurrency();
-  // heavyweight_hardware_concurrency may fall back to hardware_concurrency.
-  // C++ standard says that hardware_concurrency() may return 0; fallback to 1
-  // worker thread in that case.
-  if (HardwareConcurrency == 0)
-    return 1;
-  return HardwareConcurrency;
+  return llvm::heavyweight_hardware_concurrency().compute_thread_count();
 }
 
 FileStatus TUStatus::render(PathRef File) const {
Index: clang-tools-extra/clangd/index/Background.h
===================================================================
--- clang-tools-extra/clangd/index/Background.h
+++ clang-tools-extra/clangd/index/Background.h
@@ -121,7 +121,7 @@
                   Context BackgroundContext, const FileSystemProvider &,
                   const GlobalCompilationDatabase &CDB,
                   BackgroundIndexStorage::Factory IndexStorageFactory,
-                  size_t ThreadPoolSize = llvm::heavyweight_hardware_concurrency());
+                  size_t ThreadPoolSize = 0);
   ~BackgroundIndex(); // Blocks while the current task finishes.
 
   // Enqueue translation units for indexing.
Index: clang-tools-extra/clangd/index/Background.cpp
===================================================================
--- clang-tools-extra/clangd/index/Background.cpp
+++ clang-tools-extra/clangd/index/Background.cpp
@@ -146,9 +146,10 @@
       CDB.watch([&](const std::vector<std::string> &ChangedFiles) {
         enqueue(ChangedFiles);
       })) {
-  assert(ThreadPoolSize > 0 && "Thread pool size can't be zero.");
+  assert(Rebuilder.TUsBeforeFirstBuild > 0 &&
+         "Thread pool size can't be zero.");
   assert(this->IndexStorageFactory && "Storage factory can not be null!");
-  for (unsigned I = 0; I < ThreadPoolSize; ++I) {
+  for (unsigned I = 0; I < Rebuilder.TUsBeforeFirstBuild; ++I) {
     ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this] {
       WithContext Ctx(this->BackgroundContext.clone());
       Queue.work([&] { Rebuilder.idle(); });
Index: clang-tools-extra/clangd/index/BackgroundRebuild.h
===================================================================
--- clang-tools-extra/clangd/index/BackgroundRebuild.h
+++ clang-tools-extra/clangd/index/BackgroundRebuild.h
@@ -49,7 +49,9 @@
 public:
   BackgroundIndexRebuilder(SwapIndex *Target, FileSymbols *Source,
                            unsigned Threads)
-      : TUsBeforeFirstBuild(Threads), Target(Target), Source(Source) {}
+      : TUsBeforeFirstBuild(llvm::heavyweight_hardware_concurrency(Threads)
+                                .compute_thread_count()),
+        Target(Target), Source(Source) {}
 
   // Called to indicate a TU has been indexed.
   // May rebuild, if enough TUs have been indexed.
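A minimal sketch of how the new 0-means-auto-detect default used by BackgroundIndex and BackgroundIndexRebuilder above resolves to a concrete worker count (illustrative only; resolveWorkerCount is a hypothetical helper, not part of this patch):

#include "llvm/Support/Threading.h"

// A request of 0 resolves to the number of physical cores; a non-zero request
// is treated as an upper bound and is only ever lowered, never raised.
static unsigned resolveWorkerCount(unsigned Requested) {
  return llvm::heavyweight_hardware_concurrency(Requested)
      .compute_thread_count();
}
// resolveWorkerCount(0)    -> e.g. 8 on an 8-core machine
// resolveWorkerCount(1000) -> capped to the same 8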
Index: clang/lib/Tooling/AllTUsExecution.cpp
===================================================================
--- clang/lib/Tooling/AllTUsExecution.cpp
+++ clang/lib/Tooling/AllTUsExecution.cpp
@@ -114,8 +114,7 @@
   auto &Action = Actions.front();
 
   {
-    llvm::ThreadPool Pool(ThreadCount == 0 ? llvm::hardware_concurrency()
-                                           : ThreadCount);
+    llvm::ThreadPool Pool(llvm::hardware_concurrency(ThreadCount));
     for (std::string File : Files) {
       Pool.async(
           [&](std::string Path) {
Index: clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
===================================================================
--- clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -106,7 +106,8 @@
   // sharding gives a performance edge by reducing the lock contention.
   // FIXME: A better heuristic might also consider the OS to account for
   // the different cost of lock contention on different OSes.
-  NumShards = std::max(2u, llvm::hardware_concurrency() / 4);
+  NumShards =
+      std::max(2u, llvm::hardware_concurrency().compute_thread_count() / 4);
   CacheShards = std::make_unique<CacheShard[]>(NumShards);
 }
Index: clang/tools/clang-scan-deps/ClangScanDeps.cpp
===================================================================
--- clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/Threading.h"
 #include <mutex>
 #include <thread>
@@ -299,14 +300,9 @@
 
   DependencyScanningService Service(ScanMode, Format, ReuseFileManager,
                                     SkipExcludedPPRanges);
-#if LLVM_ENABLE_THREADS
-  unsigned NumWorkers =
-      NumThreads == 0 ? llvm::hardware_concurrency() : NumThreads;
-#else
-  unsigned NumWorkers = 1;
-#endif
+  llvm::ThreadPool Pool(llvm::hardware_concurrency(NumThreads));
   std::vector<std::unique_ptr<DependencyScanningTool>> WorkerTools;
-  for (unsigned I = 0; I < NumWorkers; ++I)
+  for (unsigned I = 0; I < Pool.getThreadCount(); ++I)
     WorkerTools.push_back(std::make_unique<DependencyScanningTool>(Service));
 
   std::vector<SingleCommandCompilationDatabase> Inputs;
@@ -314,18 +310,17 @@
        AdjustingCompilations->getAllCompileCommands())
     Inputs.emplace_back(Cmd);
 
-  std::vector<std::thread> WorkerThreads;
   std::atomic<bool> HadErrors(false);
   std::mutex Lock;
   size_t Index = 0;
 
   if (Verbose) {
     llvm::outs() << "Running clang-scan-deps on " << Inputs.size()
-                 << " files using " << NumWorkers << " workers\n";
+                 << " files using " << Pool.getThreadCount() << " workers\n";
   }
-  for (unsigned I = 0; I < NumWorkers; ++I) {
-    auto Worker = [I, &Lock, &Index, &Inputs, &HadErrors, &WorkerTools,
-                   &DependencyOS, &Errs]() {
+  for (unsigned I = 0; I < Pool.getThreadCount(); ++I) {
+    Pool.async([I, &Lock, &Index, &Inputs, &HadErrors,
+                &WorkerTools, &DependencyOS, &Errs]() {
       while (true) {
         const SingleCommandCompilationDatabase *Input;
         std::string Filename;
@@ -345,16 +340,9 @@
       if (handleDependencyToolResult(Filename, MaybeFile, DependencyOS, Errs))
         HadErrors = true;
       }
-    };
-#if LLVM_ENABLE_THREADS
-    WorkerThreads.emplace_back(std::move(Worker));
-#else
-    // Run the worker without spawning a thread when threads are disabled.
-    Worker();
-#endif
+    });
   }
-  for (auto &W : WorkerThreads)
-    W.join();
+  Pool.wait();
 
   return HadErrors;
 }
Index: lld/ELF/SyntheticSections.cpp
===================================================================
--- lld/ELF/SyntheticSections.cpp
+++ lld/ELF/SyntheticSections.cpp
@@ -2669,8 +2669,8 @@
   size_t numShards = 32;
   size_t concurrency = 1;
   if (threadsEnabled)
-    concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+    concurrency = std::min<size_t>(
+        hardware_concurrency().compute_thread_count(), numShards);
 
   // A sharded map to uniquify symbols by name.
   std::vector<DenseMap<CachedHashStringRef, size_t>> map(numShards);
@@ -3112,8 +3112,8 @@
   // operations in the following tight loop.
   size_t concurrency = 1;
   if (threadsEnabled)
-    concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+    concurrency = std::min<size_t>(
+        hardware_concurrency().compute_thread_count(), numShards);
 
   // Add section pieces to the builders.
   parallelForEachN(0, concurrency, [&](size_t threadId) {
Index: llvm/include/llvm/ADT/BitVector.h
===================================================================
--- llvm/include/llvm/ADT/BitVector.h
+++ llvm/include/llvm/ADT/BitVector.h
@@ -14,6 +14,7 @@
 #define LLVM_ADT_BITVECTOR_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/MathExtras.h"
 #include <algorithm>
@@ -529,31 +530,30 @@
     return false;
   }
 
-  // Comparison operators.
-  bool operator==(const BitVector &RHS) const {
-    unsigned ThisWords = NumBitWords(size());
-    unsigned RHSWords = NumBitWords(RHS.size());
-    unsigned i;
-    for (i = 0; i != std::min(ThisWords, RHSWords); ++i)
-      if (Bits[i] != RHS.Bits[i])
-        return false;
-
-    // Verify that any extra words are all zeros.
-    if (i != ThisWords) {
-      for (; i != ThisWords; ++i)
-        if (Bits[i])
-          return false;
-    } else if (i != RHSWords) {
-      for (; i != RHSWords; ++i)
-        if (RHS.Bits[i])
-          return false;
+  int compare(const BitVector &RHS) const {
+    for (unsigned I = 0; I < std::max(Size, RHS.Size); ++I) {
+      int A = I < Size ? test(I) : 0;
+      int B = I < RHS.Size ? RHS.test(I) : 0;
+      if (A == B)
+        continue;
+      return A - B;
     }
-    return true;
+    return 0;
   }
 
-  bool operator!=(const BitVector &RHS) const {
-    return !(*this == RHS);
-  }
+  // Comparison operators.
+
+  bool operator<(const BitVector &RHS) const { return compare(RHS) == -1; }
+
+  bool operator<=(const BitVector &RHS) const { return compare(RHS) != 1; }
+
+  bool operator>(const BitVector &RHS) const { return compare(RHS) == 1; }
+
+  bool operator>=(const BitVector &RHS) const { return compare(RHS) != -1; }
+
+  bool operator==(const BitVector &RHS) const { return compare(RHS) == 0; }
+
+  bool operator!=(const BitVector &RHS) const { return compare(RHS) != 0; }
 
   /// Intersection, union, disjoint union.
   BitVector &operator&=(const BitVector &RHS) {
@@ -758,6 +758,12 @@
     std::swap(Size, RHS.Size);
   }
 
+  void invalid() {
+    assert(!Size && Bits.empty());
+    Size = -1;
+  }
+
+  ArrayRef<BitWord> getData() const { return Bits; }
+
   //===--------------------------------------------------------------------===//
   // Portable bit mask operations.
   //===--------------------------------------------------------------------===//
 
@@ -932,6 +938,24 @@
   return X.getMemorySize();
 }
 
+template <> struct DenseMapInfo<BitVector> {
+  static inline BitVector getEmptyKey() { return BitVector(); }
+  static inline BitVector getTombstoneKey() {
+    BitVector V;
+    V.invalid();
+    return V;
+  }
+  static unsigned getHashValue(const BitVector &V) {
+    return DenseMapInfo<std::pair<unsigned, ArrayRef<uintptr_t>>>::getHashValue(
+        std::make_pair(V.size(), V.getData()));
+  }
+  static bool isEqual(const BitVector &LHS, const BitVector &RHS) {
+    if (LHS.size() == (BitVector::size_type)-1 ||
+        RHS.size() == (BitVector::size_type)-1)
+      return LHS.size() == RHS.size();
+    return LHS == RHS;
+  }
+};
 } // end namespace llvm
 
 namespace std {
Index: llvm/include/llvm/ADT/SmallBitVector.h
===================================================================
--- llvm/include/llvm/ADT/SmallBitVector.h
+++ llvm/include/llvm/ADT/SmallBitVector.h
@@ -483,26 +483,34 @@
   }
 
   // Comparison operators.
-  bool operator==(const SmallBitVector &RHS) const {
-    if (size() != RHS.size())
-      return false;
-    if (isSmall() && RHS.isSmall())
-      return getSmallBits() == RHS.getSmallBits();
-    else if (!isSmall() && !RHS.isSmall())
-      return *getPointer() == *RHS.getPointer();
-    else {
-      for (size_t i = 0, e = size(); i != e; ++i) {
-        if ((*this)[i] != RHS[i])
-          return false;
-      }
-      return true;
+
+  int compare(const SmallBitVector &RHS) const {
+    for (unsigned I = 0; I < std::max(size(), RHS.size()); ++I) {
+      int A = I < size() ? test(I) : 0;
+      int B = I < RHS.size() ? RHS.test(I) : 0;
+      if (A == B)
+        continue;
+      return A - B;
     }
+    return 0;
   }
 
-  bool operator!=(const SmallBitVector &RHS) const {
-    return !(*this == RHS);
+  bool operator<(const SmallBitVector &RHS) const { return compare(RHS) == -1; }
+
+  bool operator<=(const SmallBitVector &RHS) const { return compare(RHS) != 1; }
+
+  bool operator>(const SmallBitVector &RHS) const { return compare(RHS) == 1; }
+
+  bool operator>=(const SmallBitVector &RHS) const {
+    return compare(RHS) != -1;
   }
 
+  bool operator==(const SmallBitVector &RHS) const { return compare(RHS) == 0; }
+
+  bool operator!=(const SmallBitVector &RHS) const { return compare(RHS) != 0; }
+
   // Intersection, union, disjoint union.
   // FIXME BitVector::operator&= does not resize the LHS but this does
   SmallBitVector &operator&=(const SmallBitVector &RHS) {
Index: llvm/include/llvm/LTO/LTO.h
===================================================================
--- llvm/include/llvm/LTO/LTO.h
+++ llvm/include/llvm/LTO/LTO.h
@@ -227,7 +227,8 @@
                               AddStreamFn AddStream, NativeObjectCache Cache)>;
 
 /// This ThinBackend runs the individual backend jobs in-process.
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel);
+/// The default value means to use one job per hardware core (not hyper-thread).
+ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
 
 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds.
Index: llvm/include/llvm/Support/ThreadPool.h
===================================================================
--- llvm/include/llvm/Support/ThreadPool.h
+++ llvm/include/llvm/Support/ThreadPool.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_SUPPORT_THREAD_POOL_H
 #define LLVM_SUPPORT_THREAD_POOL_H
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/thread.h"
 
 #include <future>
@@ -38,12 +40,11 @@
   using TaskTy = std::function<void()>;
   using PackagedTaskTy = std::packaged_task<void()>;
 
-  /// Construct a pool with the number of threads found by
-  /// hardware_concurrency().
-  ThreadPool();
-
-  /// Construct a pool of \p ThreadCount threads
-  ThreadPool(unsigned ThreadCount);
+  /// Construct a pool using the hardware strategy \p S for mapping hardware
+  /// execution resources (threads, cores, CPUs).
+  /// Defaults to using the maximum execution resources in the system,
+  /// restricted by any affinity mask set on the process.
+  ThreadPool(ThreadPoolStrategy S = hardware_concurrency());
 
   /// Blocking destructor: the pool will wait for all the threads to complete.
   ~ThreadPool();
@@ -68,6 +69,8 @@
   /// It is an error to try to add new tasks while blocking on this call.
   void wait();
 
+  unsigned getThreadCount() const { return ThreadCount; }
+
 private:
   /// Asynchronous submission of a task to the pool. The returned future can be
   /// used to wait for the task to finish and is *non-blocking* on destruction.
@@ -94,6 +97,8 @@
   /// Signal for the destruction of the pool, asking thread to exit.
   bool EnableFlag;
 #endif
+
+  unsigned ThreadCount;
 };
 }
Index: llvm/include/llvm/Support/Threading.h
===================================================================
--- llvm/include/llvm/Support/Threading.h
+++ llvm/include/llvm/Support/Threading.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_THREADING_H
 #define LLVM_SUPPORT_THREADING_H
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/FunctionExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
@@ -143,20 +144,62 @@
 #endif
   }
 
-  /// Get the amount of currency to use for tasks requiring significant
-  /// memory or other resources. Currently based on physical cores, if
-  /// available for the host system, otherwise falls back to
-  /// thread::hardware_concurrency().
-  /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF
-  unsigned heavyweight_hardware_concurrency();
+  /// This tells how a thread pool will be used.
+  class ThreadPoolStrategy {
+  public:
+    // The default thread count means to use all available threads, excluding
+    // those outside the process' affinity mask. If set, this value only
+    // represents a suggested high bound; the runtime may choose a lower
+    // value (but not a higher one).
+    unsigned ThreadCount = 0;
+
+    // If SMT is active, use hyper-threads. If false, there will be only one
+    // std::thread per core (in that case, each thread will be pinned to a
+    // specific core).
+    bool HyperThreads = true;
+
+    // Whether each std::thread should be forced onto a specific hardware
+    // thread. Disabled by default, which leaves the OS free to re-arrange
+    // threads in the most optimal manner.
+    bool PinThreads = false;
+
+    /// Retrieves the maximal number of threads available for the current
+    /// strategy. This accounts for affinity masks and takes advantage of all
+    /// CPU sockets.
+    unsigned compute_thread_count() const;
+
+    /// Assign the current thread to an ideal hardware CPU. In a multi-socket
+    /// system, this ensures all CPU sockets are used.
+    /// \p ThreadPoolNum represents a number between 0 and the maximal number
+    /// of threads available for a CPU or a NUMA group. The ThreadCount member
+    /// above can be either 0 (auto-detect the maximum) or a fixed number of
+    /// threads to be used.
+    void apply_thread_strategy(unsigned ThreadPoolNum) const;
+  };
+
+  /// Returns a thread strategy for tasks requiring significant memory or
+  /// other resources. To be used for workloads where hardware_concurrency()
+  /// proves to be less efficient. Avoid this strategy if doing lots of I/O.
+  /// Currently based on physical cores, if available for the host system,
+  /// otherwise falls back to hardware_concurrency(). Returns 1 when LLVM is
+  /// configured with LLVM_ENABLE_THREADS=OFF.
+  inline ThreadPoolStrategy
+  heavyweight_hardware_concurrency(unsigned ThreadCount = 0) {
+    ThreadPoolStrategy S;
+    S.HyperThreads = false;
+    S.ThreadCount = ThreadCount;
+    return S;
+  }
 
   /// Get the number of threads that the current program can execute
-  /// concurrently. On some systems std::thread::hardware_concurrency() returns
-  /// the total number of cores, without taking affinity into consideration.
-  /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF.
-  /// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is
-  /// not available.
-  unsigned hardware_concurrency();
+  /// concurrently. Returns the default thread strategy, where all available
+  /// hardware resources are to be used, except for those initially excluded
+  /// by an affinity mask. On some systems std::thread::hardware_concurrency()
+  /// returns the total number of cores, without taking affinity into
+  /// consideration. Returns 1 when LLVM is configured with
+  /// LLVM_ENABLE_THREADS=OFF.
+  inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) {
+    ThreadPoolStrategy S;
+    S.ThreadCount = ThreadCount;
+    return S;
+  }
 
   /// Return the current thread id, as used in various OS system calls.
   /// Note that not all platforms guarantee that the value returned will be
@@ -184,6 +227,14 @@
   /// the operation succeeded or failed is returned.
   void get_thread_name(SmallVectorImpl<char> &Name);
 
+  /// Returns a mask that represents on which hardware thread, core, CPU, or
+  /// NUMA group the calling thread can be executed. On Windows, threads
+  /// cannot cross CPU boundaries.
+  llvm::BitVector get_thread_affinity_mask();
+
+  /// Returns how many physical CPUs or NUMA groups the system has.
+  unsigned get_cpus();
+
   enum class ThreadPriority {
     Background = 0,
     Default = 1,
Index: llvm/lib/CodeGen/ParallelCG.cpp
===================================================================
--- llvm/lib/CodeGen/ParallelCG.cpp
+++ llvm/lib/CodeGen/ParallelCG.cpp
@@ -51,7 +51,7 @@
   // Create ThreadPool in nested scope so that threads will be joined
   // on destruction.
   {
-    ThreadPool CodegenThreadPool(OSs.size());
+    ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
     int ThreadCount = 0;
 
     SplitModule(
Index: llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
===================================================================
--- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -153,7 +153,8 @@
 
   if (S.NumCompileThreads > 0) {
     CompileLayer->setCloneToNewContextOnEmit(true);
-    CompileThreads = std::make_unique<ThreadPool>(S.NumCompileThreads);
+    CompileThreads =
+        std::make_unique<ThreadPool>(hardware_concurrency(S.NumCompileThreads));
     ES->setDispatchMaterialization(
         [this](JITDylib &JD, std::unique_ptr<MaterializationUnit> MU) {
           // FIXME: Switch to move capture once we have c++14.
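For reference, the three construction patterns this patch migrates callers to, as a short sketch (illustrative only, not part of the patch):

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
using namespace llvm;

void sketch() {
  // All hardware threads allowed by the process affinity mask (the default).
  ThreadPool Default;
  // An explicit cap: at most 4 threads, fewer if the host has fewer.
  ThreadPool Capped(hardware_concurrency(4));
  // One thread per physical core, for memory-bound work such as ThinLTO.
  ThreadPool Heavy(heavyweight_hardware_concurrency());
  Default.wait();
  Capped.wait();
  Heavy.wait();
}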
Index: llvm/lib/LTO/LTO.cpp
===================================================================
--- llvm/lib/LTO/LTO.cpp
+++ llvm/lib/LTO/LTO.cpp
@@ -475,8 +475,7 @@
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend =
-        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
+    this->Backend = createInProcessThinBackend();
 }
 
 LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1067,7 +1066,8 @@
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(ThinLTOParallelismLevel),
+        BackendThreadPool(
+            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
         AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
Index: llvm/lib/LTO/LTOBackend.cpp
===================================================================
--- llvm/lib/LTO/LTOBackend.cpp
+++ llvm/lib/LTO/LTOBackend.cpp
@@ -375,7 +375,8 @@
 void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream,
                   unsigned ParallelCodeGenParallelismLevel,
                   std::unique_ptr<Module> Mod) {
-  ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel);
+  ThreadPool CodegenThreadPool(
+      heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel));
   unsigned ThreadCount = 0;
   const Target *T = &TM->getTarget();
Index: llvm/lib/LTO/ThinLTOCodeGenerator.cpp
===================================================================
--- llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -81,8 +81,7 @@
 
 namespace {
 
-static cl::opt<int>
-    ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
+static cl::opt<int> ThreadCount("threads", cl::init(0));
 
 // Simple helper to save temporary files for debug.
 static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
@@ -1037,7 +1036,7 @@
 
   // Parallel optimizer + codegen
   {
-    ThreadPool Pool(ThreadCount);
+    ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
     for (auto IndexCount : ModulesOrdering) {
       auto &Mod = Modules[IndexCount];
       Pool.async([&](int count) {
Index: llvm/lib/Support/Host.cpp
===================================================================
--- llvm/lib/Support/Host.cpp
+++ llvm/lib/Support/Host.cpp
@@ -1266,7 +1266,7 @@
 // On Linux, the number of physical cores can be computed from /proc/cpuinfo,
 // using the number of unique physical/core id pairs. The following
 // implementation reads the /proc/cpuinfo format on an x86_64 system.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
   // Read /proc/cpuinfo as a stream (until EOF reached). It cannot be
   // mmapped because it appears to have 0 size.
   llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> Text =
@@ -1312,7 +1312,7 @@
 #include <sys/sysctl.h>
 
 // Gets the number of *physical cores* on the machine.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
   uint32_t count;
   size_t len = sizeof(count);
   sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0);
@@ -1326,6 +1326,9 @@
   }
   return count;
 }
+#elif defined(_WIN32)
+// Defined in llvm/lib/Support/Windows/Threading.inc
+int computeHostNumPhysicalCores();
 #else
 // On other systems, return -1 to indicate unknown.
 static int computeHostNumPhysicalCores() { return -1; }
Index: llvm/lib/Support/Parallel.cpp
===================================================================
--- llvm/lib/Support/Parallel.cpp
+++ llvm/lib/Support/Parallel.cpp
@@ -36,13 +36,16 @@
 /// in filo order.
 class ThreadPoolExecutor : public Executor {
 public:
-  explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency())
-      : Done(ThreadCount) {
+  explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency())
+      : Strategy(S), Done(S.compute_thread_count()) {
     // Spawn all but one of the threads in another thread as spawning threads
     // can take a while.
-    std::thread([&, ThreadCount] {
-      for (size_t i = 1; i < ThreadCount; ++i) {
-        std::thread([=] { work(); }).detach();
+    std::thread([&] {
+      for (size_t I = 1; I < Strategy.compute_thread_count(); ++I) {
+        std::thread([=] {
+          Strategy.apply_thread_strategy(I);
+          work();
+        }).detach();
       }
       work();
     }).detach();
@@ -82,6 +85,7 @@
   std::stack<std::function<void()>> WorkStack;
   std::mutex Mutex;
   std::condition_variable Cond;
+  ThreadPoolStrategy Strategy;
   parallel::detail::Latch Done;
 };
Index: llvm/lib/Support/ThreadPool.cpp
===================================================================
--- llvm/lib/Support/ThreadPool.cpp
+++ llvm/lib/Support/ThreadPool.cpp
@@ -20,16 +20,14 @@
 
 #if LLVM_ENABLE_THREADS
 
-// Default to hardware_concurrency
-ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {}
-
-ThreadPool::ThreadPool(unsigned ThreadCount)
-    : ActiveThreads(0), EnableFlag(true) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+    : ActiveThreads(0), EnableFlag(true),
+      ThreadCount(S.compute_thread_count()) {
   // Create ThreadCount threads that will loop forever, wait on QueueCondition
   // for tasks to be queued or the Pool to be destroyed.
   Threads.reserve(ThreadCount);
   for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) {
-    Threads.emplace_back([&] {
+    Threads.emplace_back([S, ThreadID, this] {
+      S.apply_thread_strategy(ThreadID);
       while (true) {
         PackagedTaskTy Task;
         {
@@ -108,12 +106,10 @@
 
 #else // LLVM_ENABLE_THREADS Disabled
 
-ThreadPool::ThreadPool() : ThreadPool(0) {}
-
 // No threads are launched, issue a warning if ThreadCount is not 0
-ThreadPool::ThreadPool(unsigned ThreadCount)
-    : ActiveThreads(0) {
-  if (ThreadCount) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+    : ActiveThreads(0), ThreadCount(S.compute_thread_count()) {
+  if (ThreadCount != 1) {
     errs() << "Warning: request a ThreadPool with " << ThreadCount
            << " threads, but LLVM_ENABLE_THREADS has been turned off\n";
   }
@@ -138,8 +134,6 @@
   return Future;
 }
 
-ThreadPool::~ThreadPool() {
-  wait();
-}
+ThreadPool::~ThreadPool() { wait(); }
 
 #endif
Index: llvm/lib/Support/Threading.cpp
===================================================================
--- llvm/lib/Support/Threading.cpp
+++ llvm/lib/Support/Threading.cpp
@@ -45,10 +45,6 @@
   Fn(UserData);
 }
 
-unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-
-unsigned llvm::hardware_concurrency() { return 1; }
-
 uint64_t llvm::get_threadid() { return 0; }
 
 uint32_t llvm::get_max_thread_name_length() { return 0; }
@@ -57,6 +53,14 @@
 
 void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
 
+llvm::BitVector llvm::get_thread_affinity_mask() { return {}; }
+
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  // When threads are disabled, always return 1 to ensure clients will loop
+  // at least once.
+  return 1;
+}
+
 #if LLVM_ENABLE_THREADS == 0
 void llvm::llvm_execute_on_thread_async(
     llvm::unique_function<void()> Func,
@@ -78,30 +82,20 @@
 
 #else
 
-#include <thread>
-unsigned llvm::heavyweight_hardware_concurrency() {
-  // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
-  // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
-  // allows us to not define `thread` in the llvm namespace, which conflicts
-  // with some platforms such as FreeBSD whose headers also define a struct
-  // called `thread` in the global namespace which can cause ambiguity due to
-  // ADL.
-  int NumPhysical = sys::getHostNumPhysicalCores();
-  if (NumPhysical == -1)
-    return std::thread::hardware_concurrency();
-  return NumPhysical;
-}
+int computeHostNumPhysicalCores();
+int computeHostNumPhysicalThreads();
 
-unsigned llvm::hardware_concurrency() {
-#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
-  cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
-    return CPU_COUNT(&Set);
-#endif
-  // Guard against std::thread::hardware_concurrency() returning 0.
-  if (unsigned Val = std::thread::hardware_concurrency())
-    return Val;
-  return 1;
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  // Either detection helper may report failure (0 or -1).
+  int MaxThreadCount = HyperThreads ? computeHostNumPhysicalThreads()
+                                    : computeHostNumPhysicalCores();
+  if (MaxThreadCount <= 0)
+    MaxThreadCount = 1;
+
+  // No need to create more threads than there are hardware threads; it would
+  // uselessly induce more context-switching and cache eviction.
+  if (!ThreadCount || ThreadCount > (unsigned)MaxThreadCount)
+    return MaxThreadCount;
+  return ThreadCount;
 }
 
 namespace {
Index: llvm/lib/Support/Unix/Threading.inc
===================================================================
--- llvm/lib/Support/Unix/Threading.inc
+++ llvm/lib/Support/Unix/Threading.inc
@@ -267,3 +267,26 @@
 #endif
   return SetThreadPriorityResult::FAILURE;
 }
+
+#include <thread>
+
+int computeHostNumPhysicalThreads() {
+#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
+  cpu_set_t Set;
+  // sched_getaffinity() returns 0 on success.
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
+    return CPU_COUNT(&Set);
+#endif
+  // Guard against std::thread::hardware_concurrency() returning 0.
+  if (unsigned Val = std::thread::hardware_concurrency())
+    return Val;
+  return 1;
+}
+
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+  llvm_unreachable("Not implemented!");
+}
+
+unsigned llvm::get_cpus() { return 1; }
Index: llvm/lib/Support/Windows/Threading.inc
===================================================================
--- llvm/lib/Support/Windows/Threading.inc
+++ llvm/lib/Support/Windows/Threading.inc
@@ -16,6 +16,8 @@
 #include "WindowsSupport.h"
 #include <processthreadsapi.h>
 
+#include <bitset>
+
 // Windows will at times define MemoryFence.
 #ifdef MemoryFence
 #undef MemoryFence
 #endif
@@ -122,3 +124,197 @@
              ? SetThreadPriorityResult::SUCCESS
             : SetThreadPriorityResult::FAILURE;
 }
+
+struct ProcessorGroup {
+  unsigned ID;
+  unsigned AllThreads;
+  unsigned UsableThreads;
+  unsigned ThreadsPerCore;
+  uint64_t Affinity;
+};
+
+template <typename F>
+static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship,
+                            F Fn) {
+  DWORD Len = 0;
+  BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
+  if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+    return false;
+  }
+  auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
+  R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
+  if (R) {
+    auto *End =
+        (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
+    for (auto *Curr = Info; Curr < End;
+         Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
+                                                            Curr->Size)) {
+      if (Curr->Relationship != Relationship)
+        continue;
+      Fn(Curr);
+    }
+  }
+  free(Info);
+  return true;
+}
+
+static ArrayRef<ProcessorGroup> computeProcessorGroups() {
+  auto computeGroups = []() {
+    SmallVector<ProcessorGroup, 4> Groups;
+
+    auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+      GROUP_RELATIONSHIP &El = ProcInfo->Group;
+      for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
+        ProcessorGroup G;
+        G.ID = Groups.size();
+        G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
+        G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
+        assert(G.UsableThreads <= 64);
+        G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
+        Groups.push_back(G);
+      }
+    };
+
+    if (!IterateProcInfo(RelationGroup, HandleGroup))
+      return std::vector<ProcessorGroup>();
+
+    auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+      PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
+      assert(El.GroupCount == 1);
+      unsigned HyperThreads = 1;
+      // If the flag is set, each core supports more than one hyper-thread.
+      if (El.Flags & LTP_PC_SMT)
+        HyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
+      unsigned I = El.GroupMask[0].Group;
+      Groups[I].ThreadsPerCore = HyperThreads;
+    };
+
+    if (!IterateProcInfo(RelationProcessorCore, HandleProc))
+      return std::vector<ProcessorGroup>();
+
+    // If there's an affinity mask set on one of the CPUs, then assume the user
+    // wants to constrain the current process to only a single CPU.
+    for (auto &G : Groups) {
+      if (G.UsableThreads != G.AllThreads) {
+        ProcessorGroup NewG{G};
+        Groups.clear();
+        Groups.push_back(NewG);
+        break;
+      }
+    }
+
+    return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
+  };
+  static auto Groups = computeGroups();
+  return ArrayRef<ProcessorGroup>(Groups);
+}
+
+template <typename R, typename UnaryPredicate>
+unsigned aggregate(R &&Range, UnaryPredicate P) {
+  unsigned I{};
+  for (const auto &It : Range)
+    I += P(It);
+  return I;
+}
+
+// for sys::getHostNumPhysicalCores
+int computeHostNumPhysicalCores() {
+  static unsigned Cores =
+      aggregate(computeProcessorGroups(), [](const ProcessorGroup &G) {
+        return G.UsableThreads / G.ThreadsPerCore;
+      });
+  return Cores;
+}
+
+int computeHostNumPhysicalThreads() {
+  static unsigned Threads =
+      aggregate(computeProcessorGroups(),
+                [](const ProcessorGroup &G) { return G.UsableThreads; });
+  return Threads;
+}
+
+// Generate a mask that pins a thread to a single hyper-thread, or to a core.
+// 'Affinity' is the mask of allowed hyper-threads; 'Index' tells which of the
+// allowed thread(s) in the Affinity mask to use; 'ThreadsMask' is 1 to pin to
+// one hyper-thread, or wider to pin to a group of hyper-threads (a core).
+static uint64_t gen_mask(uint64_t Affinity, unsigned Index,
+                         unsigned ThreadsMask) {
+  assert(Index < 64);
+  unsigned Count = 0;
+  for (unsigned I = 0; I < 64; ++I) {
+    if ((Affinity >> I) & 1)
+      ++Count;
+    // Shift the mask to the bit position of the Index-th allowed thread.
+    if (Index == Count - 1)
+      return (uint64_t)ThreadsMask << I;
+  }
+  // Don't know how to set the mask, just use the default one.
+  return Affinity;
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  ArrayRef<ProcessorGroup> Groups = computeProcessorGroups();
+
+  assert(ThreadPoolNum < compute_thread_count() &&
+         "The thread index is not within thread strategy's range!");
+
+  // In this mode, ThreadPoolNum represents the core number, not the
+  // hyper-thread number. Assumes all NUMA groups have the same amount of
+  // hyper-threads.
+  if (!HyperThreads)
+    ThreadPoolNum *= Groups[0].ThreadsPerCore;
+
+  unsigned ThreadRangeStart = 0;
+  for (unsigned I = 0; I < Groups.size(); ++I) {
+    const ProcessorGroup &G = Groups[I];
+    if (ThreadPoolNum >= ThreadRangeStart &&
+        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
+
+      GROUP_AFFINITY Affinity{};
+      Affinity.Group = G.ID;
+
+      if (!HyperThreads) {
+        // This effectively pins the thread, alone, to a given core; however
+        // the thread could still be migrated between the hyper-threads within
+        // that core.
+        unsigned HyperThreadsMask = (1 << G.ThreadsPerCore) - 1;
+        Affinity.Mask = gen_mask(G.Affinity, (ThreadPoolNum - ThreadRangeStart),
+                                 HyperThreadsMask);
+      } else {
+        if (PinThreads) {
+          // Pin the thread to a given hyper-thread.
+          Affinity.Mask =
+              gen_mask(G.Affinity, ThreadPoolNum - ThreadRangeStart, 1);
+        } else {
+          // Use all available hyper-threads for that CPU socket or NUMA group.
+          Affinity.Mask = G.Affinity;
+        }
+      }
+      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+    }
+    ThreadRangeStart += G.UsableThreads;
+  }
+}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+  GROUP_AFFINITY Affinity{};
+  GetThreadGroupAffinity(GetCurrentThread(), &Affinity);
+
+  static unsigned All =
+      aggregate(computeProcessorGroups(),
+                [](const ProcessorGroup &G) { return G.AllThreads; });
+
+  unsigned StartOffset =
+      aggregate(computeProcessorGroups(), [&](const ProcessorGroup &G) {
+        return G.ID < Affinity.Group ? G.AllThreads : 0;
+      });
+
+  llvm::BitVector V;
+  V.resize(All);
+  for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
+    if ((Affinity.Mask >> I) & 1)
+      V.set(StartOffset + I);
+  }
+  return V;
+}
+
+unsigned llvm::get_cpus() { return computeProcessorGroups().size(); }
Index: llvm/tools/dsymutil/DwarfLinker.cpp
===================================================================
--- llvm/tools/dsymutil/DwarfLinker.cpp
+++ llvm/tools/dsymutil/DwarfLinker.cpp
@@ -3018,7 +3018,7 @@
   // errors from unchecked llvm::Error objects.
   Error RemarkLinkAllError = Error::success();
 
-  ThreadPool pool(3);
+  ThreadPool pool(hardware_concurrency(3));
   pool.async(AnalyzeAll);
   pool.async(CloneAll);
   pool.async(RemarkLinkAll, std::ref(RemarkLinkAllError));
Index: llvm/tools/dsymutil/dsymutil.cpp
===================================================================
--- llvm/tools/dsymutil/dsymutil.cpp
+++ llvm/tools/dsymutil/dsymutil.cpp
@@ -258,7 +258,7 @@
   if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads))
     Options.LinkOpts.Threads = atoi(NumThreads->getValue());
   else
-    Options.LinkOpts.Threads = thread::hardware_concurrency();
+    Options.LinkOpts.Threads = 0; // Use all available hardware threads.
 
   if (Options.DumpDebugMap || Options.LinkOpts.Verbose)
     Options.LinkOpts.Threads = 1;
@@ -540,9 +540,10 @@
   // Shared a single binary holder for all the link steps.
   BinaryHolder BinHolder;
 
-  unsigned ThreadCount =
-      std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size());
-  ThreadPool Threads(ThreadCount);
+  unsigned ThreadCount = Options.LinkOpts.Threads;
+  if (!ThreadCount)
+    ThreadCount = DebugMapPtrsOrErr->size();
+  ThreadPool Threads(hardware_concurrency(ThreadCount));
 
   // If there is more than one link to execute, we need to generate
   // temporary files.
Index: llvm/tools/gold/gold-plugin.cpp
===================================================================
--- llvm/tools/gold/gold-plugin.cpp
+++ llvm/tools/gold/gold-plugin.cpp
@@ -134,8 +134,8 @@
   static unsigned OptLevel = 2;
   // Default parallelism of 0 used to indicate that user did not specify.
   // Actual parallelism default value depends on implementation.
-  // Currently only affects ThinLTO, where the default is
-  // llvm::heavyweight_hardware_concurrency.
+  // Currently only affects ThinLTO, where the default is the number of
+  // physical cores in the system.
   static unsigned Parallelism = 0;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
Index: llvm/tools/llvm-cov/CodeCoverage.cpp
===================================================================
--- llvm/tools/llvm-cov/CodeCoverage.cpp
+++ llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -944,9 +944,7 @@
 
   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                              unsigned(SourceFiles.size())));
+    NumThreads = SourceFiles.size();
 
   if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) {
     for (const std::string &SourceFile : SourceFiles)
@@ -954,7 +952,7 @@
                          ShowFilenames);
   } else {
     // In -output-dir mode, it's safe to use multiple threads to print files.
-    ThreadPool Pool(NumThreads);
+    ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
     for (const std::string &SourceFile : SourceFiles)
       Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile,
                  Coverage.get(), Printer.get(), ShowFilenames);
Index: llvm/tools/llvm-cov/CoverageExporterJson.cpp
===================================================================
--- llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -163,11 +163,9 @@
                       ArrayRef<FileCoverageSummary> FileReports,
                       const CoverageViewOptions &Options) {
   auto NumThreads = Options.NumThreads;
-  if (NumThreads == 0) {
-    NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                                       unsigned(SourceFiles.size())));
-  }
-  ThreadPool Pool(NumThreads);
+  if (NumThreads == 0)
+    NumThreads = SourceFiles.size();
+  ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
   json::Array FileArray;
   std::mutex FileArrayMutex;
Index: llvm/tools/llvm-cov/CoverageReport.cpp
===================================================================
--- llvm/tools/llvm-cov/CoverageReport.cpp
+++ llvm/tools/llvm-cov/CoverageReport.cpp
@@ -356,11 +356,8 @@
 
   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                              unsigned(Files.size())));
-
-  ThreadPool Pool(NumThreads);
+    NumThreads = Files.size();
+  ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
 
   std::vector<FileCoverageSummary> FileReports;
   FileReports.reserve(Files.size());
Index: llvm/tools/llvm-lto2/llvm-lto2.cpp
===================================================================
--- llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -65,8 +65,7 @@
                    "import files for the "
                    "distributed backend case"));
 
-static cl::opt<int> Threads("thinlto-threads",
-                            cl::init(llvm::heavyweight_hardware_concurrency()));
+static cl::opt<int> Threads("thinlto-threads", cl::init(0));
 
 static cl::list<std::string> SymbolResolutions(
     "r",
Index: llvm/tools/llvm-profdata/llvm-profdata.cpp
===================================================================
--- llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -307,8 +307,11 @@
 
   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2));
+    NumThreads = std::min(hardware_concurrency().compute_thread_count(),
+                          unsigned((Inputs.size() + 1) / 2));
+  // FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails
+  // the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't
+  // merged, thus the emitted file ends up with a PF_Unknown kind.
 
   // Initialize the writer contexts.
   SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
@@ -320,7 +323,7 @@
     for (const auto &Input : Inputs)
       loadInput(Input, Remapper, Contexts[0].get());
   } else {
-    ThreadPool Pool(NumThreads);
+    ThreadPool Pool(hardware_concurrency(NumThreads));
 
     // Load the inputs in parallel (N/NumThreads serial steps).
     unsigned Ctx = 0;
Index: llvm/unittests/ADT/BitVectorTest.cpp
===================================================================
--- llvm/unittests/ADT/BitVectorTest.cpp
+++ llvm/unittests/ADT/BitVectorTest.cpp
@@ -1149,4 +1149,24 @@
   EXPECT_EQ(213U, Vec.size());
   EXPECT_EQ(102U, Vec.count());
 }
+
+TYPED_TEST(BitVectorTest, Compare) {
+  TypeParam A(10, false);
+  TypeParam B(5, false);
+  EXPECT_EQ(A, B);
+
+  B.flip();
+  EXPECT_LE(A, B);
+  EXPECT_GT(B, A);
+  EXPECT_NE(B, A);
+
+  A.flip();
+  EXPECT_LE(B, A);
+  EXPECT_GT(A, B);
+  EXPECT_NE(A, B);
+
+  A.reset();
+  A.set(0, 5);
+  EXPECT_EQ(A, B);
+}
 }
Index: llvm/unittests/Support/Host.cpp
===================================================================
--- llvm/unittests/Support/Host.cpp
+++ llvm/unittests/Support/Host.cpp
@@ -37,7 +37,8 @@
     // Initially this is only testing detection of the number of
     // physical cores, which is currently only supported/tested for
     // x86_64 Linux and Darwin.
-    return (Host.getArch() == Triple::x86_64 &&
+    return Host.isOSWindows() ||
+           (Host.getArch() == Triple::x86_64 &&
             (Host.isOSDarwin() || Host.getOS() == Triple::Linux));
   }
Index: llvm/unittests/Support/TaskQueueTest.cpp
===================================================================
--- llvm/unittests/Support/TaskQueueTest.cpp
+++ llvm/unittests/Support/TaskQueueTest.cpp
@@ -22,7 +22,7 @@
 };
 
 TEST_F(TaskQueueTest, OrderedFutures) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::atomic<int> X{ 0 };
   std::atomic<int> Y{ 0 };
@@ -66,7 +66,7 @@
 }
 
 TEST_F(TaskQueueTest, UnOrderedFutures) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::atomic<int> X{ 0 };
   std::atomic<int> Y{ 0 };
@@ -96,7 +96,7 @@
 }
 
 TEST_F(TaskQueueTest, FutureWithReturnValue) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::future<std::string> F1 = TQ.async([&] { return std::string("Hello"); });
  std::future<int> F2 = TQ.async([&] { return 42; });
Index: llvm/unittests/Support/ThreadPool.cpp
===================================================================
--- llvm/unittests/Support/ThreadPool.cpp
+++ llvm/unittests/Support/ThreadPool.cpp
@@ -8,11 +8,13 @@
 
 #include "llvm/Support/ThreadPool.h"
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
 
 #include "gtest/gtest.h"
 
@@ -69,6 +71,8 @@
 
   void SetUp() override { MainThreadReady = false; }
 
+  void TestAllThreads(ThreadPoolStrategy S);
+
   std::condition_variable WaitMainThread;
   std::mutex WaitMainThreadMutex;
   bool MainThreadReady = false;
@@ -131,7 +135,7 @@
 
 TEST_F(ThreadPoolTest, GetFuture) {
   CHECK_UNSUPPORTED();
-  ThreadPool Pool{2};
+  ThreadPool Pool(hardware_concurrency(2));
   std::atomic_int i{0};
   Pool.async([this, &i] {
     waitForMainThread();
@@ -162,3 +166,56 @@
   }
   ASSERT_EQ(5, checked_in);
 }
+
+#if LLVM_ENABLE_THREADS == 1
+
+void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) {
+  // FIXME: Skip these tests on non-Windows, because multi-CPU sockets weren't
+  // tested there, and llvm::get_thread_affinity_mask() isn't implemented for
+  // all flavors of Unix.
+  Triple Host(Triple::normalize(sys::getProcessTriple()));
+  if (!Host.isOSWindows())
+    return;
+
+  llvm::DenseSet<llvm::BitVector> ThreadsUsed;
+  std::mutex Lock;
+  unsigned Threads = 0;
+  {
+    ThreadPool Pool(S);
+    Threads = Pool.getThreadCount();
+    for (size_t I = 0; I < 10000; ++I) {
+      Pool.async([&] {
+        waitForMainThread();
+        std::lock_guard<std::mutex> Guard(Lock);
+        auto Mask = llvm::get_thread_affinity_mask();
+        ThreadsUsed.insert(Mask);
+      });
+    }
+    ASSERT_EQ(true, ThreadsUsed.empty());
+    setMainThreadReady();
+  }
+  if (!S.PinThreads && S.HyperThreads)
+    ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());
+  else
+    ASSERT_EQ(Threads, ThreadsUsed.size());
+}
+
+TEST_F(ThreadPoolTest, AllThreads_UseAllResources) {
+  CHECK_UNSUPPORTED();
+  TestAllThreads({});
+}
+
+TEST_F(ThreadPoolTest, AllThreads_PinThreads) {
+  CHECK_UNSUPPORTED();
+  ThreadPoolStrategy S;
+  S.PinThreads = true;
+  TestAllThreads(S);
+}
+
+TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) {
+  CHECK_UNSUPPORTED();
+  TestAllThreads(llvm::heavyweight_hardware_concurrency());
+}
+
+#endif
Index: llvm/unittests/Support/Threading.cpp
===================================================================
--- llvm/unittests/Support/Threading.cpp
+++ llvm/unittests/Support/Threading.cpp
@@ -21,7 +21,8 @@
   auto Num = heavyweight_hardware_concurrency();
   // Since Num is unsigned this will also catch us trying to
   // return -1.
-  ASSERT_LE(Num, thread::hardware_concurrency());
+  ASSERT_LE(Num.compute_thread_count(),
+            hardware_concurrency().compute_thread_count());
 }
 
 #if LLVM_ENABLE_THREADS
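The BitVector changes earlier in this patch exist to support the test above: affinity masks are deduplicated in a DenseSet, which requires the new DenseMapInfo<BitVector> specialization (empty/tombstone keys plus hashing over size and data) and the new comparison operators. A standalone sketch of that interaction (illustrative only, not part of the patch):

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseSet.h"

static unsigned countDistinctMasks() {
  llvm::DenseSet<llvm::BitVector> Masks;
  llvm::BitVector A(64), B(64);
  A.set(0);
  B.set(0);
  Masks.insert(A);
  Masks.insert(B); // Equal contents: same hash, compares equal, not re-added.
  return Masks.size(); // 1
}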