Index: clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
===================================================================
--- clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
+++ clang-tools-extra/clang-doc/tool/ClangDocMain.cpp
@@ -268,8 +268,7 @@
   Error = false;
   llvm::sys::Mutex IndexMutex;
   // ExecutorConcurrency is a flag exposed by AllTUsExecution.h
-  llvm::ThreadPool Pool(ExecutorConcurrency == 0 ? llvm::hardware_concurrency()
-                                                 : ExecutorConcurrency);
+  llvm::ThreadPool Pool(llvm::hardware_concurrency(ExecutorConcurrency));
   for (auto &Group : USRToBitcode) {
     Pool.async([&]() {
       std::vector<std::unique_ptr<doc::Info>> Infos;
Index: clang-tools-extra/clangd/TUScheduler.cpp
===================================================================
--- clang-tools-extra/clangd/TUScheduler.cpp
+++ clang-tools-extra/clangd/TUScheduler.cpp
@@ -824,13 +824,7 @@
 } // namespace
 
 unsigned getDefaultAsyncThreadsCount() {
-  unsigned HardwareConcurrency = llvm::heavyweight_hardware_concurrency();
-  // heavyweight_hardware_concurrency may fall back to hardware_concurrency.
-  // C++ standard says that hardware_concurrency() may return 0; fallback to 1
-  // worker thread in that case.
-  if (HardwareConcurrency == 0)
-    return 1;
-  return HardwareConcurrency;
+  return llvm::heavyweight_hardware_concurrency().compute_thread_count();
 }
 
 FileStatus TUStatus::render(PathRef File) const {
Index: clang-tools-extra/clangd/index/Background.h
===================================================================
--- clang-tools-extra/clangd/index/Background.h
+++ clang-tools-extra/clangd/index/Background.h
@@ -121,7 +121,7 @@
                   Context BackgroundContext, const FileSystemProvider &,
                   const GlobalCompilationDatabase &CDB,
                   BackgroundIndexStorage::Factory IndexStorageFactory,
-                  size_t ThreadPoolSize = llvm::heavyweight_hardware_concurrency());
+                  size_t ThreadPoolSize = 0);
   ~BackgroundIndex(); // Blocks while the current task finishes.
 
   // Enqueue translation units for indexing.
Index: clang-tools-extra/clangd/index/Background.cpp
===================================================================
--- clang-tools-extra/clangd/index/Background.cpp
+++ clang-tools-extra/clangd/index/Background.cpp
@@ -146,9 +146,10 @@
       CDB.watch([&](const std::vector<std::string> &ChangedFiles) {
         enqueue(ChangedFiles);
       })) {
-  assert(ThreadPoolSize > 0 && "Thread pool size can't be zero.");
+  assert(Rebuilder.TUsBeforeFirstBuild > 0 &&
+         "Thread pool size can't be zero.");
   assert(this->IndexStorageFactory && "Storage factory can not be null!");
-  for (unsigned I = 0; I < ThreadPoolSize; ++I) {
+  for (unsigned I = 0; I < Rebuilder.TUsBeforeFirstBuild; ++I) {
     ThreadPool.runAsync("background-worker-" + llvm::Twine(I + 1), [this] {
       WithContext Ctx(this->BackgroundContext.clone());
       Queue.work([&] { Rebuilder.idle(); });
Index: clang-tools-extra/clangd/index/BackgroundRebuild.h
===================================================================
--- clang-tools-extra/clangd/index/BackgroundRebuild.h
+++ clang-tools-extra/clangd/index/BackgroundRebuild.h
@@ -49,7 +49,9 @@
 public:
   BackgroundIndexRebuilder(SwapIndex *Target, FileSymbols *Source,
                            unsigned Threads)
-      : TUsBeforeFirstBuild(Threads), Target(Target), Source(Source) {}
+      : TUsBeforeFirstBuild(llvm::heavyweight_hardware_concurrency(Threads)
+                                .compute_thread_count()),
+        Target(Target), Source(Source) {}
 
   // Called to indicate a TU has been indexed.
   // May rebuild, if enough TUs have been indexed.
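A minimal sketch of how the new 0-means-auto-detect default used by BackgroundIndex and BackgroundIndexRebuilder above resolves to a concrete worker count (illustrative only; resolveWorkerCount is a hypothetical helper, not part of this patch):

#include "llvm/Support/Threading.h"

// A request of 0 resolves to the number of physical cores; a non-zero request
// is treated as an upper bound and is only ever lowered, never raised.
static unsigned resolveWorkerCount(unsigned Requested) {
  return llvm::heavyweight_hardware_concurrency(Requested)
      .compute_thread_count();
}
// resolveWorkerCount(0)    -> e.g. 8 on an 8-core machine
// resolveWorkerCount(1000) -> capped to the same 8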
Index: clang/lib/Tooling/AllTUsExecution.cpp
===================================================================
--- clang/lib/Tooling/AllTUsExecution.cpp
+++ clang/lib/Tooling/AllTUsExecution.cpp
@@ -114,8 +114,7 @@
   auto &Action = Actions.front();
 
   {
-    llvm::ThreadPool Pool(ThreadCount == 0 ? llvm::hardware_concurrency()
-                                           : ThreadCount);
+    llvm::ThreadPool Pool(llvm::hardware_concurrency(ThreadCount));
     for (std::string File : Files) {
       Pool.async(
           [&](std::string Path) {
Index: clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
===================================================================
--- clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
+++ clang/lib/Tooling/DependencyScanning/DependencyScanningFilesystem.cpp
@@ -106,7 +106,8 @@
   // sharding gives a performance edge by reducing the lock contention.
   // FIXME: A better heuristic might also consider the OS to account for
   // the different cost of lock contention on different OSes.
-  NumShards = std::max(2u, llvm::hardware_concurrency() / 4);
+  NumShards =
+      std::max(2u, llvm::hardware_concurrency().compute_thread_count() / 4);
   CacheShards = std::make_unique<CacheShard[]>(NumShards);
 }
Index: clang/tools/clang-scan-deps/ClangScanDeps.cpp
===================================================================
--- clang/tools/clang-scan-deps/ClangScanDeps.cpp
+++ clang/tools/clang-scan-deps/ClangScanDeps.cpp
@@ -17,6 +17,7 @@
 #include "llvm/Support/InitLLVM.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/Signals.h"
+#include "llvm/Support/ThreadPool.h"
 #include "llvm/Support/Threading.h"
 #include <mutex>
 #include <thread>
@@ -299,14 +300,9 @@
 
   DependencyScanningService Service(ScanMode, Format, ReuseFileManager,
                                     SkipExcludedPPRanges);
-#if LLVM_ENABLE_THREADS
-  unsigned NumWorkers =
-      NumThreads == 0 ? llvm::hardware_concurrency() : NumThreads;
-#else
-  unsigned NumWorkers = 1;
-#endif
+  llvm::ThreadPool Pool(llvm::hardware_concurrency(NumThreads));
   std::vector<std::unique_ptr<DependencyScanningTool>> WorkerTools;
-  for (unsigned I = 0; I < NumWorkers; ++I)
+  for (unsigned I = 0; I < Pool.getThreadCount(); ++I)
     WorkerTools.push_back(std::make_unique<DependencyScanningTool>(Service));
 
   std::vector<SingleCommandCompilationDatabase> Inputs;
@@ -314,18 +310,17 @@
        AdjustingCompilations->getAllCompileCommands())
     Inputs.emplace_back(Cmd);
 
-  std::vector<std::thread> WorkerThreads;
   std::atomic<bool> HadErrors(false);
   std::mutex Lock;
   size_t Index = 0;
 
   if (Verbose) {
     llvm::outs() << "Running clang-scan-deps on " << Inputs.size()
-                 << " files using " << NumWorkers << " workers\n";
+                 << " files using " << Pool.getThreadCount() << " workers\n";
   }
-  for (unsigned I = 0; I < NumWorkers; ++I) {
-    auto Worker = [I, &Lock, &Index, &Inputs, &HadErrors, &WorkerTools,
-                   &DependencyOS, &Errs]() {
+  for (unsigned I = 0; I < Pool.getThreadCount(); ++I) {
+    Pool.async([I, &Lock, &Index, &Inputs, &HadErrors,
+                &WorkerTools, &DependencyOS, &Errs]() {
       while (true) {
         const SingleCommandCompilationDatabase *Input;
         std::string Filename;
@@ -345,16 +340,9 @@
       if (handleDependencyToolResult(Filename, MaybeFile, DependencyOS, Errs))
         HadErrors = true;
       }
-    };
-#if LLVM_ENABLE_THREADS
-    WorkerThreads.emplace_back(std::move(Worker));
-#else
-    // Run the worker without spawning a thread when threads are disabled.
-    Worker();
-#endif
+    });
   }
-  for (auto &W : WorkerThreads)
-    W.join();
+  Pool.wait();
 
   return HadErrors;
 }
Index: lld/ELF/SyntheticSections.cpp
===================================================================
--- lld/ELF/SyntheticSections.cpp
+++ lld/ELF/SyntheticSections.cpp
@@ -2669,8 +2669,8 @@
   size_t numShards = 32;
   size_t concurrency = 1;
   if (threadsEnabled)
-    concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+    concurrency = std::min<size_t>(
+        hardware_concurrency().compute_thread_count(), numShards);
 
   // A sharded map to uniquify symbols by name.
   std::vector<DenseMap<CachedHashStringRef, size_t>> map(numShards);
@@ -3112,8 +3112,8 @@
   // operations in the following tight loop.
   size_t concurrency = 1;
   if (threadsEnabled)
-    concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), numShards);
+    concurrency = std::min<size_t>(
+        hardware_concurrency().compute_thread_count(), numShards);
 
   // Add section pieces to the builders.
   parallelForEachN(0, concurrency, [&](size_t threadId) {
Index: llvm/include/llvm/ADT/BitVector.h
===================================================================
--- llvm/include/llvm/ADT/BitVector.h
+++ llvm/include/llvm/ADT/BitVector.h
@@ -14,6 +14,7 @@
 #define LLVM_ADT_BITVECTOR_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMapInfo.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/MathExtras.h"
 #include <algorithm>
@@ -529,31 +530,30 @@
     return false;
   }
 
-  // Comparison operators.
-  bool operator==(const BitVector &RHS) const {
-    unsigned ThisWords = NumBitWords(size());
-    unsigned RHSWords = NumBitWords(RHS.size());
-    unsigned i;
-    for (i = 0; i != std::min(ThisWords, RHSWords); ++i)
-      if (Bits[i] != RHS.Bits[i])
-        return false;
-
-    // Verify that any extra words are all zeros.
-    if (i != ThisWords) {
-      for (; i != ThisWords; ++i)
-        if (Bits[i])
-          return false;
-    } else if (i != RHSWords) {
-      for (; i != RHSWords; ++i)
-        if (RHS.Bits[i])
-          return false;
+  int compare(const BitVector &RHS) const {
+    for (unsigned I = 0; I < std::max(Size, RHS.Size); ++I) {
+      int A = I < Size ? test(I) : 0;
+      int B = I < RHS.Size ? RHS.test(I) : 0;
+      if (A == B)
+        continue;
+      return A - B;
     }
-    return true;
+    return 0;
   }
 
-  bool operator!=(const BitVector &RHS) const {
-    return !(*this == RHS);
-  }
+  // Comparison operators.
+
+  bool operator<(const BitVector &RHS) const { return compare(RHS) == -1; }
+
+  bool operator<=(const BitVector &RHS) const { return compare(RHS) != 1; }
+
+  bool operator>(const BitVector &RHS) const { return compare(RHS) == 1; }
+
+  bool operator>=(const BitVector &RHS) const { return compare(RHS) != -1; }
+
+  bool operator==(const BitVector &RHS) const { return compare(RHS) == 0; }
+
+  bool operator!=(const BitVector &RHS) const { return compare(RHS) != 0; }
 
   /// Intersection, union, disjoint union.
   BitVector &operator&=(const BitVector &RHS) {
@@ -758,6 +758,12 @@
     std::swap(Size, RHS.Size);
   }
 
+  void invalid() {
+    assert(!Size && Bits.empty());
+    Size = -1;
+  }
+
+  ArrayRef<BitWord> getData() const { return Bits; }
+
   //===--------------------------------------------------------------------===//
   // Portable bit mask operations.
   //===--------------------------------------------------------------------===//
 
@@ -932,6 +938,24 @@
   return X.getMemorySize();
 }
 
+template <> struct DenseMapInfo<BitVector> {
+  static inline BitVector getEmptyKey() { return BitVector(); }
+  static inline BitVector getTombstoneKey() {
+    BitVector V;
+    V.invalid();
+    return V;
+  }
+  static unsigned getHashValue(const BitVector &V) {
+    return DenseMapInfo<std::pair<unsigned, ArrayRef<uintptr_t>>>::getHashValue(
+        std::make_pair(V.size(), V.getData()));
+  }
+  static bool isEqual(const BitVector &LHS, const BitVector &RHS) {
+    if (LHS.size() == (BitVector::size_type)-1 ||
+        RHS.size() == (BitVector::size_type)-1)
+      return LHS.size() == RHS.size();
+    return LHS == RHS;
+  }
+};
 } // end namespace llvm
 
 namespace std {
Index: llvm/include/llvm/ADT/SmallBitVector.h
===================================================================
--- llvm/include/llvm/ADT/SmallBitVector.h
+++ llvm/include/llvm/ADT/SmallBitVector.h
@@ -483,26 +483,34 @@
   }
 
   // Comparison operators.
-  bool operator==(const SmallBitVector &RHS) const {
-    if (size() != RHS.size())
-      return false;
-    if (isSmall() && RHS.isSmall())
-      return getSmallBits() == RHS.getSmallBits();
-    else if (!isSmall() && !RHS.isSmall())
-      return *getPointer() == *RHS.getPointer();
-    else {
-      for (size_t i = 0, e = size(); i != e; ++i) {
-        if ((*this)[i] != RHS[i])
-          return false;
-      }
-      return true;
+
+  int compare(const SmallBitVector &RHS) const {
+    for (unsigned I = 0; I < std::max(size(), RHS.size()); ++I) {
+      int A = I < size() ? test(I) : 0;
+      int B = I < RHS.size() ? RHS.test(I) : 0;
+      if (A == B)
+        continue;
+      return A - B;
     }
+    return 0;
   }
 
-  bool operator!=(const SmallBitVector &RHS) const {
-    return !(*this == RHS);
+  bool operator<(const SmallBitVector &RHS) const { return compare(RHS) == -1; }
+
+  bool operator<=(const SmallBitVector &RHS) const { return compare(RHS) != 1; }
+
+  bool operator>(const SmallBitVector &RHS) const { return compare(RHS) == 1; }
+
+  bool operator>=(const SmallBitVector &RHS) const {
+    return compare(RHS) != -1;
   }
 
+  bool operator==(const SmallBitVector &RHS) const { return compare(RHS) == 0; }
+
+  bool operator!=(const SmallBitVector &RHS) const { return compare(RHS) != 0; }
+
   // Intersection, union, disjoint union.
   // FIXME BitVector::operator&= does not resize the LHS but this does
   SmallBitVector &operator&=(const SmallBitVector &RHS) {
Index: llvm/include/llvm/LTO/LTO.h
===================================================================
--- llvm/include/llvm/LTO/LTO.h
+++ llvm/include/llvm/LTO/LTO.h
@@ -227,7 +227,8 @@
                               AddStreamFn AddStream, NativeObjectCache Cache)>;
 
 /// This ThinBackend runs the individual backend jobs in-process.
-ThinBackend createInProcessThinBackend(unsigned ParallelismLevel);
+/// The default value means to use one job per hardware core (not hyper-thread).
+ThinBackend createInProcessThinBackend(unsigned ParallelismLevel = 0);
 
 /// This ThinBackend writes individual module indexes to files, instead of
 /// running the individual backend jobs. This backend is for distributed builds.
Index: llvm/include/llvm/Support/ThreadPool.h
===================================================================
--- llvm/include/llvm/Support/ThreadPool.h
+++ llvm/include/llvm/Support/ThreadPool.h
@@ -13,7 +13,9 @@
 #ifndef LLVM_SUPPORT_THREAD_POOL_H
 #define LLVM_SUPPORT_THREAD_POOL_H
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/Config/llvm-config.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/thread.h"
 
 #include <future>
@@ -38,12 +40,11 @@
   using TaskTy = std::function<void()>;
   using PackagedTaskTy = std::packaged_task<void()>;
 
-  /// Construct a pool with the number of threads found by
-  /// hardware_concurrency().
-  ThreadPool();
-
-  /// Construct a pool of \p ThreadCount threads
-  ThreadPool(unsigned ThreadCount);
+  /// Construct a pool using the hardware strategy \p S for mapping hardware
+  /// execution resources (threads, cores, CPUs).
+  /// Defaults to using the maximum execution resources in the system,
+  /// restricted by any affinity mask set on the process.
+  ThreadPool(ThreadPoolStrategy S = hardware_concurrency());
 
   /// Blocking destructor: the pool will wait for all the threads to complete.
   ~ThreadPool();
@@ -68,6 +69,8 @@
   /// It is an error to try to add new tasks while blocking on this call.
   void wait();
 
+  unsigned getThreadCount() const { return ThreadCount; }
+
 private:
   /// Asynchronous submission of a task to the pool. The returned future can be
   /// used to wait for the task to finish and is *non-blocking* on destruction.
@@ -94,6 +97,8 @@
   /// Signal for the destruction of the pool, asking thread to exit.
   bool EnableFlag;
 #endif
+
+  unsigned ThreadCount;
 };
 }
Index: llvm/include/llvm/Support/Threading.h
===================================================================
--- llvm/include/llvm/Support/Threading.h
+++ llvm/include/llvm/Support/Threading.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_SUPPORT_THREADING_H
 #define LLVM_SUPPORT_THREADING_H
 
+#include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/FunctionExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
@@ -143,20 +144,62 @@
 #endif
   }
 
-  /// Get the amount of currency to use for tasks requiring significant
-  /// memory or other resources. Currently based on physical cores, if
-  /// available for the host system, otherwise falls back to
-  /// thread::hardware_concurrency().
-  /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF
-  unsigned heavyweight_hardware_concurrency();
+  /// This tells how a thread pool will be used.
+  class ThreadPoolStrategy {
+  public:
+    // The default thread count means to use all available threads, excluding
+    // those outside the process' affinity mask. If set, this value only
+    // represents a suggested high bound; the runtime may choose a lower
+    // value (but not a higher one).
+    unsigned ThreadCount = 0;
+
+    // If SMT is active, use hyper-threads. If false, there will be only one
+    // std::thread per core (in that case, each thread will be pinned to a
+    // specific core).
+    bool HyperThreads = true;
+
+    // Whether each std::thread should be forced onto a specific hardware
+    // thread. Disabled by default, which leaves the OS free to re-arrange
+    // threads in the most optimal manner.
+    bool PinThreads = false;
+
+    /// Retrieves the maximal number of threads available for the current
+    /// strategy. This accounts for affinity masks and takes advantage of all
+    /// CPU sockets.
+    unsigned compute_thread_count() const;
+
+    /// Assign the current thread to an ideal hardware CPU. In a multi-socket
+    /// system, this ensures all CPU sockets are used.
+    /// \p ThreadPoolNum represents a number between 0 and the maximal number
+    /// of threads available for a CPU or a NUMA group. The ThreadCount member
+    /// above can be either 0 (auto-detect the maximum) or a fixed number of
+    /// threads to be used.
+    void apply_thread_strategy(unsigned ThreadPoolNum) const;
+  };
+
+  /// Returns a thread strategy for tasks requiring significant memory or
+  /// other resources. To be used for workloads where hardware_concurrency()
+  /// proves to be less efficient. Avoid this strategy if doing lots of I/O.
+  /// Currently based on physical cores, if available for the host system,
+  /// otherwise falls back to hardware_concurrency(). Returns 1 when LLVM is
+  /// configured with LLVM_ENABLE_THREADS=OFF.
+  inline ThreadPoolStrategy
+  heavyweight_hardware_concurrency(unsigned ThreadCount = 0) {
+    ThreadPoolStrategy S;
+    S.HyperThreads = false;
+    S.ThreadCount = ThreadCount;
+    return S;
+  }
 
   /// Get the number of threads that the current program can execute
-  /// concurrently. On some systems std::thread::hardware_concurrency() returns
-  /// the total number of cores, without taking affinity into consideration.
-  /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF.
-  /// Fallback to std::thread::hardware_concurrency() if sched_getaffinity is
-  /// not available.
-  unsigned hardware_concurrency();
+  /// concurrently. Returns the default thread strategy, where all available
+  /// hardware resources are to be used, except for those initially excluded
+  /// by an affinity mask. On some systems std::thread::hardware_concurrency()
+  /// returns the total number of cores, without taking affinity into
+  /// consideration. Returns 1 when LLVM is configured with
+  /// LLVM_ENABLE_THREADS=OFF.
+  inline ThreadPoolStrategy hardware_concurrency(unsigned ThreadCount = 0) {
+    ThreadPoolStrategy S;
+    S.ThreadCount = ThreadCount;
+    return S;
+  }
 
   /// Return the current thread id, as used in various OS system calls.
   /// Note that not all platforms guarantee that the value returned will be
@@ -184,6 +227,14 @@
   /// the operation succeeded or failed is returned.
   void get_thread_name(SmallVectorImpl<char> &Name);
 
+  /// Returns a mask that represents on which hardware thread, core, CPU, or
+  /// NUMA group the calling thread can be executed. On Windows, threads
+  /// cannot cross CPU boundaries.
+  llvm::BitVector get_thread_affinity_mask();
+
+  /// Returns how many physical CPUs or NUMA groups the system has.
+  unsigned get_cpus();
+
   enum class ThreadPriority {
     Background = 0,
     Default = 1,
Index: llvm/lib/CodeGen/ParallelCG.cpp
===================================================================
--- llvm/lib/CodeGen/ParallelCG.cpp
+++ llvm/lib/CodeGen/ParallelCG.cpp
@@ -51,7 +51,7 @@
   // Create ThreadPool in nested scope so that threads will be joined
   // on destruction.
   {
-    ThreadPool CodegenThreadPool(OSs.size());
+    ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
     int ThreadCount = 0;
 
     SplitModule(
Index: llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
===================================================================
--- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -153,7 +153,8 @@
 
   if (S.NumCompileThreads > 0) {
     CompileLayer->setCloneToNewContextOnEmit(true);
-    CompileThreads = std::make_unique<ThreadPool>(S.NumCompileThreads);
+    CompileThreads =
+        std::make_unique<ThreadPool>(hardware_concurrency(S.NumCompileThreads));
     ES->setDispatchMaterialization(
         [this](JITDylib &JD, std::unique_ptr<MaterializationUnit> MU) {
           // FIXME: Switch to move capture once we have c++14.
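For reference, the three construction patterns this patch migrates callers to, as a short sketch (illustrative only, not part of the patch):

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
using namespace llvm;

void sketch() {
  // All hardware threads allowed by the process affinity mask (the default).
  ThreadPool Default;
  // An explicit cap: at most 4 threads, fewer if the host has fewer.
  ThreadPool Capped(hardware_concurrency(4));
  // One thread per physical core, for memory-bound work such as ThinLTO.
  ThreadPool Heavy(heavyweight_hardware_concurrency());
  Default.wait();
  Capped.wait();
  Heavy.wait();
}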
Index: llvm/lib/LTO/LTO.cpp
===================================================================
--- llvm/lib/LTO/LTO.cpp
+++ llvm/lib/LTO/LTO.cpp
@@ -475,8 +475,7 @@
 LTO::ThinLTOState::ThinLTOState(ThinBackend Backend)
     : Backend(Backend), CombinedIndex(/*HaveGVs*/ false) {
   if (!Backend)
-    this->Backend =
-        createInProcessThinBackend(llvm::heavyweight_hardware_concurrency());
+    this->Backend = createInProcessThinBackend();
 }
 
 LTO::LTO(Config Conf, ThinBackend Backend,
@@ -1067,7 +1066,8 @@
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
-        BackendThreadPool(ThinLTOParallelismLevel),
+        BackendThreadPool(
+            heavyweight_hardware_concurrency(ThinLTOParallelismLevel)),
         AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
       CfiFunctionDefs.insert(
Index: llvm/lib/LTO/LTOBackend.cpp
===================================================================
--- llvm/lib/LTO/LTOBackend.cpp
+++ llvm/lib/LTO/LTOBackend.cpp
@@ -375,7 +375,8 @@
 void splitCodeGen(Config &C, TargetMachine *TM, AddStreamFn AddStream,
                   unsigned ParallelCodeGenParallelismLevel,
                   std::unique_ptr<Module> Mod) {
-  ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel);
+  ThreadPool CodegenThreadPool(
+      heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel));
   unsigned ThreadCount = 0;
   const Target *T = &TM->getTarget();
Index: llvm/lib/LTO/ThinLTOCodeGenerator.cpp
===================================================================
--- llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -81,8 +81,7 @@
 
 namespace {
 
-static cl::opt<int>
-    ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
+static cl::opt<int> ThreadCount("threads", cl::init(0));
 
 // Simple helper to save temporary files for debug.
 static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
@@ -1037,7 +1036,7 @@
 
   // Parallel optimizer + codegen
   {
-    ThreadPool Pool(ThreadCount);
+    ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
     for (auto IndexCount : ModulesOrdering) {
       auto &Mod = Modules[IndexCount];
       Pool.async([&](int count) {
Index: llvm/lib/Support/Host.cpp
===================================================================
--- llvm/lib/Support/Host.cpp
+++ llvm/lib/Support/Host.cpp
@@ -1266,7 +1266,7 @@
 // On Linux, the number of physical cores can be computed from /proc/cpuinfo,
 // using the number of unique physical/core id pairs. The following
 // implementation reads the /proc/cpuinfo format on an x86_64 system.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
   // Read /proc/cpuinfo as a stream (until EOF reached). It cannot be
   // mmapped because it appears to have 0 size.
   llvm::ErrorOr<std::unique_ptr<MemoryBuffer>> Text =
@@ -1312,7 +1312,7 @@
 #include <sys/sysctl.h>
 
 // Gets the number of *physical cores* on the machine.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
   uint32_t count;
   size_t len = sizeof(count);
   sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0);
@@ -1326,6 +1326,9 @@
   }
   return count;
 }
+#elif defined(_WIN32)
+// Defined in llvm/lib/Support/Windows/Threading.inc
+int computeHostNumPhysicalCores();
 #else
 // On other systems, return -1 to indicate unknown.
 static int computeHostNumPhysicalCores() { return -1; }
Index: llvm/lib/Support/Parallel.cpp
===================================================================
--- llvm/lib/Support/Parallel.cpp
+++ llvm/lib/Support/Parallel.cpp
@@ -36,13 +36,16 @@
 /// in filo order.
 class ThreadPoolExecutor : public Executor {
 public:
-  explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency())
-      : Done(ThreadCount) {
+  explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency())
+      : Strategy(S), Done(S.compute_thread_count()) {
     // Spawn all but one of the threads in another thread as spawning threads
     // can take a while.
-    std::thread([&, ThreadCount] {
-      for (size_t i = 1; i < ThreadCount; ++i) {
-        std::thread([=] { work(); }).detach();
+    std::thread([&] {
+      for (size_t I = 1; I < Strategy.compute_thread_count(); ++I) {
+        std::thread([=] {
+          Strategy.apply_thread_strategy(I);
+          work();
+        }).detach();
       }
       work();
     }).detach();
@@ -82,6 +85,7 @@
   std::stack<std::function<void()>> WorkStack;
   std::mutex Mutex;
   std::condition_variable Cond;
+  ThreadPoolStrategy Strategy;
   parallel::detail::Latch Done;
 };
Index: llvm/lib/Support/ThreadPool.cpp
===================================================================
--- llvm/lib/Support/ThreadPool.cpp
+++ llvm/lib/Support/ThreadPool.cpp
@@ -20,16 +20,14 @@
 
 #if LLVM_ENABLE_THREADS
 
-// Default to hardware_concurrency
-ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {}
-
-ThreadPool::ThreadPool(unsigned ThreadCount)
-    : ActiveThreads(0), EnableFlag(true) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+    : ActiveThreads(0), EnableFlag(true),
+      ThreadCount(S.compute_thread_count()) {
   // Create ThreadCount threads that will loop forever, wait on QueueCondition
   // for tasks to be queued or the Pool to be destroyed.
   Threads.reserve(ThreadCount);
   for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) {
-    Threads.emplace_back([&] {
+    Threads.emplace_back([S, ThreadID, this] {
+      S.apply_thread_strategy(ThreadID);
       while (true) {
         PackagedTaskTy Task;
         {
@@ -108,12 +106,10 @@
 
 #else // LLVM_ENABLE_THREADS Disabled
 
-ThreadPool::ThreadPool() : ThreadPool(0) {}
-
 // No threads are launched, issue a warning if ThreadCount is not 0
-ThreadPool::ThreadPool(unsigned ThreadCount)
-    : ActiveThreads(0) {
-  if (ThreadCount) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+    : ActiveThreads(0), ThreadCount(S.compute_thread_count()) {
+  if (ThreadCount != 1) {
     errs() << "Warning: request a ThreadPool with " << ThreadCount
            << " threads, but LLVM_ENABLE_THREADS has been turned off\n";
   }
@@ -138,8 +134,6 @@
   return Future;
 }
 
-ThreadPool::~ThreadPool() {
-  wait();
-}
+ThreadPool::~ThreadPool() { wait(); }
 
 #endif
Index: llvm/lib/Support/Threading.cpp
===================================================================
--- llvm/lib/Support/Threading.cpp
+++ llvm/lib/Support/Threading.cpp
@@ -45,10 +45,6 @@
   Fn(UserData);
 }
 
-unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-
-unsigned llvm::hardware_concurrency() { return 1; }
-
 uint64_t llvm::get_threadid() { return 0; }
 
 uint32_t llvm::get_max_thread_name_length() { return 0; }
@@ -57,6 +53,14 @@
 
 void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
 
+llvm::BitVector llvm::get_thread_affinity_mask() { return {}; }
+
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  // When threads are disabled, always return 1 to ensure clients will loop
+  // at least once.
+  return 1;
+}
+
 #if LLVM_ENABLE_THREADS == 0
 void llvm::llvm_execute_on_thread_async(
     llvm::unique_function<void()> Func,
@@ -78,30 +82,20 @@
 
 #else
 
-#include <thread>
-unsigned llvm::heavyweight_hardware_concurrency() {
-  // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
-  // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
-  // allows us to not define `thread` in the llvm namespace, which conflicts
-  // with some platforms such as FreeBSD whose headers also define a struct
-  // called `thread` in the global namespace which can cause ambiguity due to
-  // ADL.
-  int NumPhysical = sys::getHostNumPhysicalCores();
-  if (NumPhysical == -1)
-    return std::thread::hardware_concurrency();
-  return NumPhysical;
-}
+int computeHostNumPhysicalCores();
+int computeHostNumPhysicalThreads();
 
-unsigned llvm::hardware_concurrency() {
-#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
-  cpu_set_t Set;
-  if (sched_getaffinity(0, sizeof(Set), &Set))
-    return CPU_COUNT(&Set);
-#endif
-  // Guard against std::thread::hardware_concurrency() returning 0.
-  if (unsigned Val = std::thread::hardware_concurrency())
-    return Val;
-  return 1;
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+  // Either detection helper may report failure (0 or -1).
+  int MaxThreadCount = HyperThreads ? computeHostNumPhysicalThreads()
+                                    : computeHostNumPhysicalCores();
+  if (MaxThreadCount <= 0)
+    MaxThreadCount = 1;
+
+  // No need to create more threads than there are hardware threads; it would
+  // uselessly induce more context-switching and cache eviction.
+  if (!ThreadCount || ThreadCount > (unsigned)MaxThreadCount)
+    return MaxThreadCount;
+  return ThreadCount;
 }
 
 namespace {
Index: llvm/lib/Support/Unix/Threading.inc
===================================================================
--- llvm/lib/Support/Unix/Threading.inc
+++ llvm/lib/Support/Unix/Threading.inc
@@ -267,3 +267,26 @@
 #endif
   return SetThreadPriorityResult::FAILURE;
 }
+
+#include <thread>
+
+int computeHostNumPhysicalThreads() {
+#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
+  cpu_set_t Set;
+  // sched_getaffinity() returns 0 on success.
+  if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
+    return CPU_COUNT(&Set);
+#endif
+  // Guard against std::thread::hardware_concurrency() returning 0.
+  if (unsigned Val = std::thread::hardware_concurrency())
+    return Val;
+  return 1;
+}
+
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+  llvm_unreachable("Not implemented!");
+}
+
+unsigned llvm::get_cpus() { return 1; }
Index: llvm/lib/Support/Windows/Threading.inc
===================================================================
--- llvm/lib/Support/Windows/Threading.inc
+++ llvm/lib/Support/Windows/Threading.inc
@@ -16,6 +16,8 @@
 #include "WindowsSupport.h"
 #include <processthreadsapi.h>
 
+#include <bitset>
+
 // Windows will at times define MemoryFence.
 #ifdef MemoryFence
 #undef MemoryFence
 #endif
@@ -122,3 +124,197 @@
              ? SetThreadPriorityResult::SUCCESS
             : SetThreadPriorityResult::FAILURE;
 }
+
+struct ProcessorGroup {
+  unsigned ID;
+  unsigned AllThreads;
+  unsigned UsableThreads;
+  unsigned ThreadsPerCore;
+  uint64_t Affinity;
+};
+
+template <typename F>
+static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship,
+                            F Fn) {
+  DWORD Len = 0;
+  BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
+  if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+    return false;
+  }
+  auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
+  R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
+  if (R) {
+    auto *End =
+        (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
+    for (auto *Curr = Info; Curr < End;
+         Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
+                                                            Curr->Size)) {
+      if (Curr->Relationship != Relationship)
+        continue;
+      Fn(Curr);
+    }
+  }
+  free(Info);
+  return true;
+}
+
+static ArrayRef<ProcessorGroup> computeProcessorGroups() {
+  auto computeGroups = []() {
+    SmallVector<ProcessorGroup, 4> Groups;
+
+    auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+      GROUP_RELATIONSHIP &El = ProcInfo->Group;
+      for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
+        ProcessorGroup G;
+        G.ID = Groups.size();
+        G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
+        G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
+        assert(G.UsableThreads <= 64);
+        G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
+        Groups.push_back(G);
+      }
+    };
+
+    if (!IterateProcInfo(RelationGroup, HandleGroup))
+      return std::vector<ProcessorGroup>();
+
+    auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+      PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
+      assert(El.GroupCount == 1);
+      unsigned HyperThreads = 1;
+      // If the flag is set, each core supports more than one hyper-thread.
+      if (El.Flags & LTP_PC_SMT)
+        HyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
+      unsigned I = El.GroupMask[0].Group;
+      Groups[I].ThreadsPerCore = HyperThreads;
+    };
+
+    if (!IterateProcInfo(RelationProcessorCore, HandleProc))
+      return std::vector<ProcessorGroup>();
+
+    // If there's an affinity mask set on one of the CPUs, then assume the user
+    // wants to constrain the current process to only a single CPU.
+    for (auto &G : Groups) {
+      if (G.UsableThreads != G.AllThreads) {
+        ProcessorGroup NewG{G};
+        Groups.clear();
+        Groups.push_back(NewG);
+        break;
+      }
+    }
+
+    return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
+  };
+  static auto Groups = computeGroups();
+  return ArrayRef<ProcessorGroup>(Groups);
+}
+
+template <typename R, typename UnaryPredicate>
+unsigned aggregate(R &&Range, UnaryPredicate P) {
+  unsigned I{};
+  for (const auto &It : Range)
+    I += P(It);
+  return I;
+}
+
+// for sys::getHostNumPhysicalCores
+int computeHostNumPhysicalCores() {
+  static unsigned Cores =
+      aggregate(computeProcessorGroups(), [](const ProcessorGroup &G) {
+        return G.UsableThreads / G.ThreadsPerCore;
+      });
+  return Cores;
+}
+
+int computeHostNumPhysicalThreads() {
+  static unsigned Threads =
+      aggregate(computeProcessorGroups(),
+                [](const ProcessorGroup &G) { return G.UsableThreads; });
+  return Threads;
+}
+
+// Generate a mask that pins a thread to a single hyper-thread, or to a core.
+// 'Affinity' is the mask of allowed hyper-threads; 'Index' tells which of the
+// allowed thread(s) in the Affinity mask to use; 'ThreadsMask' is 1 to pin to
+// one hyper-thread, or wider to pin to a group of hyper-threads (a core).
+static uint64_t gen_mask(uint64_t Affinity, unsigned Index,
+                         unsigned ThreadsMask) {
+  assert(Index < 64);
+  unsigned Count = 0;
+  for (unsigned I = 0; I < 64; ++I) {
+    if ((Affinity >> I) & 1)
+      ++Count;
+    // Shift the mask to the bit position of the Index-th allowed thread.
+    if (Index == Count - 1)
+      return (uint64_t)ThreadsMask << I;
+  }
+  // Don't know how to set the mask, just use the default one.
+  return Affinity;
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+    unsigned ThreadPoolNum) const {
+  ArrayRef<ProcessorGroup> Groups = computeProcessorGroups();
+
+  assert(ThreadPoolNum < compute_thread_count() &&
+         "The thread index is not within thread strategy's range!");
+
+  // In this mode, ThreadPoolNum represents the core number, not the
+  // hyper-thread number. Assumes all NUMA groups have the same amount of
+  // hyper-threads.
+  if (!HyperThreads)
+    ThreadPoolNum *= Groups[0].ThreadsPerCore;
+
+  unsigned ThreadRangeStart = 0;
+  for (unsigned I = 0; I < Groups.size(); ++I) {
+    const ProcessorGroup &G = Groups[I];
+    if (ThreadPoolNum >= ThreadRangeStart &&
+        ThreadPoolNum < ThreadRangeStart + G.UsableThreads) {
+
+      GROUP_AFFINITY Affinity{};
+      Affinity.Group = G.ID;
+
+      if (!HyperThreads) {
+        // This effectively pins the thread, alone, to a given core; however
+        // the thread could still be migrated between the hyper-threads within
+        // that core.
+        unsigned HyperThreadsMask = (1 << G.ThreadsPerCore) - 1;
+        Affinity.Mask = gen_mask(G.Affinity, (ThreadPoolNum - ThreadRangeStart),
+                                 HyperThreadsMask);
+      } else {
+        if (PinThreads) {
+          // Pin the thread to a given hyper-thread.
+          Affinity.Mask =
+              gen_mask(G.Affinity, ThreadPoolNum - ThreadRangeStart, 1);
+        } else {
+          // Use all available hyper-threads for that CPU socket or NUMA group.
+          Affinity.Mask = G.Affinity;
+        }
+      }
+      SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+    }
+    ThreadRangeStart += G.UsableThreads;
+  }
+}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+  GROUP_AFFINITY Affinity{};
+  GetThreadGroupAffinity(GetCurrentThread(), &Affinity);
+
+  static unsigned All =
+      aggregate(computeProcessorGroups(),
+                [](const ProcessorGroup &G) { return G.AllThreads; });
+
+  unsigned StartOffset =
+      aggregate(computeProcessorGroups(), [&](const ProcessorGroup &G) {
+        return G.ID < Affinity.Group ? G.AllThreads : 0;
+      });
+
+  llvm::BitVector V;
+  V.resize(All);
+  for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
+    if ((Affinity.Mask >> I) & 1)
+      V.set(StartOffset + I);
+  }
+  return V;
+}
+
+unsigned llvm::get_cpus() { return computeProcessorGroups().size(); }
Index: llvm/tools/dsymutil/DwarfLinker.cpp
===================================================================
--- llvm/tools/dsymutil/DwarfLinker.cpp
+++ llvm/tools/dsymutil/DwarfLinker.cpp
@@ -3018,7 +3018,7 @@
   // errors from unchecked llvm::Error objects.
   Error RemarkLinkAllError = Error::success();
 
-  ThreadPool pool(3);
+  ThreadPool pool(hardware_concurrency(3));
   pool.async(AnalyzeAll);
   pool.async(CloneAll);
   pool.async(RemarkLinkAll, std::ref(RemarkLinkAllError));
Index: llvm/tools/dsymutil/dsymutil.cpp
===================================================================
--- llvm/tools/dsymutil/dsymutil.cpp
+++ llvm/tools/dsymutil/dsymutil.cpp
@@ -258,7 +258,7 @@
   if (opt::Arg *NumThreads = Args.getLastArg(OPT_threads))
     Options.LinkOpts.Threads = atoi(NumThreads->getValue());
   else
-    Options.LinkOpts.Threads = thread::hardware_concurrency();
+    Options.LinkOpts.Threads = 0; // Use all available hardware threads.
 
   if (Options.DumpDebugMap || Options.LinkOpts.Verbose)
     Options.LinkOpts.Threads = 1;
@@ -540,9 +540,10 @@
   // Shared a single binary holder for all the link steps.
   BinaryHolder BinHolder;
 
-  unsigned ThreadCount =
-      std::min(Options.LinkOpts.Threads, DebugMapPtrsOrErr->size());
-  ThreadPool Threads(ThreadCount);
+  unsigned ThreadCount = Options.LinkOpts.Threads;
+  if (!ThreadCount)
+    ThreadCount = DebugMapPtrsOrErr->size();
+  ThreadPool Threads(hardware_concurrency(ThreadCount));
 
   // If there is more than one link to execute, we need to generate
   // temporary files.
Index: llvm/tools/gold/gold-plugin.cpp
===================================================================
--- llvm/tools/gold/gold-plugin.cpp
+++ llvm/tools/gold/gold-plugin.cpp
@@ -134,8 +134,8 @@
   static unsigned OptLevel = 2;
   // Default parallelism of 0 used to indicate that user did not specify.
   // Actual parallelism default value depends on implementation.
-  // Currently only affects ThinLTO, where the default is
-  // llvm::heavyweight_hardware_concurrency.
+  // Currently only affects ThinLTO, where the default is the number of
+  // physical cores in the system.
   static unsigned Parallelism = 0;
   // Default regular LTO codegen parallelism (number of partitions).
   static unsigned ParallelCodeGenParallelismLevel = 1;
Index: llvm/tools/llvm-cov/CodeCoverage.cpp
===================================================================
--- llvm/tools/llvm-cov/CodeCoverage.cpp
+++ llvm/tools/llvm-cov/CodeCoverage.cpp
@@ -944,9 +944,7 @@
 
   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                              unsigned(SourceFiles.size())));
+    NumThreads = SourceFiles.size();
 
   if (!ViewOpts.hasOutputDirectory() || NumThreads == 1) {
     for (const std::string &SourceFile : SourceFiles)
@@ -954,7 +952,7 @@
                          ShowFilenames);
   } else {
     // In -output-dir mode, it's safe to use multiple threads to print files.
-    ThreadPool Pool(NumThreads);
+    ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
     for (const std::string &SourceFile : SourceFiles)
       Pool.async(&CodeCoverageTool::writeSourceFileView, this, SourceFile,
                  Coverage.get(), Printer.get(), ShowFilenames);
Index: llvm/tools/llvm-cov/CoverageExporterJson.cpp
===================================================================
--- llvm/tools/llvm-cov/CoverageExporterJson.cpp
+++ llvm/tools/llvm-cov/CoverageExporterJson.cpp
@@ -163,11 +163,9 @@
                       ArrayRef<FileCoverageSummary> FileReports,
                       const CoverageViewOptions &Options) {
   auto NumThreads = Options.NumThreads;
-  if (NumThreads == 0) {
-    NumThreads = std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                                       unsigned(SourceFiles.size())));
-  }
-  ThreadPool Pool(NumThreads);
+  if (NumThreads == 0)
+    NumThreads = SourceFiles.size();
+  ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
   json::Array FileArray;
   std::mutex FileArrayMutex;
Index: llvm/tools/llvm-cov/CoverageReport.cpp
===================================================================
--- llvm/tools/llvm-cov/CoverageReport.cpp
+++ llvm/tools/llvm-cov/CoverageReport.cpp
@@ -356,11 +356,8 @@
 
   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::max(1U, std::min(llvm::heavyweight_hardware_concurrency(),
-                              unsigned(Files.size())));
-
-  ThreadPool Pool(NumThreads);
+    NumThreads = Files.size();
+  ThreadPool Pool(heavyweight_hardware_concurrency(NumThreads));
 
   std::vector<FileCoverageSummary> FileReports;
   FileReports.reserve(Files.size());
Index: llvm/tools/llvm-lto2/llvm-lto2.cpp
===================================================================
--- llvm/tools/llvm-lto2/llvm-lto2.cpp
+++ llvm/tools/llvm-lto2/llvm-lto2.cpp
@@ -65,8 +65,7 @@
                    "import files for the "
                    "distributed backend case"));
 
-static cl::opt<int> Threads("thinlto-threads",
-                            cl::init(llvm::heavyweight_hardware_concurrency()));
+static cl::opt<int> Threads("thinlto-threads", cl::init(0));
 
 static cl::list<std::string> SymbolResolutions(
     "r",
Index: llvm/tools/llvm-profdata/llvm-profdata.cpp
===================================================================
--- llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -307,8 +307,11 @@
 
   // If NumThreads is not specified, auto-detect a good default.
   if (NumThreads == 0)
-    NumThreads =
-        std::min(hardware_concurrency(), unsigned((Inputs.size() + 1) / 2));
+    NumThreads = std::min(hardware_concurrency().compute_thread_count(),
+                          unsigned((Inputs.size() + 1) / 2));
+  // FIXME: There's a bug here, where setting NumThreads = Inputs.size() fails
+  // the merge_empty_profile.test because the InstrProfWriter.ProfileKind isn't
+  // merged, thus the emitted file ends up with a PF_Unknown kind.
 
   // Initialize the writer contexts.
   SmallVector<std::unique_ptr<WriterContext>, 4> Contexts;
@@ -320,7 +323,7 @@
     for (const auto &Input : Inputs)
       loadInput(Input, Remapper, Contexts[0].get());
   } else {
-    ThreadPool Pool(NumThreads);
+    ThreadPool Pool(hardware_concurrency(NumThreads));
 
     // Load the inputs in parallel (N/NumThreads serial steps).
     unsigned Ctx = 0;
Index: llvm/unittests/ADT/BitVectorTest.cpp
===================================================================
--- llvm/unittests/ADT/BitVectorTest.cpp
+++ llvm/unittests/ADT/BitVectorTest.cpp
@@ -1149,4 +1149,24 @@
   EXPECT_EQ(213U, Vec.size());
   EXPECT_EQ(102U, Vec.count());
 }
+
+TYPED_TEST(BitVectorTest, Compare) {
+  TypeParam A(10, false);
+  TypeParam B(5, false);
+  EXPECT_EQ(A, B);
+
+  B.flip();
+  EXPECT_LE(A, B);
+  EXPECT_GT(B, A);
+  EXPECT_NE(B, A);
+
+  A.flip();
+  EXPECT_LE(B, A);
+  EXPECT_GT(A, B);
+  EXPECT_NE(A, B);
+
+  A.reset();
+  A.set(0, 5);
+  EXPECT_EQ(A, B);
+}
 }
Index: llvm/unittests/Support/Host.cpp
===================================================================
--- llvm/unittests/Support/Host.cpp
+++ llvm/unittests/Support/Host.cpp
@@ -37,7 +37,8 @@
     // Initially this is only testing detection of the number of
     // physical cores, which is currently only supported/tested for
     // x86_64 Linux and Darwin.
-    return (Host.getArch() == Triple::x86_64 &&
+    return Host.isOSWindows() ||
+           (Host.getArch() == Triple::x86_64 &&
             (Host.isOSDarwin() || Host.getOS() == Triple::Linux));
   }
Index: llvm/unittests/Support/TaskQueueTest.cpp
===================================================================
--- llvm/unittests/Support/TaskQueueTest.cpp
+++ llvm/unittests/Support/TaskQueueTest.cpp
@@ -22,7 +22,7 @@
 };
 
 TEST_F(TaskQueueTest, OrderedFutures) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::atomic<int> X{ 0 };
   std::atomic<int> Y{ 0 };
@@ -66,7 +66,7 @@
 }
 
 TEST_F(TaskQueueTest, UnOrderedFutures) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::atomic<int> X{ 0 };
   std::atomic<int> Y{ 0 };
@@ -96,7 +96,7 @@
 }
 
 TEST_F(TaskQueueTest, FutureWithReturnValue) {
-  ThreadPool TP(1);
+  ThreadPool TP(hardware_concurrency(1));
   TaskQueue TQ(TP);
   std::future<std::string> F1 = TQ.async([&] { return std::string("Hello"); });
  std::future<int> F2 = TQ.async([&] { return 42; });
Index: llvm/unittests/Support/ThreadPool.cpp
===================================================================
--- llvm/unittests/Support/ThreadPool.cpp
+++ llvm/unittests/Support/ThreadPool.cpp
@@ -8,11 +8,13 @@
 
 #include "llvm/Support/ThreadPool.h"
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/Host.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/Threading.h"
 
 #include "gtest/gtest.h"
 
@@ -69,6 +71,8 @@
 
   void SetUp() override { MainThreadReady = false; }
 
+  void TestAllThreads(ThreadPoolStrategy S);
+
   std::condition_variable WaitMainThread;
   std::mutex WaitMainThreadMutex;
   bool MainThreadReady = false;
@@ -131,7 +135,7 @@
 
 TEST_F(ThreadPoolTest, GetFuture) {
   CHECK_UNSUPPORTED();
-  ThreadPool Pool{2};
+  ThreadPool Pool(hardware_concurrency(2));
   std::atomic_int i{0};
   Pool.async([this, &i] {
     waitForMainThread();
@@ -162,3 +166,56 @@
   }
   ASSERT_EQ(5, checked_in);
 }
+
+#if LLVM_ENABLE_THREADS == 1
+
+void ThreadPoolTest::TestAllThreads(ThreadPoolStrategy S) {
+  // FIXME: Skip these tests on non-Windows, because multi-CPU sockets weren't
+  // tested there, and llvm::get_thread_affinity_mask() isn't implemented for
+  // all flavors of Unix.
+  Triple Host(Triple::normalize(sys::getProcessTriple()));
+  if (!Host.isOSWindows())
+    return;
+
+  llvm::DenseSet<llvm::BitVector> ThreadsUsed;
+  std::mutex Lock;
+  unsigned Threads = 0;
+  {
+    ThreadPool Pool(S);
+    Threads = Pool.getThreadCount();
+    for (size_t I = 0; I < 10000; ++I) {
+      Pool.async([&] {
+        waitForMainThread();
+        std::lock_guard<std::mutex> Guard(Lock);
+        auto Mask = llvm::get_thread_affinity_mask();
+        ThreadsUsed.insert(Mask);
+      });
+    }
+    ASSERT_EQ(true, ThreadsUsed.empty());
+    setMainThreadReady();
+  }
+  if (!S.PinThreads && S.HyperThreads)
+    ASSERT_EQ(llvm::get_cpus(), ThreadsUsed.size());
+  else
+    ASSERT_EQ(Threads, ThreadsUsed.size());
+}
+
+TEST_F(ThreadPoolTest, AllThreads_UseAllResources) {
+  CHECK_UNSUPPORTED();
+  TestAllThreads({});
+}
+
+TEST_F(ThreadPoolTest, AllThreads_PinThreads) {
+  CHECK_UNSUPPORTED();
+  ThreadPoolStrategy S;
+  S.PinThreads = true;
+  TestAllThreads(S);
+}
+
+TEST_F(ThreadPoolTest, AllThreads_OneThreadPerCore) {
+  CHECK_UNSUPPORTED();
+  TestAllThreads(llvm::heavyweight_hardware_concurrency());
+}
+
+#endif
Index: llvm/unittests/Support/Threading.cpp
===================================================================
--- llvm/unittests/Support/Threading.cpp
+++ llvm/unittests/Support/Threading.cpp
@@ -21,7 +21,8 @@
   auto Num = heavyweight_hardware_concurrency();
   // Since Num is unsigned this will also catch us trying to
   // return -1.
-  ASSERT_LE(Num, thread::hardware_concurrency());
+  ASSERT_LE(Num.compute_thread_count(),
+            hardware_concurrency().compute_thread_count());
 }
 
 #if LLVM_ENABLE_THREADS
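The BitVector changes earlier in this patch exist to support the test above: affinity masks are deduplicated in a DenseSet, which requires the new DenseMapInfo<BitVector> specialization (empty/tombstone keys plus hashing over size and data) and the new comparison operators. A standalone sketch of that interaction (illustrative only, not part of the patch):

#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseSet.h"

static unsigned countDistinctMasks() {
  llvm::DenseSet<llvm::BitVector> Masks;
  llvm::BitVector A(64), B(64);
  A.set(0);
  B.set(0);
  Masks.insert(A);
  Masks.insert(B); // Equal contents: same hash, compares equal, not re-added.
  return Masks.size(); // 1
}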