Index: llvm/include/llvm/Support/ThreadPool.h
===================================================================
--- llvm/include/llvm/Support/ThreadPool.h
+++ llvm/include/llvm/Support/ThreadPool.h
@@ -13,6 +13,7 @@
 #ifndef LLVM_SUPPORT_THREADPOOL_H
 #define LLVM_SUPPORT_THREADPOOL_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Threading.h"
 #include "llvm/Support/thread.h"
@@ -20,14 +21,16 @@
 #include <future>
 
 #include <condition_variable>
+#include <deque>
 #include <functional>
 #include <memory>
 #include <mutex>
-#include <queue>
 #include <utility>
 
 namespace llvm {
 
+class ThreadPoolTaskGroup;
+
 /// A ThreadPool for asynchronous parallel execution on a defined number of
 /// threads.
 ///
@@ -53,17 +56,41 @@
     return async(std::move(Task));
   }
 
+  /// Overload, task will be in the given task group.
+  template <typename Function, typename... Args>
+  inline auto async(ThreadPoolTaskGroup &Group, Function &&F,
+                    Args &&...ArgList) {
+    auto Task =
+        std::bind(std::forward<Function>(F), std::forward<Args>(ArgList)...);
+    return async(Group, std::move(Task));
+  }
+
   /// Asynchronous submission of a task to the pool. The returned future can be
   /// used to wait for the task to finish and is *non-blocking* on destruction.
   template <typename Func>
   auto async(Func &&F) -> std::shared_future<decltype(F())> {
-    return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)));
+    return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)),
+                     nullptr);
+  }
+
+  template <typename Func>
+  auto async(ThreadPoolTaskGroup &Group, Func &&F)
+      -> std::shared_future<decltype(F())> {
+    return asyncImpl(std::function<decltype(F())()>(std::forward<Func>(F)),
+                     &Group);
   }
 
   /// Blocking wait for all the threads to complete and the queue to be empty.
   /// It is an error to try to add new tasks while blocking on this call.
   void wait();
 
+  /// Blocking wait for only all the threads in the given group to complete.
+  /// It is possible to recursively wait even inside a task, if the group is
+  /// different. There may be only one active wait() call for a given group.
+  /// It is possible to add new tasks while blocking on this call, if those
+  /// tasks are from a different group.
+  void wait(ThreadPoolTaskGroup &Group);
+
   // TODO: misleading legacy name warning!
   // Returns the maximum number of worker threads in the pool, not the current
   // number of threads!
@@ -73,6 +100,9 @@
   bool isWorkerThread() const;
 
 private:
+  typedef std::deque<std::pair<std::function<void()>, ThreadPoolTaskGroup *>>
+      TaskQueue;
+
   /// Helpers to create a promise and a callable wrapper of \p Task that sets
   /// the result of the promise. Returns the callable and a future to access the
   /// result.
@@ -98,12 +128,15 @@
             std::move(F)};
   }
 
-  bool workCompletedUnlocked() { return !ActiveThreads && Tasks.empty(); }
+  /// Returns true if all tasks in the given group have finished (nullptr means
+  /// all tasks regardless of their group).
+  bool workCompletedUnlocked(ThreadPoolTaskGroup *Group) const;
 
   /// Asynchronous submission of a task to the pool. The returned future can be
   /// used to wait for the task to finish and is *non-blocking* on destruction.
   template <typename ResTy>
-  std::shared_future<ResTy> asyncImpl(std::function<ResTy()> Task) {
+  std::shared_future<ResTy> asyncImpl(std::function<ResTy()> Task,
+                                      ThreadPoolTaskGroup *Group) {
 
 #if LLVM_ENABLE_THREADS
     /// Wrap the Task in a std::function<void()> that sets the result of the
@@ -117,7 +150,7 @@
 
       // Don't allow enqueueing after disabling the pool
       assert(EnableFlag && "Queuing a thread during ThreadPool destruction");
-      Tasks.push(std::move(R.first));
+      Tasks.push_back(std::make_pair(std::move(R.first), Group));
       requestedThreads = ActiveThreads + Tasks.size();
     }
     QueueCondition.notify_one();
@@ -130,7 +163,7 @@
     auto Future = std::async(std::launch::deferred, std::move(Task)).share();
     // Wrap the future so that both ThreadPool::wait() can operate and the
     // returned future can be sync'ed on.
-    Tasks.push([Future]() { Future.get(); });
+    Tasks.push_back(std::make_pair([Future]() { Future.get(); }, Group));
     return Future;
 #endif
   }
@@ -139,6 +172,8 @@
   // Grow to ensure that we have at least `requested` Threads, but do not go
   // over MaxThreadCount.
   void grow(int requested);
+
+  void processTasks(ThreadPoolTaskGroup *WaitingForGroup);
 #endif
 
   /// Threads in flight
@@ -147,17 +182,19 @@
   mutable std::mutex ThreadsLock;
 
   /// Tasks waiting for execution in the pool.
-  std::queue<std::function<void()>> Tasks;
+  TaskQueue Tasks;
 
   /// Locking and signaling for accessing the Tasks queue.
   std::mutex QueueLock;
   std::condition_variable QueueCondition;
 
-  /// Signaling for job completion
+  /// Signaling for job completion (all tasks or all tasks in a group).
   std::condition_variable CompletionCondition;
 
   /// Keep track of the number of thread actually busy
   unsigned ActiveThreads = 0;
+  /// Number of threads active for tasks in the given group (only non-zero).
+  DenseMap<ThreadPoolTaskGroup *, unsigned> ActiveGroups;
 
 #if LLVM_ENABLE_THREADS // avoids warning for unused variable
   /// Signal for the destruction of the pool, asking thread to exit.
@@ -169,6 +206,29 @@
   /// Maximum number of threads to potentially grow this pool to.
   const unsigned MaxThreadCount;
 };
-}
+
+/// A group of tasks to be run on a thread pool. Thread pool tasks in different
+/// groups can run on the same threadpool but can be waited for separately.
+class ThreadPoolTaskGroup {
+public:
+  /// The ThreadPool argument is the thread pool to forward calls to.
+  ThreadPoolTaskGroup(ThreadPool &Pool) : Pool(Pool) {}
+  /// Blocking destructor: will wait for all the tasks in the group to complete
+  /// by calling ThreadPool::wait().
+  ~ThreadPoolTaskGroup() { wait(); }
+  /// Calls ThreadPool::async() for this group.
+  template <typename Function, typename... Args>
+  inline auto async(Function &&F, Args &&...ArgList) {
+    return Pool.async(*this, std::forward<Function>(F),
+                      std::forward<Args>(ArgList)...);
+  }
+  /// Calls ThreadPool::wait() for this group.
+  void wait() { Pool.wait(*this); }
+
+private:
+  ThreadPool &Pool;
+};
+
+} // namespace llvm
 
 #endif // LLVM_SUPPORT_THREADPOOL_H
Index: llvm/lib/Support/ThreadPool.cpp
===================================================================
--- llvm/lib/Support/ThreadPool.cpp
+++ llvm/lib/Support/ThreadPool.cpp
@@ -24,6 +24,13 @@
 
 #if LLVM_ENABLE_THREADS
 
+// A note on thread groups: Tasks are by default in no group (represented
+// by nullptr ThreadPoolTaskGroup pointer in the Tasks queue) and functionality
+// here normally works on all tasks regardless of their group (functions
+// in that case receive nullptr ThreadPoolTaskGroup pointer as argument).
+// A task in a group has a pointer to that ThreadPoolTaskGroup in the Tasks queue,
+// and functions called to work only on tasks from one group take that pointer.
+
 ThreadPool::ThreadPool(ThreadPoolStrategy S)
     : Strategy(S), MaxThreadCount(S.compute_thread_count()) {}
 
@@ -36,48 +43,99 @@
     int ThreadID = Threads.size();
     Threads.emplace_back([this, ThreadID] {
       Strategy.apply_thread_strategy(ThreadID);
-      while (true) {
-        std::function<void()> Task;
-        {
-          std::unique_lock<std::mutex> LockGuard(QueueLock);
-          // Wait for tasks to be pushed in the queue
-          QueueCondition.wait(LockGuard,
-                              [&] { return !EnableFlag || !Tasks.empty(); });
-          // Exit condition
-          if (!EnableFlag && Tasks.empty())
-            return;
-          // Yeah, we have a task, grab it and release the lock on the queue
-
-          // We first need to signal that we are active before popping the queue
-          // in order for wait() to properly detect that even if the queue is
-          // empty, there is still a task in flight.
-          ++ActiveThreads;
-          Task = std::move(Tasks.front());
-          Tasks.pop();
-        }
-        // Run the task we just grabbed
-        Task();
-
-        bool Notify;
-        {
-          // Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait()
-          std::lock_guard<std::mutex> LockGuard(QueueLock);
-          --ActiveThreads;
-          Notify = workCompletedUnlocked();
-        }
-        // Notify task completion if this is the last active thread, in case
-        // someone waits on ThreadPool::wait().
-        if (Notify)
-          CompletionCondition.notify_all();
-      }
+      processTasks(nullptr);
     });
   }
 }
 
+// WaitingForGroup == nullptr means all tasks regardless of their group.
+void ThreadPool::processTasks(ThreadPoolTaskGroup *WaitingForGroup) {
+  while (true) {
+    std::function<void()> Task;
+    ThreadPoolTaskGroup *GroupOfTask;
+    {
+      std::unique_lock<std::mutex> LockGuard(QueueLock);
+      // Wait for tasks to be pushed in the queue
+      QueueCondition.wait(LockGuard, [&] {
+        return !EnableFlag || !Tasks.empty() ||
+               (WaitingForGroup != nullptr &&
+                workCompletedUnlocked(WaitingForGroup));
+      });
+      // Exit condition
+      if (!EnableFlag && Tasks.empty())
+        return;
+      if (WaitingForGroup != nullptr && workCompletedUnlocked(WaitingForGroup))
+        return;
+      // Yeah, we have a task, grab it and release the lock on the queue
+
+      // We first need to signal that we are active before popping the queue
+      // in order for wait() to properly detect that even if the queue is
+      // empty, there is still a task in flight.
+      ++ActiveThreads;
+      Task = std::move(Tasks.front().first);
+      GroupOfTask = Tasks.front().second;
+      // Need to count active threads in each group separately, ActiveThreads
+      // would never be 0 if waiting for another group inside a wait.
+      if (GroupOfTask != nullptr)
+        ++ActiveGroups[GroupOfTask]; // Increment or set to 1 if new item
+      Tasks.pop_front();
+    }
+    // Run the task we just grabbed
+    Task();
+
+    bool Notify;
+    bool NotifyGroup;
+    {
+      // Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait()
+      std::lock_guard<std::mutex> LockGuard(QueueLock);
+      --ActiveThreads;
+      if (GroupOfTask != nullptr) {
+        auto A = ActiveGroups.find(GroupOfTask);
+        if (--(A->second) == 0)
+          ActiveGroups.erase(A);
+      }
+      Notify = workCompletedUnlocked(GroupOfTask);
+      NotifyGroup = GroupOfTask != nullptr && Notify;
+    }
+    // Notify task completion if this is the last active thread, in case
+    // someone waits on ThreadPool::wait().
+    if (Notify)
+      CompletionCondition.notify_all();
+    // If this was a task in a group, notify also threads waiting for tasks
+    // in this function on QueueCondition, to make a recursive wait() return
+    // after the group it's been waiting for has finished.
+    if (NotifyGroup)
+      QueueCondition.notify_all();
+  }
+}
+
+bool ThreadPool::workCompletedUnlocked(ThreadPoolTaskGroup *Group) const {
+  if (Group == nullptr)
+    return !ActiveThreads && Tasks.empty();
+  return ActiveGroups.count(Group) == 0 &&
+         !llvm::any_of(Tasks,
+                       [Group](const auto &T) { return T.second == Group; });
+}
+
 void ThreadPool::wait() {
   // Wait for all threads to complete and the queue to be empty
   std::unique_lock<std::mutex> LockGuard(QueueLock);
-  CompletionCondition.wait(LockGuard, [&] { return workCompletedUnlocked(); });
+  CompletionCondition.wait(LockGuard,
+                           [&] { return workCompletedUnlocked(nullptr); });
+}
+
+void ThreadPool::wait(ThreadPoolTaskGroup &Group) {
+  // Wait for all threads in the group to complete.
+  if (!isWorkerThread()) {
+    std::unique_lock<std::mutex> LockGuard(QueueLock);
+    CompletionCondition.wait(LockGuard,
+                             [&] { return workCompletedUnlocked(&Group); });
+    return;
+  }
+  // Handle the case of recursive call from another task in a different group,
+  // in which case process tasks while waiting to keep the thread busy and avoid
+  // possible deadlock.
+  processTasks(&Group);
 }
 
 bool ThreadPool::isWorkerThread() const {
@@ -115,12 +173,18 @@
 void ThreadPool::wait() {
   // Sequential implementation running the tasks
   while (!Tasks.empty()) {
-    auto Task = std::move(Tasks.front());
-    Tasks.pop();
+    auto Task = std::move(Tasks.front().first);
+    Tasks.pop_front();
     Task();
   }
 }
 
+void ThreadPool::wait(ThreadPoolTaskGroup &) {
+  // Simply wait for all, this works even if recursive (the running task
+  // is already removed from the queue).
+  wait();
+}
+
 bool ThreadPool::isWorkerThread() const {
   report_fatal_error("LLVM compiled without multithreading");
 }
Index: llvm/tools/llvm-profdata/llvm-profdata.cpp
===================================================================
--- llvm/tools/llvm-profdata/llvm-profdata.cpp
+++ llvm/tools/llvm-profdata/llvm-profdata.cpp
@@ -38,6 +38,7 @@
 #include "llvm/Support/WithColor.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <queue>
 
 using namespace llvm;
 
Index: llvm/unittests/Support/ThreadPool.cpp
===================================================================
--- llvm/unittests/Support/ThreadPool.cpp
+++ llvm/unittests/Support/ThreadPool.cpp
@@ -18,6 +18,8 @@
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
 
+#include <chrono>
+
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -29,6 +31,7 @@
   SmallVector<Triple::ArchType, 4> UnsupportedArchs;
   SmallVector<Triple::OSType, 4> UnsupportedOSs;
   SmallVector<Triple::EnvironmentType, 1> UnsupportedEnvironments;
+
 protected:
   // This is intended for platform as a temporary "XFAIL"
   bool isUnsupportedOSOrEnvironment() {
@@ -57,27 +60,45 @@
   }
 
   /// Make sure this thread not progress faster than the main thread.
-  void waitForMainThread() {
-    std::unique_lock<std::mutex> LockGuard(WaitMainThreadMutex);
-    WaitMainThread.wait(LockGuard, [&] { return MainThreadReady; });
-  }
+  void waitForMainThread() { waitForPhase(1); }
 
   /// Set the readiness of the main thread.
-  void setMainThreadReady() {
+  void setMainThreadReady() { setPhase(1); }
+
+  /// Wait until given phase is set using setPhase(); first "main" phase is 1.
+  /// See also PhaseResetHelper below.
+  void waitForPhase(int Phase) {
+    std::unique_lock<std::mutex> LockGuard(CurrentPhaseMutex);
+    CurrentPhaseCondition.wait(
+        LockGuard, [&] { return CurrentPhase == Phase || CurrentPhase < 0; });
+  }
+  /// If a thread waits on another phase, the test could bail out on a failed
+  /// assertion and ThreadPool destructor would wait() on all threads, which
+  /// would deadlock on the task waiting. Create this helper to automatically
+  /// reset the phase and unblock such threads.
+  struct PhaseResetHelper {
+    PhaseResetHelper(ThreadPoolTest *test) : test(test) {}
+    ~PhaseResetHelper() { test->setPhase(-1); }
+    ThreadPoolTest *test;
+  };
+
+  /// Advance to the given phase.
+  void setPhase(int Phase) {
     {
-      std::unique_lock<std::mutex> LockGuard(WaitMainThreadMutex);
-      MainThreadReady = true;
+      std::unique_lock<std::mutex> LockGuard(CurrentPhaseMutex);
+      assert(Phase == CurrentPhase + 1 || Phase < 0);
+      CurrentPhase = Phase;
     }
-    WaitMainThread.notify_all();
+    CurrentPhaseCondition.notify_all();
   }
 
-  void SetUp() override { MainThreadReady = false; }
+  void SetUp() override { CurrentPhase = 0; }
 
   std::vector<llvm::BitVector> RunOnAllSockets(ThreadPoolStrategy S);
 
-  std::condition_variable WaitMainThread;
-  std::mutex WaitMainThreadMutex;
-  bool MainThreadReady = false;
+  std::condition_variable CurrentPhaseCondition;
+  std::mutex CurrentPhaseMutex;
+  int CurrentPhase; // -1 = error, 0 = setup, 1 = ready, 2+ = custom
 };
 
 #define CHECK_UNSUPPORTED()                                                    \
@@ -194,6 +215,128 @@
   ASSERT_EQ(5, checked_in);
 }
 
+// Check running tasks in different groups.
+TEST_F(ThreadPoolTest, Groups) {
+  CHECK_UNSUPPORTED();
+  // Need at least two threads, as the task in group2
+  // might block a thread until all tasks in group1 finish.
+  ThreadPoolStrategy S = hardware_concurrency(2);
+  if (S.compute_thread_count() < 2)
+    return;
+  ThreadPool Pool(S);
+  PhaseResetHelper Helper(this);
+  ThreadPoolTaskGroup Group1(Pool);
+  ThreadPoolTaskGroup Group2(Pool);
+
+  // Check that waiting for an empty group is a no-op.
+  Group1.wait();
+
+  std::atomic_int checked_in1{0};
+  std::atomic_int checked_in2{0};
+
+  for (size_t i = 0; i < 5; ++i) {
+    Group1.async([this, &checked_in1] {
+      waitForMainThread();
+      ++checked_in1;
+    });
+  }
+  Group2.async([this, &checked_in2] {
+    waitForPhase(2);
+    ++checked_in2;
+  });
+  ASSERT_EQ(0, checked_in1);
+  ASSERT_EQ(0, checked_in2);
+  // Start first group and wait for it.
+  setMainThreadReady();
+  Group1.wait();
+  ASSERT_EQ(5, checked_in1);
+  // Second group has not yet finished, start it and wait for it.
+  ASSERT_EQ(0, checked_in2);
+  setPhase(2);
+  Group2.wait();
+  ASSERT_EQ(5, checked_in1);
+  ASSERT_EQ(1, checked_in2);
+}
+
+// Check recursive tasks.
+TEST_F(ThreadPoolTest, RecursiveGroups) {
+  CHECK_UNSUPPORTED();
+  ThreadPool Pool;
+  ThreadPoolTaskGroup Group(Pool);
+
+  std::atomic_int checked_in1{0};
+
+  for (size_t i = 0; i < 5; ++i) {
+    Group.async([this, &Pool, &checked_in1] {
+      waitForMainThread();
+
+      ThreadPoolTaskGroup LocalGroup(Pool);
+
+      // Check that waiting for an empty group is a no-op.
+      LocalGroup.wait();
+
+      std::atomic_int checked_in2{0};
+      for (size_t i = 0; i < 5; ++i) {
+        LocalGroup.async([&checked_in2] { ++checked_in2; });
+      }
+      LocalGroup.wait();
+      ASSERT_EQ(5, checked_in2);
+
+      ++checked_in1;
+    });
+  }
+  ASSERT_EQ(0, checked_in1);
+  setMainThreadReady();
+  Group.wait();
+  ASSERT_EQ(5, checked_in1);
+}
+
+TEST_F(ThreadPoolTest, RecursiveWaitDeadlock) {
+  CHECK_UNSUPPORTED();
+  ThreadPoolStrategy S = hardware_concurrency(2);
+  if (S.compute_thread_count() < 2)
+    return;
+  ThreadPool Pool(S);
+  PhaseResetHelper Helper(this);
+  ThreadPoolTaskGroup Group(Pool);
+
+  // Test that a thread calling wait() for a group and is waiting for more tasks
+  // returns when the last task finishes in a different thread while the waiting
+  // thread was waiting for more tasks to process while waiting.
+
+  // Task A is run in a first thread, it finishes and leaves
+  // the background thread waiting for more tasks.
+  Group.async([this] {
+    waitForMainThread();
+    setPhase(2);
+  });
+  // Task B is run in a second thread, it launches yet another
+  // task C in a different group, which will be handled by the waiting
+  // thread started above.
+  Group.async([this, &Pool] {
+    waitForPhase(2);
+    ThreadPoolTaskGroup LocalGroup(Pool);
+    LocalGroup.async([this] {
+      waitForPhase(3);
+      // Give the other thread enough time to check that there's no task
+      // to process and suspend waiting for a notification. This is indeed racy,
+      // but probably the best that can be done.
+      auto start = std::chrono::system_clock::now();
+      while (std::chrono::system_clock::now() - start <
+             std::chrono::milliseconds(10))
+        ;
+    });
+    // And task B only now will wait for the tasks in the group (=task C)
+    // to finish. This test checks that it does not deadlock. If the
+    // `NotifyGroup` handling in ThreadPool::processTasks() didn't take place,
+    // this task B would be stuck waiting for tasks to arrive.
+    setPhase(3);
+    LocalGroup.wait();
+  });
+  setMainThreadReady();
+  Group.wait();
+}
+
 #if LLVM_ENABLE_THREADS == 1
 
 // FIXME: Skip some tests below on non-Windows because multi-socket systems