diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -30,6 +30,8 @@
   SyntheticSections.cpp
   Target.cpp
   UnwindInfoSection.cpp
+  WorkStealingQueue.h
+  WorkStealingQueue.inc
   Writer.cpp
 
   LINK_COMPONENTS
diff --git a/lld/MachO/MarkLive.cpp b/lld/MachO/MarkLive.cpp
--- a/lld/MachO/MarkLive.cpp
+++ b/lld/MachO/MarkLive.cpp
@@ -12,49 +12,147 @@
 #include "SymbolTable.h"
 #include "Symbols.h"
 #include "UnwindInfoSection.h"
+#include "WorkStealingQueue.h"
 #include "mach-o/compact_unwind_encoding.h"
 #include "llvm/Support/TimeProfiler.h"
 
+#include <thread>
+
 namespace lld {
 namespace macho {
 
 using namespace llvm;
 using namespace llvm::MachO;
 
-// Set live bit on for each reachable chunk. Unmarked (unreachable)
-// InputSections will be ignored by Writer, so they will be excluded
-// from the final output.
-void markLive() {
-  TimeTraceScope timeScope("markLive");
+static const int poolSize = 16; // Arbitrary; could be tuned.
+
+struct Task {
+  Task() : idx(0), pool(nullptr) {}
 
-  // We build up a worklist of sections which have been marked as live. We only
-  // push into the worklist when we discover an unmarked section, and we mark
-  // as we push, so sections never appear twice in the list.
-  // Literal sections cannot contain references to other sections, so we only
-  // store ConcatInputSections in our worklist.
-  SmallVector<ConcatInputSection *, 256> worklist;
+  Task(int idx, WorkStealingQueue<ConcatInputSection *> *queues)
+      : idx(idx), pool(queues) {
+    // Calculate the portion of inputSections this task owns: inputSections is
+    // divided into poolSize groups of d = inputSections.size() / poolSize
+    // elements each.
+    start = idx * d;
+    end = std::min(start + d - 1, (int)inputSections.size() - 1);
+    skip = start >= (int)inputSections.size();
+  }
 
-  auto enqueue = [&](InputSection *isec, uint64_t off) {
-    if (isec->isLive(off))
+  void addSect(InputSection *s, uint64_t off) {
+    if (s->isLive(off))
       return;
-    isec->markLive(off);
-    if (auto s = dyn_cast<ConcatInputSection>(isec)) {
-      assert(!s->isCoalescedWeak());
-      worklist.push_back(s);
+    s->markLive(off);
+
+    if (auto *cs = dyn_cast<ConcatInputSection>(s)) {
+      assert(!cs->isCoalescedWeak());
+      pool[idx].push(cs);
     }
-  };
+  }
 
-  auto addSym = [&](Symbol *s) {
+  void addSym(Symbol *s) {
     s->used = true;
     if (auto *d = dyn_cast<Defined>(s))
       if (d->isec)
-        enqueue(d->isec, d->value);
-  };
+        addSect(d->isec, d->value);
+  }
+
+  void help(int target_idx) {
+    do {
+      auto item =
+          target_idx == idx ? pool[idx].pop() : pool[target_idx].steal();
+      while (item) {
+        InputSection *s = *item;
+        // Mark all symbols listed in the relocation table for this section.
+        for (const Reloc &r : s->relocs) {
+          if (auto *sym = r.referent.dyn_cast<Symbol *>())
+            addSym(sym);
+          else
+            addSect(r.referent.get<InputSection *>(), r.addend);
+        }
+        item = target_idx == idx ? pool[idx].pop() : pool[target_idx].steal();
+      }
 
-  // Add GC roots.
-  if (config->entry)
-    addSym(config->entry);
-  for (Symbol *sym : symtab->getSymbols()) {
+      // TODO: we can avoid iterating inputSections multiple times.
+      //
+      // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live
+      // section.
+      // Process them in a second pass.
+      if (!skip) {
+        for (int i = start; i <= end; ++i) {
+          ConcatInputSection *isec = inputSections[i];
+          // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
+          // separate vector and only walking that here is faster.
+          if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
+            continue;
+
+          for (const Reloc &r : isec->relocs) {
+            bool referentLive;
+            if (auto *s = r.referent.dyn_cast<Symbol *>())
+              referentLive = s->isLive();
+            else
+              referentLive =
+                  r.referent.get<InputSection *>()->isLive(r.addend);
+            if (referentLive)
+              addSect(isec, 0);
+          }
+        }
+      }
+
+      if (pool[idx].isEmpty()) {
+        // A compare-and-reset is faster than computing idx % poolSize.
+        size_t target = idx + 1;
+        if (target >= poolSize)
+          target = 0;
+        help(target);
+      }
+    } while (!pool[idx].isEmpty());
+  }
+
+  void work() { help(idx); }
+
+  const int d = inputSections.size() / poolSize;
+  const int idx; // Index of this task's queue in the pool.
+  int start;     // First index into inputSections owned by this task.
+  int end;       // Last index into inputSections owned by this task.
+  bool skip;     // True if this task owns no slice of inputSections.
+  WorkStealingQueue<ConcatInputSection *> *pool;
+};
+
+// Func: void(Task &task, int j)
+//   task: the task (and its queue) to add to.
+//   j: the index of the item to be added.
+template <typename Func>
+static void splitTasks(WorkStealingQueue<ConcatInputSection *> *queues,
+                       std::vector<Task> &tasks, size_t size, Func func) {
+  assert(size >= poolSize);
+  const size_t d = size / poolSize;
+  std::vector<std::thread> threads;
+  threads.reserve(poolSize);
+
+  for (int idx = 0; idx < poolSize; ++idx) {
+    size_t start = idx * d;
+    size_t end = std::min(start + d - 1, size - 1);
+    // Capture idx/start/end by value: they change on the next loop iteration.
+    threads.push_back(std::thread([&, idx, start, end]() {
+      Task &task = tasks[idx];
+      for (size_t j = start; j <= end; ++j)
+        func(task, j);
+    }));
+  }
+
+  for (auto &thread : threads)
+    thread.join();
+}
+
+static void populateFromSymtab(WorkStealingQueue<ConcatInputSection *> *queues,
+                               std::vector<Task> &tasks) {
+  TimeTraceScope timeScope("populateFromSymtab");
+
+  const auto &entries = symtab->getSymbols();
+  auto do_add = [&](Task &task, int i) {
+    Symbol *sym = entries[i];
     if (auto *defined = dyn_cast<Defined>(sym)) {
       // -exported_symbol(s_list)
       if (!config->exportedSymbols.empty() &&
@@ -64,17 +162,18 @@
         // explicitUndefineds code below would handle this automatically.
         assert(!defined->privateExtern &&
                "should have been rejected by driver");
-        addSym(defined);
-        continue;
+        task.addSym(defined);
+        return;
       }
 
       // public symbols explicitly marked .no_dead_strip
       if (defined->referencedDynamically || defined->noDeadStrip) {
-        addSym(defined);
-        continue;
+        task.addSym(defined);
+        return;
       }
 
-      // FIXME: When we implement these flags, make symbols from them GC roots:
+      // FIXME: When we implement these flags, make symbols from them GC
+      // roots:
       // * -reexported_symbol(s_list)
       // * -alias(-list)
       // * -init
@@ -84,40 +183,87 @@
       bool externsAreRoots =
           config->outputType != MH_EXECUTE || config->exportDynamic;
       if (externsAreRoots && !defined->privateExtern) {
-        addSym(defined);
-        continue;
+        task.addSym(defined);
+        return;
       }
     }
+  };
+
+  const size_t size = entries.size();
+  if (size <= poolSize) {
+    for (size_t i = 0; i < size; ++i)
+      do_add(tasks[i], i);
+    return;
+  }
+
+  splitTasks(queues, tasks, size, do_add);
+}
+
+static void
+populateFromSpecialSymbols(WorkStealingQueue<ConcatInputSection *> *queues,
+                           std::vector<Task> &tasks) {
+  TimeTraceScope timeScope("populateFromSpecialSymbols");
+
+  // For simplicity, add the -u symbols to the task whose queue currently has
+  // the fewest items.
+  int min_idx = 0;
+  size_t min = queues[0].size();
+  for (int i = 1; i < poolSize; ++i) {
+    if (queues[i].size() < min) {
+      min = queues[i].size();
+      min_idx = i;
+    }
   }
 
-  // -u symbols
+  Task &task = tasks[min_idx];
   for (Symbol *sym : config->explicitUndefineds)
     if (auto *defined = dyn_cast<Defined>(sym))
-      addSym(defined);
+      task.addSym(defined);
+
+  if (auto *stubBinder =
+          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
+    task.addSym(stubBinder);
+}
+
+static void populateDeadStrips(WorkStealingQueue<ConcatInputSection *> *queues,
+                               std::vector<Task> &tasks) {
+  std::vector<Defined *> syms;
   // local symbols explicitly marked .no_dead_strip
   for (const InputFile *file : inputFiles)
     if (auto *objFile = dyn_cast<ObjFile>(file))
       for (Symbol *sym : objFile->symbols)
         if (auto *defined = dyn_cast_or_null<Defined>(sym))
           if (!defined->isExternal() && defined->noDeadStrip)
-            addSym(defined);
-  if (auto *stubBinder =
-          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
-    addSym(stubBinder);
-  for (ConcatInputSection *isec : inputSections) {
+            syms.push_back(defined);
+
+  if (syms.size() < poolSize) {
+    for (size_t i = 0; i < syms.size(); ++i)
+      tasks[i].addSym(syms[i]);
+    return;
+  }
+
+  splitTasks(queues, tasks, syms.size(),
+             [&](Task &task, int i) { task.addSym(syms[i]); });
+}
+
+static void
+populateFromInputSections(WorkStealingQueue<ConcatInputSection *> *queues,
+                          std::vector<Task> &tasks) {
+  auto do_add = [&](Task &task, int i) {
+    ConcatInputSection *isec = inputSections[i];
     // Sections marked no_dead_strip
     if (isec->getFlags() & S_ATTR_NO_DEAD_STRIP) {
-      enqueue(isec, 0);
-      continue;
+      task.addSect(isec, 0);
+      return;
     }
 
     // mod_init_funcs, mod_term_funcs sections
     if (sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS ||
         sectionType(isec->getFlags()) == S_MOD_TERM_FUNC_POINTERS) {
-      enqueue(isec, 0);
-      continue;
+      task.addSect(isec, 0);
+      return;
     }
-  }
+  };
+  splitTasks(queues, tasks, inputSections.size(), do_add);
+}
+
+static void
+populateFromUnwindInfo(WorkStealingQueue<ConcatInputSection *> *queues,
+                       std::vector<Task> &tasks) {
   // Dead strip runs before UnwindInfoSection handling so we need to keep
   // __LD,__compact_unwind alive here.
   // But that section contains absolute references to __TEXT,__text and
@@ -125,7 +271,9 @@
   // section: We must skip the relocations for the functionAddress
   // in each CompactUnwindEntry.
   // See also scanEhFrameSection() in lld/ELF/MarkLive.cpp.
-  for (ConcatInputSection *isec : in.unwindInfo->getInputs()) {
+  const std::vector<ConcatInputSection *> &inputs = in.unwindInfo->getInputs();
+  auto do_add = [&](Task &task, int i) {
+    ConcatInputSection *isec = inputs[i];
     isec->live = true;
     const int compactUnwindEntrySize =
         target->wordSize == 8 ? sizeof(CompactUnwindEntry<uint64_t>)
@@ -137,51 +285,49 @@
         continue;
       if (auto *s = r.referent.dyn_cast<Symbol *>())
-        addSym(s);
+        task.addSym(s);
       else
-        enqueue(r.referent.get<InputSection *>(), r.addend);
+        task.addSect(r.referent.get<InputSection *>(), r.addend);
     }
-  }
+  };
+
+  if (inputs.size() < poolSize) {
+    for (size_t i = 0; i < inputs.size(); ++i)
+      do_add(tasks[i], i);
+    return;
+  }
+  splitTasks(queues, tasks, inputs.size(), do_add);
+}
+
+// Set live bit on for each reachable chunk. Unmarked (unreachable)
+// InputSections will be ignored by Writer, so they will be excluded
+// from the final output.
+void markLive() {
+  TimeTraceScope timeScope("markLive");
 
-  do {
-    // Mark things reachable from GC roots as live.
-    while (!worklist.empty()) {
-      ConcatInputSection *s = worklist.pop_back_val();
-      assert(s->live && "We mark as live when pushing onto the worklist!");
-
-      // Mark all symbols listed in the relocation table for this section.
-      for (const Reloc &r : s->relocs) {
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          addSym(s);
-        else
-          enqueue(r.referent.get<InputSection *>(), r.addend);
-      }
-    }
+  WorkStealingQueue<ConcatInputSection *> queues[poolSize];
+  std::vector<Task> tasks;
+  std::vector<std::thread> threads;
+  tasks.reserve(poolSize);
+  threads.reserve(poolSize);
+  for (int i = 0; i < poolSize; ++i) {
+    tasks.push_back(Task(i, queues));
+  }
 
-    // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live section.
-    // Process them in a second pass.
-    for (ConcatInputSection *isec : inputSections) {
-      // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
-      // separate vector and only walking that here is faster.
-      if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
-        continue;
+  // Add GC roots. Give the entry symbol to the last task, since its queue
+  // likely has the least work.
+  if (config->entry)
+    tasks[poolSize - 1].addSym(config->entry);
 
-      for (const Reloc &r : isec->relocs) {
-        bool referentLive;
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          referentLive = s->isLive();
-        else
-          referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
-        if (referentLive)
-          enqueue(isec, 0);
-      }
-    }
+  populateFromSymtab(queues, tasks);
+  populateFromSpecialSymbols(queues, tasks);
+  populateDeadStrips(queues, tasks);
+  populateFromInputSections(queues, tasks);
+  populateFromUnwindInfo(queues, tasks);
 
-    // S_ATTR_LIVE_SUPPORT could have marked additional sections live,
-    // which in turn could mark additional S_ATTR_LIVE_SUPPORT sections live.
-    // Iterate. In practice, the second iteration won't mark additional
-    // S_ATTR_LIVE_SUPPORT sections live.
-  } while (!worklist.empty());
+  for (int i = 0; i < poolSize; ++i) {
+    threads.push_back(std::thread([&tasks, i]() { tasks[i].work(); }));
+  }
+  for (auto &thread : threads) {
+    thread.join();
+  }
 }
 
 } // namespace macho
diff --git a/lld/MachO/WorkStealingQueue.h b/lld/MachO/WorkStealingQueue.h
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.h
@@ -0,0 +1,83 @@
+#ifndef LLD_MACHO_WORKSTEALINGQUEUE_H
+#define LLD_MACHO_WORKSTEALINGQUEUE_H
+
+#include "llvm/ADT/Optional.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Actual data holder for the queue.
+// This is a convenience wrapper around an array of atomics to simplify
+// operations in the WorkStealingQueue class.
+template <typename T> class Node {
+public:
+  explicit Node(size_t capacity)
+      : cap_{capacity}, mask_{capacity - 1}, arr_{new std::atomic<T>[cap_]} {}
+
+  ~Node() { delete[] arr_; }
+
+  size_t capacity() const { return cap_; }
+
+  // Resize to cap * 2 and copy all items over in the range [head, tail).
+  Node *resize(size_t head, size_t tail);
+
+  // Store item to index i.
+  template <typename E> void store(size_t i, E &&e);
+
+  // Remove item at index i and return it.
+  T remove(size_t i);
+
+private:
+  size_t cap_;
+  const size_t mask_;
+  std::atomic<T> *arr_;
+};
+
+} // namespace internal
+
+template <typename T> class WorkStealingQueue {
+public:
+  explicit WorkStealingQueue(size_t capacity = 1024);
+
+  ~WorkStealingQueue();
+
+  size_t capacity() const;
+  size_t size() const;
+  bool isEmpty() const;
+
+  // Standard queue operations.
+  // Note: only one thread may push()/pop(), but any number of threads may
+  // steal().
+
+  // Push an item onto the queue.
+  template <typename E> void push(E &&e);
+  // Pop an item off the queue.
+  llvm::Optional<T> pop();
+  // Steal an item from the queue.
+  llvm::Optional<T> steal();
+
+private:
+  std::atomic<size_t> head_; // Index of the first item.
+  std::atomic<size_t> tail_; // Index one past the last item.
+
+  // Pointer to the actual data array.
+  std::atomic<internal::Node<T> *> node_;
+
+  // Holds the nodes that became too small and were replaced by a resize.
+  // We don't want to pause and delete in the middle of a run, so we collect
+  // them here and delete them when the queue is destroyed.
+  // FIXME: may not need this.
+  std::vector<internal::Node<T> *> retired_; // Retired nodes.
+};
+
+} // namespace macho
+} // namespace lld
+
+#include "WorkStealingQueue.inc"
+
+#endif // LLD_MACHO_WORKSTEALINGQUEUE_H
diff --git a/lld/MachO/WorkStealingQueue.inc b/lld/MachO/WorkStealingQueue.inc
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.inc
@@ -0,0 +1,123 @@
+#ifndef LLD_MACHO_WORKSTEALINGQUEUE_H
+#error "WorkStealingQueue.inc must be included via WorkStealingQueue.h"
+#else
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Resize to cap * 2 and copy all items over in the range [head, tail).
+template <typename T>
+Node<T> *Node<T>::resize(size_t head, size_t tail) {
+  Node<T> *ptr = new Node<T>{2 * cap_};
+  for (size_t i = head; i < tail; ++i) {
+    ptr->store(i, remove(i));
+  }
+  return ptr;
+}
+
+template <typename T>
+template <typename E>
+void Node<T>::store(size_t i, E &&e) {
+  arr_[i & mask_].store(std::forward<E>(e), std::memory_order_relaxed);
+}
+
+template <typename T> T Node<T>::remove(size_t i) {
+  return arr_[i & mask_].load(std::memory_order_relaxed);
+}
+
+} // namespace internal
+
+template <typename T>
+WorkStealingQueue<T>::WorkStealingQueue(size_t cap) {
+  // The capacity must be a power of 2 because Node indexes with a bit mask.
+  assert(cap > 0 && (cap & (cap - 1)) == 0);
+  head_.store(0, std::memory_order_relaxed);
+  tail_.store(0, std::memory_order_relaxed);
+  node_.store(new internal::Node<T>{cap}, std::memory_order_relaxed);
+  retired_.reserve(64);
+}
+
+template <typename T> WorkStealingQueue<T>::~WorkStealingQueue() {
+  for (auto *r : retired_)
+    delete r;
+  retired_.clear();
+  delete node_.load();
+}
+
+template <typename T> bool WorkStealingQueue<T>::isEmpty() const {
+  return head_.load(std::memory_order_relaxed) >=
+         tail_.load(std::memory_order_relaxed);
+}
+
+template <typename T> size_t WorkStealingQueue<T>::size() const {
+  size_t t = tail_.load(std::memory_order_relaxed);
+  size_t h = head_.load(std::memory_order_relaxed);
+  return t >= h ? t - h : 0;
+}
+
+template <typename T>
+template <typename E>
+void WorkStealingQueue<T>::push(E &&e) {
+  size_t tailIdx = tail_.load(std::memory_order_relaxed);
+  size_t headIdx = head_.load(std::memory_order_acquire);
+  internal::Node<T> *a = node_.load(std::memory_order_relaxed);
+
+  if (a->capacity() < tailIdx - headIdx + 1) {
+    internal::Node<T> *tmp = a->resize(headIdx, tailIdx);
+    retired_.push_back(a);
+    std::swap(a, tmp);
+    // a now points at the resized node; publish it.
+    node_.store(a, std::memory_order_relaxed);
+  }
+
+  a->store(tailIdx, std::forward<E>(e));
+  std::atomic_thread_fence(std::memory_order_release);
+  tail_.store(tailIdx + 1, std::memory_order_relaxed);
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::pop() {
+  // Bail out early on an empty queue; this also keeps the tail index from
+  // wrapping below zero before anything has been pushed.
+  if (isEmpty())
+    return llvm::None;
+
+  size_t tailIdx = tail_.load(std::memory_order_relaxed) - 1;
+  internal::Node<T> *curNode = node_.load(std::memory_order_relaxed);
+  tail_.store(tailIdx, std::memory_order_relaxed);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t headIdx = head_.load(std::memory_order_relaxed);
+
+  if (headIdx > tailIdx) {
+    tail_.store(tailIdx + 1, std::memory_order_relaxed);
+    return llvm::None;
+  }
+
+  llvm::Optional<T> ret = curNode->remove(tailIdx);
+  if (headIdx == tailIdx) {
+    // This was the last item; race against concurrent steal()s for it.
+    if (!head_.compare_exchange_strong(headIdx, headIdx + 1,
+                                       std::memory_order_seq_cst,
+                                       std::memory_order_relaxed))
+      ret = llvm::None;
+    tail_.store(tailIdx + 1, std::memory_order_relaxed);
+  }
+
+  return ret;
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::steal() {
+  size_t headIdx = head_.load(std::memory_order_acquire);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t tailIdx = tail_.load(std::memory_order_acquire);
+
+  if (headIdx >= tailIdx)
+    return llvm::None;
+
+  internal::Node<T> *curNode = node_.load(std::memory_order_consume);
+  llvm::Optional<T> ret = curNode->remove(headIdx);
+  if (!head_.compare_exchange_strong(headIdx, headIdx + 1,
+                                     std::memory_order_seq_cst,
+                                     std::memory_order_relaxed))
+    return llvm::None;
+  return ret;
+}
+
+template <typename T> size_t WorkStealingQueue<T>::capacity() const {
+  return node_.load(std::memory_order_relaxed)->capacity();
+}
+
+} // namespace macho
+} // namespace lld
+
+#endif
diff --git a/lld/unittests/CMakeLists.txt b/lld/unittests/CMakeLists.txt
--- a/lld/unittests/CMakeLists.txt
+++ b/lld/unittests/CMakeLists.txt
@@ -14,3 +14,4 @@
 add_subdirectory(DriverTests)
 add_subdirectory(MachOTests)
+add_subdirectory(MachOUnitTests)
diff --git a/lld/unittests/MachOUnitTests/CMakeLists.txt b/lld/unittests/MachOUnitTests/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_lld_unittest(lldMachOUnitTests
+  WorkStealingQueueTest.cpp
+  )
+
+target_link_libraries(lldMachOUnitTests
+  PRIVATE
+  lldMachO
+  lldYAML
+  )
diff --git a/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
@@ -0,0 +1,56 @@
+#include <atomic>
+#include <thread>
+
+#include "../../MachO/WorkStealingQueue.h"
+#include "gtest/gtest.h"
+
+TEST(SmokeTest, Test) {
+  lld::macho::WorkStealingQueue<int> queue;
+  EXPECT_TRUE(queue.isEmpty());
+  queue.push(42);
+  EXPECT_EQ(42, *queue.pop());
+  EXPECT_TRUE(queue.isEmpty());
+}
+
+TEST(FunctionalTest, Test) {
+  const int max = 999;
+  lld::macho::WorkStealingQueue<int> queue;
+
+  // Count how many times each item is seen.
+  std::atomic<int> counts[max];
+  for (int i = 0; i < max; ++i) {
+    counts[i].store(0, std::memory_order_relaxed);
+  }
+
+  // Multiple thieves.
+  auto do_steal = [&]() {
+    while (!queue.isEmpty()) {
+      llvm::Optional<int> item = queue.steal();
+      if (item)
+        counts[*item].fetch_add(1, std::memory_order_relaxed);
+    }
+  };
+
+  // Only one producer.
+  std::thread producer([&]() {
+    for (int i = 0; i < max; ++i) {
+      queue.push(i);
+    }
+
+    // Help drain whatever the thieves have not taken.
+    do_steal();
+    do_steal();
+  });
+
+  const int thiefCount = 10;
+  std::thread thieves[thiefCount];
+  for (int i = 0; i < thiefCount; ++i) {
+    thieves[i] = std::thread(do_steal);
+  }
+
+  producer.join();
+  for (auto &thief : thieves)
+    thief.join();
+
+  // Check that every item was consumed exactly once.
+  for (int i = 0; i < max; ++i) {
+    const int value = counts[i].load(std::memory_order_relaxed);
+    EXPECT_EQ(1, value);
+  }
+}
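
Reviewer note: the pattern markLive() builds on is easier to see outside the linker. Below is a minimal, self-contained sketch (not part of this patch) of the single-owner / multi-thief discipline: each worker drains its own queue with pop() and falls back to steal() on its neighbours' queues. The worker count, item count, and include path are illustrative assumptions only.

// work_stealing_sketch.cpp -- illustrative only, not part of the patch.
#include <atomic>
#include <thread>
#include <vector>

#include "lld/MachO/WorkStealingQueue.h" // assumed include path

using lld::macho::WorkStealingQueue;

int main() {
  constexpr int numWorkers = 4;  // stand-in for poolSize
  constexpr int numItems = 1000; // stand-in for the GC roots
  WorkStealingQueue<int> queues[numWorkers];
  std::atomic<int> processed{0};

  // Distribute the work up front, before any worker starts. Only the owning
  // worker may push()/pop() a queue once the threads are running.
  for (int i = 0; i < numItems; ++i)
    queues[i % numWorkers].push(i);

  std::vector<std::thread> workers;
  for (int w = 0; w < numWorkers; ++w) {
    workers.emplace_back([&queues, &processed, w] {
      // Visit our own queue first, then each neighbour in turn.
      for (int tried = 0; tried < numWorkers; ++tried) {
        int victim = (w + tried) % numWorkers;
        while (true) {
          // pop() on our own queue, steal() on everyone else's.
          llvm::Optional<int> item =
              victim == w ? queues[victim].pop() : queues[victim].steal();
          if (!item)
            break;
          processed.fetch_add(1, std::memory_order_relaxed);
        }
      }
    });
  }
  for (std::thread &t : workers)
    t.join();

  // With a fixed workload, every item is processed exactly once.
  return processed.load() == numItems ? 0 : 1;
}

Unlike Task::help(), this sketch handles a fixed workload, so one pass over the queues is enough; in markLive() new sections keep being discovered during the traversal, which is why help() keeps looping until its own queue stays empty.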