diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -30,6 +30,8 @@
   SyntheticSections.cpp
   Target.cpp
   UnwindInfoSection.cpp
+  WorkStealingQueue.h
+  WorkStealingQueue.inc
   Writer.cpp
 
   LINK_COMPONENTS
diff --git a/lld/MachO/MarkLive.cpp b/lld/MachO/MarkLive.cpp
--- a/lld/MachO/MarkLive.cpp
+++ b/lld/MachO/MarkLive.cpp
@@ -12,54 +12,252 @@
 #include "SymbolTable.h"
 #include "Symbols.h"
 #include "UnwindInfoSection.h"
+#include "WorkStealingQueue.h"
 #include "mach-o/compact_unwind_encoding.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Parallel.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/TimeProfiler.h"
+#include <atomic>
+#include <thread>
+#include <vector>
+
 namespace lld {
 namespace macho {
 
 using namespace llvm;
 using namespace llvm::MachO;
 
-// Set live bit on for each reachable chunk. Unmarked (unreachable)
-// InputSections will be ignored by Writer, so they will be excluded
-// from the final output.
-void markLive() {
-  TimeTraceScope timeScope("markLive");
+static const size_t poolSize = PowerOf2Floor(std::min<size_t>(
+    llvm::hardware_concurrency(llvm::parallel::strategy.ThreadsRequested)
+        .compute_thread_count(),
+    32));
+static std::atomic<size_t> done(0);
+
+struct Worker {
+
+  Worker() : idx(0), skip(true), pool(nullptr) {}
 
-  // We build up a worklist of sections which have been marked as live. We only
-  // push into the worklist when we discover an unmarked section, and we mark
-  // as we push, so sections never appear twice in the list.
-  // Literal sections cannot contain references to other sections, so we only
-  // store ConcatInputSections in our worklist.
-  SmallVector<ConcatInputSection *, 256> worklist;
+  Worker(size_t idx, WorkStealingQueue<InputSection *> *queues)
+      : idx(idx), pool(queues) {
+    // Calculate the portion of inputSections this worker owns.
+    // We divide inputSections into poolSize contiguous groups.
+    // If there are fewer inputSections than workers, only the first
+    // inputSections.size() workers get one item each.
+    start = idx * groupSize;
+    end = start + groupSize - 1;
+    if (start < idx || end >= inputSections.size()) {
+      // Not all workers will have inputSections.
+      if (idx < inputSections.size()) {
+        start = end = idx;
+        skip = false;
+      } else {
+        skip = true;
+      }
+    } else {
+      skip = false;
+      // The last worker also takes the remainder that the integer division
+      // above would otherwise drop.
+      if (idx == poolSize - 1)
+        end = inputSections.size() - 1;
+    }
+  }
 
-  auto enqueue = [&](InputSection *isec, uint64_t off) {
-    if (isec->isLive(off))
+  size_t queueSize() const { return pool[idx].size(); }
+
+  void addSect(InputSection *s, uint64_t off) {
+    if (!s)
+      return;
+    if (s->isLive(off))
       return;
-    isec->markLive(off);
-    if (auto s = dyn_cast<ConcatInputSection>(isec)) {
-      assert(!s->isCoalescedWeak());
-      worklist.push_back(s);
+    s->markLive(off);
+
+    if (auto *cs = dyn_cast<ConcatInputSection>(s)) {
+      assert(!cs->isCoalescedWeak());
+      pool[idx].push(s);
     }
-  };
-  auto addSym = [&](Symbol *s) {
-    if (s->used)
+    checkLiveSupport();
+  }
+
+  void addSym(Symbol *s) {
+    if (!s || s->used)
       return;
+    s->used = true;
     if (auto *d = dyn_cast<Defined>(s)) {
       if (d->isec)
-        enqueue(d->isec, d->value);
+        addSect(d->isec, d->value);
      if (d->compactUnwind)
-        enqueue(d->compactUnwind, 0);
+        addSect(d->compactUnwind, 0);
     }
-  };
+    checkSymbolLiveSupport();
+  }
+
+  void runOnQueue(size_t targetIdx) {
+    do {
+      while (!pool[targetIdx].isEmpty()) {
+        auto item =
+            targetIdx == idx ? pool[targetIdx].pop() : pool[targetIdx].steal();
+
+        if (!item)
+          continue;
+        InputSection *sect = *item;
+        if (sect) {
+          // Mark all symbols listed in the relocation table for this section.
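+          // Each referent is a PointerUnion: either a Symbol or a raw
+          // InputSection. Symbols go through addSym() so that their
+          // compact-unwind entries are marked as well; section referents
+          // go straight to addSect().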
+          for (const Reloc &r : sect->relocs) {
+            if (auto *ref = r.referent.dyn_cast<Symbol *>())
+              addSym(ref);
+            else
+              addSect(r.referent.get<InputSection *>(), r.addend);
+          }
+        }
+      }
+
+      // TODO: we can avoid iterating inputSections multiple times.
+      //
+      // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live
+      // section. Process them in a second pass.
+      if (!skip) {
+        for (size_t i = start; i <= end; ++i) {
+          ConcatInputSection *isec = inputSections[i];
+          // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
+          // separate vector and only walking that here is faster.
+          if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
+            continue;
+
+          for (const Reloc &r : isec->relocs) {
+            if (auto *s = r.referent.dyn_cast<Symbol *>()) {
+              if (s->isLive())
+                addSect(isec, 0);
+              else
+                symbolLiveSupport[s] = isec;
+            } else {
+              InputSection *referentIsec = r.referent.get<InputSection *>();
+              if (referentIsec->isLive(r.addend))
+                addSect(isec, 0);
+              else
+                liveSupport[std::pair<InputSection *, uint64_t>(
+                    referentIsec, r.addend)] = isec;
+            }
+          }
+        }
+      }
+    } while (!pool[idx].isEmpty());
+
+    while (done.load(std::memory_order_relaxed) < poolSize) {
+      size_t target = targetIdx;
+      // Keep looking for a non-empty queue until all workers are done.
+      do {
+        ++target;
+        // A comparison is cheaper than a modulo.
+        if (target >= poolSize)
+          target = 0;
+        if (target == targetIdx)
+          break;
+      } while (pool[target].isEmpty());
+
+      if (!pool[target].isEmpty()) {
+        runOnQueue(target);
+      } else {
+        if (!markDone)
+          done.fetch_add(1, std::memory_order_acquire);
+        markDone = true;
+        break;
+      }
+    }
+
+    assert(liveSupport.empty() && symbolLiveSupport.empty());
+  }
+
+  void checkLiveSupport() {
+    for (auto entry = liveSupport.begin(); entry != liveSupport.end();) {
+      if (entry->first.first->isLive(entry->first.second)) {
+        InputSection *supported = entry->second;
+        liveSupport.erase(entry++);
+        addSect(supported, 0);
+      } else {
+        ++entry;
+      }
+    }
+  }
+
+  void checkSymbolLiveSupport() {
+    for (auto entry = symbolLiveSupport.begin();
+         entry != symbolLiveSupport.end();) {
+      if (entry->first->isLive()) {
+        InputSection *supported = entry->second;
+        symbolLiveSupport.erase(entry++);
+        addSect(supported, 0);
+      } else {
+        ++entry;
+      }
+    }
+  }
+
+  void run() {
+    // Start by looking at our own queue first.
+    runOnQueue(idx);
+  }
+
+  // The number of InputSections each Worker is initially assigned.
+  // Each Worker owns a contiguous block of InputSections [start, end].
+  const size_t groupSize = inputSections.size() / poolSize;
+
+  const size_t idx;      // Index of this worker's queue in the pool.
+  bool markDone = false; // If true, this thread has signaled that it's done.
+  size_t start;          // Start index into inputSections.
+  size_t end;            // End index into inputSections (inclusive).
+  bool skip;             // If true, this worker owns no inputSections.
+  WorkStealingQueue<InputSection *> *pool;
+
+  // Mapping of referents to their referrers.
+  DenseMap<std::pair<InputSection *, uint64_t>, InputSection *> liveSupport;
+  DenseMap<Symbol *, InputSection *> symbolLiveSupport;
+};
 
-  // Add GC roots.
-  if (config->entry)
-    addSym(config->entry);
-  for (Symbol *sym : symtab->getSymbols()) {
+// Split `size` items into roughly equal contiguous ranges and run `func` over
+// each range on its own thread.
+// Func: void(Worker &worker, size_t j)
+//   worker: the worker whose queue the item should be added to.
+//   j: the index of the item to be added.
+template <typename Func>
+static void splitTasks(std::vector<Worker> &workers, size_t size, Func func) {
+  const size_t d = size / poolSize;
+  std::vector<std::thread> threads;
+  threads.reserve(poolSize);
+
+  for (size_t idx = 0; idx < poolSize; ++idx) {
+    size_t start = idx * d;
+    size_t end = start + d - 1;
+    bool skip = false;
+
+    if (start < idx || end >= size) {
+      if (idx < size) {
+        start = end = idx;
+        skip = false;
+      } else {
+        skip = true;
+      }
+    } else if (idx == poolSize - 1) {
+      // The last worker also takes the remainder that the integer division
+      // above would otherwise drop.
+      end = size - 1;
+    }
+    if (!skip)
+      threads.push_back(std::thread([&func, idx, start, end, &workers]() {
+        Worker &worker = workers[idx];
+        for (size_t j = start; j <= end; ++j) {
+          func(worker, j);
+        }
+      }));
+  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
+}
+
+void populateFromSymtab(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateFromSymtab");
+
+  const auto &entries = symtab->getSymbols();
+  auto doAdd = [&](Worker &worker, int i) {
+    Symbol *sym = entries[i];
     if (auto *defined = dyn_cast<Defined>(sym)) {
       // -exported_symbol(s_list)
       if (!config->exportedSymbols.empty() &&
@@ -69,17 +267,18 @@
         // explicitUndefineds code below would handle this automatically.
         assert(!defined->privateExtern &&
                "should have been rejected by driver");
-        addSym(defined);
-        continue;
+        worker.addSym(defined);
+        return;
       }
 
       // public symbols explicitly marked .no_dead_strip
       if (defined->referencedDynamically || defined->noDeadStrip) {
-        addSym(defined);
-        continue;
+        worker.addSym(defined);
+        return;
       }
 
-      // FIXME: When we implement these flags, make symbols from them GC roots:
+      // FIXME: When we implement these flags, make symbols from them GC
+      // roots:
       // * -reexported_symbol(s_list)
       // * -alias(-list)
       // * -init
@@ -89,81 +288,107 @@
       bool externsAreRoots =
           config->outputType != MH_EXECUTE || config->exportDynamic;
       if (externsAreRoots && !defined->privateExtern) {
-        addSym(defined);
-        continue;
+        worker.addSym(defined);
+        return;
       }
     }
+  };
+
+  const size_t size = entries.size();
+  splitTasks(workers, size, doAdd);
+}
+
+void populateFromSpecialSymbols(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateFromSpecialSymbols");
+
+  // For simplicity, add the -u symbols to the queue with the fewest tasks.
+  int minIdx = 0;
+  size_t min = workers[minIdx].queueSize();
+  for (size_t i = 1; i < poolSize; ++i) {
+    if (workers[i].queueSize() < min) {
+      min = workers[i].queueSize();
+      minIdx = i;
+    }
   }
-  // -u symbols
+  Worker &worker = workers[minIdx];
   for (Symbol *sym : config->explicitUndefineds)
     if (auto *defined = dyn_cast<Defined>(sym))
-      addSym(defined);
+      worker.addSym(defined);
+  if (auto *stubBinder =
+          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
+    worker.addSym(stubBinder);
+}
+
+void populateNoDeadStrips(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateNoDeadStrips");
+  std::vector<Defined *> syms;
   // local symbols explicitly marked .no_dead_strip
   for (const InputFile *file : inputFiles)
     if (auto *objFile = dyn_cast<ObjFile>(file))
      for (Symbol *sym : objFile->symbols)
         if (auto *defined = dyn_cast_or_null<Defined>(sym))
           if (!defined->isExternal() && defined->noDeadStrip)
-            addSym(defined);
-  if (auto *stubBinder =
-          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
-    addSym(stubBinder);
-  for (ConcatInputSection *isec : inputSections) {
+            syms.push_back(defined);
+
+  splitTasks(workers, syms.size(),
+             [&](Worker &worker, int i) { worker.addSym(syms[i]); });
+}
+
+void populateFromInputSections(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateFromInputSections");
+  auto doAdd = [&](Worker &worker, int i) {
+    ConcatInputSection *isec = inputSections[i];
     // Sections marked no_dead_strip
     if (isec->getFlags() & S_ATTR_NO_DEAD_STRIP) {
-      enqueue(isec, 0);
-      continue;
+      worker.addSect(isec, 0);
+      return;
    }
 
     // mod_init_funcs, mod_term_funcs sections
     if (sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS ||
         sectionType(isec->getFlags()) == S_MOD_TERM_FUNC_POINTERS) {
-      enqueue(isec, 0);
-      continue;
+      worker.addSect(isec, 0);
+      return;
     }
-  }
+  };
 
-  do {
-    // Mark things reachable from GC roots as live.
-    while (!worklist.empty()) {
-      ConcatInputSection *s = worklist.pop_back_val();
-      assert(s->live && "We mark as live when pushing onto the worklist!");
-
-      // Mark all symbols listed in the relocation table for this section.
-      for (const Reloc &r : s->relocs) {
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          addSym(s);
-        else
-          enqueue(r.referent.get<InputSection *>(), r.addend);
-      }
-      for (Defined *d : s->symbols)
-        addSym(d);
-    }
+  splitTasks(workers, inputSections.size(), doAdd);
+}
 
-    // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live section.
-    // Process them in a second pass.
-    for (ConcatInputSection *isec : inputSections) {
-      // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
-      // separate vector and only walking that here is faster.
-      if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
-        continue;
-
-      for (const Reloc &r : isec->relocs) {
-        bool referentLive;
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          referentLive = s->isLive();
-        else
-          referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
-        if (referentLive)
-          enqueue(isec, 0);
-      }
-    }
+// Set live bit on for each reachable chunk. Unmarked (unreachable)
+// InputSections will be ignored by Writer, so they will be excluded
+// from the final output.
+void markLive() {
+  TimeTraceScope timeScope("markLive");
+
+  WorkStealingQueue<InputSection *> *queues =
+      new WorkStealingQueue<InputSection *>[poolSize];
+  std::vector<Worker> workers;
+  auto cleanup = make_scope_exit([&] {
+    workers.clear();
+    delete[] queues;
+  });
+
+  workers.reserve(poolSize);
+  for (size_t i = 0; i < poolSize; ++i)
+    workers.push_back(Worker(i, queues));
 
-    // S_ATTR_LIVE_SUPPORT could have marked additional sections live,
-    // which in turn could mark additional S_ATTR_LIVE_SUPPORT sections live.
-    // Iterate. In practice, the second iteration won't mark additional
-    // S_ATTR_LIVE_SUPPORT sections live.
-  } while (!worklist.empty());
+  // Give the entry point to the last queue; chances are it has the least work.
+  workers[poolSize - 1].addSym(config->entry);
+
+  // Populate the queues with the initial GC roots.
+  populateFromSymtab(workers);
+  populateFromSpecialSymbols(workers);
+  populateNoDeadStrips(workers);
+  populateFromInputSections(workers);
+
+  std::vector<std::thread> threads;
+  threads.reserve(poolSize);
+  for (size_t i = 0; i < poolSize; ++i) {
+    threads.emplace_back([&workers, i]() { workers[i].run(); });
  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
 }
 
 } // namespace macho
diff --git a/lld/MachO/WorkStealingQueue.h b/lld/MachO/WorkStealingQueue.h
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.h
@@ -0,0 +1,84 @@
+#ifndef LLD_MACHO_WORK_STEALING_QUEUE_H
+#define LLD_MACHO_WORK_STEALING_QUEUE_H
+
+#include "llvm/ADT/Optional.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Actual data holder for the queue.
+// This is a convenience wrapper around an array of atomics to simplify
+// operations in the WorkStealingQueue class.
+template <typename T> class Node {
+public:
+  // capacity must be a power of two: indexing is done with `& mask`.
+  explicit Node(size_t capacity)
+      : capacity{capacity}, mask{capacity - 1},
+        arr{new std::atomic<T>[capacity]} {}
+
+  ~Node() { delete[] arr; }
+
+  size_t getCapacity() const { return capacity; }
+
+  // Resize to capacity * 2 and copy over all items in the range [head, tail).
+  Node *resize(size_t head, size_t tail);
+
+  // Store item at index i.
+  template <typename E> void store(size_t i, E &&e);
+
+  // Remove the item at index i and return it.
+  T remove(size_t i);
+
+private:
+  size_t capacity;
+  const size_t mask;
+  std::atomic<T> *arr;
+};
+
+} // namespace internal
+
+template <typename T> class WorkStealingQueue {
+public:
+  explicit WorkStealingQueue(size_t capacity = 1024);
+
+  ~WorkStealingQueue();
+
+  size_t getCapacity() const;
+  size_t size() const;
+  bool isEmpty() const;
+
+  // Standard queue operations.
+  // Note: only one thread may push()/pop(), but many threads may steal().
+
+  // Push an item onto the queue.
+  template <typename E> void push(E &&e);
+  // Pop an item from the queue.
+  llvm::Optional<T> pop();
+  // Steal an item from the queue.
+  llvm::Optional<T> steal();
+
+private:
+  std::atomic<size_t> head; // Index of the first item.
+  std::atomic<size_t> tail; // Index one past the last item.
+
+  // Pointer to the actual data array.
+  std::atomic<internal::Node<T> *> node;
+
+  // This holds all the used nodes that were too small (i.e., prior to
+  // resizing). We don't want to pause and delete in the middle of the runs,
+  // so we collect them in this vector and delete them later.
+  // FIXME: may not need this.
+  std::vector<internal::Node<T> *> retired; // Retired nodes.
+};
+
+} // namespace macho
+} // namespace lld
+
+#include "WorkStealingQueue.inc"
+
+#endif // LLD_MACHO_WORK_STEALING_QUEUE_H
diff --git a/lld/MachO/WorkStealingQueue.inc b/lld/MachO/WorkStealingQueue.inc
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.inc
@@ -0,0 +1,123 @@
+#ifndef LLD_MACHO_WORK_STEALING_QUEUE_H
+#error Must be included with header
+#else
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Resize to capacity * 2 and copy over all items in the range [head, tail).
+template <typename T> Node<T> *Node<T>::resize(size_t head, size_t tail) {
+  Node<T> *ptr = new Node<T>{2 * capacity};
+  for (size_t i = head; i < tail; ++i) {
+    ptr->store(i, remove(i));
+  }
+  return ptr;
+}
+
+template <typename T>
+template <typename E>
+void Node<T>::store(size_t i, E &&e) {
+  arr[i & mask].store(std::forward<E>(e), std::memory_order_relaxed);
+}
+
+template <typename T> T Node<T>::remove(size_t i) {
+  return arr[i & mask].load(std::memory_order_relaxed);
+}
+
+} // namespace internal
+
+template <typename T> WorkStealingQueue<T>::WorkStealingQueue(size_t cap) {
+  assert(cap > 0);
+  head.store(0, std::memory_order_relaxed);
+  tail.store(0, std::memory_order_relaxed);
+  node.store(new internal::Node<T>{cap}, std::memory_order_relaxed);
+  retired.reserve(64);
+}
+
+template <typename T> WorkStealingQueue<T>::~WorkStealingQueue() {
+  for (auto *r : retired)
+    delete r;
+  retired.clear();
+  delete node.load();
+}
+
+template <typename T> bool WorkStealingQueue<T>::isEmpty() const {
+  return head.load(std::memory_order_relaxed) >=
+         tail.load(std::memory_order_relaxed);
+}
+
+template <typename T> size_t WorkStealingQueue<T>::size() const {
+  size_t t = tail.load(std::memory_order_relaxed);
+  size_t h = head.load(std::memory_order_relaxed);
+  return t >= h ? t - h : 0;
+}
+
+template <typename T>
+template <typename E>
+void WorkStealingQueue<T>::push(E &&e) {
+  size_t tailIdx = tail.load(std::memory_order_relaxed);
+  size_t headIdx = head.load(std::memory_order_acquire);
+  internal::Node<T> *a = node.load(std::memory_order_relaxed);
+
+  if (a->getCapacity() < tailIdx - headIdx + 1) {
+    // Grow: publish the new node and retire the old one.
+    internal::Node<T> *tmp = a->resize(headIdx, tailIdx);
+    retired.push_back(a);
+    node.store(tmp, std::memory_order_relaxed);
+    a = tmp;
+  }
+
+  a->store(tailIdx, std::forward<E>(e));
+  std::atomic_thread_fence(std::memory_order_release);
+  tail.store(tailIdx + 1, std::memory_order_relaxed);
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::pop() {
+  size_t tailIdx = tail.load(std::memory_order_relaxed) - 1;
+  internal::Node<T> *curNode = node.load(std::memory_order_relaxed);
+  tail.store(tailIdx, std::memory_order_relaxed);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t headIdx = head.load(std::memory_order_relaxed);
+
+  if (headIdx > tailIdx) {
+    tail.store(tailIdx + 1, std::memory_order_relaxed);
+    return llvm::None;
+  }
+
+  llvm::Optional<T> ret = curNode->remove(tailIdx);
+  if (headIdx == tailIdx) {
+    // This was the last item; race against concurrent steal()s for it.
+    if (!head.compare_exchange_strong(headIdx, headIdx + 1,
+                                      std::memory_order_seq_cst,
+                                      std::memory_order_relaxed))
+      ret = llvm::None;
+    tail.store(tailIdx + 1, std::memory_order_relaxed);
+  }
+
+  return ret;
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::steal() {
+  size_t headIdx = head.load(std::memory_order_acquire);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t tailIdx = tail.load(std::memory_order_acquire);
+
+  if (headIdx >= tailIdx)
+    return llvm::None;
+
+  internal::Node<T> *curNode = node.load(std::memory_order_consume);
+  llvm::Optional<T> ret = curNode->remove(headIdx);
+  if (!head.compare_exchange_strong(headIdx, headIdx + 1,
+                                    std::memory_order_seq_cst,
+                                    std::memory_order_relaxed))
+    return llvm::None;
+  return ret;
+}
+
+template <typename T> size_t WorkStealingQueue<T>::getCapacity() const {
+  return node.load(std::memory_order_relaxed)->getCapacity();
+}
+
+} // namespace macho
+} // namespace lld
+
+#endif // LLD_MACHO_WORK_STEALING_QUEUE_H
diff --git a/lld/unittests/CMakeLists.txt b/lld/unittests/CMakeLists.txt
--- a/lld/unittests/CMakeLists.txt
+++ b/lld/unittests/CMakeLists.txt
@@ -14,3 +14,4 @@
 add_subdirectory(DriverTests)
 add_subdirectory(MachOTests)
+add_subdirectory(MachOUnitTests)
diff --git a/lld/unittests/MachOUnitTests/CMakeLists.txt b/lld/unittests/MachOUnitTests/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_lld_unittest(lldMachOUnitTests
+  WorkStealingQueueTest.cpp
+  )
+
+target_link_libraries(lldMachOUnitTests
+  PRIVATE
+  lldMachO
+  lldYAML
+  )
diff --git a/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
@@ -0,0 +1,58 @@
+#include <atomic>
+#include <thread>
+
+#include "../../lld/MachO/WorkStealingQueue.h"
+#include "gtest/gtest.h"
+
+TEST(SmokeTest, Test) {
+  // lld::macho::WorkStealingQueue<int> queue(-1);
+  // expect death???
+}
+
+TEST(FunctionalTest, Test) {
+  const int cap = 999;
+  // Count the number of accesses per item.
+  std::atomic<int> counts[cap];
+  for (int i = 0; i < cap; ++i) {
+    counts[i].store(0, std::memory_order_relaxed);
+  }
+
+  lld::macho::WorkStealingQueue<int> queue;
+
+  // Multiple thieves.
+  auto do_steal = [&]() {
+    while (!queue.isEmpty()) {
+      llvm::Optional<int> item = queue.steal();
+      if (item)
+        counts[*item].fetch_add(1, std::memory_order_relaxed);
+    }
+  };
+
+  // Only one producer.
+  std::thread producer([&]() {
+    for (int i = 0; i < cap; ++i) {
+      queue.push(i);
+    }
+
+    do_steal();
+    do_steal();
+  });
+
+  const int thiefCount = 10;
+  std::thread thieves[thiefCount];
+  for (int i = 0; i < thiefCount; ++i) {
+    thieves[i] = std::thread(do_steal);
+  }
+
+  producer.join();
+  for (auto &thief : thieves)
+    thief.join();
+
+  // Check that every item was counted exactly once.
+  for (int i = 0; i < cap; ++i) {
+    const int value = counts[i].load(std::memory_order_relaxed);
+    EXPECT_EQ(1, value);
+  }
+}
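
Usage sketch (reviewer note, not part of the patch): a minimal single-producer / multi-consumer example of the WorkStealingQueue<T> API declared in WorkStealingQueue.h above. push() and pop() are owner-thread-only; steal() may be called from any thread. The include path, main() harness, and item values are illustrative assumptions; only the queue API itself comes from the diff.

// Minimal sketch, assuming the WorkStealingQueue<T> API from the patch above.
// The include path is hypothetical; adjust to wherever the header lives.
#include "WorkStealingQueue.h"

#include <cstdio>
#include <thread>

int main() {
  lld::macho::WorkStealingQueue<int> queue;

  // Owner thread: push work items onto the back of the queue.
  for (int i = 0; i < 8; ++i)
    queue.push(i);

  // Thief thread: steal from the front concurrently.
  std::thread thief([&queue] {
    while (!queue.isEmpty())
      if (llvm::Optional<int> item = queue.steal())
        std::printf("stole %d\n", *item);
  });

  // Owner drains whatever is left from its own end.
  while (llvm::Optional<int> item = queue.pop())
    std::printf("popped %d\n", *item);

  thief.join();
  return 0;
}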