diff --git a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
--- a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
@@ -85,14 +85,7 @@
   }
 
   void FlushCache(Cache *c) {
-    if (!c->pos)
-      return;
-    SpinMutexLock lock(&mtx_);
-    while (c->pos) {
-      IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
-    }
+    while (c->pos) Drain(c);
   }
 
   void InitCache(Cache *c) {
@@ -106,7 +99,7 @@
 
   template <typename Func>
   void ForEach(Func func) {
-    SpinMutexLock lock(&mtx_);
+    Lock lock(&mtx_);
     uptr fillpos = atomic_load_relaxed(&fillpos_);
     for (uptr l1 = 0; l1 < fillpos; l1++) {
       for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
@@ -115,48 +108,86 @@
 
  private:
   T *map_[kL1Size];
-  SpinMutex mtx_;
-  IndexT freelist_ = {0};
+  Mutex mtx_;
+  // The freelist is organized as a lock-free stack of batches of nodes.
+  // The stack itself uses Block::next links, while the batch within each
+  // stack node uses Block::batch links.
+  // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+  atomic_uint64_t freelist_ = {0};
   atomic_uintptr_t fillpos_ = {0};
   const char *const name_;
 
-  void Refill(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    if (freelist_ == 0) {
-      uptr fillpos = atomic_load_relaxed(&fillpos_);
-      if (fillpos == kL1Size) {
-        Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
-            name_, kL1Size, kL2Size);
-        Die();
-      }
-      VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
-          fillpos, kL1Size, kL2Size);
-      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
-      // Reserve 0 as invalid index.
-      IndexT start = fillpos == 0 ? 1 : 0;
-      for (IndexT i = start; i < kL2Size; i++) {
-        new(batch + i) T;
-        *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
-      }
-      *(IndexT*)(batch + kL2Size - 1) = 0;
-      freelist_ = fillpos * kL2Size + start;
-      map_[fillpos] = batch;
-      atomic_store_relaxed(&fillpos_, fillpos + 1);
-    }
-    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
-      IndexT idx = freelist_;
+  struct Block {
+    IndexT next;
+    IndexT batch;
+  };
+
+  Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+  static constexpr u64 kCounterInc = 1ull << 32;
+  static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+  NOINLINE void Refill(Cache *c) {
+    // Pop 1 batch of nodes from the freelist.
+    IndexT idx;
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      idx = static_cast<IndexT>(cmp);
+      if (!idx)
+        return AllocSuperBlock(c);
+      Block *ptr = MapBlock(idx);
+      xchg = ptr->next | (cmp & kCounterMask);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+    // Unpack it into c->cache.
+    while (idx) {
       c->cache[c->pos++] = idx;
-      freelist_ = *(IndexT*)Map(idx);
+      idx = MapBlock(idx)->batch;
     }
   }
 
-  void Drain(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+  NOINLINE void Drain(Cache *c) {
+    // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+    IndexT head_idx = 0;
+    for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
       IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
+      Block *ptr = MapBlock(idx);
+      ptr->batch = head_idx;
+      head_idx = idx;
+    }
+    // Push it onto the freelist stack.
+    Block *head = MapBlock(head_idx);
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      head->next = static_cast<IndexT>(cmp);
+      xchg = head_idx | (cmp & kCounterMask) + kCounterInc;
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+  }
+
+  NOINLINE void AllocSuperBlock(Cache *c) {
+    Lock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    if (fillpos == kL1Size) {
+      Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+             kL2Size);
+      Die();
+    }
+    VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+            fillpos, kL1Size, kL2Size);
+    T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+    map_[fillpos] = batch;
+    // Reserve 0 as invalid index.
+    for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+      new (batch + i) T;
+      c->cache[c->pos++] = i + fillpos * kL2Size;
+      if (c->pos == Cache::kSize)
+        Drain(c);
     }
+    atomic_store_relaxed(&fillpos_, fillpos + 1);
+    CHECK(c->pos);
   }
 };
 
diff --git a/compiler-rt/test/tsan/bench_malloc.cpp b/compiler-rt/test/tsan/bench_malloc.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/tsan/bench_malloc.cpp
@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+  void **blocks = new void *[bench_mode];
+  for (int i = 0; i < bench_niter; i++) {
+    for (int j = 0; j < bench_mode; j++)
+      blocks[j] = malloc(8);
+    for (int j = 0; j < bench_mode; j++)
+      free(blocks[j]);
+  }
+  delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE
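For readers unfamiliar with the freelist scheme in the patch, below is a minimal standalone sketch (not the TSan code itself) of the core technique: a Treiber-style lock-free stack that stores 32-bit node indices instead of pointers and packs a 32-bit ABA counter into the upper half of the same 64-bit atomic word, so a single compare-exchange updates both the head index and the counter. All names here (IndexStack, Node, nodes[], kNumNodes) are invented for illustration, and the sketch pushes single nodes; the real allocator additionally links nodes into batches via Block::batch so a whole cache refill or drain costs one compare-exchange.

// Hypothetical illustration of an index-based lock-free stack with an
// ABA counter, roughly mirroring the freelist_ manipulation above.
#include <atomic>
#include <cstdint>
#include <cstdio>

struct Node {
  uint32_t next;  // index of the next node on the stack; 0 means "end"
};

constexpr uint32_t kNumNodes = 1024;
Node nodes[kNumNodes];  // index 0 is reserved as the invalid/null index

class IndexStack {
  // Low 32 bits: index of the top node. High 32 bits: ABA counter,
  // bumped on every push so a pop cannot succeed against a stale head
  // that was popped and pushed back in the meantime.
  std::atomic<uint64_t> head_{0};
  static constexpr uint64_t kCounterInc = 1ull << 32;
  static constexpr uint64_t kCounterMask = ~(kCounterInc - 1);

 public:
  void Push(uint32_t idx) {
    uint64_t cmp = head_.load(std::memory_order_acquire);
    uint64_t xchg;
    do {
      nodes[idx].next = static_cast<uint32_t>(cmp);  // link to current top
      xchg = idx | ((cmp & kCounterMask) + kCounterInc);
    } while (!head_.compare_exchange_weak(cmp, xchg,
                                          std::memory_order_acq_rel));
  }

  uint32_t Pop() {
    uint64_t cmp = head_.load(std::memory_order_acquire);
    uint64_t xchg;
    uint32_t idx;
    do {
      idx = static_cast<uint32_t>(cmp);
      if (!idx)
        return 0;  // stack is empty
      xchg = nodes[idx].next | (cmp & kCounterMask);  // counter unchanged
    } while (!head_.compare_exchange_weak(cmp, xchg,
                                          std::memory_order_acq_rel));
    return idx;
  }
};

int main() {
  IndexStack stack;
  for (uint32_t i = 1; i < 8; i++)  // skip index 0, it means "empty"
    stack.Push(i);
  for (uint32_t idx; (idx = stack.Pop()) != 0;)
    printf("popped %u\n", idx);
}

The counter increment on push is what defeats the classic ABA problem: even if another thread pops the head and later pushes the same index back between our load and compare-exchange, the counter half of the word has changed, so the stale exchange fails and the loop retries with a fresh head.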