diff --git a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
--- a/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h
@@ -85,14 +85,7 @@
   }
 
   void FlushCache(Cache *c) {
-    if (!c->pos)
-      return;
-    SpinMutexLock lock(&mtx_);
-    while (c->pos) {
-      IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
-    }
+    while (c->pos) Drain(c);
   }
 
   void InitCache(Cache *c) {
@@ -106,7 +99,7 @@
 
   template <typename Func>
   void ForEach(Func func) {
-    SpinMutexLock lock(&mtx_);
+    Lock lock(&mtx_);
     uptr fillpos = atomic_load_relaxed(&fillpos_);
     for (uptr l1 = 0; l1 < fillpos; l1++) {
       for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
@@ -115,48 +108,86 @@
 
  private:
   T *map_[kL1Size];
-  SpinMutex mtx_;
-  IndexT freelist_ = {0};
+  Mutex mtx_;
+  // The freelist is organized as a lock-free stack of batches of nodes.
+  // The stack itself uses Block::next links, while the batch within each
+  // stack node uses Block::batch links.
+  // Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
+  atomic_uint64_t freelist_ = {0};
   atomic_uintptr_t fillpos_ = {0};
   const char *const name_;
 
-  void Refill(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    if (freelist_ == 0) {
-      uptr fillpos = atomic_load_relaxed(&fillpos_);
-      if (fillpos == kL1Size) {
-        Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",
-            name_, kL1Size, kL2Size);
-        Die();
-      }
-      VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
-          fillpos, kL1Size, kL2Size);
-      T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);
-      // Reserve 0 as invalid index.
-      IndexT start = fillpos == 0 ? 1 : 0;
-      for (IndexT i = start; i < kL2Size; i++) {
-        new(batch + i) T;
-        *(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;
-      }
-      *(IndexT*)(batch + kL2Size - 1) = 0;
-      freelist_ = fillpos * kL2Size + start;
-      map_[fillpos] = batch;
-      atomic_store_relaxed(&fillpos_, fillpos + 1);
-    }
-    for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
-      IndexT idx = freelist_;
+  struct Block {
+    IndexT next;
+    IndexT batch;
+  };
+
+  Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }
+
+  static constexpr u64 kCounterInc = 1ull << 32;
+  static constexpr u64 kCounterMask = ~(kCounterInc - 1);
+
+  NOINLINE void Refill(Cache *c) {
+    // Pop 1 batch of nodes from the freelist.
+    IndexT idx;
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      idx = static_cast<IndexT>(cmp);
+      if (!idx)
+        return AllocSuperBlock(c);
+      Block *ptr = MapBlock(idx);
+      xchg = ptr->next | (cmp & kCounterMask);
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+    // Unpack it into c->cache.
+    while (idx) {
       c->cache[c->pos++] = idx;
-      freelist_ = *(IndexT*)Map(idx);
+      idx = MapBlock(idx)->batch;
     }
   }
 
-  void Drain(Cache *c) {
-    SpinMutexLock lock(&mtx_);
-    for (uptr i = 0; i < Cache::kSize / 2; i++) {
+  NOINLINE void Drain(Cache *c) {
+    // Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
+    IndexT head_idx = 0;
+    for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
       IndexT idx = c->cache[--c->pos];
-      *(IndexT*)Map(idx) = freelist_;
-      freelist_ = idx;
+      Block *ptr = MapBlock(idx);
+      ptr->batch = head_idx;
+      head_idx = idx;
+    }
+    // Push it onto the freelist stack.
+    Block *head = MapBlock(head_idx);
+    u64 xchg;
+    u64 cmp = atomic_load(&freelist_, memory_order_acquire);
+    do {
+      head->next = static_cast<IndexT>(cmp);
+      xchg = head_idx | (cmp & kCounterMask) + kCounterInc;
+    } while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
+                                           memory_order_acq_rel));
+  }
+
+  NOINLINE void AllocSuperBlock(Cache *c) {
+    Lock lock(&mtx_);
+    uptr fillpos = atomic_load_relaxed(&fillpos_);
+    if (fillpos == kL1Size) {
+      Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
+             kL2Size);
+      Die();
+    }
+    VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
+            fillpos, kL1Size, kL2Size);
+    T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
+    map_[fillpos] = batch;
+    // Reserve 0 as invalid index.
+    for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
+      new (batch + i) T;
+      c->cache[c->pos++] = i + fillpos * kL2Size;
+      if (c->pos == Cache::kSize)
+        Drain(c);
     }
+    atomic_store_relaxed(&fillpos_, fillpos + 1);
+    CHECK(c->pos);
   }
 };
 
diff --git a/compiler-rt/test/tsan/bench_malloc.cpp b/compiler-rt/test/tsan/bench_malloc.cpp
new file mode 100644
--- /dev/null
+++ b/compiler-rt/test/tsan/bench_malloc.cpp
@@ -0,0 +1,22 @@
+// RUN: %clangxx_tsan %s -o %t
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// bench.h needs pthread barriers which are not available on OS X
+// UNSUPPORTED: darwin
+
+#include "bench.h"
+
+void thread(int tid) {
+  void **blocks = new void *[bench_mode];
+  for (int i = 0; i < bench_niter; i++) {
+    for (int j = 0; j < bench_mode; j++)
+      blocks[j] = malloc(8);
+    for (int j = 0; j < bench_mode; j++)
+      free(blocks[j]);
+  }
+  delete[] blocks;
+}
+
+void bench() { start_thread_group(bench_nthread, thread); }
+
+// CHECK: DONE
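For readers unfamiliar with the freelist scheme in the patch, below is a minimal standalone sketch (not the TSan code itself) of the core technique: a Treiber-style lock-free stack that stores 32-bit node indices instead of pointers and packs a 32-bit ABA counter into the upper half of the same 64-bit atomic word, so a single compare-exchange updates both the head index and the counter. All names here (IndexStack, Node, nodes[], kNumNodes) are invented for illustration, and the sketch pushes single nodes; the real allocator additionally links nodes into batches via Block::batch so a whole cache refill or drain costs one compare-exchange.

// Hypothetical illustration of an index-based lock-free stack with an
// ABA counter, roughly mirroring the freelist_ manipulation above.
#include <atomic>
#include <cstdint>
#include <cstdio>

struct Node {
  uint32_t next;  // index of the next node on the stack; 0 means "end"
};

constexpr uint32_t kNumNodes = 1024;
Node nodes[kNumNodes];  // index 0 is reserved as the invalid/null index

class IndexStack {
  // Low 32 bits: index of the top node. High 32 bits: ABA counter,
  // bumped on every push so a pop cannot succeed against a stale head
  // that was popped and pushed back in the meantime.
  std::atomic<uint64_t> head_{0};
  static constexpr uint64_t kCounterInc = 1ull << 32;
  static constexpr uint64_t kCounterMask = ~(kCounterInc - 1);

 public:
  void Push(uint32_t idx) {
    uint64_t cmp = head_.load(std::memory_order_acquire);
    uint64_t xchg;
    do {
      nodes[idx].next = static_cast<uint32_t>(cmp);  // link to current top
      xchg = idx | ((cmp & kCounterMask) + kCounterInc);
    } while (!head_.compare_exchange_weak(cmp, xchg,
                                          std::memory_order_acq_rel));
  }

  uint32_t Pop() {
    uint64_t cmp = head_.load(std::memory_order_acquire);
    uint64_t xchg;
    uint32_t idx;
    do {
      idx = static_cast<uint32_t>(cmp);
      if (!idx)
        return 0;  // stack is empty
      xchg = nodes[idx].next | (cmp & kCounterMask);  // counter unchanged
    } while (!head_.compare_exchange_weak(cmp, xchg,
                                          std::memory_order_acq_rel));
    return idx;
  }
};

int main() {
  IndexStack stack;
  for (uint32_t i = 1; i < 8; i++)  // skip index 0, it means "empty"
    stack.Push(i);
  for (uint32_t idx; (idx = stack.Pop()) != 0;)
    printf("popped %u\n", idx);
}

The counter increment on push is what defeats the classic ABA problem: even if another thread pops the head and later pushes the same index back between our load and compare-exchange, the counter half of the word has changed, so the stale exchange fails and the loop retries with a fresh head.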