This is an archive of the discontinued LLVM Phabricator instance.

Differential D130002

tsan: optimize DenseSlabAlloc
ClosedPublic

Authored by dvyukov on Jul 18 2022, 5:20 AM.

Download Raw Diff

Details

Reviewers

melver

Commits

rG6d1f86095de9: tsan: optimize DenseSlabAlloc

Summary

If lots of threads do lots of malloc/free and they overflow the
per-thread DenseSlabAlloc cache, it causes lots of contention:

31.97%  race.old  race.old            [.] __sanitizer::StaticSpinMutex::LockSlow
17.61%  race.old  race.old            [.] __tsan_read4
10.77%  race.old  race.old            [.] __tsan::SlotLock

Optimize DenseSlabAlloc to use a lock-free stack of batches of nodes.
This way we don't take any locks in steady state at all and do only
1 push/pop per Refill/Drain.

Effect on the added benchmark:

$ TIME="%e %U %S %M" time ./test.old 36 5 2000000
34.51 978.22 175.67 5833592
32.53 891.73 167.03 5790036
36.17 1005.54 201.24 5802828
36.94 1004.76 226.58 5803188

$ TIME="%e %U %S %M" time ./test.new 36 5 2000000
26.44 720.99 13.45 5750704
25.92 721.98 13.58 5767764
26.33 725.15 13.41 5777936
25.93 713.49 13.41 5791796

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

dvyukov created this revision. · Jul 18 2022, 5:20 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 18 2022, 5:20 AM

Herald added a subscriber: Enna1. · View Herald Transcript

dvyukov requested review of this revision. · Jul 18 2022, 5:20 AM

Herald added a project: Restricted Project. · View Herald Transcript · Jul 18 2022, 5:20 AM

Herald added a subscriber: Restricted Project. · View Herald Transcript

Harbormaster completed remote builds in B176005: Diff 445463. · Jul 18 2022, 5:49 AM

melver accepted this revision. · Jul 19 2022, 5:02 AM

This revision is now accepted and ready to land. · Jul 19 2022, 5:02 AM

Closed by commit rG6d1f86095de9: tsan: optimize DenseSlabAlloc (authored by dvyukov). · Explain Why · Jul 19 2022, 6:43 AM

This revision was automatically updated to reflect the committed changes.

dvyukov added a commit: rG6d1f86095de9: tsan: optimize DenseSlabAlloc.

Revision Contents

Path

Size

compiler-rt/

lib/

tsan/

rtl/

tsan_dense_alloc.h

115 lines

test/

tsan/

bench_malloc.cpp

22 lines

Diff 445800

compiler-rt/lib/tsan/rtl/tsan_dense_alloc.h

Show First 20 Lines • Show All 79 Lines • ▼ Show 20 Lines	public:

T *Map(IndexT idx) {		T *Map(IndexT idx) {
DCHECK_NE(idx, 0);		DCHECK_NE(idx, 0);
DCHECK_LE(idx, kL1Size * kL2Size);		DCHECK_LE(idx, kL1Size * kL2Size);
return &map_[idx / kL2Size][idx % kL2Size];		return &map_[idx / kL2Size][idx % kL2Size];
}		}

void FlushCache(Cache *c) {		void FlushCache(Cache *c) {
if (!c->pos)		while (c->pos) Drain(c);
return;
SpinMutexLock lock(&mtx_);
while (c->pos) {
IndexT idx = c->cache[--c->pos];
*(IndexT*)Map(idx) = freelist_;
freelist_ = idx;
}
}		}

void InitCache(Cache *c) {		void InitCache(Cache *c) {
c->pos = 0;		c->pos = 0;
internal_memset(c->cache, 0, sizeof(c->cache));		internal_memset(c->cache, 0, sizeof(c->cache));
}		}

uptr AllocatedMemory() const {		uptr AllocatedMemory() const {
return atomic_load_relaxed(&fillpos_) * kL2Size * sizeof(T);		return atomic_load_relaxed(&fillpos_) * kL2Size * sizeof(T);
}		}

template <typename Func>		template <typename Func>
void ForEach(Func func) {		void ForEach(Func func) {
SpinMutexLock lock(&mtx_);		Lock lock(&mtx_);
uptr fillpos = atomic_load_relaxed(&fillpos_);		uptr fillpos = atomic_load_relaxed(&fillpos_);
for (uptr l1 = 0; l1 < fillpos; l1++) {		for (uptr l1 = 0; l1 < fillpos; l1++) {
for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);		for (IndexT l2 = l1 == 0 ? 1 : 0; l2 < kL2Size; l2++) func(&map_[l1][l2]);
}		}
}		}

private:		private:
T *map_[kL1Size];		T *map_[kL1Size];
SpinMutex mtx_;		Mutex mtx_;
IndexT freelist_ = {0};		// The freelist is organized as a lock-free stack of batches of nodes.
		// The stack itself uses Block::next links, while the batch within each
		// stack node uses Block::batch links.
		// Low 32-bits of freelist_ is the node index, top 32-bits is ABA-counter.
		atomic_uint64_t freelist_ = {0};
atomic_uintptr_t fillpos_ = {0};		atomic_uintptr_t fillpos_ = {0};
const char *const name_;		const char *const name_;

void Refill(Cache *c) {		struct Block {
SpinMutexLock lock(&mtx_);		IndexT next;
if (freelist_ == 0) {		IndexT batch;
		};

		Block *MapBlock(IndexT idx) { return reinterpret_cast<Block *>(Map(idx)); }

		static constexpr u64 kCounterInc = 1ull << 32;
		static constexpr u64 kCounterMask = ~(kCounterInc - 1);

		NOINLINE void Refill(Cache *c) {
		// Pop 1 batch of nodes from the freelist.
		IndexT idx;
		u64 xchg;
		u64 cmp = atomic_load(&freelist_, memory_order_acquire);
		do {
		idx = static_cast<IndexT>(cmp);
		if (!idx)
		return AllocSuperBlock(c);
		Block *ptr = MapBlock(idx);
		xchg = ptr->next | (cmp & kCounterMask);
		} while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
		memory_order_acq_rel));
		// Unpack it into c->cache.
		while (idx) {
		c->cache[c->pos++] = idx;
		idx = MapBlock(idx)->batch;
		}
		}

		NOINLINE void Drain(Cache *c) {
		// Build a batch of at most Cache::kSize / 2 nodes linked by Block::batch.
		IndexT head_idx = 0;
		for (uptr i = 0; i < Cache::kSize / 2 && c->pos; i++) {
		IndexT idx = c->cache[--c->pos];
		Block *ptr = MapBlock(idx);
		ptr->batch = head_idx;
		head_idx = idx;
		}
		// Push it onto the freelist stack.
		Block *head = MapBlock(head_idx);
		u64 xchg;
		u64 cmp = atomic_load(&freelist_, memory_order_acquire);
		do {
		head->next = static_cast<IndexT>(cmp);
		xchg = head_idx | (cmp & kCounterMask) + kCounterInc;
		} while (!atomic_compare_exchange_weak(&freelist_, &cmp, xchg,
		memory_order_acq_rel));
		}

		NOINLINE void AllocSuperBlock(Cache *c) {
		Lock lock(&mtx_);
uptr fillpos = atomic_load_relaxed(&fillpos_);		uptr fillpos = atomic_load_relaxed(&fillpos_);
if (fillpos == kL1Size) {		if (fillpos == kL1Size) {
Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n",		Printf("ThreadSanitizer: %s overflow (%zu*%zu). Dying.\n", name_, kL1Size,
name_, kL1Size, kL2Size);		kL2Size);
Die();		Die();
}		}
VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,		VPrintf(2, "ThreadSanitizer: growing %s: %zu out of %zu*%zu\n", name_,
fillpos, kL1Size, kL2Size);		fillpos, kL1Size, kL2Size);
T *batch = (T*)MmapOrDie(kL2Size * sizeof(T), name_);		T *batch = (T *)MmapOrDie(kL2Size * sizeof(T), name_);
		map_[fillpos] = batch;
// Reserve 0 as invalid index.		// Reserve 0 as invalid index.
IndexT start = fillpos == 0 ? 1 : 0;		for (IndexT i = fillpos ? 0 : 1; i < kL2Size; i++) {
for (IndexT i = start; i < kL2Size; i++) {
new(batch + i) T;		new (batch + i) T;
*(IndexT *)(batch + i) = i + 1 + fillpos * kL2Size;		c->cache[c->pos++] = i + fillpos * kL2Size;
		if (c->pos == Cache::kSize)
		Drain(c);
}		}
*(IndexT*)(batch + kL2Size - 1) = 0;
freelist_ = fillpos * kL2Size + start;
map_[fillpos] = batch;
atomic_store_relaxed(&fillpos_, fillpos + 1);		atomic_store_relaxed(&fillpos_, fillpos + 1);
}		CHECK(c->pos);
for (uptr i = 0; i < Cache::kSize / 2 && freelist_ != 0; i++) {
IndexT idx = freelist_;
c->cache[c->pos++] = idx;
freelist_ = *(IndexT*)Map(idx);
}
}

void Drain(Cache *c) {
SpinMutexLock lock(&mtx_);
for (uptr i = 0; i < Cache::kSize / 2; i++) {
IndexT idx = c->cache[--c->pos];
*(IndexT*)Map(idx) = freelist_;
freelist_ = idx;
}
}		}
};		};

} // namespace __tsan		} // namespace __tsan

#endif // TSAN_DENSE_ALLOC_H		#endif // TSAN_DENSE_ALLOC_H

compiler-rt/test/tsan/bench_malloc.cpp

This file was added.

				// RUN: %clangxx_tsan %s -o %t
				// RUN: %run %t 2>&1 | FileCheck %s

				// bench.h needs pthread barriers which are not available on OS X
				// UNSUPPORTED: darwin

				#include "bench.h"

				// Benchmark worker body. Repeats bench_niter times: malloc bench_mode
				// blocks of 8 bytes each, then free every one of them.
				// `tid` is accepted but not used by this function.
				// NOTE(review): bench_mode / bench_niter are expected to come from bench.h.
				void thread(int tid) {
				  // Scratch table holding the pointers allocated in the current iteration.
				  // It must be an array of `void *` (not `void`) so each slot can store a
				  // malloc result.
				  void **blocks = new void *[bench_mode];
				  for (int i = 0; i < bench_niter; i++) {
				    // Allocate the whole batch first, then release it, so that bench_mode
				    // blocks are live at the same time.
				    for (int j = 0; j < bench_mode; j++)
				      blocks[j] = malloc(8);
				    for (int j = 0; j < bench_mode; j++)
				      free(blocks[j]);
				  }
				  delete[] blocks;
				}

				// Hands the `thread` worker to start_thread_group together with the
				// bench_nthread setting.
				// NOTE(review): start_thread_group and bench_nthread are presumably
				// provided by bench.h, which also presumably calls bench() — confirm.
				void bench() {
				  start_thread_group(bench_nthread, thread);
				}

				// CHECK: DONE