Index: lib/tsan/rtl/tsan_clock.h
===================================================================
--- lib/tsan/rtl/tsan_clock.h
+++ lib/tsan/rtl/tsan_clock.h
@@ -18,25 +18,6 @@
 
 namespace __tsan {
 
-struct ClockElem {
-  u64 epoch : kClkBits;
-  u64 reused : 64 - kClkBits;
-};
-
-struct ClockBlock {
-  static const uptr kSize = 512;
-  static const uptr kTableSize = kSize / sizeof(u32);
-  static const uptr kClockCount = kSize / sizeof(ClockElem);
-
-  union {
-    u32 table[kTableSize];
-    ClockElem clock[kClockCount];
-  };
-
-  ClockBlock() {
-  }
-};
-
 typedef DenseSlabAlloc<ClockBlock, 1<<16> ClockAlloc;
 typedef DenseSlabAllocCache ClockCache;
 
@@ -46,69 +27,117 @@
   SyncClock();
   ~SyncClock();
 
-  uptr size() const {
-    return size_;
-  }
-
-  u64 get(unsigned tid) const {
-    return elem(tid).epoch;
-  }
+  uptr size() const;
+
+  // These are used only in tests.
+  u64 get(unsigned tid) const;
+  u64 get_clean(unsigned tid) const;
 
   void Resize(ClockCache *c, uptr nclk);
   void Reset(ClockCache *c);
 
   void DebugDump(int(*printf)(const char *s, ...));
 
+  // Clock element iterator.
+  // Note: it iterates only over the table without regard to dirty entries.
+  class Iter {
+   public:
+    explicit Iter(SyncClock* parent);
+    Iter& operator++();
+    bool operator!=(const Iter& other);
+    ClockElem &operator*();
+
+   private:
+    SyncClock *parent_;
+    // [pos_, end_) is the current continuous range of clock elements.
+    ClockElem *pos_;
+    ClockElem *end_;
+    int block_;  // Current second level block.
+
+    NOINLINE void Next();
+  };
+
+  Iter begin();
+  Iter end();
+
  private:
-  friend struct ThreadClock;
+  friend class ThreadClock;
+  friend class Iter;
   static const uptr kDirtyTids = 2;
+
+  struct Dirty {
+    u64 epoch : kClkBits;
+    u64 tid : 64 - kClkBits;  // kInvalidTid if not active
+  };
+
   unsigned release_store_tid_;
   unsigned release_store_reused_;
-  unsigned dirty_tids_[kDirtyTids];
-  // tab_ contains indirect pointer to a 512b block using DenseSlabAlloc.
-  // If size_ <= 64, then tab_ points to an array with 64 ClockElem's.
-  // Otherwise, tab_ points to an array with 128 u32 elements,
+  Dirty dirty_[kDirtyTids];
+  // If size_ is 0, tab_ is nullptr.
+  // If size_ <= 63, tab_ points directly to an array of ClockElem's
+  // (ClockBlock::clock).
+  // Otherwise, tab_ points to an array with up to 127 u32 elements,
   // each pointing to the second-level 512b block with 64 ClockElem's.
+  // Unused space in the first level ClockBlock is used to store additional
+  // clock elements.
+  // The last u32 element in the first level ClockBlock is always used as
+  // reference counter.
+  //
+  // See the following scheme for details.
+  // All memory blocks are 512 bytes (allocated from ClockAlloc).
+  // Clock (clk) elements are 64 bits.
+  // Idx and ref are 32 bits.
+  //
+  // tab_
+  //  |
+  //  \/
+  //  +----------------------------------------------------+
+  //  | clk128 | clk129 | ...unused... | idx1 | idx0 | ref  |
+  //  +----------------------------------------------------+
+  //                                     |      |
+  //                                     |      \/
+  //                                     |      +----------------+
+  //                                     |      | clk0 ... clk63 |
+  //                                     |      +----------------+
+  //                                     \/
+  //                                     +------------------+
+  //                                     | clk64 ... clk127 |
+  //                                     +------------------+
+  //
+  // Note: dirty entries, if active, always override what's stored in the clock.
  ClockBlock *tab_;
   u32 tab_idx_;
-  u32 size_;
+  u16 size_;
+  u16 blocks_;  // Number of second level blocks.
 
+  void Unshare(ClockCache *c);
+  bool IsShared() const;
+  bool Cachable() const;
   void ResetImpl();
+  void FlushDirty();
+  uptr capacity() const;
+  u32 get_block(uptr bi) const;
+  void append_block(u32 idx);
   ClockElem &elem(unsigned tid) const;
 };
 
 // The clock that lives in threads.
-struct ThreadClock {
+class ThreadClock {
  public:
   typedef DenseSlabAllocCache Cache;
 
   explicit ThreadClock(unsigned tid, unsigned reused = 0);
 
-  u64 get(unsigned tid) const {
-    DCHECK_LT(tid, kMaxTidInClock);
-    return clk_[tid].epoch;
-  }
-
+  u64 get(unsigned tid) const;
   void set(ClockCache *c, unsigned tid, u64 v);
+  void set(u64 v);
+  void tick();
+  uptr size() const;
 
-  void set(u64 v) {
-    DCHECK_GE(v, clk_[tid_].epoch);
-    clk_[tid_].epoch = v;
-  }
-
-  void tick() {
-    clk_[tid_].epoch++;
-  }
-
-  uptr size() const {
-    return nclk_;
-  }
-
-  void acquire(ClockCache *c, const SyncClock *src);
-  void release(ClockCache *c, SyncClock *dst) const;
+  void acquire(ClockCache *c, SyncClock *src);
+  void release(ClockCache *c, SyncClock *dst);
   void acq_rel(ClockCache *c, SyncClock *dst);
-  void ReleaseStore(ClockCache *c, SyncClock *dst) const;
+  void ReleaseStore(ClockCache *c, SyncClock *dst);
   void ResetCached(ClockCache *c);
 
   void DebugReset();
@@ -116,16 +145,82 @@
 
  private:
   static const uptr kDirtyTids = SyncClock::kDirtyTids;
+  // Index of the thread associated with the clock ("current thread").
   const unsigned tid_;
-  const unsigned reused_;
+  const unsigned reused_;  // tid_ reuse count.
+  // Current thread time when it acquired something from other threads.
   u64 last_acquire_;
+
+  // Cached SyncClock (without dirty entries and release_store_tid_).
+  // We reuse it for subsequent store-release operations without intervening
+  // acquire operations. Since it is shared (and thus constant), clock value
+  // for the current thread is then stored in dirty entries in the SyncClock.
+  // We hold a reference to the table while it is cached here.
+  u32 cached_idx_;
+  u16 cached_size_;
+  u16 cached_blocks_;
+
+  // Number of active elements in the clk_ table (the rest is zeros).
   uptr nclk_;
-  ClockElem clk_[kMaxTidInClock];
+  u64 clk_[kMaxTidInClock];  // Fixed size vector clock.
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
-  void UpdateCurrentThread(SyncClock *dst) const;
+  void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
 };
 
+ALWAYS_INLINE u64 ThreadClock::get(unsigned tid) const {
+  DCHECK_LT(tid, kMaxTidInClock);
+  return clk_[tid];
+}
+
+ALWAYS_INLINE void ThreadClock::set(u64 v) {
+  DCHECK_GE(v, clk_[tid_]);
+  clk_[tid_] = v;
+}
+
+ALWAYS_INLINE void ThreadClock::tick() {
+  clk_[tid_]++;
+}
+
+ALWAYS_INLINE uptr ThreadClock::size() const {
+  return nclk_;
+}
+
+ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
+  return Iter(this);
+}
+
+ALWAYS_INLINE SyncClock::Iter SyncClock::end() {
+  return Iter(nullptr);
+}
+
+ALWAYS_INLINE uptr SyncClock::size() const {
+  return size_;
+}
+
+ALWAYS_INLINE SyncClock::Iter::Iter(SyncClock* parent)
+    : parent_(parent)
+    , pos_(nullptr)
+    , end_(nullptr)
+    , block_(-1) {
+  if (parent)
+    Next();
+}
+
+ALWAYS_INLINE SyncClock::Iter& SyncClock::Iter::operator++() {
+  pos_++;
+  if (UNLIKELY(pos_ >= end_))
+    Next();
+  return *this;
+}
+
+ALWAYS_INLINE bool SyncClock::Iter::operator!=(const SyncClock::Iter& other) {
+  return parent_ != other.parent_;
+}
+
+ALWAYS_INLINE ClockElem &SyncClock::Iter::operator*() {
+  return *pos_;
+}
 }  // namespace __tsan
 
 #endif  // TSAN_CLOCK_H
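With begin()/end() defined above, a SyncClock can be walked with range-for. The helper below is an editorial sketch of the intended traversal, not part of the patch (it assumes we are inside namespace __tsan with the sanitizer u64/uptr typedefs in scope); note that the iterator walks only the table, so active dirty entries are not reflected in what it yields:

// Illustrative only: copy every epoch out of a SyncClock.
// This mirrors the traversal release()/ReleaseStore() use in tsan_clock.cc.
void CopyEpochs(SyncClock &sc, u64 *out) {
  uptr i = 0;
  for (ClockElem &ce : sc)  // second level blocks first, then the first level
    out[i++] = ce.epoch;
}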
Index: lib/tsan/rtl/tsan_clock.cc
===================================================================
--- lib/tsan/rtl/tsan_clock.cc
+++ lib/tsan/rtl/tsan_clock.cc
@@ -61,20 +61,13 @@
 // an exclusive lock; ThreadClock's are private to respective threads and so
 // do not need any protection.
 //
-// Description of ThreadClock state:
-// clk_ - fixed size vector clock.
-// nclk_ - effective size of the vector clock (the rest is zeros).
-// tid_ - index of the thread associated with he clock ("current thread").
-// last_acquire_ - current thread time when it acquired something from
-//   other threads.
-//
 // Description of SyncClock state:
 // clk_ - variable size vector clock, low kClkBits hold timestamp,
 //   the remaining bits hold "acquired" flag (the actual value is thread's
 //   reused counter);
 //   if acquired == thr->reused_, then the respective thread has already
-//   acquired this clock (except possibly dirty_tids_).
-// dirty_tids_ - holds up to two indeces in the vector clock that other threads
+//   acquired this clock (except possibly for dirty elements).
+// dirty_ - holds up to two indices in the vector clock that other threads
 //   need to acquire regardless of "acquired" flag value;
 // release_store_tid_ - denotes that the clock state is a result of
 //   release-store operation by the thread with release_store_tid_ index.
@@ -90,21 +83,51 @@
 
 namespace __tsan {
 
+static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
+  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
+}
+
+// Drop reference to the first level block idx.
+static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  atomic_uint32_t *ref = ref_ptr(cb);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  for (;;) {
+    CHECK_GT(v, 0);
+    if (v == 1)
+      break;
+    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
+      return;
+  }
+  // The first level block owns the second level blocks, so free them as well.
+  for (uptr i = 0; i < blocks; i++)
+    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
+  ctx->clock_alloc.Free(c, idx);
+}
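The loop above is the usual "drop a reference, free on the last one" idiom, written with sanitizer atomics. A standalone model with std::atomic (editorial sketch only; UnrefClockBlock additionally frees the second level blocks owned by the first level block):

#include <atomic>
#include <cstdint>

// Returns true when the caller held the last reference and must free the
// block(s); returns false when it merely dropped one of several references.
bool DropRef(std::atomic<uint32_t> *ref) {
  uint32_t v = ref->load(std::memory_order_acquire);
  for (;;) {
    if (v == 1)
      return true;  // sole owner: no CAS needed, the caller frees
    // On CAS failure, v is reloaded and the loop re-examines the fresh value.
    if (ref->compare_exchange_strong(v, v - 1, std::memory_order_acq_rel))
      return false;
  }
}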
 
 ThreadClock::ThreadClock(unsigned tid, unsigned reused)
     : tid_(tid)
-    , reused_(reused + 1) {  // 0 has special meaning
+    , reused_(reused + 1)  // 0 has special meaning
+    , cached_idx_()
+    , cached_size_()
+    , cached_blocks_() {
   CHECK_LT(tid, kMaxTidInClock);
   CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
   nclk_ = tid_ + 1;
   last_acquire_ = 0;
   internal_memset(clk_, 0, sizeof(clk_));
-  clk_[tid_].reused = reused_;
 }
 
 void ThreadClock::ResetCached(ClockCache *c) {
+  if (cached_idx_) {
+    UnrefClockBlock(c, cached_idx_, cached_blocks_);
+    cached_idx_ = 0;
+    cached_size_ = 0;
+    cached_blocks_ = 0;
+  }
 }
 
-void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
+void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(src->size_, kMaxTid);
   CPP_STAT_INC(StatClockAcquire);
@@ -116,50 +139,46 @@
     return;
   }
 
-  // Check if we've already acquired src after the last release operation on src
   bool acquired = false;
-  if (nclk > tid_) {
-    if (src->elem(tid_).reused == reused_) {
-      for (unsigned i = 0; i < kDirtyTids; i++) {
-        unsigned tid = src->dirty_tids_[i];
-        if (tid != kInvalidTid) {
-          u64 epoch = src->elem(tid).epoch;
-          if (clk_[tid].epoch < epoch) {
-            clk_[tid].epoch = epoch;
-            acquired = true;
-          }
-        }
-      }
-      if (acquired) {
-        CPP_STAT_INC(StatClockAcquiredSomething);
-        last_acquire_ = clk_[tid_].epoch;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty dirty = src->dirty_[i];
+    unsigned tid = dirty.tid;
+    if (tid != kInvalidTid) {
+      if (clk_[tid] < dirty.epoch) {
+        clk_[tid] = dirty.epoch;
+        acquired = true;
       }
-      return;
     }
   }
 
-  // O(N) acquire.
-  CPP_STAT_INC(StatClockAcquireFull);
-  nclk_ = max(nclk_, nclk);
-  for (uptr i = 0; i < nclk; i++) {
-    u64 epoch = src->elem(i).epoch;
-    if (clk_[i].epoch < epoch) {
-      clk_[i].epoch = epoch;
-      acquired = true;
+  // Check if we've already acquired src after the last release operation on src
+  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
+    // O(N) acquire.
+    CPP_STAT_INC(StatClockAcquireFull);
+    nclk_ = max(nclk_, nclk);
+    u64 *dst_pos = &clk_[0];
+    for (ClockElem &src_elem : *src) {
+      u64 epoch = src_elem.epoch;
+      if (*dst_pos < epoch) {
+        *dst_pos = epoch;
+        acquired = true;
+      }
+      dst_pos++;
    }
-  }
 
-  // Remember that this thread has acquired this clock.
-  if (nclk > tid_)
-    src->elem(tid_).reused = reused_;
+    // Remember that this thread has acquired this clock.
+    if (nclk > tid_)
+      src->elem(tid_).reused = reused_;
+  }
 
   if (acquired) {
     CPP_STAT_INC(StatClockAcquiredSomething);
-    last_acquire_ = clk_[tid_].epoch;
+    last_acquire_ = clk_[tid_];
+    ResetCached(c);
   }
 }
 
-void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
+void ThreadClock::release(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
 
@@ -179,7 +198,7 @@
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
   if (dst->elem(tid_).epoch > last_acquire_) {
-    UpdateCurrentThread(dst);
+    UpdateCurrentThread(c, dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
       dst->release_store_tid_ = kInvalidTid;
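Editorial aside between the hunks: both O(N) paths, the full acquire above and the full release below, compute a pointwise join of two vector clocks. A minimal model of the operation, assuming plain u64 epoch arrays (illustrative, not part of the patch):

// dst := max(dst, src), elementwise. The return value is what acquire()
// uses to decide whether to advance last_acquire_.
bool Join(u64 *dst, const u64 *src, uptr n) {
  bool changed = false;
  for (uptr i = 0; i < n; i++) {
    if (dst[i] < src[i]) {
      dst[i] = src[i];
      changed = true;
    }
  }
  return changed;
}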
@@ -188,23 +207,24 @@
 
   // O(N) release.
   CPP_STAT_INC(StatClockReleaseFull);
+  dst->Unshare(c);
   // First, remember whether we've acquired dst.
   bool acquired = IsAlreadyAcquired(dst);
   if (acquired)
     CPP_STAT_INC(StatClockReleaseAcquired);
   // Update dst->clk_.
-  for (uptr i = 0; i < nclk_; i++) {
-    ClockElem &ce = dst->elem(i);
-    ce.epoch = max(ce.epoch, clk_[i].epoch);
+  dst->FlushDirty();
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = max(ce.epoch, clk_[i]);
     ce.reused = 0;
+    i++;
   }
   // Clear 'acquired' flag in the remaining elements.
   if (nclk_ < dst->size_)
     CPP_STAT_INC(StatClockReleaseClearTail);
   for (uptr i = nclk_; i < dst->size_; i++)
     dst->elem(i).reused = 0;
-  for (unsigned i = 0; i < kDirtyTids; i++)
-    dst->dirty_tids_[i] = kInvalidTid;
   dst->release_store_tid_ = kInvalidTid;
   dst->release_store_reused_ = 0;
   // If we've acquired dst, remember this fact,
@@ -213,11 +233,37 @@
     dst->elem(tid_).reused = reused_;
 }
 
-void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
+void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
   CPP_STAT_INC(StatClockStore);
 
+  if (dst->size_ == 0 && cached_idx_ != 0) {
+    // Reuse the cached clock.
+    // Note: we could reuse/cache the cached clock in more cases:
+    // we could update the existing clock and cache it, or replace it with the
+    // currently cached clock and release the old one. And for a shared
+    // existing clock, we could replace it with the currently cached;
+    // or unshare, update and cache. But, for simplicity, we currently reuse
+    // the cached clock only when the target clock is empty.
+    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
+    dst->tab_idx_ = cached_idx_;
+    dst->size_ = cached_size_;
+    dst->blocks_ = cached_blocks_;
+    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
+    // The cached clock is shared (immutable),
+    // so this is where we store the current clock.
+    dst->dirty_[0].tid = tid_;
+    dst->dirty_[0].epoch = clk_[tid_];
+    dst->release_store_tid_ = tid_;
+    dst->release_store_reused_ = reused_;
+    // Remember that we don't need to acquire it in the future.
+    dst->elem(tid_).reused = reused_;
+    // Grab a reference.
+    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    return;
+  }
+
   // Check if we need to resize dst.
   if (dst->size_ < nclk_)
     dst->Resize(c, nclk_);
@@ -226,32 +272,41 @@
       dst->release_store_reused_ == reused_ &&
       dst->elem(tid_).epoch > last_acquire_) {
     CPP_STAT_INC(StatClockStoreFast);
-    UpdateCurrentThread(dst);
+    UpdateCurrentThread(c, dst);
     return;
   }
 
   // O(N) release-store.
   CPP_STAT_INC(StatClockStoreFull);
-  for (uptr i = 0; i < nclk_; i++) {
-    ClockElem &ce = dst->elem(i);
-    ce.epoch = clk_[i].epoch;
+  dst->Unshare(c);
+  // Note: dst can be larger than this ThreadClock.
+  // This is fine since clk_ beyond nclk_ is all zeros.
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = clk_[i];
     ce.reused = 0;
+    i++;
   }
-  // Clear the tail of dst->clk_.
-  if (nclk_ < dst->size_) {
-    for (uptr i = nclk_; i < dst->size_; i++) {
-      ClockElem &ce = dst->elem(i);
-      ce.epoch = 0;
-      ce.reused = 0;
-    }
-    CPP_STAT_INC(StatClockStoreTail);
-  }
-  for (unsigned i = 0; i < kDirtyTids; i++)
-    dst->dirty_tids_[i] = kInvalidTid;
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dst->dirty_[i].tid = kInvalidTid;
   dst->release_store_tid_ = tid_;
   dst->release_store_reused_ = reused_;
   // Remember that we don't need to acquire it in the future.
   dst->elem(tid_).reused = reused_;
+
+  // If the resulting clock is cachable, cache it for future release
+  // operations. The clock is always cachable if we released to an empty
+  // sync object.
+  if (cached_idx_ == 0 && dst->Cachable()) {
+    // Grab a reference to the ClockBlock.
+    atomic_uint32_t *ref = ref_ptr(dst->tab_);
+    if (atomic_load(ref, memory_order_acquire) == 1)
+      atomic_store_relaxed(ref, 2);
+    else
+      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    cached_idx_ = dst->tab_idx_;
+    cached_size_ = dst->size_;
+    cached_blocks_ = dst->blocks_;
+  }
 }
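A note on the reference grab at the end of ReleaseStore above: when the counter reads 1, this thread is the sole owner, so no other thread can touch the counter concurrently and a plain relaxed store of 2 suffices; only genuinely shared blocks pay for an atomic RMW. The same idea as a standalone sketch (std::atomic instead of sanitizer atomics; illustrative only):

#include <atomic>
#include <cstdint>

void AddRef(std::atomic<uint32_t> *ref) {
  if (ref->load(std::memory_order_acquire) == 1)
    ref->store(2, std::memory_order_relaxed);  // sole owner: cannot race
  else
    ref->fetch_add(1, std::memory_order_relaxed);
}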
 
 void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
@@ -261,37 +316,36 @@
 }
 
 // Updates only single element related to the current thread in dst->clk_.
-void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
+void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
   // Update the thread's time, but preserve 'acquired' flag.
-  dst->elem(tid_).epoch = clk_[tid_].epoch;
-
   for (unsigned i = 0; i < kDirtyTids; i++) {
-    if (dst->dirty_tids_[i] == tid_) {
+    SyncClock::Dirty *dirty = &dst->dirty_[i];
+    const unsigned tid = dirty->tid;
+    if (tid == tid_ || tid == kInvalidTid) {
       CPP_STAT_INC(StatClockReleaseFast);
-      return;
-    }
-    if (dst->dirty_tids_[i] == kInvalidTid) {
-      CPP_STAT_INC(StatClockReleaseFast);
-      dst->dirty_tids_[i] = tid_;
+      dirty->tid = tid_;
+      dirty->epoch = clk_[tid_];
       return;
     }
   }
   // Reset all 'acquired' flags, O(N).
+  // We are going to touch dst elements, so we need to unshare it.
+  dst->Unshare(c);
   CPP_STAT_INC(StatClockReleaseSlow);
+  dst->elem(tid_).epoch = clk_[tid_];
   for (uptr i = 0; i < dst->size_; i++)
     dst->elem(i).reused = 0;
-  for (unsigned i = 0; i < kDirtyTids; i++)
-    dst->dirty_tids_[i] = kInvalidTid;
+  dst->FlushDirty();
 }
 
-// Checks whether the current threads has already acquired src.
+// Checks whether the current thread has already acquired src.
 bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
   if (src->elem(tid_).reused != reused_)
     return false;
   for (unsigned i = 0; i < kDirtyTids; i++) {
-    unsigned tid = src->dirty_tids_[i];
-    if (tid != kInvalidTid) {
-      if (clk_[tid].epoch < src->elem(tid).epoch)
+    SyncClock::Dirty dirty = src->dirty_[i];
+    if (dirty.tid != kInvalidTid) {
+      if (clk_[dirty.tid] < dirty.epoch)
         return false;
     }
   }
@@ -302,22 +356,19 @@
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
   DCHECK_LT(tid, kMaxTid);
-  DCHECK_GE(v, clk_[tid].epoch);
-  clk_[tid].epoch = v;
+  DCHECK_GE(v, clk_[tid]);
+  clk_[tid] = v;
   if (nclk_ <= tid)
     nclk_ = tid + 1;
-  last_acquire_ = clk_[tid_].epoch;
+  last_acquire_ = clk_[tid_];
+  ResetCached(c);
 }
 
 void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
   printf("clock=[");
   for (uptr i = 0; i < nclk_; i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch);
-  printf("] reused=[");
-  for (uptr i = 0; i < nclk_; i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused);
-  printf("] tid=%u/%u last_acq=%llu",
-      tid_, reused_, last_acquire_);
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
+  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
 }
 
 SyncClock::SyncClock() {
@@ -327,22 +378,14 @@
 
 SyncClock::~SyncClock() {
   // Reset must be called before dtor.
   CHECK_EQ(size_, 0);
+  CHECK_EQ(blocks_, 0);
   CHECK_EQ(tab_, 0);
   CHECK_EQ(tab_idx_, 0);
 }
 
 void SyncClock::Reset(ClockCache *c) {
-  if (size_ == 0) {
-    // nothing
-  } else if (size_ <= ClockBlock::kClockCount) {
-    // One-level table.
-    ctx->clock_alloc.Free(c, tab_idx_);
-  } else {
-    // Two-level table.
-    for (uptr i = 0; i < size_; i += ClockBlock::kClockCount)
-      ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]);
-    ctx->clock_alloc.Free(c, tab_idx_);
-  }
+  if (size_)
+    UnrefClockBlock(c, tab_idx_, blocks_);
   ResetImpl();
 }
 
@@ -350,66 +393,171 @@
   tab_ = 0;
   tab_idx_ = 0;
   size_ = 0;
+  blocks_ = 0;
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
-    dirty_tids_[i] = kInvalidTid;
+    dirty_[i].tid = kInvalidTid;
 }
 
 void SyncClock::Resize(ClockCache *c, uptr nclk) {
   CPP_STAT_INC(StatClockReleaseResize);
-  if (RoundUpTo(nclk, ClockBlock::kClockCount) <=
-      RoundUpTo(size_, ClockBlock::kClockCount)) {
-    // Growing within the same block.
+  Unshare(c);
+  if (nclk <= capacity()) {
     // Memory is already allocated, just increase the size.
     size_ = nclk;
     return;
   }
 
-  if (nclk <= ClockBlock::kClockCount) {
+  if (size_ == 0) {
     // Grow from 0 to one-level table.
     CHECK_EQ(size_, 0);
+    CHECK_EQ(blocks_, 0);
     CHECK_EQ(tab_, 0);
     CHECK_EQ(tab_idx_, 0);
-    size_ = nclk;
-    tab_idx_ = ctx->clock_alloc.Alloc(c);
-    tab_ = ctx->clock_alloc.Map(tab_idx_);
-    internal_memset(tab_, 0, sizeof(*tab_));
-    return;
-  }
-
-  // Growing two-level table.
-  if (size_ == 0) {
-    // Allocate first level table.
-    tab_idx_ = ctx->clock_alloc.Alloc(c);
-    tab_ = ctx->clock_alloc.Map(tab_idx_);
-    internal_memset(tab_, 0, sizeof(*tab_));
-  } else if (size_ <= ClockBlock::kClockCount) {
-    // Transform one-level table to two-level table.
-    u32 old = tab_idx_;
     tab_idx_ = ctx->clock_alloc.Alloc(c);
     tab_ = ctx->clock_alloc.Map(tab_idx_);
     internal_memset(tab_, 0, sizeof(*tab_));
-    tab_->table[0] = old;
+    atomic_store_relaxed(ref_ptr(tab_), 1);
+    size_ = 1;
+  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
+    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
+    CHECK_LT(top, ClockBlock::kClockCount);
+    const uptr move = top * sizeof(tab_->clock[0]);
+    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
+    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
+    internal_memset(tab_->clock, 0, move);
+    append_block(idx);
  }
-  // At this point we have first level table allocated.
+  // At this point we have first level table allocated and all clock elements
+  // are evacuated from it to a second level block.
   // Add second level tables as necessary.
-  for (uptr i = RoundUpTo(size_, ClockBlock::kClockCount);
-      i < nclk; i += ClockBlock::kClockCount) {
+  while (nclk > capacity()) {
     u32 idx = ctx->clock_alloc.Alloc(c);
     ClockBlock *cb = ctx->clock_alloc.Map(idx);
     internal_memset(cb, 0, sizeof(*cb));
-    CHECK_EQ(tab_->table[i/ClockBlock::kClockCount], 0);
-    tab_->table[i/ClockBlock::kClockCount] = idx;
+    append_block(idx);
   }
   size_ = nclk;
 }
 
-ClockElem &SyncClock::elem(unsigned tid) const {
+// Flushes all dirty elements into the main clock array.
+void SyncClock::FlushDirty() {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty *dirty = &dirty_[i];
+    if (dirty->tid != kInvalidTid) {
+      CHECK_LT(dirty->tid, size_);
+      elem(dirty->tid).epoch = dirty->epoch;
+      dirty->tid = kInvalidTid;
+    }
+  }
+}
+
+bool SyncClock::IsShared() const {
+  if (size_ == 0)
+    return false;
+  atomic_uint32_t *ref = ref_ptr(tab_);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  CHECK_GT(v, 0);
+  return v > 1;
+}
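IsShared() above and Unshare() below form a copy-on-write pair: a table with more than one reference is treated as immutable, and every mutating path (Resize, the O(N) release paths, UpdateCurrentThread's slow path) unshares first. A minimal standalone model of the pattern, with an assumed Buf type rather than TSan's block allocator (editorial sketch only):

#include <atomic>
#include <cstdint>
#include <cstring>

struct Buf {
  std::atomic<uint32_t> ref{1};
  uint64_t data[64];
};

// Readers may share *b freely; a writer calls this before mutating.
Buf *UnshareForWrite(Buf *b) {
  if (b->ref.load(std::memory_order_acquire) == 1)
    return b;  // sole owner: mutate in place
  Buf *copy = new Buf;  // private copy, refcount starts at 1
  std::memcpy(copy->data, b->data, sizeof(copy->data));
  if (b->ref.fetch_sub(1, std::memory_order_acq_rel) == 1)
    delete b;  // other owners vanished meanwhile; we held the last reference
  return copy;
}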
+
+// Unshares the current clock if it's shared.
+// Shared clocks are immutable, so they need to be unshared before any updates.
+// Note: this does not apply to dirty entries as they are not shared.
+void SyncClock::Unshare(ClockCache *c) {
+  if (!IsShared())
+    return;
+  // First, copy current state into old.
+  SyncClock old;
+  old.tab_ = tab_;
+  old.tab_idx_ = tab_idx_;
+  old.size_ = size_;
+  old.blocks_ = blocks_;
+  old.release_store_tid_ = release_store_tid_;
+  old.release_store_reused_ = release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    old.dirty_[i] = dirty_[i];
+  // Then, clear current object.
+  ResetImpl();
+  // Allocate brand new clock in the current object.
+  Resize(c, old.size_);
+  // Now copy state back into this object.
+  Iter old_iter(&old);
+  for (ClockElem &ce : *this) {
+    ce = *old_iter;
+    ++old_iter;
+  }
+  release_store_tid_ = old.release_store_tid_;
+  release_store_reused_ = old.release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dirty_[i] = old.dirty_[i];
+  // Drop reference to old and delete if necessary.
+  old.Reset(c);
+}
+
+// Can we cache this clock for future release operations?
+ALWAYS_INLINE bool SyncClock::Cachable() const {
+  if (size_ == 0)
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    if (dirty_[i].tid != kInvalidTid)
+      return false;
+  }
+  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
+}
+
+// elem linearizes the two-level structure into a linear array.
+// Note: this is used only for one-time accesses; vector operations use
+// the iterator, which is much faster.
+ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
   DCHECK_LT(tid, size_);
-  if (size_ <= ClockBlock::kClockCount)
+  const uptr block = tid / ClockBlock::kClockCount;
+  DCHECK_LE(block, blocks_);
+  tid %= ClockBlock::kClockCount;
+  if (block == blocks_)
     return tab_->clock[tid];
-  u32 idx = tab_->table[tid / ClockBlock::kClockCount];
+  u32 idx = get_block(block);
   ClockBlock *cb = ctx->clock_alloc.Map(idx);
-  return cb->clock[tid % ClockBlock::kClockCount];
+  return cb->clock[tid];
+}
+
+ALWAYS_INLINE uptr SyncClock::capacity() const {
+  if (size_ == 0)
+    return 0;
+  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
+  // How many clock elements we can fit into the first level block.
+  // +1 for the reference counter.
+  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
+  return blocks_ * ClockBlock::kClockCount + top;
+}
+
+ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
+  DCHECK(size_);
+  DCHECK_LT(bi, blocks_);
+  return tab_->table[ClockBlock::kBlockIdx - bi];
+}
+
+ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
+  uptr bi = blocks_++;
+  CHECK_EQ(get_block(bi), 0);
+  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
+}
+
+// Used only by tests.
+u64 SyncClock::get(unsigned tid) const {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty dirty = dirty_[i];
+    if (dirty.tid == tid)
+      return dirty.epoch;
+  }
+  return elem(tid).epoch;
+}
+
+// Used only by the Iter test.
+u64 SyncClock::get_clean(unsigned tid) const {
+  return elem(tid).epoch;
 }
 
 void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
@@ -419,8 +567,32 @@
   printf("] reused=[");
   for (uptr i = 0; i < size_; i++)
     printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
-  printf("] release_store_tid=%d/%d dirty_tids=%d/%d",
+  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
       release_store_tid_, release_store_reused_,
-      dirty_tids_[0], dirty_tids_[1]);
+      dirty_[0].tid, dirty_[0].epoch,
+      dirty_[1].tid, dirty_[1].epoch);
+}
+
+void SyncClock::Iter::Next() {
+  // Finished with the current block, move on to the next one.
+  block_++;
+  if (block_ < parent_->blocks_) {
+    // Iterate over the next second level block.
+    u32 idx = parent_->get_block(block_);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    pos_ = &cb->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  if (block_ == parent_->blocks_ &&
+      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
+    // Iterate over elements in the first level block.
+    pos_ = &parent_->tab_->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  parent_ = nullptr;  // denotes end
+}
 
 }  // namespace __tsan
Index: lib/tsan/rtl/tsan_defs.h
===================================================================
--- lib/tsan/rtl/tsan_defs.h
+++ lib/tsan/rtl/tsan_defs.h
@@ -38,15 +38,40 @@
 
 namespace __tsan {
 
+const int kClkBits = 42;
+const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
+
+struct ClockElem {
+  u64 epoch : kClkBits;
+  u64 reused : 64 - kClkBits;  // tid reuse count
+};
+
+struct ClockBlock {
+  static const uptr kSize = 512;
+  static const uptr kTableSize = kSize / sizeof(u32);
+  static const uptr kClockCount = kSize / sizeof(ClockElem);
+  static const uptr kRefIdx = kTableSize - 1;
+  static const uptr kBlockIdx = kTableSize - 2;
+
+  union {
+    u32 table[kTableSize];
+    ClockElem clock[kClockCount];
+  };
+
+  ClockBlock() {
+  }
+};
+
 const int kTidBits = 13;
-const unsigned kMaxTid = 1 << kTidBits;
+// Reduce kMaxTid by kClockCount because one slot in the ClockBlock table is
+// occupied by the reference counter, so the total number of elements we can
+// store in SyncClock is kClockCount * (kTableSize - 1).
+const unsigned kMaxTid = (1 << kTidBits) - ClockBlock::kClockCount;
 #if !SANITIZER_GO
 const unsigned kMaxTidInClock = kMaxTid * 2;  // This includes msb 'freed' bit.
 #else
 const unsigned kMaxTidInClock = kMaxTid;  // Go does not track freed memory.
 #endif
-const int kClkBits = 42;
-const unsigned kMaxTidReuse = (1 << (64 - kClkBits)) - 1;
 const uptr kShadowStackSize = 64 * 1024;
 
 // Count of shadow values in a shadow cell.
@@ -74,7 +99,7 @@
 const bool kCollectHistory = true;
 #endif
 
-const unsigned kInvalidTid = (unsigned)-1;
+const u16 kInvalidTid = kMaxTid + 1;
 
 // The following "build consistency" machinery ensures that all source files
 // are built in the same configuration. Inconsistent builds lead to
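A quick check of the arithmetic implied by the new constants (an editorial sketch, not part of the patch; Capacity() re-derives SyncClock::capacity() from tsan_clock.cc, and RoundUpTo mirrors the sanitizer_common helper):

#include <cstddef>

constexpr size_t RoundUpTo(size_t n, size_t m) { return (n + m - 1) / m * m; }

constexpr size_t kClockCount = 512 / 8;  // 64 ClockElem's per 512-byte block
constexpr size_t kTableSize = 512 / 4;   // 128 u32 slots per block
constexpr size_t kRatio = 8 / 4;         // each ClockElem overlaps two u32 slots

// The first level block keeps the tail clock elements, minus the slots taken
// by the reference counter and the per-block indices (which grow down from
// kBlockIdx = 126; kRefIdx = 127).
constexpr size_t Capacity(size_t blocks) {
  return blocks * kClockCount +
         (kClockCount - RoundUpTo(blocks + 1, kRatio) / kRatio);
}

// These are exactly the boundaries around which interesting_sizes clusters:
static_assert(Capacity(0) == 63, "one-level table holds 63 elements");
static_assert(Capacity(1) == 127, "");
static_assert(Capacity(2) == 190, "");
// kMaxTid drops one block's worth of tids so everything fits in the table:
static_assert((1 << 13) - kClockCount == kClockCount * (kTableSize - 1), "");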
Index: lib/tsan/tests/unit/tsan_clock_test.cc
===================================================================
--- lib/tsan/tests/unit/tsan_clock_test.cc
+++ lib/tsan/tests/unit/tsan_clock_test.cc
@@ -53,6 +53,31 @@
   chunked.Reset(&cache);
 }
 
+static const uptr interesting_sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66,
+    100, 124, 125, 126, 127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254,
+    255};
+
+TEST(Clock, Iter) {
+  const uptr n = ARRAY_SIZE(interesting_sizes);
+  for (uptr fi = 0; fi < n; fi++) {
+    const uptr size = interesting_sizes[fi];
+    SyncClock sync;
+    ThreadClock vector(0);
+    for (uptr i = 0; i < size; i++)
+      vector.set(&cache, i, i + 1);
+    if (size != 0)
+      vector.release(&cache, &sync);
+    uptr i = 0;
+    for (ClockElem &ce : sync) {
+      ASSERT_LT(i, size);
+      ASSERT_EQ(sync.get_clean(i), ce.epoch);
+      i++;
+    }
+    ASSERT_EQ(i, size);
+    sync.Reset(&cache);
+  }
+}
+
 TEST(Clock, AcquireRelease) {
   ThreadClock vector1(100);
   vector1.tick();
@@ -216,13 +241,11 @@
 
 TEST(Clock, Growth2) {
   // Test clock growth for every pair of sizes:
-  const uptr sizes[] = {0, 1, 2, 30, 61, 62, 63, 64, 65, 66, 100, 124, 125, 126,
-      127, 128, 129, 130, 188, 189, 190, 191, 192, 193, 254, 255};
-  const uptr n = sizeof(sizes) / sizeof(sizes[0]);
+  const uptr n = ARRAY_SIZE(interesting_sizes);
   for (uptr fi = 0; fi < n; fi++) {
     for (uptr ti = fi + 1; ti < n; ti++) {
-      const uptr from = sizes[fi];
-      const uptr to = sizes[ti];
+      const uptr from = interesting_sizes[fi];
+      const uptr to = interesting_sizes[ti];
       SyncClock sync;
       ThreadClock vector(0);
       for (uptr i = 0; i < from; i++)
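One path the new tests do not exercise directly is the cached release-store reuse in ThreadClock::ReleaseStore. A hypothetical test in the style of the ones above (editorial sketch, not part of the patch; it assumes the file-scope ClockCache named cache that the existing tests use):

TEST(Clock, CachedReleaseStore) {  // hypothetical, illustrative only
  ThreadClock vector(0);
  vector.tick();
  SyncClock sync1, sync2;
  vector.ReleaseStore(&cache, &sync1);  // O(N) store; the block gets cached
  vector.ReleaseStore(&cache, &sync2);  // empty dst: reuses the shared block
  ASSERT_EQ(sync1.get(0), 1ULL);
  ASSERT_EQ(sync2.get(0), 1ULL);  // epoch surfaces via sync2's dirty entry
  sync1.Reset(&cache);
  sync2.Reset(&cache);
  vector.ResetCached(&cache);  // drop the thread's reference to the block
}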