Index: compiler-rt/lib/tsan/rtl/tsan_clock.h
===================================================================
--- compiler-rt/lib/tsan/rtl/tsan_clock.h
+++ compiler-rt/lib/tsan/rtl/tsan_clock.h
@@ -139,6 +139,7 @@
   void acq_rel(ClockCache *c, SyncClock *dst);
   void ReleaseStore(ClockCache *c, SyncClock *dst);
   void ResetCached(ClockCache *c);
+  void NoteGlobalAcquire(u64 v);
 
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
@@ -151,6 +152,52 @@
   // Current thread time when it acquired something from other threads.
   u64 last_acquire_;
 
+  // Last time another thread has done a global acquire of this thread's clock.
+  // It helps to avoid the problem described in:
+  // https://github.com/golang/go/issues/39186
+  // See test/tsan/java_finalizer2.cpp for a regression test.
+  // Note the failure is _extremely_ hard to hit, so if you are trying
+  // to reproduce it, you may want to run something like:
+  // $ go get golang.org/x/tools/cmd/stress
+  // $ stress -p=64 ./a.out
+  //
+  // The crux of the problem is roughly as follows.
+  // A number of O(1) optimizations in the clocks algorithm assume proper
+  // transitive cumulative propagation of clock values. The AcquireGlobal
+  // operation may produce an inconsistent non-linearizable view of
+  // thread clocks. Namely, it may acquire a later value from a thread
+  // with a higher ID, but fail to acquire an earlier value from a thread
+  // with a lower ID. If a thread that executed AcquireGlobal then releases
+  // to a sync clock, it will spoil the sync clock with the inconsistent
+  // values. If another thread later releases to the sync clock, the optimized
+  // algorithm may break.
+  //
+  // The exact sequence of events that leads to the failure:
+  // - thread 1 executes AcquireGlobal
+  // - thread 1 acquires value 1 for thread 2
+  // - thread 2 increments clock to 2
+  // - thread 2 releases to sync object 1
+  // - thread 3 at time 1
+  // - thread 3 acquires from sync object 1
+  // - thread 1 acquires value 1 for thread 3
+  // - thread 1 releases to sync object 2
+  // - sync object 2 clock has 1 for thread 2 and 1 for thread 3
+  // - thread 3 releases to sync object 2
+  // - thread 3 sees value 1 in the clock for itself
+  //   and decides that it has already released to the clock
+  //   and did not acquire anything from other threads after that
+  //   (the last_acquire_ check in the release operation)
+  // - thread 3 does not update the value for thread 2 in the clock from 1 to 2
+  // - thread 4 acquires from sync object 2
+  // - thread 4 detects a false race with thread 2
+  //   as it should have been synchronized with thread 2 up to time 2,
+  //   but because of the broken clock it is now synchronized only up to time 1
+  //
+  // The global_acquire_ value helps to prevent this scenario.
+  // Namely, thread 3 will not trust any own clock values up to global_acquire_
+  // for the purposes of the last_acquire_ optimization.
+  atomic_uint64_t global_acquire_;
+
   // Cached SyncClock (without dirty entries and release_store_tid_).
   // We reuse it for subsequent store-release operations without intervening
   // acquire operations. Since it is shared (and thus constant), clock value
@@ -165,6 +212,7 @@
   u64 clk_[kMaxTidInClock]; // Fixed size vector clock.
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
+  bool HasAcquiredAfterRelease(const SyncClock *dst) const;
   void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
 };
 
@@ -186,6 +234,10 @@
   return nclk_;
 }
 
+ALWAYS_INLINE void ThreadClock::NoteGlobalAcquire(u64 v) {
+  atomic_store_relaxed(&global_acquire_, v);
+}
+
 ALWAYS_INLINE SyncClock::Iter SyncClock::begin() {
   return Iter(this);
 }
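The interleaving in the comment above is easier to follow when executed end to end. Here is a minimal, single-threaded model of it, a sketch only: Thread, Vector, and tick/acquire/release are invented stand-ins for tsan's ThreadClock/SyncClock machinery, and the epochs are shifted slightly from the comment's timeline so that the toy's simplified rules line up. Run(false) reproduces the spoiled sync clock, while Run(true) shows the global_acquire_ watermark forcing the full merge.

// Toy model of the AcquireGlobal failure; not tsan's real data structures.
#include <algorithm>
#include <array>
#include <cstdint>

constexpr int kThreads = 4;
using Vector = std::array<uint64_t, kThreads>;  // one epoch slot per thread

struct Thread {
  int tid;
  Vector clk{};                 // vector clock; clk[tid] is the own epoch
  uint64_t last_acquire = 0;    // own epoch at the last acquire
  uint64_t global_acquire = 0;  // own epoch last seen by an AcquireGlobal

  void tick() { clk[tid]++; }

  void acquire(const Vector &sync) {
    for (int i = 0; i < kThreads; i++) clk[i] = std::max(clk[i], sync[i]);
    last_acquire = clk[tid];
  }

  void release(Vector &sync, bool use_fix) {
    // The O(1) path: "dst already holds my last release and I acquired
    // nothing since"; this mirrors the last_acquire_ check in release().
    bool fast = sync[tid] > last_acquire;
    // The patched check: distrust own values up to global_acquire.
    if (use_fix && sync[tid] <= global_acquire) fast = false;
    if (fast) {
      sync[tid] = clk[tid];  // update only the own entry
      return;
    }
    for (int i = 0; i < kThreads; i++) sync[i] = std::max(sync[i], clk[i]);
  }
};

// Replays the timeline; returns true iff thread 4 ends up synchronized with
// thread 2 up to time 2, as the happens-before chain requires.
bool Run(bool use_fix) {
  Thread t1{0}, t2{1}, t3{2}, t4{3};
  Vector sync1{}, sync2{};
  t2.tick();                                // thread 2 at time 1
  uint64_t t2_seen = t2.clk[t2.tid];        // AcquireGlobal reads 1 ...
  t2.tick();                                // ... but thread 2 moves to 2
  t2.release(sync1, use_fix);               // and releases to sync object 1
  t3.tick();
  t3.acquire(sync1);                        // thread 3 now knows t2 up to 2
  t3.tick();
  uint64_t t3_seen = t3.clk[t3.tid];        // AcquireGlobal reads a *later*
  if (use_fix) t3.global_acquire = t3_seen; // value from thread 3; the fix
                                            // also stamps the watermark
  t1.clk[t2.tid] = std::max(t1.clk[t2.tid], t2_seen);  // stale: 1
  t1.clk[t3.tid] = std::max(t1.clk[t3.tid], t3_seen);  // fresh: 2
  t1.release(sync2, use_fix);               // spoils sync2: {t2: 1, t3: 2}
  t3.release(sync2, use_fix);               // bug: fast path skips merging
                                            // thread 2's epoch 2 into sync2
  t4.acquire(sync2);
  return t4.clk[t2.tid] >= 2;
}

int main() {
  bool broken = Run(/*use_fix=*/false);  // expect false: t4 sees t2 only at 1
  bool fixed = Run(/*use_fix=*/true);    // expect true: full merge restored
  return (!broken && fixed) ? 0 : 1;
}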
Index: compiler-rt/lib/tsan/rtl/tsan_clock.cpp
===================================================================
--- compiler-rt/lib/tsan/rtl/tsan_clock.cpp
+++ compiler-rt/lib/tsan/rtl/tsan_clock.cpp
@@ -115,13 +115,14 @@
 ThreadClock::ThreadClock(unsigned tid, unsigned reused)
     : tid_(tid)
     , reused_(reused + 1)  // 0 has special meaning
+    , last_acquire_()
+    , global_acquire_()
     , cached_idx_()
     , cached_size_()
     , cached_blocks_() {
   CHECK_LT(tid, kMaxTidInClock);
   CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
   nclk_ = tid_ + 1;
-  last_acquire_ = 0;
   internal_memset(clk_, 0, sizeof(clk_));
 }
 
@@ -247,7 +248,7 @@
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
-  if (dst->elem(tid_).epoch > last_acquire_) {
+  if (!HasAcquiredAfterRelease(dst)) {
     UpdateCurrentThread(c, dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
@@ -318,7 +319,7 @@
 
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
-      dst->elem(tid_).epoch > last_acquire_) {
+      !HasAcquiredAfterRelease(dst)) {
     CPP_STAT_INC(StatClockStoreFast);
     UpdateCurrentThread(c, dst);
     return;
@@ -400,6 +401,14 @@
   return true;
 }
 
+// Checks whether the current thread has acquired anything
+// from other clocks after releasing to dst.
+bool ThreadClock::HasAcquiredAfterRelease(const SyncClock *dst) const {
+  const u64 my_epoch = dst->elem(tid_).epoch;
+  return my_epoch <= last_acquire_ ||
+         my_epoch <= atomic_load_relaxed(&global_acquire_);
+}
+
 // Sets a single element in the vector clock.
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
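The new predicate is the entire behavioral change, so it helps to restate it in isolation with concrete numbers. In this standalone sketch, u64 and the two watermark names follow the patch, while the free-function form and the example values are invented for illustration.

#include <cstdint>
using u64 = uint64_t;

// The thread may take the O(1) release path only if this returns false:
// its entry in dst must be newer than BOTH watermarks, the last local
// acquire (last_acquire_) and the last epoch some other thread snapshotted
// from it via AcquireGlobal (global_acquire_).
bool HasAcquiredAfterRelease(u64 my_epoch_in_dst, u64 last_acquire,
                             u64 global_acquire) {
  return my_epoch_in_dst <= last_acquire || my_epoch_in_dst <= global_acquire;
}

int main() {
  // 4 > last_acquire (3): the pre-patch check alone would allow the O(1)
  // path. But AcquireGlobal has already seen this thread at epoch 5 and may
  // have published inconsistent values into other sync clocks, so epoch 4
  // in dst is not proof of "released last, acquired nothing since".
  bool slow_path = HasAcquiredAfterRelease(/*my_epoch_in_dst=*/4,
                                           /*last_acquire=*/3,
                                           /*global_acquire=*/5);
  return slow_path ? 0 : 1;  // expect the slow (full merge) path
}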
Index: compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
===================================================================
--- compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
+++ compiler-rt/lib/tsan/rtl/tsan_rtl_mutex.cpp
@@ -415,8 +415,10 @@
   ThreadState *thr = reinterpret_cast<ThreadState *>(arg);
   ThreadContext *tctx = static_cast<ThreadContext *>(tctx_base);
   u64 epoch = tctx->epoch1;
-  if (tctx->status == ThreadStatusRunning)
+  if (tctx->status == ThreadStatusRunning) {
     epoch = tctx->thr->fast_state.epoch();
+    tctx->thr->clock.NoteGlobalAcquire(epoch);
+  }
   thr->clock.set(&thr->proc()->clock_cache, tctx->tid, epoch);
 }
 
Index: compiler-rt/test/tsan/java_finalizer2.cpp
===================================================================
--- /dev/null
+++ compiler-rt/test/tsan/java_finalizer2.cpp
@@ -0,0 +1,82 @@
+// RUN: %clangxx_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+// Regression test for https://github.com/golang/go/issues/39186
+#include "java.h"
+#include <string.h>
+
+struct Heap {
+  uint64_t data;
+  uint64_t ready;
+  uint64_t finalized;
+  uint64_t wg;
+  pthread_barrier_t barrier_finalizer;
+  pthread_barrier_t barrier_ballast;
+};
+
+void *Thread1(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  __tsan_java_finalize();
+  __atomic_fetch_add(&heap->wg, 1, __ATOMIC_RELEASE);
+  __atomic_store_n(&heap->finalized, 1, __ATOMIC_RELAXED);
+  return 0;
+}
+
+void *Thread2(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  heap->data = 1;
+  __atomic_store_n(&heap->ready, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void *Thread3(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  while (__atomic_load_n(&heap->ready, __ATOMIC_ACQUIRE) != 1)
+    pthread_yield();
+  while (__atomic_load_n(&heap->finalized, __ATOMIC_RELAXED) != 1)
+    pthread_yield();
+  __atomic_fetch_add(&heap->wg, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void *Ballast(void *p) {
+  Heap* heap = (Heap*)p;
+  pthread_barrier_wait(&heap->barrier_ballast);
+  return 0;
+}
+
+int main() {
+  Heap* heap = (Heap*)calloc(sizeof(Heap), 1);
+  __tsan_java_init((jptr)heap, sizeof(*heap));
+  __tsan_java_alloc((jptr)heap, sizeof(*heap));
+  // Ballast threads merely make the bug a bit easier to trigger.
+  const int kBallastThreads = 100;
+  pthread_barrier_init(&heap->barrier_finalizer, 0, 4);
+  pthread_barrier_init(&heap->barrier_ballast, 0, kBallastThreads + 1);
+  pthread_t th[3];
+  pthread_create(&th[0], 0, Thread1, heap);
+  pthread_create(&th[1], 0, Thread2, heap);
+  pthread_t ballast[kBallastThreads];
+  for (int i = 0; i < kBallastThreads; i++)
+    pthread_create(&ballast[i], 0, Ballast, heap);
+  pthread_create(&th[2], 0, Thread3, heap);
+  pthread_barrier_wait(&heap->barrier_ballast);
+  for (int i = 0; i < kBallastThreads; i++)
+    pthread_join(ballast[i], 0);
+  pthread_barrier_wait(&heap->barrier_finalizer);
+  while (__atomic_load_n(&heap->wg, __ATOMIC_ACQUIRE) != 2)
+    pthread_yield();
+  if (heap->data != 1)
+    exit(printf("no data\n"));
+  for (int i = 0; i < 3; i++)
+    pthread_join(th[i], 0);
+  pthread_barrier_destroy(&heap->barrier_ballast);
+  pthread_barrier_destroy(&heap->barrier_finalizer);
+  __tsan_java_free((jptr)heap, sizeof(*heap));
+  fprintf(stderr, "DONE\n");
+  return __tsan_java_fini();
+}
+
+// CHECK-NOT: WARNING: ThreadSanitizer: data race
+// CHECK: DONE
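For reading the test against the header comment's scenario: __tsan_java_finalize() in Thread1 is what triggers the global acquire (tsan implements Java finalization as an acquire of all thread clocks), so Thread1 plays "thread 1"; Thread2 and Thread3 play "thread 2" and "thread 3", with heap->ready standing in for sync object 1 and heap->wg for sync object 2; the main thread plays "thread 4" when it acquires heap->wg and then reads heap->data, which is where the false race would be reported. The ballast threads, as the comment in main() says, merely make the bug easier to trigger. This flag-to-sync-object mapping is inferred from the test rather than stated by the patch.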