Index: lib/tsan/rtl/tsan_clock.h
===================================================================
--- lib/tsan/rtl/tsan_clock.h
+++ lib/tsan/rtl/tsan_clock.h
@@ -15,6 +15,7 @@
 
 #include "tsan_defs.h"
 #include "tsan_dense_alloc.h"
+#include "sanitizer_common/sanitizer_vector.h"
 
 namespace __tsan {
 
@@ -34,6 +35,7 @@
   u64 get_clean(unsigned tid) const;
 
   void Resize(ClockCache *c, uptr nclk);
+  void ResizeReleaseSequenceVectors(unsigned int tid, u16 nclk);
   void Reset(ClockCache *c);
   void DebugDump(int(*printf)(const char *s, ...));
 
@@ -110,6 +112,11 @@
   u16 size_;
   u16 blocks_;  // Number of second level blocks.
 
+  // Support several release sequences in one sync variable; see page 5 of
+  // https://www.doc.ic.ac.uk/~afd/homepages/papers/pdfs/2017/POPL.pdf
+  Vector<Vector<u64>> released_threads_clock;
+  Vector<bool> release_sequence_blocked;
+
   void Unshare(ClockCache *c);
   bool IsShared() const;
   bool Cachable() const;
@@ -135,11 +142,18 @@
   uptr size() const;
 
   void acquire(ClockCache *c, SyncClock *src);
+  void relaxed_load(ClockCache *c, SyncClock *src);
   void release(ClockCache *c, SyncClock *dst);
+  void relaxed_store(ClockCache *c, SyncClock *dst);
   void acq_rel(ClockCache *c, SyncClock *dst);
   void ReleaseStore(ClockCache *c, SyncClock *dst);
+  void RelaxedStore(ClockCache *c, SyncClock *dst);
+  void acquire_fence();
+  void release_fence();
   void ResetCached(ClockCache *c);
+  void block_release_sequences(SyncClock *dst);
+
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
 
@@ -163,6 +177,8 @@
   // Number of active elements in the clk_ table (the rest is zeros).
   uptr nclk_;
   u64 clk_[kMaxTidInClock];  // Fixed size vector clock.
+  u64 fence_release_clock[kMaxTidInClock];
+  u64 fence_acquire_clock[kMaxTidInClock];
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
   void UpdateCurrentThread(ClockCache *c, SyncClock *dst) const;
Index: lib/tsan/rtl/tsan_clock.cc
===================================================================
--- lib/tsan/rtl/tsan_clock.cc
+++ lib/tsan/rtl/tsan_clock.cc
@@ -178,6 +178,22 @@
   }
 }
 
+void ThreadClock::relaxed_load(ClockCache *c, SyncClock *src) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(src->size_, kMaxTid);
+
+  // Check if it's empty -> no need to do anything.
+  if (src->size_ == 0) {
+    return;
+  }
+
+  uptr i = 0;
+  for (ClockElem &ce : *src) {
+    fence_acquire_clock[i] = max(fence_acquire_clock[i], ce.epoch);
+    ++i;
+  }
+}
+
 void ThreadClock::release(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
@@ -185,6 +201,9 @@
   if (dst->size_ == 0) {
     // ReleaseStore will correctly set release_store_tid_,
    // which can be important for future operations.
+    // TODO(sobir.yorov94):
+    // This may be wrong because an RMW does not block the release
+    // sequence, whereas ReleaseStore blocks the other threads' sequences.
     ReleaseStore(c, dst);
     return;
   }
@@ -194,6 +213,11 @@
   if (dst->size_ < nclk_)
     dst->Resize(c, nclk_);
 
+  // Check if we need to resize the release sequence vector for tid.
+  if (dst->released_threads_clock.Size() < dst->size_ ||
+      dst->released_threads_clock[tid_].Size() < dst->size_) {
+    dst->ResizeReleaseSequenceVectors(tid_, dst->size_);
+  }
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
   // only dst->elem(tid_).
@@ -216,6 +240,12 @@
   dst->FlushDirty();
   uptr i = 0;
   for (ClockElem &ce : *dst) {
+    dst->released_threads_clock[tid_][i] = ce.epoch;
+    ++i;
+  }
+  dst->release_sequence_blocked[tid_] = false;
+  i = 0;
+  for (ClockElem &ce : *dst) {
     ce.epoch = max(ce.epoch, clk_[i]);
     ce.reused = 0;
     i++;
@@ -233,9 +263,31 @@
   dst->elem(tid_).reused = reused_;
 }
 
+void ThreadClock::relaxed_store(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_)
+    dst->Resize(c, nclk_);
+
+  // TODO(sobir.yorov94@gmail.com):
+  // Optimize the case where fence_release_clock has not changed since last time.
+  dst->Unshare(c);
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = max(ce.epoch, fence_release_clock[i]);
+    ce.reused = 0;
+    i++;
+  }
+  dst->FlushDirty();
+}
+
 void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
   DCHECK_LE(nclk_, kMaxTid);
   DCHECK_LE(dst->size_, kMaxTid);
+  DCHECK_LE(dst->released_threads_clock.Size(), kMaxTid);
+  DCHECK_LE(dst->release_sequence_blocked.Size(), kMaxTid);
   CPP_STAT_INC(StatClockStore);
 
   if (dst->size_ == 0 && cached_idx_ != 0) {
@@ -268,6 +320,19 @@
   if (dst->size_ < nclk_)
     dst->Resize(c, nclk_);
 
+  // Check if we need to resize the release sequence vector for tid.
+  if (dst->released_threads_clock.Size() < dst->size_ ||
+      dst->released_threads_clock[tid_].Size() < dst->size_) {
+    dst->ResizeReleaseSequenceVectors(tid_, dst->size_);
+  }
+
+  // Since this is a release store, block the other threads' release sequences.
+  block_release_sequences(dst);
+  dst->release_sequence_blocked[tid_] = false;
+  for (uptr i = 0; i < nclk_; ++i) {
+    dst->released_threads_clock[tid_][i] = clk_[i];
+  }
+
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
       dst->elem(tid_).epoch > last_acquire_) {
@@ -281,6 +346,7 @@
   dst->Unshare(c);
   // Note: dst can be larger than this ThreadClock.
   // This is fine since clk_ beyond size is all zeros.
+
   uptr i = 0;
   for (ClockElem &ce : *dst) {
     ce.epoch = clk_[i];
@@ -309,12 +375,72 @@
   }
 }
 
+void ThreadClock::RelaxedStore(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+  DCHECK_LE(dst->released_threads_clock.Size(), kMaxTid);
+  DCHECK_LE(dst->release_sequence_blocked.Size(), kMaxTid);
+  // TODO(yorov.sobir): add stat here
+
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_) {
+    dst->Resize(c, nclk_);
+  }
+  // Check if we need to resize the release sequence vector for tid.
+  if (dst->released_threads_clock.Size() < dst->size_ ||
+      dst->released_threads_clock[tid_].Size() < dst->size_) {
+    dst->ResizeReleaseSequenceVectors(tid_, dst->size_);
+  }
+  // A relaxed store blocks all release sequences on the current atomic.
+  block_release_sequences(dst);
+  // Unshare before changing dst.
+  dst->Unshare(c);
+  if (!dst->release_sequence_blocked[tid_]) {
+    uptr i = 0;
+    for (ClockElem &ce : *dst) {
+      ce.epoch = max(dst->released_threads_clock[tid_][i],
+                     fence_release_clock[i]);
+      ce.reused = 0;
+      i++;
+    }
+  } else {
+    uptr i = 0;
+    for (ClockElem &ce : *dst) {
+      ce.epoch = fence_release_clock[i];
+      ce.reused = 0;
+      i++;
+    }
+  }
+  dst->FlushDirty();
+}
+
 void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
   CPP_STAT_INC(StatClockAcquireRelease);
   acquire(c, dst);
   ReleaseStore(c, dst);
 }
 
+void ThreadClock::acquire_fence() {
+  DCHECK_LE(nclk_, kMaxTid);
+  for (uptr i = 0; i < nclk_; ++i) {
+    clk_[i] = max(clk_[i], fence_acquire_clock[i]);
+  }
+}
+
+void ThreadClock::release_fence() {
+  DCHECK_LE(nclk_, kMaxTid);
+  for (uptr i = 0; i < nclk_; ++i) {
+    fence_release_clock[i] = clk_[i];
+  }
+}
+
+void ThreadClock::block_release_sequences(SyncClock *dst) {
+  for (uptr i = 0; i < nclk_; ++i) {
+    if (i != tid_) {
+      dst->release_sequence_blocked[i] = true;
+    }
+  }
+}
+
 // Updates only single element related to the current thread in dst->clk_.
 void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
   // Update the threads time, but preserve 'acquired' flag.
@@ -573,6 +699,12 @@
       dirty_[1].tid, dirty_[1].epoch);
 }
 
+void SyncClock::ResizeReleaseSequenceVectors(const unsigned int tid, u16 nclk) {
+  released_threads_clock.Resize(nclk);
+  released_threads_clock[tid].Resize(nclk);
+  release_sequence_blocked.Resize(nclk);
+}
+
 void SyncClock::Iter::Next() {
   // Finished with the current block, move on to the next one.
   block_++;
Index: lib/tsan/rtl/tsan_interface_atomic.cc
===================================================================
--- lib/tsan/rtl/tsan_interface_atomic.cc
+++ lib/tsan/rtl/tsan_interface_atomic.cc
@@ -224,16 +224,20 @@
   CHECK(IsLoadOrder(mo));
   // This fast-path is critical for performance.
   // Assume the access is atomic.
-  if (!IsAcquireOrder(mo)) {
-    MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
-    return NoTsanAtomicLoad(a, mo);
-  }
+//  if (!IsAcquireOrder(mo)) {
+//    MemoryReadAtomic(thr, pc, (uptr)a, SizeLog<T>());
+//    return NoTsanAtomicLoad(a, mo);
+//  }
   // Don't create sync object if it does not exist yet. For example, an atomic
   // pointer is initialized to nullptr and then periodically acquire-loaded.
   T v = NoTsanAtomicLoad(a, mo);
   SyncVar *s = ctx->metamap.GetIfExistsAndLock((uptr)a, false);
   if (s) {
-    AcquireImpl(thr, pc, &s->clock);
+    if (IsAcquireOrder(mo)) {
+      AcquireImpl(thr, pc, &s->clock);
+    } else {
+      RelaxedLoadImpl(thr, pc, &s->clock);
+    }
     // Re-read under sync mutex because we need a consistent snapshot
     // of the value and the clock we acquire.
     v = NoTsanAtomicLoad(a, mo);
@@ -264,16 +268,17 @@
   // Assume the access is atomic.
   // Strictly saying even relaxed store cuts off release sequence,
   // so must reset the clock.
-  if (!IsReleaseOrder(mo)) {
-    NoTsanAtomicStore(a, v, mo);
-    return;
-  }
+  __sync_synchronize();
   SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
-  thr->fast_state.IncrementEpoch();
-  // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
-  ReleaseStoreImpl(thr, pc, &s->clock);
+  if (IsReleaseOrder(mo)) {
+    thr->fast_state.IncrementEpoch();
+    // Can't increment epoch w/o writing to the trace as well.
+    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+    ReleaseStoreImpl(thr, pc, &s->clock);
+  } else {
+    RelaxedStoreImpl(thr, pc, &s->clock);
+  }
   NoTsanAtomicStore(a, v, mo);
   s->mtx.Unlock();
 }
@@ -281,9 +286,8 @@ template<typename T, T (*F)(volatile T *v, T op)>
 static T AtomicRMW(ThreadState *thr, uptr pc, volatile T *a, T v, morder mo) {
   MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>());
-  SyncVar *s = 0;
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
   if (mo != mo_relaxed) {
-    s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
     TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -293,6 +297,8 @@
       ReleaseImpl(thr, pc, &s->clock);
     else if (IsAcquireOrder(mo))
       AcquireImpl(thr, pc, &s->clock);
+  } else {
+    RelaxedImpl(thr, pc, &s->clock);
   }
   v = F(a, v);
   if (s)
@@ -405,10 +411,9 @@
     volatile T *a, T *c, T v, morder mo, morder fmo) {
   (void)fmo;  // Unused because llvm does not pass it yet.
   MemoryWriteAtomic(thr, pc, (uptr)a, SizeLog<T>());
-  SyncVar *s = 0;
   bool write_lock = mo != mo_acquire && mo != mo_consume;
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock);
   if (mo != mo_relaxed) {
-    s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, write_lock);
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
     TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
@@ -418,6 +423,8 @@
       ReleaseImpl(thr, pc, &s->clock);
     else if (IsAcquireOrder(mo))
       AcquireImpl(thr, pc, &s->clock);
+  } else {
+    RelaxedImpl(thr, pc, &s->clock);
   }
   T cc = *c;
   T pr = func_cas(a, cc, v);
@@ -446,8 +453,11 @@
 }
 
 static void AtomicFence(ThreadState *thr, uptr pc, morder mo) {
-  // FIXME(dvyukov): not implemented.
-  __sync_synchronize();
+  if (IsAcquireOrder(mo))
+    AcquireFenceImpl(thr, pc);
+  else if (IsReleaseOrder(mo))
+    ReleaseFenceImpl(thr, pc);
+  NoTsanAtomicFence(mo);
 }
 #endif
Index: lib/tsan/rtl/tsan_rtl.h
===================================================================
--- lib/tsan/rtl/tsan_rtl.h
+++ lib/tsan/rtl/tsan_rtl.h
@@ -801,9 +801,14 @@
 void ReleaseStore(ThreadState *thr, uptr pc, uptr addr);
 void AfterSleep(ThreadState *thr, uptr pc);
 void AcquireImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void RelaxedLoadImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void RelaxedImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void RelaxedStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);
+void AcquireFenceImpl(ThreadState *thr, uptr pc);
+void ReleaseFenceImpl(ThreadState *thr, uptr pc);
 
 // The hacky call uses custom calling convention and an assembly thunk.
 // It is considerably faster that a normal call for the caller
Index: lib/tsan/rtl/tsan_rtl_mutex.cc
===================================================================
--- lib/tsan/rtl/tsan_rtl_mutex.cc
+++ lib/tsan/rtl/tsan_rtl_mutex.cc
@@ -483,6 +483,14 @@
   StatInc(thr, StatSyncAcquire);
 }
 
+void RelaxedLoadImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->clock.relaxed_load(&thr->proc()->clock_cache, c);
+}
+
 void ReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
@@ -492,6 +500,16 @@
   StatInc(thr, StatSyncRelease);
 }
 
+void RelaxedImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.relaxed_store(&thr->proc()->clock_cache, c);
+  // TODO(yorov.sobir): add stat here
+}
+
 void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
@@ -501,6 +519,16 @@
   StatInc(thr, StatSyncRelease);
 }
 
+void RelaxedStoreImpl(ThreadState *thr, uptr pc, SyncClock *c) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.RelaxedStore(&thr->proc()->clock_cache, c);
+  // TODO(yorov.sobir): add stat here
+}
+
 void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c) {
   if (thr->ignore_sync)
     return;
@@ -511,6 +539,26 @@
   StatInc(thr, StatSyncRelease);
 }
 
+void AcquireFenceImpl(ThreadState *thr, uptr pc) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.acquire_fence();
+  // TODO(sobir.yorov94@gmail.com): add stat
+}
+
+void ReleaseFenceImpl(ThreadState *thr, uptr pc) {
+  if (thr->ignore_sync) {
+    return;
+  }
+  thr->clock.set(thr->fast_state.epoch());
+  thr->fast_synch_epoch = thr->fast_state.epoch();
+  thr->clock.release_fence();
+  // TODO(sobir.yorov94@gmail.com): add stat
+}
+
 void ReportDeadlock(ThreadState *thr, uptr pc, DDReport *r) {
   if (r == 0)
     return;
Index: test/CMakeLists.txt
===================================================================
--- test/CMakeLists.txt
+++ test/CMakeLists.txt
@@ -15,7 +15,7 @@
 endif()
 
 if(COMPILER_RT_STANDALONE_BUILD)
-  add_executable(FileCheck IMPORTED GLOBAL)
+  add_executable(FileCheck IMPORTED GLOBAL tsan/atomic_release_sequence_blocking.cpp tsan/fence_norace.cpp)
   set_property(TARGET FileCheck PROPERTY IMPORTED_LOCATION ${LLVM_TOOLS_BINARY_DIR}/FileCheck)
   list(APPEND SANITIZER_COMMON_LIT_TEST_DEPS FileCheck)
 endif()
Index: test/tsan/atomic_release_sequence_blocking.cpp
===================================================================
--- /dev/null
+++ test/tsan/atomic_release_sequence_blocking.cpp
@@ -0,0 +1,45 @@
+// RUN: %clangxx_tsan -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
+#include "test.h"
+
+int nax;
+int x;
+
+void* thread1(void* arg) {
+  nax = 1;
+  __atomic_store_n(&x, 1, __ATOMIC_RELEASE);
+  return 0;
+}
+
+void* thread2(void* arg) {
+  if (__atomic_load_n(&x, __ATOMIC_ACQUIRE) == 1) {
+    __atomic_store_n(&x, 2, __ATOMIC_RELAXED);
+  }
+  return 0;
+}
+
+void* thread3(void* arg) {
+  if (__atomic_load_n(&x, __ATOMIC_ACQUIRE) == 2) {
+    int temp = nax;
+    (void)temp;
+  } else {
+    fprintf(stderr, "DONE\n");
+  }
+  return 0;
+}
+
+int main() {
+  pthread_t t1;
+  pthread_t t2;
+  pthread_t t3;
+  pthread_create(&t1, nullptr, thread1, nullptr);
+  pthread_create(&t2, nullptr, thread2, nullptr);
+  pthread_create(&t3, nullptr, thread3, nullptr);
+
+  pthread_join(t1, nullptr);
+  pthread_join(t2, nullptr);
+  pthread_join(t3, nullptr);
+  return 0;
+}
+
+// CHECK: WARNING: ThreadSanitizer: data race
+// CHECK-NOT-EMPTY:
\ No newline at end of file
Index: test/tsan/fence_norace.cpp
===================================================================
--- /dev/null
+++ test/tsan/fence_norace.cpp
@@ -0,0 +1,35 @@
+// RUN: %clangxx_tsan -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
+#include "test.h"
+#include <atomic>
+
+int nax;
+std::atomic<int> x;
+
+void* thread1(void* arg) {
+  nax = 1;
+  std::atomic_thread_fence(std::memory_order_release);
+  x.store(1, std::memory_order_relaxed);
+  return 0;
+}
+
+void* thread2(void* arg) {
+  if (x.load(std::memory_order_relaxed) == 1) {
+    std::atomic_thread_fence(std::memory_order_acquire);
+    printf("%d\n", nax);
+  }
+  return 0;
+}
+
+int main() {
+  pthread_t t1;
+  pthread_t t2;
+  pthread_create(&t1, nullptr, thread1, nullptr);
+  pthread_create(&t2, nullptr, thread2, nullptr);
+
+  pthread_join(t1, nullptr);
+  pthread_join(t2, nullptr);
+  return 0;
+}
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK: 1
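
Note (not part of the patch): the two tests above cover a relaxed store breaking a release sequence and fence-based synchronization. The TODO in ThreadClock::release points at the complementary case, which a third test could pin down: an atomic read-modify-write continues a release sequence, so no race should be reported. Below is a minimal sketch under the same test.h and lit conventions assumed by the tests in this patch; the file name and thread structure are illustrative only.

// RUN: %clangxx_tsan -O0 %s -o %t && %run %t 2>&1 | FileCheck %s
// Hypothetical companion test (not part of this patch): a relaxed RMW
// continues the release sequence headed by thread1's release store, so
// thread3's read of nax is ordered after nax = 1 and must not race.
#include "test.h"

int nax;
int x;

void* thread1(void* arg) {
  nax = 1;
  __atomic_store_n(&x, 1, __ATOMIC_RELEASE);
  return 0;
}

void* thread2(void* arg) {
  // Relaxed RMW: stays part of thread1's release sequence.
  __atomic_fetch_add(&x, 1, __ATOMIC_RELAXED);
  return 0;
}

void* thread3(void* arg) {
  if (__atomic_load_n(&x, __ATOMIC_ACQUIRE) == 2) {
    // Reads-from the RMW, which belongs to thread1's release sequence,
    // so this access happens-after nax = 1.
    int temp = nax;
    (void)temp;
  }
  return 0;
}

int main() {
  pthread_t t1, t2, t3;
  pthread_create(&t1, nullptr, thread1, nullptr);
  pthread_create(&t2, nullptr, thread2, nullptr);
  pthread_create(&t3, nullptr, thread3, nullptr);
  pthread_join(t1, nullptr);
  pthread_join(t2, nullptr);
  pthread_join(t3, nullptr);
  fprintf(stderr, "DONE\n");
  return 0;
}

// CHECK-NOT: WARNING: ThreadSanitizer: data race
// CHECK: DONE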