Index: lib/sanitizer_common/sanitizer_thread_registry.h
===================================================================
--- lib/sanitizer_common/sanitizer_thread_registry.h
+++ lib/sanitizer_common/sanitizer_thread_registry.h
@@ -68,6 +68,7 @@
   virtual void OnStarted(void *arg) {}
   virtual void OnCreated(void *arg) {}
   virtual void OnReset() {}
+  virtual void OnDetached(void *arg) {}
 };
 
 typedef ThreadContextBase* (*ThreadContextFactory)(u32 tid);
@@ -111,6 +112,7 @@
   void SetThreadName(u32 tid, const char *name);
   void SetThreadNameByUserId(uptr user_id, const char *name);
   void DetachThread(u32 tid);
+  void DetachThread(u32 tid, void *arg);
   void JoinThread(u32 tid, void *arg);
   void FinishThread(u32 tid);
   void StartThread(u32 tid, uptr os_id, void *arg);
Index: lib/sanitizer_common/sanitizer_thread_registry.cc
===================================================================
--- lib/sanitizer_common/sanitizer_thread_registry.cc
+++ lib/sanitizer_common/sanitizer_thread_registry.cc
@@ -219,6 +219,10 @@
 }
 
 void ThreadRegistry::DetachThread(u32 tid) {
+  DetachThread(tid, 0);
+}
+
+void ThreadRegistry::DetachThread(u32 tid, void *arg) {
   BlockingMutexLock l(&mtx_);
   CHECK_LT(tid, n_contexts_);
   ThreadContextBase *tctx = threads_[tid];
@@ -227,6 +231,7 @@
     Report("%s: Detach of non-existent thread\n", SanitizerToolName);
     return;
   }
+  tctx->OnDetached(arg);
   if (tctx->status == ThreadStatusFinished) {
     tctx->SetDead();
     QuarantinePush(tctx);
Index: lib/tsan/rtl/tsan_clock.h
===================================================================
--- lib/tsan/rtl/tsan_clock.h
+++ lib/tsan/rtl/tsan_clock.h
@@ -14,7 +14,7 @@
 #define TSAN_CLOCK_H
 
 #include "tsan_defs.h"
-#include "tsan_vector.h"
+#include "tsan_dense_alloc.h"
 
 namespace __tsan {
 
@@ -23,37 +23,64 @@
   u64 reused : 64 - kClkBits;
 };
 
+struct ClockBlock {
+  static const uptr kSize = 512;
+  static const uptr kTableSize = kSize / sizeof(u32);
+  static const uptr kClockCount = kSize / sizeof(ClockElem);
+
+  union {
+    u32 table[kTableSize];
+    ClockElem clock[kClockCount];
+  };
+
+  ClockBlock() {
+  }
+};
+
+typedef DenseSlabAlloc<ClockBlock, 1<<16, 1<<10> ClockAlloc;
+typedef DenseSlabAllocCache ClockCache;
+
 // The clock that lives in sync variables (mutexes, atomics, etc).
 class SyncClock {
  public:
   SyncClock();
+  ~SyncClock();
 
   uptr size() const {
-    return clk_.Size();
+    return size_;
   }
 
   u64 get(unsigned tid) const {
-    DCHECK_LT(tid, clk_.Size());
-    return clk_[tid].epoch;
+    return elem(tid).epoch;
   }
 
-  void Reset();
-  void Zero();
+  void Reset(ClockCache *c);
 
   void DebugDump(int(*printf)(const char *s, ...));
 
  private:
+  friend struct ThreadClock;
+  static const uptr kDirtyTids = 2;
+
   unsigned release_store_tid_;
   unsigned release_store_reused_;
-  static const uptr kDirtyTids = 2;
   unsigned dirty_tids_[kDirtyTids];
-  mutable Vector<ClockElem> clk_;
-  friend struct ThreadClock;
+  // tab_ contains indirect pointer to a 512b block using DenseSlabAlloc.
+  // If size_ <= 64, then tab_ points to an array with 64 ClockElem's.
+  // Otherwise, tab_ points to an array with 128 u32 elements,
+  // each pointing to the second-level 512b block with 64 ClockElem's.
+  ClockBlock *tab_;
+  u32 tab_idx_;
+  u32 size_;
+
+  ClockElem &elem(unsigned tid) const;
 };
 
 // The clock that lives in threads.
 struct ThreadClock {
  public:
+  typedef DenseSlabAllocCache Cache;
+
   explicit ThreadClock(unsigned tid, unsigned reused = 0);
 
   u64 get(unsigned tid) const {
@@ -76,10 +103,10 @@
     return nclk_;
   }
 
-  void acquire(const SyncClock *src);
-  void release(SyncClock *dst) const;
-  void acq_rel(SyncClock *dst);
-  void ReleaseStore(SyncClock *dst) const;
+  void acquire(ClockCache *c, const SyncClock *src);
+  void release(ClockCache *c, SyncClock *dst) const;
+  void acq_rel(ClockCache *c, SyncClock *dst);
+  void ReleaseStore(ClockCache *c, SyncClock *dst) const;
 
   void DebugReset();
   void DebugDump(int(*printf)(const char *s, ...));
@@ -94,6 +121,7 @@
 
   bool IsAlreadyAcquired(const SyncClock *src) const;
   void UpdateCurrentThread(SyncClock *dst) const;
+  void Resize(ClockCache *c, SyncClock *dst) const;
 };
 
 }  // namespace __tsan
Index: lib/tsan/rtl/tsan_clock.cc
===================================================================
--- lib/tsan/rtl/tsan_clock.cc
+++ lib/tsan/rtl/tsan_clock.cc
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 #include "tsan_clock.h"
 #include "tsan_rtl.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
 
 // SyncClock and ThreadClock implement vector clocks for sync variables
 // (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
@@ -102,13 +103,13 @@
   clk_[tid_].reused = reused_;
 }
 
-void ThreadClock::acquire(const SyncClock *src) {
+void ThreadClock::acquire(ClockCache *c, const SyncClock *src) {
   DCHECK(nclk_ <= kMaxTid);
-  DCHECK(src->clk_.Size() <= kMaxTid);
+  DCHECK(src->size_ <= kMaxTid);
   CPP_STAT_INC(StatClockAcquire);
 
   // Check if it's empty -> no need to do anything.
-  const uptr nclk = src->clk_.Size();
+  const uptr nclk = src->size_;
   if (nclk == 0) {
     CPP_STAT_INC(StatClockAcquireEmpty);
     return;
@@ -118,12 +119,12 @@
   bool acquired = false;
   if (nclk > tid_) {
     CPP_STAT_INC(StatClockAcquireLarge);
-    if (src->clk_[tid_].reused == reused_) {
+    if (src->elem(tid_).reused == reused_) {
       CPP_STAT_INC(StatClockAcquireRepeat);
       for (unsigned i = 0; i < kDirtyTids; i++) {
         unsigned tid = src->dirty_tids_[i];
         if (tid != kInvalidTid) {
-          u64 epoch = src->clk_[tid].epoch;
+          u64 epoch = src->elem(tid).epoch;
           if (clk_[tid].epoch < epoch) {
             clk_[tid].epoch = epoch;
             acquired = true;
@@ -142,7 +143,7 @@
   CPP_STAT_INC(StatClockAcquireFull);
   nclk_ = max(nclk_, nclk);
   for (uptr i = 0; i < nclk; i++) {
-    u64 epoch = src->clk_[i].epoch;
+    u64 epoch = src->elem(i).epoch;
     if (clk_[i].epoch < epoch) {
       clk_[i].epoch = epoch;
       acquired = true;
@@ -151,7 +152,7 @@
 
   // Remember that this thread has acquired this clock.
   if (nclk > tid_)
-    src->clk_[tid_].reused = reused_;
+    src->elem(tid_).reused = reused_;
 
   if (acquired) {
     CPP_STAT_INC(StatClockAcquiredSomething);
@@ -159,28 +160,26 @@
   }
 }
 
-void ThreadClock::release(SyncClock *dst) const {
+void ThreadClock::release(ClockCache *c, SyncClock *dst) const {
   DCHECK_LE(nclk_, kMaxTid);
-  DCHECK_LE(dst->clk_.Size(), kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
 
-  if (dst->clk_.Size() == 0) {
+  if (dst->size_ == 0) {
     // ReleaseStore will correctly set release_store_tid_,
     // which can be important for future operations.
-    ReleaseStore(dst);
+    ReleaseStore(c, dst);
     return;
   }
 
   CPP_STAT_INC(StatClockRelease);
 
   // Check if we need to resize dst.
-  if (dst->clk_.Size() < nclk_) {
-    CPP_STAT_INC(StatClockReleaseResize);
-    dst->clk_.Resize(nclk_);
-  }
+  if (dst->size_ < nclk_)
+    Resize(c, dst);
 
   // Check if we had not acquired anything from other threads
   // since the last release on dst. If so, we need to update
-  // only dst->clk_[tid_].
-  if (dst->clk_[tid_].epoch > last_acquire_) {
+  // only dst->elem(tid_).
+  if (dst->elem(tid_).epoch > last_acquire_) {
     UpdateCurrentThread(dst);
     if (dst->release_store_tid_ != tid_ ||
         dst->release_store_reused_ != reused_)
@@ -196,14 +195,15 @@
   CPP_STAT_INC(StatClockReleaseAcquired);
   // Update dst->clk_.
   for (uptr i = 0; i < nclk_; i++) {
-    dst->clk_[i].epoch = max(dst->clk_[i].epoch, clk_[i].epoch);
-    dst->clk_[i].reused = 0;
+    ClockElem &ce = dst->elem(i);
+    ce.epoch = max(ce.epoch, clk_[i].epoch);
+    ce.reused = 0;
   }
   // Clear 'acquired' flag in the remaining elements.
-  if (nclk_ < dst->clk_.Size())
+  if (nclk_ < dst->size_)
     CPP_STAT_INC(StatClockReleaseClearTail);
-  for (uptr i = nclk_; i < dst->clk_.Size(); i++)
-    dst->clk_[i].reused = 0;
+  for (uptr i = nclk_; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
   for (unsigned i = 0; i < kDirtyTids; i++)
     dst->dirty_tids_[i] = kInvalidTid;
   dst->release_store_tid_ = kInvalidTid;
@@ -211,23 +211,21 @@
   // If we've acquired dst, remember this fact,
   // so that we don't need to acquire it on next acquire.
   if (acquired)
-    dst->clk_[tid_].reused = reused_;
+    dst->elem(tid_).reused = reused_;
 }
 
-void ThreadClock::ReleaseStore(SyncClock *dst) const {
+void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) const {
   DCHECK(nclk_ <= kMaxTid);
-  DCHECK(dst->clk_.Size() <= kMaxTid);
+  DCHECK(dst->size_ <= kMaxTid);
   CPP_STAT_INC(StatClockStore);
 
   // Check if we need to resize dst.
-  if (dst->clk_.Size() < nclk_) {
-    CPP_STAT_INC(StatClockStoreResize);
-    dst->clk_.Resize(nclk_);
-  }
+  if (dst->size_ < nclk_)
+    Resize(c, dst);
 
   if (dst->release_store_tid_ == tid_ &&
       dst->release_store_reused_ == reused_ &&
-      dst->clk_[tid_].epoch > last_acquire_) {
+      dst->elem(tid_).epoch > last_acquire_) {
     CPP_STAT_INC(StatClockStoreFast);
     UpdateCurrentThread(dst);
     return;
@@ -236,13 +234,17 @@
   // O(N) release-store.
   CPP_STAT_INC(StatClockStoreFull);
   for (uptr i = 0; i < nclk_; i++) {
-    dst->clk_[i].epoch = clk_[i].epoch;
-    dst->clk_[i].reused = 0;
+    ClockElem &ce = dst->elem(i);
+    ce.epoch = clk_[i].epoch;
+    ce.reused = 0;
   }
   // Clear the tail of dst->clk_.
-  if (nclk_ < dst->clk_.Size()) {
-    internal_memset(&dst->clk_[nclk_], 0,
-        (dst->clk_.Size() - nclk_) * sizeof(dst->clk_[0]));
+  if (nclk_ < dst->size_) {
+    for (uptr i = nclk_; i < dst->size_; i++) {
+      ClockElem &ce = dst->elem(i);
+      ce.epoch = 0;
+      ce.reused = 0;
+    }
     CPP_STAT_INC(StatClockStoreTail);
   }
   for (unsigned i = 0; i < kDirtyTids; i++)
@@ -250,19 +252,19 @@
   dst->release_store_tid_ = tid_;
   dst->release_store_reused_ = reused_;
   // Rememeber that we don't need to acquire it in future.
-  dst->clk_[tid_].reused = reused_;
+  dst->elem(tid_).reused = reused_;
 }
 
-void ThreadClock::acq_rel(SyncClock *dst) {
+void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
   CPP_STAT_INC(StatClockAcquireRelease);
-  acquire(dst);
-  ReleaseStore(dst);
+  acquire(c, dst);
+  ReleaseStore(c, dst);
 }
 
 // Updates only single element related to the current thread in dst->clk_.
 void ThreadClock::UpdateCurrentThread(SyncClock *dst) const {
   // Update the threads time, but preserve 'acquired' flag.
-  dst->clk_[tid_].epoch = clk_[tid_].epoch;
+  dst->elem(tid_).epoch = clk_[tid_].epoch;
 
   for (unsigned i = 0; i < kDirtyTids; i++) {
     if (dst->dirty_tids_[i] == tid_) {
@@ -277,27 +279,73 @@
   }
   // Reset all 'acquired' flags, O(N).
   CPP_STAT_INC(StatClockReleaseSlow);
-  for (uptr i = 0; i < dst->clk_.Size(); i++) {
-    dst->clk_[i].reused = 0;
-  }
+  for (uptr i = 0; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
   for (unsigned i = 0; i < kDirtyTids; i++)
     dst->dirty_tids_[i] = kInvalidTid;
 }
 
 // Checks whether the current threads has already acquired src.
 bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
-  if (src->clk_[tid_].reused != reused_)
+  if (src->elem(tid_).reused != reused_)
     return false;
   for (unsigned i = 0; i < kDirtyTids; i++) {
     unsigned tid = src->dirty_tids_[i];
     if (tid != kInvalidTid) {
-      if (clk_[tid].epoch < src->clk_[tid].epoch)
+      if (clk_[tid].epoch < src->elem(tid).epoch)
         return false;
     }
   }
   return true;
 }
 
+void ThreadClock::Resize(ClockCache *c, SyncClock *dst) const {
+  CPP_STAT_INC(StatClockReleaseResize);
+  if (RoundUpTo(nclk_, ClockBlock::kClockCount) <=
+      RoundUpTo(dst->size_, ClockBlock::kClockCount)) {
+    // Growing within the same block.
+    // Memory is already allocated, just increase the size.
+    dst->size_ = nclk_;
+    return;
+  }
+  if (nclk_ <= ClockBlock::kClockCount) {
+    // Grow from 0 to one-level table.
+    CHECK_EQ(dst->size_, 0);
+    CHECK_EQ(dst->tab_, 0);
+    CHECK_EQ(dst->tab_idx_, 0);
+    dst->size_ = nclk_;
+    dst->tab_idx_ = ctx->clock_alloc.Alloc(c);
+    dst->tab_ = ctx->clock_alloc.Map(dst->tab_idx_);
+    internal_memset(dst->tab_, 0, sizeof(*dst->tab_));
+    return;
+  }
+  // Growing two-level table.
+  if (dst->size_ == 0) {
+    // Allocate first level table.
+    dst->tab_idx_ = ctx->clock_alloc.Alloc(c);
+    dst->tab_ = ctx->clock_alloc.Map(dst->tab_idx_);
+    internal_memset(dst->tab_, 0, sizeof(*dst->tab_));
+  } else if (dst->size_ <= ClockBlock::kClockCount) {
+    // Transform one-level table to two-level table.
+    u32 old = dst->tab_idx_;
+    dst->tab_idx_ = ctx->clock_alloc.Alloc(c);
+    dst->tab_ = ctx->clock_alloc.Map(dst->tab_idx_);
+    internal_memset(dst->tab_, 0, sizeof(*dst->tab_));
+    dst->tab_->table[0] = old;
+  }
+  // At this point we have first level table allocated.
+  // Add second level tables as necessary.
+  for (uptr i = RoundUpTo(dst->size_, ClockBlock::kClockCount);
+      i < nclk_; i += ClockBlock::kClockCount) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    internal_memset(cb, 0, sizeof(*cb));
+    CHECK_EQ(dst->tab_->table[i/ClockBlock::kClockCount], 0);
+    dst->tab_->table[i/ClockBlock::kClockCount] = idx;
+  }
+  dst->size_ = nclk_;
+}
+
 // Sets a single element in the vector clock.
 // This function is called only from weird places like AcquireGlobal.
 void ThreadClock::set(unsigned tid, u64 v) {
@@ -320,34 +368,59 @@
       tid_, reused_, last_acquire_);
 }
 
-SyncClock::SyncClock()
-    : clk_(MBlockClock) {
+SyncClock::SyncClock() {
+  tab_ = 0;
+  tab_idx_ = 0;
+  size_ = 0;
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
     dirty_tids_[i] = kInvalidTid;
 }
 
-void SyncClock::Reset() {
-  clk_.Reset();
-  Zero();
-}
-
-void SyncClock::Zero() {
-  clk_.Resize(0);
+SyncClock::~SyncClock() {
+  CHECK_EQ(size_, 0);
+  CHECK_EQ(tab_, 0);
+  CHECK_EQ(tab_idx_, 0);
+}
+
+void SyncClock::Reset(ClockCache *c) {
+  if (size_ == 0) {
+    // nothing
+  } else if (size_ <= ClockBlock::kClockCount) {
+    // One-level table.
+    ctx->clock_alloc.Free(c, tab_idx_);
+  } else {
+    // Two-level table.
+    for (uptr i = 0; i < size_; i += ClockBlock::kClockCount)
+      ctx->clock_alloc.Free(c, tab_->table[i / ClockBlock::kClockCount]);
+    ctx->clock_alloc.Free(c, tab_idx_);
+  }
+  tab_ = 0;
+  tab_idx_ = 0;
+  size_ = 0;
   release_store_tid_ = kInvalidTid;
   release_store_reused_ = 0;
   for (uptr i = 0; i < kDirtyTids; i++)
     dirty_tids_[i] = kInvalidTid;
 }
 
+ClockElem &SyncClock::elem(unsigned tid) const {
+  DCHECK_LT(tid, size_);
+  if (size_ <= ClockBlock::kClockCount)
+    return tab_->clock[tid];
+  u32 idx = tab_->table[tid / ClockBlock::kClockCount];
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  return cb->clock[tid % ClockBlock::kClockCount];
+}
+
 void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
   printf("clock=[");
-  for (uptr i = 0; i < clk_.Size(); i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].epoch);
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
   printf("] reused=[");
-  for (uptr i = 0; i < clk_.Size(); i++)
-    printf("%s%llu", i == 0 ? "" : ",", clk_[i].reused);
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
   printf("] release_store_tid=%d/%d dirty_tids=%d/%d",
       release_store_tid_, release_store_reused_,
       dirty_tids_[0], dirty_tids_[1]);
Index: lib/tsan/rtl/tsan_flags.cc
===================================================================
--- lib/tsan/rtl/tsan_flags.cc
+++ lib/tsan/rtl/tsan_flags.cc
@@ -107,7 +107,7 @@
   ParseCommonFlagsFromString(f, env);
 
   // Copy back to common flags.
-  *common_flags() = *f;
+  internal_memcpy(common_flags(), f, sizeof(*common_flags()));
 
   // Sanity check.
   if (!f->report_bugs) {
Index: lib/tsan/rtl/tsan_rtl.h
===================================================================
--- lib/tsan/rtl/tsan_rtl.h
+++ lib/tsan/rtl/tsan_rtl.h
@@ -374,6 +374,7 @@
 
   DenseSlabAllocCache block_cache;
   DenseSlabAllocCache sync_cache;
+  DenseSlabAllocCache clock_cache;
 
 #ifndef TSAN_GO
   u32 last_sleep_stack_id;
@@ -418,6 +419,7 @@
   void OnStarted(void *arg);
   void OnCreated(void *arg);
   void OnReset();
+  void OnDetached(void *arg);
 };
 
 struct RacyStacks {
@@ -466,6 +468,8 @@
   InternalMmapVector<FiredSuppression> fired_suppressions;
   DDetector *dd;
 
+  ClockAlloc clock_alloc;
+
   Flags flags;
 
   u64 stat[StatCnt];
Index: lib/tsan/rtl/tsan_rtl_mutex.cc
===================================================================
--- lib/tsan/rtl/tsan_rtl_mutex.cc
+++ lib/tsan/rtl/tsan_rtl_mutex.cc
@@ -118,7 +118,7 @@
   u64 mid = s->GetId();
   u32 last_lock = s->last_lock;
   if (!unlock_locked)
-    s->Reset();  // must not reset it before the report is printed
+    s->Reset(thr);  // must not reset it before the report is printed
   s->mtx.Unlock();
   if (unlock_locked) {
     ThreadRegistryLock l(ctx->thread_registry);
@@ -136,7 +136,7 @@
   if (unlock_locked) {
     SyncVar *s = ctx->metamap.GetIfExistsAndLock(addr);
     if (s != 0) {
-      s->Reset();
+      s->Reset(thr);
       s->mtx.Unlock();
     }
   }
@@ -429,7 +429,7 @@
   if (thr->ignore_sync)
     return;
   thr->clock.set(thr->fast_state.epoch());
-  thr->clock.acquire(c);
+  thr->clock.acquire(&thr->clock_cache, c);
   StatInc(thr, StatSyncAcquire);
 }
 
@@ -438,7 +438,7 @@
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.release(c);
+  thr->clock.release(&thr->clock_cache, c);
   StatInc(thr, StatSyncRelease);
 }
 
@@ -447,7 +447,7 @@
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.ReleaseStore(c);
+  thr->clock.ReleaseStore(&thr->clock_cache, c);
   StatInc(thr, StatSyncRelease);
 }
 
@@ -456,7 +456,7 @@
     return;
   thr->clock.set(thr->fast_state.epoch());
   thr->fast_synch_epoch = thr->fast_state.epoch();
-  thr->clock.acq_rel(c);
+  thr->clock.acq_rel(&thr->clock_cache, c);
   StatInc(thr, StatSyncAcquire);
   StatInc(thr, StatSyncRelease);
 }
Index: lib/tsan/rtl/tsan_rtl_thread.cc
===================================================================
--- lib/tsan/rtl/tsan_rtl_thread.cc
+++ lib/tsan/rtl/tsan_rtl_thread.cc
@@ -36,13 +36,13 @@
 #endif
 
 void ThreadContext::OnDead() {
-  sync.Reset();
+  CHECK_EQ(sync.size(), 0);
 }
 
 void ThreadContext::OnJoined(void *arg) {
   ThreadState *caller_thr = static_cast<ThreadState *>(arg);
   AcquireImpl(caller_thr, 0, &sync);
-  sync.Reset();
+  sync.Reset(&caller_thr->clock_cache);
 }
 
 struct OnCreatedArgs {
@@ -65,11 +65,16 @@
 }
 
 void ThreadContext::OnReset() {
-  sync.Reset();
+  CHECK_EQ(sync.size(), 0);
   FlushUnneededShadowMemory(GetThreadTrace(tid), TraceSize() * sizeof(Event));
   //!!! FlushUnneededShadowMemory(GetThreadTraceHeader(tid), sizeof(Trace));
 }
 
+void ThreadContext::OnDetached(void *arg) {
+  ThreadState *thr1 = static_cast<ThreadState *>(arg);
+  sync.Reset(&thr1->clock_cache);
+}
+
 struct OnStartedArgs {
   ThreadState *thr;
   uptr stk_addr;
@@ -113,7 +118,7 @@
   Trace *thr_trace = ThreadTrace(thr->tid);
   thr_trace->headers[trace].epoch0 = epoch0;
   StatInc(thr, StatSyncAcquire);
-  sync.Reset();
+  sync.Reset(&thr->clock_cache);
   DPrintf("#%d: ThreadStart epoch=%zu stk_addr=%zx stk_size=%zx "
           "tls_addr=%zx tls_size=%zx\n",
           tid, (uptr)epoch0, args->stk_addr, args->stk_size,
@@ -134,6 +139,7 @@
     ctx->dd->DestroyPhysicalThread(thr->dd_pt);
     ctx->dd->DestroyLogicalThread(thr->dd_lt);
   }
+  ctx->clock_alloc.FlushCache(&thr->clock_cache);
   ctx->metamap.OnThreadIdle(thr);
 #ifndef TSAN_GO
   AllocatorThreadFinish(thr);
@@ -307,7 +313,7 @@
 void ThreadDetach(ThreadState *thr, uptr pc, int tid) {
   CHECK_GT(tid, 0);
   CHECK_LT(tid, kMaxTid);
-  ctx->thread_registry->DetachThread(tid);
+  ctx->thread_registry->DetachThread(tid, thr);
 }
 
 void ThreadSetName(ThreadState *thr, const char *name) {
Index: lib/tsan/rtl/tsan_sync.h
===================================================================
--- lib/tsan/rtl/tsan_sync.h
+++ lib/tsan/rtl/tsan_sync.h
@@ -47,7 +47,7 @@
   SyncClock clock;
 
   void Init(ThreadState *thr, uptr pc, uptr addr, u64 uid);
-  void Reset();
+  void Reset(ThreadState *thr);
 
   u64 GetId() const {
     // 47 lsb is addr, then 14 bits is low part of uid, then 3 zero bits.
Index: lib/tsan/rtl/tsan_sync.cc
===================================================================
--- lib/tsan/rtl/tsan_sync.cc
+++ lib/tsan/rtl/tsan_sync.cc
@@ -21,7 +21,7 @@
 
 SyncVar::SyncVar()
     : mtx(MutexTypeSyncVar, StatMtxSyncVar) {
-  Reset();
+  Reset(0);
 }
 
 void SyncVar::Init(ThreadState *thr, uptr pc, uptr addr, u64 uid) {
@@ -36,7 +36,7 @@
   DDMutexInit(thr, pc, this);
 }
 
-void SyncVar::Reset() {
+void SyncVar::Reset(ThreadState *thr) {
   uid = 0;
   creation_stack_id = 0;
   owner_tid = kInvalidTid;
@@ -47,8 +47,13 @@
   is_broken = 0;
   is_linker_init = 0;
 
-  clock.Zero();
-  read_clock.Reset();
+  if (thr == 0) {
+    CHECK_EQ(clock.size(), 0);
+    CHECK_EQ(read_clock.size(), 0);
+  } else {
+    clock.Reset(&thr->clock_cache);
+    read_clock.Reset(&thr->clock_cache);
+  }
 }
 
 MetaMap::MetaMap() {
@@ -93,7 +98,7 @@
       DCHECK(idx & kFlagSync);
       SyncVar *s = sync_alloc_.Map(idx & ~kFlagMask);
       u32 next = s->next;
-      s->Reset();
+      s->Reset(thr);
       sync_alloc_.Free(&thr->sync_cache, idx & ~kFlagMask);
       idx = next;
     } else {
@@ -143,7 +148,7 @@
     SyncVar * s = sync_alloc_.Map(idx & ~kFlagMask);
     if (s->addr == addr) {
      if (myidx != 0) {
-        mys->Reset();
+        mys->Reset(thr);
        sync_alloc_.Free(&thr->sync_cache, myidx);
      }
      if (write_lock)
Index: lib/tsan/tests/unit/tsan_clock_test.cc
===================================================================
--- lib/tsan/tests/unit/tsan_clock_test.cc
+++ lib/tsan/tests/unit/tsan_clock_test.cc
@@ -17,6 +17,8 @@
 
 namespace __tsan {
 
+ClockCache cache;
+
 TEST(Clock, VectorBasic) {
   ThreadClock clk(0);
   ASSERT_EQ(clk.size(), 1U);
@@ -38,30 +40,32 @@
   SyncClock chunked;
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 0U);
-  vector.acquire(&chunked);
+  vector.acquire(&cache, &chunked);
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 0U);
-  vector.release(&chunked);
+  vector.release(&cache, &chunked);
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 1U);
-  vector.acq_rel(&chunked);
+  vector.acq_rel(&cache, &chunked);
   ASSERT_EQ(vector.size(), 1U);
   ASSERT_EQ(chunked.size(), 1U);
+  chunked.Reset(&cache);
 }
 
 TEST(Clock, AcquireRelease) {
   ThreadClock vector1(100);
   vector1.tick();
   SyncClock chunked;
-  vector1.release(&chunked);
+  vector1.release(&cache, &chunked);
   ASSERT_EQ(chunked.size(), 101U);
   ThreadClock vector2(0);
-  vector2.acquire(&chunked);
+  vector2.acquire(&cache, &chunked);
   ASSERT_EQ(vector2.size(), 101U);
   ASSERT_EQ(vector2.get(0), 0U);
   ASSERT_EQ(vector2.get(1), 0U);
   ASSERT_EQ(vector2.get(99), 0U);
   ASSERT_EQ(vector2.get(100), 1U);
+  chunked.Reset(&cache);
 }
 
 TEST(Clock, RepeatedAcquire) {
@@ -71,10 +75,12 @@
   thr2.tick();
 
   SyncClock sync;
-  thr1.ReleaseStore(&sync);
+  thr1.ReleaseStore(&cache, &sync);
+
+  thr2.acquire(&cache, &sync);
+  thr2.acquire(&cache, &sync);
 
-  thr2.acquire(&sync);
-  thr2.acquire(&sync);
+  sync.Reset(&cache);
 }
 
 TEST(Clock, ManyThreads) {
@@ -83,9 +89,9 @@
     ThreadClock vector(0);
     vector.tick();
     vector.set(i, 1);
-    vector.release(&chunked);
+    vector.release(&cache, &chunked);
     ASSERT_EQ(i + 1, chunked.size());
-    vector.acquire(&chunked);
+    vector.acquire(&cache, &chunked);
     ASSERT_EQ(i + 1, vector.size());
   }
 
@@ -93,10 +99,12 @@
     ASSERT_EQ(1U, chunked.get(i));
 
   ThreadClock vector(1);
-  vector.acquire(&chunked);
+  vector.acquire(&cache, &chunked);
   ASSERT_EQ(100U, vector.size());
   for (unsigned i = 0; i < 100; i++)
     ASSERT_EQ(1U, vector.get(i));
+
+  chunked.Reset(&cache);
 }
 
 TEST(Clock, DifferentSizes) {
@@ -107,33 +115,102 @@
     vector2.tick();
     {
      SyncClock chunked;
-      vector1.release(&chunked);
+      vector1.release(&cache, &chunked);
      ASSERT_EQ(chunked.size(), 11U);
-      vector2.release(&chunked);
+      vector2.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 21U);
+      chunked.Reset(&cache);
     }
     {
       SyncClock chunked;
-      vector2.release(&chunked);
+      vector2.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 21U);
-      vector1.release(&chunked);
+      vector1.release(&cache, &chunked);
       ASSERT_EQ(chunked.size(), 21U);
+      chunked.Reset(&cache);
     }
     {
       SyncClock chunked;
-      vector1.release(&chunked);
-      vector2.acquire(&chunked);
+      vector1.release(&cache, &chunked);
+      vector2.acquire(&cache, &chunked);
       ASSERT_EQ(vector2.size(), 21U);
+      chunked.Reset(&cache);
     }
     {
       SyncClock chunked;
-      vector2.release(&chunked);
-      vector1.acquire(&chunked);
+      vector2.release(&cache, &chunked);
+      vector1.acquire(&cache, &chunked);
       ASSERT_EQ(vector1.size(), 21U);
+      chunked.Reset(&cache);
     }
   }
 }
 
+TEST(Clock, Growth) {
+  {
+    ThreadClock vector(10);
+    vector.tick();
+    vector.set(5, 42);
+    SyncClock sync;
+    vector.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 11U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(1), 0ULL);
+    ASSERT_EQ(sync.get(5), 42ULL);
+    ASSERT_EQ(sync.get(9), 0ULL);
+    ASSERT_EQ(sync.get(10), 1ULL);
+    sync.Reset(&cache);
+  }
+  {
+    ThreadClock vector1(10);
+    vector1.tick();
+    ThreadClock vector2(20);
+    vector2.tick();
+    SyncClock sync;
+    vector1.release(&cache, &sync);
+    vector2.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 21U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(10), 1ULL);
+    ASSERT_EQ(sync.get(19), 0ULL);
+    ASSERT_EQ(sync.get(20), 1ULL);
+    sync.Reset(&cache);
+  }
+  {
+    ThreadClock vector(100);
+    vector.tick();
+    vector.set(5, 42);
+    vector.set(90, 84);
+    SyncClock sync;
+    vector.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 101U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(1), 0ULL);
+    ASSERT_EQ(sync.get(5), 42ULL);
+    ASSERT_EQ(sync.get(60), 0ULL);
+    ASSERT_EQ(sync.get(70), 0ULL);
+    ASSERT_EQ(sync.get(90), 84ULL);
+    ASSERT_EQ(sync.get(99), 0ULL);
+    ASSERT_EQ(sync.get(100), 1ULL);
+    sync.Reset(&cache);
+  }
+  {
+    ThreadClock vector1(10);
+    vector1.tick();
+    ThreadClock vector2(100);
+    vector2.tick();
+    SyncClock sync;
+    vector1.release(&cache, &sync);
+    vector2.release(&cache, &sync);
+    ASSERT_EQ(sync.size(), 101U);
+    ASSERT_EQ(sync.get(0), 0ULL);
+    ASSERT_EQ(sync.get(10), 1ULL);
+    ASSERT_EQ(sync.get(99), 0ULL);
+    ASSERT_EQ(sync.get(100), 1ULL);
+    sync.Reset(&cache);
+  }
+}
+
 const int kThreads = 4;
 const int kClocks = 4;
 
@@ -257,31 +334,31 @@
       if (printing)
         printf("acquire thr%d <- clk%d\n", tid, cid);
       thr0[tid]->acquire(sync0[cid]);
-      thr1[tid]->acquire(sync1[cid]);
+      thr1[tid]->acquire(&cache, sync1[cid]);
       break;
    case 1:
      if (printing)
        printf("release thr%d -> clk%d\n", tid, cid);
      thr0[tid]->release(sync0[cid]);
-      thr1[tid]->release(sync1[cid]);
+      thr1[tid]->release(&cache, sync1[cid]);
      break;
    case 2:
      if (printing)
        printf("acq_rel thr%d <> clk%d\n", tid, cid);
      thr0[tid]->acq_rel(sync0[cid]);
-      thr1[tid]->acq_rel(sync1[cid]);
+      thr1[tid]->acq_rel(&cache, sync1[cid]);
      break;
    case 3:
      if (printing)
        printf("rel_str thr%d >> clk%d\n", tid, cid);
      thr0[tid]->ReleaseStore(sync0[cid]);
-      thr1[tid]->ReleaseStore(sync1[cid]);
+      thr1[tid]->ReleaseStore(&cache, sync1[cid]);
      break;
    case 4:
      if (printing)
        printf("reset clk%d\n", cid);
      sync0[cid]->Reset();
-      sync1[cid]->Reset();
+      sync1[cid]->Reset(&cache);
      break;
    case 5:
      if (printing)
@@ -331,6 +408,10 @@
       return false;
     }
   }
+
+  for (unsigned i = 0; i < kClocks; i++) {
+    sync1[i]->Reset(&cache);
+  }
   return true;
 }
 
Index: lib/tsan/tests/unit/tsan_sync_test.cc
===================================================================
--- lib/tsan/tests/unit/tsan_sync_test.cc
+++ lib/tsan/tests/unit/tsan_sync_test.cc
@@ -114,7 +114,7 @@
   u64 block[1] = {};  // fake malloc block
   m->AllocBlock(thr, 0, (uptr)&block[0], 1 * sizeof(u64));
   SyncVar *s = m->GetOrCreateAndLock(thr, 0, (uptr)&block[0], true);
-  s->Reset();
+  s->Reset(thr);
   s->mtx.Unlock();
   uptr sz = m->FreeBlock(thr, 0, (uptr)&block[0]);
   EXPECT_EQ(sz, 1 * sizeof(u64));
Index: test/tsan/thread_detach.c
===================================================================
--- test/tsan/thread_detach.c
+++ test/tsan/thread_detach.c
@@ -0,0 +1,20 @@
+// RUN: %clang_tsan -O1 %s -o %t && %run %t 2>&1 | FileCheck %s
+#include <pthread.h>
+#include <stdio.h>
+#include <unistd.h>
+
+void *Thread(void *x) {
+  return 0;
+}
+
+int main() {
+  pthread_t t;
+  pthread_create(&t, 0, Thread, 0);
+  sleep(1);
+  pthread_detach(t);
+  printf("PASS\n");
+  return 0;
+}
+
+// CHECK-NOT: WARNING: ThreadSanitizer: thread leak
+// CHECK: PASS
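
Illustrative sketch (not part of the patch): the core of the change is replacing SyncClock's Vector<ClockElem> with the tab_/tab_idx_/size_ triple, where elem() resolves a tid either directly (one-level layout: up to 64 ClockElem's in a single 512-byte ClockBlock) or through a first-level table of 128 u32 block indices (two-level layout). The standalone program below shows just that indexing scheme; Slab, Block, and elem() are invented stand-ins (plain new/delete plus a vector of pointers instead of DenseSlabAlloc), so only the block geometry and the one- vs two-level lookup mirror the patch.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

struct Elem { uint64_t epoch; };

// A 512-byte block, as in ClockBlock: either a table of 32-bit block
// indices (first level) or 64 clock elements (leaf level).
struct Block {
  static constexpr size_t kSize = 512;
  static constexpr size_t kTableSize = kSize / sizeof(uint32_t);  // 128
  static constexpr size_t kClockCount = kSize / sizeof(Elem);     // 64
  union {
    uint32_t table[kTableSize];
    Elem clock[kClockCount];
  };
};

// Stand-in for DenseSlabAlloc: blocks are addressed by u32 index, and
// index 0 means "no block", matching the CHECK_EQ(dst->tab_idx_, 0)
// pattern in the patch.
struct Slab {
  std::vector<Block *> blocks{nullptr};  // slot 0 reserved
  ~Slab() { for (Block *b : blocks) delete b; }
  uint32_t Alloc() {
    Block *b = new Block;
    std::memset(b, 0, sizeof(*b));
    blocks.push_back(b);
    return (uint32_t)(blocks.size() - 1);
  }
  Block *Map(uint32_t idx) { return blocks[idx]; }
};

// The lookup SyncClock::elem() performs: one load for small clocks,
// two dependent loads for large ones.
Elem &elem(Slab &slab, Block *tab, size_t size, unsigned tid) {
  assert(tid < size);
  if (size <= Block::kClockCount)
    return tab->clock[tid];  // one-level layout
  uint32_t idx = tab->table[tid / Block::kClockCount];
  return slab.Map(idx)->clock[tid % Block::kClockCount];
}

int main() {
  Slab slab;
  // One-level clock: up to 64 elements live directly in the block.
  Block *tab1 = slab.Map(slab.Alloc());
  elem(slab, tab1, Block::kClockCount, 63).epoch = 42;
  assert(elem(slab, tab1, Block::kClockCount, 63).epoch == 42);

  // Two-level clock: the first block holds indices of leaf blocks,
  // the way ThreadClock::Resize() populates tab_->table[].
  Block *tab2 = slab.Map(slab.Alloc());
  size_t size = 3 * Block::kClockCount;  // 192 elements -> 3 leaves
  for (size_t i = 0; i < size; i += Block::kClockCount)
    tab2->table[i / Block::kClockCount] = slab.Alloc();
  elem(slab, tab2, size, 130).epoch = 7;
  assert(elem(slab, tab2, size, 130).epoch == 7);
  return 0;
}

This is also why Resize() treats growth within the same block as a pure size_ bump: the 512-byte block is already allocated, so clocks stay in the cheap one-level form until they outgrow ClockBlock::kClockCount.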
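The other half of the design is the allocation discipline: SyncClock no longer frees memory in its destructor; Reset(c) must return every block to a per-thread cache (the new ~SyncClock only CHECKs that this happened), and ThreadFinish flushes that cache back to the shared allocator via ctx->clock_alloc.FlushCache(&thr->clock_cache). Below is a minimal sketch of that cache/flush pattern under stated assumptions: CentralAlloc and ThreadCache are invented stand-ins for DenseSlabAlloc/DenseSlabAllocCache, and the batch and spill sizes are arbitrary, not taken from the patch.

#include <cstdint>
#include <mutex>
#include <vector>

// Shared allocator: all cross-thread traffic goes through one mutex,
// but only in batches.
struct CentralAlloc {
  std::mutex mtx;
  std::vector<uint32_t> free_list;
  uint32_t next = 1;  // index 0 reserved to mean "no block"

  void Refill(std::vector<uint32_t> *cache, size_t batch) {
    std::lock_guard<std::mutex> l(mtx);
    while (cache->size() < batch) {
      if (!free_list.empty()) {
        cache->push_back(free_list.back());
        free_list.pop_back();
      } else {
        cache->push_back(next++);
      }
    }
  }

  void Drain(std::vector<uint32_t> *cache) {
    std::lock_guard<std::mutex> l(mtx);
    free_list.insert(free_list.end(), cache->begin(), cache->end());
    cache->clear();
  }
};

// Per-thread cache: Alloc/Free touch only thread-local state in the
// common case, like DenseSlabAllocCache in the patch.
struct ThreadCache {
  std::vector<uint32_t> idx;
};

uint32_t Alloc(CentralAlloc *a, ThreadCache *c) {
  if (c->idx.empty())
    a->Refill(&c->idx, 16);  // one locked refill per 16 allocations
  uint32_t i = c->idx.back();
  c->idx.pop_back();
  return i;
}

void Free(CentralAlloc *a, ThreadCache *c, uint32_t i) {
  c->idx.push_back(i);
  if (c->idx.size() > 64)  // spill threshold, arbitrary in this sketch
    a->Drain(&c->idx);
}

int main() {
  CentralAlloc central;
  ThreadCache cache;  // one per thread in the real runtime
  uint32_t b = Alloc(&central, &cache);
  Free(&central, &cache, b);
  // Mirrors ctx->clock_alloc.FlushCache(&thr->clock_cache) at thread exit:
  central.Drain(&cache.idx);
  return 0;
}

This is the same reason SyncVar::Reset() now takes a ThreadState: freeing a clock requires some thread's cache to free into, and the detach path (OnDetached) threads the detaching thread's state through the new DetachThread(tid, arg) overload so the detached thread's sync clock can be released immediately, which is what the new thread_detach.c test checks for.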