Index: lib/tsan/rtl/tsan_flags.cc
===================================================================
--- lib/tsan/rtl/tsan_flags.cc
+++ lib/tsan/rtl/tsan_flags.cc
@@ -71,6 +71,7 @@
     cf.print_suppressions = false;
     cf.stack_trace_format = "    #%n %f %S %M";
     cf.exitcode = 66;
+    cf.intercept_tls_get_addr = true;
     OverrideCommonFlags(cf);
   }
 
Index: lib/tsan/rtl/tsan_interceptors.cc
===================================================================
--- lib/tsan/rtl/tsan_interceptors.cc
+++ lib/tsan/rtl/tsan_interceptors.cc
@@ -19,6 +19,7 @@
 #include "sanitizer_common/sanitizer_platform_limits_posix.h"
 #include "sanitizer_common/sanitizer_placement_new.h"
 #include "sanitizer_common/sanitizer_stacktrace.h"
+#include "sanitizer_common/sanitizer_tls_get_addr.h"
 #include "interception/interception.h"
 #include "tsan_interceptors.h"
 #include "tsan_interface.h"
@@ -853,6 +854,7 @@
     thr->signal_ctx = 0;
     UnmapOrDie(sctx, sizeof(*sctx));
   }
+  DTLS_Destroy();
   cur_thread_finalize();
 }
 }  // namespace __tsan
@@ -2183,17 +2185,7 @@
 #undef SANITIZER_INTERCEPT_FGETPWENT
 #undef SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS
 #undef SANITIZER_INTERCEPT_GETPWNAM_R_AND_FRIENDS
-// __tls_get_addr can be called with mis-aligned stack due to:
-// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
-// There are two potential issues:
-// 1. Sanitizer code contains a MOVDQA spill (it does not seem to be the case
-// right now). or 2. ProcessPendingSignal calls user handler which contains
-// MOVDQA spill (this happens right now).
-// Since the interceptor only initializes memory for msan, the simplest solution
-// is to disable the interceptor in tsan (other sanitizers do not call
-// signal handlers from COMMON_INTERCEPTOR_ENTER).
-// As __tls_get_addr has been intercepted in the past, to avoid breaking
-// libtsan ABI, keep it around, but just call the real function.
+// We define our own __tls_get_addr interceptor.
 #if SANITIZER_INTERCEPT_TLS_GET_ADDR
 #define NEED_TLS_GET_ADDR
 #endif
@@ -2429,8 +2421,27 @@
 #include "sanitizer_common/sanitizer_common_syscalls.inc"
 
 #ifdef NEED_TLS_GET_ADDR
+// Define our own interceptor instead of sanitizer_common's for three reasons:
+// 1. It must not process pending signals.
+//    Signal handlers may contain a MOVDQA instruction (see below).
+// 2. It must be as simple as possible so that it does not contain MOVDQA.
+// 3. The sanitizer_common version uses COMMON_INTERCEPTOR_INITIALIZE_RANGE,
+//    which is empty for tsan (meant only for msan).
+// Note: __tls_get_addr can be called with a mis-aligned stack due to:
+// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58066
+// So the interceptor must work with a mis-aligned stack; in particular, it must
+// not execute MOVDQA with stack addresses.
 TSAN_INTERCEPTOR(void *, __tls_get_addr, void *arg) {
-  return REAL(__tls_get_addr)(arg);
+  void *res = REAL(__tls_get_addr)(arg);
+  ThreadState *thr = cur_thread();
+  if (!thr)
+    return res;
+  DTLS::DTV *dtv = DTLS_on_tls_get_addr(arg, res, thr->tls_addr, thr->tls_size);
+  if (!dtv)
+    return res;
+  // A new DTLS block has been allocated: reset its memory range.
+  MemoryResetRange(thr, 0, dtv->beg, dtv->size);
+  return res;
 }
 #endif
 
Index: test/tsan/dtls.c
===================================================================
--- test/tsan/dtls.c
+++ test/tsan/dtls.c
@@ -0,0 +1,62 @@
+// RUN: %clang_tsan %s -o %t
+// RUN: %clang_tsan %s -DBUILD_SO -fPIC -o %t-so.so -shared
+// RUN: %run %t 2>&1 | FileCheck %s
+
+// Test that tsan cleans up dynamic TLS memory between reuses.
+
+#include "test.h"
+
+#ifndef BUILD_SO
+#include <assert.h>
+#include <dlfcn.h>
+
+typedef volatile long *(* get_t)();
+get_t GetTls;
+
+void *Thread1(void *arg) {
+  pthread_detach(pthread_self());  // Detached: main never joins this thread.
+  volatile long *x = GetTls();
+  *x = 42;  // Write to this thread's copy of the dynamic TLS array.
+  fprintf(stderr, "stack: %p dtls: %p\n", &x, x);
+  barrier_wait(&barrier);  // Pairs with the barrier_wait in main.
+  return 0;
+}
+
+void *Thread2(void *arg) {
+  volatile long *x = GetTls();
+  *x = 42;  // Must not be reported as a data race (see CHECK-NOT below).
+  fprintf(stderr, "stack: %p dtls: %p\n", &x, x);
+  return 0;
+}
+
+int main(int argc, char *argv[]) {
+  char path[4096];
+  snprintf(path, sizeof(path), "%s-so.so", argv[0]);  // Built by 2nd RUN line.
+
+  void *handle = dlopen(path, RTLD_LAZY);
+  if (!handle) fprintf(stderr, "%s\n", dlerror());
+  assert(handle != 0);
+  GetTls = (get_t)dlsym(handle, "GetTls");
+  assert(dlerror() == 0);  // No dl* error pending, i.e. dlsym succeeded.
+
+  barrier_init(&barrier, 2);
+  pthread_t t[2];
+  pthread_create(&t[0], 0, Thread1, 0);
+  barrier_wait(&barrier);  // Thread1 has written to its TLS block by now.
+  // Wait for actual thread termination without using pthread_join,
+  // which would synchronize threads.
+  sleep(1);
+  pthread_create(&t[1], 0, Thread2, 0);
+  pthread_join(t[1], 0);
+  fprintf(stderr, "DONE\n");
+  return 0;
+}
+#else  // BUILD_SO
+__thread long huge_thread_local_array[1 << 17];  // 2^17 longs per thread.
+long *GetTls() {
+  return &huge_thread_local_array[0];  // Address of the caller's TLS copy.
+}
+#endif
+
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK: DONE