Index: include/sanitizer/tsan_interface.h
===================================================================
--- include/sanitizer/tsan_interface.h
+++ include/sanitizer/tsan_interface.h
@@ -137,6 +137,23 @@
 void __tsan_external_read(void *addr, void *caller_pc, void *tag);
 void __tsan_external_write(void *addr, void *caller_pc, void *tag);
 
+// Fiber API. A fiber handle is an opaque pointer to a thread context that
+// can be made current on the calling thread.
+//   - __tsan_get_current_fiber returns the handle of the context that is
+//     current on the calling thread.
+//   - __tsan_create_fiber allocates and registers a new context; release it
+//     with __tsan_destroy_fiber. A handle must not be used after it has been
+//     destroyed, and the fiber that is currently running must not be
+//     destroyed.
+//   - __tsan_switch_to_fiber makes the given fiber current on the calling
+//     thread.
+//   - __tsan_set_fiber_name sets the name recorded for the fiber.
+void *__tsan_get_current_fiber(void);
+void *__tsan_create_fiber(void);
+void __tsan_destroy_fiber(void *fiber);
+void __tsan_switch_to_fiber(void *fiber);
+void __tsan_set_fiber_name(void *fiber, const char *name);
+
 #ifdef __cplusplus
 }  // extern "C"
 #endif
Index: lib/tsan/rtl/tsan_interface.cc
===================================================================
--- lib/tsan/rtl/tsan_interface.cc
+++ lib/tsan/rtl/tsan_interface.cc
@@ -124,6 +124,60 @@
   __tsan_unaligned_write8(addr);
   *addr = v;
 }
+
+// Returns the ThreadState that is current on the calling thread, as an
+// opaque fiber handle.
+SANITIZER_INTERFACE_ATTRIBUTE
+void *__tsan_get_current_fiber() {
+  return cur_thread();
+}
+
+// Allocates and zeroes a ThreadState, registers it through ThreadCreate() /
+// ThreadStart() with the calling thread's state as the creator, and returns
+// it as an opaque fiber handle. Release it with __tsan_destroy_fiber().
+SANITIZER_INTERFACE_ATTRIBUTE
+void *__tsan_create_fiber() {
+  const uptr pc = GET_CALLER_PC();
+  void *mem = internal_alloc(MBlockThreadContex, sizeof(ThreadState));
+  ThreadState *thr = static_cast<ThreadState *>(mem);
+  internal_memset(thr, 0, sizeof(*thr));
+  int tid = ThreadCreate(cur_thread(), pc, 0, true);
+  ThreadStart(thr, tid, 0, false);
+  return thr;
+}
+
+// Finishes the fiber's thread and frees its ThreadState; the handle is
+// invalid afterwards. The fiber being destroyed must not be the one that is
+// currently running.
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_destroy_fiber(void *fiber) {
+  ThreadState *thr = static_cast<ThreadState *>(fiber);
+  // Freeing the running state would leave cur_thread() dangling, so fail
+  // loudly here instead of corrupting memory later.
+  CHECK_NE(thr, cur_thread());
+  ThreadFinish(thr);
+  internal_free(thr);
+}
+
+// Makes `fiber` the current ThreadState of the calling thread: the Processor
+// wired to the running state is moved over to `fiber`, then the slot read by
+// cur_thread() is repointed at it.
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_switch_to_fiber(void *fiber) {
+  ThreadState *thr = cur_thread();
+  Processor *proc = thr->proc();
+  ProcUnwire(proc, thr);
+  thr = static_cast<ThreadState *>(fiber);
+  ProcWire(proc, thr);
+  *cur_thread_location() = thr;
+}
+
+// Sets the name recorded for `fiber` via ThreadSetName().
+SANITIZER_INTERFACE_ATTRIBUTE
+void __tsan_set_fiber_name(void *fiber, const char *name) {
+  ThreadState *thr = static_cast<ThreadState *>(fiber);
+  ThreadSetName(thr, name);
+}
 }  // extern "C"
 
 void __tsan_acquire(void *addr) {
Index: lib/tsan/rtl/tsan_platform_linux.cc
===================================================================
--- lib/tsan/rtl/tsan_platform_linux.cc
+++ lib/tsan/rtl/tsan_platform_linux.cc
@@ -374,6 +374,13 @@
 // DestroyThreadState(), so add a fake thread state for "dead" threads.
 static ThreadState *dead_thread_state = nullptr;
 
+// Returns the address of the slot obtained from get_android_tls_ptr(), the
+// same slot cur_thread() below reads the ThreadState pointer from.
+ThreadState **cur_thread_location() {
+  auto *tls_slot = get_android_tls_ptr();
+  return reinterpret_cast<ThreadState **>(tls_slot);
+}
+
 ThreadState *cur_thread() {
   ThreadState* thr = reinterpret_cast<ThreadState*>(*get_android_tls_ptr());
   if (thr == nullptr) {
Index: lib/tsan/rtl/tsan_platform_mac.cc
===================================================================
--- lib/tsan/rtl/tsan_platform_mac.cc
+++ lib/tsan/rtl/tsan_platform_mac.cc
@@ -73,23 +73,18 @@
 // in a static variable, because we need to access it even before the
 // shadow memory is set up.
 static uptr main_thread_identity = 0;
-ALIGNED(64) static char main_thread_state[sizeof(ThreadState)];
+static ThreadState *main_thread_state = nullptr;
 
 ThreadState **cur_thread_location() {
-  ThreadState **thread_identity = (ThreadState **)pthread_self();
-  return ((uptr)thread_identity == main_thread_identity) ? nullptr
-                                                         : thread_identity;
+  // Main thread, or identity not yet recorded: static slot; else shadow.
+  const uptr self = (uptr)pthread_self();
+  const bool use_main = !main_thread_identity || self == main_thread_identity;
+  return use_main ? &main_thread_state : (ThreadState **)MemToShadow(self);
 }
 
 ThreadState *cur_thread() {
-  ThreadState **thr_state_loc = cur_thread_location();
-  if (thr_state_loc == nullptr || main_thread_identity == 0) {
-    return (ThreadState *)&main_thread_state;
-  }
-  ThreadState **fake_tls = (ThreadState **)MemToShadow((uptr)thr_state_loc);
-  ThreadState *thr = (ThreadState *)SignalSafeGetOrAllocate(
-      (uptr *)fake_tls, sizeof(ThreadState));
-  return thr;
+  uptr *slot = (uptr *)cur_thread_location();  // holds the state pointer
+  return (ThreadState *)SignalSafeGetOrAllocate(slot, sizeof(ThreadState));
 }
 
 // TODO(kuba.brecka): This is not async-signal-safe. In particular, we call
@@ -97,14 +92,13 @@
 // handler will try to access the unmapped ThreadState.
 void cur_thread_finalize() {
   ThreadState **thr_state_loc = cur_thread_location();
-  if (thr_state_loc == nullptr) {
+  if (thr_state_loc == &main_thread_state) {  // main thread's static slot
     // Calling dispatch_main() or xpc_main() actually invokes pthread_exit to
     // exit the main thread. Let's keep the main thread's ThreadState.
     return;
   }
-  ThreadState **fake_tls = (ThreadState **)MemToShadow((uptr)thr_state_loc);
-  internal_munmap(*fake_tls, sizeof(ThreadState));
-  *fake_tls = nullptr;
+  internal_munmap(*thr_state_loc, sizeof(ThreadState));  // unmap the state
+  *thr_state_loc = nullptr;  // clear the now-stale pointer in the slot
 }
 #endif
 
@@ -266,11 +260,11 @@
   // The pointer to the ThreadState object is stored in the shadow memory
   // of the tls.
   uptr tls_end = tls_addr + tls_size;
-  ThreadState **thr_state_loc = cur_thread_location();
-  if (thr_state_loc == nullptr) {
+  uptr thread_identity = (uptr)pthread_self();
+  if (thread_identity == main_thread_identity) {
     MemoryRangeImitateWrite(thr, /*pc=*/2, tls_addr, tls_size);
   } else {
-    uptr thr_state_start = (uptr)thr_state_loc;
+    uptr thr_state_start = thread_identity;
     uptr thr_state_end = thr_state_start + sizeof(uptr);
     CHECK_GE(thr_state_start, tls_addr);
     CHECK_LE(thr_state_start, tls_addr + tls_size);
Index: lib/tsan/rtl/tsan_rtl.h
===================================================================
--- lib/tsan/rtl/tsan_rtl.h
+++ lib/tsan/rtl/tsan_rtl.h
@@ -431,11 +431,7 @@
 
   // Current wired Processor, or nullptr. Required to handle any events.
   Processor *proc1;
-#if !SANITIZER_GO
-  Processor *proc() { return proc1; }
-#else
   Processor *proc();
-#endif
 
   atomic_uintptr_t in_signal_handler;
   ThreadSignalContext *signal_ctx;
@@ -459,16 +455,25 @@
 
 #if !SANITIZER_GO
 #if SANITIZER_MAC || SANITIZER_ANDROID
+ThreadState **cur_thread_location();
 ThreadState *cur_thread();
 void cur_thread_finalize();
 #else
 __attribute__((tls_model("initial-exec")))
 extern THREADLOCAL char cur_thread_placeholder[];
+__attribute__((tls_model("initial-exec")))
+extern THREADLOCAL ThreadState *cur_thread1;  // Null => use the placeholder.
+INLINE ThreadState **cur_thread_location() { return &cur_thread1; }
 INLINE ThreadState *cur_thread() {
+  if (ThreadState *installed = cur_thread1)
+    return installed;
   return reinterpret_cast<ThreadState *>(&cur_thread_placeholder);
 }
 INLINE void cur_thread_finalize() { }
 #endif  // SANITIZER_MAC || SANITIZER_ANDROID
+INLINE Processor *ThreadState::proc() {  // Falls back to cur_thread()'s proc1.
+  return proc1 ? proc1 : cur_thread()->proc1;
+}
 #endif  // SANITIZER_GO
 
 class ThreadContext : public ThreadContextBase {
@@ -839,7 +844,8 @@
 
 extern "C" void __tsan_trace_switch();
 void ALWAYS_INLINE TraceAddEvent(ThreadState *thr, FastState fs,
-                                        EventType typ, u64 addr) {
+                                        EventType typ, u64 addr,
+                                        bool hacky_call = true) {
   if (!kCollectHistory)
     return;
   DCHECK_GE((int)typ, 0);
@@ -849,10 +855,12 @@
   u64 pos = fs.GetTracePos();
   if (UNLIKELY((pos % kTracePartSize) == 0)) {
 #if !SANITIZER_GO
-    HACKY_CALL(__tsan_trace_switch);
-#else
-    TraceSwitch(thr);
+    if (hacky_call) {
+      DCHECK_EQ(thr, cur_thread());
+      HACKY_CALL(__tsan_trace_switch);
+    } else
 #endif
+      TraceSwitch(thr);
   }
   Event *trace = (Event*)GetThreadTrace(fs.tid());
   Event *evp = &trace[pos];
Index: lib/tsan/rtl/tsan_rtl.cc
===================================================================
--- lib/tsan/rtl/tsan_rtl.cc
+++ lib/tsan/rtl/tsan_rtl.cc
@@ -48,6 +48,8 @@
 #if !SANITIZER_GO && !SANITIZER_MAC
 __attribute__((tls_model("initial-exec")))
 THREADLOCAL char cur_thread_placeholder[sizeof(ThreadState)] ALIGNED(64);
+__attribute__((tls_model("initial-exec")))
+THREADLOCAL ThreadState *cur_thread1;  // See cur_thread() in tsan_rtl.h.
 #endif
 static char ctx_placeholder[sizeof(Context)] ALIGNED(64);
 Context *ctx;
@@ -619,6 +621,7 @@
   thr->racy_state[1] = old.raw();
   thr->racy_shadow_addr = shadow_mem;
 #if !SANITIZER_GO
+  DCHECK_EQ(thr, cur_thread());
   HACKY_CALL(__tsan_report_race);
 #else
   ReportRace(thr);
Index: lib/tsan/rtl/tsan_rtl_thread.cc
===================================================================
--- lib/tsan/rtl/tsan_rtl_thread.cc
+++ lib/tsan/rtl/tsan_rtl_thread.cc
@@ -59,7 +59,7 @@
     return;
   args->thr->fast_state.IncrementEpoch();
   // Can't increment epoch w/o writing to the trace as well.
-  TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0);
+  TraceAddEvent(args->thr, args->thr->fast_state, EventTypeMop, 0, false);
   ReleaseImpl(args->thr, 0, &sync);
   creation_stack_id = CurrentStackId(args->thr, args->pc);
   if (reuse_count == 0)
@@ -112,7 +112,7 @@
   thr->fast_state.SetHistorySize(flags()->history_size);
   // Commit switch to the new part of the trace.
   // TraceAddEvent will reset stack0/mset0 in the new part for us.
-  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0, false);
 
   thr->fast_synch_epoch = epoch0;
   AcquireImpl(thr, 0, &sync);
@@ -135,7 +135,7 @@
   if (!detached) {
     thr->fast_state.IncrementEpoch();
     // Can't increment epoch w/o writing to the trace as well.
-    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0, false);
     ReleaseImpl(thr, 0, &sync);
   }
   epoch1 = thr->fast_state.epoch();
@@ -246,7 +246,8 @@
   uptr tls_addr = 0;
   uptr tls_size = 0;
 #if !SANITIZER_GO
-  GetThreadStackAndTls(tid == 0, &stk_addr, &stk_size, &tls_addr, &tls_size);
+  if (os_id)
+    GetThreadStackAndTls(tid == 0, &stk_addr, &stk_size, &tls_addr, &tls_size);
 
   if (tid) {
     if (stk_addr && stk_size)