Index: lib/interception/interception.h
===================================================================
--- lib/interception/interception.h
+++ lib/interception/interception.h
@@ -74,6 +74,15 @@
 // we intercept. To resolve this we declare our interceptors with __interceptor_
 // prefix, and then make actual interceptors weak aliases to __interceptor_
 // functions.
+// Another complication is that we may have already included a declaration
+// for an intercepted function from standard headers, and then get a declaration
+// mismatch between the standard and our signatures (e.g. standard declarations
+// can declare some arguments as __restrict). To circumvent this we declare the
+// actual interceptors with the __interceptor_fake_ prefix and then strip the
+// prefix with an asm directive.
+// Another complication is that an intercepted function can also be a compiler
+// builtin (e.g. __atomic_load). Fortunately, the __interceptor_fake_ hack also
+// resolves this problem.
 //
 // This is not so on Mac OS, where the two-level namespace makes
 // our replacement functions invisible to other libraries. This may be overcomed
@@ -143,8 +152,14 @@
 # define WRAP(x) __interceptor_ ## x
 # define WRAPPER_NAME(x) "__interceptor_" #x
 # define INTERCEPTOR_ATTRIBUTE __attribute__((visibility("default")))
+# if defined(__APPLE__)
+# define INTERCEPTOR_ASM_PREFIX(x) "_" x
+# else
+# define INTERCEPTOR_ASM_PREFIX(x) x
+# endif
 # define DECLARE_WRAPPER(ret_type, func, ...) \
-    extern "C" ret_type func(__VA_ARGS__) \
+    extern "C" ret_type __interceptor_fake_ ## func(__VA_ARGS__) \
+    __asm(INTERCEPTOR_ASM_PREFIX(#func)) \
     __attribute__((weak, alias("__interceptor_" #func), visibility("default")));
 #endif
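As a side note on the DECLARE_WRAPPER change above, here is a minimal standalone sketch of the same pattern on a Linux/ELF target. It is not part of the patch; `foo` and `__interceptor_foo` are hypothetical names, and on Apple platforms the asm name would need the leading underscore that INTERCEPTOR_ASM_PREFIX supplies.

// Minimal sketch of the asm-rename + weak-alias pattern; hypothetical names.
extern "C" int __interceptor_foo(int x) {  // the actual interceptor body
  return x + 1;
}

// Declared under an unrelated identifier so it cannot clash with a libc
// prototype (e.g. a __restrict-qualified one) or with a compiler builtin.
// The asm label renames the emitted symbol back to "foo", and the weak alias
// makes that symbol resolve to __interceptor_foo.
extern "C" int __interceptor_fake_foo(int x)
    __asm__("foo")
    __attribute__((weak, alias("__interceptor_foo"), visibility("default")));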
Index: lib/interception/interception_linux.h
===================================================================
--- lib/interception/interception_linux.h
+++ lib/interception/interception_linux.h
@@ -31,7 +31,7 @@
 #define INTERCEPT_FUNCTION_LINUX_OR_FREEBSD(func)                          \
   ::__interception::GetRealFunctionAddress(                                \
       #func, (::__interception::uptr *)&__interception::PTR_TO_REAL(func), \
-      (::__interception::uptr) & (func),                                   \
+      (::__interception::uptr) & (__interceptor_fake_ ## func),            \
       (::__interception::uptr) & WRAP(func))

 #if !defined(__ANDROID__)  // android does not have dlvsym

Index: lib/msan/msan_interceptors.cc
===================================================================
--- lib/msan/msan_interceptors.cc
+++ lib/msan/msan_interceptors.cc
@@ -488,7 +488,7 @@
   ENSURE_MSAN_INITED();
   va_list ap;
   va_start(ap, format);
-  int res = vswprintf(str, size, format, ap);
+  int res = WRAP(vswprintf)(str, size, format, ap);
   va_end(ap);
   return res;
 }

Index: lib/tsan/rtl/tsan_interceptors.cc
===================================================================
--- lib/tsan/rtl/tsan_interceptors.cc
+++ lib/tsan/rtl/tsan_interceptors.cc
@@ -1339,6 +1339,74 @@
   return 0;
 }

+TSAN_INTERCEPTOR(u64, __atomic_load_8, u64 *a, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_load_8, a, ord);
+  return __tsan_atomic64_load(a, (morder)ord);
+}
+
+TSAN_INTERCEPTOR(void, __atomic_store_8, u64 *a, u64 v, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_store_8, a, v, ord);
+  __tsan_atomic64_store(a, v, (morder)ord);
+}
+
+TSAN_INTERCEPTOR(u64, __atomic_exchange_8, u64 *a, u64 v, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_exchange_8, a, v, ord);
+  return __tsan_atomic64_exchange(a, v, (morder)ord);
+}
+
+TSAN_INTERCEPTOR(bool, __atomic_compare_exchange_8, u64 *a, u64 *cmp,
+                 u64 v, int sord, int ford) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_compare_exchange_8, a, cmp, v, sord, ford);
+  return __tsan_atomic64_compare_exchange_strong(a, cmp, v, (morder)sord,
+                                                 (morder)ford);
+}
+
+#if __TSAN_HAS_INT128
+TSAN_INTERCEPTOR(a128, __atomic_load_16, a128 *a, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_load_16, a, ord);
+  return __tsan_atomic128_load(a, (morder)ord);
+}
+
+TSAN_INTERCEPTOR(void, __atomic_store_16, a128 *a, a128 v, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_store_16, a, v, ord);
+  __tsan_atomic128_store(a, v, (morder)ord);
+}
+
+TSAN_INTERCEPTOR(a128, __atomic_exchange_16, a128 *a, a128 v, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_exchange_16, a, v, ord);
+  return __tsan_atomic128_exchange(a, v, (morder)ord);
+}
+
+TSAN_INTERCEPTOR(bool, __atomic_compare_exchange_16, a128 *a, a128 *cmp,
+                 a128 v, int sord, int ford) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_compare_exchange_16, a, cmp, v, sord, ford);
+  return __tsan_atomic128_compare_exchange_strong(a, cmp, v, (morder)sord,
+                                                  (morder)ford);
+}
+#endif
+
+TSAN_INTERCEPTOR(void, __atomic_load, SIZE_T n, void *a, void *v, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_load, n, a, v, ord);
+  AtomicLoad(thr, pc, n, a, v, ord);
+}
+
+TSAN_INTERCEPTOR(void, __atomic_store, SIZE_T n, void *a, void *v, int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_store, n, a, v, ord);
+  AtomicStore(thr, pc, n, a, v, ord);
+}
+
+TSAN_INTERCEPTOR(void, __atomic_exchange, SIZE_T n, void *a, void *v, void *ret,
+                 int ord) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_exchange, n, a, v, ret, ord);
+  AtomicExchange(thr, pc, n, a, v, ret, ord);
+}
+
+TSAN_INTERCEPTOR(bool, __atomic_compare_exchange, SIZE_T n, void *a, void *cmp,
+                 void *v, int sord, int ford) {
+  SCOPED_TSAN_INTERCEPTOR(__atomic_compare_exchange, n, a, cmp, v, sord, ford);
+  return AtomicCompareExchange(thr, pc, n, a, cmp, v, sord, ford);
+}
+
 #if SANITIZER_LINUX && !SANITIZER_ANDROID
 TSAN_INTERCEPTOR(int, __fxstat, int version, int fd, void *buf) {
   SCOPED_TSAN_INTERCEPTOR(__fxstat, version, fd, buf);
@@ -1974,7 +2042,7 @@
   internal_memset(&act.sa_mask, -1, sizeof(act.sa_mask));
   act.sa_flags = 0;
   sigaction_t old;
-  int res = sigaction(sig, &act, &old);
+  int res = WRAP(sigaction)(sig, &act, &old);
   if (res)
     return SIG_ERR;
   return old.sa_handler;
@@ -2530,6 +2598,21 @@

   TSAN_INTERCEPT(pthread_once);

+  TSAN_INTERCEPT(__atomic_load_8);
+  TSAN_INTERCEPT(__atomic_store_8);
+  TSAN_INTERCEPT(__atomic_exchange_8);
+  TSAN_INTERCEPT(__atomic_compare_exchange_8);
+#if __TSAN_HAS_INT128
+  TSAN_INTERCEPT(__atomic_load_16);
+  TSAN_INTERCEPT(__atomic_store_16);
+  TSAN_INTERCEPT(__atomic_exchange_16);
+  TSAN_INTERCEPT(__atomic_compare_exchange_16);
+#endif
+  TSAN_INTERCEPT(__atomic_load);
+  TSAN_INTERCEPT(__atomic_store);
+  TSAN_INTERCEPT(__atomic_exchange);
+  TSAN_INTERCEPT(__atomic_compare_exchange);
+
   TSAN_INTERCEPT(fstat);
   TSAN_MAYBE_INTERCEPT___FXSTAT;
   TSAN_MAYBE_INTERCEPT_FSTAT64;

Index: lib/tsan/rtl/tsan_interface_atomic.cc
===================================================================
--- lib/tsan/rtl/tsan_interface_atomic.cc
+++ lib/tsan/rtl/tsan_interface_atomic.cc
@@ -445,6 +445,74 @@
 }
 #endif

+namespace __tsan {
+void AtomicLoad(ThreadState *thr, uptr pc, uptr n, void *a, void *v, int ord) {
+  const morder mo = static_cast<morder>(ord);
+  CHECK(IsLoadOrder(mo));
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, false);
+  if (IsAcquireOrder(mo))
+    AcquireImpl(thr, pc, &s->clock);
+  internal_memcpy(v, a, n);
+  s->mtx.ReadUnlock();
+  MemoryReadAtomic(thr, pc, (uptr)a, kSizeLog1);
+}
+
+void AtomicStore(ThreadState *thr, uptr pc, uptr n, void *a, void *v,
+                 int ord) {
+  const morder mo = static_cast<morder>(ord);
+  CHECK(IsStoreOrder(mo));
+  MemoryWriteAtomic(thr, pc, (uptr)a, kSizeLog1);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
+  if (IsReleaseOrder(mo)) {
+    thr->fast_state.IncrementEpoch();
+    // Can't increment epoch w/o writing to the trace as well.
+    TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+    ReleaseImpl(thr, pc, &s->clock);
+  }
+  internal_memcpy(a, v, n);
+  s->mtx.Unlock();
+}
+
+void AtomicExchange(ThreadState *thr, uptr pc, uptr n, void *a, void *v,
+                    void *ret, int ord) {
+  const morder mo = static_cast<morder>(ord);
+  MemoryWriteAtomic(thr, pc, (uptr)a, kSizeLog1);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
+  thr->fast_state.IncrementEpoch();
+  // Can't increment epoch w/o writing to the trace as well.
+  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+  if (IsAcqRelOrder(mo))
+    AcquireReleaseImpl(thr, pc, &s->clock);
+  else if (IsReleaseOrder(mo))
+    ReleaseImpl(thr, pc, &s->clock);
+  else if (IsAcquireOrder(mo))
+    AcquireImpl(thr, pc, &s->clock);
+  internal_memcpy(ret, a, n);
+  internal_memcpy(a, v, n);
+  s->mtx.Unlock();
+}
+
+bool AtomicCompareExchange(ThreadState *thr, uptr pc, uptr n, void *a, void *c,
+                           void *v, int sord, int ford) {
+  (void)ford;
+  const morder mo = static_cast<morder>(sord);
+  MemoryWriteAtomic(thr, pc, (uptr)a, kSizeLog1);
+  SyncVar *s = ctx->metamap.GetOrCreateAndLock(thr, pc, (uptr)a, true);
+  thr->fast_state.IncrementEpoch();
+  // Can't increment epoch w/o writing to the trace as well.
+  TraceAddEvent(thr, thr->fast_state, EventTypeMop, 0);
+  if (IsAcqRelOrder(mo))
+    AcquireReleaseImpl(thr, pc, &s->clock);
+  else if (IsReleaseOrder(mo))
+    ReleaseImpl(thr, pc, &s->clock);
+  else if (IsAcquireOrder(mo))
+    AcquireImpl(thr, pc, &s->clock);
+  bool res = internal_memcmp(a, c, n) == 0;
+  internal_memcpy((res ? a : c), (res ? v : a), n);
+  s->mtx.Unlock();
+  return res;
+}
+}  // namespace __tsan
+
 // Interface functions follow.

 #if !SANITIZER_GO

Index: lib/tsan/rtl/tsan_rtl.h
===================================================================
--- lib/tsan/rtl/tsan_rtl.h
+++ lib/tsan/rtl/tsan_rtl.h
@@ -756,6 +756,14 @@
 void ReleaseStoreImpl(ThreadState *thr, uptr pc, SyncClock *c);
 void AcquireReleaseImpl(ThreadState *thr, uptr pc, SyncClock *c);

+// Variable-size atomic operations for libatomic interceptors.
+void AtomicLoad(ThreadState *thr, uptr pc, uptr n, void *a, void *v, int ord);
+void AtomicStore(ThreadState *thr, uptr pc, uptr n, void *a, void *v, int ord);
+void AtomicExchange(ThreadState *thr, uptr pc, uptr n, void *a, void *v,
+                    void *ret, int ord);
+bool AtomicCompareExchange(ThreadState *thr, uptr pc, uptr n, void *a, void *c,
+                           void *v, int sord, int ford);
+
 // The hacky call uses custom calling convention and an assembly thunk.
 // It is considerably faster that a normal call for the caller
 // if it is not executed (it is intended for slow paths from hot functions).
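To make the flow above concrete: when the compiler cannot inline an atomic operation (typically because the type is larger than 16 bytes or has an unusual size/alignment), it emits a libatomic call, either a size-specific entry point such as __atomic_load_8 or the generic __atomic_load(size, ptr, ret, order). The size-specific interceptors forward to the existing __tsan_atomic*_* entry points, while the generic ones go through the new variable-size AtomicLoad()/AtomicStore()/AtomicExchange()/AtomicCompareExchange() helpers. Below is a rough illustration of that lowering; it is not part of the patch, `Big` is a hypothetical 32-byte type, and the non-lock-free case needs -latomic at link time.

// Rough sketch of the libcalls the interceptors see; hypothetical names.
#include <stdint.h>

struct Big { char data[32]; };

uint64_t load_small(uint64_t *p) {
  // 8 bytes, lock-free: usually an inline instruction, but the compiler may
  // emit a call to __atomic_load_8(p, __ATOMIC_ACQUIRE) instead, which the
  // size-specific interceptor maps onto __tsan_atomic64_load().
  return __atomic_load_n(p, __ATOMIC_ACQUIRE);
}

void load_big(Big *p, Big *ret) {
  // 32 bytes, not lock-free: lowered to the generic libatomic call
  // __atomic_load(sizeof(Big), p, ret, __ATOMIC_ACQUIRE), which the generic
  // interceptor forwards to __tsan::AtomicLoad().
  __atomic_load(p, ret, __ATOMIC_ACQUIRE);
}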
Index: test/tsan/atomic_test.cc
===================================================================
--- test/tsan/atomic_test.cc
+++ test/tsan/atomic_test.cc
@@ -0,0 +1,245 @@
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE=char && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE=short && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE=int && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE=long && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<6>" -latomic && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<8>" -latomic && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<12>" -latomic && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<16>" -latomic && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<24>" -latomic && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<32>" -latomic && %deflake %run %t | FileCheck %s
+// RUN: %clangxx_tsan -O1 %s -o %t -DTEST_TYPE="MyStruct<128>" -latomic && %deflake %run %t | FileCheck %s
+#include "test.h"
+#include <atomic>
+#include <string.h>
+
+// Test operation and synchronization provided by atomic variables of different
+// sizes, including sizes > 16 which are handled by libatomic.
+// Note that the only allowed atomic operations for these larger types are:
+// load, store, exchange, compare_exchange.
+
+template <int kSize>
+struct MyStruct {
+  char data[kSize];
+
+  explicit MyStruct(char v = 0) noexcept {
+    memset(&data[0], v, sizeof(data));
+  }
+
+  bool operator == (const MyStruct &other) const {
+    return memcmp(&data[0], &other.data[0], sizeof(data)) == 0;
+  }
+
+  bool operator != (const MyStruct &other) const {
+    return !(*this == other);
+  }
+
+  operator int() const {
+    return data[0];
+  }
+};
+
+#ifdef TEST_TYPE
+typedef TEST_TYPE T;
+#else
+typedef long T;
+#endif
+
+struct Data {
+  long pad0;
+  std::atomic<T> a;
+  long pad1;
+  long v;
+};
+
+const int kTestCount = 7;
+Data data[2 * kTestCount];
+
+void Test(int test, Data *p, bool main_thread) {
+  if (test == 0) {
+    // Test that we detect races between atomic and non-atomic accesses.
+    if (main_thread)
+      p->a.store(T(1));
+    else
+      memset(&p->a, 0, 1);
+// CHECK: Test 0 forward
+// CHECK: ThreadSanitizer: data race
+// CHECK: Test 0 reverse
+// CHECK: ThreadSanitizer: data race
+
+  } else if (test == 1) {
+    // Normal acquire-release synchronization.
+    if (p->a.load(std::memory_order_acquire) == T(0)) {
+      p->v = 42;
+      p->a.store(T(17), std::memory_order_release);
+    } else {
+      if (p->v != 42) {
+        fprintf(stderr, "%d: bad value %ld\n", __LINE__, p->v);
+        exit(0);
+      }
+      if (p->a.load(std::memory_order_relaxed) != T(17)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__,
+                (int)p->a.load());
+        exit(0);
+      }
+    }
+// CHECK: Test 1 forward
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK: Test 1 reverse
+// CHECK-NOT: ThreadSanitizer: data race
+
+  } else if (test == 2) {
+    // The same as the previous case, but with memory_order_relaxed for store.
+    // Ensure that atomics don't over-synchronize (that would happen if e.g. we
+    // would intercept underlying pthread_mutex_t operations in libatomic
+    // emulation).
+    if (p->a.load(std::memory_order_acquire) == T(0)) {
+      p->v = 42;
+      p->a.store(T(17), std::memory_order_relaxed);
+    } else {
+      if (p->v != 42) {
+        fprintf(stderr, "%d: bad value %ld\n", __LINE__, p->v);
+        exit(0);
+      }
+      if (p->a.load(std::memory_order_relaxed) != T(17)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__,
+                (int)p->a.load());
+        exit(0);
+      }
+    }
+// CHECK: Test 2 forward
+// CHECK: ThreadSanitizer: data race
+// CHECK: Test 2 reverse
+// CHECK: ThreadSanitizer: data race
+
+  } else if (test == 3) {
+    // The same as the previous case, but with memory_order_relaxed for load.
+    if (p->a.load(std::memory_order_relaxed) == T(0)) {
+      p->v = 42;
+      p->a.store(T(17), std::memory_order_release);
+    } else {
+      if (p->v != 42) {
+        fprintf(stderr, "%d: bad value %ld\n", __LINE__, p->v);
+        exit(0);
+      }
+      if (p->a.load(std::memory_order_relaxed) != T(17)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__,
+                (int)p->a.load());
+        exit(0);
+      }
+    }
+// CHECK: Test 3 forward
+// CHECK: ThreadSanitizer: data race
+// CHECK: Test 3 reverse
+// CHECK: ThreadSanitizer: data race
+
+  } else if (test == 4) {
+    // Acquire-release synchronization, but using exchange/compare_exchange.
+    T cmp(17);
+    if (!p->a.compare_exchange_strong(cmp, T(18), std::memory_order_acquire)) {
+      // libc++ has a bug that causes this check to fail:
+      // https://llvm.org/bugs/show_bug.cgi?id=30675
+      if (false && cmp != T(0)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__, (int)cmp);
+        exit(0);
+      }
+      p->v = 42;
+      T old = p->a.exchange(T(17), std::memory_order_release);
+      if (old != T(0)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__, (int)old);
+        exit(0);
+      }
+    } else {
+      if (p->v != 42) {
+        fprintf(stderr, "%d: bad value %ld\n", __LINE__, p->v);
+        exit(0);
+      }
+      if (cmp != T(17)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__, (int)cmp);
+        exit(0);
+      }
+      if (p->a.load(std::memory_order_relaxed) != T(18)) {
+        fprintf(stderr, "%d: bad atomic value %d\n", __LINE__,
+                (int)p->a.load());
+        exit(0);
+      }
+    }
+// CHECK: Test 4 forward
+// CHECK-NOT: ThreadSanitizer: data race
+// CHECK: Test 4 reverse
+// CHECK-NOT: ThreadSanitizer: data race
+
+  } else if (test == 5) {
+    // The same as the previous case, but with memory_order_relaxed for
+    // exchange.
+    T cmp(17);
+    if (!p->a.compare_exchange_strong(cmp, T(18), std::memory_order_acquire)) {
+      p->v = 42;
+      p->a.exchange(T(17), std::memory_order_relaxed);
+    } else {
+      if (p->v != 42) {
+        fprintf(stderr, "%d: bad value %ld\n", __LINE__, p->v);
+        exit(0);
+      }
+    }
+// CHECK: Test 5 forward
+// CHECK: ThreadSanitizer: data race
+// CHECK: Test 5 reverse
+// CHECK: ThreadSanitizer: data race
+
+  } else if (test == 6) {
+    // The same as the previous case, but with memory_order_relaxed for
+    // compare_exchange.
+    T cmp(17);
+    if (!p->a.compare_exchange_strong(cmp, T(18), std::memory_order_relaxed)) {
+      p->v = 42;
+      p->a.exchange(T(17), std::memory_order_release);
+    } else {
+      if (p->v != 42) {
+        fprintf(stderr, "%d: bad value %ld\n", __LINE__, p->v);
+        exit(0);
+      }
+    }
+// CHECK: Test 6 forward
+// CHECK: ThreadSanitizer: data race
+// CHECK: Test 6 reverse
+// CHECK: ThreadSanitizer: data race
+  }
+}
+
+void *Thread(void *p) {
+  for (int i = 0; i < kTestCount; i++) {
+    Test(i, &data[i * 2], false);
+    barrier_wait(&barrier);
+    barrier_wait(&barrier);
+    fprintf(stderr, "Test %d reverse\n", i);
+    Test(i, &data[i * 2 + 1], false);
+  }
+  return 0;
+}
+
+int main() {
+  fprintf(stderr, "data size: %zu\n", sizeof(T));
+  for (int i = 0; i < 2 * kTestCount; i++) {
+    data[i].pad0 = 100;
+    data[i].pad1 = 200;
+  }
+  barrier_init(&barrier, 2);
+  pthread_t t;
+  pthread_create(&t, 0, Thread, 0);
+  for (int i = 0; i < kTestCount; i++) {
+    barrier_wait(&barrier);
+    fprintf(stderr, "Test %d forward\n", i);
+    Test(i, &data[2 * i], true);
+    Test(i, &data[2 * i + 1], true);
+    barrier_wait(&barrier);
+  }
+  pthread_join(t, 0);
+  for (int i = 0; i < 2 * kTestCount; i++) {
+    if (data[i].pad0 != 100 || data[i].pad1 != 200) {
+      fprintf(stderr, "padding overwritten %d: %lu/%lu\n",
+              i, data[i].pad0, data[i].pad1);
+// CHECK-NOT: padding overwritten
+    }
+  }
+}
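For completeness, a rough standalone reproducer of what Test 0 exercises for the large types; it is not part of the patch, and the file name, `Big`, and the build command are illustrative only. A 32-byte std::atomic is not lock-free, so its store goes through libatomic's generic __atomic_store, which tsan now intercepts and can order against the racy plain write.

// repro.cc -- hypothetical standalone example, not part of the patch.
// Build sketch: clang++ -fsanitize=thread -O1 repro.cc -latomic
#include <atomic>
#include <pthread.h>
#include <string.h>

struct Big { char data[32]; };  // too large to be lock-free
std::atomic<Big> g;

void *Thread(void *) {
  Big b;
  memset(&b, 1, sizeof(b));
  g.store(b);  // lowered to the generic __atomic_store(32, &g, &b, ...) call
  return nullptr;
}

int main() {
  pthread_t t;
  pthread_create(&t, nullptr, Thread, nullptr);
  memset(&g, 0, 1);  // racy non-atomic write, as in Test 0 above
  pthread_join(t, nullptr);
  return 0;
}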