Index: include/memory
===================================================================
--- include/memory
+++ include/memory
@@ -625,6 +625,18 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
+template <class _ValueType>
+inline _LIBCPP_ALWAYS_INLINE
+_ValueType __libcpp_relaxed_load(_ValueType const* __value) {
+#if !defined(_LIBCPP_HAS_NO_THREADS) && \
+    defined(__ATOMIC_RELAXED) &&        \
+    (__has_builtin(__atomic_load_n) || _GNUC_VER >= 407)
+    return __atomic_load_n(__value, __ATOMIC_RELAXED);
+#else
+    return *__value;
+#endif
+}
+
 // addressof moved to <__functional_base>
 
 template <class _Tp> class allocator;
@@ -3676,7 +3688,9 @@
     void __add_shared() _NOEXCEPT;
     bool __release_shared() _NOEXCEPT;
     _LIBCPP_INLINE_VISIBILITY
-    long use_count() const _NOEXCEPT {return __shared_owners_ + 1;}
+    long use_count() const _NOEXCEPT {
+        return __libcpp_relaxed_load(&__shared_owners_) + 1;
+    }
 };
 
 class _LIBCPP_TYPE_VIS __shared_weak_count
Index: include/mutex
===================================================================
--- include/mutex
+++ include/mutex
@@ -175,6 +175,7 @@
 #include <__config>
 #include <__mutex_base>
 #include <functional>
+#include <memory>
 #ifndef _LIBCPP_HAS_NO_VARIADICS
 #include <tuple>
 #endif
@@ -541,7 +542,7 @@
 void
 call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args)
 {
-    if (__flag.__state_ != ~0ul)
+    if (__libcpp_relaxed_load(&__flag.__state_) != ~0ul)
     {
         typedef tuple<typename decay<_Callable>::type, typename decay<_Args>::type...> _Gp;
         __call_once_param<_Gp> __p(_Gp(__decay_copy(_VSTD::forward<_Callable>(__func)),
@@ -557,7 +558,7 @@
 void
 call_once(once_flag& __flag, _Callable __func)
 {
-    if (__flag.__state_ != ~0ul)
+    if (__libcpp_relaxed_load(&__flag.__state_) != ~0ul)
    {
         __call_once_param<_Callable> __p(__func);
         __call_once(__flag.__state_, &__p, &__call_once_proxy<_Callable>);
Index: src/memory.cpp
===================================================================
--- src/memory.cpp
+++ src/memory.cpp
@@ -13,6 +13,7 @@
 #include "mutex"
 #include "thread"
 #endif
+#include "support/atomic_support.h"
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
@@ -23,14 +24,14 @@
 inline T
 increment(T& t) _NOEXCEPT
 {
-    return __sync_add_and_fetch(&t, 1);
+    return __libcpp_atomic_add(&t, 1);
 }
 
 template <class T>
 inline T
 decrement(T& t) _NOEXCEPT
 {
-    return __sync_add_and_fetch(&t, -1);
+    return __libcpp_atomic_add(&t, -1);
 }
 
 }  // namespace
@@ -99,14 +100,13 @@
 __shared_weak_count*
 __shared_weak_count::lock() _NOEXCEPT
 {
-    long object_owners = __shared_owners_;
+    long object_owners = __libcpp_atomic_load(&__shared_owners_);
     while (object_owners != -1)
     {
-        if (__sync_bool_compare_and_swap(&__shared_owners_,
-                                         object_owners,
-                                         object_owners+1))
+        if (__libcpp_atomic_compare_exchange(&__shared_owners_,
+                                             &object_owners,
+                                             object_owners+1))
             return this;
-        object_owners = __shared_owners_;
     }
     return 0;
 }
Index: src/mutex.cpp
===================================================================
--- src/mutex.cpp
+++ src/mutex.cpp
@@ -12,6 +12,7 @@
 #include "limits"
 #include "system_error"
 #include "cassert"
+#include "support/atomic_support.h"
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 #ifndef _LIBCPP_HAS_NO_THREADS
@@ -220,6 +221,9 @@
 static pthread_cond_t  cv  = PTHREAD_COND_INITIALIZER;
 #endif
 
+/// NOTE: Changes to flag are done via relaxed atomic stores
+///       even though the accesses are protected by a mutex because threads
+///       just entering `call_once` concurrently read from flag.
 void
 __call_once(volatile unsigned long& flag, void* arg, void(*func)(void*))
 {
@@ -252,11 +256,11 @@
         try
         {
 #endif  // _LIBCPP_NO_EXCEPTIONS
-            flag = 1;
+            __libcpp_relaxed_store(&flag, 1ul);
             pthread_mutex_unlock(&mut);
             func(arg);
             pthread_mutex_lock(&mut);
-            flag = ~0ul;
+            __libcpp_relaxed_store(&flag, ~0ul);
             pthread_mutex_unlock(&mut);
             pthread_cond_broadcast(&cv);
 #ifndef _LIBCPP_NO_EXCEPTIONS
@@ -264,7 +268,7 @@
         catch (...)
         {
             pthread_mutex_lock(&mut);
-            flag = 0ul;
+            __libcpp_relaxed_store(&flag, 0ul);
             pthread_mutex_unlock(&mut);
             pthread_cond_broadcast(&cv);
             throw;
Index: src/support/atomic_support.h
===================================================================
--- /dev/null
+++ src/support/atomic_support.h
@@ -0,0 +1,134 @@
+#ifndef ATOMIC_SUPPORT_H
+#define ATOMIC_SUPPORT_H
+
+#include "__config"
+#include "memory" // for __libcpp_relaxed_load
+
+#if defined(__clang__) && __has_builtin(__atomic_load_n)             \
+                       && __has_builtin(__atomic_store_n)            \
+                       && __has_builtin(__atomic_add_fetch)          \
+                       && __has_builtin(__atomic_compare_exchange_n) \
+                       && defined(__ATOMIC_RELAXED)                  \
+                       && defined(__ATOMIC_CONSUME)                  \
+                       && defined(__ATOMIC_ACQUIRE)                  \
+                       && defined(__ATOMIC_RELEASE)                  \
+                       && defined(__ATOMIC_ACQ_REL)                  \
+                       && defined(__ATOMIC_SEQ_CST)
+#   define _LIBCPP_HAS_ATOMIC_BUILTINS
+#elif !defined(__clang__) && defined(_GNUC_VER) && _GNUC_VER >= 407
+#   define _LIBCPP_HAS_ATOMIC_BUILTINS
+#endif
+
+#if !defined(_LIBCPP_HAS_ATOMIC_BUILTINS) && !defined(_LIBCPP_HAS_NO_THREADS)
+#warning Building libc++ without __atomic builtins is unsupported
+#endif
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+
+#if defined(_LIBCPP_HAS_ATOMIC_BUILTINS) && !defined(_LIBCPP_HAS_NO_THREADS)
+
+enum __libcpp_atomic_order {
+    _AO_Relaxed = __ATOMIC_RELAXED,
+    _AO_Consume = __ATOMIC_CONSUME,
+    _AO_Acquire = __ATOMIC_ACQUIRE,
+    _AO_Release = __ATOMIC_RELEASE,
+    _AO_Acq_Rel = __ATOMIC_ACQ_REL,
+    _AO_Seq     = __ATOMIC_SEQ_CST
+};
+
+template <class _ValueType, class _FromType>
+inline _LIBCPP_INLINE_VISIBILITY
+void __libcpp_atomic_store(_ValueType* __dest, _FromType __val,
+                           int __order = _AO_Seq)
+{
+    __atomic_store_n(__dest, __val, __order);
+}
+
+template <class _ValueType, class _FromType>
+inline _LIBCPP_INLINE_VISIBILITY
+void __libcpp_relaxed_store(_ValueType* __dest, _FromType __val)
+{
+    __atomic_store_n(__dest, __val, _AO_Relaxed);
+}
+
+template <class _ValueType>
+inline _LIBCPP_INLINE_VISIBILITY
+_ValueType __libcpp_atomic_load(_ValueType const* __val,
+                                int __order = _AO_Seq)
+{
+    return __atomic_load_n(__val, __order);
+}
+
+template <class _ValueType, class _AddType>
+inline _LIBCPP_INLINE_VISIBILITY
+_ValueType __libcpp_atomic_add(_ValueType* __val, _AddType __a,
+                               int __order = _AO_Seq)
+{
+    return __atomic_add_fetch(__val, __a, __order);
+}
+
+template <class _ValueType>
+inline _LIBCPP_INLINE_VISIBILITY
+bool __libcpp_atomic_compare_exchange(_ValueType* __val,
+    _ValueType* __expected, _ValueType __after,
+    int __success_order = _AO_Seq,
+    int __fail_order = _AO_Seq)
+{
+    return __atomic_compare_exchange_n(__val, __expected, __after, true,
+                                       __success_order, __fail_order);
+}
+
+#else // _LIBCPP_HAS_NO_THREADS
+
+enum __libcpp_atomic_order {
+    _AO_Relaxed,
+    _AO_Consume,
+    _AO_Acquire,
+    _AO_Release,
+    _AO_Acq_Rel,
+    _AO_Seq
+};
+
+template <class _ValueType, class _FromType>
+inline _LIBCPP_INLINE_VISIBILITY
+void __libcpp_atomic_store(_ValueType* __dest, _FromType __val,
+                           int = 0)
+{
+    *__dest = __val;
+}
+
+template <class _ValueType>
+inline _LIBCPP_INLINE_VISIBILITY
+_ValueType __libcpp_atomic_load(_ValueType const* __val,
+                                int = 0)
+{
+    return *__val;
+}
+
+template <class _ValueType, class _AddType>
+inline _LIBCPP_INLINE_VISIBILITY
+_ValueType __libcpp_atomic_add(_ValueType* __val, _AddType __a,
+                               int = 0)
+{
+    return *__val += __a;
+}
+
+template <class _ValueType>
+inline _LIBCPP_INLINE_VISIBILITY
+bool __libcpp_atomic_compare_exchange(_ValueType* __val,
+    _ValueType* __expected, _ValueType __after,
+    int = 0, int = 0)
+{
+    if (*__val == *__expected) {
+        *__val = __after;
+        return true;
+    }
+    *__expected = *__val;
+    return false;
+}
+
+#endif // _LIBCPP_HAS_NO_THREADS
+
+_LIBCPP_END_NAMESPACE_STD
+
+#endif // ATOMIC_SUPPORT_H
Index: test/libcxx/utilities/memory/util.smartptr/race_condition.pass.cpp
===================================================================
--- /dev/null
+++ test/libcxx/utilities/memory/util.smartptr/race_condition.pass.cpp
@@ -0,0 +1,89 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// UNSUPPORTED: libcpp-has-no-threads
+//
+// <memory>
+//
+// class shared_ptr
+//
+// This test attempts to create a race condition surrounding use_count()
+// with the hope that TSAN will diagnose it.
+
+#include <memory>
+#include <atomic>
+#include <thread>
+#include <cassert>
+
+typedef std::shared_ptr<int> Ptr;
+typedef std::weak_ptr<int> WeakPtr;
+
+std::atomic_bool KeepRunning;
+
+struct TestRunner {
+    TestRunner(Ptr xx) : x(xx) {}
+    void operator()() {
+        while (KeepRunning) {
+            // loop to prevent always checking the atomic.
+            for (int i=0; i < 100000; ++i) {
+                Ptr x2 = x;          // increment shared count
+                WeakPtr x3 = x;      // increment weak count
+                Ptr x4 = x3.lock();  // increment shared count via lock
+                WeakPtr x5 = x3;     // increment weak count
+            }
+        }
+    }
+    Ptr x;
+};
+
+void run_test(Ptr p) {
+    // Start the test thread.
+    KeepRunning = true;
+    TestRunner r(p);
+    std::thread t1(r);
+    assert(p.use_count() == 3);
+    // Run until we witness 25 use count changes via both
+    // shared and weak pointer methods.
+    WeakPtr w = p;
+    int shared_changes_count = 0;
+    int weak_changes_count = 0;
+    while (shared_changes_count < 25 && weak_changes_count < 25) {
+        // check use_count on the shared_ptr
+        int last = p.use_count();
+        int new_val = p.use_count();
+        assert(last >= 3);
+        assert(new_val >= 3);
+        if (last != new_val) ++shared_changes_count;
+        // Check use_count on the weak_ptr
+        last = w.use_count();
+        new_val = w.use_count();
+        assert(last >= 3);
+        assert(new_val >= 3);
+        if (last != new_val) ++weak_changes_count;
+    }
+    // kill the test thread.
+    KeepRunning = false;
+    t1.join();
+    assert(p.use_count() == 3);
+}
+
+int main() {
+    {
+        // Test with out-of-place shared_count.
+        Ptr p(new int(42));
+        run_test(p);
+        assert(p.use_count() == 1);
+    }
+    {
+        // Test with in-place shared_count.
+        Ptr p = std::make_shared<int>(42);
+        run_test(p);
+        assert(p.use_count() == 1);
+    }
+}
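
Note on the __shared_weak_count::lock() hunk above (editor's sketch, not part of the patch): __libcpp_atomic_compare_exchange wraps __atomic_compare_exchange_n, which on failure writes the currently observed value back into *expected. That is why the explicit reload "object_owners = __shared_owners_;" could be deleted from the retry loop. A minimal standalone illustration of that pattern follows; the names owners and try_lock_shared are hypothetical and only mirror the shape of lock(), using the same GCC/Clang __atomic builtins the patch relies on.

    // Sketch only: a plain long standing in for __shared_owners_
    // (use_count() corresponds to owners + 1; owners == -1 means expired).
    #include <cassert>

    long owners = 0;

    bool try_lock_shared() {
        long expected = __atomic_load_n(&owners, __ATOMIC_SEQ_CST);
        while (expected != -1) {
            if (__atomic_compare_exchange_n(&owners, &expected, expected + 1,
                                            /*weak=*/true,
                                            __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
                return true;
            // On failure, `expected` already holds the freshly observed value,
            // so no separate reload is needed -- the line the patch removes.
        }
        return false;
    }

    int main() {
        assert(try_lock_shared() && owners == 1);  // count bumped from 0 to 1
        owners = -1;                               // simulate an expired object
        assert(!try_lock_shared());                // lock()-style failure
    }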