Index: include/memory
===================================================================
--- include/memory
+++ include/memory
@@ -663,6 +663,18 @@
 #endif
 }
 
+template <class _ValueType>
+inline _LIBCPP_ALWAYS_INLINE
+_ValueType __libcpp_acquire_load(_ValueType const* __value) {
+#if !defined(_LIBCPP_HAS_NO_THREADS) && \
+    defined(__ATOMIC_ACQUIRE) && \
+    (__has_builtin(__atomic_load_n) || _GNUC_VER >= 407)
+    return __atomic_load_n(__value, __ATOMIC_ACQUIRE);
+#else
+    return *__value;
+#endif
+}
+
 // addressof moved to <__functional_base>
 
 template <class _Tp> class allocator;
Index: include/mutex
===================================================================
--- include/mutex
+++ include/mutex
@@ -574,7 +574,7 @@
 void
 call_once(once_flag& __flag, _Callable&& __func, _Args&&... __args)
 {
-    if (__libcpp_relaxed_load(&__flag.__state_) != ~0ul)
+    if (__libcpp_acquire_load(&__flag.__state_) != ~0ul)
     {
         typedef tuple<_Callable&&, _Args&&...> _Gp;
         _Gp __f(_VSTD::forward<_Callable>(__func), _VSTD::forward<_Args>(__args)...);
@@ -590,7 +590,7 @@
 void
 call_once(once_flag& __flag, _Callable& __func)
 {
-    if (__libcpp_relaxed_load(&__flag.__state_) != ~0ul)
+    if (__libcpp_acquire_load(&__flag.__state_) != ~0ul)
     {
         __call_once_param<_Callable> __p(__func);
         __call_once(__flag.__state_, &__p, &__call_once_proxy<_Callable>);
Index: src/mutex.cpp
===================================================================
--- src/mutex.cpp
+++ src/mutex.cpp
@@ -199,9 +199,9 @@
 static __libcpp_condvar_t cv = _LIBCPP_CONDVAR_INITIALIZER;
 #endif
 
-/// NOTE: Changes to flag are done via relaxed atomic stores
-///       even though the accesses are protected by a mutex because threads
-///       just entering 'call_once` concurrently read from flag.
+/// NOTE: Changes to flag are done via relaxed atomic stores, because they're
+///       protected by a mutex; except for the final change to ~0ul, which is
+///       a release atomic store to match the acquire atomic load in call_once.
 void
 __call_once(volatile unsigned long& flag, void* arg, void(*func)(void*))
 {
@@ -238,7 +238,7 @@
             __libcpp_mutex_unlock(&mut);
             func(arg);
             __libcpp_mutex_lock(&mut);
-            __libcpp_relaxed_store(&flag, ~0ul);
+            __libcpp_atomic_store(&flag, ~0ul, _AO_Release);
             __libcpp_mutex_unlock(&mut);
             __libcpp_condvar_broadcast(&cv);
 #ifndef _LIBCPP_NO_EXCEPTIONS