Index: libcxx/include/atomic
===================================================================
--- libcxx/include/atomic
+++ libcxx/include/atomic
@@ -1484,14 +1484,14 @@
 template <class _Atp, class _Fn>
 struct __libcpp_atomic_wait_backoff_impl {
     _Atp* __a;
-    _Fn __test_fn;
+    _Fn __check_fn;
     _LIBCPP_AVAILABILITY_SYNC
     _LIBCPP_INLINE_VISIBILITY bool operator()(chrono::nanoseconds __elapsed) const
     {
         if(__elapsed > chrono::microseconds(64))
         {
-            auto const __monitor = __libcpp_atomic_monitor(__a);
-            if(__test_fn())
+            __cxx_contention_t __monitor = __libcpp_atomic_monitor(__a);
+            if(__check_fn(__monitor))
                 return true;
             __libcpp_atomic_wait(__a, __monitor);
         }
@@ -1503,11 +1503,11 @@
     }
 };
 
-template <class _Atp, class _Fn>
+template <class _Atp, class _TFn, class _CFn> // _TFn: predicate handed to the poll loop; _CFn: predicate handed to the backoff functor
 _LIBCPP_AVAILABILITY_SYNC
-_LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_wait(_Atp* __a, _Fn && __test_fn)
+_LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_wait(_Atp* __a, _TFn && __test_fn, _CFn && __check_fn)
 {
-    __libcpp_atomic_wait_backoff_impl<_Atp, typename decay<_Fn>::type> __backoff_fn = {__a, __test_fn};
+    __libcpp_atomic_wait_backoff_impl<_Atp, typename decay<_CFn>::type> __backoff_fn = {__a, __check_fn}; // the backoff functor stores a decayed copy of __check_fn (no longer __test_fn)
     return __libcpp_thread_poll_with_backoff(__test_fn, __backoff_fn);
 }
 
@@ -1517,8 +1517,8 @@
 _LIBCPP_INLINE_VISIBILITY void __cxx_atomic_notify_all(__cxx_atomic_impl<_Tp> const volatile*) { }
 template <class _Tp>
 _LIBCPP_INLINE_VISIBILITY void __cxx_atomic_notify_one(__cxx_atomic_impl<_Tp> const volatile*) { }
-template <class _Atp, class _Fn>
-_LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_wait(_Atp*, _Fn && __test_fn)
+template <class _Atp, class _TFn, class _CFn>
+_LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_wait(_Atp*, _TFn && __test_fn, _CFn &&)
 {
 #if defined(_LIBCPP_HAS_NO_THREADS)
     using _Policy = __spinning_backoff_policy;
@@ -1541,12 +1541,21 @@
     }
 };
 
+struct __cxx_atomic_wait_check_fn_impl { // check functor for value-based waits: compares a monitor value against a stored value
+    __cxx_contention_t __val; // the value the monitor is compared against
+    _LIBCPP_INLINE_VISIBILITY bool operator()(__cxx_contention_t __cur) const
+    {
+        return !__cxx_nonatomic_compare_equal(__cur, __val); // true when __cxx_nonatomic_compare_equal reports __cur and __val as not equal
+    }
+};
+
 template <class _Atp, class _Tp>
 _LIBCPP_AVAILABILITY_SYNC
 _LIBCPP_INLINE_VISIBILITY bool __cxx_atomic_wait(_Atp* __a, _Tp const __val, memory_order __order)
 {
     __cxx_atomic_wait_test_fn_impl<_Atp, _Tp> __test_fn = {__a, __val, __order};
-    return __cxx_atomic_wait(__a, __test_fn);
+    __cxx_atomic_wait_check_fn_impl __check_fn = {static_cast<__cxx_contention_t>(__val)}; // NOTE(review): _Tp is unconstrained here; this cast presumably only compiles and is only meaningful when _Tp converts to __cxx_contention_t - confirm for pointer and class-type atomics
+    return __cxx_atomic_wait(__a, __test_fn, __check_fn); // forwards to the overload taking both a test and a check predicate
 }
 
 // general atomic<T>
Index: libcxx/include/latch
===================================================================
--- libcxx/include/latch
+++ libcxx/include/latch
@@ -94,6 +94,8 @@
     {
         __cxx_atomic_wait(&__a.__a_, [&]() -> bool {
             return try_wait();
+        }, [](__cxx_contention_t __cur) -> bool {
+            return __cur == 0;
         });
     }
     inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
Index: libcxx/include/semaphore
===================================================================
--- libcxx/include/semaphore
+++ libcxx/include/semaphore
@@ -89,12 +89,10 @@
     _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
     void release(ptrdiff_t __update = 1)
     {
-        if(0 < __a.fetch_add(__update, memory_order_release))
-            ;
-        else if(__update > 1)
+        if(__a.fetch_add(__update, memory_order_release) == 0) // fetch_add yields the value held before the addition; notify only when that value was zero
+            // Always notify all, regardless of the value of __update
+            // (see https://llvm.org/PR47013)
             __a.notify_all();
-        else
-            __a.notify_one();
     }
     _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
     void acquire()
@@ -103,7 +101,13 @@
             auto __old = __a.load(memory_order_relaxed);
             return (__old != 0) && __a.compare_exchange_strong(__old, __old - 1, memory_order_acquire, memory_order_relaxed);
         };
-        __cxx_atomic_wait(&__a.__a_, __test_fn);
+        auto const __check_fn = [this](__cxx_contention_t & __monitor) -> bool { // __monitor is taken by reference and overwritten below; NOTE(review): this treats the monitor as the semaphore count - confirm that holds when ptrdiff_t and __cxx_contention_t are different types
+            ptrdiff_t __old = __monitor; // seed the acquire attempt with the monitor value supplied by the caller
+            bool __r = __try_acquire_impl(__old); // __old is passed by reference; presumably left holding the last value the helper observed - verify against __try_acquire_impl
+            __monitor = static_cast<__cxx_contention_t>(__old); // write the helper's value of __old back into the caller's monitor
+            return __r;
+        };
+        __cxx_atomic_wait(&__a.__a_, __test_fn, __check_fn);
     }
     template <class Rep, class Period>
     _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
@@ -118,6 +122,13 @@
     bool try_acquire()
     {
         auto __old = __a.load(memory_order_acquire);
+        return __try_acquire_impl(__old); // delegate to the helper that acquire()'s check predicate also calls, seeded with the value just loaded
+    }
+
+private:
+    _LIBCPP_AVAILABILITY_SYNC _LIBCPP_INLINE_VISIBILITY
+    bool __try_acquire_impl(ptrdiff_t & __old)
+    {
         while (true) {
             if (__old == 0)
                 return false;
Index: libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp
===================================================================
--- /dev/null
+++ libcxx/test/std/thread/thread.semaphore/lost_wakeup.pass.cpp
@@ -0,0 +1,68 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: libcpp-has-no-threads
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+
+// This test requires the dylib support introduced in D68480, which shipped in macOS 11.0.
+// XFAIL: use_system_cxx_lib && target={{.+}}-apple-macosx10.{{9|10|11|12|13|14|15}}
+
+// This is a regression test for https://llvm.org/PR47013.
+
+// <semaphore>
+
+#include <barrier>
+#include <semaphore>
+#include <thread>
+#include <vector>
+
+#include "make_test_thread.h"
+
+// Number of acquiring threads. The releaser posts exactly this many permits per
+// round, so a single lost wakeup leaves one acquirer blocked and hangs the test.
+constexpr int num_acquirers = 8;
+// Number of acquire/release rounds to run.
+constexpr int num_iterations = 10'000;
+
+static std::counting_semaphore s(0);
+// Every acquirer plus the one releaser rendezvous here at the end of each round.
+static std::barrier b(num_acquirers + 1);
+
+// Acquirer thread body: take one permit, then wait for the round to end.
+void acquire() {
+  for (int i = 0; i < num_iterations; ++i) {
+    s.acquire();
+    b.arrive_and_wait();
+  }
+}
+
+// Releaser thread body: post the round's permits one at a time, then wait for
+// the round to end.
+void release() {
+  for (int i = 0; i < num_iterations; ++i) {
+    for (int j = 0; j < num_acquirers; ++j)
+      s.release(1);
+
+    b.arrive_and_wait();
+  }
+}
+
+int main(int, char**) {
+  std::vector<std::thread> threads;
+  threads.reserve(num_acquirers + 1);
+
+  for (int i = 0; i < num_acquirers; ++i)
+    threads.push_back(support::make_test_thread(acquire));
+
+  threads.push_back(support::make_test_thread(release));
+
+  for (auto& thread : threads)
+    thread.join();
+
+  return 0;
+}