Index: benchmarks/shared_ptr_create_destroy.cpp
===================================================================
--- /dev/null
+++ benchmarks/shared_ptr_create_destroy.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <memory>
+#include <iostream>
+#include <chrono>
+#include <atomic>
+
+// Compiler-level barrier.  The empty asm emits no instructions, but its
+// "memory" clobber tells the compiler that memory may have been read or
+// written here, so memory accesses are not cached or moved across the call.
+void clobber() { asm volatile("" : : : "memory"); }
+
+std::atomic<int> g_int;    // input: relaxed-loaded to provide the value of every newly made shared int
+std::atomic<int> g_other;  // sink: relaxed-stored with the value read back through each shared_ptr
+
+// Times one billion make_shared<int> create/destroy cycles and prints the
+// elapsed time in seconds on stdout.
+int main() {
+  using hr_clock = std::chrono::high_resolution_clock;
+  const auto start = hr_clock::now();
+  clobber();
+  for (int iter = 0; iter < 1000000000; ++iter) {
+    auto sp = std::make_shared<int>(g_int.load(std::memory_order_relaxed));
+    g_other.store(*sp, std::memory_order_relaxed);
+  }
+  clobber();
+  const auto elapsed = std::chrono::duration_cast<std::chrono::nanoseconds>(hr_clock::now() - start);
+  std::cout << elapsed.count() / 1000000000.0 << " seconds" << std::endl;
+  return 0;
+}
Index: benchmarks/shared_ptr_inc_dec_ref.cpp
===================================================================
--- /dev/null
+++ benchmarks/shared_ptr_inc_dec_ref.cpp
@@ -0,0 +1,38 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <memory>
+#include <iostream>
+#include <chrono>
+#include <atomic>
+
+// Compiler-level barrier.  The empty asm emits no instructions, but its
+// "memory" clobber tells the compiler that memory may have been read or
+// written here, so memory accesses are not cached or moved across the call.
+void clobber() { asm volatile("" : : : "memory"); }
+
+std::atomic<int> g_int;    // input: relaxed-loaded once to provide the value of the shared int
+std::atomic<int> g_other;  // sink: relaxed-stored with the value read through each shared_ptr copy
+
+// Times one billion shared_ptr copy/destroy cycles (one use-count
+// increment and decrement each) and prints the elapsed seconds on stdout.
+int main() {
+  // Set up before starting the clock so only the copy loop is timed.
+  auto sp = std::make_shared<int>(g_int.load(std::memory_order_relaxed));
+  const auto start = std::chrono::high_resolution_clock::now();
+  clobber();
+  for (int iter = 0; iter < 1000000000; ++iter) {
+    std::shared_ptr<int> sp2(sp);
+    g_other.store(*sp2, std::memory_order_relaxed);
+  }
+  clobber();
+  const auto stop = std::chrono::high_resolution_clock::now();
+  std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count() / 1000000000.0 << " seconds" << std::endl;
+  return 0;
+}
Index: benchmarks/weak_ptr_inc_dec_ref.cpp
===================================================================
--- /dev/null
+++ benchmarks/weak_ptr_inc_dec_ref.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is dual licensed under the MIT and the University of Illinois Open
+// Source Licenses. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include <memory>
+#include <iostream>
+#include <chrono>
+#include <atomic>
+
+// Compiler-level barrier.  The empty asm emits no instructions, but its
+// "memory" clobber tells the compiler that memory may have been read or
+// written here, so memory accesses are not cached or moved across the call.
+void clobber() { asm volatile("" : : : "memory"); }
+
+std::atomic<int> g_int;    // input: relaxed-loaded once to provide the value of the shared int
+std::atomic<int> g_other;  // NOTE(review): never referenced in this file; kept in parallel with the sibling benchmarks
+
+// Times one billion weak_ptr create/destroy cycles on one shared object
+// and prints the elapsed seconds on stdout.
+int main() {
+  // Set up before starting the clock so only the weak_ptr loop is timed.
+  auto sp = std::make_shared<int>(g_int.load(std::memory_order_relaxed));
+  const auto start = std::chrono::high_resolution_clock::now();
+  clobber();
+  for (int iter = 0; iter < 1000000000; ++iter) {
+    std::weak_ptr<int> wp(sp);
+  }
+  clobber();
+  const auto stop = std::chrono::high_resolution_clock::now();
+  std::cout << std::chrono::duration_cast<std::chrono::nanoseconds>(stop - start).count() / 1000000000.0 << " seconds" << std::endl;
+  return 0;
+}
Index: src/memory.cpp
===================================================================
--- src/memory.cpp
+++ src/memory.cpp
@@ -96,7 +96,35 @@
 void
 __shared_weak_count::__release_weak() _NOEXCEPT
 {
-    if (decrement(__shared_weak_owners_) == -1)
+    // Drops one weak reference and runs __on_zero_shared_weak() when it
+    // was the last one.  __shared_weak_owners_ is biased by one: 0 means
+    // a single remaining reference, so a decrement yielding -1 means none.
+    //
+    // FAST PATH: an acquire load that observes 0 tells us we hold the only
+    // reference, so we skip the atomic read-modify-write (e.g. XADD, STREX),
+    // which is slow and does nasty things to caches.  This is the very
+    // common case of a shared pointer dying with no contended references.
+    //
+    // IS THIS SAFE?  Yes.  If we observe that we are the last reference,
+    // no one else may be accessing this object: anyone who did would be
+    // racing with the destruction of the last shared_ptr / weak_ptr, and
+    // that is undefined anyway.
+    //
+    // Any value other than 0 means possible contention, so we fall back to
+    // the atomic decrement.  The shortcut is not valid for increments,
+    // where several threads may legally (though inadvisably) copy the same
+    // shared_ptr at once, nor for __release_shared, where an outstanding
+    // weak_ptr::lock() could still read / modify the shared count.
+    //
+    // NOTE(review): confirm `_AO_Aquire` matches the enumerator's spelling.
+    if (__libcpp_atomic_load(&__shared_weak_owners_, _AO_Aquire) == 0)
+    {
+        // The counter is deliberately left at 0: storing -1 is pointless
+        // when everything is about to be destroyed.  Skipped store:
+        //__libcpp_atomic_store(&__shared_weak_owners_, -1, _AO_Release);
+        __on_zero_shared_weak();
+    }
+    else if (decrement(__shared_weak_owners_) == -1)
         __on_zero_shared_weak();
 }