diff --git a/libcxxabi/src/cxa_guard_impl.h b/libcxxabi/src/cxa_guard_impl.h
--- a/libcxxabi/src/cxa_guard_impl.h
+++ b/libcxxabi/src/cxa_guard_impl.h
@@ -154,92 +154,82 @@
 constexpr uint32_t (*PlatformThreadID)() = nullptr;
 #endif
 
-constexpr bool PlatformSupportsThreadID() { return +PlatformThreadID != nullptr; }
-
 //===----------------------------------------------------------------------===//
-//                          GuardBase
+//                          GuardByte
 //===----------------------------------------------------------------------===//
 
-enum class AcquireResult {
-  INIT_IS_DONE,
-  INIT_IS_PENDING,
-};
-constexpr AcquireResult INIT_IS_DONE = AcquireResult::INIT_IS_DONE;
-constexpr AcquireResult INIT_IS_PENDING = AcquireResult::INIT_IS_PENDING;
-
 static constexpr uint8_t UNSET = 0;
 static constexpr uint8_t COMPLETE_BIT = (1 << 0);
 static constexpr uint8_t PENDING_BIT = (1 << 1);
 static constexpr uint8_t WAITING_BIT = (1 << 2);
 
-template <class Derived>
-struct GuardObject {
-  GuardObject() = delete;
-  GuardObject(GuardObject const&) = delete;
-  GuardObject& operator=(GuardObject const&) = delete;
+/// Manages reads and writes to the guard byte.
+struct GuardByte {
+  GuardByte() = delete;
+  GuardByte(GuardByte const&) = delete;
+  GuardByte& operator=(GuardByte const&) = delete;
 
-  explicit GuardObject(uint32_t* g)
-      : base_address(g), guard_byte_address(reinterpret_cast<uint8_t*>(g)),
-        init_byte_address(reinterpret_cast<uint8_t*>(g) + 1), thread_id_address(nullptr) {}
-
-  explicit GuardObject(uint64_t* g)
-      : base_address(g), guard_byte_address(reinterpret_cast<uint8_t*>(g)),
-        init_byte_address(reinterpret_cast<uint8_t*>(g) + 1), thread_id_address(reinterpret_cast<uint32_t*>(g) + 1) {}
+  explicit GuardByte(uint8_t* const guard_byte_address) : guard_byte(guard_byte_address) {}
 
 public:
-  /// Implements __cxa_guard_acquire
-  AcquireResult cxa_guard_acquire() {
-    AtomicInt<uint8_t> guard_byte(guard_byte_address);
-    if (guard_byte.load(std::_AO_Acquire) != UNSET)
-      return INIT_IS_DONE;
-    return derived()->acquire_init_byte();
+  /// The guard_byte portion of cxa_guard_acquire. Returns true if
+  /// initialization has already been completed.
+  /// Note: On completion, we haven't 'acquired' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_acquire.
+  bool acquire_guard_byte() {
+    // if guard_byte is non-zero, we have already completed initialization
+    // (i.e. release_guard_byte has been called)
+    return guard_byte.load(std::_AO_Acquire) != UNSET;
   }
 
-  /// Implements __cxa_guard_release
-  void cxa_guard_release() {
-    AtomicInt<uint8_t> guard_byte(guard_byte_address);
-    // Store complete first, so that when release wakes other folks, they see
-    // it as having been completed.
-    guard_byte.store(COMPLETE_BIT, std::_AO_Release);
-    derived()->release_init_byte();
-  }
+  /// The guard_byte portion of cxa_guard_release.
+  /// Note: On completion, we haven't 'released' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_release.
+  void release_guard_byte() { guard_byte.store(COMPLETE_BIT, std::_AO_Release); }
 
-  /// Implements __cxa_guard_abort
-  void cxa_guard_abort() { derived()->abort_init_byte(); }
-
-public:
-  /// base_address - the address of the original guard object.
-  void* const base_address;
-  /// The address of the guard byte at offset 0.
-  uint8_t* const guard_byte_address;
-  /// The address of the byte used by the implementation during initialization.
-  uint8_t* const init_byte_address;
-  /// An optional address storing an identifier for the thread performing initialization.
-  /// It's used to detect recursive initialization.
-  uint32_t* const thread_id_address;
+  /// The guard_byte portion of cxa_guard_abort.
+  void abort_guard_byte() {} // Nothing to do
 
 private:
-  Derived* derived() { return static_cast<Derived*>(this); }
+  AtomicInt<uint8_t> guard_byte;
 };
 
 //===----------------------------------------------------------------------===//
 //                    Single Threaded Implementation
 //===----------------------------------------------------------------------===//
 
-struct InitByteNoThreads : GuardObject<InitByteNoThreads> {
-  using GuardObject::GuardObject;
+/// InitByteNoThreads - Doesn't use any inter-thread synchronization when
+/// managing reads and writes to the init byte.
+struct InitByteNoThreads {
+  InitByteNoThreads() = delete;
+  InitByteNoThreads(InitByteNoThreads const&) = delete;
+  InitByteNoThreads& operator=(InitByteNoThreads const&) = delete;
+
+  explicit InitByteNoThreads(uint8_t* _init_byte_address, uint32_t*) : init_byte_address(_init_byte_address) {}
 
-  AcquireResult acquire_init_byte() {
+  /// The init_byte portion of cxa_guard_acquire. Returns true if
+  /// initialization has already been completed.
+  /// Note: On completion, we haven't 'acquired' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_acquire.
+  bool acquire_init_byte() {
     if (*init_byte_address == COMPLETE_BIT)
-      return INIT_IS_DONE;
+      return true;
     if (*init_byte_address & PENDING_BIT)
       ABORT_WITH_MESSAGE("__cxa_guard_acquire detected recursive initialization");
     *init_byte_address = PENDING_BIT;
-    return INIT_IS_PENDING;
+    return false;
   }
 
+  /// The init_byte portion of cxa_guard_release.
+  /// Note: On completion, we haven't 'released' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_release.
   void release_init_byte() { *init_byte_address = COMPLETE_BIT; }
+  /// The init_byte portion of cxa_guard_abort.
   void abort_init_byte() { *init_byte_address = UNSET; }
+
+private:
+  /// The address of the byte used during initialization.
+  uint8_t* const init_byte_address;
 };
 
 //===----------------------------------------------------------------------===//
@@ -279,18 +269,22 @@
 struct LibcppCondVar {};
 #endif // !defined(_LIBCXXABI_HAS_NO_THREADS)
 
+/// InitByteGlobalMutex - Uses a global mutex and condition variable (common to
+/// all static local variables) to manage reads and writes to the init byte.
 template <class Mutex, class CondVar, Mutex& global_mutex, CondVar& global_cond,
           uint32_t (*GetThreadID)() = PlatformThreadID>
-struct InitByteGlobalMutex : GuardObject<InitByteGlobalMutex<Mutex, CondVar, global_mutex, global_cond, GetThreadID>> {
+struct InitByteGlobalMutex {
 
-  using BaseT = typename InitByteGlobalMutex::GuardObject;
-  using BaseT::BaseT;
-
-  explicit InitByteGlobalMutex(uint32_t* g) : BaseT(g), has_thread_id_support(false) {}
-  explicit InitByteGlobalMutex(uint64_t* g) : BaseT(g), has_thread_id_support(PlatformSupportsThreadID()) {}
+  explicit InitByteGlobalMutex(uint8_t* _init_byte_address, uint32_t* _thread_id_address)
+      : init_byte_address(_init_byte_address), thread_id_address(_thread_id_address),
+        has_thread_id_support(_thread_id_address != nullptr && GetThreadID != nullptr) {}
 
 public:
-  AcquireResult acquire_init_byte() {
+  /// The init_byte portion of cxa_guard_acquire. Returns true if
+  /// initialization has already been completed.
+  /// Note: On completion, we haven't 'acquired' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_acquire.
+  bool acquire_init_byte() {
     LockGuard g("__cxa_guard_acquire");
     // Check for possible recursive initialization.
     if (has_thread_id_support && (*init_byte_address & PENDING_BIT)) {
@@ -305,15 +299,18 @@
     }
 
     if (*init_byte_address == COMPLETE_BIT)
-      return INIT_IS_DONE;
+      return true;
 
     if (has_thread_id_support)
       *thread_id_address = current_thread_id.get();
 
     *init_byte_address = PENDING_BIT;
-    return INIT_IS_PENDING;
+    return false;
   }
 
+  /// The init_byte portion of cxa_guard_release.
+  /// Note: On completion, we haven't 'released' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_release.
   void release_init_byte() {
     bool has_waiting;
     {
@@ -328,6 +325,7 @@
     }
   }
 
+  /// The init_byte portion of cxa_guard_abort.
   void abort_init_byte() {
     bool has_waiting;
     {
@@ -345,8 +343,12 @@
   }
 
 private:
-  using BaseT::init_byte_address;
-  using BaseT::thread_id_address;
+  /// The address of the byte used during initialization.
+  uint8_t* const init_byte_address;
+  /// An optional address storing an identifier for the thread performing initialization.
+  /// It's used to detect recursive initialization.
+  uint32_t* const thread_id_address;
+
   const bool has_thread_id_support;
   LazyValue<uint32_t, GetThreadID> current_thread_id;
 
@@ -393,36 +395,34 @@
 
 constexpr bool PlatformSupportsFutex() { return +PlatformFutexWait != nullptr; }
 
-/// InitByteFutex - Manages initialization using atomics and the futex syscall
-/// for waiting and waking.
+/// InitByteFutex - Uses a futex to manage reads and writes to the init byte.
 template <void (*Wait)(int*, int) = PlatformFutexWait, void (*Wake)(int*) = PlatformFutexWake,
           uint32_t (*GetThreadIDArg)() = PlatformThreadID>
-struct InitByteFutex : GuardObject<InitByteFutex<Wait, Wake, GetThreadIDArg>> {
-  using BaseT = typename InitByteFutex::GuardObject;
-
-  /// ARM Constructor
-  explicit InitByteFutex(uint32_t* g)
-      : BaseT(g), init_byte(this->init_byte_address), has_thread_id_support(this->thread_id_address && GetThreadIDArg),
-        thread_id(this->thread_id_address) {}
+struct InitByteFutex {
 
-  /// Itanium Constructor
-  explicit InitByteFutex(uint64_t* g)
-      : BaseT(g), init_byte(this->init_byte_address), has_thread_id_support(this->thread_id_address && GetThreadIDArg),
-        thread_id(this->thread_id_address) {}
+  explicit InitByteFutex(uint8_t* _init_byte_address, uint32_t* _thread_id_address)
+      : init_byte(_init_byte_address),
+        has_thread_id_support(_thread_id_address != nullptr && GetThreadIDArg != nullptr),
+        thread_id(_thread_id_address), base_address(reinterpret_cast<int*>(
+                                           /*_init_byte_address & ~0x3*/ _init_byte_address - 1)) {}
 
 public:
-  AcquireResult acquire_init_byte() {
+  /// The init_byte portion of cxa_guard_acquire. Returns true if
+  /// initialization has already been completed.
+  /// Note: On completion, we haven't 'acquired' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_acquire.
+  bool acquire_init_byte() {
     while (true) {
       uint8_t last_val = UNSET;
       if (init_byte.compare_exchange(&last_val, PENDING_BIT, std::_AO_Acq_Rel, std::_AO_Acquire)) {
         if (has_thread_id_support) {
           thread_id.store(current_thread_id.get(), std::_AO_Relaxed);
         }
-        return INIT_IS_PENDING;
+        return false;
       }
 
       if (last_val == COMPLETE_BIT)
-        return INIT_IS_DONE;
+        return true;
 
       if (last_val & PENDING_BIT) {
 
@@ -439,7 +439,7 @@
           if (!init_byte.compare_exchange(&last_val, PENDING_BIT | WAITING_BIT, std::_AO_Acq_Rel, std::_AO_Release)) {
             // (1) success, via someone else's work!
             if (last_val == COMPLETE_BIT)
-              return INIT_IS_DONE;
+              return true;
 
             // (3) someone else, bailed on doing the work, retry from the start!
             if (last_val == UNSET)
@@ -453,12 +453,16 @@
     }
   }
 
+  /// The init_byte portion of cxa_guard_release.
+  /// Note: On completion, we haven't 'released' ownership of anything mutex
+  /// related. The name is simply referring to __cxa_guard_release.
   void release_init_byte() {
     uint8_t old = init_byte.exchange(COMPLETE_BIT, std::_AO_Acq_Rel);
     if (old & WAITING_BIT)
       wake_all();
   }
 
+  /// The init_byte portion of cxa_guard_abort.
   void abort_init_byte() {
     if (has_thread_id_support)
       thread_id.store(0, std::_AO_Relaxed);
@@ -470,12 +474,11 @@
 
 private:
   /// Use the futex to wait on the current guard variable. Futex expects a
-  /// 32-bit 4-byte aligned address as the first argument, so we have to use use
-  /// the base address of the guard variable (not the init byte).
-  void wait_on_initialization() {
-    Wait(static_cast<int*>(this->base_address), expected_value_for_futex(PENDING_BIT | WAITING_BIT));
-  }
-  void wake_all() { Wake(static_cast<int*>(this->base_address)); }
+  /// 32-bit 4-byte aligned address as the first argument, so we use the 4-byte
+  /// aligned address that encompasses the init byte (i.e. the address of the
+  /// raw guard object that was passed to __cxa_guard_acquire/release/abort).
+  void wait_on_initialization() { Wait(base_address, expected_value_for_futex(PENDING_BIT | WAITING_BIT)); }
+  void wake_all() { Wake(base_address); }
 
 private:
   AtomicInt<uint8_t> init_byte;
@@ -485,6 +488,10 @@
   AtomicInt<uint32_t> thread_id;
   LazyValue<uint32_t, GetThreadIDArg> current_thread_id;
 
+  /// the 4-byte-aligned address that encompasses the init byte (i.e. the
+  /// address of the raw guard object).
+  int* const base_address;
+
   /// Create the expected integer value for futex `wait(int* addr, int expected)`.
   /// We pass the base address as the first argument, So this function creates
   /// an zero-initialized integer  with `b` copied at the correct offset.
@@ -497,6 +504,86 @@
   static_assert(Wait != nullptr && Wake != nullptr, "");
 };
 
+//===----------------------------------------------------------------------===//
+//                                   Guards
+//===----------------------------------------------------------------------===//
+
+enum class AcquireResult {
+  INIT_IS_DONE,
+  INIT_IS_PENDING,
+};
+constexpr AcquireResult INIT_IS_DONE = AcquireResult::INIT_IS_DONE;
+constexpr AcquireResult INIT_IS_PENDING = AcquireResult::INIT_IS_PENDING;
+
+/// Co-ordinates between GuardByte and InitByte.
+template <class InitByteT>
+struct GuardBase : GuardByte, InitByteT {
+  GuardBase() = delete;
+  GuardBase(GuardBase const&) = delete;
+  GuardBase& operator=(GuardBase const&) = delete;
+
+  /// ARM Constructor
+  explicit GuardBase(uint32_t* raw_guard_object)
+      : GuardByte(reinterpret_cast<uint8_t*>(raw_guard_object)),
+        InitByteT(reinterpret_cast<uint8_t*>(raw_guard_object) + 1, nullptr) {}
+
+  /// Itanium Constructor
+  explicit GuardBase(uint64_t* raw_guard_object)
+      : GuardByte(reinterpret_cast<uint8_t*>(raw_guard_object)),
+        InitByteT(reinterpret_cast<uint8_t*>(raw_guard_object) + 1, reinterpret_cast<uint32_t*>(raw_guard_object) + 1) {
+  }
+
+  /// Implements __cxa_guard_acquire.
+  AcquireResult cxa_guard_acquire() {
+    // use short-circuit evalutation to avoid calling acquire_init_byte when
+    // acquire_guard_byte returns true. (i.e. don't call it when we know from
+    // the guard byte that initialization has already been completed)
+    if (acquire_guard_byte() || InitByteT::acquire_init_byte())
+      return INIT_IS_DONE;
+    return INIT_IS_PENDING;
+  }
+
+  /// Implements __cxa_guard_release.
+  void cxa_guard_release() {
+    // Update guard byte first, so if somebody is woken up by release_init_byte
+    // and comes all the way back around to __cxa_guard_acquire again, they see
+    // it as having completed initialization.
+    release_guard_byte();
+    InitByteT::release_init_byte();
+  }
+
+  /// Implements __cxa_guard_abort.
+  void cxa_guard_abort() {
+    abort_guard_byte();
+    InitByteT::abort_init_byte();
+  }
+};
+
+/// NoThreadsGuard - Manages initialization without performing any inter-thread
+/// synchronization.
+struct NoThreadsGuard : GuardBase<InitByteNoThreads> {
+  using BaseT = typename NoThreadsGuard::GuardBase;
+  using BaseT::BaseT;
+};
+
+/// GlobalMutexGuard - Manages initialization using a global mutex and
+/// condition variable.
+template <class Mutex, class CondVar, Mutex& global_mutex, CondVar& global_cond,
+          uint32_t (*GetThreadID)() = PlatformThreadID>
+struct GlobalMutexGuard : GuardBase<InitByteGlobalMutex<Mutex, CondVar, global_mutex, global_cond, GetThreadID>> {
+  using BaseT = typename GlobalMutexGuard::GuardBase;
+  using BaseT::BaseT;
+};
+
+/// FutexGuard - Manages initialization using atomics and the futex syscall for
+/// waiting and waking.
+template <void (*Wait)(int*, int) = PlatformFutexWait, void (*Wake)(int*) = PlatformFutexWake,
+          uint32_t (*GetThreadIDArg)() = PlatformThreadID>
+struct FutexGuard : GuardBase<InitByteFutex<Wait, Wake, GetThreadIDArg>> {
+  using BaseT = typename FutexGuard::GuardBase;
+  using BaseT::BaseT;
+};
+
 //===----------------------------------------------------------------------===//
 //
 //===----------------------------------------------------------------------===//
@@ -508,25 +595,25 @@
 template <class T>
 _LIBCPP_SAFE_STATIC T GlobalStatic<T>::instance = {};
 
-enum class Implementation { NoThreads, GlobalLock, Futex };
+enum class Implementation { NoThreads, GlobalMutex, Futex };
 
 template <Implementation Impl>
 struct SelectImplementation;
 
 template <>
 struct SelectImplementation<Implementation::NoThreads> {
-  using type = InitByteNoThreads;
+  using type = NoThreadsGuard;
 };
 
 template <>
-struct SelectImplementation<Implementation::GlobalLock> {
-  using type = InitByteGlobalMutex<LibcppMutex, LibcppCondVar, GlobalStatic<LibcppMutex>::instance,
-                                   GlobalStatic<LibcppCondVar>::instance, PlatformThreadID>;
+struct SelectImplementation<Implementation::GlobalMutex> {
+  using type = GlobalMutexGuard<LibcppMutex, LibcppCondVar, GlobalStatic<LibcppMutex>::instance,
+                                GlobalStatic<LibcppCondVar>::instance, PlatformThreadID>;
 };
 
 template <>
 struct SelectImplementation<Implementation::Futex> {
-  using type = InitByteFutex<PlatformFutexWait, PlatformFutexWake, PlatformThreadID>;
+  using type = FutexGuard<PlatformFutexWait, PlatformFutexWake, PlatformThreadID>;
 };
 
 // TODO(EricWF): We should prefer the futex implementation when available. But
@@ -537,7 +624,7 @@
 #elif defined(_LIBCXXABI_USE_FUTEX)
     Implementation::Futex;
 #else
-    Implementation::GlobalLock;
+    Implementation::GlobalMutex;
 #endif
 
 static_assert(CurrentImplementation != Implementation::Futex || PlatformSupportsFutex(),
diff --git a/libcxxabi/test/guard_test_basic.pass.cpp b/libcxxabi/test/guard_test_basic.pass.cpp
--- a/libcxxabi/test/guard_test_basic.pass.cpp
+++ b/libcxxabi/test/guard_test_basic.pass.cpp
@@ -12,8 +12,9 @@
 #include "../src/cxa_guard_impl.h"
 #include <cassert>
 
-// Disable GCC warning about tautological comparison of a function's address
-#if defined(__GNUC__) && !defined(__clang__)
+#if defined(__clang__)
+#pragma clang diagnostic ignored "-Wtautological-pointer-compare"
+#elif defined(__GNUC__)
 # pragma GCC diagnostic ignored "-Waddress"
 #endif
 
@@ -118,42 +119,35 @@
   {
 #if defined(_LIBCXXABI_HAS_NO_THREADS)
     static_assert(CurrentImplementation == Implementation::NoThreads, "");
-    static_assert(
-        std::is_same<SelectedImplementation, InitByteNoThreads>::value, "");
+    static_assert(std::is_same<SelectedImplementation, NoThreadsGuard>::value, "");
 #else
-    static_assert(CurrentImplementation == Implementation::GlobalLock, "");
-    static_assert(
-        std::is_same<
-            SelectedImplementation,
-            InitByteGlobalMutex<LibcppMutex, LibcppCondVar,
-                                GlobalStatic<LibcppMutex>::instance,
-                                GlobalStatic<LibcppCondVar>::instance>>::value,
-        "");
+    static_assert(CurrentImplementation == Implementation::GlobalMutex, "");
+    static_assert(std::is_same<SelectedImplementation,
+                               GlobalMutexGuard<LibcppMutex, LibcppCondVar, GlobalStatic<LibcppMutex>::instance,
+                                                GlobalStatic<LibcppCondVar>::instance>>::value,
+                  "");
 #endif
   }
   {
 #if (defined(__APPLE__) || defined(__linux__))  && !defined(_LIBCXXABI_HAS_NO_THREADS)
     assert(PlatformThreadID);
 #endif
-    if (PlatformSupportsThreadID()) {
+    if (PlatformThreadID != nullptr) {
       assert(PlatformThreadID() != 0);
       assert(PlatformThreadID() == PlatformThreadID());
     }
   }
   {
-    Tests<uint32_t, InitByteNoThreads>::test();
-    Tests<uint64_t, InitByteNoThreads>::test();
+    Tests<uint32_t, NoThreadsGuard>::test();
+    Tests<uint64_t, NoThreadsGuard>::test();
   }
   {
-    using MutexImpl =
-        InitByteGlobalMutex<NopMutex, NopCondVar, global_nop_mutex,
-                            global_nop_cond, MockGetThreadID>;
+    using MutexImpl = GlobalMutexGuard<NopMutex, NopCondVar, global_nop_mutex, global_nop_cond, MockGetThreadID>;
     Tests<uint32_t, MutexImpl>::test();
     Tests<uint64_t, MutexImpl>::test();
   }
   {
-    using FutexImpl =
-        InitByteFutex<&NopFutexWait, &NopFutexWake, &MockGetThreadID>;
+    using FutexImpl = FutexGuard<&NopFutexWait, &NopFutexWake, &MockGetThreadID>;
     Tests<uint32_t, FutexImpl>::test();
     Tests<uint64_t, FutexImpl>::test();
   }
diff --git a/libcxxabi/test/guard_threaded_test.pass.cpp b/libcxxabi/test/guard_threaded_test.pass.cpp
--- a/libcxxabi/test/guard_threaded_test.pass.cpp
+++ b/libcxxabi/test/guard_threaded_test.pass.cpp
@@ -326,7 +326,7 @@
   }
 
 void test_all_impls() {
-  using MutexImpl = SelectImplementation<Implementation::GlobalLock>::type;
+  using MutexImpl = SelectImplementation<Implementation::GlobalMutex>::type;
 
   // Attempt to test the Futex based implementation if it's supported on the
   // target platform.