diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -15,6 +15,7 @@
 #include <__algorithm/min_element.h>
 #include <__algorithm/partial_sort.h>
 #include <__algorithm/unwrap_iter.h>
+#include <__assert>
 #include <__config>
 #include <__utility/swap.h>
 #include <memory>
@@ -70,7 +71,7 @@
 template <class _Compare, class _ForwardIterator>
 unsigned __sort4(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3, _ForwardIterator __x4,
                  _Compare __c) {
-  unsigned __r = _VSTD::__sort3<_Compare>(__x1, __x2, __x3, __c);
+  unsigned __r = std::__sort3<_Compare>(__x1, __x2, __x3, __c);
   if (__c(*__x4, *__x3)) {
     swap(*__x3, *__x4);
     ++__r;
@@ -91,7 +92,7 @@
 template <class _Compare, class _ForwardIterator>
 _LIBCPP_HIDDEN unsigned __sort5(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3,
                                 _ForwardIterator __x4, _ForwardIterator __x5, _Compare __c) {
-  unsigned __r = _VSTD::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
+  unsigned __r = std::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
   if (__c(*__x5, *__x4)) {
     swap(*__x4, *__x5);
     ++__r;
@@ -153,52 +154,52 @@
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort3_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _Compare __c) {
-  _VSTD::__cond_swap<_Compare>(__x2, __x3, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x1, __x2, __x3, __c);
+  std::__cond_swap<_Compare>(__x2, __x3, __c);
+  std::__partially_sorted_swap<_Compare>(__x1, __x2, __x3, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<!__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort3_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _Compare __c) {
-  _VSTD::__sort3<_Compare>(__x1, __x2, __x3, __c);
+  std::__sort3<_Compare>(__x1, __x2, __x3, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort4_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _Compare __c) {
-  _VSTD::__cond_swap<_Compare>(__x1, __x3, __c);
-  _VSTD::__cond_swap<_Compare>(__x2, __x4, __c);
-  _VSTD::__cond_swap<_Compare>(__x1, __x2, __c);
-  _VSTD::__cond_swap<_Compare>(__x3, __x4, __c);
-  _VSTD::__cond_swap<_Compare>(__x2, __x3, __c);
+  std::__cond_swap<_Compare>(__x1, __x3, __c);
+  std::__cond_swap<_Compare>(__x2, __x4, __c);
+  std::__cond_swap<_Compare>(__x1, __x2, __c);
+  std::__cond_swap<_Compare>(__x3, __x4, __c);
+  std::__cond_swap<_Compare>(__x2, __x3, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<!__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort4_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _Compare __c) {
-  _VSTD::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
+  std::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort5_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _RandomAccessIterator __x5, _Compare __c) {
-  _VSTD::__cond_swap<_Compare>(__x1, __x2, __c);
-  _VSTD::__cond_swap<_Compare>(__x4, __x5, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x3, __x4, __x5, __c);
-  _VSTD::__cond_swap<_Compare>(__x2, __x5, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x1, __x3, __x4, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x2, __x3, __x4, __c);
+  std::__cond_swap<_Compare>(__x1, __x2, __c);
+  std::__cond_swap<_Compare>(__x4, __x5, __c);
+  std::__partially_sorted_swap<_Compare>(__x3, __x4, __x5, __c);
+  std::__cond_swap<_Compare>(__x2, __x5, __c);
+  std::__partially_sorted_swap<_Compare>(__x1, __x3, __x4, __c);
+  std::__partially_sorted_swap<_Compare>(__x2, __x3, __x4, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<!__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort5_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _RandomAccessIterator __x5, _Compare __c) {
-  _VSTD::__sort5<_Compare>(__x1, __x2, __x3, __x4, __x5, __c);
+  std::__sort5<_Compare>(__x1, __x2, __x3, __x4, __x5, __c);
 }
 
 // Assumes size > 0
@@ -207,31 +208,16 @@
                                                     _Compare __comp) {
   _BidirectionalIterator __lm1 = __last;
   for (--__lm1; __first != __lm1; ++__first) {
-    _BidirectionalIterator __i = _VSTD::min_element(__first, __last, __comp);
+    _BidirectionalIterator __i = std::min_element(__first, __last, __comp);
     if (__i != __first)
       swap(*__first, *__i);
   }
 }
 
-template <class _Compare, class _BidirectionalIterator>
-void __insertion_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) {
-  typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
-  if (__first != __last) {
-    _BidirectionalIterator __i = __first;
-    for (++__i; __i != __last; ++__i) {
-      _BidirectionalIterator __j = __i;
-      value_type __t(_VSTD::move(*__j));
-      for (_BidirectionalIterator __k = __i; __k != __first && __comp(__t, *--__k); --__j)
-        *__j = _VSTD::move(*__k);
-      *__j = _VSTD::move(__t);
-    }
-  }
-}
-
 // Sort the iterator range [__first, __last) using the comparator __comp using
 // the insertion sort algorithm.
 template <class _Compare, class _RandomAccessIterator>
-void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+void __insertion_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
   if (__first == __last)
@@ -239,14 +225,14 @@
   for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
     _RandomAccessIterator __j = __i - difference_type(1);
     if (__comp(*__i, *__j)) {
-      value_type __t(_VSTD::move(*__i));
+      value_type __t(std::move(*__i));
       _RandomAccessIterator __k = __j;
       __j = __i;
       do {
-        *__j = _VSTD::move(*__k);
+        *__j = std::move(*__k);
         __j = __k;
       } while (__j != __first && __comp(__t, *--__k));
-      *__j = _VSTD::move(__t);
+      *__j = std::move(__t);
     }
   }
 }
@@ -257,7 +243,7 @@
 // Assumes that there is an element in the position (__first - 1) and that each
 // element in the input range is greater or equal to the element at __first - 1.
 template <class _Compare, class _RandomAccessIterator>
-void __insertion_sort_3_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+void __insertion_sort_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
   if (__first == __last)
@@ -265,14 +251,14 @@
   for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
     _RandomAccessIterator __j = __i - difference_type(1);
     if (__comp(*__i, *__j)) {
-      value_type __t(_VSTD::move(*__i));
+      value_type __t(std::move(*__i));
       _RandomAccessIterator __k = __j;
       __j = __i;
       do {
-        *__j = _VSTD::move(*__k);
+        *__j = std::move(*__k);
         __j = __k;
       } while (__comp(__t, *--__k)); // No need for bounds check due to the assumption stated above.
-      *__j = _VSTD::move(__t);
+      *__j = std::move(__t);
     }
   }
 }
@@ -289,32 +275,32 @@
       swap(*__first, *__last);
     return true;
   case 3:
-    _VSTD::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
+    std::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
     return true;
   case 4:
-    _VSTD::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                              --__last, __comp);
+    std::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                            --__last, __comp);
     return true;
   case 5:
-    _VSTD::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                              __first + difference_type(3), --__last, __comp);
+    std::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                            __first + difference_type(3), --__last, __comp);
     return true;
   }
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
   _RandomAccessIterator __j = __first + difference_type(2);
-  _VSTD::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), __j, __comp);
+  std::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), __j, __comp);
   const unsigned __limit = 8;
   unsigned __count = 0;
   for (_RandomAccessIterator __i = __j + difference_type(1); __i != __last; ++__i) {
     if (__comp(*__i, *__j)) {
-      value_type __t(_VSTD::move(*__i));
+      value_type __t(std::move(*__i));
       _RandomAccessIterator __k = __j;
       __j = __i;
       do {
-        *__j = _VSTD::move(*__k);
+        *__j = std::move(*__k);
         __j = __k;
       } while (__j != __first && __comp(__t, *--__k));
-      *__j = _VSTD::move(__t);
+      *__j = std::move(__t);
       if (++__count == __limit)
         return ++__i == __last;
     }
@@ -331,19 +317,19 @@
     __destruct_n __d(0);
     unique_ptr<value_type, __destruct_n&> __h(__first2, __d);
     value_type* __last2 = __first2;
-    ::new ((void*)__last2) value_type(_VSTD::move(*__first1));
+    ::new ((void*)__last2) value_type(std::move(*__first1));
     __d.template __incr<value_type>();
     for (++__last2; ++__first1 != __last1; ++__last2) {
       value_type* __j2 = __last2;
       value_type* __i2 = __j2;
       if (__comp(*__first1, *--__i2)) {
-        ::new ((void*)__j2) value_type(_VSTD::move(*__i2));
+        ::new ((void*)__j2) value_type(std::move(*__i2));
         __d.template __incr<value_type>();
         for (--__j2; __i2 != __first2 && __comp(*__first1, *--__i2); --__j2)
-          *__j2 = _VSTD::move(*__i2);
-        *__j2 = _VSTD::move(*__first1);
+          *__j2 = std::move(*__i2);
+        *__j2 = std::move(*__first1);
       } else {
-        ::new ((void*)__j2) value_type(_VSTD::move(*__first1));
+        ::new ((void*)__j2) value_type(std::move(*__first1));
         __d.template __incr<value_type>();
       }
     }
@@ -351,135 +337,65 @@
   }
 }
 
-struct __64bit_set {
-  typedef uint64_t __storage_t;
-  enum { __block_size = 64 };
-  static __storage_t __blsr(__storage_t x) {
-    // _blsr_u64 can be used here but it did not make any performance
-    // difference in practice.
-    return x ^ (x & -x);
-  }
-  static int __clz(__storage_t x) { return __builtin_clzll(x); }
-  static int __ctz(__storage_t x) { return __builtin_ctzll(x); }
-  static int __popcount(__storage_t x) { return __builtin_popcountll(x); }
-};
-
-struct __32bit_set {
-  typedef uint32_t __storage_t;
-  enum { __block_size = 32 };
-  static __storage_t __blsr(__storage_t x) {
-    // _blsr_u32 can be used here but it did not make any performance
-    // difference in practice.
-    return x ^ (x & -x);
-  }
-  static int __clz(__storage_t x) { return __builtin_clzl(x); }
-  static int __ctz(__storage_t x) { return __builtin_ctzl(x); }
-  static int __popcount(__storage_t x) { return __builtin_popcountl(x); }
-};
-
-template <class _Bitset, class _RandomAccessIterator>
+template <class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos(_RandomAccessIterator __first, _RandomAccessIterator __last,
-                                                    typename _Bitset::__storage_t& __left_bitset,
-                                                    typename _Bitset::__storage_t& __right_bitset) {
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+                                                    uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   // Swap one pair on each iteration as long as both bitsets have at least one
   // element for swapping.
   while (__left_bitset != 0 && __right_bitset != 0) {
-    difference_type tz_left = _Bitset::__ctz(__left_bitset);
-    __left_bitset = _Bitset::__blsr(__left_bitset);
-    difference_type tz_right = _Bitset::__ctz(__right_bitset);
-    __right_bitset = _Bitset::__blsr(__right_bitset);
-    _VSTD::iter_swap(__first + tz_left, __last - tz_right);
+    difference_type tz_left = __libcpp_ctz(__left_bitset);
+    __left_bitset = __libcpp_blsr(__left_bitset);
+    difference_type tz_right = __libcpp_ctz(__right_bitset);
+    __right_bitset = __libcpp_blsr(__right_bitset);
+    std::iter_swap(__first + tz_left, __last - tz_right);
   }
 }
 
-// Partition [__first, __last) using the comparator __comp.  *__first has the
-// chosen pivot.  Elements that are equivalent are kept to the left of the
-// pivot.  Returns the iterator for the pivot and a bool value which is true if
-// the provided range is already sorted, false otherwise.  We assume that the
-// length of the range is at least three elements.
-//
-// We term the partitioning algorithm as bitset partitioning since the
-// outcomes of the comparisons between the pivot and other elements are stored
-// in the fixed size bitsets.
-template <class _Bitset, class _RandomAccessIterator, class _Compare>
-_LIBCPP_HIDE_FROM_ABI _VSTD::pair<_RandomAccessIterator, bool>
-__bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename _Bitset::__storage_t __storage_t;
-  _RandomAccessIterator __begin = __first;
-  value_type __pivot(_VSTD::move(*__first));
-  // Find the first element greater than the pivot.
-  if (__comp(__pivot, *(__last - difference_type(1)))) {
-    // Not guarded since we know the last element is greater than the pivot.
-    while (!__comp(__pivot, *++__first)) {
-    }
-  } else {
-    while (++__first < __last && !__comp(__pivot, *__first)) {
-    }
-  }
-  // Find the last element less than or equal to the pivot.
-  if (__first < __last) {
-    // It will be always guarded because __introsort will do the median-of-three
-    // before calling this.
-    while (__comp(__pivot, *--__last)) {
-    }
+template <class _Compare, class _RandomAccessIterator,
+          class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void __populate_left_bitset(_RandomAccessIterator __first, _Compare __comp,
+                                                         _ValueType& __pivot, uint64_t& __left_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
+  // Possible vectorization. With a proper "-march" flag, the following loop
+  // will be compiled into a set of SIMD instructions.
+  _RandomAccessIterator __iter = __first;
+  for (int __j = 0; __j < __block_size;) {
+    bool __comp_result = !__comp(*__iter, __pivot);
+    __left_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+    __j++;
+    ++__iter;
   }
-  // If the first element greater than the pivot is at or after the
-  // last element less than or equal to the pivot, then we have covered the
-  // entire range without swapping elements.  This implies the range is already
-  // partitioned.
-  bool __already_partitioned = __first >= __last;
-  if (!__already_partitioned) {
-    _VSTD::iter_swap(__first, __last);
-    ++__first;
-  }
-
-  // In [__first, __last) __last is not inclusive. From now on, it uses last
-  // minus one to be inclusive on both sides.
-  _RandomAccessIterator __lm1 = __last - difference_type(1);
-  __storage_t __left_bitset = 0;
-  __storage_t __right_bitset = 0;
+}
 
-  // Reminder: length = __lm1 - __first + 1.
-  while (__lm1 - __first >= 2 * _Bitset::__block_size - 1) {
-    // Record the comparison outcomes for the elements currently on the left
-    // side.
-    if (__left_bitset == 0) {
-      // Possible vectorization. With a proper "-march" flag, the following loop
-      // will be compiled into a set of SIMD instructions.
-      _RandomAccessIterator __iter = __first;
-      for (int __j = 0; __j < _Bitset::__block_size;) {
-        bool __comp_result = !__comp(*__iter, __pivot);
-        __left_bitset |= (static_cast<__storage_t>(__comp_result) << __j);
-        __j++;
-        ++__iter;
-      }
-    }
-    // Record the comparison outcomes for the elements currently on the right
-    // side.
-    if (__right_bitset == 0) {
-      // Possible vectorization. With a proper "-march" flag, the following loop
-      // will be compiled into a set of SIMD instructions.
-      _RandomAccessIterator __iter = __lm1;
-      for (int __j = 0; __j < _Bitset::__block_size;) {
-        bool __comp_result = __comp(*__iter, __pivot);
-        __right_bitset |= (static_cast<__storage_t>(__comp_result) << __j);
-        __j++;
-        --__iter;
-      }
-    }
-    // Swap the elements recorded to be the candidates for swapping in the
-    // bitsets.
-    __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
-    // Only advance the iterator if all the elements that need to be moved to
-    // other side were moved.
-    __first += (__left_bitset == 0) ? difference_type(_Bitset::__block_size) : difference_type(0);
-    __lm1 -= (__right_bitset == 0) ? difference_type(_Bitset::__block_size) : difference_type(0);
+template <class _Compare, class _RandomAccessIterator,
+          class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void __populate_right_bitset(_RandomAccessIterator __lm1, _Compare __comp,
+                                                          _ValueType& __pivot, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
+  // Possible vectorization. With a proper "-march" flag, the following loop
+  // will be compiled into a set of SIMD instructions.
+  _RandomAccessIterator __iter = __lm1;
+  for (int __j = 0; __j < __block_size;) {
+    bool __comp_result = __comp(*__iter, __pivot);
+    __right_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+    __j++;
+    --__iter;
   }
-  // Now, we have a less-than a block worth of elements on at least one of the
-  // sides.
+}
+
+template <class _Compare, class _RandomAccessIterator,
+          class _ValueType = typename iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void
+__bitset_partition_partial_blocks(_RandomAccessIterator& __first, _RandomAccessIterator& __lm1, _Compare __comp,
+                                  _ValueType& __pivot, uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
   difference_type __remaining_len = __lm1 - __first + 1;
   difference_type __l_size;
   difference_type __r_size;
@@ -488,18 +404,18 @@
     __r_size = __remaining_len - __l_size;
   } else if (__left_bitset == 0) {
     // We know at least one side is a full block.
-    __l_size = __remaining_len - _Bitset::__block_size;
-    __r_size = _Bitset::__block_size;
+    __l_size = __remaining_len - __block_size;
+    __r_size = __block_size;
   } else { // if (__right_bitset == 0)
-    __l_size = _Bitset::__block_size;
-    __r_size = __remaining_len - _Bitset::__block_size;
+    __l_size = __block_size;
+    __r_size = __remaining_len - __block_size;
   }
   // Record the comparison outcomes for the elements currently on the left side.
   if (__left_bitset == 0) {
     _RandomAccessIterator __iter = __first;
     for (int j = 0; j < __l_size; j++) {
       bool __comp_result = !__comp(*__iter, __pivot);
-      __left_bitset |= (static_cast<__storage_t>(__comp_result) << j);
+      __left_bitset |= (static_cast<uint64_t>(__comp_result) << j);
       ++__iter;
     }
   }
@@ -509,24 +425,30 @@
     _RandomAccessIterator __iter = __lm1;
     for (int j = 0; j < __r_size; j++) {
       bool __comp_result = __comp(*__iter, __pivot);
-      __right_bitset |= (static_cast<__storage_t>(__comp_result) << j);
+      __right_bitset |= (static_cast<uint64_t>(__comp_result) << j);
       --__iter;
     }
   }
-  __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
+  __swap_bitmap_pos(__first, __lm1, __left_bitset, __right_bitset);
   __first += (__left_bitset == 0) ? __l_size : 0;
   __lm1 -= (__right_bitset == 0) ? __r_size : 0;
-  // At least one the bitsets would be empty.  For the non-empty one, we need to
-  // properly partition the elements that appear within that bitset.
+}
+
+template <class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos_within(_RandomAccessIterator& __first, _RandomAccessIterator& __lm1,
+                                                           uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
   if (__left_bitset) {
     // Swap within the left side.  Need to find set positions in the reverse
     // order.
     while (__left_bitset != 0) {
-      difference_type __tz_left = _Bitset::__block_size - 1 - _Bitset::__clz(__left_bitset);
-      __left_bitset &= (static_cast<__storage_t>(1) << __tz_left) - 1;
+      difference_type __tz_left = __block_size - 1 - __libcpp_clz(__left_bitset);
+      __left_bitset &= (static_cast<uint64_t>(1) << __tz_left) - 1;
       _RandomAccessIterator it = __first + __tz_left;
       if (it != __lm1) {
-        _VSTD::iter_swap(it, __lm1);
+        std::iter_swap(it, __lm1);
       }
       --__lm1;
     }
@@ -535,22 +457,100 @@
     // Swap within the right side.  Need to find set positions in the reverse
     // order.
     while (__right_bitset != 0) {
-      difference_type __tz_right = _Bitset::__block_size - 1 - _Bitset::__clz(__right_bitset);
-      __right_bitset &= (static_cast<__storage_t>(1) << __tz_right) - 1;
+      difference_type __tz_right = __block_size - 1 - __libcpp_clz(__right_bitset);
+      __right_bitset &= (static_cast<uint64_t>(1) << __tz_right) - 1;
       _RandomAccessIterator it = __lm1 - __tz_right;
       if (it != __first) {
-        _VSTD::iter_swap(it, __first);
+        std::iter_swap(it, __first);
       }
       ++__first;
     }
   }
+}
+
+// Partition [__first, __last) using the comparator __comp.  *__first has the
+// chosen pivot.  Elements that are equivalent are kept to the left of the
+// pivot.  Returns the iterator for the pivot and a bool value which is true if
+// the provided range is already sorted, false otherwise.  We assume that the
+// length of the range is at least three elements.
+//
+// __bitset_partition uses bitsets for storing outcomes of the comparisons
+// between the pivot and other elements.
+template <class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
+__bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  _LIBCPP_ASSERT(__last - __first >= difference_type(3), "");
+
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(std::move(*__first));
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
+  // Find the first element greater than the pivot.
+  if (__comp(__pivot, *(__last - difference_type(1)))) {
+    // Not guarded since we know the last element is greater than the pivot.
+    while (!__comp(__pivot, *++__first)) {
+    }
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {
+    }
+  }
+  // Find the last element less than or equal to the pivot.
+  if (__first < __last) {
+    // It will be always guarded because __introsort will do the median-of-three
+    // before calling this.
+    while (__comp(__pivot, *--__last)) {
+    }
+  }
+  // If the first element greater than the pivot is at or after the
+  // last element less than or equal to the pivot, then we have covered the
+  // entire range without swapping elements.  This implies the range is already
+  // partitioned.
+  bool __already_partitioned = __first >= __last;
+  if (!__already_partitioned) {
+    std::iter_swap(__first, __last);
+    ++__first;
+  }
+
+  // In [__first, __last) __last is not inclusive. From now on, it uses last
+  // minus one to be inclusive on both sides.
+  _RandomAccessIterator __lm1 = __last - difference_type(1);
+  uint64_t __left_bitset = 0;
+  uint64_t __right_bitset = 0;
+
+  // Reminder: length = __lm1 - __first + 1.
+  while (__lm1 - __first >= 2 * __block_size - 1) {
+    // Record the comparison outcomes for the elements currently on the left
+    // side.
+    if (__left_bitset == 0)
+      __populate_left_bitset<_Compare>(__first, __comp, __pivot, __left_bitset);
+    // Record the comparison outcomes for the elements currently on the right
+    // side.
+    if (__right_bitset == 0)
+      __populate_right_bitset<_Compare>(__lm1, __comp, __pivot, __right_bitset);
+    // Swap the elements recorded to be the candidates for swapping in the
+    // bitsets.
+    __swap_bitmap_pos(__first, __lm1, __left_bitset, __right_bitset);
+    // Only advance the iterator if all the elements that need to be moved to
+    // other side were moved.
+    __first += (__left_bitset == 0) ? difference_type(__block_size) : difference_type(0);
+    __lm1 -= (__right_bitset == 0) ? difference_type(__block_size) : difference_type(0);
+  }
+  // Now, we have less than a block's worth of elements on at least one of the
+  // sides.
+  __bitset_partition_partial_blocks(__first, __lm1, __comp, __pivot, __left_bitset, __right_bitset);
+  // At least one of the bitsets will be empty.  For the non-empty one, we need to
+  // properly partition the elements that appear within that bitset.
+  __swap_bitmap_pos_within(__first, __lm1, __left_bitset, __right_bitset);
+
   // Move the pivot to its correct position.
   _RandomAccessIterator __pivot_pos = __first - difference_type(1);
   if (__begin != __pivot_pos) {
-    *__begin = _VSTD::move(*__pivot_pos);
+    *__begin = std::move(*__pivot_pos);
   }
-  *__pivot_pos = _VSTD::move(__pivot);
-  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
+  *__pivot_pos = std::move(__pivot);
+  return std::make_pair(__pivot_pos, __already_partitioned);
 }
 
 // Partition [__first, __last) using the comparator __comp.  *__first has the
@@ -559,12 +559,13 @@
 // the provided range is already sorted, false otherwise.  We assume that the
 // length of the range is at least three elements.
 template <class _RandomAccessIterator, class _Compare>
-_LIBCPP_HIDE_FROM_ABI _VSTD::pair<_RandomAccessIterator, bool>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
 __partition_with_equals_on_right(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  _LIBCPP_ASSERT(__last - __first >= difference_type(3), "");
   _RandomAccessIterator __begin = __first;
-  value_type __pivot(_VSTD::move(*__first));
+  value_type __pivot(std::move(*__first));
   // Find the first element greater or equal to the pivot.  It will be always
   // guarded because __introsort will do the median-of-three before calling
   // this.
@@ -589,7 +590,7 @@
   // right of the pivot and the other to left of the pivot) that are not on the
   // correct side of the pivot.
   while (__first < __last) {
-    _VSTD::iter_swap(__first, __last);
+    std::iter_swap(__first, __last);
     while (__comp(*++__first, __pivot))
       ;
     while (!__comp(*--__last, __pivot))
@@ -598,10 +599,10 @@
   // Move the pivot to its correct position.
   _RandomAccessIterator __pivot_pos = __first - difference_type(1);
   if (__begin != __pivot_pos) {
-    *__begin = _VSTD::move(*__pivot_pos);
+    *__begin = std::move(*__pivot_pos);
   }
-  *__pivot_pos = _VSTD::move(__pivot);
-  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
+  *__pivot_pos = std::move(__pivot);
+  return std::make_pair(__pivot_pos, __already_partitioned);
 }
 
 // Similar to the above function.  Elements equivalent to the pivot are put to
@@ -611,9 +612,9 @@
                                                                             _RandomAccessIterator __last,
                                                                             _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
   _RandomAccessIterator __begin = __first;
-  value_type __pivot(_VSTD::move(*__first));
+  value_type __pivot(std::move(*__first));
   if (__comp(__pivot, *(__last - difference_type(1)))) {
     // Guarded.
     while (!__comp(__pivot, *++__first)) {
@@ -630,7 +631,7 @@
     }
   }
   while (__first < __last) {
-    _VSTD::iter_swap(__first, __last);
+    std::iter_swap(__first, __last);
     while (!__comp(__pivot, *++__first))
       ;
     while (__comp(__pivot, *--__last))
@@ -638,9 +639,9 @@
   }
   _RandomAccessIterator __pivot_pos = __first - difference_type(1);
   if (__begin != __pivot_pos) {
-    *__begin = _VSTD::move(*__pivot_pos);
+    *__begin = std::move(*__pivot_pos);
   }
-  *__pivot_pos = _VSTD::move(__pivot);
+  *__pivot_pos = std::move(__pivot);
   return __first;
 }
 
@@ -650,16 +651,16 @@
 //  - Tuckey's ninther technique for computing the pivot,
 //  - check on whether partition was not required.
 // The implementation is partly based on Orson Peters' pattern-defeating
-// quicksort, published at: <https://github.com/orlp/pdqsort>
+// quicksort, published at: <https://github.com/orlp/pdqsort>.
 template <class _Compare, class _RandomAccessIterator, bool _UseBitSetPartition>
 void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
                  typename iterator_traits<_RandomAccessIterator>::difference_type __depth, bool __leftmost = true) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
   // Upper bound for using insertion sort for sorting.
-  _LIBCPP_CONSTEXPR_AFTER_CXX11 difference_type __limit = 24;
+  _LIBCPP_CONSTEXPR difference_type __limit = 24;
   // Lower bound for using Tuckey's ninther technique for median computation.
-  _LIBCPP_CONSTEXPR_AFTER_CXX11 difference_type __ninther_threshold = 128;
+  _LIBCPP_CONSTEXPR difference_type __ninther_threshold = 128;
   while (true) {
     difference_type __len = __last - __first;
     switch (__len) {
@@ -671,30 +672,29 @@
         swap(*__first, *__last);
       return;
     case 3:
-      _VSTD::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
+      std::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
       return;
     case 4:
-      _VSTD::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                                --__last, __comp);
+      std::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                              --__last, __comp);
       return;
     case 5:
-      _VSTD::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                                __first + difference_type(3), --__last, __comp);
+      std::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                              __first + difference_type(3), --__last, __comp);
       return;
     }
-    // Use insertion sort if the length of the range is below the specified
-    // limit.
+    // Use insertion sort if the length of the range is below the specified limit.
     if (__len < __limit) {
       if (__leftmost) {
-        _VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp);
+        std::__insertion_sort<_Compare>(__first, __last, __comp);
       } else {
-        _VSTD::__insertion_sort_3_unguarded<_Compare>(__first, __last, __comp);
+        std::__insertion_sort_unguarded<_Compare>(__first, __last, __comp);
       }
       return;
     }
     if (__depth == 0) {
       // Fallback to heap sort as Introsort suggests.
-      _VSTD::__partial_sort<_Compare>(__first, __last, __last, __comp);
+      std::__partial_sort<_Compare>(__first, __last, __last, __comp);
       return;
     }
     --__depth;
@@ -703,15 +703,15 @@
       // Use Tuckey's ninther technique or median of 3 for pivot selection
       // depending on the length of the range being sorted.
       if (__len > __ninther_threshold) {
-        _VSTD::__sort3<_Compare>(__first, __first + __half_len, __last - difference_type(1), __comp);
-        _VSTD::__sort3<_Compare>(__first + difference_type(1), __first + (__half_len - 1), __last - difference_type(2),
-                                 __comp);
-        _VSTD::__sort3<_Compare>(__first + difference_type(2), __first + (__half_len + 1), __last - difference_type(3),
-                                 __comp);
-        _VSTD::__sort3<_Compare>(__first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1), __comp);
-        _VSTD::iter_swap(__first, __first + __half_len);
+        std::__sort3<_Compare>(__first, __first + __half_len, __last - difference_type(1), __comp);
+        std::__sort3<_Compare>(__first + difference_type(1), __first + (__half_len - 1), __last - difference_type(2),
+                               __comp);
+        std::__sort3<_Compare>(__first + difference_type(2), __first + (__half_len + 1), __last - difference_type(3),
+                               __comp);
+        std::__sort3<_Compare>(__first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1), __comp);
+        std::iter_swap(__first, __first + __half_len);
       } else {
-        _VSTD::__sort3<_Compare>(__first + __half_len, __first, __last - difference_type(1), __comp);
+        std::__sort3<_Compare>(__first + __half_len, __first, __last - difference_type(1), __comp);
       }
     }
     // The elements to the left of the current iterator range are already
@@ -728,14 +728,14 @@
     }
     // Use bitset partition only if asked for.
     auto __ret = _UseBitSetPartition
-                     ? __bitset_partition<__64bit_set, _RandomAccessIterator, _Compare>(__first, __last, __comp)
+                     ? __bitset_partition<_RandomAccessIterator, _Compare>(__first, __last, __comp)
                      : __partition_with_equals_on_right<_RandomAccessIterator, _Compare>(__first, __last, __comp);
     _RandomAccessIterator __i = __ret.first;
     // [__first, __i) < *__i and *__i <= [__i+1, __last)
     // If we were given a perfect partition, see if insertion sort is quick...
     if (__ret.second) {
-      bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
-      if (_VSTD::__insertion_sort_incomplete<_Compare>(__i + difference_type(1), __last, __comp)) {
+      bool __fs = std::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
+      if (std::__insertion_sort_incomplete<_Compare>(__i + difference_type(1), __last, __comp)) {
         if (__fs)
           return;
         __last = __i;
@@ -747,9 +747,8 @@
         }
       }
     }
-    // Sort the left partiton recursively and the right partition with tail
-    // recursion elimination.
-    _VSTD::__introsort<_Compare, _RandomAccessIterator, _UseBitSetPartition>(__first, __i, __comp, __depth, __leftmost);
+    // Sort the left partition recursively and the right partition with tail recursion elimination.
+    std::__introsort<_Compare, _RandomAccessIterator, _UseBitSetPartition>(__first, __i, __comp, __depth, __leftmost);
     __leftmost = false;
     __first = ++__i;
   }
@@ -772,15 +771,14 @@
   // Only use bitset partitioning for arithmetic types.  We should also check
   // that the default comparator is in use so that we are sure that there are no
   // branches in the comparator.
-  _VSTD::__introsort<_Compare, _RandomAccessIterator,
-                     _VSTD::is_arithmetic<typename iterator_traits<_RandomAccessIterator>::value_type>::value>(
+  std::__introsort<_Compare, _RandomAccessIterator, __use_branchless_sort<_Compare, _RandomAccessIterator>::value>(
       __first, __last, __comp, __depth_limit);
 }
 
 template <class _Compare, class _Tp>
 inline _LIBCPP_INLINE_VISIBILITY void __sort(_Tp** __first, _Tp** __last, __less<_Tp*>&) {
   __less<uintptr_t> __comp;
-  _VSTD::__sort<__less<uintptr_t>&, uintptr_t*>((uintptr_t*)__first, (uintptr_t*)__last, __comp);
+  std::__sort<__less<uintptr_t>&, uintptr_t*>((uintptr_t*)__first, (uintptr_t*)__last, __comp);
 }
 
 _LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less<char>&, char*>(char*, char*, __less<char>&))
@@ -827,16 +825,16 @@
   _LIBCPP_DEBUG_RANDOMIZE_RANGE(__first, __last);
   typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
   if (__libcpp_is_constant_evaluated()) {
-    _VSTD::__partial_sort<_Comp_ref>(__first, __last, __last, _Comp_ref(__comp));
+    std::__partial_sort<_Comp_ref>(__first, __last, __last, _Comp_ref(__comp));
   } else {
-    _VSTD::__sort<_Comp_ref>(_VSTD::__unwrap_iter(__first), _VSTD::__unwrap_iter(__last), _Comp_ref(__comp));
+    std::__sort<_Comp_ref>(std::__unwrap_iter(__first), std::__unwrap_iter(__last), _Comp_ref(__comp));
   }
 }
 
 template <class _RandomAccessIterator>
 inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 void sort(_RandomAccessIterator __first,
                                                                          _RandomAccessIterator __last) {
-  _VSTD::sort(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
+  std::sort(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__bits b/libcxx/include/__bits
--- a/libcxx/include/__bits
+++ b/libcxx/include/__bits
@@ -53,6 +53,21 @@
 inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
 int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { return __builtin_popcountll(__x); }
 
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
+unsigned __libcpp_blsr(unsigned  __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
+unsigned long __libcpp_blsr(unsigned long __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
+unsigned long long __libcpp_blsr(unsigned long long __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
 #else  // _LIBCPP_COMPILER_MSVC
 
 // Precondition:  __x != 0