diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -15,6 +15,7 @@
 #include <__algorithm/min_element.h>
 #include <__algorithm/partial_sort.h>
 #include <__algorithm/unwrap_iter.h>
+#include <__assert>
 #include <__config>
 #include <__utility/swap.h>
 #include <climits>
@@ -70,7 +71,7 @@
 template <class _Compare, class _ForwardIterator>
 unsigned __sort4(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3, _ForwardIterator __x4,
                  _Compare __c) {
-  unsigned __r = _VSTD::__sort3<_Compare>(__x1, __x2, __x3, __c);
+  unsigned __r = std::__sort3<_Compare>(__x1, __x2, __x3, __c);
   if (__c(*__x4, *__x3)) {
     swap(*__x3, *__x4);
     ++__r;
@@ -91,7 +92,7 @@
 template <class _Compare, class _ForwardIterator>
 _LIBCPP_HIDDEN unsigned __sort5(_ForwardIterator __x1, _ForwardIterator __x2, _ForwardIterator __x3,
                                 _ForwardIterator __x4, _ForwardIterator __x5, _Compare __c) {
-  unsigned __r = _VSTD::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
+  unsigned __r = std::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
   if (__c(*__x5, *__x4)) {
     swap(*__x4, *__x5);
     ++__r;
@@ -153,52 +154,52 @@
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort3_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _Compare __c) {
-  _VSTD::__cond_swap<_Compare>(__x2, __x3, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x1, __x2, __x3, __c);
+  std::__cond_swap<_Compare>(__x2, __x3, __c);
+  std::__partially_sorted_swap<_Compare>(__x1, __x2, __x3, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<!__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort3_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _Compare __c) {
-  _VSTD::__sort3<_Compare>(__x1, __x2, __x3, __c);
+  std::__sort3<_Compare>(__x1, __x2, __x3, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort4_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _Compare __c) {
-  _VSTD::__cond_swap<_Compare>(__x1, __x3, __c);
-  _VSTD::__cond_swap<_Compare>(__x2, __x4, __c);
-  _VSTD::__cond_swap<_Compare>(__x1, __x2, __c);
-  _VSTD::__cond_swap<_Compare>(__x3, __x4, __c);
-  _VSTD::__cond_swap<_Compare>(__x2, __x3, __c);
+  std::__cond_swap<_Compare>(__x1, __x3, __c);
+  std::__cond_swap<_Compare>(__x2, __x4, __c);
+  std::__cond_swap<_Compare>(__x1, __x2, __c);
+  std::__cond_swap<_Compare>(__x3, __x4, __c);
+  std::__cond_swap<_Compare>(__x2, __x3, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<!__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort4_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _Compare __c) {
-  _VSTD::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
+  std::__sort4<_Compare>(__x1, __x2, __x3, __x4, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort5_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _RandomAccessIterator __x5, _Compare __c) {
-  _VSTD::__cond_swap<_Compare>(__x1, __x2, __c);
-  _VSTD::__cond_swap<_Compare>(__x4, __x5, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x3, __x4, __x5, __c);
-  _VSTD::__cond_swap<_Compare>(__x2, __x5, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x1, __x3, __x4, __c);
-  _VSTD::__partially_sorted_swap<_Compare>(__x2, __x3, __x4, __c);
+  std::__cond_swap<_Compare>(__x1, __x2, __c);
+  std::__cond_swap<_Compare>(__x4, __x5, __c);
+  std::__partially_sorted_swap<_Compare>(__x3, __x4, __x5, __c);
+  std::__cond_swap<_Compare>(__x2, __x5, __c);
+  std::__partially_sorted_swap<_Compare>(__x1, __x3, __x4, __c);
+  std::__partially_sorted_swap<_Compare>(__x2, __x3, __x4, __c);
 }
 
 template <class _Compare, class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI __enable_if_t<!__use_branchless_sort<_Compare, _RandomAccessIterator>::value, void>
 __sort5_maybe_branchless(_RandomAccessIterator __x1, _RandomAccessIterator __x2, _RandomAccessIterator __x3,
                          _RandomAccessIterator __x4, _RandomAccessIterator __x5, _Compare __c) {
-  _VSTD::__sort5<_Compare>(__x1, __x2, __x3, __x4, __x5, __c);
+  std::__sort5<_Compare>(__x1, __x2, __x3, __x4, __x5, __c);
 }
 
 // Assumes size > 0
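For context on the `__sort*_maybe_branchless` family above: `__cond_swap` orders a pair of elements with data movement instead of a taken branch, so a fixed comparator network sorts small inputs with no branch mispredictions. A minimal standalone sketch of the idea on plain `int` (illustrative names, not the library's internal API; the library additionally fuses steps via `__partially_sorted_swap`):

```cpp
#include <cassert>

// Branch-free ordering of two values: the ternaries typically lower to
// conditional moves, so control flow never depends on the data.
inline void cond_swap(int& a, int& b) {
  bool r  = b < a;
  int tmp = r ? b : a; // smaller of the two
  b       = r ? a : b; // larger of the two
  a       = tmp;
}

// Three conditional swaps form a valid sorting network for three elements.
inline void sort3_network(int& x1, int& x2, int& x3) {
  cond_swap(x2, x3);
  cond_swap(x1, x2);
  cond_swap(x2, x3);
}

int main() {
  int a = 3, b = 1, c = 2;
  sort3_network(a, b, c);
  assert(a == 1 && b == 2 && c == 3);
}
```

`__use_branchless_sort` gates this path because the transformation only pays off when the comparison itself is branch-free, e.g. `<` on arithmetic types.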
@@ -207,31 +208,16 @@
                       _Compare __comp) {
   _BidirectionalIterator __lm1 = __last;
   for (--__lm1; __first != __lm1; ++__first) {
-    _BidirectionalIterator __i = _VSTD::min_element(__first, __last, __comp);
+    _BidirectionalIterator __i = std::min_element(__first, __last, __comp);
     if (__i != __first)
       swap(*__first, *__i);
   }
 }
 
-template <class _Compare, class _BidirectionalIterator>
-void __insertion_sort(_BidirectionalIterator __first, _BidirectionalIterator __last, _Compare __comp) {
-  typedef typename iterator_traits<_BidirectionalIterator>::value_type value_type;
-  if (__first != __last) {
-    _BidirectionalIterator __i = __first;
-    for (++__i; __i != __last; ++__i) {
-      _BidirectionalIterator __j = __i;
-      value_type __t(_VSTD::move(*__j));
-      for (_BidirectionalIterator __k = __i; __k != __first && __comp(__t, *--__k); --__j)
-        *__j = _VSTD::move(*__k);
-      *__j = _VSTD::move(__t);
-    }
-  }
-}
-
 // Sort the iterator range [__first, __last) using the comparator __comp using
 // the insertion sort algorithm.
 template <class _Compare, class _RandomAccessIterator>
-void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+void __insertion_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
   if (__first == __last)
@@ -239,14 +225,14 @@
   for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
     _RandomAccessIterator __j = __i - difference_type(1);
     if (__comp(*__i, *__j)) {
-      value_type __t(_VSTD::move(*__i));
+      value_type __t(std::move(*__i));
       _RandomAccessIterator __k = __j;
       __j = __i;
       do {
-        *__j = _VSTD::move(*__k);
+        *__j = std::move(*__k);
         __j = __k;
       } while (__j != __first && __comp(__t, *--__k));
-      *__j = _VSTD::move(__t);
+      *__j = std::move(__t);
     }
   }
 }
@@ -257,7 +243,7 @@
 // Assumes that there is an element in the position (__first - 1) and that each
 // element in the input range is greater or equal to the element at __first - 1.
 template <class _Compare, class _RandomAccessIterator>
-void __insertion_sort_3_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+void __insertion_sort_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
   if (__first == __last)
@@ -265,14 +251,14 @@
   for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
     _RandomAccessIterator __j = __i - difference_type(1);
     if (__comp(*__i, *__j)) {
-      value_type __t(_VSTD::move(*__i));
+      value_type __t(std::move(*__i));
       _RandomAccessIterator __k = __j;
       __j = __i;
       do {
-        *__j = _VSTD::move(*__k);
+        *__j = std::move(*__k);
         __j = __k;
       } while (__comp(__t, *--__k)); // No need for bounds check due to the assumption stated above.
-      *__j = _VSTD::move(__t);
+      *__j = std::move(__t);
     }
   }
 }
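The guarded/unguarded split above is the point of the rename: `__insertion_sort_unguarded` may drop the `__j != __first` bounds check from its inner loop because an element no greater than anything in the range is known to sit at `__first - 1` and acts as a sentinel. A standalone sketch on raw `int` arrays (illustrative, not the library code):

```cpp
#include <utility>

// Guarded: the inner loop must check that it has not reached the start.
void insertion_sort(int* first, int* last) {
  if (first == last)
    return;
  for (int* i = first + 1; i != last; ++i) {
    int t = std::move(*i);
    int* j = i;
    while (j != first && t < *(j - 1)) { // bounds check on every step
      *j = std::move(*(j - 1));
      --j;
    }
    *j = std::move(t);
  }
}

// Unguarded: assumes *(first - 1) exists and is <= every element of
// [first, last), so the comparison must fail before j walks past first.
void insertion_sort_unguarded(int* first, int* last) {
  if (first == last)
    return;
  for (int* i = first + 1; i != last; ++i) {
    int t = std::move(*i);
    int* j = i;
    while (t < *(j - 1)) { // the sentinel makes the bounds check redundant
      *j = std::move(*(j - 1));
      --j;
    }
    *j = std::move(t);
  }
}
```

In `__introsort` below, the unguarded variant is only called for non-leftmost subranges, where the element just before the subrange is a former pivot known to be no greater than anything in it.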
@@ -289,32 +275,32 @@
     swap(*__first, *__last);
     return true;
   case 3:
-    _VSTD::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
+    std::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
     return true;
   case 4:
-    _VSTD::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                              --__last, __comp);
+    std::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                            --__last, __comp);
     return true;
   case 5:
-    _VSTD::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                              __first + difference_type(3), --__last, __comp);
+    std::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                            __first + difference_type(3), --__last, __comp);
     return true;
   }
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
   _RandomAccessIterator __j = __first + difference_type(2);
-  _VSTD::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), __j, __comp);
+  std::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), __j, __comp);
   const unsigned __limit = 8;
   unsigned __count = 0;
   for (_RandomAccessIterator __i = __j + difference_type(1); __i != __last; ++__i) {
     if (__comp(*__i, *__j)) {
-      value_type __t(_VSTD::move(*__i));
+      value_type __t(std::move(*__i));
       _RandomAccessIterator __k = __j;
       __j = __i;
       do {
-        *__j = _VSTD::move(*__k);
+        *__j = std::move(*__k);
         __j = __k;
       } while (__j != __first && __comp(__t, *--__k));
-      *__j = _VSTD::move(__t);
+      *__j = std::move(__t);
       if (++__count == __limit)
         return ++__i == __last;
     }
@@ -331,19 +317,19 @@
   __destruct_n __d(0);
   unique_ptr<value_type, __destruct_n&> __h(__first2, __d);
   value_type* __last2 = __first2;
-  ::new ((void*)__last2) value_type(_VSTD::move(*__first1));
+  ::new ((void*)__last2) value_type(std::move(*__first1));
   __d.template __incr<value_type>();
   for (++__last2; ++__first1 != __last1; ++__last2) {
     value_type* __j2 = __last2;
     value_type* __i2 = __j2;
     if (__comp(*__first1, *--__i2)) {
-      ::new ((void*)__j2) value_type(_VSTD::move(*__i2));
+      ::new ((void*)__j2) value_type(std::move(*__i2));
       __d.template __incr<value_type>();
       for (--__j2; __i2 != __first2 && __comp(*__first1, *--__i2); --__j2)
-        *__j2 = _VSTD::move(*__i2);
-      *__j2 = _VSTD::move(*__first1);
+        *__j2 = std::move(*__i2);
+      *__j2 = std::move(*__first1);
     } else {
-      ::new ((void*)__j2) value_type(_VSTD::move(*__first1));
+      ::new ((void*)__j2) value_type(std::move(*__first1));
       __d.template __incr<value_type>();
     }
   }
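`__insertion_sort_incomplete` (the `@@ -289` hunk above) bounds the work spent on an unsorted input: it counts displaced elements and gives up after eight, so callers pay only O(n) to confirm a nearly-sorted range. Its control flow, reduced to a standalone `int` sketch (illustrative; the small-size special cases are omitted):

```cpp
#include <utility>

// Insertion-sort [first, last), but give up after `limit` displaced
// elements; returns true iff the range ended up fully sorted.
bool insertion_sort_incomplete(int* first, int* last) {
  const unsigned limit = 8; // same effort budget as the hunk above
  unsigned count = 0;
  if (first == last)
    return true;
  for (int* i = first + 1; i != last; ++i) {
    if (*i < *(i - 1)) {
      int t = std::move(*i);
      int* j = i;
      do {
        *j = std::move(*(j - 1));
        --j;
      } while (j != first && t < *(j - 1));
      *j = std::move(t);
      if (++count == limit)
        return i + 1 == last; // sorted only if this was the final element
    }
  }
  return true;
}
```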
@@ -351,135 +337,65 @@
   }
 }
 
-struct __64bit_set {
-  typedef uint64_t __storage_t;
-  enum { __block_size = 64 };
-  static __storage_t __blsr(__storage_t x) {
-    // _blsr_u64 can be used here but it did not make any performance
-    // difference in practice.
-    return x ^ (x & -x);
-  }
-  static int __clz(__storage_t x) { return __builtin_clzll(x); }
-  static int __ctz(__storage_t x) { return __builtin_ctzll(x); }
-  static int __popcount(__storage_t x) { return __builtin_popcountll(x); }
-};
-
-struct __32bit_set {
-  typedef uint32_t __storage_t;
-  enum { __block_size = 32 };
-  static __storage_t __blsr(__storage_t x) {
-    // _blsr_u32 can be used here but it did not make any performance
-    // difference in practice.
-    return x ^ (x & -x);
-  }
-  static int __clz(__storage_t x) { return __builtin_clzl(x); }
-  static int __ctz(__storage_t x) { return __builtin_ctzl(x); }
-  static int __popcount(__storage_t x) { return __builtin_popcountl(x); }
-};
-
-template <class _Bitset, class _RandomAccessIterator>
+template <class _RandomAccessIterator>
 inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos(_RandomAccessIterator __first, _RandomAccessIterator __last,
-                                                    typename _Bitset::__storage_t& __left_bitset,
-                                                    typename _Bitset::__storage_t& __right_bitset) {
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+                                                    uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   // Swap one pair on each iteration as long as both bitsets have at least one
   // element for swapping.
   while (__left_bitset != 0 && __right_bitset != 0) {
-    difference_type tz_left = _Bitset::__ctz(__left_bitset);
-    __left_bitset = _Bitset::__blsr(__left_bitset);
-    difference_type tz_right = _Bitset::__ctz(__right_bitset);
-    __right_bitset = _Bitset::__blsr(__right_bitset);
-    _VSTD::iter_swap(__first + tz_left, __last - tz_right);
+    difference_type tz_left = __libcpp_ctz(__left_bitset);
+    __left_bitset = __libcpp_blsr(__left_bitset);
+    difference_type tz_right = __libcpp_ctz(__right_bitset);
+    __right_bitset = __libcpp_blsr(__right_bitset);
+    std::iter_swap(__first + tz_left, __last - tz_right);
   }
 }
 
-// Partition [__first, __last) using the comparator __comp. *__first has the
-// chosen pivot. Elements that are equivalent are kept to the left of the
-// pivot. Returns the iterator for the pivot and a bool value which is true if
-// the provided range is already sorted, false otherwise. We assume that the
-// length of the range is at least three elements.
-//
-// We term the partitioning algorithm as bitset partitioning since the
-// outcomes of the comparisons between the pivot and other elements are stored
-// in the fixed size bitsets.
-template <class _Bitset, class _RandomAccessIterator, class _Compare>
-_LIBCPP_HIDE_FROM_ABI _VSTD::pair<_RandomAccessIterator, bool>
-__bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename _Bitset::__storage_t __storage_t;
-  _RandomAccessIterator __begin = __first;
-  value_type __pivot(_VSTD::move(*__first));
-  // Find the first element greater than the pivot.
-  if (__comp(__pivot, *(__last - difference_type(1)))) {
-    // Not guarded since we know the last element is greater than the pivot.
-    while (!__comp(__pivot, *++__first)) {
-    }
-  } else {
-    while (++__first < __last && !__comp(__pivot, *__first)) {
-    }
-  }
-  // Find the last element less than or equal to the pivot.
-  if (__first < __last) {
-    // It will be always guarded because __introsort will do the median-of-three
-    // before calling this.
-    while (__comp(__pivot, *--__last)) {
-    }
+template <class _Compare, class _RandomAccessIterator,
+          class _ValueType = typename std::iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void __populate_left_bitset(_RandomAccessIterator __first, _Compare __comp,
+                                                         _ValueType& __pivot, uint64_t& __left_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
+  // Possible vectorization. With a proper "-march" flag, the following loop
+  // will be compiled into a set of SIMD instructions.
+  _RandomAccessIterator __iter = __first;
+  for (int __j = 0; __j < __block_size;) {
+    bool __comp_result = !__comp(*__iter, __pivot);
+    __left_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+    __j++;
+    ++__iter;
   }
-  // If the first element greater than the pivot is at or after the
-  // last element less than or equal to the pivot, then we have covered the
-  // entire range without swapping elements. This implies the range is already
-  // partitioned.
-  bool __already_partitioned = __first >= __last;
-  if (!__already_partitioned) {
-    _VSTD::iter_swap(__first, __last);
-    ++__first;
-  }
-
-  // In [__first, __last) __last is not inclusive. From now on, it uses last
-  // minus one to be inclusive on both sides.
-  _RandomAccessIterator __lm1 = __last - difference_type(1);
-  __storage_t __left_bitset = 0;
-  __storage_t __right_bitset = 0;
+}
 
-  // Reminder: length = __lm1 - __first + 1.
-  while (__lm1 - __first >= 2 * _Bitset::__block_size - 1) {
-    // Record the comparison outcomes for the elements currently on the left
-    // side.
-    if (__left_bitset == 0) {
-      // Possible vectorization. With a proper "-march" flag, the following loop
-      // will be compiled into a set of SIMD instructions.
-      _RandomAccessIterator __iter = __first;
-      for (int __j = 0; __j < _Bitset::__block_size;) {
-        bool __comp_result = !__comp(*__iter, __pivot);
-        __left_bitset |= (static_cast<__storage_t>(__comp_result) << __j);
-        __j++;
-        ++__iter;
-      }
-    }
-    // Record the comparison outcomes for the elements currently on the right
-    // side.
-    if (__right_bitset == 0) {
-      // Possible vectorization. With a proper "-march" flag, the following loop
-      // will be compiled into a set of SIMD instructions.
-      _RandomAccessIterator __iter = __lm1;
-      for (int __j = 0; __j < _Bitset::__block_size;) {
-        bool __comp_result = __comp(*__iter, __pivot);
-        __right_bitset |= (static_cast<__storage_t>(__comp_result) << __j);
-        __j++;
-        --__iter;
-      }
-    }
-    // Swap the elements recorded to be the candidates for swapping in the
-    // bitsets.
-    __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
-    // Only advance the iterator if all the elements that need to be moved to
-    // the other side were moved.
-    __first += (__left_bitset == 0) ? difference_type(_Bitset::__block_size) : difference_type(0);
-    __lm1 -= (__right_bitset == 0) ? difference_type(_Bitset::__block_size) : difference_type(0);
+template <class _Compare, class _RandomAccessIterator,
+          class _ValueType = typename std::iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void __populate_right_bitset(_RandomAccessIterator __lm1, _Compare __comp,
+                                                          _ValueType& __pivot, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
+  // Possible vectorization. With a proper "-march" flag, the following loop
+  // will be compiled into a set of SIMD instructions.
+  _RandomAccessIterator __iter = __lm1;
+  for (int __j = 0; __j < __block_size;) {
+    bool __comp_result = __comp(*__iter, __pivot);
+    __right_bitset |= (static_cast<uint64_t>(__comp_result) << __j);
+    __j++;
+    --__iter;
   }
-  // Now, we have a less-than a block worth of elements on at least one of the
-  // sides.
+}
+
+template <class _Compare, class _RandomAccessIterator,
+          class _ValueType = typename std::iterator_traits<_RandomAccessIterator>::value_type>
+inline _LIBCPP_HIDE_FROM_ABI void
+__bitset_partition_partial_blocks(_RandomAccessIterator& __first, _RandomAccessIterator& __lm1, _Compare __comp,
+                                  _ValueType& __pivot, uint64_t& __left_bitset, uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
   difference_type __remaining_len = __lm1 - __first + 1;
   difference_type __l_size;
   difference_type __r_size;
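The rewritten `__swap_bitmap_pos` in the hunk above walks the set bits of each mask with count-trailing-zeros and clears the lowest set bit via `__libcpp_blsr`. The underlying iteration pattern, sketched standalone (GCC/Clang builtin; the helper name is hypothetical):

```cpp
#include <cstdint>

// Visit the index of every set bit in mask, lowest first. x & (x - 1)
// clears the lowest set bit and is equivalent to x ^ (x & -x), the BLSR
// operation that __libcpp_blsr implements.
template <class Visit>
void for_each_set_bit(uint64_t mask, Visit visit) {
  while (mask != 0) {
    visit(__builtin_ctzll(mask)); // index of the lowest set bit
    mask &= (mask - 1);           // clear it
  }
}
```

`__swap_bitmap_pos` effectively runs this walk over both masks in lockstep: the i-th set bit of the left mask (an offset forward from `__first`) is paired with the i-th set bit of the right mask (an offset backward from `__last`) and the two elements are swapped.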
@@ -488,18 +404,18 @@
     __r_size = __remaining_len - __l_size;
   } else if (__left_bitset == 0) {
     // We know at least one side is a full block.
-    __l_size = __remaining_len - _Bitset::__block_size;
-    __r_size = _Bitset::__block_size;
+    __l_size = __remaining_len - __block_size;
+    __r_size = __block_size;
   } else { // if (__right_bitset == 0)
-    __l_size = _Bitset::__block_size;
-    __r_size = __remaining_len - _Bitset::__block_size;
+    __l_size = __block_size;
+    __r_size = __remaining_len - __block_size;
   }
   // Record the comparison outcomes for the elements currently on the left side.
   if (__left_bitset == 0) {
     _RandomAccessIterator __iter = __first;
     for (int j = 0; j < __l_size; j++) {
       bool __comp_result = !__comp(*__iter, __pivot);
-      __left_bitset |= (static_cast<__storage_t>(__comp_result) << j);
+      __left_bitset |= (static_cast<uint64_t>(__comp_result) << j);
       ++__iter;
     }
   }
@@ -509,24 +425,30 @@
     _RandomAccessIterator __iter = __lm1;
     for (int j = 0; j < __r_size; j++) {
       bool __comp_result = __comp(*__iter, __pivot);
-      __right_bitset |= (static_cast<__storage_t>(__comp_result) << j);
+      __right_bitset |= (static_cast<uint64_t>(__comp_result) << j);
       --__iter;
     }
   }
-  __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
+  __swap_bitmap_pos(__first, __lm1, __left_bitset, __right_bitset);
   __first += (__left_bitset == 0) ? __l_size : 0;
   __lm1 -= (__right_bitset == 0) ? __r_size : 0;
-  // At least one of the bitsets would be empty. For the non-empty one, we need
-  // to properly partition the elements that appear within that bitset.
+}
+
+template <class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos_within(_RandomAccessIterator& __first,
+                                                           _RandomAccessIterator& __lm1, uint64_t& __left_bitset,
+                                                           uint64_t& __right_bitset) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
   if (__left_bitset) {
     // Swap within the left side. Need to find set positions in the reverse
     // order.
     while (__left_bitset != 0) {
-      difference_type __tz_left = _Bitset::__block_size - 1 - _Bitset::__clz(__left_bitset);
-      __left_bitset &= (static_cast<__storage_t>(1) << __tz_left) - 1;
+      difference_type __tz_left = __block_size - 1 - __libcpp_clz(__left_bitset);
+      __left_bitset &= (static_cast<uint64_t>(1) << __tz_left) - 1;
       _RandomAccessIterator it = __first + __tz_left;
       if (it != __lm1) {
-        _VSTD::iter_swap(it, __lm1);
+        std::iter_swap(it, __lm1);
       }
       --__lm1;
     }
@@ -535,22 +457,100 @@
     // Swap within the right side. Need to find set positions in the reverse
     // order.
     while (__right_bitset != 0) {
-      difference_type __tz_right = _Bitset::__block_size - 1 - _Bitset::__clz(__right_bitset);
-      __right_bitset &= (static_cast<__storage_t>(1) << __tz_right) - 1;
+      difference_type __tz_right = __block_size - 1 - __libcpp_clz(__right_bitset);
+      __right_bitset &= (static_cast<uint64_t>(1) << __tz_right) - 1;
       _RandomAccessIterator it = __lm1 - __tz_right;
       if (it != __first) {
-        _VSTD::iter_swap(it, __first);
+        std::iter_swap(it, __first);
       }
       ++__first;
     }
   }
+}
+
+// Partition [__first, __last) using the comparator __comp. *__first has the
+// chosen pivot. Elements that are equivalent are kept to the left of the
+// pivot. Returns the iterator for the pivot and a bool value which is true if
+// the provided range is already sorted, false otherwise. We assume that the
+// length of the range is at least three elements.
+//
+// __bitset_partition uses bitsets for storing outcomes of the comparisons
+// between the pivot and other elements.
+template <class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
+__bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  _LIBCPP_ASSERT(__last - __first >= difference_type(3), "");
+
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(std::move(*__first));
+  // Size in bits for the bitset in use.
+  _LIBCPP_CONSTEXPR difference_type __block_size = sizeof(uint64_t) * 8;
+  // Find the first element greater than the pivot.
+  if (__comp(__pivot, *(__last - difference_type(1)))) {
+    // Not guarded since we know the last element is greater than the pivot.
+    while (!__comp(__pivot, *++__first)) {
+    }
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {
+    }
+  }
+  // Find the last element less than or equal to the pivot.
+  if (__first < __last) {
+    // It will be always guarded because __introsort will do the median-of-three
+    // before calling this.
+    while (__comp(__pivot, *--__last)) {
+    }
+  }
+  // If the first element greater than the pivot is at or after the
+  // last element less than or equal to the pivot, then we have covered the
+  // entire range without swapping elements. This implies the range is already
+  // partitioned.
+  bool __already_partitioned = __first >= __last;
+  if (!__already_partitioned) {
+    std::iter_swap(__first, __last);
+    ++__first;
+  }
+
+  // In [__first, __last) __last is not inclusive. From now on, it uses last
+  // minus one to be inclusive on both sides.
+  _RandomAccessIterator __lm1 = __last - difference_type(1);
+  uint64_t __left_bitset = 0;
+  uint64_t __right_bitset = 0;
+
+  // Reminder: length = __lm1 - __first + 1.
+  while (__lm1 - __first >= 2 * __block_size - 1) {
+    // Record the comparison outcomes for the elements currently on the left
+    // side.
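To make the block mechanics of `__bitset_partition` concrete: for each 64-element block, one comparison outcome is recorded per bit, and the masks then drive the swapping without re-comparing anything. A self-contained miniature over two non-overlapping 64-`int` blocks (an illustration of the technique, not the libc++ routine; leftover bits are deliberately left for a follow-up pass, as in the real code):

```cpp
#include <algorithm>
#include <cstdint>

// left points at a 64-element block at the low end of the range; right_last
// points at the last element of a 64-element block at the high end.
void partition_two_blocks(int* left, int* right_last, int pivot) {
  uint64_t left_mask = 0, right_mask = 0;
  // One bit per element; fixed-trip-count, branch-free loops like these are
  // what the "possible vectorization" comments refer to.
  for (int j = 0; j < 64; ++j)
    left_mask |= (uint64_t(!(left[j] < pivot)) << j); // belongs on the right
  for (int j = 0; j < 64; ++j)
    right_mask |= (uint64_t(right_last[-j] < pivot) << j); // belongs on the left
  // Pair one misplaced element from each side per iteration.
  while (left_mask != 0 && right_mask != 0) {
    std::swap(left[__builtin_ctzll(left_mask)],
              right_last[-__builtin_ctzll(right_mask)]);
    left_mask &= (left_mask - 1); // clear the lowest set bit (BLSR)
    right_mask &= (right_mask - 1);
  }
  // Whichever mask is still non-zero marks elements that remain misplaced;
  // the real implementation resolves those in __swap_bitmap_pos_within.
}
```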
+    if (__left_bitset == 0)
+      __populate_left_bitset<_Compare>(__first, __comp, __pivot, __left_bitset);
+    // Record the comparison outcomes for the elements currently on the right
+    // side.
+    if (__right_bitset == 0)
+      __populate_right_bitset<_Compare>(__lm1, __comp, __pivot, __right_bitset);
+    // Swap the elements recorded to be the candidates for swapping in the
+    // bitsets.
+    __swap_bitmap_pos(__first, __lm1, __left_bitset, __right_bitset);
+    // Only advance the iterator if all the elements that need to be moved to
+    // the other side were moved.
+    __first += (__left_bitset == 0) ? difference_type(__block_size) : difference_type(0);
+    __lm1 -= (__right_bitset == 0) ? difference_type(__block_size) : difference_type(0);
+  }
+  // Now, we have less than a block worth of elements on at least one of the
+  // sides.
+  __bitset_partition_partial_blocks(__first, __lm1, __comp, __pivot, __left_bitset, __right_bitset);
+  // At least one of the bitsets would be empty. For the non-empty one, we need
+  // to properly partition the elements that appear within that bitset.
+  __swap_bitmap_pos_within(__first, __lm1, __left_bitset, __right_bitset);
+
+  // Move the pivot to its correct position.
   _RandomAccessIterator __pivot_pos = __first - difference_type(1);
   if (__begin != __pivot_pos) {
-    *__begin = _VSTD::move(*__pivot_pos);
+    *__begin = std::move(*__pivot_pos);
   }
-  *__pivot_pos = _VSTD::move(__pivot);
-  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
+  *__pivot_pos = std::move(__pivot);
+  return std::make_pair(__pivot_pos, __already_partitioned);
 }
 
 // Partition [__first, __last) using the comparator __comp. *__first has the
@@ -559,12 +559,13 @@
 // the provided range is already sorted, false otherwise. We assume that the
 // length of the range is at least three elements.
 template <class _RandomAccessIterator, class _Compare>
-_LIBCPP_HIDE_FROM_ABI _VSTD::pair<_RandomAccessIterator, bool>
+_LIBCPP_HIDE_FROM_ABI std::pair<_RandomAccessIterator, bool>
 __partition_with_equals_on_right(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  _LIBCPP_ASSERT(__last - __first >= difference_type(3), "");
   _RandomAccessIterator __begin = __first;
-  value_type __pivot(_VSTD::move(*__first));
+  value_type __pivot(std::move(*__first));
   // Find the first element greater or equal to the pivot. It will be always
   // guarded because __introsort will do the median-of-three before calling
   // this.
@@ -589,7 +590,7 @@
   // right of the pivot and the other to left of the pivot) that are not on the
   // correct side of the pivot.
   while (__first < __last) {
-    _VSTD::iter_swap(__first, __last);
+    std::iter_swap(__first, __last);
     while (__comp(*++__first, __pivot))
       ;
     while (!__comp(*--__last, __pivot))
@@ -598,10 +599,10 @@
   // Move the pivot to its correct position.
   _RandomAccessIterator __pivot_pos = __first - difference_type(1);
   if (__begin != __pivot_pos) {
-    *__begin = _VSTD::move(*__pivot_pos);
+    *__begin = std::move(*__pivot_pos);
   }
-  *__pivot_pos = _VSTD::move(__pivot);
-  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
+  *__pivot_pos = std::move(__pivot);
+  return std::make_pair(__pivot_pos, __already_partitioned);
 }
 
 // Similar to the above function.
 Elements equivalent to the pivot are put to
@@ -611,9 +612,9 @@
                                 _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename std::iterator_traits<_RandomAccessIterator>::value_type value_type;
   _RandomAccessIterator __begin = __first;
-  value_type __pivot(_VSTD::move(*__first));
+  value_type __pivot(std::move(*__first));
   if (__comp(__pivot, *(__last - difference_type(1)))) {
     // Guarded.
     while (!__comp(__pivot, *++__first)) {
@@ -630,7 +631,7 @@
     }
   }
   while (__first < __last) {
-    _VSTD::iter_swap(__first, __last);
+    std::iter_swap(__first, __last);
     while (!__comp(__pivot, *++__first))
       ;
     while (__comp(__pivot, *--__last))
@@ -638,9 +639,9 @@
   }
   _RandomAccessIterator __pivot_pos = __first - difference_type(1);
   if (__begin != __pivot_pos) {
-    *__begin = _VSTD::move(*__pivot_pos);
+    *__begin = std::move(*__pivot_pos);
   }
-  *__pivot_pos = _VSTD::move(__pivot);
+  *__pivot_pos = std::move(__pivot);
   return __first;
 }
@@ -650,16 +651,16 @@
 // - Tukey's ninther technique for computing the pivot,
 // - check on whether partition was not required.
 // The implementation is partly based on Orson Peters' pattern-defeating
-// quicksort, published at: <https://github.com/orlp/pdqsort>
+// quicksort, published at: <https://github.com/orlp/pdqsort>.
 template <class _Compare, class _RandomAccessIterator, bool _UseBitSetPartition>
 void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
                  typename iterator_traits<_RandomAccessIterator>::difference_type __depth,
                  bool __leftmost = true) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
   // Upper bound for using insertion sort for sorting.
-  _LIBCPP_CONSTEXPR_AFTER_CXX11 difference_type __limit = 24;
+  _LIBCPP_CONSTEXPR difference_type __limit = 24;
   // Lower bound for using Tukey's ninther technique for median computation.
-  _LIBCPP_CONSTEXPR_AFTER_CXX11 difference_type __ninther_threshold = 128;
+  _LIBCPP_CONSTEXPR difference_type __ninther_threshold = 128;
   while (true) {
     difference_type __len = __last - __first;
     switch (__len) {
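Tukey's ninther, used above once a range exceeds `__ninther_threshold`, estimates the median from nine samples: the median of the medians of three spread-out triples. A compact value-based sketch using the same sample positions as the `__sort3` calls in the next hunk (`__introsort` instead sorts each triple in place, so the pivot candidate also lands at a known position):

```cpp
#include <algorithm>

// Median of three values.
inline int median3(int a, int b, int c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

// Tukey's ninther over a[0..len), len large enough that the nine sample
// positions are distinct (the code above requires len > 128).
inline int ninther(const int* a, int len) {
  int h = len / 2;
  int m1 = median3(a[0], a[h],     a[len - 1]);
  int m2 = median3(a[1], a[h - 1], a[len - 2]);
  int m3 = median3(a[2], a[h + 1], a[len - 3]);
  return median3(m1, m2, m3);
}
```

Nine samples track the true median far better than three on large ranges, which keeps quicksort's split ratios stable for the cost of a few extra comparisons.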
@@ -671,30 +672,29 @@
       swap(*__first, *__last);
       return;
     case 3:
-      _VSTD::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
+      std::__sort3_maybe_branchless<_Compare>(__first, __first + difference_type(1), --__last, __comp);
       return;
     case 4:
-      _VSTD::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                                --__last, __comp);
+      std::__sort4_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                              --__last, __comp);
       return;
     case 5:
-      _VSTD::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
-                                                __first + difference_type(3), --__last, __comp);
+      std::__sort5_maybe_branchless<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                                              __first + difference_type(3), --__last, __comp);
       return;
     }
-    // Use insertion sort if the length of the range is below the specified
-    // limit.
+    // Use insertion sort if the length of the range is below the specified limit.
     if (__len < __limit) {
      if (__leftmost) {
-        _VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp);
+        std::__insertion_sort<_Compare>(__first, __last, __comp);
       } else {
-        _VSTD::__insertion_sort_3_unguarded<_Compare>(__first, __last, __comp);
+        std::__insertion_sort_unguarded<_Compare>(__first, __last, __comp);
       }
       return;
     }
     if (__depth == 0) {
       // Fallback to heap sort as Introsort suggests.
-      _VSTD::__partial_sort<_Compare>(__first, __last, __last, __comp);
+      std::__partial_sort<_Compare>(__first, __last, __last, __comp);
       return;
     }
     --__depth;
@@ -703,15 +703,15 @@
       // Use Tukey's ninther technique or median of 3 for pivot selection
       // depending on the length of the range being sorted.
       if (__len > __ninther_threshold) {
-        _VSTD::__sort3<_Compare>(__first, __first + __half_len, __last - difference_type(1), __comp);
-        _VSTD::__sort3<_Compare>(__first + difference_type(1), __first + (__half_len - 1), __last - difference_type(2),
-                                 __comp);
-        _VSTD::__sort3<_Compare>(__first + difference_type(2), __first + (__half_len + 1), __last - difference_type(3),
-                                 __comp);
-        _VSTD::__sort3<_Compare>(__first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1), __comp);
-        _VSTD::iter_swap(__first, __first + __half_len);
+        std::__sort3<_Compare>(__first, __first + __half_len, __last - difference_type(1), __comp);
+        std::__sort3<_Compare>(__first + difference_type(1), __first + (__half_len - 1), __last - difference_type(2),
+                               __comp);
+        std::__sort3<_Compare>(__first + difference_type(2), __first + (__half_len + 1), __last - difference_type(3),
+                               __comp);
+        std::__sort3<_Compare>(__first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1), __comp);
+        std::iter_swap(__first, __first + __half_len);
       } else {
-        _VSTD::__sort3<_Compare>(__first + __half_len, __first, __last - difference_type(1), __comp);
+        std::__sort3<_Compare>(__first + __half_len, __first, __last - difference_type(1), __comp);
       }
     }
     // The elements to the left of the current iterator range are already
@@ -728,14 +728,14 @@
     }
     // Use bitset partition only if asked for.
     auto __ret = _UseBitSetPartition
-                     ? __bitset_partition<__64bit_set, _RandomAccessIterator, _Compare>(__first, __last, __comp)
+                     ? __bitset_partition<_RandomAccessIterator, _Compare>(__first, __last, __comp)
                      : __partition_with_equals_on_right<_RandomAccessIterator, _Compare>(__first, __last, __comp);
     _RandomAccessIterator __i = __ret.first;
     // [__first, __i) < *__i and *__i <= [__i+1, __last)
     // If we were given a perfect partition, see if insertion sort is quick...
     if (__ret.second) {
-      bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
-      if (_VSTD::__insertion_sort_incomplete<_Compare>(__i + difference_type(1), __last, __comp)) {
+      bool __fs = std::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
+      if (std::__insertion_sort_incomplete<_Compare>(__i + difference_type(1), __last, __comp)) {
         if (__fs)
           return;
         __last = __i;
@@ -747,9 +747,8 @@
       }
     }
-    // Sort the left partition recursively and the right partition with tail
-    // recursion elimination.
-    _VSTD::__introsort<_Compare, _RandomAccessIterator, _UseBitSetPartition>(__first, __i, __comp, __depth, __leftmost);
+    // Sort the left partition recursively and the right partition with tail recursion elimination.
+    std::__introsort<_Compare, _RandomAccessIterator, _UseBitSetPartition>(__first, __i, __comp, __depth, __leftmost);
     __leftmost = false;
     __first = ++__i;
   }
@@ -772,15 +771,14 @@
   // Only use bitset partitioning for arithmetic types. We should also check
   // that the default comparator is in use so that we are sure that there are no
   // branches in the comparator.
-  _VSTD::__introsort<_Compare, _RandomAccessIterator,
-                     _VSTD::is_arithmetic<typename iterator_traits<_RandomAccessIterator>::value_type>::value>(
+  std::__introsort<_Compare, _RandomAccessIterator, __use_branchless_sort<_Compare, _RandomAccessIterator>::value>(
       __first, __last, __comp, __depth_limit);
 }
 
 template <class _Tp>
 inline _LIBCPP_INLINE_VISIBILITY void __sort(_Tp** __first, _Tp** __last, __less<_Tp*>&) {
   __less<uintptr_t> __comp;
-  _VSTD::__sort<__less<uintptr_t>&, uintptr_t*>((uintptr_t*)__first, (uintptr_t*)__last, __comp);
+  std::__sort<__less<uintptr_t>&, uintptr_t*>((uintptr_t*)__first, (uintptr_t*)__last, __comp);
 }
 
 _LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less<char>&, char*>(char*, char*, __less<char>&))
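The switch from the inline `is_arithmetic` test to `__use_branchless_sort` in the hunk above centralizes the eligibility condition the comment describes. Its gist can be expressed as a trait along these lines (an illustration only; the header's actual `__use_branchless_sort` is defined elsewhere in sort.h and its exact condition may differ):

```cpp
#include <functional>
#include <iterator>
#include <type_traits>

// Sketch: branchless/bitset paths pay off only when the element type is
// arithmetic and the comparator is known to compile down to a single
// branch-free comparison, e.g. std::less on an arithmetic type.
template <class Comp, class Iter,
          class Tp = typename std::iterator_traits<Iter>::value_type>
struct use_branchless_sort
    : std::integral_constant<bool,
                             std::is_arithmetic<Tp>::value &&
                                 (std::is_same<Comp, std::less<Tp> >::value ||
                                  std::is_same<Comp, std::greater<Tp> >::value)> {};
```

Reusing one trait for both the small-sort networks and the partitioner also keeps the two code paths from drifting apart.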
@@ -827,16 +825,16 @@
   _LIBCPP_DEBUG_RANDOMIZE_RANGE(__first, __last);
   typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
   if (__libcpp_is_constant_evaluated()) {
-    _VSTD::__partial_sort<_Comp_ref>(__first, __last, __last, _Comp_ref(__comp));
+    std::__partial_sort<_Comp_ref>(__first, __last, __last, _Comp_ref(__comp));
   } else {
-    _VSTD::__sort<_Comp_ref>(_VSTD::__unwrap_iter(__first), _VSTD::__unwrap_iter(__last), _Comp_ref(__comp));
+    std::__sort<_Comp_ref>(std::__unwrap_iter(__first), std::__unwrap_iter(__last), _Comp_ref(__comp));
   }
 }
 
 template <class _RandomAccessIterator>
 inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_AFTER_CXX17 void
 sort(_RandomAccessIterator __first, _RandomAccessIterator __last) {
-  _VSTD::sort(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
+  std::sort(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
 }
 
 _LIBCPP_END_NAMESPACE_STD
diff --git a/libcxx/include/__bits b/libcxx/include/__bits
--- a/libcxx/include/__bits
+++ b/libcxx/include/__bits
@@ -53,6 +53,21 @@
 inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
 int __libcpp_popcount(unsigned long long __x) _NOEXCEPT { return __builtin_popcountll(__x); }
 
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
+unsigned __libcpp_blsr(unsigned __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
+unsigned long __libcpp_blsr(unsigned long __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
+inline _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR
+unsigned long long __libcpp_blsr(unsigned long long __x) _NOEXCEPT {
+  return __x ^ (__x & -__x);
+}
+
 #else // _LIBCPP_COMPILER_MSVC
 
 // Precondition: __x != 0
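The new `__libcpp_blsr` overloads port the semantics of the x86 BLSR instruction (reset lowest set bit) to plain C++: `x & -x` isolates the lowest set bit, and xoring it back out clears it, which yields the same value as `x & (x - 1)`. Because the overloads are constexpr, the identity can be spot-checked at compile time (standalone sketch):

```cpp
#include <cstdint>

constexpr uint64_t blsr(uint64_t x) { return x ^ (x & -x); }

static_assert(blsr(0xB0) == 0xA0, "clears the lowest set bit");
static_assert(blsr(0xB0) == (0xB0 & (0xB0 - 1)), "matches x & (x - 1)");
static_assert(blsr(0) == 0, "zero has no set bit to clear");
```

Three unsigned overloads are provided so the helper resolves without surprising integral promotions, mirroring the existing `__libcpp_ctz`/`__libcpp_clz`/`__libcpp_popcount` trios.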