diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -3946,541 +3946,6 @@ } } -template -void -__sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) -{ - // _Compare is known to be a reference type - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; - typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; - const difference_type __limit = is_trivially_copy_constructible::value && - is_trivially_copy_assignable::value ? 30 : 6; - while (true) - { - __restart: - difference_type __len = __last - __first; - switch (__len) - { - case 0: - case 1: - return; - case 2: - if (__comp(*--__last, *__first)) - swap(*__first, *__last); - return; - case 3: - _VSTD::__sort3<_Compare>(__first, __first+1, --__last, __comp); - return; - case 4: - _VSTD::__sort4<_Compare>(__first, __first+1, __first+2, --__last, __comp); - return; - case 5: - _VSTD::__sort5<_Compare>(__first, __first+1, __first+2, __first+3, --__last, __comp); - return; - } - if (__len <= __limit) - { - _VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp); - return; - } - // __len > 5 - _RandomAccessIterator __m = __first; - _RandomAccessIterator __lm1 = __last; - --__lm1; - unsigned __n_swaps; - { - difference_type __delta; - if (__len >= 1000) - { - __delta = __len/2; - __m += __delta; - __delta /= 2; - __n_swaps = _VSTD::__sort5<_Compare>(__first, __first + __delta, __m, __m+__delta, __lm1, __comp); - } - else - { - __delta = __len/2; - __m += __delta; - __n_swaps = _VSTD::__sort3<_Compare>(__first, __m, __lm1, __comp); - } - } - // *__m is median - // partition [__first, __m) < *__m and *__m <= [__m, __last) - // (this inhibits tossing elements equivalent to __m around unnecessarily) - _RandomAccessIterator __i = __first; - _RandomAccessIterator __j = __lm1; - // j points beyond range to be tested, *__m is known to be <= *__lm1 - // The search going up is known to be guarded but the search coming down isn't. - // Prime the downward search with a guard. 
- if (!__comp(*__i, *__m)) // if *__first == *__m - { - // *__first == *__m, *__first doesn't go in first part - // manually guard downward moving __j against __i - while (true) - { - if (__i == --__j) - { - // *__first == *__m, *__m <= all other elements - // Parition instead into [__first, __i) == *__first and *__first < [__i, __last) - ++__i; // __first + 1 - __j = __last; - if (!__comp(*__first, *--__j)) // we need a guard if *__first == *(__last-1) - { - while (true) - { - if (__i == __j) - return; // [__first, __last) all equivalent elements - if (__comp(*__first, *__i)) - { - swap(*__i, *__j); - ++__n_swaps; - ++__i; - break; - } - ++__i; - } - } - // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1 - if (__i == __j) - return; - while (true) - { - while (!__comp(*__first, *__i)) - ++__i; - while (__comp(*__first, *--__j)) - ; - if (__i >= __j) - break; - swap(*__i, *__j); - ++__n_swaps; - ++__i; - } - // [__first, __i) == *__first and *__first < [__i, __last) - // The first part is sorted, sort the secod part - // _VSTD::__sort<_Compare>(__i, __last, __comp); - __first = __i; - goto __restart; - } - if (__comp(*__j, *__m)) - { - swap(*__i, *__j); - ++__n_swaps; - break; // found guard for downward moving __j, now use unguarded partition - } - } - } - // It is known that *__i < *__m - ++__i; - // j points beyond range to be tested, *__m is known to be <= *__lm1 - // if not yet partitioned... - if (__i < __j) - { - // known that *(__i - 1) < *__m - // known that __i <= __m - while (true) - { - // __m still guards upward moving __i - while (__comp(*__i, *__m)) - ++__i; - // It is now known that a guard exists for downward moving __j - while (!__comp(*--__j, *__m)) - ; - if (__i > __j) - break; - swap(*__i, *__j); - ++__n_swaps; - // It is known that __m != __j - // If __m just moved, follow it - if (__m == __i) - __m = __j; - ++__i; - } - } - // [__first, __i) < *__m and *__m <= [__i, __last) - if (__i != __m && __comp(*__m, *__i)) - { - swap(*__i, *__m); - ++__n_swaps; - } - // [__first, __i) < *__i and *__i <= [__i+1, __last) - // If we were given a perfect partition, see if insertion sort is quick... - if (__n_swaps == 0) - { - bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp); - if (_VSTD::__insertion_sort_incomplete<_Compare>(__i+1, __last, __comp)) - { - if (__fs) - return; - __last = __i; - continue; - } - else - { - if (__fs) - { - __first = ++__i; - continue; - } - } - } - // sort smaller range with recursive call and larger with tail recursion elimination - if (__i - __first < __last - __i) - { - _VSTD::__sort<_Compare>(__first, __i, __comp); - // _VSTD::__sort<_Compare>(__i+1, __last, __comp); - __first = ++__i; - } - else - { - _VSTD::__sort<_Compare>(__i+1, __last, __comp); - // _VSTD::__sort<_Compare>(__first, __i, __comp); - __last = __i; - } - } -} - -namespace __bitsetsort { -struct __64bit_set { - typedef uint64_t __storage_t; - constexpr static int __block_size = 64; - static __storage_t __blsr(__storage_t __x) { - // _blsr_u64 can be used here but it did not make any performance - // difference in practice. 
- return __x ^ (__x & -__x); - } - static int __clz(__storage_t __x) { return __builtin_clzll(__x); } - static int __ctz(__storage_t __x) { return __builtin_ctzll(__x); } -}; - -struct __32bit_set { - typedef uint32_t __storage_t; - constexpr static int __block_size = 32; - static __storage_t __blsr(__storage_t __x) { - // _blsr_u32 can be used here but it did not make any performance - // difference in practice. - return __x ^ (__x & -__x); - } - static int __clz(__storage_t __x) { return __builtin_clzl(__x); } - static int __ctz(__storage_t __x) { return __builtin_ctzl(__x); } -}; - -template -struct __set_selector { - typedef __64bit_set __set; -}; - -template<> -struct __set_selector<4> { - typedef __32bit_set __set; -}; - -template -inline void __swap_bitmap_pos(_RandomAccessIterator __first, - _RandomAccessIterator __last, - typename _Bitset::__storage_t& __left_bitset, - typename _Bitset::__storage_t& __right_bitset) { - while (__left_bitset != 0 & __right_bitset != 0) { - int tz_left = _Bitset::__ctz(__left_bitset); - __left_bitset = _Bitset::__blsr(__left_bitset); - - int tz_right = _Bitset::__ctz(__right_bitset); - __right_bitset = _Bitset::__blsr(__right_bitset); - - std::iter_swap(__first + tz_left, __last - tz_right); - } -} - -template -std::pair<_RandomAccessIterator, bool> __bitset_partition( - _RandomAccessIterator __first, _RandomAccessIterator __last, - _Compare __comp) { - typedef typename std::iterator_traits<_RandomAccessIterator>::value_type - value_type; - typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type - difference_type; - typedef typename _Bitset::__storage_t __storage_t; - _RandomAccessIterator __begin = __first; - value_type __pivot = std::move(*__first); - - if (__comp(__pivot, *(__last - 1))) { - // Guarded. - while (!__comp(__pivot, *++__first)) {} - } else { - while (++__first < __last && !__comp(__pivot, *__first)) {} - } - - if (__first < __last) { - // It will be always guarded because __bitset_sort will do the median-of-three before calling this. - while (__comp(__pivot, *--__last)) {} - } - bool __already_partitioned = __first >= __last; - if (!__already_partitioned) { - std::iter_swap(__first, __last); - ++__first; - } - - // [__first, __last) - __last is not inclusive. From now one, it uses last minus one to be inclusive on both sides. - _RandomAccessIterator __lm1 = __last - 1; - __storage_t __left_bitset = 0; - __storage_t __right_bitset = 0; - - // Reminder: length = __lm1 - __first + 1. - while (__lm1 - __first >= 2 * _Bitset::__block_size - 1) { - if (__left_bitset == 0) { - // Possible vectorization. - _RandomAccessIterator __iter = __first; - for (int __j = 0; __j < _Bitset::__block_size;) { - __left_bitset |= (static_cast<__storage_t>(__comp(__pivot, *__iter)) << __j); - __j++; - __iter++; - } - } - - if (__right_bitset == 0) { - _RandomAccessIterator __iter = __lm1; - for (int __j = 0; __j < _Bitset::__block_size;) { - __right_bitset |= - (static_cast<__storage_t>(!__comp(__pivot, *__iter)) << __j); - __j++; - __iter--; - } - } - - __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset); - __first += (__left_bitset == 0) ? _Bitset::__block_size : 0; - __lm1 -= (__right_bitset == 0) ? _Bitset::__block_size : 0; - } - // Now, we have a less-than a block. 
- difference_type __remaining_len = __lm1 - __first + 1; - difference_type __l_size; - difference_type __r_size; - if (__left_bitset == 0 && __right_bitset == 0) { - __l_size = __remaining_len / 2; - __r_size = __remaining_len - __l_size; - } else if (__left_bitset == 0) { - // We know at least one side is a full block. - __l_size = __remaining_len - _Bitset::__block_size; - __r_size = _Bitset::__block_size; - } else { // if (right == 0) - __l_size = _Bitset::__block_size; - __r_size = __remaining_len - _Bitset::__block_size; - } - if (__left_bitset == 0) { - _RandomAccessIterator __iter = __first; - for (int j = 0; j < __l_size; j++) { - __left_bitset |= - (static_cast<__storage_t>(__comp(__pivot, *(__iter))) << j); - __iter++; - } - } - if (__right_bitset == 0) { - _RandomAccessIterator __iter = __lm1; - for (int j = 0; j < __r_size; j++) { - __right_bitset |= - (static_cast<__storage_t>(!__comp(__pivot, *(__iter))) << j); - --__iter; - } - } - __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset); - __first += (__left_bitset == 0) ? __l_size : 0; - __lm1 -= (__right_bitset == 0) ? __r_size : 0; - - if (__left_bitset) { - // Swap within the right side. - int __tz_left; - - // Need to find set positions in the reverse order. - while (__left_bitset != 0) { - __tz_left = _Bitset::__block_size - 1 - _Bitset::__clz(__left_bitset); - __left_bitset &= (static_cast<__storage_t>(1) << __tz_left) - 1; - std::iter_swap(__first + __tz_left, __lm1--); - } - __first = __lm1 + 1; - } else if (__right_bitset) { - // Swap within the left side. - int __tz_right; - // Need to find set positions in the reverse order. - while (__right_bitset != 0) { - __tz_right = _Bitset::__block_size - 1 - _Bitset::__clz(__right_bitset); - __right_bitset &= (static_cast<__storage_t>(1) << __tz_right) - 1; - std::iter_swap(__lm1 - __tz_right, __first++); - } - } - - _RandomAccessIterator __pivot_pos = __first - 1; - *__begin = std::move(*__pivot_pos); - *__pivot_pos = std::move(__pivot); - return std::make_pair(__pivot_pos, __already_partitioned); -} -} // __bitsetsort -constexpr int __ninther_threshold = 128; -template -void -__bitset_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) -{ - // _Compare is known to be a reference type - typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; - //const difference_type __limit = 32; - typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; - const difference_type __limit = is_trivially_copy_constructible::value && - is_trivially_copy_assignable::value ? 
30 : 6; - while (true) - { - difference_type __len = __last - __first; - switch (__len) - { - case 0: - case 1: - return; - case 2: - if (__comp(*--__last, *__first)) - swap(*__first, *__last); - return; - case 3: - _VSTD::__sort3<_Compare>(__first, __first+1, --__last, __comp); - return; - case 4: - _VSTD::__sort4<_Compare>(__first, __first+1, __first+2, --__last, __comp); - return; - case 5: - _VSTD::__sort5<_Compare>(__first, __first+1, __first+2, __first+3, --__last, __comp); - return; - } - if (__len <= __limit) - { - _VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp); - return; - } - // __len > 5 - { - difference_type __delta = __len / 2; - if (__len > __ninther_threshold) { - __sort3(__first, __first + __delta, __last - 1, __comp); - __sort3(__first + 1, __first + (__delta - 1), __last - 2, __comp); - __sort3(__first + 2, __first + (__delta + 1), __last - 3, __comp); - __sort3(__first + (__delta - 1), __first + __delta, - __first + (__delta + 1), __comp); - std::iter_swap(__first, __first + __delta); - } - else - { - _VSTD::__sort3<_Compare>(__first + __delta, __first, __last - 1, __comp); - } - } - auto __ret = __bitsetsort::__bitset_partition<__bitsetsort::__set_selector::__set>(__first, __last, __comp); - _RandomAccessIterator __i = __ret.first; // pivot - bool __already_partitioned = __ret.second; - - // [__first, __i) < *__i and *__i <= [__i+1, __last) - // If we were given a perfect partition, see if insertion sort is quick... - if (__already_partitioned) - { - bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp); - if (_VSTD::__insertion_sort_incomplete<_Compare>(__i+1, __last, __comp)) - { - if (__fs) - return; - __last = __i; - continue; - } - else - { - if (__fs) - { - __first = ++__i; - continue; - } - } - } - // sort smaller range with recursive call and larger with tail recursion elimination - if (__i - __first < __last - __i) - { - __bitset_sort<_Compare>(__first, __i, __comp); - __first = ++__i; - } - else - { - __bitset_sort<_Compare>(__i+1, __last, __comp); - __last = __i; - } - } -} - -// This forwarder keeps the top call and the recursive calls using the same instantiation, forcing a reference _Compare -template -inline _LIBCPP_INLINE_VISIBILITY -void -sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) -{ - typedef typename __comp_ref_type<_Compare>::type _Comp_ref; - _VSTD::__bitset_sort<_Comp_ref>(__first, __last, _Comp_ref(__comp)); -} - -template -inline _LIBCPP_INLINE_VISIBILITY -void -sort(_RandomAccessIterator __first, _RandomAccessIterator __last) -{ - _VSTD::sort(__first, __last, __less::value_type>()); -} - -template -inline _LIBCPP_INLINE_VISIBILITY -void -sort(_Tp** __first, _Tp** __last) -{ - _VSTD::sort((uintptr_t*)__first, (uintptr_t*)__last, __less()); -} - -template -inline _LIBCPP_INLINE_VISIBILITY -void -sort(__wrap_iter<_Tp*> __first, __wrap_iter<_Tp*> __last) -{ - _VSTD::sort(__first.base(), __last.base()); -} - -template -inline _LIBCPP_INLINE_VISIBILITY -void -sort(__wrap_iter<_Tp*> __first, __wrap_iter<_Tp*> __last, _Compare __comp) -{ - typedef typename add_lvalue_reference<_Compare>::type _Comp_ref; - _VSTD::sort<_Tp*, _Comp_ref>(__first.base(), __last.base(), __comp); -} - -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, char*>(char*, char*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, wchar_t*>(wchar_t*, wchar_t*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, signed char*>(signed char*, signed char*, 
__less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, unsigned char*>(unsigned char*, unsigned char*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, short*>(short*, short*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, unsigned short*>(unsigned short*, unsigned short*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, int*>(int*, int*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, unsigned*>(unsigned*, unsigned*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, long*>(long*, long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, unsigned long*>(unsigned long*, unsigned long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, long long*>(long long*, long long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, unsigned long long*>(unsigned long long*, unsigned long long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, float*>(float*, float*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, double*>(double*, double*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __sort<__less&, long double*>(long double*, long double*, __less&)) - -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, char*>(char*, char*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, wchar_t*>(wchar_t*, wchar_t*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, signed char*>(signed char*, signed char*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned char*>(unsigned char*, unsigned char*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, short*>(short*, short*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned short*>(unsigned short*, unsigned short*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, int*>(int*, int*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned*>(unsigned*, unsigned*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, long*>(long*, long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned long*>(unsigned long*, unsigned long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, long long*>(long long*, long long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, unsigned long long*>(unsigned long long*, unsigned long long*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, float*>(float*, float*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, double*>(double*, double*, __less&)) -_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS bool __insertion_sort_incomplete<__less&, long double*>(long double*, long double*, __less&)) - _LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS unsigned __sort5<__less&, long double*>(long double*, long double*, long double*, long double*, long double*, __less&)) // lower_bound @@ -5178,164 +4643,899 @@ _VSTD::__sift_up<_Comp_ref>(__first, __last, __comp, __last - __first); } -template 
-inline _LIBCPP_INLINE_VISIBILITY -void -push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) -{ - _VSTD::push_heap(__first, __last, __less::value_type>()); -} +template +inline _LIBCPP_INLINE_VISIBILITY +void +push_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) +{ + _VSTD::push_heap(__first, __last, __less::value_type>()); +} + +// pop_heap + +template +void +__sift_down(_RandomAccessIterator __first, _RandomAccessIterator /*__last*/, + _Compare __comp, + typename iterator_traits<_RandomAccessIterator>::difference_type __len, + _RandomAccessIterator __start) +{ + typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; + typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; + // left-child of __start is at 2 * __start + 1 + // right-child of __start is at 2 * __start + 2 + difference_type __child = __start - __first; + + if (__len < 2 || (__len - 2) / 2 < __child) + return; + + __child = 2 * __child + 1; + _RandomAccessIterator __child_i = __first + __child; + + if ((__child + 1) < __len && __comp(*__child_i, *(__child_i + 1))) { + // right-child exists and is greater than left-child + ++__child_i; + ++__child; + } + + // check if we are in heap-order + if (__comp(*__child_i, *__start)) + // we are, __start is larger than it's largest child + return; + + value_type __top(_VSTD::move(*__start)); + do + { + // we are not in heap-order, swap the parent with it's largest child + *__start = _VSTD::move(*__child_i); + __start = __child_i; + + if ((__len - 2) / 2 < __child) + break; + + // recompute the child based off of the updated parent + __child = 2 * __child + 1; + __child_i = __first + __child; + + if ((__child + 1) < __len && __comp(*__child_i, *(__child_i + 1))) { + // right-child exists and is greater than left-child + ++__child_i; + ++__child; + } + + // check if we are in heap-order + } while (!__comp(*__child_i, __top)); + *__start = _VSTD::move(__top); +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +__pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, + typename iterator_traits<_RandomAccessIterator>::difference_type __len) +{ + if (__len > 1) + { + swap(*__first, *--__last); + _VSTD::__sift_down<_Compare>(__first, __last, __comp, __len - 1, __first); + } +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) +{ + typedef typename __comp_ref_type<_Compare>::type _Comp_ref; + _VSTD::__pop_heap<_Comp_ref>(__first, __last, __comp, __last - __first); +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) +{ + _VSTD::pop_heap(__first, __last, __less::value_type>()); +} + +// make_heap + +template +void +__make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) +{ + typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; + difference_type __n = __last - __first; + if (__n > 1) + { + // start from the first parent, there is no need to consider children + for (difference_type __start = (__n - 2) / 2; __start >= 0; --__start) + { + _VSTD::__sift_down<_Compare>(__first, __last, __comp, __n, __first + __start); + } + } +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) +{ + typedef typename __comp_ref_type<_Compare>::type _Comp_ref; + 
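  // [review note] _Comp_ref is the reference type selected by __comp_ref_type,
  // so the comparator is passed into __make_heap and its __sift_down calls by
  // reference instead of being copied at each call.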
_VSTD::__make_heap<_Comp_ref>(__first, __last, __comp); +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) +{ + _VSTD::make_heap(__first, __last, __less::value_type>()); +} + +// sort_heap + +template +void +__sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) +{ + typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; + for (difference_type __n = __last - __first; __n > 1; --__last, (void) --__n) + _VSTD::__pop_heap<_Compare>(__first, __last, __comp, __n); +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) +{ + typedef typename __comp_ref_type<_Compare>::type _Comp_ref; + _VSTD::__sort_heap<_Comp_ref>(__first, __last, __comp); +} + +template +inline _LIBCPP_INLINE_VISIBILITY +void +sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last) +{ + _VSTD::sort_heap(__first, __last, __less::value_type>()); +} + +// sort + +namespace __sorting_network { + +template +class __conditional_swap { + _Compare comp_; + + public: + _Compare get() const { return comp_; } + __conditional_swap(_Compare __comp) : comp_(__comp) {} + inline void operator()(_RandomAccessIterator __x, + _RandomAccessIterator __y) { + typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type + value_type; + bool __result = comp_(*__x, *__y); + // Expect a compiler would short-circuit the following if-block. + // 4 * sizeof(size_t) is a magic number. Expect a compiler to use SIMD + // instruction on them. + if (_VSTD::is_trivially_copy_constructible::value && + _VSTD::is_trivially_copy_assignable::value && + sizeof(value_type) <= 4 * sizeof(size_t)) { + value_type __min = __result ? _VSTD::move(*__x) : _VSTD::move(*__y); + *__y = __result ? _VSTD::move(*__y) : _VSTD::move(*__x); + *__x = _VSTD::move(__min); + } else { + if (!__result) { + _VSTD::iter_swap(__x, __y); + } + } + } +}; + +template +class __reverse_conditional_swap { + _Compare comp_; + + public: + _Compare get() const { return comp_; } + __reverse_conditional_swap(_Compare __comp) : comp_(__comp) {} + inline void operator()(_RandomAccessIterator __x, + _RandomAccessIterator __y) { + typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type + value_type; + bool __result = !comp_(*__x, *__y); + // Expect a compiler would short-circuit the following if-block. + if (_VSTD::is_trivially_copy_constructible::value && + _VSTD::is_trivially_copy_assignable::value && + sizeof(value_type) <= 4 * sizeof(size_t)) { + value_type __min = __result ? _VSTD::move(*__x) : _VSTD::move(*__y); + *__y = __result ? 
_VSTD::move(*__y) : _VSTD::move(*__x); + *__x = _VSTD::move(__min); + } else { + if (__result) { + _VSTD::iter_swap(__x, __y); + } + } + } +}; + +template +void __sort2(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 0, __a + 1); +} + +template +void __sort3(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 1, __a + 2); + __cond_swap(__a + 0, __a + 2); + __cond_swap(__a + 0, __a + 1); +} + +template +void __sort4(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 0, __a + 1); + __cond_swap(__a + 2, __a + 3); + __cond_swap(__a + 0, __a + 2); + __cond_swap(__a + 1, __a + 3); + __cond_swap(__a + 1, __a + 2); +} + +template +void __sort5(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 0, __a + 1); + __cond_swap(__a + 3, __a + 4); + __cond_swap(__a + 2, __a + 4); + __cond_swap(__a + 2, __a + 3); + __cond_swap(__a + 0, __a + 3); + __cond_swap(__a + 1, __a + 4); + __cond_swap(__a + 0, __a + 2); + __cond_swap(__a + 1, __a + 3); + __cond_swap(__a + 1, __a + 2); +} + +template +void __sort6(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 1, __a + 2); + __cond_swap(__a + 4, __a + 5); + __cond_swap(__a + 0, __a + 2); + __cond_swap(__a + 3, __a + 5); + __cond_swap(__a + 0, __a + 1); + __cond_swap(__a + 3, __a + 4); + __cond_swap(__a + 0, __a + 3); + __cond_swap(__a + 1, __a + 4); + __cond_swap(__a + 2, __a + 5); + __cond_swap(__a + 2, __a + 4); + __cond_swap(__a + 1, __a + 3); + __cond_swap(__a + 2, __a + 3); +} +template +void __sort7(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 1, __a + 2); + __cond_swap(__a + 3, __a + 4); + __cond_swap(__a + 5, __a + 6); + __cond_swap(__a + 0, __a + 2); + __cond_swap(__a + 3, __a + 5); + __cond_swap(__a + 4, __a + 6); + __cond_swap(__a + 0, __a + 1); + __cond_swap(__a + 4, __a + 5); + __cond_swap(__a + 0, __a + 4); + __cond_swap(__a + 1, __a + 5); + __cond_swap(__a + 2, __a + 6); + __cond_swap(__a + 0, __a + 3); + __cond_swap(__a + 2, __a + 5); + __cond_swap(__a + 1, __a + 3); + __cond_swap(__a + 2, __a + 4); + __cond_swap(__a + 2, __a + 3); +} + +template +void __sort8(_RandomAccessIterator __a, _ConditionalSwap __cond_swap) { + __cond_swap(__a + 0, __a + 1); + __cond_swap(__a + 2, __a + 3); + __cond_swap(__a + 4, __a + 5); + __cond_swap(__a + 6, __a + 7); + __cond_swap(__a + 0, __a + 2); + __cond_swap(__a + 1, __a + 3); + __cond_swap(__a + 4, __a + 6); + __cond_swap(__a + 5, __a + 7); + __cond_swap(__a + 1, __a + 2); + __cond_swap(__a + 5, __a + 6); + __cond_swap(__a + 0, __a + 4); + __cond_swap(__a + 1, __a + 5); + __cond_swap(__a + 2, __a + 6); + __cond_swap(__a + 3, __a + 7); + __cond_swap(__a + 1, __a + 4); + __cond_swap(__a + 3, __a + 6); + __cond_swap(__a + 2, __a + 4); + __cond_swap(__a + 3, __a + 5); + __cond_swap(__a + 3, __a + 4); +} + +template +void __sort1to8( + _RandomAccessIterator __a, + typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type __len, + _ConditionalSwap __cond_swap) { + switch (__len) { + case 0: + case 1: + return; + case 2: + __sort2(__a, __cond_swap); + return; + case 3: + __sort3(__a, __cond_swap); + return; + case 4: + __sort4(__a, __cond_swap); + return; + case 5: + __sort5(__a, __cond_swap); + return; + case 6: + __sort6(__a, __cond_swap); + return; + case 7: + __sort7(__a, __cond_swap); + return; + case 8: + __sort8(__a, __cond_swap); + return; + } + // ignore +} +template +void __sort3(_RandomAccessIterator __a0, 
_RandomAccessIterator __a1, _RandomAccessIterator __a2, _ConditionalSwap __cond_swap) { + __cond_swap(__a1, __a2); + __cond_swap(__a0, __a2); + __cond_swap(__a0, __a1); +} +} // namespace __sorting_network + +namespace __bitonic { +class __detail { + public: + enum { + __batch = 8, + __bitonic_batch = __batch * 2, + __small_sort_max = __bitonic_batch * 2, + }; +}; + +template +void __enforce_order(_RandomAccessIterator __first, + _RandomAccessIterator __last, _ConditionalSwap __cond_swap, + _ReverseConditionalSwap __reverse_cond_swap) { + _RandomAccessIterator __i = __first; + while (__i + __detail::__bitonic_batch <= __last) { + __sorting_network::__sort8(__i, __cond_swap); + __sorting_network::__sort8(__i + __detail::__batch, __reverse_cond_swap); + __i += __detail::__bitonic_batch; + } + if (__i + __detail::__batch <= __last) { + __sorting_network::__sort8(__i, __cond_swap); + __i += __detail::__batch; + __sorting_network::__sort1to8(__i, __last - __i, __reverse_cond_swap); + } else { + __sorting_network::__sort1to8(__i, __last - __i, __cond_swap); + } +} + +class __construct { + public: + template + static inline void __op(_Type* __result, _Type&& __val) { + new (__result) _Type(_VSTD::move(__val)); + } +}; + +class __move_assign { + public: + template + static inline void __op(_Type* __result, _Type&& __val) { + *__result = _VSTD::move(__val); + } +}; + +template +void __forward_merge(_InputIterator __first, _InputIterator __last, + _OutputIterator __result, _Compare __comp) { + --__last; + typename _VSTD::iterator_traits<_InputIterator>::difference_type __len = + __last - __first; + for (; __len > 0; __len--) { + if (__comp(*__first, *__last)) { + _Copy::__op(&*__result, _VSTD::move(*__first++)); + } else { + _Copy::__op(&*__result, _VSTD::move(*__last--)); + } + __result++; + } + _Copy::__op(&*__result, _VSTD::move(*__first)); +} + +template +void __backward_merge(_InputIterator __first, _InputIterator __last, + _OutputIterator __result, _Compare __comp) { + --__last; + __result += __last - __first; + typename _VSTD::iterator_traits<_InputIterator>::difference_type __len = + __last - __first; + for (; __len > 0; __len--) { + if (__comp(*__first, *__last)) { + _Copy::__op(&*__result, _VSTD::move(*__first++)); + } else { + _Copy::__op(&*__result, _VSTD::move(*__last--)); + } + __result--; + } + _Copy::__op(&*__result, _VSTD::move(*__first)); +} + +template +void __forward_and_backward_merge(_InputIterator __first, _InputIterator __last, + _InputIterator __rlast, + _OutputIterator __result, _Compare __comp) { + _InputIterator __rfirst = __last; + __last--; + __rlast--; + typename _VSTD::iterator_traits<_InputIterator>::difference_type len = + __last - __first; + _OutputIterator __rout = __result + (__rlast - __first); + + for (; len > 0; len--) { + if (__comp(*__first, *__last)) { + _Copy::__op(&*__result, _VSTD::move(*__first++)); + } else { + _Copy::__op(&*__result, _VSTD::move(*__last--)); + } + __result++; + if (__comp(*__rfirst, *__rlast)) { + _Copy::__op(&*__rout, _VSTD::move(*__rfirst++)); + } else { + _Copy::__op(&*__rout, _VSTD::move(*__rlast--)); + } + __rout--; + } + _Copy::__op(&*__result, _VSTD::move(*__first)); + _Copy::__op(&*__rout, _VSTD::move(*__rfirst)); +} + +template +inline bool __small_sort( + _RandomAccessIterator __first, + typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type __len, + typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type* __buff, + _ConditionalSwap __cond_swap, + _ReverseConditionalSwap __reverse_cond_swap) { + 
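  // [review note] __small_sort handles __len <= __small_sort_max (32).
  // __enforce_order below leaves 8-element batches sorted in alternating
  // directions, so lengths up to 8 come back fully sorted, lengths 9..16 need
  // one bitonic merge through __buff, and 17..32 take two merges into __buff
  // plus a final merge back into place.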
typedef + typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type; + if (__len > __detail::__small_sort_max) { + return false; + } + _RandomAccessIterator __last = __first + __len; + __enforce_order(__first, __last, __cond_swap, __reverse_cond_swap); + if (__len <= __detail::__batch) { + // sorted. + return true; + } + auto __comp = __cond_swap.get(); + if (__len <= __detail::__bitonic_batch) { + // single bitonic order merge. + __forward_merge<__bitonic::__construct>(__first, __last, __buff, __comp); + copy(_VSTD::make_move_iterator(__buff), _VSTD::make_move_iterator(__buff + __len), + __first); + for (auto __iter = __buff; __iter < __buff + __len; __iter++) { + (*__iter).~value_type(); + } + return true; + } + // double bitonic order merge. + __forward_merge<__construct>(__first, __first + __detail::__bitonic_batch, + __buff, __comp); + __backward_merge<__construct>(__first + __detail::__bitonic_batch, __last, + __buff + __detail::__bitonic_batch, __comp); + __forward_merge<__move_assign>(__buff, __buff + __len, __first, __comp); + for (auto __iter = __buff; __iter < __buff + __len; __iter++) { + (*__iter).~value_type(); + } + return true; +} +} // namespace __bitonic + +namespace __bitsetsort { +struct __64bit_set { + typedef uint64_t __storage_t; + enum { __block_size = 64 }; + static __storage_t __blsr(__storage_t x) { + // _blsr_u64 can be used here but it did not make any performance + // difference in practice. + return x ^ (x & -x); + } + static int __clz(__storage_t x) { return __builtin_clzll(x); } + static int __ctz(__storage_t x) { return __builtin_ctzll(x); } +}; + +struct __32bit_set { + typedef uint32_t __storage_t; + enum { __block_size = 32 }; + static __storage_t __blsr(__storage_t x) { + // _blsr_u32 can be used here but it did not make any performance + // difference in practice. 
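    // [review note] In two's complement, x & -x isolates the lowest set bit
    // (e.g. x = 01101000b gives x & -x = 00001000b), so the XOR below clears
    // exactly that bit -- the same operation the BLSR instruction performs.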
+    return x ^ (x & -x);
+  }
+  static int __clz(__storage_t x) { return __builtin_clz(x); }
+  static int __ctz(__storage_t x) { return __builtin_ctz(x); }
+};
+
+template <int _Width>
+struct __set_selector {
+  typedef __64bit_set __set;
+};
+
+template <>
+struct __set_selector<4> {
+  typedef __32bit_set __set;
+};
+
+template <class _Bitset, class _RandomAccessIterator>
+inline void __swap_bitmap_pos(_RandomAccessIterator __first,
+                              _RandomAccessIterator __last,
+                              typename _Bitset::__storage_t& __left_bitset,
+                              typename _Bitset::__storage_t& __right_bitset) {
+  while (__left_bitset != 0 && __right_bitset != 0) {
+    int tz_left = _Bitset::__ctz(__left_bitset);
+    __left_bitset = _Bitset::__blsr(__left_bitset);
+    int tz_right = _Bitset::__ctz(__right_bitset);
+    __right_bitset = _Bitset::__blsr(__right_bitset);
+    _VSTD::iter_swap(__first + tz_left, __last - tz_right);
+  }
+}
+
+template <class _Bitset, class _RandomAccessIterator>
+inline void __swap_bitmap(_RandomAccessIterator __first,
+                          _RandomAccessIterator __last,
+                          typename _Bitset::__storage_t& __left_bitset,
+                          typename _Bitset::__storage_t& __right_bitset) {
+  if (__left_bitset == 0 || __right_bitset == 0) {
+    return;
+  }
+  int tz_left;
+  int tz_right;
-// pop_heap
+  tz_left = _Bitset::__ctz(__left_bitset);
+  __left_bitset = _Bitset::__blsr(__left_bitset);
-template <class _Compare, class _RandomAccessIterator>
-void
-__sift_down(_RandomAccessIterator __first, _RandomAccessIterator /*__last*/,
-            _Compare __comp,
-            typename iterator_traits<_RandomAccessIterator>::difference_type __len,
-            _RandomAccessIterator __start)
-{
-    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-    typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
-    // left-child of __start is at 2 * __start + 1
-    // right-child of __start is at 2 * __start + 2
-    difference_type __child = __start - __first;
+  tz_right = _Bitset::__ctz(__right_bitset);
+  __right_bitset = _Bitset::__blsr(__right_bitset);
-    if (__len < 2 || (__len - 2) / 2 < __child)
-        return;
+  _RandomAccessIterator l = __first + tz_left;
+  _RandomAccessIterator r = __last - tz_right;
+  typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type tmp(
+      _VSTD::move(*l));
+  *l = _VSTD::move(*r);
+  while (__left_bitset != 0 && __right_bitset != 0) {
+    tz_left = _Bitset::__ctz(__left_bitset);
+    __left_bitset = _Bitset::__blsr(__left_bitset);
+    tz_right = _Bitset::__ctz(__right_bitset);
+    __right_bitset = _Bitset::__blsr(__right_bitset);
-    __child = 2 * __child + 1;
-    _RandomAccessIterator __child_i = __first + __child;
+    l = __first + tz_left;
+    *r = _VSTD::move(*l);
+    r = __last - tz_right;
+    *l = _VSTD::move(*r);
+  }
+  *r = _VSTD::move(tmp);
+}
-    if ((__child + 1) < __len && __comp(*__child_i, *(__child_i + 1))) {
-        // right-child exists and is greater than left-child
-        ++__child_i;
-        ++__child;
-    }
+template <class _Bitset, class _RandomAccessIterator, class _Compare>
+_VSTD::pair<_RandomAccessIterator, bool> __bitset_partition(
+    _RandomAccessIterator __first, _RandomAccessIterator __last,
+    _Compare __comp) {
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type
+      value_type;
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type
+      difference_type;
+  typedef typename _Bitset::__storage_t __storage_t;
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot = _VSTD::move(*__first);
-    // check if we are in heap-order
-    if (__comp(*__child_i, *__start))
-        // we are, __start is larger than it's largest child
-        return;
+  if (__comp(__pivot, *(__last - 1))) {
+    // Guarded.
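    // [review note] The branch condition just established that
    // __comp(__pivot, *(__last - 1)) holds, so the ++__first scan below stops
    // at __last - 1 at the latest and needs no explicit bound check; the else
    // branch has no such sentinel and must compare against __last.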
+    while (!__comp(__pivot, *++__first)) {}
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {}
+  }
-    value_type __top(_VSTD::move(*__start));
-    do
-    {
-        // we are not in heap-order, swap the parent with it's largest child
-        *__start = _VSTD::move(*__child_i);
-        __start = __child_i;
+  if (__first < __last) {
+    // This scan is always guarded because __bitsetsort_loop does the
+    // median-of-three before calling this.
+    while (__comp(__pivot, *--__last)) {}
+  }
+  bool __already_partitioned = __first >= __last;
+  if (!__already_partitioned) {
+    _VSTD::iter_swap(__first, __last);
+    ++__first;
+  }
-        if ((__len - 2) / 2 < __child)
-            break;
+  // [__first, __last): __last is not inclusive. From now on, use __lm1
+  // (== __last - 1) so that both ends are inclusive.
+  _RandomAccessIterator __lm1 = __last - 1;
+  __storage_t __left_bitset = 0;
+  __storage_t __right_bitset = 0;
-        // recompute the child based off of the updated parent
-        __child = 2 * __child + 1;
-        __child_i = __first + __child;
+  // Reminder: length = __lm1 - __first + 1.
+  while (__lm1 - __first >= 2 * _Bitset::__block_size - 1) {
+    if (__left_bitset == 0) {
+      // Possible vectorization. With a proper "-march" flag, the following loop
+      // will be compiled into a set of SIMD instructions.
+      _RandomAccessIterator __iter = __first;
+      for (int __j = 0; __j < _Bitset::__block_size;) {
+        __left_bitset |= (static_cast<__storage_t>(__comp(__pivot, *__iter)) << __j);
+        __j++;
+        __iter++;
+      }
+    }
-        if ((__child + 1) < __len && __comp(*__child_i, *(__child_i + 1))) {
-            // right-child exists and is greater than left-child
-            ++__child_i;
-            ++__child;
-        }
+    if (__right_bitset == 0) {
+      // Possible vectorization. With a proper "-march" flag, the following loop
+      // will be compiled into a set of SIMD instructions.
+      _RandomAccessIterator __iter = __lm1;
+      for (int __j = 0; __j < _Bitset::__block_size;) {
+        __right_bitset |=
+            (static_cast<__storage_t>(!__comp(__pivot, *__iter)) << __j);
+        __j++;
+        __iter--;
+      }
+    }
-        // check if we are in heap-order
-    } while (!__comp(*__child_i, __top));
-    *__start = _VSTD::move(__top);
-}
+    __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
+    __first += (__left_bitset == 0) ? _Bitset::__block_size : 0;
+    __lm1 -= (__right_bitset == 0) ? _Bitset::__block_size : 0;
+  }
+  // Now we have less than a block left on each side.
+  difference_type __remaining_len = __lm1 - __first + 1;
+  difference_type __l_size;
+  difference_type __r_size;
+  if (__left_bitset == 0 && __right_bitset == 0) {
+    __l_size = __remaining_len / 2;
+    __r_size = __remaining_len - __l_size;
+  } else if (__left_bitset == 0) {
+    // We know at least one side is a full block.
+    __l_size = __remaining_len - _Bitset::__block_size;
+    __r_size = _Bitset::__block_size;
+  } else {  // if (__right_bitset == 0)
+    __l_size = _Bitset::__block_size;
+    __r_size = __remaining_len - _Bitset::__block_size;
+  }
+  if (__left_bitset == 0) {
+    _RandomAccessIterator __iter = __first;
+    for (int j = 0; j < __l_size; j++) {
+      __left_bitset |=
+          (static_cast<__storage_t>(__comp(__pivot, *(__iter))) << j);
+      __iter++;
+    }
+  }
+  if (__right_bitset == 0) {
+    _RandomAccessIterator __iter = __lm1;
+    for (int j = 0; j < __r_size; j++) {
+      __right_bitset |=
+          (static_cast<__storage_t>(!__comp(__pivot, *(__iter))) << j);
+      --__iter;
+    }
+  }
+  __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
+  __first += (__left_bitset == 0) ? __l_size : 0;
+  __lm1 -= (__right_bitset == 0) ? __r_size : 0;
-template <class _Compare, class _RandomAccessIterator>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-__pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
-           typename iterator_traits<_RandomAccessIterator>::difference_type __len)
-{
-    if (__len > 1)
-    {
-        swap(*__first, *--__last);
-        _VSTD::__sift_down<_Compare>(__first, __last, __comp, __len - 1, __first);
+  if (__left_bitset) {
+    // Swap within the right side.
+    int __tz_left;
+
+    // Need to find set positions in the reverse order.
+    while (__left_bitset != 0) {
+      __tz_left = _Bitset::__block_size - 1 - _Bitset::__clz(__left_bitset);
+      __left_bitset &= (static_cast<__storage_t>(1) << __tz_left) - 1;
+      _VSTD::iter_swap(__first + __tz_left, __lm1--);
+    }
-}
+    __first = __lm1 + 1;
+  } else if (__right_bitset) {
+    // Swap within the left side.
+    int __tz_right;
+    // Need to find set positions in the reverse order.
+    while (__right_bitset != 0) {
+      __tz_right = _Bitset::__block_size - 1 - _Bitset::__clz(__right_bitset);
+      __right_bitset &= (static_cast<__storage_t>(1) << __tz_right) - 1;
+      _VSTD::iter_swap(__lm1 - __tz_right, __first++);
+    }
+  }
-template <class _Compare, class _RandomAccessIterator>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
-{
-    typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
-    _VSTD::__pop_heap<_Comp_ref>(__first, __last, __comp, __last - __first);
+  _RandomAccessIterator __pivot_pos = __first - 1;
+  *__begin = _VSTD::move(*__pivot_pos);
+  *__pivot_pos = _VSTD::move(__pivot);
+  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
 }
 
-template <class _RandomAccessIterator>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-pop_heap(_RandomAccessIterator __first, _RandomAccessIterator __last)
-{
-    _VSTD::pop_heap(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
+template <class _RandomAccessIterator, class _Compare>
+inline bool __partial_insertion_sort(_RandomAccessIterator __first,
+                                     _RandomAccessIterator __last,
+                                     _Compare __comp) {
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type
+      value_type;
+  if (__first == __last) return true;
+
+  const unsigned __limit = 8;
+  unsigned __count = 0;
+  _RandomAccessIterator __j = __first;
+  for (_RandomAccessIterator __i = __j + 1; __i != __last; ++__i) {
+    if (__comp(*__i, *__j)) {
+      value_type __t(_VSTD::move(*__i));
+      _RandomAccessIterator __k = __j;
+      __j = __i;
+      do {
+        *__j = _VSTD::move(*__k);
+        __j = __k;
+      } while (__j != __first && __comp(__t, *--__k));
+      *__j = _VSTD::move(__t);
+      if (++__count == __limit) return ++__i == __last;
+    }
+    __j = __i;
+  }
+  return true;
 }
 
-// make_heap
-
 template <class _Compare, class _RandomAccessIterator>
-void
-__make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
-{
-    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-    difference_type __n = __last - __first;
-    if (__n > 1)
-    {
-        // start from the first parent, there is no need to consider children
-        for (difference_type __start = (__n - 2) / 2; __start >= 0; --__start)
-        {
-            _VSTD::__sift_down<_Compare>(__first, __last, __comp, __n, __first + __start);
+void __bitsetsort_loop(
+    _RandomAccessIterator __first, _RandomAccessIterator __last,
+    _Compare __comp,
+    typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type* __buff,
+    typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type __limit) {
+  _LIBCPP_CONSTEXPR_AFTER_CXX11 int __ninther_threshold = 128;
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type
+      difference_type;
+  __sorting_network::__conditional_swap<_RandomAccessIterator, _Compare>
+      __cond_swap(__comp);
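  // [review note] __cond_swap above orders a pair ascending and
  // __reverse_cond_swap, declared next, orders it descending; __small_sort
  // needs both to lay out the bitonic runs it merges through __buff.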
+  __sorting_network::__reverse_conditional_swap<_RandomAccessIterator, _Compare>
+      __reverse_cond_swap(__comp);
+  while (true) {
+    if (__limit == 0) {
+      // Fallback to heap sort as Introsort suggests.
+      _VSTD::make_heap(__first, __last, __comp);
+      _VSTD::sort_heap(__first, __last, __comp);
+      return;
+    }
+    __limit--;
+    difference_type __len = __last - __first;
+    if (__len <= __bitonic::__detail::__batch) {
+      __sorting_network::__sort1to8(__first, __len, __cond_swap);
+      return;
+    } else if (__len <= 32) {
+      __bitonic::__small_sort(__first, __len, __buff, __cond_swap,
+                              __reverse_cond_swap);
+      // __bitonic::__sort9to32(__first, __len, __buff, __cond_swap,
+      //                        __reverse_cond_swap);
+      return;
+    }
+    difference_type __half_len = __len / 2;
+    if (__len > __ninther_threshold) {
+      __sorting_network::__sort3(__first, __first + __half_len, __last - 1, __cond_swap);
+      __sorting_network::__sort3(__first + 1, __first + (__half_len - 1), __last - 2, __cond_swap);
+      __sorting_network::__sort3(__first + 2, __first + (__half_len + 1), __last - 3, __cond_swap);
+      __sorting_network::__sort3(__first + (__half_len - 1), __first + __half_len,
+                                 __first + (__half_len + 1), __cond_swap);
+      _VSTD::iter_swap(__first, __first + __half_len);
+    } else {
+      __sorting_network::__sort3(__first + __half_len, __first, __last - 1, __cond_swap);
+    }
+    auto __ret = __bitset_partition<__64bit_set>(__first, __last, __comp);
+    if (__ret.second) {
+      bool __left = __partial_insertion_sort(__first, __ret.first, __comp);
+      if (__partial_insertion_sort(__ret.first + 1, __last, __comp)) {
+        if (__left) return;
+        __last = __ret.first;
+        continue;
+      } else {
+        if (__left) {
+          __first = ++__ret.first;
+          continue;
+        }
+      }
+    }
-        }
-    }
-}
-
-template <class _Compare, class _RandomAccessIterator>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
-{
-    typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
-    _VSTD::__make_heap<_Comp_ref>(__first, __last, __comp);
-}
+    // Sort smaller range with recursive call and larger with tail recursion
+    // elimination.
+    if (__ret.first - __first < __last - __ret.first) {
+      __bitsetsort_loop<_Compare>(__first, __ret.first, __comp, __buff, __limit);
+      __first = ++__ret.first;
+    } else {
+      __bitsetsort_loop<_Compare>(__ret.first + 1, __last, __comp, __buff, __limit);
+      __last = __ret.first;
+    }
+  }
+}
 
-template <class _RandomAccessIterator>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-make_heap(_RandomAccessIterator __first, _RandomAccessIterator __last)
-{
-    _VSTD::make_heap(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
-}
+template <typename _Number>
+inline _LIBCPP_INLINE_VISIBILITY _Number __log2i(_Number __n) {
+  _Number __log2 = 0;
+  while (__n > 1) {
+    __log2++;
+    __n >>= 1;
+  }
+  return __log2;
+}
 
-// sort_heap
-
-template <class _Compare, class _RandomAccessIterator>
-void
-__sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
-{
-    typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
-    for (difference_type __n = __last - __first; __n > 1; --__last, (void) --__n)
-        _VSTD::__pop_heap<_Compare>(__first, __last, __comp, __n);
-}
+template <class _RandomAccessIterator, class _Compare>
+inline _LIBCPP_INLINE_VISIBILITY void __bitsetsort_internal(
+    _RandomAccessIterator __first, _RandomAccessIterator __last,
+    _Compare __comp) {
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type
+      value_type;
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type
+      difference_type;
+  typename _VSTD::aligned_storage<sizeof(value_type), alignof(value_type)>::type
+      __buff[__bitonic::__detail::__small_sort_max];
+
+  // 2*log2 comes from Introsort https://reviews.llvm.org/D36423.
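  // [review note] __log2i returns floor(log2(N)); e.g. for N == 1,000,000 it
  // yields 19, so __depth_limit == 38. Once 38 partitioning levels fail to
  // finish the sort, __bitsetsort_loop falls back to make_heap/sort_heap,
  // keeping the worst case at O(N log N) as in classic introsort.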
+  difference_type __depth_limit = 2 * __log2i(__last - __first);
+  __bitsetsort_loop(__first, __last, __comp,
+                    reinterpret_cast<value_type*>(&__buff[0]),
+                    __depth_limit);
+}
+} // namespace __bitsetsort
 
 template <class _RandomAccessIterator, class _Compare>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp)
-{
-    typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
-    _VSTD::__sort_heap<_Comp_ref>(__first, __last, __comp);
-}
+inline _LIBCPP_INLINE_VISIBILITY void sort(_RandomAccessIterator __first,
+                                           _RandomAccessIterator __last,
+                                           _Compare __comp) {
+  typedef typename _VSTD::__comp_ref_type<_Compare>::type _Comp_ref;
+  __bitsetsort::__bitsetsort_internal<_Comp_ref>(__first, __last,
+                                                 _Comp_ref(__comp));
+}
+
+template <class _Tp, class _Compare>
+inline _LIBCPP_INLINE_VISIBILITY void sort(_VSTD::__wrap_iter<_Tp*> __first,
+                                           _VSTD::__wrap_iter<_Tp*> __last,
+                                           _Compare __comp) {
+  typedef typename _VSTD::add_lvalue_reference<_Compare>::type _Comp_ref;
+  _VSTD::sort<_Tp*, _Comp_ref>(__first.base(), __last.base(), __comp);
+}
 
 template <class _RandomAccessIterator>
-inline _LIBCPP_INLINE_VISIBILITY
-void
-sort_heap(_RandomAccessIterator __first, _RandomAccessIterator __last)
-{
-    _VSTD::sort_heap(__first, __last, __less<typename iterator_traits<_RandomAccessIterator>::value_type>());
-}
+inline _LIBCPP_INLINE_VISIBILITY void sort(_RandomAccessIterator __first,
+                                           _RandomAccessIterator __last) {
+  _VSTD::sort(
+      __first, __last,
+      __less<typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type>());
+}
+
+template <class _Tp>
+inline _LIBCPP_INLINE_VISIBILITY void sort(_VSTD::__wrap_iter<_Tp*> __first,
+                                           _VSTD::__wrap_iter<_Tp*> __last) {
+  _VSTD::sort(__first.base(), __last.base());
+}
+
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<char>&, char*>(char*, char*, __less<char>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<wchar_t>&, wchar_t*>(wchar_t*, wchar_t*, __less<wchar_t>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<signed char>&, signed char*>(signed char*, signed char*, __less<signed char>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<unsigned char>&, unsigned char*>(unsigned char*, unsigned char*, __less<unsigned char>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<short>&, short*>(short*, short*, __less<short>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<unsigned short>&, unsigned short*>(unsigned short*, unsigned short*, __less<unsigned short>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<int>&, int*>(int*, int*, __less<int>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<unsigned>&, unsigned*>(unsigned*, unsigned*, __less<unsigned>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<long>&, long*>(long*, long*, __less<long>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<unsigned long>&, unsigned long*>(unsigned long*, unsigned long*, __less<unsigned long>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<long long>&, long long*>(long long*, long long*, __less<long long>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<unsigned long long>&, unsigned long long*>(unsigned long long*, unsigned long long*, __less<unsigned long long>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<float>&, float*>(float*, float*, __less<float>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<double>&, double*>(double*, double*, __less<double>&))
+_LIBCPP_EXTERN_TEMPLATE(_LIBCPP_FUNC_VIS void __bitsetsort::__bitsetsort_internal<__less<long double>&, long double*>(long double*, long double*, __less<long double>&))
+
 // partial_sort
 
 template <class _Compare, class _RandomAccessIterator>
diff --git a/libcxx/src/algorithm.cpp b/libcxx/src/algorithm.cpp
--- a/libcxx/src/algorithm.cpp
+++ b/libcxx/src/algorithm.cpp
@@ -10,37 +10,49 @@
 
 _LIBCPP_BEGIN_NAMESPACE_STD
 
-template void __sort<__less<char>&, char*>(char*, char*, __less<char>&);
-template void __sort<__less<wchar_t>&, wchar_t*>(wchar_t*, wchar_t*, __less<wchar_t>&);
-template void __sort<__less<signed char>&, signed char*>(signed char*, signed char*, __less<signed char>&);
-template void __sort<__less<unsigned char>&, unsigned char*>(unsigned char*, unsigned char*, __less<unsigned char>&);
-template void __sort<__less<short>&, short*>(short*, short*, __less<short>&);
-template void __sort<__less<unsigned short>&, unsigned short*>(unsigned short*, unsigned short*, __less<unsigned short>&);
-template void __sort<__less<int>&, int*>(int*, int*, __less<int>&);
-template void __sort<__less<unsigned>&, unsigned*>(unsigned*, unsigned*, __less<unsigned>&);
-template void __sort<__less<long>&, long*>(long*, long*, __less<long>&);
-template void __sort<__less<unsigned long>&, unsigned long*>(unsigned long*, unsigned long*, __less<unsigned long>&);
-template void __sort<__less<long long>&, long long*>(long long*, long long*, __less<long long>&);
-template void __sort<__less<unsigned long long>&, unsigned long long*>(unsigned long long*, unsigned long long*, __less<unsigned long long>&);
-template void __sort<__less<float>&, float*>(float*, float*, __less<float>&);
-template void __sort<__less<double>&, double*>(double*, double*, __less<double>&);
-template void __sort<__less<long double>&, long double*>(long double*, long double*, __less<long double>&);
-
-template bool __insertion_sort_incomplete<__less<char>&, char*>(char*, char*, __less<char>&);
-template bool __insertion_sort_incomplete<__less<wchar_t>&, wchar_t*>(wchar_t*, wchar_t*, __less<wchar_t>&);
-template bool __insertion_sort_incomplete<__less<signed char>&, signed char*>(signed char*, signed char*, __less<signed char>&);
-template bool __insertion_sort_incomplete<__less<unsigned char>&, unsigned char*>(unsigned char*, unsigned char*, __less<unsigned char>&);
-template bool __insertion_sort_incomplete<__less<short>&, short*>(short*, short*, __less<short>&);
-template bool __insertion_sort_incomplete<__less<unsigned short>&, unsigned short*>(unsigned short*, unsigned short*, __less<unsigned short>&);
-template bool __insertion_sort_incomplete<__less<int>&, int*>(int*, int*, __less<int>&);
-template bool __insertion_sort_incomplete<__less<unsigned>&, unsigned*>(unsigned*, unsigned*, __less<unsigned>&);
-template bool __insertion_sort_incomplete<__less<long>&, long*>(long*, long*, __less<long>&);
-template bool __insertion_sort_incomplete<__less<unsigned long>&, unsigned long*>(unsigned long*, unsigned long*, __less<unsigned long>&);
-template bool __insertion_sort_incomplete<__less<long long>&, long long*>(long long*, long long*, __less<long long>&);
-template bool __insertion_sort_incomplete<__less<unsigned long long>&, unsigned long long*>(unsigned long long*, unsigned long long*, __less<unsigned long long>&);
-template bool __insertion_sort_incomplete<__less<float>&, float*>(float*, float*, __less<float>&);
-template bool __insertion_sort_incomplete<__less<double>&, double*>(double*, double*, __less<double>&);
-template bool __insertion_sort_incomplete<__less<long double>&, long double*>(long double*, long double*, __less<long double>&);
+template void __bitsetsort::__bitsetsort_internal<__less<char>&, char*>(char*, char*, __less<char>&);
+template void __bitsetsort::__bitsetsort_internal<__less<wchar_t>&, wchar_t*>(wchar_t*, wchar_t*, __less<wchar_t>&);
+template void __bitsetsort::__bitsetsort_internal<__less<signed char>&, signed char*>(signed char*, signed char*, __less<signed char>&);
+template void __bitsetsort::__bitsetsort_internal<__less<unsigned char>&, unsigned char*>(unsigned char*, unsigned char*, __less<unsigned char>&);
+template void __bitsetsort::__bitsetsort_internal<__less<short>&, short*>(short*, short*, __less<short>&);
+template void __bitsetsort::__bitsetsort_internal<__less<unsigned short>&, unsigned short*>(unsigned short*, unsigned short*, __less<unsigned short>&);
+template void __bitsetsort::__bitsetsort_internal<__less<int>&, int*>(int*, int*, __less<int>&);
+template void __bitsetsort::__bitsetsort_internal<__less<unsigned>&, unsigned*>(unsigned*, unsigned*, __less<unsigned>&);
+template void __bitsetsort::__bitsetsort_internal<__less<long>&, long*>(long*, long*, __less<long>&);
+template void __bitsetsort::__bitsetsort_internal<__less<unsigned long>&, unsigned long*>(unsigned long*, unsigned long*, __less<unsigned long>&);
+template void __bitsetsort::__bitsetsort_internal<__less<long long>&, long long*>(long long*, long long*, __less<long long>&);
+template void __bitsetsort::__bitsetsort_internal<__less<unsigned long long>&, unsigned long long*>(unsigned long long*, unsigned long long*, __less<unsigned long long>&);
+template void __bitsetsort::__bitsetsort_internal<__less<float>&, float*>(float*, float*, __less<float>&);
+template void __bitsetsort::__bitsetsort_internal<__less<double>&, double*>(double*, double*, __less<double>&);
+template void __bitsetsort::__bitsetsort_internal<__less<long double>&, long double*>(long double*, long double*, __less<long double>&);
 
 template unsigned __sort5<__less<long double>&, long double*>(long double*, long double*, long double*, long double*, long double*, __less<long double>&);
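
[review note] For readers of this revision, here is a minimal standalone sketch of the bitset-partition technique the patch implements (in the style of BlockQuicksort). Everything in it is illustrative and not part of the patch: the names are invented, it works on plain ints with a fixed 64-bit block instead of the patch's iterator/comparator/_Bitset generality, and the std::partition call at the end stands in for the patch's masked remainder handling.

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <utility>
    #include <vector>

    // Clear the lowest set bit -- the __blsr trick from the patch.
    static inline uint64_t blsr(uint64_t x) { return x ^ (x & -x); }

    // Partition v around pivot; returns the index of the first element that
    // is not less than pivot. Mirrors __bitset_partition's main loop: record
    // 64 comparisons per side as bits, then resolve misplaced pairs with
    // ctz/blsr instead of branching per element.
    size_t bitset_partition(std::vector<int>& v, int pivot) {
      size_t first = 0, last = v.size();
      uint64_t left_bits = 0, right_bits = 0;
      while (last - first >= 128) {  // two full 64-element blocks remain
        if (left_bits == 0)          // bit j set: v[first + j] belongs right
          for (int j = 0; j < 64; ++j)
            left_bits |= static_cast<uint64_t>(v[first + j] >= pivot) << j;
        if (right_bits == 0)         // bit j set: v[last - 1 - j] belongs left
          for (int j = 0; j < 64; ++j)
            right_bits |= static_cast<uint64_t>(v[last - 1 - j] < pivot) << j;
        // Pair up misplaced elements from both sides and swap them.
        while (left_bits != 0 && right_bits != 0) {
          int l = __builtin_ctzll(left_bits);
          int r = __builtin_ctzll(right_bits);
          std::swap(v[first + l], v[last - 1 - r]);
          left_bits = blsr(left_bits);
          right_bits = blsr(right_bits);
        }
        if (left_bits == 0) first += 64;   // block fully placed, move on
        if (right_bits == 0) last -= 64;
      }
      // Tail of fewer than two blocks: a plain scalar partition stands in for
      // the patch's masked remainder handling (leftover marked elements in
      // [first, last) are simply rescanned).
      return std::partition(v.begin() + first, v.begin() + last,
                            [pivot](int x) { return x < pivot; }) -
             v.begin();
    }

    int main() {
      std::vector<int> v;
      for (int i = 0; i < 1000; ++i) v.push_back((i * 37) % 200);
      size_t mid = bitset_partition(v, 100);
      // Everything left of mid is < 100, everything from mid on is >= 100.
      std::printf("%zu elements below the pivot\n", mid);
    }

The point of the detour through bitmasks is that the comparison loops are branch-free and vectorizable, while the swap loop touches only elements known to be misplaced; the patch applies the same idea generically through _Bitset, __comp, and arbitrary random-access iterators.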