diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -11,6 +11,7 @@
 #include <__algorithm/comp.h>
 #include <__algorithm/comp_ref_type.h>
+#include <__algorithm/iter_swap.h>
 #include <__algorithm/min_element.h>
 #include <__algorithm/partial_sort.h>
 #include <__algorithm/unwrap_iter.h>
@@ -137,13 +138,16 @@
   }
 }
 
+// Sort the iterator range [__first, __last) using the comparator __comp using
+// the insertion sort algorithm.
 template <class _Compare, class _RandomAccessIterator>
 void __insertion_sort_3(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
-  _RandomAccessIterator __j = __first + difference_type(2);
-  _VSTD::__sort3<_Compare>(__first, __first + difference_type(1), __j, __comp);
-  for (_RandomAccessIterator __i = __j + difference_type(1); __i != __last; ++__i) {
+  if (__first == __last)
+    return;
+  for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
+    _RandomAccessIterator __j = __i - difference_type(1);
     if (__comp(*__i, *__j)) {
       value_type __t(_VSTD::move(*__i));
       _RandomAccessIterator __k = __j;
@@ -154,7 +158,34 @@
       } while (__j != __first && __comp(__t, *--__k));
       *__j = _VSTD::move(__t);
     }
-    __j = __i;
   }
 }
+
+// Sort the iterator range [__first, __last) using the comparator __comp using
+// the insertion sort algorithm. Insertion sort has two loops, outer and inner.
+// The implementation below has no bounds check (unguarded) for the inner loop.
+// It assumes that there is an element in the position (__first - 1) and that
+// each element in the input range is greater than or equal to the element at
+// (__first - 1).
+template <class _Compare, class _RandomAccessIterator>
+void __insertion_sort_3_unguarded(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type;
+  if (__first == __last)
+    return;
+  for (_RandomAccessIterator __i = __first + difference_type(1); __i != __last; ++__i) {
+    _RandomAccessIterator __j = __i - difference_type(1);
+    if (__comp(*__i, *__j)) {
+      value_type __t(_VSTD::move(*__i));
+      _RandomAccessIterator __k = __j;
+      __j = __i;
+      do {
+        *__j = _VSTD::move(*__k);
+        __j = __k;
+      } while (__comp(__t, *--__k)); // No need for bounds check due to the assumption stated above.
+      *__j = _VSTD::move(__t);
+    }
+  }
+}
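The unguarded variant drops the `__j != __first` test from the inner loop: it is only called on subranges whose predecessor element is known to compare less than or equal to everything in the range, so that element acts as a sentinel that stops the shifting loop. A minimal standalone sketch of the same idea, with hypothetical names rather than the library's internal API:

```cpp
#include <cassert>
#include <cstddef>
#include <functional>
#include <utility>
#include <vector>

// Insertion sort over [first, last) with no bounds check in the inner loop.
// Precondition: *(first - 1) exists and is <= every element of the range, so
// the shifting loop must stop before running off the front.
template <class It, class Comp>
void insertion_sort_unguarded(It first, It last, Comp comp) {
  for (It i = first; i != last; ++i) {
    auto tmp = std::move(*i);
    It j = i;
    while (comp(tmp, *(j - 1))) { // no "j == first" test needed
      *j = std::move(*(j - 1));
      --j;
    }
    *j = std::move(tmp);
  }
}

int main() {
  // v[0] == 0 is the sentinel: it is <= everything after it.
  std::vector<int> v = {0, 5, 3, 9, 1, 7};
  insertion_sort_unguarded(v.begin() + 1, v.end(), std::less<int>());
  for (std::size_t k = 1; k < v.size(); ++k)
    assert(v[k - 1] <= v[k]);
}
```

In `__introsort` below, every subrange except the leftmost one has such a predecessor (the pivot of an enclosing partition), which is what makes the unguarded call safe.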
@@ -270,139 +301,410 @@
     _VSTD::__partial_sort<_Compare>(__first, __last, __last, __comp);
     return;
   }
-  --__depth;
-  _RandomAccessIterator __m = __first;
-  _RandomAccessIterator __lm1 = __last;
-  --__lm1;
-  unsigned __n_swaps;
-  {
-    difference_type __delta;
-    if (__len >= 1000) {
-      __delta = __len / 2;
-      __m += __delta;
-      __delta /= 2;
-      __n_swaps = _VSTD::__sort5<_Compare>(__first, __first + __delta, __m, __m + __delta, __lm1, __comp);
-    } else {
-      __delta = __len / 2;
-      __m += __delta;
-      __n_swaps = _VSTD::__sort3<_Compare>(__first, __m, __lm1, __comp);
+}
+
+struct __64bit_set {
+  typedef uint64_t __storage_t;
+  enum { __block_size = 64 };
+  static __storage_t __blsr(__storage_t x) {
+    // _blsr_u64 can be used here but it did not make any performance
+    // difference in practice.
+    return x ^ (x & -x);
+  }
+  static int __clz(__storage_t x) { return __builtin_clzll(x); }
+  static int __ctz(__storage_t x) { return __builtin_ctzll(x); }
+  static int __popcount(__storage_t x) { return __builtin_popcountll(x); }
+};
+
+struct __32bit_set {
+  typedef uint32_t __storage_t;
+  enum { __block_size = 32 };
+  static __storage_t __blsr(__storage_t x) {
+    // _blsr_u32 can be used here but it did not make any performance
+    // difference in practice.
+    return x ^ (x & -x);
+  }
+  static int __clz(__storage_t x) { return __builtin_clzl(x); }
+  static int __ctz(__storage_t x) { return __builtin_ctzl(x); }
+  static int __popcount(__storage_t x) { return __builtin_popcountl(x); }
+};
+
+template <class _Bitset, class _RandomAccessIterator>
+inline _LIBCPP_HIDE_FROM_ABI void __swap_bitmap_pos(_RandomAccessIterator __first, _RandomAccessIterator __last,
+                                                    typename _Bitset::__storage_t& __left_bitset,
+                                                    typename _Bitset::__storage_t& __right_bitset) {
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  // Swap one pair on each iteration as long as both bitsets have at least one
+  // element for swapping.
+  while (__left_bitset != 0 && __right_bitset != 0) {
+    difference_type tz_left = _Bitset::__ctz(__left_bitset);
+    __left_bitset = _Bitset::__blsr(__left_bitset);
+    difference_type tz_right = _Bitset::__ctz(__right_bitset);
+    __right_bitset = _Bitset::__blsr(__right_bitset);
+    _VSTD::iter_swap(__first + tz_left, __last - tz_right);
+  }
+}
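`__swap_bitmap_pos` walks the two bitsets with two standard bit tricks: count-trailing-zeros gives the offset of the next misplaced element, and clearing the lowest set bit advances to the next one. Note that `x & (x - 1)` clears the lowest set bit exactly like the `x ^ (x & -x)` used in `__blsr` above. A standalone sketch using the same GCC/Clang builtins the patch relies on:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  std::uint64_t bits = 0x29; // binary 101001: bits 0, 3 and 5 set
  while (bits != 0) {
    int idx = __builtin_ctzll(bits); // index of the lowest set bit
    std::printf("offset %d\n", idx); // prints 0, then 3, then 5
    bits &= bits - 1;                // clear that bit, same as x ^ (x & -x)
  }
}
```

Pairing one offset from each word per iteration is what turns a batch of recorded comparisons into a batch of swaps with no per-element branching.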
+
+// Partition [__first, __last) using the comparator __comp. *__first has the
+// chosen pivot. Elements that are equivalent are kept to the left of the
+// pivot. Returns a pair of the iterator to the pivot and a bool value which
+// is true if the provided range was found to be already partitioned, false
+// otherwise. We assume that the length of the range is at least three
+// elements.
+//
+// We term the partitioning algorithm "bitset partitioning" since the
+// outcomes of the comparisons between the pivot and the other elements are
+// stored in fixed-size bitsets.
+template <class _Bitset, class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI _VSTD::pair<_RandomAccessIterator, bool> __bitset_partition(
+    _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  typedef typename _Bitset::__storage_t __storage_t;
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(_VSTD::move(*__first));
+  // Find the first element greater than the pivot.
+  if (__comp(__pivot, *(__last - difference_type(1)))) {
+    // Not guarded since we know the last element is greater than the pivot.
+    while (!__comp(__pivot, *++__first)) {
+    }
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {
+    }
+  }
-  // *__m is median
-  // partition [__first, __m) < *__m and *__m <= [__m, __last)
-  // (this inhibits tossing elements equivalent to __m around unnecessarily)
-  _RandomAccessIterator __i = __first;
-  _RandomAccessIterator __j = __lm1;
-  // j points beyond range to be tested, *__m is known to be <= *__lm1
-  // The search going up is known to be guarded but the search coming down isn't.
-  // Prime the downward search with a guard.
-  if (!__comp(*__i, *__m)) // if *__first == *__m
-  {
-    // *__first == *__m, *__first doesn't go in first part
-    // manually guard downward moving __j against __i
-    while (true) {
-      if (__i == --__j) {
-        // *__first == *__m, *__m <= all other elements
-        // Parition instead into [__first, __i) == *__first and *__first < [__i, __last)
-        ++__i; // __first + 1
-        __j = __last;
-        if (!__comp(*__first, *--__j)) // we need a guard if *__first == *(__last-1)
-        {
-          while (true) {
-            if (__i == __j)
-              return; // [__first, __last) all equivalent elements
-            if (__comp(*__first, *__i)) {
-              swap(*__i, *__j);
-              ++__n_swaps;
-              ++__i;
-              break;
-            }
-            ++__i;
-          }
-        }
-        // [__first, __i) == *__first and *__first < [__j, __last) and __j == __last - 1
-        if (__i == __j)
-          return;
-        while (true) {
-          while (!__comp(*__first, *__i))
-            ++__i;
-          while (__comp(*__first, *--__j))
-            ;
-          if (__i >= __j)
-            break;
-          swap(*__i, *__j);
-          ++__n_swaps;
-          ++__i;
-        }
-        // [__first, __i) == *__first and *__first < [__i, __last)
-        // The first part is sorted, sort the second part
-        // _VSTD::__sort<_Compare>(__i, __last, __comp);
-        __first = __i;
-        goto __restart;
-      }
+  // Find the last element less than or equal to the pivot.
+  if (__first < __last) {
+    // It will always be guarded because __introsort will do the
+    // median-of-three before calling this.
+    while (__comp(__pivot, *--__last)) {
+    }
+  }
+  // If the first element greater than the pivot is at or after the last
+  // element less than or equal to the pivot, then we have covered the entire
+  // range without swapping elements. This implies the range is already
+  // partitioned.
+  bool __already_partitioned = __first >= __last;
+  if (!__already_partitioned) {
+    _VSTD::iter_swap(__first, __last);
+    ++__first;
+  }
+
+  // The range [__first, __last) is half-open. From now on we use __lm1
+  // (last minus one) so that both ends are inclusive.
+  _RandomAccessIterator __lm1 = __last - difference_type(1);
+  __storage_t __left_bitset = 0;
+  __storage_t __right_bitset = 0;
+
+  // Reminder: length = __lm1 - __first + 1.
+  while (__lm1 - __first >= 2 * _Bitset::__block_size - 1) {
+    // Record the comparison outcomes for the elements currently on the left
+    // side.
+    if (__left_bitset == 0) {
+      // Possible vectorization. With a proper "-march" flag, the following
+      // loop will be compiled into a set of SIMD instructions.
+      _RandomAccessIterator __iter = __first;
+      for (int __j = 0; __j < _Bitset::__block_size;) {
+        bool __comp_result = !__comp(*__iter, __pivot);
+        __left_bitset |= (static_cast<__storage_t>(__comp_result) << __j);
+        __j++;
+        ++__iter;
+      }
-      if (__comp(*__j, *__m)) {
-        swap(*__i, *__j);
-        ++__n_swaps;
-        break; // found guard for downward moving __j, now use unguarded partition
+    }
+    // Record the comparison outcomes for the elements currently on the right
+    // side.
+    if (__right_bitset == 0) {
+      // Possible vectorization. With a proper "-march" flag, the following
+      // loop will be compiled into a set of SIMD instructions.
+      _RandomAccessIterator __iter = __lm1;
+      for (int __j = 0; __j < _Bitset::__block_size;) {
+        bool __comp_result = __comp(*__iter, __pivot);
+        __right_bitset |= (static_cast<__storage_t>(__comp_result) << __j);
+        __j++;
+        --__iter;
+      }
+    }
+    // Swap the elements recorded to be the candidates for swapping in the
+    // bitsets.
+    __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
+    // Only advance the iterator if all the elements that need to be moved to
+    // the other side were moved.
+    __first += (__left_bitset == 0) ? difference_type(_Bitset::__block_size) : difference_type(0);
+    __lm1 -= (__right_bitset == 0) ? difference_type(_Bitset::__block_size) : difference_type(0);
+  }
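Each recording loop above packs one comparison outcome per bit into a machine word, branch-free: the `bool` is widened to 0 or 1 and shifted into place, which is what lets the compiler unroll and vectorize the loop. A small self-contained illustration of the packing, using an 8-element block instead of 64 and hypothetical names:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  const int block_size = 8; // the patch uses _Bitset::__block_size == 64
  int elems[block_size] = {3, 9, 1, 7, 2, 8, 5, 6};
  int pivot = 5;
  // Bit j is set when elems[j] does NOT belong on the left side,
  // i.e. when !(elems[j] < pivot).
  std::uint64_t left_bitset = 0;
  for (int j = 0; j < block_size; ++j) {
    bool result = !(elems[j] < pivot);
    left_bitset |= (static_cast<std::uint64_t>(result) << j);
  }
  // Elements at indices 1, 3, 5, 6, 7 are >= 5, so bits 1, 3, 5, 6, 7 are
  // set and the word is 0xea.
  std::printf("%#llx\n", (unsigned long long)left_bitset);
}
```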
-      // It is known that *__i < *__m
-      ++__i;
-      // j points beyond range to be tested, *__m is known to be <= *__lm1
-      // if not yet partitioned...
-      if (__i < __j) {
-        // known that *(__i - 1) < *__m
-        // known that __i <= __m
-        while (true) {
-          // __m still guards upward moving __i
-          while (__comp(*__i, *__m))
-            ++__i;
-          // It is now known that a guard exists for downward moving __j
-          while (!__comp(*--__j, *__m))
-            ;
-          if (__i > __j)
-            break;
-          swap(*__i, *__j);
-          ++__n_swaps;
-          // It is known that __m != __j
-          // If __m just moved, follow it
-          if (__m == __i)
-            __m = __j;
-          ++__i;
+  // Now, fewer than a block's worth of elements remain on at least one of
+  // the sides.
+  difference_type __remaining_len = __lm1 - __first + 1;
+  difference_type __l_size;
+  difference_type __r_size;
+  if (__left_bitset == 0 && __right_bitset == 0) {
+    __l_size = __remaining_len / 2;
+    __r_size = __remaining_len - __l_size;
+  } else if (__left_bitset == 0) {
+    // We know at least one side is a full block.
+    __l_size = __remaining_len - _Bitset::__block_size;
+    __r_size = _Bitset::__block_size;
+  } else { // if (__right_bitset == 0)
+    __l_size = _Bitset::__block_size;
+    __r_size = __remaining_len - _Bitset::__block_size;
+  }
+  // Record the comparison outcomes for the elements currently on the left side.
+  if (__left_bitset == 0) {
+    _RandomAccessIterator __iter = __first;
+    for (int j = 0; j < __l_size; j++) {
+      bool __comp_result = !__comp(*__iter, __pivot);
+      __left_bitset |= (static_cast<__storage_t>(__comp_result) << j);
+      ++__iter;
+    }
+  }
+  // Record the comparison outcomes for the elements currently on the right side.
+  if (__right_bitset == 0) {
+    _RandomAccessIterator __iter = __lm1;
+    for (int j = 0; j < __r_size; j++) {
+      bool __comp_result = __comp(*__iter, __pivot);
+      __right_bitset |= (static_cast<__storage_t>(__comp_result) << j);
+      --__iter;
+    }
+  }
+  __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset);
+  __first += (__left_bitset == 0) ? __l_size : 0;
+  __lm1 -= (__right_bitset == 0) ? __r_size : 0;
+  // At least one of the bitsets will be empty. For the non-empty one, we need
+  // to properly partition the elements that appear within that bitset.
+  if (__left_bitset) {
+    // Swap within the left side. Need to find set positions in the reverse
+    // order.
+    while (__left_bitset != 0) {
+      difference_type __tz_left = _Bitset::__block_size - 1 - _Bitset::__clz(__left_bitset);
+      __left_bitset &= (static_cast<__storage_t>(1) << __tz_left) - 1;
+      _RandomAccessIterator it = __first + __tz_left;
+      if (it != __lm1) {
+        _VSTD::iter_swap(it, __lm1);
+      }
+      --__lm1;
+    }
+    __first = __lm1 + difference_type(1);
+  } else if (__right_bitset) {
+    // Swap within the right side. Need to find set positions in the reverse
+    // order.
+    while (__right_bitset != 0) {
+      difference_type __tz_right = _Bitset::__block_size - 1 - _Bitset::__clz(__right_bitset);
+      __right_bitset &= (static_cast<__storage_t>(1) << __tz_right) - 1;
+      _RandomAccessIterator it = __lm1 - __tz_right;
+      if (it != __first) {
+        _VSTD::iter_swap(it, __first);
+      }
+      ++__first;
+    }
+  }
+  // Move the pivot to its correct position.
+  _RandomAccessIterator __pivot_pos = __first - difference_type(1);
+  if (__begin != __pivot_pos) {
+    *__begin = _VSTD::move(*__pivot_pos);
+  }
+  *__pivot_pos = _VSTD::move(__pivot);
+  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
+}
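The leftover compaction scans the remaining bitset from its highest set bit down, because each swap consumes the boundary position (`__lm1` or `__first`); taking the offset closest to the boundary first keeps the offsets of the still-pending bits valid. A sketch of that reverse scan, again with hypothetical names:

```cpp
#include <cstdint>
#include <cstdio>

int main() {
  // Visit set bits from highest to lowest. __builtin_clzll counts leading
  // zeros, so in a 64-bit word the highest set bit has index 63 - clz.
  std::uint64_t bits = 0x29; // bits 0, 3 and 5 set
  while (bits != 0) {
    int idx = 63 - __builtin_clzll(bits);
    std::printf("offset %d\n", idx);       // prints 5, then 3, then 0
    bits &= (std::uint64_t(1) << idx) - 1; // keep only the bits below idx
  }
}
```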
+
+// Partition [__first, __last) using the comparator __comp. *__first has the
+// chosen pivot. Elements that are equivalent are kept to the right of the
+// pivot. Returns a pair of the iterator to the pivot and a bool value which
+// is true if the provided range was found to be already partitioned, false
+// otherwise. We assume that the length of the range is at least three
+// elements.
+template <class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI _VSTD::pair<_RandomAccessIterator, bool> __partition_with_equals_on_right(
+    _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(_VSTD::move(*__first));
+  // Find the first element greater than or equal to the pivot. It will always
+  // be guarded because __introsort will do the median-of-three before calling
+  // this.
+  while (__comp(*++__first, __pivot))
+    ;
+
+  // Find the last element less than the pivot.
+  if (__begin == __first - difference_type(1)) {
+    while (__first < __last && !__comp(*--__last, __pivot))
+      ;
+  } else {
+    // Guarded.
+    while (!__comp(*--__last, __pivot))
+      ;
+  }
+
+  // If the first element greater than or equal to the pivot is at or after
+  // the last element less than the pivot, then we have covered the entire
+  // range without swapping elements. This implies the range is already
+  // partitioned.
+  bool __already_partitioned = __first >= __last;
+  // Go through the remaining elements. Swap pairs of elements (one to the
+  // right of the pivot and the other to the left of the pivot) that are not
+  // on the correct side of the pivot.
+  while (__first < __last) {
+    _VSTD::iter_swap(__first, __last);
+    while (__comp(*++__first, __pivot))
+      ;
+    while (!__comp(*--__last, __pivot))
+      ;
+  }
+  // Move the pivot to its correct position.
+  _RandomAccessIterator __pivot_pos = __first - difference_type(1);
+  if (__begin != __pivot_pos) {
+    *__begin = _VSTD::move(*__pivot_pos);
+  }
+  *__pivot_pos = _VSTD::move(__pivot);
+  return _VSTD::make_pair(__pivot_pos, __already_partitioned);
+}
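A scalar model of this crossing-pointer partition may help when reviewing the invariants. The sketch below (hypothetical names, `int` elements, `operator<` as the comparator) assumes, as the patch guarantees via median-of-three, that the initial upward scan is guarded; it also returns the same "already partitioned" flag that `__introsort` later uses to attempt an early insertion sort:

```cpp
#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

// Partition with elements equal to the pivot going right. Assumes
// v.size() >= 2 and v.back() >= v[0], so the first scan cannot run off.
std::pair<std::size_t, bool> partition_equals_on_right(std::vector<int>& v) {
  const int pivot = v[0];
  std::size_t first = 0;
  std::size_t last = v.size();
  while (v[++first] < pivot) { // first element >= pivot
  }
  while (last > first && !(v[--last] < pivot)) { // last element < pivot
  }
  // If the scans crossed before any swap, the range was already partitioned.
  bool already_partitioned = first >= last;
  while (first < last) {
    std::swap(v[first], v[last]);
    while (v[++first] < pivot) {
    }
    while (!(v[--last] < pivot)) {
    }
  }
  // v[first - 1] is the rightmost element < pivot: the pivot goes there.
  std::swap(v[0], v[first - 1]);
  return std::make_pair(first - 1, already_partitioned);
}

int main() {
  std::vector<int> v = {5, 8, 1, 9, 2, 7};
  std::pair<std::size_t, bool> r = partition_equals_on_right(v);
  assert(v[r.first] == 5 && !r.second);
  for (std::size_t i = 0; i < r.first; ++i)
    assert(v[i] < 5);
  for (std::size_t i = r.first + 1; i < v.size(); ++i)
    assert(v[i] >= 5);
}
```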
+
+// Similar to the above function. Elements equivalent to the pivot are put to
+// the left of the pivot. Returns the iterator to the pivot element.
+template <class _RandomAccessIterator, class _Compare>
+_LIBCPP_HIDE_FROM_ABI _RandomAccessIterator __partition_with_equals_on_left(
+    _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
+  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  typedef typename _VSTD::iterator_traits<_RandomAccessIterator>::value_type value_type;
+  _RandomAccessIterator __begin = __first;
+  value_type __pivot(_VSTD::move(*__first));
+  if (__comp(__pivot, *(__last - difference_type(1)))) {
+    // Guarded.
+    while (!__comp(__pivot, *++__first)) {
+    }
+  } else {
+    while (++__first < __last && !__comp(__pivot, *__first)) {
+    }
+  }
+
+  if (__first < __last) {
+    // It will always be guarded because __introsort will do the
+    // median-of-three before calling this.
+    while (__comp(__pivot, *--__last)) {
+    }
+  }
+  while (__first < __last) {
+    _VSTD::iter_swap(__first, __last);
+    while (!__comp(__pivot, *++__first))
+      ;
+    while (__comp(__pivot, *--__last))
+      ;
+  }
-    // [__first, __i) < *__m and *__m <= [__i, __last)
-    if (__i != __m && __comp(*__m, *__i)) {
-      swap(*__i, *__m);
-      ++__n_swaps;
+  _RandomAccessIterator __pivot_pos = __first - difference_type(1);
+  if (__begin != __pivot_pos) {
+    *__begin = _VSTD::move(*__pivot_pos);
+  }
-    // [__first, __i) < *__i and *__i <= [__i+1, __last)
-    // If we were given a perfect partition, see if insertion sort is quick...
-    if (__n_swaps == 0) {
-      bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
-      if (_VSTD::__insertion_sort_incomplete<_Compare>(__i + difference_type(1), __last, __comp)) {
-        if (__fs)
-          return;
-        __last = __i;
+  *__pivot_pos = _VSTD::move(__pivot);
+  return __first;
+}
+
+// The main sorting function. Implements introsort combined with other ideas:
+//  - the option of using block quicksort for partitioning,
+//  - guarded and unguarded insertion sort for small lengths,
+//  - Tukey's ninther technique for computing the pivot,
+//  - a check on whether the range is already partitioned.
+// The implementation is partly based on Orson Peters' pattern-defeating
+// quicksort, published at: https://github.com/orlp/pdqsort.
+template <class _Compare, class _RandomAccessIterator, bool _UseBitSetPartition>
+void __introsort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
+                 typename iterator_traits<_RandomAccessIterator>::difference_type __depth, bool __leftmost = true) {
+  typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
+  typedef typename __comp_ref_type<_Compare>::type _Comp_ref;
+  // Upper bound for using insertion sort for sorting.
+  _LIBCPP_CONSTEXPR_AFTER_CXX11 difference_type __limit = 24;
+  // Lower bound for using Tukey's ninther technique for median computation.
+  _LIBCPP_CONSTEXPR_AFTER_CXX11 difference_type __ninther_threshold = 128;
+  while (true) {
+    difference_type __len = __last - __first;
+    switch (__len) {
+    case 0:
+    case 1:
+      return;
+    case 2:
+      if (__comp(*--__last, *__first))
+        swap(*__first, *__last);
+      return;
+    case 3:
+      _VSTD::__sort3<_Compare>(__first, __first + difference_type(1), --__last, __comp);
+      return;
+    case 4:
+      _VSTD::__sort4<_Compare>(__first, __first + difference_type(1), __first + difference_type(2), --__last,
+                               __comp);
+      return;
+    case 5:
+      _VSTD::__sort5<_Compare>(__first, __first + difference_type(1), __first + difference_type(2),
+                               __first + difference_type(3), --__last, __comp);
+      return;
+    }
+    // Use insertion sort if the length of the range is below the specified
+    // limit.
+    if (__len < __limit) {
+      if (__leftmost) {
+        _VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp);
+      } else {
+        _VSTD::__insertion_sort_3_unguarded<_Compare>(__first, __last, __comp);
+      }
+      return;
+    }
+    if (__depth == 0) {
+      // Fall back to heap sort as Introsort suggests.
+      _VSTD::__partial_sort<_Compare>(__first, __last, __last, __comp);
+      return;
+    }
+    --__depth;
+    {
+      difference_type __half_len = __len / 2;
+      // Use Tukey's ninther technique or median of 3 for pivot selection
+      // depending on the length of the range being sorted.
+      if (__len > __ninther_threshold) {
+        _VSTD::__sort3<_Compare>(__first, __first + __half_len, __last - difference_type(1), __comp);
+        _VSTD::__sort3<_Compare>(__first + difference_type(1), __first + (__half_len - 1),
+                                 __last - difference_type(2), __comp);
+        _VSTD::__sort3<_Compare>(__first + difference_type(2), __first + (__half_len + 1),
+                                 __last - difference_type(3), __comp);
+        _VSTD::__sort3<_Compare>(__first + (__half_len - 1), __first + __half_len, __first + (__half_len + 1),
+                                 __comp);
+        _VSTD::iter_swap(__first, __first + __half_len);
+      } else {
+        _VSTD::__sort3<_Compare>(__first + __half_len, __first, __last - difference_type(1), __comp);
+      }
+    }
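Tukey's ninther is the median of three medians of three; the `__sort3` calls above compute it in place on nine samples drawn from the ends and middle of the range, so that the candidate pivot ends up at `__first + __half_len` before the final `iter_swap`. A value-level illustration, with a hypothetical `median3` helper:

```cpp
#include <algorithm>
#include <cassert>

// Median of three values.
int median3(int a, int b, int c) {
  return std::max(std::min(a, b), std::min(std::max(a, b), c));
}

int main() {
  int v[9] = {17, 3, 29, 8, 42, 11, 25, 6, 14};
  int m1 = median3(v[0], v[1], v[2]); // 17
  int m2 = median3(v[3], v[4], v[5]); // 11
  int m3 = median3(v[6], v[7], v[8]); // 14
  // The ninther is a much more robust pivot estimate than a single sample,
  // at the cost of a constant number of extra comparisons.
  assert(median3(m1, m2, m3) == 14);
}
```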
+    // The elements to the left of the current iterator range are already
+    // sorted. If the current iterator range to be sorted is not the
+    // leftmost part of the entire iterator range and the pivot is the same
+    // as the highest element in the range to the left, then we know that
+    // all the elements in the range [first, pivot] would be equal to the
+    // pivot, assuming the equal elements are put on the left side when
+    // partitioned. This also means that we do not need to sort the left
+    // side of the partition.
+    if (!__leftmost && !__comp(*(__first - difference_type(1)), *__first)) {
+      __first = __partition_with_equals_on_left<_RandomAccessIterator, _Comp_ref>(__first, __last,
+                                                                                  _Comp_ref(__comp));
+      continue;
-    } else {
-      if (__fs) {
-        __first = ++__i;
+    }
+    // Use bitset partition only if asked for.
+    auto __ret = _UseBitSetPartition
+                     ? __bitset_partition<__64bit_set, _RandomAccessIterator, _Compare>(__first, __last, __comp)
+                     : __partition_with_equals_on_right<_RandomAccessIterator, _Compare>(__first, __last, __comp);
+    _RandomAccessIterator __i = __ret.first;
+    // [__first, __i) < *__i and *__i <= [__i+1, __last)
+    // If we were given a perfect partition, see if insertion sort is quick...
+    if (__ret.second) {
+      bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp);
+      if (_VSTD::__insertion_sort_incomplete<_Compare>(__i + difference_type(1), __last, __comp)) {
+        if (__fs)
+          return;
+        __last = __i;
+        continue;
+      } else {
+        if (__fs) {
+          __first = ++__i;
+          continue;
+        }
+      }
+    }
-    }
-    // sort smaller range with recursive call and larger with tail recursion elimination
-    if (__i - __first < __last - __i) {
-      _VSTD::__introsort<_Compare>(__first, __i, __comp, __depth);
+    // Sort the left partition recursively and the right partition with tail
+    // recursion elimination.
+    _VSTD::__introsort<_Compare, _RandomAccessIterator, _UseBitSetPartition>(__first, __i, __comp, __depth,
+                                                                             __leftmost);
+    __leftmost = false;
+    __first = ++__i;
-    } else {
-      _VSTD::__introsort<_Compare>(__i + difference_type(1), __last, __comp, __depth);
-      __last = __i;
-    }
   }
 }
-}
 
 template <class _Number>
 inline _LIBCPP_HIDE_FROM_ABI _Number __log2i(_Number __n) {
@@ -418,7 +720,12 @@
 void __sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) {
   typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type;
   difference_type __depth_limit = 2 * __log2i(__last - __first);
-  _VSTD::__introsort<_Compare>(__first, __last, __comp, __depth_limit);
+  // Only use bitset partitioning for arithmetic types. We should also check
+  // that the default comparator is in use so that we are sure that there are
+  // no branches in the comparator.
+  _VSTD::__introsort<_Compare, _RandomAccessIterator,
+                     _VSTD::is_arithmetic<typename iterator_traits<_RandomAccessIterator>::value_type>::value>(
+      __first, __last, __comp, __depth_limit);
 }
 
 template <class _Compare, class _RandomAccessIterator>
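The depth limit passed to `__introsort` stays at `2 * log2(n)`: once quicksort recursion exceeds twice the depth of a perfectly balanced split, the heap-sort fallback preserves the O(n log n) worst case. For reference, a standalone equivalent of `__log2i` (whose body is not shown in this hunk; the shift-loop implementation below is an assumption) and the limit it yields:

```cpp
#include <cassert>

// Position of the highest set bit of n, i.e. floor(log2(n)).
template <class Number>
Number log2i(Number n) {
  Number log2 = 0;
  while (n > 1) {
    ++log2;
    n >>= 1;
  }
  return log2;
}

int main() {
  // For one million elements, introsort allows 2 * 19 = 38 levels of
  // quicksort recursion before switching to heap sort.
  assert(log2i(1000000) == 19);
  assert(2 * log2i(1000000) == 38);
}
```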