diff --git a/libcxx/include/algorithm b/libcxx/include/algorithm --- a/libcxx/include/algorithm +++ b/libcxx/include/algorithm @@ -4139,6 +4139,273 @@ } } +namespace __bitsetsort { +struct __64bit_set { + typedef uint64_t __storage_t; + constexpr static int __block_size = 64; + static __storage_t __blsr(__storage_t __x) { + // _blsr_u64 can be used here but it did not make any performance + // difference in practice. + return __x ^ (__x & -__x); + } + static int __clz(__storage_t __x) { return __builtin_clzll(__x); } + static int __ctz(__storage_t __x) { return __builtin_ctzll(__x); } +}; + +struct __32bit_set { + typedef uint32_t __storage_t; + constexpr static int __block_size = 32; + static __storage_t __blsr(__storage_t __x) { + // _blsr_u32 can be used here but it did not make any performance + // difference in practice. + return __x ^ (__x & -__x); + } + static int __clz(__storage_t __x) { return __builtin_clzl(__x); } + static int __ctz(__storage_t __x) { return __builtin_ctzl(__x); } +}; + +template +struct __set_selector { + typedef __64bit_set __set; +}; + +template<> +struct __set_selector<4> { + typedef __32bit_set __set; +}; + +template +inline void __swap_bitmap_pos(_RandomAccessIterator __first, + _RandomAccessIterator __last, + typename _Bitset::__storage_t& __left_bitset, + typename _Bitset::__storage_t& __right_bitset) { + while (__left_bitset != 0 & __right_bitset != 0) { + int tz_left = _Bitset::__ctz(__left_bitset); + __left_bitset = _Bitset::__blsr(__left_bitset); + + int tz_right = _Bitset::__ctz(__right_bitset); + __right_bitset = _Bitset::__blsr(__right_bitset); + + std::iter_swap(__first + tz_left, __last - tz_right); + } +} + +template +std::pair<_RandomAccessIterator, bool> __bitset_partition( + _RandomAccessIterator __first, _RandomAccessIterator __last, + _Compare __comp) { + typedef typename std::iterator_traits<_RandomAccessIterator>::value_type + value_type; + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type + difference_type; + typedef typename _Bitset::__storage_t __storage_t; + _RandomAccessIterator __begin = __first; + value_type __pivot = std::move(*__first); + + if (__comp(__pivot, *(__last - 1))) { + // Guarded. + while (!__comp(__pivot, *++__first)) {} + } else { + while (++__first < __last && !__comp(__pivot, *__first)) {} + } + + if (__first < __last) { + // It will be always guarded because __bitset_sort will do the median-of-three before calling this. + while (__comp(__pivot, *--__last)) {} + } + bool __already_partitioned = __first >= __last; + if (!__already_partitioned) { + std::iter_swap(__first, __last); + ++__first; + } + + // [__first, __last) - __last is not inclusive. From now one, it uses last minus one to be inclusive on both sides. + _RandomAccessIterator __lm1 = __last - 1; + __storage_t __left_bitset = 0; + __storage_t __right_bitset = 0; + + // Reminder: length = __lm1 - __first + 1. + while (__lm1 - __first >= 2 * _Bitset::__block_size - 1) { + if (__left_bitset == 0) { + // Possible vectorization. + _RandomAccessIterator __iter = __first; + for (int __j = 0; __j < _Bitset::__block_size;) { + __left_bitset |= (static_cast<__storage_t>(__comp(__pivot, *__iter)) << __j); + __j++; + __iter++; + } + } + + if (__right_bitset == 0) { + _RandomAccessIterator __iter = __lm1; + for (int __j = 0; __j < _Bitset::__block_size;) { + __right_bitset |= + (static_cast<__storage_t>(!__comp(__pivot, *__iter)) << __j); + __j++; + __iter--; + } + } + + __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset); + __first += (__left_bitset == 0) ? _Bitset::__block_size : 0; + __lm1 -= (__right_bitset == 0) ? _Bitset::__block_size : 0; + } + // Now, we have a less-than a block. + difference_type __remaining_len = __lm1 - __first + 1; + difference_type __l_size; + difference_type __r_size; + if (__left_bitset == 0 && __right_bitset == 0) { + __l_size = __remaining_len / 2; + __r_size = __remaining_len - __l_size; + } else if (__left_bitset == 0) { + // We know at least one side is a full block. + __l_size = __remaining_len - _Bitset::__block_size; + __r_size = _Bitset::__block_size; + } else { // if (right == 0) + __l_size = _Bitset::__block_size; + __r_size = __remaining_len - _Bitset::__block_size; + } + if (__left_bitset == 0) { + _RandomAccessIterator __iter = __first; + for (int j = 0; j < __l_size; j++) { + __left_bitset |= + (static_cast<__storage_t>(__comp(__pivot, *(__iter))) << j); + __iter++; + } + } + if (__right_bitset == 0) { + _RandomAccessIterator __iter = __lm1; + for (int j = 0; j < __r_size; j++) { + __right_bitset |= + (static_cast<__storage_t>(!__comp(__pivot, *(__iter))) << j); + --__iter; + } + } + __swap_bitmap_pos<_Bitset>(__first, __lm1, __left_bitset, __right_bitset); + __first += (__left_bitset == 0) ? __l_size : 0; + __lm1 -= (__right_bitset == 0) ? __r_size : 0; + + if (__left_bitset) { + // Swap within the right side. + int __tz_left; + + // Need to find set positions in the reverse order. + while (__left_bitset != 0) { + __tz_left = _Bitset::__block_size - 1 - _Bitset::__clz(__left_bitset); + __left_bitset &= (static_cast<__storage_t>(1) << __tz_left) - 1; + std::iter_swap(__first + __tz_left, __lm1--); + } + __first = __lm1 + 1; + } else if (__right_bitset) { + // Swap within the left side. + int __tz_right; + // Need to find set positions in the reverse order. + while (__right_bitset != 0) { + __tz_right = _Bitset::__block_size - 1 - _Bitset::__clz(__right_bitset); + __right_bitset &= (static_cast<__storage_t>(1) << __tz_right) - 1; + std::iter_swap(__lm1 - __tz_right, __first++); + } + } + + _RandomAccessIterator __pivot_pos = __first - 1; + *__begin = std::move(*__pivot_pos); + *__pivot_pos = std::move(__pivot); + return std::make_pair(__pivot_pos, __already_partitioned); +} +} // __bitsetsort +constexpr int __ninther_threshold = 128; +template +void +__bitset_sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) +{ + // _Compare is known to be a reference type + typedef typename iterator_traits<_RandomAccessIterator>::difference_type difference_type; + //const difference_type __limit = 32; + typedef typename iterator_traits<_RandomAccessIterator>::value_type value_type; + const difference_type __limit = is_trivially_copy_constructible::value && + is_trivially_copy_assignable::value ? 30 : 6; + while (true) + { + difference_type __len = __last - __first; + switch (__len) + { + case 0: + case 1: + return; + case 2: + if (__comp(*--__last, *__first)) + swap(*__first, *__last); + return; + case 3: + _VSTD::__sort3<_Compare>(__first, __first+1, --__last, __comp); + return; + case 4: + _VSTD::__sort4<_Compare>(__first, __first+1, __first+2, --__last, __comp); + return; + case 5: + _VSTD::__sort5<_Compare>(__first, __first+1, __first+2, __first+3, --__last, __comp); + return; + } + if (__len <= __limit) + { + _VSTD::__insertion_sort_3<_Compare>(__first, __last, __comp); + return; + } + // __len > 5 + { + difference_type __delta = __len / 2; + if (__len > __ninther_threshold) { + __sort3(__first, __first + __delta, __last - 1, __comp); + __sort3(__first + 1, __first + (__delta - 1), __last - 2, __comp); + __sort3(__first + 2, __first + (__delta + 1), __last - 3, __comp); + __sort3(__first + (__delta - 1), __first + __delta, + __first + (__delta + 1), __comp); + std::iter_swap(__first, __first + __delta); + } + else + { + _VSTD::__sort3<_Compare>(__first + __delta, __first, __last - 1, __comp); + } + } + auto __ret = __bitsetsort::__bitset_partition<__bitsetsort::__set_selector::__set>(__first, __last, __comp); + _RandomAccessIterator __i = __ret.first; // pivot + bool __already_partitioned = __ret.second; + + // [__first, __i) < *__i and *__i <= [__i+1, __last) + // If we were given a perfect partition, see if insertion sort is quick... + if (__already_partitioned) + { + bool __fs = _VSTD::__insertion_sort_incomplete<_Compare>(__first, __i, __comp); + if (_VSTD::__insertion_sort_incomplete<_Compare>(__i+1, __last, __comp)) + { + if (__fs) + return; + __last = __i; + continue; + } + else + { + if (__fs) + { + __first = ++__i; + continue; + } + } + } + // sort smaller range with recursive call and larger with tail recursion elimination + if (__i - __first < __last - __i) + { + __bitset_sort<_Compare>(__first, __i, __comp); + __first = ++__i; + } + else + { + __bitset_sort<_Compare>(__i+1, __last, __comp); + __last = __i; + } + } +} + // This forwarder keeps the top call and the recursive calls using the same instantiation, forcing a reference _Compare template inline _LIBCPP_INLINE_VISIBILITY @@ -4146,7 +4413,7 @@ sort(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp) { typedef typename __comp_ref_type<_Compare>::type _Comp_ref; - _VSTD::__sort<_Comp_ref>(__first, __last, _Comp_ref(__comp)); + _VSTD::__bitset_sort<_Comp_ref>(__first, __last, _Comp_ref(__comp)); } template diff --git a/libcxx/test/libcxx/algorithms/sort.pass.cpp b/libcxx/test/libcxx/algorithms/sort.pass.cpp new file mode 100644 --- /dev/null +++ b/libcxx/test/libcxx/algorithms/sort.pass.cpp @@ -0,0 +1,117 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" + +#ifndef _LIBCPP_VERSION +#error _LIBCPP_VERSION not defined +#endif + +template +void generate(const std::vector& source, std::map& dest) { + for (auto v : source) { + dest[v]++; + } +} + +template +bool is_sorted(const std::vector& sorted) { + for (int i = 0; i < static_cast(sorted.size()) - 1; i++) { + if (sorted[i] > sorted[i + 1]) { + return false; + } + } + return true; +} + +template +bool destruct(const std::vector& source, std::map& dest) { + for (auto v : source) { + auto iter = dest.find(v); + if (iter == dest.end()) { + return false; + } + iter->second--; + if (iter->second == 0) { + dest.erase(iter); + } + } + return dest.empty(); +} + +template +bool check(std::vector& a) { + std::map c; + generate(a, c); + std::sort(a.begin(), a.end()); + if (!is_sorted(a)) { + return false; + } + if (!destruct(a, c)) { + return false; + } + return true; +} + +int main(int, char**) { + { + for (int N = 500; N < 1000; N++) { + std::vector seq(N); + for (int i = 0; i < N; i++) { + seq[i] = 1001 - i; + } + assert(check(seq)); + } + } + { + const int n = 500; + for (int j = 0; j < n; j++) { + std::vector seq(n); + for (int i = 0; i < n; i++) { + seq[i] = 0; + } + seq[j] = 1; + assert(check(seq)); + } + } + { + for (int n = 500; n < 1000; n++) { + std::vector seq(n); + for (int i = 0; i < n; i++) { + seq[i] = i; + } + assert(check(seq)); + } + } + { + std::random_device rd; + std::mt19937 mt(rd()); + std::uniform_int_distribution size_dist(100, 1000); + for (int rep = 0; rep < 100000; rep++) { + const int n = size_dist(mt); + std::vector seq(n); + std::uniform_real_distribution value_dist( + std::numeric_limits::min(), + std::numeric_limits::max()); + for (unsigned long i = 0; i < seq.size(); i++) { + seq[i] = value_dist(mt); + } + assert(check(seq)); + } + } + return 0; +}