diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -161,6 +161,7 @@ algorithms/min.bench.cpp algorithms/min_max_element.bench.cpp algorithms/pop_heap.bench.cpp + algorithms/pstl.stable_sort.bench.cpp algorithms/push_heap.bench.cpp algorithms/ranges_make_heap.bench.cpp algorithms/ranges_make_heap_then_sort_heap.bench.cpp diff --git a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp copy from libcxx/benchmarks/algorithms/stable_sort.bench.cpp copy to libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp --- a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp +++ b/libcxx/benchmarks/algorithms/pstl.stable_sort.bench.cpp @@ -16,16 +16,16 @@ size_t Quantity; void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountElements, [](auto& Copy) { - std::stable_sort(Copy.begin(), Copy.end()); + runOpOnCopies(state, Quantity, Order(), BatchSize::CountBatch, [](auto& Copy) { + std::stable_sort(std::execution::par, Copy.begin(), Copy.end()); }); } bool skip() const { return Order() == ::Order::Heap; } std::string name() const { - return "BM_StableSort" + ValueType::name() + Order::name() + "_" + std::to_string(Quantity); - }; + return "BM_pstl_stable_sort" + ValueType::name() + Order::name() + "/" + std::to_string(Quantity); + } }; } // namespace diff --git a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp b/libcxx/benchmarks/algorithms/stable_sort.bench.cpp --- a/libcxx/benchmarks/algorithms/stable_sort.bench.cpp +++ b/libcxx/benchmarks/algorithms/stable_sort.bench.cpp @@ -16,7 +16,7 @@ size_t Quantity; void run(benchmark::State& state) const { - runOpOnCopies(state, Quantity, Order(), BatchSize::CountElements, [](auto& Copy) { + runOpOnCopies(state, Quantity, Order(), BatchSize::CountBatch, [](auto& Copy) { std::stable_sort(Copy.begin(), Copy.end()); }); } diff --git a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h --- a/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h +++ b/libcxx/include/__algorithm/pstl_backends/cpu_backends/libdispatch.h @@ -9,8 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H #define _LIBCPP___ALGORITHM_PSTL_BACKENDS_CPU_BACKENDS_LIBDISPATCH_H +#include <__algorithm/inplace_merge.h> #include <__algorithm/lower_bound.h> -#include <__algorithm/upper_bound.h> +#include <__algorithm/merge.h> #include <__atomic/atomic.h> #include <__config> #include <__exception/terminate.h> @@ -20,7 +21,6 @@ #include <__memory/unique_ptr.h> #include <__memory_resource/memory_resource.h> #include <__numeric/reduce.h> -#include <__utility/exception_guard.h> #include <__utility/move.h> #include <__utility/terminate_on_exception.h> #include @@ -58,10 +58,7 @@ template _LIBCPP_HIDE_FROM_ABI void -__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { - auto __partitions = __libdispatch::__partition_chunks(__last - __first); - - // Perform the chunked execution. +__dispatch_parallel_for(__chunk_partitions __partitions, _RandomAccessIterator __first, _Functor __func) { __libdispatch::__dispatch_apply(__partitions.__chunk_count_, [&](size_t __chunk) { auto __this_chunk_size = __chunk == 0 ? __partitions.__first_chunk_size_ : __partitions.__chunk_size_; auto __index = @@ -72,6 +69,13 @@ }); } +template +_LIBCPP_HIDE_FROM_ABI void +__parallel_for(_RandomAccessIterator __first, _RandomAccessIterator __last, _Functor __func) { + return __libdispatch::__dispatch_parallel_for( + __libdispatch::__partition_chunks(__last - __first), std::move(__first), std::move(__func)); +} + template struct __merge_range { __merge_range(_RandomAccessIterator1 __mid1, _RandomAccessIterator2 __mid2, _RandomAccessIteratorOut __result) @@ -207,11 +211,92 @@ }); } -// TODO: parallelize this template _LIBCPP_HIDE_FROM_ABI void __parallel_stable_sort( _RandomAccessIterator __first, _RandomAccessIterator __last, _Comp __comp, _LeafSort __leaf_sort) { - __leaf_sort(__first, __last, __comp); + const auto __size = __last - __first; + auto __partitions = __libdispatch::__partition_chunks(__size); + + if (__partitions.__chunk_count_ == 0) + return; + + if (__partitions.__chunk_count_ == 1) + return __leaf_sort(__first, __last, __comp); + + using _Value = __iter_value_type<_RandomAccessIterator>; + + auto __destroy = [__size](_Value* __ptr) { + std::destroy_n(__ptr, __size); + std::allocator<_Value>().deallocate(__ptr, __size); + }; + + // TODO: use __uninitialized_buffer + unique_ptr<_Value[], decltype(__destroy)> __values(std::allocator<_Value>().allocate(__size), __destroy); + + return std::__terminate_on_exception([&] { + // Initialize all elements to a moved-from state + // TODO: Don't do this - this can be done in the first merge - see https://llvm.org/PR63928 + std::__construct_at(__values.get(), std::move(*__first)); + for (__iter_diff_t<_RandomAccessIterator> __i = 1; __i != __size; ++__i) { + std::__construct_at(__values.get() + __i, std::move(__values.get()[__i - 1])); + } + *__first = std::move(__values.get()[__size - 1]); + + __libdispatch::__dispatch_parallel_for( + __partitions, + __first, + [&__leaf_sort, &__comp](_RandomAccessIterator __chunk_first, _RandomAccessIterator __chunk_last) { + __leaf_sort(std::move(__chunk_first), std::move(__chunk_last), __comp); + }); + + bool __objects_are_in_buffer = false; + do { + const auto __old_chunk_size = __partitions.__chunk_size_; + if (__partitions.__chunk_count_ % 2 == 1) { + auto __inplace_merge_chunks = [&__comp, &__partitions](auto __first_chunk_begin) { + std::inplace_merge( + __first_chunk_begin, + __first_chunk_begin + __partitions.__first_chunk_size_, + __first_chunk_begin + __partitions.__first_chunk_size_ + __partitions.__chunk_size_, + __comp); + }; + if (__objects_are_in_buffer) + __inplace_merge_chunks(__values.get()); + else + __inplace_merge_chunks(__first); + __partitions.__first_chunk_size_ += 2 * __partitions.__chunk_size_; + } else { + __partitions.__first_chunk_size_ += __partitions.__chunk_size_; + } + + __partitions.__chunk_size_ *= 2; + __partitions.__chunk_count_ /= 2; + + auto __merge_chunks = [__partitions, __old_chunk_size, &__comp](auto __from_first, auto __to_first) { + __libdispatch::__dispatch_parallel_for( + __partitions, + __from_first, + [__old_chunk_size, &__from_first, &__to_first, &__comp](auto __chunk_first, auto __chunk_last) { + std::merge(std::make_move_iterator(__chunk_first), + std::make_move_iterator(__chunk_last - __old_chunk_size), + std::make_move_iterator(__chunk_last - __old_chunk_size), + std::make_move_iterator(__chunk_last), + __to_first + (__chunk_first - __from_first), + __comp); + }); + }; + + if (__objects_are_in_buffer) + __merge_chunks(__values.get(), __first); + else + __merge_chunks(__first, __values.get()); + __objects_are_in_buffer = !__objects_are_in_buffer; + } while (__partitions.__chunk_count_ > 1); + + if (__objects_are_in_buffer) { + std::move(__values.get(), __values.get() + __size, __first); + } + }); } _LIBCPP_HIDE_FROM_ABI inline void __cancel_execution() {} diff --git a/libcxx/src/pstl/libdispatch.cpp b/libcxx/src/pstl/libdispatch.cpp --- a/libcxx/src/pstl/libdispatch.cpp +++ b/libcxx/src/pstl/libdispatch.cpp @@ -31,6 +31,8 @@ partitions.__chunk_count_ = element_count / 256; partitions.__chunk_size_ = partitions.__chunk_count_ * 256; partitions.__first_chunk_size_ = element_count - (partitions.__chunk_count_ - 1) * partitions.__chunk_size_; + if (partitions.__chunk_count_ == 0 && element_count > 0) + partitions.__chunk_count_ = 1; return partitions; } diff --git a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp --- a/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp +++ b/libcxx/test/libcxx/algorithms/pstl.libdispatch.chunk_partitions.pass.cpp @@ -8,6 +8,7 @@ // +// TODO: find out why this doesn't work properly // REQUIRES-: libcpp-pstl-cpu-backend-libdispatch // ADDITIONAL_COMPILE_FLAGS: -Wno-private-header @@ -23,6 +24,7 @@ auto chunks = std::__par_backend::__libdispatch::__partition_chunks(i); assert(chunks.__chunk_count_ <= i); assert((chunks.__chunk_count_ - 1) * chunks.__chunk_size_ + chunks.__first_chunk_size_ == i); + assert(chunks.__chunk_count_ >= 1 || i == 0); } return 0; } diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.all_of/pstl.all_of.pass.cpp @@ -62,7 +62,7 @@ // check that a large number of elements works std::vector vec(100); std::fill(vec.begin(), vec.end(), 3); - assert(std::all_of(Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; })); + assert(std::all_of(policy, Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; })); } }; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.any_of/pstl.any_of.pass.cpp @@ -56,7 +56,7 @@ // check that a large number of elements works std::vector vec(100, 2); vec[96] = 3; - assert(std::any_of(Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; })); + assert(std::any_of(policy, Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i == 3; })); } }; diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.none_of/pstl.none_of.pass.cpp @@ -56,7 +56,7 @@ // check that a large number of elements works std::vector vec(100); std::fill(vec.begin(), vec.end(), 3); - assert(std::none_of(Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i != 3; })); + assert(std::none_of(policy, Iter(vec.data()), Iter(vec.data() + vec.size()), [](int i) { return i != 3; })); } }; diff --git a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp --- a/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp +++ b/libcxx/test/std/algorithms/alg.sorting/alg.sort/stable.sort/pstl.stable_sort.pass.cpp @@ -30,6 +30,7 @@ #include "test_macros.h" #include "test_execution_policies.h" #include "test_iterators.h" +#include "MoveOnly.h" EXECUTION_POLICY_SFINAE_TEST(stable_sort); @@ -45,9 +46,9 @@ auto operator>(const OrderedValue& rhs) const { return value > rhs.value; } }; -template -void test_one(std::array input, std::array expected) { - std::stable_sort(Iter(input.data()), Iter(input.data() + input.size())); +template ::value_type> +void test_one(Policy&& policy, std::array input, std::array expected) { + std::stable_sort(policy, Iter(input.data()), Iter(input.data() + input.size())); assert(input == expected); } @@ -55,27 +56,26 @@ struct Test { template void operator()(Policy&& policy) { - // Empty sequence. - test_one({}, {}); + test_one(policy, {}, {}); // 1-element sequence. - test_one({1}, {1}); + test_one(policy, {1}, {1}); // 2-element sequence. - test_one({2, 1}, {1, 2}); + test_one(policy, {2, 1}, {1, 2}); // 3-element sequence. - test_one({2, 1, 3}, {1, 2, 3}); + test_one(policy, {2, 1, 3}, {1, 2, 3}); // Longer sequence. - test_one({2, 1, 3, 6, 8, 4, 11, 5}, {1, 2, 3, 4, 5, 6, 8, 11}); + test_one(policy, {2, 1, 3, 6, 8, 4, 11, 5}, {1, 2, 3, 4, 5, 6, 8, 11}); // Longer sequence with duplicates. - test_one({2, 1, 3, 6, 2, 8, 6}, {1, 2, 2, 3, 6, 6, 8}); + test_one(policy, {2, 1, 3, 6, 2, 8, 6}, {1, 2, 2, 3, 6, 6, 8}); // All elements are the same. - test_one({1, 1, 1}, {1, 1, 1}); + test_one(policy, {1, 1, 1}, {1, 1, 1}); // Already sorted. - test_one({1, 2, 3, 4, 5}, {1, 2, 3, 4, 5}); + test_one(policy, {1, 2, 3, 4, 5}, {1, 2, 3, 4, 5}); // Reverse-sorted. - test_one({5, 4, 3, 2, 1}, {1, 2, 3, 4, 5}); + test_one(policy, {5, 4, 3, 2, 1}, {1, 2, 3, 4, 5}); // Repeating pattern. - test_one({1, 2, 1, 2, 1, 2}, {1, 1, 1, 2, 2, 2}); + test_one(policy, {1, 2, 1, 2, 1, 2}, {1, 1, 1, 2, 2, 2}); { // The sort is stable (equivalent elements remain in the same order). using V = OrderedValue; @@ -126,8 +126,25 @@ } }; +struct NotDefaultConstructible { + NotDefaultConstructible(int i) : i_(i) {} + + int i_; + + friend bool operator==(NotDefaultConstructible lhs, NotDefaultConstructible rhs) { + return lhs.i_ == rhs.i_; + } + + friend bool operator<(NotDefaultConstructible lhs, NotDefaultConstructible rhs) { + return lhs.i_ < rhs.i_; + } +}; + int main(int, char**) { - types::for_each(types::random_access_iterator_list{}, TestIteratorWithPolicies{}); + types::for_each(types::concatenate_t, + types::random_access_iterator_list, + types::random_access_iterator_list>{}, + TestIteratorWithPolicies{}); #ifndef TEST_HAS_NO_EXCEPTIONS std::set_terminate(terminate_successful);