diff --git a/libcxx/cmake/Modules/DefineLinkerScript.cmake b/libcxx/cmake/Modules/DefineLinkerScript.cmake --- a/libcxx/cmake/Modules/DefineLinkerScript.cmake +++ b/libcxx/cmake/Modules/DefineLinkerScript.cmake @@ -31,7 +31,7 @@ set(link_libraries) if (interface_libs) foreach(lib IN LISTS interface_libs) - if ("${lib}" STREQUAL "cxx-headers") + if ("${lib}" MATCHES "cxx-headers|ParallelSTL") continue() endif() # If ${lib} is not a target, we use a dummy target which we know will diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -179,7 +179,7 @@ endif() if (LIBCXX_ENABLE_PARALLEL_ALGORITHMS) - target_link_libraries(${name} PUBLIC pstl::ParallelSTL) + target_link_libraries(${name} PUBLIC pstl::ParallelSTL) endif() endfunction() diff --git a/pstl/CMakeLists.txt b/pstl/CMakeLists.txt --- a/pstl/CMakeLists.txt +++ b/pstl/CMakeLists.txt @@ -16,7 +16,7 @@ project(ParallelSTL VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} LANGUAGES CXX) -set(PSTL_PARALLEL_BACKEND "serial" CACHE STRING "Threading backend to use. Valid choices are 'serial' and 'tbb'. The default is 'serial'.") +set(PSTL_PARALLEL_BACKEND "serial" CACHE STRING "Threading backend to use. Valid choices are 'serial', 'omp', and 'tbb'. The default is 'serial'.") set(PSTL_HIDE_FROM_ABI_PER_TU OFF CACHE BOOL "Whether to constrain ABI-unstable symbols to each translation unit (basically, mark them with C's static keyword).") set(_PSTL_HIDE_FROM_ABI_PER_TU ${PSTL_HIDE_FROM_ABI_PER_TU}) # For __pstl_config_site @@ -43,6 +43,10 @@ message(STATUS "Parallel STL uses TBB ${TBB_VERSION} (interface version: ${TBB_INTERFACE_VERSION})") target_link_libraries(ParallelSTL INTERFACE TBB::tbb) set(_PSTL_PAR_BACKEND_TBB ON) +elseif (PSTL_PARALLEL_BACKEND STREQUAL "omp") + message(STATUS "Parallel STL uses the omp backend") + target_compile_options(ParallelSTL INTERFACE "-fopenmp=libomp") + set(_PSTL_PAR_BACKEND_OMP ON) else() message(FATAL_ERROR "Requested unknown Parallel STL backend '${PSTL_PARALLEL_BACKEND}'.") endif() @@ -62,8 +66,8 @@ ############################################################################### # Setup tests ############################################################################### -enable_testing() -add_subdirectory(test) +#enable_testing() +#add_subdirectory(test) ############################################################################### # Install the target and the associated CMake files diff --git a/pstl/include/__pstl_config_site.in b/pstl/include/__pstl_config_site.in --- a/pstl/include/__pstl_config_site.in +++ b/pstl/include/__pstl_config_site.in @@ -11,6 +11,7 @@ #cmakedefine _PSTL_PAR_BACKEND_SERIAL #cmakedefine _PSTL_PAR_BACKEND_TBB +#cmakedefine _PSTL_PAR_BACKEND_OMP #cmakedefine _PSTL_HIDE_FROM_ABI_PER_TU #endif // __PSTL_CONFIG_SITE diff --git a/pstl/include/pstl/internal/algorithm_impl.h b/pstl/include/pstl/internal/algorithm_impl.h --- a/pstl/include/pstl/internal/algorithm_impl.h +++ b/pstl/include/pstl/internal/algorithm_impl.h @@ -43,9 +43,9 @@ return std::any_of(__first, __last, __pred); }; -template +template bool -__brick_any_of(const _RandomAccessIterator __first, const _RandomAccessIterator __last, _Pred __pred, +__brick_any_of(const _ForwardIterator __first, const _ForwardIterator __last, _Pred __pred, /*__is_vector=*/std::true_type) noexcept { return __unseq_backend::__simd_or(__first, __last - __first, __pred); @@ -59,14 +59,14 @@ return __internal::__brick_any_of(__first, __last, __pred, __is_vector); } -template +template bool -__pattern_any_of(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, _Pred __pred, +__pattern_any_of(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Pred __pred, _IsVector __is_vector, /*parallel=*/std::true_type) { return __internal::__except_handler([&]() { return __internal::__parallel_or(std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__pred, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j) { + [__pred, __is_vector](_ForwardIterator __i, _ForwardIterator __j) { return __internal::__brick_any_of(__i, __j, __pred, __is_vector); }); }); @@ -113,15 +113,15 @@ __internal::__brick_walk1(__first, __last, __f, __is_vector); } -template +template void -__pattern_walk1(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, _Function __f, +__pattern_walk1(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f, _IsVector __is_vector, /*parallel=*/std::true_type) { __internal::__except_handler([&]() { __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__f, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j) { + [__f, __is_vector](_ForwardIterator __i, _ForwardIterator __j) { __internal::__brick_walk1(__i, __j, __f, __is_vector); }); }); @@ -135,16 +135,14 @@ __brick(__first, __last); } -template +template void -__pattern_walk_brick(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - _Brick __brick, +__pattern_walk_brick(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Brick __brick, /*parallel=*/std::true_type) { __internal::__except_handler([&]() { - __par_backend::__parallel_for( - std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__brick](_RandomAccessIterator __i, _RandomAccessIterator __j) { __brick(__i, __j); }); + __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __last, + [__brick](_ForwardIterator __i, _ForwardIterator __j) { __brick(__i, __j); }); }); } @@ -222,10 +220,9 @@ return __first2; } -template -_RandomAccessIterator2 -__brick_walk2(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _Function __f, +template +_ForwardIterator2 +__brick_walk2(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _Function __f, /*vector=*/std::true_type) noexcept { return __unseq_backend::__simd_walk_2(__first1, __last1 - __first1, __first2, __f); @@ -241,9 +238,9 @@ return __first2; } -template -_RandomAccessIterator2 -__brick_walk2_n(_RandomAccessIterator1 __first1, _Size __n, _RandomAccessIterator2 __first2, _Function __f, +template +_ForwardIterator2 +__brick_walk2_n(_ForwardIterator1 __first1, _Size __n, _ForwardIterator2 __first2, _Function __f, /*vector=*/std::true_type) noexcept { return __unseq_backend::__simd_walk_2(__first1, __n, __first2, __f); @@ -257,16 +254,15 @@ return __internal::__brick_walk2(__first1, __last1, __first2, __f, __is_vector); } -template -_RandomAccessIterator2 -__pattern_walk2(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _Function __f, _IsVector __is_vector, /*parallel=*/std::true_type) +template +_ForwardIterator2 +__pattern_walk2(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _Function __f, _IsVector __is_vector, /*parallel=*/std::true_type) { return __internal::__except_handler([&]() { __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, - [__f, __first1, __first2, __is_vector](_RandomAccessIterator1 __i, _RandomAccessIterator1 __j) { + [__f, __first1, __first2, __is_vector](_ForwardIterator1 __i, _ForwardIterator1 __j) { __internal::__brick_walk2(__i, __j, __first2 + (__i - __first1), __f, __is_vector); }); return __first2 + (__last1 - __first1); @@ -517,19 +513,19 @@ return __internal::__brick_find_if(__first, __last, __pred, __is_vector); } -template -_RandomAccessIterator -__pattern_find_if(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - _Predicate __pred, _IsVector __is_vector, +template +_ForwardIterator +__pattern_find_if(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred, + _IsVector __is_vector, /*is_parallel=*/std::true_type) { return __internal::__except_handler([&]() { return __internal::__parallel_find( std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__pred, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j) { + [__pred, __is_vector](_ForwardIterator __i, _ForwardIterator __j) { return __internal::__brick_find_if(__i, __j, __pred, __is_vector); }, - std::less::difference_type>(), + std::less::difference_type>(), /*is_first=*/true); }); } @@ -638,10 +634,10 @@ return std::find_end(__first, __last, __s_first, __s_last, __pred); } -template -_RandomAccessIterator1 -__brick_find_end(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __s_first, - _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, /*__is_vector=*/std::true_type) noexcept +template +_ForwardIterator1 +__brick_find_end(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first, + _ForwardIterator2 __s_last, _BinaryPredicate __pred, /*__is_vector=*/std::true_type) noexcept { return __find_subrange(__first, __last, __last, __s_first, __s_last, __pred, false, std::true_type()); } @@ -656,11 +652,11 @@ return __internal::__brick_find_end(__first, __last, __s_first, __s_last, __pred, __is_vector); } -template -_RandomAccessIterator1 -__pattern_find_end(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __s_first, _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, +_ForwardIterator1 +__pattern_find_end(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, + _ForwardIterator2 __s_first, _ForwardIterator2 __s_last, _BinaryPredicate __pred, _IsVector __is_vector, /*is_parallel=*/std::true_type) noexcept { if (__last - __first == __s_last - __s_first) @@ -674,13 +670,11 @@ return __internal::__except_handler([&]() { return __internal::__parallel_find( std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__last, __s_first, __s_last, __pred, __is_vector](_RandomAccessIterator1 __i, - _RandomAccessIterator1 __j) { + [__last, __s_first, __s_last, __pred, __is_vector](_ForwardIterator1 __i, _ForwardIterator1 __j) { return __internal::__find_subrange(__i, __j, __last, __s_first, __s_last, __pred, false, __is_vector); }, - std::greater::difference_type>(), - /*is_first=*/false); + std::greater::difference_type>(), /*is_first=*/false); }); } } @@ -696,10 +690,10 @@ return std::find_first_of(__first, __last, __s_first, __s_last, __pred); } -template -_RandomAccessIterator1 -__brick_find_first_of(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __s_first, - _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, /*__is_vector=*/std::true_type) noexcept +template +_ForwardIterator1 +__brick_find_first_of(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first, + _ForwardIterator2 __s_last, _BinaryPredicate __pred, /*__is_vector=*/std::true_type) noexcept { return __unseq_backend::__simd_find_first_of(__first, __last, __s_first, __s_last, __pred); } @@ -714,38 +708,38 @@ return __internal::__brick_find_first_of(__first, __last, __s_first, __s_last, __pred, __is_vector); } -template -_RandomAccessIterator1 -__pattern_find_first_of(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __s_first, _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, +_ForwardIterator1 +__pattern_find_first_of(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, + _ForwardIterator2 __s_first, _ForwardIterator2 __s_last, _BinaryPredicate __pred, _IsVector __is_vector, /*is_parallel=*/std::true_type) noexcept { return __internal::__except_handler([&]() { return __internal::__parallel_find( std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__s_first, __s_last, __pred, __is_vector](_RandomAccessIterator1 __i, _RandomAccessIterator1 __j) { + [__s_first, __s_last, __pred, __is_vector](_ForwardIterator1 __i, _ForwardIterator1 __j) { return __internal::__brick_find_first_of(__i, __j, __s_first, __s_last, __pred, __is_vector); }, - std::less::difference_type>(), /*is_first=*/true); + std::less::difference_type>(), /*is_first=*/true); }); } //------------------------------------------------------------------------ // search //------------------------------------------------------------------------ -template -_RandomAccessIterator1 -__brick_search(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __s_first, - _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, /*vector=*/std::false_type) noexcept +template +_ForwardIterator1 +__brick_search(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first, + _ForwardIterator2 __s_last, _BinaryPredicate __pred, /*vector=*/std::false_type) noexcept { return std::search(__first, __last, __s_first, __s_last, __pred); } -template -_RandomAccessIterator1 -__brick_search(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __s_first, - _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, /*vector=*/std::true_type) noexcept +template +_ForwardIterator1 +__brick_search(_ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __s_first, + _ForwardIterator2 __s_last, _BinaryPredicate __pred, /*vector=*/std::true_type) noexcept { return __internal::__find_subrange(__first, __last, __last, __s_first, __s_last, __pred, true, std::true_type()); } @@ -760,11 +754,11 @@ return __internal::__brick_search(__first, __last, __s_first, __s_last, __pred, __is_vector); } -template -_RandomAccessIterator1 -__pattern_search(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __s_first, _RandomAccessIterator2 __s_last, _BinaryPredicate __pred, +_ForwardIterator1 +__pattern_search(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, + _ForwardIterator2 __s_first, _ForwardIterator2 __s_last, _BinaryPredicate __pred, _IsVector __is_vector, /*is_parallel=*/std::true_type) noexcept { @@ -779,12 +773,11 @@ return __internal::__except_handler([&]() { return __internal::__parallel_find( std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__last, __s_first, __s_last, __pred, __is_vector](_RandomAccessIterator1 __i, - _RandomAccessIterator1 __j) { + [__last, __s_first, __s_last, __pred, __is_vector](_ForwardIterator1 __i, _ForwardIterator1 __j) { return __internal::__find_subrange(__i, __j, __last, __s_first, __s_last, __pred, true, __is_vector); }, - std::less::difference_type>(), /*is_first=*/true); + std::less::difference_type>(), /*is_first=*/true); }); } } @@ -800,9 +793,9 @@ return std::search_n(__first, __last, __count, __value, __pred); } -template -_RandomAccessIterator -__brick_search_n(_RandomAccessIterator __first, _RandomAccessIterator __last, _Size __count, const _Tp& __value, +template +_ForwardIterator +__brick_search_n(_ForwardIterator __first, _ForwardIterator __last, _Size __count, const _Tp& __value, _BinaryPredicate __pred, /*vector=*/std::true_type) noexcept { return __internal::__find_subrange(__first, __last, __last, __count, __value, __pred, std::true_type()); @@ -857,14 +850,12 @@ return std::copy_n(__first, __n, __result); } -template -_RandomAccessIterator2 -__brick_copy_n(_RandomAccessIterator1 __first, _Size __n, _RandomAccessIterator2 __result, - /*vector=*/std::true_type) noexcept +template +_OutputIterator +__brick_copy_n(_ForwardIterator __first, _Size __n, _OutputIterator __result, /*vector=*/std::true_type) noexcept { return __unseq_backend::__simd_assign( - __first, __n, __result, - [](_RandomAccessIterator1 __first, _RandomAccessIterator2 __result) { *__result = *__first; }); + __first, __n, __result, [](_ForwardIterator __first, _OutputIterator __result) { *__result = *__first; }); } //------------------------------------------------------------------------ @@ -878,14 +869,14 @@ return std::copy(__first, __last, __result); } -template -_RandomAccessIterator2 -__brick_copy(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, +template +_OutputIterator +__brick_copy(_RandomAccessIterator __first, _RandomAccessIterator __last, _OutputIterator __result, /*vector=*/std::true_type) noexcept { return __unseq_backend::__simd_assign( __first, __last - __first, __result, - [](_RandomAccessIterator1 __first, _RandomAccessIterator2 __result) { *__result = *__first; }); + [](_RandomAccessIterator __first, _OutputIterator __result) { *__result = *__first; }); } //------------------------------------------------------------------------ @@ -899,38 +890,36 @@ return std::move(__first, __last, __result); } -template -_RandomAccessIterator2 -__brick_move(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, +template +_OutputIterator +__brick_move(_RandomAccessIterator __first, _RandomAccessIterator __last, _OutputIterator __result, /*vector=*/std::true_type) noexcept { return __unseq_backend::__simd_assign( __first, __last - __first, __result, - [](_RandomAccessIterator1 __first, _RandomAccessIterator2 __result) { *__result = std::move(*__first); }); + [](_RandomAccessIterator __first, _OutputIterator __result) { *__result = std::move(*__first); }); } struct __brick_move_destroy { - template - _RandomAccessIterator2 - operator()(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, - /*vec*/ std::true_type) const + template + _OutputIterator + operator()(_Iterator __first, _Iterator __last, _OutputIterator __result, /*vec*/ std::true_type) const { - using _IteratorValueType = typename std::iterator_traits<_RandomAccessIterator1>::value_type; + using _IteratorValueType = typename std::iterator_traits<_Iterator>::value_type; return __unseq_backend::__simd_assign(__first, __last - __first, __result, - [](_RandomAccessIterator1 __first, _RandomAccessIterator2 __result) { + [](_Iterator __first, _OutputIterator __result) { *__result = std::move(*__first); (*__first).~_IteratorValueType(); }); } - template - _RandomAccessIterator2 - operator()(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, - /*vec*/ std::false_type) const + template + _OutputIterator + operator()(_Iterator __first, _Iterator __last, _OutputIterator __result, /*vec*/ std::false_type) const { - using _IteratorValueType = typename std::iterator_traits<_RandomAccessIterator1>::value_type; + using _IteratorValueType = typename std::iterator_traits<_Iterator>::value_type; for (; __first != __last; ++__first, ++__result) { @@ -952,14 +941,14 @@ return std::swap_ranges(__first, __last, __result); } -template -_RandomAccessIterator2 -__brick_swap_ranges(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, +template +_OutputIterator +__brick_swap_ranges(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, /*vector=*/std::true_type) noexcept { using std::iter_swap; return __unseq_backend::__simd_assign(__first, __last - __first, __result, - iter_swap<_RandomAccessIterator1, _RandomAccessIterator2>); + iter_swap<_ForwardIterator, _OutputIterator>); } //------------------------------------------------------------------------ @@ -973,10 +962,9 @@ return std::copy_if(__first, __last, __result, __pred); } -template -_RandomAccessIterator2 -__brick_copy_if(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, - _UnaryPredicate __pred, +template +_OutputIterator +__brick_copy_if(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, _UnaryPredicate __pred, /*vector=*/std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT) @@ -1033,9 +1021,9 @@ } } -template +template void -__brick_copy_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, +__brick_copy_by_mask(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator __result, bool* __restrict __mask, _Assigner __assigner, /*vector=*/std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT) @@ -1065,11 +1053,10 @@ } } -template +template void -__brick_partition_by_mask(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __out_true, _RandomAccessIterator3 __out_false, bool* __mask, - /*vector=*/std::true_type) noexcept +__brick_partition_by_mask(_RandomAccessIterator __first, _RandomAccessIterator __last, _OutputIterator1 __out_true, + _OutputIterator2 __out_false, bool* __mask, /*vector=*/std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT) __unseq_backend::__simd_partition_by_mask(__first, __last - __first, __out_true, __out_false, __mask); @@ -1086,14 +1073,13 @@ return __internal::__brick_copy_if(__first, __last, __result, __pred, __is_vector); } -template -_RandomAccessIterator2 -__pattern_copy_if(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __result, _UnaryPredicate __pred, _IsVector __is_vector, - /*parallel=*/std::true_type) +_OutputIterator +__pattern_copy_if(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, + _OutputIterator __result, _UnaryPredicate __pred, _IsVector __is_vector, /*parallel=*/std::true_type) { - typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType; const _DifferenceType __n = __last - __first; if (_DifferenceType(1) < __n) { @@ -1112,7 +1098,7 @@ [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { // Scan __internal::__brick_copy_by_mask( __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, - [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, __is_vector); + [](_RandomAccessIterator __x, _OutputIterator __z) { *__z = *__x; }, __is_vector); }, [&__m](_DifferenceType __total) { __m = __total; }); return __result + __m; @@ -1125,9 +1111,9 @@ //------------------------------------------------------------------------ // count //------------------------------------------------------------------------ -template -typename std::iterator_traits<_RandomAccessIterator>::difference_type -__brick_count(_RandomAccessIterator __first, _RandomAccessIterator __last, _Predicate __pred, +template +typename std::iterator_traits<_ForwardIterator>::difference_type +__brick_count(_ForwardIterator __first, _ForwardIterator __last, _Predicate __pred, /* is_vector = */ std::true_type) noexcept { return __unseq_backend::__simd_count(__first, __last - __first, __pred); @@ -1149,18 +1135,18 @@ return __internal::__brick_count(__first, __last, __pred, __is_vector); } -template -typename std::iterator_traits<_RandomAccessIterator>::difference_type -__pattern_count(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - _Predicate __pred, +template +typename std::iterator_traits<_ForwardIterator>::difference_type +__pattern_count(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Predicate __pred, /* is_parallel */ std::true_type, _IsVector __is_vector) { - typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _SizeType; + typedef typename std::iterator_traits<_ForwardIterator>::difference_type _SizeType; return __internal::__except_handler([&]() { return __par_backend::__parallel_reduce( std::forward<_ExecutionPolicy>(__exec), __first, __last, _SizeType(0), - [__pred, __is_vector](_RandomAccessIterator __begin, _RandomAccessIterator __end, _SizeType __value) - -> _SizeType { return __value + __internal::__brick_count(__begin, __end, __pred, __is_vector); }, + [__pred, __is_vector](_ForwardIterator __begin, _ForwardIterator __end, _SizeType __value) -> _SizeType { + return __value + __internal::__brick_count(__begin, __end, __pred, __is_vector); + }, std::plus<_SizeType>()); }); } @@ -1169,17 +1155,17 @@ // unique //------------------------------------------------------------------------ -template -_RandomAccessIterator -__brick_unique(_RandomAccessIterator __first, _RandomAccessIterator __last, _BinaryPredicate __pred, +template +_ForwardIterator +__brick_unique(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred, /*is_vector=*/std::false_type) noexcept { return std::unique(__first, __last, __pred); } -template -_RandomAccessIterator -__brick_unique(_RandomAccessIterator __first, _RandomAccessIterator __last, _BinaryPredicate __pred, +template +_ForwardIterator +__brick_unique(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred, /*is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -1221,8 +1207,8 @@ return __local_min; } // find first iterator that should be removed - bool* __result = __internal::__brick_find_if(__mask + __i, __mask + __j, - [](bool __val) { return !__val; }, __is_vector); + bool* __result = __internal::__brick_find_if( + __mask + __i, __mask + __j, [](bool __val) { return !__val; }, __is_vector); if (__result - __mask == __j) { return __local_min; @@ -1249,16 +1235,17 @@ __par_backend::__parallel_strict_scan( std::forward<_ExecutionPolicy>(__exec), __n, _DifferenceType(0), [__mask, __is_vector](_DifferenceType __i, _DifferenceType __len) { - return __internal::__brick_count(__mask + __i, __mask + __i + __len, [](bool __val) { return __val; }, - __is_vector); + return __internal::__brick_count( + __mask + __i, __mask + __i + __len, [](bool __val) { return __val; }, __is_vector); }, std::plus<_DifferenceType>(), [=](_DifferenceType __i, _DifferenceType __len, _DifferenceType __initial) { __internal::__brick_copy_by_mask( __first + __i, __first + __i + __len, __result + __initial, __mask + __i, [](_ForwardIterator __x, _Tp* __z) { - __internal::__invoke_if_else(std::is_trivial<_Tp>(), [&]() { *__z = std::move(*__x); }, - [&]() { ::new (std::addressof(*__z)) _Tp(std::move(*__x)); }); + __internal::__invoke_if_else( + std::is_trivial<_Tp>(), [&]() { *__z = std::move(*__x); }, + [&]() { ::new (std::addressof(*__z)) _Tp(std::move(*__x)); }); }, __is_vector); }, @@ -1268,20 +1255,20 @@ __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __result, __result + __m, [__result, __first, __is_vector](_Tp* __i, _Tp* __j) { - __invoke_if_else(std::is_trivial<_Tp>(), - [&]() { __brick_move(__i, __j, __first + (__i - __result), __is_vector); }, - [&]() { __brick_move_destroy()(__i, __j, __first + (__i - __result), __is_vector); }); + __invoke_if_else( + std::is_trivial<_Tp>(), [&]() { __brick_move(__i, __j, __first + (__i - __result), __is_vector); }, + [&]() { __brick_move_destroy()(__i, __j, __first + (__i - __result), __is_vector); }); }); return __first + __m; }); } -template -_RandomAccessIterator -__pattern_unique(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - _BinaryPredicate __pred, _IsVector __is_vector, /*is_parallel=*/std::true_type) noexcept +template +_ForwardIterator +__pattern_unique(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred, + _IsVector __is_vector, /*is_parallel=*/std::true_type) noexcept { - typedef typename std::iterator_traits<_RandomAccessIterator>::reference _ReferenceType; + typedef typename std::iterator_traits<_ForwardIterator>::reference _ReferenceType; if (__first == __last) { @@ -1294,7 +1281,7 @@ } return __internal::__remove_elements( std::forward<_ExecutionPolicy>(__exec), ++__first, __last, - [&__pred, __is_vector](bool* __b, bool* __e, _RandomAccessIterator __it) { + [&__pred, __is_vector](bool* __b, bool* __e, _ForwardIterator __it) { __internal::__brick_walk3( __b, __e, __it - 1, __it, [&__pred](bool& __x, _ReferenceType __y, _ReferenceType __z) { __x = !__pred(__y, __z); }, __is_vector); @@ -1314,9 +1301,9 @@ return std::unique_copy(__first, __last, __result, __pred); } -template -_RandomAccessIterator2 -__brick_unique_copy(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, +template +OutputIterator +__brick_unique_copy(_RandomAccessIterator __first, _RandomAccessIterator __last, OutputIterator __result, _BinaryPredicate __pred, /*vector=*/std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT) @@ -1357,14 +1344,14 @@ return __unseq_backend::__simd_calc_mask_2(__first, __last - __first, __mask, __pred); } -template -_RandomAccessIterator2 -__pattern_unique_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __result, _BinaryPredicate __pred, _IsVector __is_vector, +_OutputIterator +__pattern_unique_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, + _OutputIterator __result, _BinaryPredicate __pred, _IsVector __is_vector, /*parallel=*/std::true_type) { - typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType; const _DifferenceType __n = __last - __first; if (_DifferenceType(2) < __n) { @@ -1396,7 +1383,7 @@ // Phase 2 is same as for __pattern_copy_if __internal::__brick_copy_by_mask( __first + __i, __first + (__i + __len), __result + __initial, __mask + __i, - [](_RandomAccessIterator1 __x, _RandomAccessIterator2 __z) { *__z = *__x; }, __is_vector); + [](_RandomAccessIterator __x, _OutputIterator __z) { *__z = *__x; }, __is_vector); }, [&__m](_DifferenceType __total) { __m = __total; }); return __result + __m; @@ -1417,14 +1404,14 @@ std::reverse(__first, __last); } -template +template void -__brick_reverse(_RandomAccessIterator __first, _RandomAccessIterator __last, /*__is_vector=*/std::true_type) noexcept +__brick_reverse(_BidirectionalIterator __first, _BidirectionalIterator __last, /*__is_vector=*/std::true_type) noexcept { - typedef typename std::iterator_traits<_RandomAccessIterator>::reference _ReferenceType; + typedef typename std::iterator_traits<_BidirectionalIterator>::reference _ReferenceType; const auto __n = (__last - __first) / 2; - __unseq_backend::__simd_walk_2(__first, __n, std::reverse_iterator<_RandomAccessIterator>(__last), + __unseq_backend::__simd_walk_2(__first, __n, std::reverse_iterator<_BidirectionalIterator>(__last), [](_ReferenceType __x, _ReferenceType __y) { using std::swap; swap(__x, __y); @@ -1445,14 +1432,14 @@ } // this brick is called in parallel version, so we can use iterator arithmetic -template +template void -__brick_reverse(_RandomAccessIterator __first, _RandomAccessIterator __last, _RandomAccessIterator __d_last, +__brick_reverse(_BidirectionalIterator __first, _BidirectionalIterator __last, _BidirectionalIterator __d_last, /*is_vector=*/std::true_type) noexcept { - typedef typename std::iterator_traits<_RandomAccessIterator>::reference _ReferenceType; + typedef typename std::iterator_traits<_BidirectionalIterator>::reference _ReferenceType; - __unseq_backend::__simd_walk_2(__first, __last - __first, std::reverse_iterator<_RandomAccessIterator>(__d_last), + __unseq_backend::__simd_walk_2(__first, __last - __first, std::reverse_iterator<_BidirectionalIterator>(__d_last), [](_ReferenceType __x, _ReferenceType __y) { using std::swap; swap(__x, __y); @@ -1468,14 +1455,14 @@ __internal::__brick_reverse(__first, __last, _is_vector); } -template +template void -__pattern_reverse(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, +__pattern_reverse(_ExecutionPolicy&& __exec, _BidirectionalIterator __first, _BidirectionalIterator __last, _IsVector __is_vector, /*is_parallel=*/std::true_type) { __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __first, __first + (__last - __first) / 2, - [__is_vector, __first, __last](_RandomAccessIterator __inner_first, _RandomAccessIterator __inner_last) { + [__is_vector, __first, __last](_BidirectionalIterator __inner_first, _BidirectionalIterator __inner_last) { __internal::__brick_reverse(__inner_first, __inner_last, __last - (__inner_first - __first), __is_vector); }); } @@ -1492,15 +1479,15 @@ return std::reverse_copy(__first, __last, __d_first); } -template -_RandomAccessIterator2 -__brick_reverse_copy(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __d_first, +template +_OutputIterator +__brick_reverse_copy(_BidirectionalIterator __first, _BidirectionalIterator __last, _OutputIterator __d_first, /*is_vector=*/std::true_type) noexcept { - typedef typename std::iterator_traits<_RandomAccessIterator1>::reference _ReferenceType1; - typedef typename std::iterator_traits<_RandomAccessIterator2>::reference _ReferenceType2; + typedef typename std::iterator_traits<_BidirectionalIterator>::reference _ReferenceType1; + typedef typename std::iterator_traits<_OutputIterator>::reference _ReferenceType2; - return __unseq_backend::__simd_walk_2(std::reverse_iterator<_RandomAccessIterator1>(__last), __last - __first, + return __unseq_backend::__simd_walk_2(std::reverse_iterator<_BidirectionalIterator>(__last), __last - __first, __d_first, [](_ReferenceType1 __x, _ReferenceType2 __y) { __y = __x; }); } @@ -1512,15 +1499,15 @@ return __internal::__brick_reverse_copy(__first, __last, __d_first, __is_vector); } -template -_RandomAccessIterator2 -__pattern_reverse_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __d_first, _IsVector __is_vector, /*is_parallel=*/std::true_type) +template +_OutputIterator +__pattern_reverse_copy(_ExecutionPolicy&& __exec, _BidirectionalIterator __first, _BidirectionalIterator __last, + _OutputIterator __d_first, _IsVector __is_vector, /*is_parallel=*/std::true_type) { auto __len = __last - __first; __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__is_vector, __first, __len, __d_first](_RandomAccessIterator1 __inner_first, - _RandomAccessIterator1 __inner_last) { + [__is_vector, __first, __len, __d_first](_BidirectionalIterator __inner_first, + _BidirectionalIterator __inner_last) { __internal::__brick_reverse_copy(__inner_first, __inner_last, __d_first + (__len - (__inner_last - __first)), __is_vector); @@ -1544,14 +1531,14 @@ #endif } -template -_RandomAccessIterator -__brick_rotate(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last, +template +_ForwardIterator +__brick_rotate(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, /*is_vector=*/std::true_type) noexcept { auto __n = __last - __first; auto __m = __middle - __first; - const _RandomAccessIterator __ret = __first + (__last - __middle); + const _ForwardIterator __ret = __first + (__last - __middle); bool __is_left = (__m <= __n / 2); if (!__is_left) @@ -1566,7 +1553,7 @@ for (; __last - __first >= __m_2; __first += __m) { __unseq_backend::__simd_assign(__first, __m, __first + __m, - iter_swap<_RandomAccessIterator, _RandomAccessIterator>); + iter_swap<_ForwardIterator, _ForwardIterator>); } } else @@ -1574,7 +1561,7 @@ for (; __last - __first >= __m_2; __last -= __m) { __unseq_backend::__simd_assign(__last - __m, __m, __last - __m_2, - iter_swap<_RandomAccessIterator, _RandomAccessIterator>); + iter_swap<_ForwardIterator, _ForwardIterator>); } } __is_left = !__is_left; @@ -1593,12 +1580,12 @@ return __internal::__brick_rotate(__first, __middle, __last, __is_vector); } -template -_RandomAccessIterator -__pattern_rotate(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __middle, - _RandomAccessIterator __last, _IsVector __is_vector, /*is_parallel=*/std::true_type) +template +_ForwardIterator +__pattern_rotate(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __middle, + _ForwardIterator __last, _IsVector __is_vector, /*is_parallel=*/std::true_type) { - typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _Tp; + typedef typename std::iterator_traits<_ForwardIterator>::value_type _Tp; auto __n = __last - __first; auto __m = __middle - __first; if (__m <= __n / 2) @@ -1608,15 +1595,15 @@ _Tp* __result = __buf.get(); __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __middle, __last, - [__middle, __result, __is_vector](_RandomAccessIterator __b, _RandomAccessIterator __e) { + [__middle, __result, __is_vector](_ForwardIterator __b, _ForwardIterator __e) { __internal::__brick_uninitialized_move(__b, __e, __result + (__b - __middle), __is_vector); }); - __par_backend::__parallel_for( - std::forward<_ExecutionPolicy>(__exec), __first, __middle, - [__last, __middle, __is_vector](_RandomAccessIterator __b, _RandomAccessIterator __e) { - __internal::__brick_move(__b, __e, __b + (__last - __middle), __is_vector); - }); + __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __middle, + [__last, __middle, __is_vector](_ForwardIterator __b, _ForwardIterator __e) { + __internal::__brick_move(__b, __e, __b + (__last - __middle), + __is_vector); + }); __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __result, __result + (__n - __m), [__first, __result, __is_vector](_Tp* __b, _Tp* __e) { @@ -1631,17 +1618,17 @@ __par_backend::__buffer<_Tp> __buf(__m); return __internal::__except_handler([&__exec, __n, __m, __first, __middle, __last, __is_vector, &__buf]() { _Tp* __result = __buf.get(); - __par_backend::__parallel_for( - std::forward<_ExecutionPolicy>(__exec), __first, __middle, - [__first, __result, __is_vector](_RandomAccessIterator __b, _RandomAccessIterator __e) { - __internal::__brick_uninitialized_move(__b, __e, __result + (__b - __first), __is_vector); - }); + __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __middle, + [__first, __result, __is_vector](_ForwardIterator __b, _ForwardIterator __e) { + __internal::__brick_uninitialized_move( + __b, __e, __result + (__b - __first), __is_vector); + }); - __par_backend::__parallel_for( - std::forward<_ExecutionPolicy>(__exec), __middle, __last, - [__first, __middle, __is_vector](_RandomAccessIterator __b, _RandomAccessIterator __e) { - __internal::__brick_move(__b, __e, __first + (__b - __middle), __is_vector); - }); + __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __middle, __last, + [__first, __middle, __is_vector](_ForwardIterator __b, _ForwardIterator __e) { + __internal::__brick_move(__b, __e, __first + (__b - __middle), + __is_vector); + }); __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __result, __result + __m, [__n, __m, __first, __result, __is_vector](_Tp* __b, _Tp* __e) { @@ -1666,12 +1653,12 @@ return std::rotate_copy(__first, __middle, __last, __result); } -template -_RandomAccessIterator2 -__brick_rotate_copy(_RandomAccessIterator1 __first, _RandomAccessIterator1 __middle, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __result, /*__is_vector=*/std::true_type) noexcept +template +_OutputIterator +__brick_rotate_copy(_ForwardIterator __first, _ForwardIterator __middle, _ForwardIterator __last, + _OutputIterator __result, /*__is_vector=*/std::true_type) noexcept { - _RandomAccessIterator2 __res = __internal::__brick_copy(__middle, __last, __result, std::true_type()); + _OutputIterator __res = __internal::__brick_copy(__middle, __last, __result, std::true_type()); return __internal::__brick_copy(__first, __middle, __res, std::true_type()); } @@ -1683,22 +1670,22 @@ return __internal::__brick_rotate_copy(__first, __middle, __last, __result, __is_vector); } -template -_RandomAccessIterator2 -__pattern_rotate_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __middle, - _RandomAccessIterator1 __last, _RandomAccessIterator2 __result, _IsVector __is_vector, +template +_OutputIterator +__pattern_rotate_copy(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __middle, + _ForwardIterator __last, _OutputIterator __result, _IsVector __is_vector, /*is_parallel=*/std::true_type) { __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__first, __last, __middle, __result, __is_vector](_RandomAccessIterator1 __b, _RandomAccessIterator1 __e) { + [__first, __last, __middle, __result, __is_vector](_ForwardIterator __b, _ForwardIterator __e) { if (__b > __middle) { __internal::__brick_copy(__b, __e, __result + (__b - __middle), __is_vector); } else { - _RandomAccessIterator2 __new_result = __result + ((__last - __middle) + (__b - __first)); + _OutputIterator __new_result = __result + ((__last - __middle) + (__b - __first)); if (__e < __middle) { __internal::__brick_copy(__b, __e, __new_result, __is_vector); @@ -1725,21 +1712,21 @@ return std::is_partitioned(__first, __last, __pred); } -template +template bool -__brick_is_partitioned(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryPredicate __pred, +__brick_is_partitioned(_ForwardIterator __first, _ForwardIterator __last, _UnaryPredicate __pred, /*is_vector=*/std::true_type) noexcept { - typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _SizeType; + typedef typename std::iterator_traits<_ForwardIterator>::difference_type _SizeType; if (__first == __last) { return true; } else { - _RandomAccessIterator __result = __unseq_backend::__simd_first( + _ForwardIterator __result = __unseq_backend::__simd_first( __first, _SizeType(0), __last - __first, - [&__pred](_RandomAccessIterator __it, _SizeType __i) { return !__pred(__it[__i]); }); + [&__pred](_ForwardIterator __it, _SizeType __i) { return !__pred(__it[__i]); }); if (__result == __last) { return true; @@ -1760,9 +1747,9 @@ return __internal::__brick_is_partitioned(__first, __last, __pred, __is_vector); } -template +template bool -__pattern_is_partitioned(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, +__pattern_is_partitioned(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _UnaryPredicate __pred, _IsVector __is_vector, /*is_parallel=*/std::true_type) { if (__first == __last) @@ -1795,7 +1782,7 @@ __init = __par_backend::__parallel_reduce( std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, - [&__pred, &__table, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j, + [&__pred, &__table, __is_vector](_ForwardIterator __i, _ForwardIterator __j, _ReduceType __value) -> _ReduceType { if (__value == __broken) { @@ -1806,12 +1793,12 @@ if (__pred(*__i)) { // find first element that don't satisfy pred - _RandomAccessIterator __x = + _ForwardIterator __x = __internal::__brick_find_if(__i + 1, __j, std::not_fn(__pred), __is_vector); if (__x != __j) { // find first element after "x" that satisfy pred - _RandomAccessIterator __y = __internal::__brick_find_if(__x + 1, __j, __pred, __is_vector); + _ForwardIterator __y = __internal::__brick_find_if(__x + 1, __j, __pred, __is_vector); // if it was found then range isn't partitioned by pred if (__y != __j) { @@ -1869,9 +1856,9 @@ return std::partition(__first, __last, __pred); } -template -_RandomAccessIterator -__brick_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryPredicate __pred, +template +_ForwardIterator +__brick_partition(_ForwardIterator __first, _ForwardIterator __last, _UnaryPredicate __pred, /*is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -1886,9 +1873,9 @@ return __internal::__brick_partition(__first, __last, __pred, __is_vector); } -template -_RandomAccessIterator -__pattern_partition(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, +template +_ForwardIterator +__pattern_partition(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _UnaryPredicate __pred, _IsVector __is_vector, /*is_parallel=*/std::true_type) { @@ -1896,9 +1883,9 @@ // elements after pivot don't satisfy pred (false part) struct _PartitionRange { - _RandomAccessIterator __begin; - _RandomAccessIterator __pivot; - _RandomAccessIterator __end; + _ForwardIterator __begin; + _ForwardIterator __pivot; + _ForwardIterator __end; }; return __internal::__except_handler([&]() { @@ -1921,7 +1908,7 @@ { __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __val1.__pivot, __val1.__pivot + __size1, - [__val1, __val2, __size1, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j) { + [__val1, __val2, __size1, __is_vector](_ForwardIterator __i, _ForwardIterator __j) { __internal::__brick_swap_ranges(__i, __j, (__val2.__pivot - __size1) + (__i - __val1.__pivot), __is_vector); }); @@ -1932,7 +1919,7 @@ { __par_backend::__parallel_for( std::forward<_ExecutionPolicy>(__exec), __val1.__pivot, __val1.__pivot + __size2, - [__val1, __val2, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j) { + [__val1, __val2, __is_vector](_ForwardIterator __i, _ForwardIterator __j) { __internal::__brick_swap_ranges(__i, __j, __val2.__begin + (__i - __val1.__pivot), __is_vector); }); return {__new_begin, __val1.__pivot + __size2, __val2.__end}; @@ -1941,10 +1928,10 @@ _PartitionRange __result = __par_backend::__parallel_reduce( std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, - [__pred, __is_vector, __reductor](_RandomAccessIterator __i, _RandomAccessIterator __j, + [__pred, __is_vector, __reductor](_ForwardIterator __i, _ForwardIterator __j, _PartitionRange __value) -> _PartitionRange { //1. serial partition - _RandomAccessIterator __pivot = __internal::__brick_partition(__i, __j, __pred, __is_vector); + _ForwardIterator __pivot = __internal::__brick_partition(__i, __j, __pred, __is_vector); // 2. merging of two ranges (left and right respectively) return __reductor(__value, {__i, __pivot, __j}); @@ -1966,9 +1953,9 @@ return std::stable_partition(__first, __last, __pred); } -template -_RandomAccessIterator -__brick_stable_partition(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryPredicate __pred, +template +_BidirectionalIterator +__brick_stable_partition(_BidirectionalIterator __first, _BidirectionalIterator __last, _UnaryPredicate __pred, /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -1984,9 +1971,9 @@ return __internal::__brick_stable_partition(__first, __last, __pred, __is_vector); } -template -_RandomAccessIterator -__pattern_stable_partition(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, +template +_BidirectionalIterator +__pattern_stable_partition(_ExecutionPolicy&& __exec, _BidirectionalIterator __first, _BidirectionalIterator __last, _UnaryPredicate __pred, _IsVector __is_vector, /*is_parallelization=*/std::true_type) noexcept { @@ -1994,9 +1981,9 @@ // elements after pivot don't satisfy pred (false part) struct _PartitionRange { - _RandomAccessIterator __begin; - _RandomAccessIterator __pivot; - _RandomAccessIterator __end; + _BidirectionalIterator __begin; + _BidirectionalIterator __pivot; + _BidirectionalIterator __end; }; return __internal::__except_handler([&]() { @@ -2023,10 +2010,10 @@ _PartitionRange __result = __par_backend::__parallel_reduce( std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, - [&__pred, __is_vector, __reductor](_RandomAccessIterator __i, _RandomAccessIterator __j, + [&__pred, __is_vector, __reductor](_BidirectionalIterator __i, _BidirectionalIterator __j, _PartitionRange __value) -> _PartitionRange { //1. serial stable_partition - _RandomAccessIterator __pivot = __internal::__brick_stable_partition(__i, __j, __pred, __is_vector); + _BidirectionalIterator __pivot = __internal::__brick_stable_partition(__i, __j, __pred, __is_vector); // 2. merging of two ranges (left and right respectively) return __reductor(__value, {__i, __pivot, __j}); @@ -2048,12 +2035,10 @@ return std::partition_copy(__first, __last, __out_true, __out_false, __pred); } -template -std::pair<_RandomAccessIterator2, _RandomAccessIterator3> -__brick_partition_copy(_RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _RandomAccessIterator2 __out_true, - _RandomAccessIterator3 __out_false, _UnaryPredicate __pred, - /*is_vector=*/std::true_type) noexcept +template +std::pair<_OutputIterator1, _OutputIterator2> +__brick_partition_copy(_ForwardIterator __first, _ForwardIterator __last, _OutputIterator1 __out_true, + _OutputIterator2 __out_false, _UnaryPredicate __pred, /*is_vector=*/std::true_type) noexcept { #if (_PSTL_MONOTONIC_PRESENT) return __unseq_backend::__simd_partition_copy(__first, __last - __first, __out_true, __out_false, __pred); @@ -2072,14 +2057,14 @@ return __internal::__brick_partition_copy(__first, __last, __out_true, __out_false, __pred, __is_vector); } -template -std::pair<_RandomAccessIterator2, _RandomAccessIterator3> -__pattern_partition_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __out_true, _RandomAccessIterator3 __out_false, _UnaryPredicate __pred, +template +std::pair<_OutputIterator1, _OutputIterator2> +__pattern_partition_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, + _OutputIterator1 __out_true, _OutputIterator2 __out_false, _UnaryPredicate __pred, _IsVector __is_vector, /*is_parallelization=*/std::true_type) { - typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; + typedef typename std::iterator_traits<_RandomAccessIterator>::difference_type _DifferenceType; typedef std::pair<_DifferenceType, _DifferenceType> _ReturnType; const _DifferenceType __n = __last - __first; if (_DifferenceType(1) < __n) @@ -2208,11 +2193,10 @@ return std::partial_sort_copy(__first, __last, __d_first, __d_last, __comp); } -template -_RandomAccessIterator2 -__pattern_partial_sort_copy(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, - _RandomAccessIterator2 __d_first, _RandomAccessIterator2 __d_last, _Compare __comp, +template +_RandomAccessIterator +__pattern_partial_sort_copy(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, + _RandomAccessIterator __d_first, _RandomAccessIterator __d_last, _Compare __comp, _IsVector __is_vector, /*is_parallel=*/std::true_type) { if (__last == __first || __d_last == __d_first) @@ -2226,10 +2210,10 @@ { __par_backend::__parallel_stable_sort( std::forward<_ExecutionPolicy>(__exec), __d_first, __d_first + __n1, __comp, - [__first, __d_first, __is_vector](_RandomAccessIterator2 __i, _RandomAccessIterator2 __j, + [__first, __d_first, __is_vector](_RandomAccessIterator __i, _RandomAccessIterator __j, _Compare __comp) { - _RandomAccessIterator1 __i1 = __first + (__i - __d_first); - _RandomAccessIterator1 __j1 = __first + (__j - __d_first); + _ForwardIterator __i1 = __first + (__i - __d_first); + _ForwardIterator __j1 = __first + (__j - __d_first); // 1. Copy elements from input to output #if !_PSTL_ICC_18_OMP_SIMD_BROKEN @@ -2245,28 +2229,29 @@ } else { - typedef typename std::iterator_traits<_RandomAccessIterator1>::value_type _T1; - typedef typename std::iterator_traits<_RandomAccessIterator2>::value_type _T2; + typedef typename std::iterator_traits<_ForwardIterator>::value_type _T1; + typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _T2; __par_backend::__buffer<_T1> __buf(__n1); _T1* __r = __buf.get(); - __par_backend::__parallel_stable_sort(std::forward<_ExecutionPolicy>(__exec), __r, __r + __n1, __comp, - [__n2, __first, __r](_T1* __i, _T1* __j, _Compare __comp) { - _RandomAccessIterator1 __it = __first + (__i - __r); + __par_backend::__parallel_stable_sort( + std::forward<_ExecutionPolicy>(__exec), __r, __r + __n1, __comp, + [__n2, __first, __r](_T1* __i, _T1* __j, _Compare __comp) { + _ForwardIterator __it = __first + (__i - __r); - // 1. Copy elements from input to raw memory - for (_T1* __k = __i; __k != __j; ++__k, ++__it) - { - ::new (__k) _T2(*__it); - } + // 1. Copy elements from input to raw memory + for (_T1* __k = __i; __k != __j; ++__k, ++__it) + { + ::new (__k) _T2(*__it); + } - // 2. Sort elements in temporary __buffer - if (__n2 < __j - __i) - std::partial_sort(__i, __i + __n2, __j, __comp); - else - std::sort(__i, __j, __comp); - }, - __n2); + // 2. Sort elements in temporary __buffer + if (__n2 < __j - __i) + std::partial_sort(__i, __i + __n2, __j, __comp); + else + std::sort(__i, __j, __comp); + }, + __n2); // 3. Move elements from temporary __buffer to output __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __r, __r + __n2, @@ -2285,9 +2270,9 @@ //------------------------------------------------------------------------ // adjacent_find //------------------------------------------------------------------------ -template -_RandomAccessIterator -__brick_adjacent_find(_RandomAccessIterator __first, _RandomAccessIterator __last, _BinaryPredicate __pred, +template +_ForwardIterator +__brick_adjacent_find(_ForwardIterator __first, _ForwardIterator __last, _BinaryPredicate __pred, /* IsVector = */ std::true_type, bool __or_semantic) noexcept { return __unseq_backend::__simd_adjacent_find(__first, __last, __pred, __or_semantic); @@ -2384,10 +2369,10 @@ _RandomAccessIterator __x; do { - __x = __internal::__pattern_partition(std::forward<_ExecutionPolicy>(__exec), __first + 1, __last, - [&__comp, __first](const _Tp& __x) { return __comp(__x, *__first); }, - __is_vector, - /*is_parallel=*/std::true_type()); + __x = __internal::__pattern_partition( + std::forward<_ExecutionPolicy>(__exec), __first + 1, __last, + [&__comp, __first](const _Tp& __x) { return __comp(__x, *__first); }, __is_vector, + /*is_parallel=*/std::true_type()); --__x; if (__x != __first) { @@ -2418,9 +2403,9 @@ //------------------------------------------------------------------------ // fill, fill_n //------------------------------------------------------------------------ -template +template void -__brick_fill(_RandomAccessIterator __first, _RandomAccessIterator __last, const _Tp& __value, +__brick_fill(_ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, /* __is_vector = */ std::true_type) noexcept { __unseq_backend::__simd_fill_n(__first, __last - __first, __value); @@ -2442,26 +2427,23 @@ __internal::__brick_fill(__first, __last, __value, __is_vector); } -template -_RandomAccessIterator -__pattern_fill(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - const _Tp& __value, +template +_ForwardIterator +__pattern_fill(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value, /*is_parallel=*/std::true_type, _IsVector __is_vector) { return __internal::__except_handler([&__exec, __first, __last, &__value, __is_vector]() { - __par_backend::__parallel_for( - std::forward<_ExecutionPolicy>(__exec), __first, __last, - [&__value, __is_vector](_RandomAccessIterator __begin, _RandomAccessIterator __end) { - __internal::__brick_fill(__begin, __end, __value, __is_vector); - }); + __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __last, + [&__value, __is_vector](_ForwardIterator __begin, _ForwardIterator __end) { + __internal::__brick_fill(__begin, __end, __value, __is_vector); + }); return __last; }); } -template -_RandomAccessIterator -__brick_fill_n(_RandomAccessIterator __first, _Size __count, const _Tp& __value, - /* __is_vector = */ std::true_type) noexcept +template +_OutputIterator +__brick_fill_n(_OutputIterator __first, _Size __count, const _Tp& __value, /* __is_vector = */ std::true_type) noexcept { return __unseq_backend::__simd_fill_n(__first, __count, __value); } @@ -2481,9 +2463,9 @@ return __internal::__brick_fill_n(__first, __count, __value, __is_vector); } -template -_RandomAccessIterator -__pattern_fill_n(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _Size __count, const _Tp& __value, +template +_OutputIterator +__pattern_fill_n(_ExecutionPolicy&& __exec, _OutputIterator __first, _Size __count, const _Tp& __value, /*is_parallel=*/std::true_type, _IsVector __is_vector) { return __internal::__pattern_fill(std::forward<_ExecutionPolicy>(__exec), __first, __first + __count, __value, @@ -2517,25 +2499,23 @@ __internal::__brick_generate(__first, __last, __g, __is_vector); } -template -_RandomAccessIterator -__pattern_generate(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - _Generator __g, +template +_ForwardIterator +__pattern_generate(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Generator __g, /*is_parallel=*/std::true_type, _IsVector __is_vector) { return __internal::__except_handler([&]() { __par_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __last, - [__g, __is_vector](_RandomAccessIterator __begin, _RandomAccessIterator __end) { + [__g, __is_vector](_ForwardIterator __begin, _ForwardIterator __end) { __internal::__brick_generate(__begin, __end, __g, __is_vector); }); return __last; }); } -template -_RandomAccessIterator -__brick_generate_n(_RandomAccessIterator __first, Size __count, _Generator __g, - /* is_vector = */ std::true_type) noexcept +template +OutputIterator +__brick_generate_n(OutputIterator __first, Size __count, _Generator __g, /* is_vector = */ std::true_type) noexcept { return __unseq_backend::__simd_generate_n(__first, __count, __g); } @@ -2555,12 +2535,12 @@ return __internal::__brick_generate_n(__first, __count, __g, __is_vector); } -template -_RandomAccessIterator -__pattern_generate_n(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _Size __count, _Generator __g, +template +_OutputIterator +__pattern_generate_n(_ExecutionPolicy&& __exec, _OutputIterator __first, _Size __count, _Generator __g, /*is_parallel=*/std::true_type, _IsVector __is_vector) { - static_assert(__is_random_access_iterator<_RandomAccessIterator>::value, + static_assert(__is_random_access_iterator<_OutputIterator>::value, "Pattern-brick error. Should be a random access iterator."); return __internal::__pattern_generate(std::forward<_ExecutionPolicy>(__exec), __first, __first + __count, __g, std::true_type(), __is_vector); @@ -2598,12 +2578,12 @@ return __internal::__brick_remove_if(__first, __last, __pred, __is_vector); } -template -_RandomAccessIterator -__pattern_remove_if(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, +template +_ForwardIterator +__pattern_remove_if(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _UnaryPredicate __pred, _IsVector __is_vector, /*is_parallel*/ std::true_type) noexcept { - typedef typename std::iterator_traits<_RandomAccessIterator>::reference _ReferenceType; + typedef typename std::iterator_traits<_ForwardIterator>::reference _ReferenceType; if (__first == __last || __first + 1 == __last) { @@ -2613,9 +2593,9 @@ return __internal::__remove_elements( std::forward<_ExecutionPolicy>(__exec), __first, __last, - [&__pred, __is_vector](bool* __b, bool* __e, _RandomAccessIterator __it) { - __internal::__brick_walk2(__b, __e, __it, [&__pred](bool& __x, _ReferenceType __y) { __x = !__pred(__y); }, - __is_vector); + [&__pred, __is_vector](bool* __b, bool* __e, _ForwardIterator __it) { + __internal::__brick_walk2( + __b, __e, __it, [&__pred](bool& __x, _ReferenceType __y) { __x = !__pred(__y); }, __is_vector); }, __is_vector); } @@ -2633,10 +2613,10 @@ return std::merge(__first1, __last1, __first2, __last2, __d_first, __comp); } -template -_RandomAccessIterator3 -__brick_merge(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _RandomAccessIterator3 __d_first, _Compare __comp, +template +_OutputIterator +__brick_merge(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _OutputIterator __d_first, _Compare __comp, /* __is_vector = */ std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -2653,17 +2633,17 @@ return __internal::__brick_merge(__first1, __last1, __first2, __last2, __d_first, __comp, __is_vector); } -template -_RandomAccessIterator3 +template +_OutputIterator __pattern_merge(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _RandomAccessIterator3 __d_first, + _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _OutputIterator __d_first, _Compare __comp, _IsVector __is_vector, /* is_parallel = */ std::true_type) { __par_backend::__parallel_merge( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __d_first, __comp, [__is_vector](_RandomAccessIterator1 __f1, _RandomAccessIterator1 __l1, _RandomAccessIterator2 __f2, - _RandomAccessIterator2 __l2, _RandomAccessIterator3 __f3, _Compare __comp) { + _RandomAccessIterator2 __l2, _OutputIterator __f3, _Compare __comp) { return __internal::__brick_merge(__f1, __l1, __f2, __l2, __f3, __comp, __is_vector); }); return __d_first + (__last1 - __first1) + (__last2 - __first2); @@ -2680,9 +2660,9 @@ std::inplace_merge(__first, __middle, __last, __comp); } -template +template void -__brick_inplace_merge(_RandomAccessIterator __first, _RandomAccessIterator __middle, _RandomAccessIterator __last, +__brick_inplace_merge(_BidirectionalIterator __first, _BidirectionalIterator __middle, _BidirectionalIterator __last, _Compare __comp, /* __is_vector = */ std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial") @@ -2698,34 +2678,35 @@ __internal::__brick_inplace_merge(__first, __middle, __last, __comp, __is_vector); } -template +template void -__pattern_inplace_merge(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __middle, - _RandomAccessIterator __last, _Compare __comp, _IsVector __is_vector, +__pattern_inplace_merge(_ExecutionPolicy&& __exec, _BidirectionalIterator __first, _BidirectionalIterator __middle, + _BidirectionalIterator __last, _Compare __comp, _IsVector __is_vector, /*is_parallel=*/std::true_type) { if (__first == __last || __first == __middle || __middle == __last) { return; } - typedef typename std::iterator_traits<_RandomAccessIterator>::value_type _Tp; + typedef typename std::iterator_traits<_BidirectionalIterator>::value_type _Tp; auto __n = __last - __first; __par_backend::__buffer<_Tp> __buf(__n); _Tp* __r = __buf.get(); __internal::__except_handler([&]() { - auto __move_values = [](_RandomAccessIterator __x, _Tp* __z) { - __internal::__invoke_if_else(std::is_trivial<_Tp>(), [&]() { *__z = std::move(*__x); }, - [&]() { ::new (std::addressof(*__z)) _Tp(std::move(*__x)); }); + auto __move_values = [](_BidirectionalIterator __x, _Tp* __z) { + __internal::__invoke_if_else( + std::is_trivial<_Tp>(), [&]() { *__z = std::move(*__x); }, + [&]() { ::new (std::addressof(*__z)) _Tp(std::move(*__x)); }); }; - auto __move_sequences = [](_RandomAccessIterator __first1, _RandomAccessIterator __last1, _Tp* __first2) { + auto __move_sequences = [](_BidirectionalIterator __first1, _BidirectionalIterator __last1, _Tp* __first2) { return __internal::__brick_uninitialized_move(__first1, __last1, __first2, _IsVector()); }; __par_backend::__parallel_merge( std::forward<_ExecutionPolicy>(__exec), __first, __middle, __middle, __last, __r, __comp, - [__n, __move_values, __move_sequences](_RandomAccessIterator __f1, _RandomAccessIterator __l1, - _RandomAccessIterator __f2, _RandomAccessIterator __l2, _Tp* __f3, + [__n, __move_values, __move_sequences](_BidirectionalIterator __f1, _BidirectionalIterator __l1, + _BidirectionalIterator __f2, _BidirectionalIterator __l2, _Tp* __f3, _Compare __comp) { (__utils::__serial_move_merge(__n))(__f1, __l1, __f2, __l2, __f3, __comp, __move_values, __move_values, __move_sequences, __move_sequences); @@ -2751,11 +2732,10 @@ return std::includes(__first1, __last1, __first2, __last2, __comp); } -template +template bool -__pattern_includes(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _Compare __comp, _IsVector, +__pattern_includes(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator2 __last2, _Compare __comp, _IsVector, /*is_parallel=*/std::true_type) { if (__first2 >= __last2) @@ -2774,12 +2754,12 @@ return __internal::__except_handler([&]() { return !__internal::__parallel_or( std::forward<_ExecutionPolicy>(__exec), __first2, __last2, - [__first1, __last1, __first2, __last2, &__comp](_RandomAccessIterator2 __i, _RandomAccessIterator2 __j) { - _PSTL_ASSERT(__j > __i); - //_PSTL_ASSERT(__j - __i > 1); + [__first1, __last1, __first2, __last2, &__comp](_ForwardIterator2 __i, _ForwardIterator2 __j) { + assert(__j > __i); + //assert(__j - __i > 1); //1. moving boundaries to "consume" subsequence of equal elements - auto __is_equal = [&__comp](_RandomAccessIterator2 __a, _RandomAccessIterator2 __b) -> bool { + auto __is_equal = [&__comp](_ForwardIterator2 __a, _ForwardIterator2 __b) -> bool { return !__comp(*__a, *__b) && !__comp(*__b, *__a); }; @@ -2800,8 +2780,8 @@ //2. testing is __a subsequence of the second range included into the first range auto __b = std::lower_bound(__first1, __last1, *__i, __comp); - _PSTL_ASSERT(!__comp(*(__last1 - 1), *__b)); - _PSTL_ASSERT(!__comp(*(__j - 1), *__i)); + assert(!__comp(*(__last1 - 1), *__b)); + assert(!__comp(*(__j - 1), *__i)); return !std::includes(__b, __last1, __i, __j, __comp); }); }); @@ -2988,7 +2968,7 @@ } const auto __m2 = __left_bound_seq_2 - __first2; - _PSTL_ASSERT(__m1 == 0 || __m2 == 0); + assert(__m1 == 0 || __m2 == 0); if (__m2 > __set_algo_cut_off) { auto __res_or = __result; @@ -3038,10 +3018,10 @@ } }; -template +template _OutputIterator -__brick_set_union(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _OutputIterator __result, _Compare __comp, +__brick_set_union(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -3059,12 +3039,12 @@ return __internal::__brick_set_union(__first1, __last1, __first2, __last2, __result, __comp, __is_vector); } -template _OutputIterator -__pattern_set_union(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _OutputIterator __result, - _Compare __comp, _IsVector __is_vector, /*__is_parallel=*/std::true_type) +__pattern_set_union(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, + _IsVector __is_vector, /*__is_parallel=*/std::true_type) { const auto __n1 = __last1 - __first1; @@ -3077,8 +3057,8 @@ typedef typename std::iterator_traits<_OutputIterator>::value_type _Tp; return __parallel_set_union_op( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, __comp, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp) { + [](_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2, + _Tp* __result, _Compare __comp) { return __pstl::__utils::__set_union_construct(__first1, __last1, __first2, __last2, __result, __comp, __BrickCopyConstruct<_IsVector>()); }, @@ -3098,11 +3078,10 @@ return std::set_intersection(__first1, __last1, __first2, __last2, __result, __comp); } -template -_RandomAccessIterator3 -__brick_set_intersection(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, _Compare __comp, +template +_OutputIterator +__brick_set_intersection(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -3119,16 +3098,15 @@ return __internal::__brick_set_intersection(__first1, __last1, __first2, __last2, __result, __comp, __is_vector); } -template -_RandomAccessIterator3 -__pattern_set_intersection(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, _Compare __comp, _IsVector __is_vector, - /*is_parallel=*/std::true_type) +template +_OutputIterator +__pattern_set_intersection(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator2 __last2, _OutputIterator __result, + _Compare __comp, _IsVector __is_vector, /*is_parallel=*/std::true_type) { - typedef typename std::iterator_traits<_RandomAccessIterator3>::value_type _Tp; - typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; + typedef typename std::iterator_traits<_OutputIterator>::value_type _Tp; + typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _DifferenceType; const auto __n1 = __last1 - __first1; const auto __n2 = __last2 - __first2; @@ -3138,13 +3116,13 @@ return __result; // testing whether the sequences are intersected - _RandomAccessIterator1 __left_bound_seq_1 = std::lower_bound(__first1, __last1, *__first2, __comp); + _ForwardIterator1 __left_bound_seq_1 = std::lower_bound(__first1, __last1, *__first2, __comp); //{1} < {2}: seq 2 is wholly greater than seq 1, so, the intersection is empty if (__left_bound_seq_1 == __last1) return __result; // testing whether the sequences are intersected - _RandomAccessIterator2 __left_bound_seq_2 = std::lower_bound(__first2, __last2, *__first1, __comp); + _ForwardIterator2 __left_bound_seq_2 = std::lower_bound(__first2, __last2, *__first1, __comp); //{2} < {1}: seq 1 is wholly greater than seq 2, so, the intersection is empty if (__left_bound_seq_2 == __last2) return __result; @@ -3156,8 +3134,8 @@ return __internal::__parallel_set_op( std::forward<_ExecutionPolicy>(__exec), __left_bound_seq_1, __last1, __first2, __last2, __result, __comp, [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp) { + [](_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _Tp* __result, _Compare __comp) { return __pstl::__utils::__set_intersection_construct(__first1, __last1, __first2, __last2, __result, __comp); }, @@ -3171,8 +3149,8 @@ __result = __internal::__parallel_set_op( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __left_bound_seq_2, __last2, __result, __comp, [](_DifferenceType __n, _DifferenceType __m) { return std::min(__n, __m); }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp) { + [](_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _Tp* __result, _Compare __comp) { return __pstl::__utils::__set_intersection_construct(__first2, __last2, __first1, __last1, __result, __comp); }, @@ -3197,10 +3175,10 @@ return std::set_difference(__first1, __last1, __first2, __last2, __result, __comp); } -template -_RandomAccessIterator3 -__brick_set_difference(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp, +template +_OutputIterator +__brick_set_difference(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -3217,16 +3195,15 @@ return __internal::__brick_set_difference(__first1, __last1, __first2, __last2, __result, __comp, __is_vector); } -template -_RandomAccessIterator3 -__pattern_set_difference(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, _Compare __comp, _IsVector __is_vector, - /*is_parallel=*/std::true_type) +template +_OutputIterator +__pattern_set_difference(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator2 __last2, _OutputIterator __result, + _Compare __comp, _IsVector __is_vector, /*is_parallel=*/std::true_type) { - typedef typename std::iterator_traits<_RandomAccessIterator3>::value_type _Tp; - typedef typename std::iterator_traits<_RandomAccessIterator1>::difference_type _DifferenceType; + typedef typename std::iterator_traits<_OutputIterator>::value_type _Tp; + typedef typename std::iterator_traits<_ForwardIterator1>::difference_type _DifferenceType; const auto __n1 = __last1 - __first1; const auto __n2 = __last2 - __first2; @@ -3239,29 +3216,29 @@ if (__n2 == 0) return __internal::__pattern_walk2_brick( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __result, - [__is_vector](_RandomAccessIterator1 __begin, _RandomAccessIterator1 __end, _RandomAccessIterator3 __res) { + [__is_vector](_ForwardIterator1 __begin, _ForwardIterator1 __end, _OutputIterator __res) { return __internal::__brick_copy(__begin, __end, __res, __is_vector); }, std::true_type()); // testing whether the sequences are intersected - _RandomAccessIterator1 __left_bound_seq_1 = std::lower_bound(__first1, __last1, *__first2, __comp); + _ForwardIterator1 __left_bound_seq_1 = std::lower_bound(__first1, __last1, *__first2, __comp); //{1} < {2}: seq 2 is wholly greater than seq 1, so, parallel copying just first sequence if (__left_bound_seq_1 == __last1) return __internal::__pattern_walk2_brick( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __result, - [__is_vector](_RandomAccessIterator1 __begin, _RandomAccessIterator1 __end, _RandomAccessIterator3 __res) { + [__is_vector](_ForwardIterator1 __begin, _ForwardIterator1 __end, _OutputIterator __res) { return __internal::__brick_copy(__begin, __end, __res, __is_vector); }, std::true_type()); // testing whether the sequences are intersected - _RandomAccessIterator2 __left_bound_seq_2 = std::lower_bound(__first2, __last2, *__first1, __comp); + _ForwardIterator2 __left_bound_seq_2 = std::lower_bound(__first2, __last2, *__first1, __comp); //{2} < {1}: seq 1 is wholly greater than seq 2, so, parallel copying just first sequence if (__left_bound_seq_2 == __last2) return __internal::__pattern_walk2_brick( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __result, - [__is_vector](_RandomAccessIterator1 __begin, _RandomAccessIterator1 __end, _RandomAccessIterator3 __res) { + [__is_vector](_ForwardIterator1 __begin, _ForwardIterator1 __end, _OutputIterator __res) { return __internal::__brick_copy(__begin, __end, __res, __is_vector); }, std::true_type()); @@ -3270,8 +3247,8 @@ return __parallel_set_op( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, __comp, [](_DifferenceType __n, _DifferenceType) { return __n; }, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp) { + [](_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _Tp* __result, _Compare __comp) { return __pstl::__utils::__set_difference_construct(__first1, __last1, __first2, __last2, __result, __comp, __BrickCopyConstruct<_IsVector>()); }, @@ -3294,11 +3271,10 @@ return std::set_symmetric_difference(__first1, __last1, __first2, __last2, __result, __comp); } -template -_RandomAccessIterator3 -__brick_set_symmetric_difference(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, - _RandomAccessIterator3 __result, _Compare __comp, +template +_OutputIterator +__brick_set_symmetric_difference(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _OutputIterator __result, _Compare __comp, /*__is_vector=*/std::true_type) noexcept { _PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); @@ -3316,13 +3292,12 @@ __is_vector); } -template -_RandomAccessIterator3 -__pattern_set_symmetric_difference(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _RandomAccessIterator3 __result, _Compare __comp, - _IsVector __is_vector, /*is_parallel=*/std::true_type) +template +_OutputIterator +__pattern_set_symmetric_difference(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator2 __last2, _OutputIterator __result, + _Compare __comp, _IsVector __is_vector, /*is_parallel=*/std::true_type) { const auto __n1 = __last1 - __first1; @@ -3332,11 +3307,11 @@ if (__n1 + __n2 <= __set_algo_cut_off) return std::set_symmetric_difference(__first1, __last1, __first2, __last2, __result, __comp); - typedef typename std::iterator_traits<_RandomAccessIterator3>::value_type _Tp; + typedef typename std::iterator_traits<_OutputIterator>::value_type _Tp; return __internal::__parallel_set_union_op( std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __last2, __result, __comp, - [](_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Tp* __result, _Compare __comp) { + [](_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, _ForwardIterator2 __last2, + _Tp* __result, _Compare __comp) { return __pstl::__utils::__set_symmetric_difference_construct(__first1, __last1, __first2, __last2, __result, __comp, __BrickCopyConstruct<_IsVector>()); }, @@ -3432,9 +3407,9 @@ return std::min_element(__first, __last, __comp); } -template -_RandomAccessIterator -__brick_min_element(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, +template +_ForwardIterator +__brick_min_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp, /* __is_vector = */ std::true_type) noexcept { #if _PSTL_UDR_PRESENT @@ -3487,9 +3462,9 @@ return std::minmax_element(__first, __last, __comp); } -template -std::pair<_RandomAccessIterator, _RandomAccessIterator> -__brick_minmax_element(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, +template +std::pair<_ForwardIterator, _ForwardIterator> +__brick_minmax_element(_ForwardIterator __first, _ForwardIterator __last, _Compare __comp, /* __is_vector = */ std::true_type) noexcept { #if _PSTL_UDR_PRESENT @@ -3507,20 +3482,20 @@ return __internal::__brick_minmax_element(__first, __last, __comp, __is_vector); } -template -std::pair<_RandomAccessIterator, _RandomAccessIterator> -__pattern_minmax_element(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, - _Compare __comp, _IsVector __is_vector, /* is_parallel = */ std::true_type) +template +std::pair<_ForwardIterator, _ForwardIterator> +__pattern_minmax_element(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Compare __comp, + _IsVector __is_vector, /* is_parallel = */ std::true_type) { if (__first == __last) return std::make_pair(__first, __first); return __internal::__except_handler([&]() { - typedef std::pair<_RandomAccessIterator, _RandomAccessIterator> _Result; + typedef std::pair<_ForwardIterator, _ForwardIterator> _Result; return __par_backend::__parallel_reduce( std::forward<_ExecutionPolicy>(__exec), __first + 1, __last, std::make_pair(__first, __first), - [=](_RandomAccessIterator __begin, _RandomAccessIterator __end, _Result __init) -> _Result { + [=](_ForwardIterator __begin, _ForwardIterator __end, _Result __init) -> _Result { const _Result __subresult = __internal::__brick_minmax_element(__begin, __end, __comp, __is_vector); return std::make_pair( __internal::__cmp_iterators_by_values(__subresult.first, __init.first, __comp), @@ -3560,10 +3535,10 @@ return __mismatch_serial(__first1, __last1, __first2, __last2, __pred); } -template -std::pair<_RandomAccessIterator1, _RandomAccessIterator2> -__brick_mismatch(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Predicate __pred, /* __is_vector = */ std::true_type) noexcept +template +std::pair<_ForwardIterator1, _ForwardIterator2> +__brick_mismatch(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _Predicate __pred, /* __is_vector = */ std::true_type) noexcept { auto __n = std::min(__last1 - __first1, __last2 - __first2); return __unseq_backend::__simd_first(__first1, __n, __first2, std::not_fn(__pred)); @@ -3612,11 +3587,10 @@ return std::lexicographical_compare(__first1, __last1, __first2, __last2, __comp); } -template +template bool -__brick_lexicographical_compare(_RandomAccessIterator1 __first1, _RandomAccessIterator1 __last1, - _RandomAccessIterator2 __first2, _RandomAccessIterator2 __last2, _Compare __comp, - /* __is_vector = */ std::true_type) noexcept +__brick_lexicographical_compare(_ForwardIterator1 __first1, _ForwardIterator1 __last1, _ForwardIterator2 __first2, + _ForwardIterator2 __last2, _Compare __comp, /* __is_vector = */ std::true_type) noexcept { if (__first2 == __last2) { // if second sequence is empty @@ -3628,12 +3602,12 @@ } else { - typedef typename std::iterator_traits<_RandomAccessIterator1>::reference ref_type1; - typedef typename std::iterator_traits<_RandomAccessIterator2>::reference ref_type2; + typedef typename std::iterator_traits<_ForwardIterator1>::reference ref_type1; + typedef typename std::iterator_traits<_ForwardIterator2>::reference ref_type2; --__last1; --__last2; auto __n = std::min(__last1 - __first1, __last2 - __first2); - std::pair<_RandomAccessIterator1, _RandomAccessIterator2> __result = __unseq_backend::__simd_first( + std::pair<_ForwardIterator1, _ForwardIterator2> __result = __unseq_backend::__simd_first( __first1, __n, __first2, [__comp](const ref_type1 __x, const ref_type2 __y) mutable { return __comp(__x, __y) || __comp(__y, __x); }); @@ -3658,13 +3632,11 @@ return __internal::__brick_lexicographical_compare(__first1, __last1, __first2, __last2, __comp, __is_vector); } -template +template bool -__pattern_lexicographical_compare(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1, - _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, - _RandomAccessIterator2 __last2, _Compare __comp, _IsVector __is_vector, - /* is_parallel = */ std::true_type) noexcept +__pattern_lexicographical_compare(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1, + _ForwardIterator2 __first2, _ForwardIterator2 __last2, _Compare __comp, + _IsVector __is_vector, /* is_parallel = */ std::true_type) noexcept { if (__first2 == __last2) { // if second sequence is empty @@ -3676,22 +3648,23 @@ } else { - typedef typename std::iterator_traits<_RandomAccessIterator1>::reference _RefType1; - typedef typename std::iterator_traits<_RandomAccessIterator2>::reference _RefType2; + typedef typename std::iterator_traits<_ForwardIterator1>::reference _RefType1; + typedef typename std::iterator_traits<_ForwardIterator2>::reference _RefType2; --__last1; --__last2; auto __n = std::min(__last1 - __first1, __last2 - __first2); auto __result = __internal::__parallel_find( std::forward<_ExecutionPolicy>(__exec), __first1, __first1 + __n, - [__first1, __first2, &__comp, __is_vector](_RandomAccessIterator1 __i, _RandomAccessIterator1 __j) { - return __internal::__brick_mismatch(__i, __j, __first2 + (__i - __first1), __first2 + (__j - __first1), - [&__comp](const _RefType1 __x, const _RefType2 __y) { - return !__comp(__x, __y) && !__comp(__y, __x); - }, - __is_vector) + [__first1, __first2, &__comp, __is_vector](_ForwardIterator1 __i, _ForwardIterator1 __j) { + return __internal::__brick_mismatch( + __i, __j, __first2 + (__i - __first1), __first2 + (__j - __first1), + [&__comp](const _RefType1 __x, const _RefType2 __y) { + return !__comp(__x, __y) && !__comp(__y, __x); + }, + __is_vector) .first; }, - std::less::difference_type>(), /*is_first=*/true); + std::less::difference_type>(), /*is_first=*/true); if (__result == __last1 && __first2 + (__result - __first1) != __last2) { // if first sequence shorter than second diff --git a/pstl/include/pstl/internal/execution_impl.h b/pstl/include/pstl/internal/execution_impl.h --- a/pstl/include/pstl/internal/execution_impl.h +++ b/pstl/include/pstl/internal/execution_impl.h @@ -82,17 +82,15 @@ typename __internal::__policy_traits::type>::__allow_parallel; template -typename std::conjunction<__allow_vector<_ExecutionPolicy>, - __is_random_access_iterator<_IteratorTypes>...>::type -__is_vectorization_preferred(_ExecutionPolicy&& __exec) +typename std::conjunction<__allow_vector<_ExecutionPolicy>, __is_random_access_iterator<_IteratorTypes>...>::type +__is_vectorization_preferred(_ExecutionPolicy&&) { return {}; } template -typename std::conjunction<__allow_parallel<_ExecutionPolicy>, - __is_random_access_iterator<_IteratorTypes>...>::type -__is_parallelization_preferred(_ExecutionPolicy&& __exec) +typename std::conjunction<__allow_parallel<_ExecutionPolicy>, __is_random_access_iterator<_IteratorTypes>...>::type +__is_parallelization_preferred(_ExecutionPolicy&&) { return {}; } diff --git a/pstl/include/pstl/internal/parallel_backend.h b/pstl/include/pstl/internal/parallel_backend.h --- a/pstl/include/pstl/internal/parallel_backend.h +++ b/pstl/include/pstl/internal/parallel_backend.h @@ -24,6 +24,12 @@ { namespace __par_backend = __tbb_backend; } +#elif defined(_PSTL_PAR_BACKEND_OMP) +# include "parallel_backend_omp.h" +namespace __pstl +{ +namespace __par_backend = __omp_backend; +} #else _PSTL_PRAGMA_MESSAGE("Parallel backend was not specified"); #endif diff --git a/pstl/include/pstl/internal/parallel_backend_omp.h b/pstl/include/pstl/internal/parallel_backend_omp.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/parallel_backend_omp.h @@ -0,0 +1,943 @@ +// -*- C++ +// -*-===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_PARALLEL_BACKEND_OMP_H +#define _PSTL_PARALLEL_BACKEND_OMP_H + +#include +#include +#include +#include +#include + +#include "parallel_backend_serial.h" +#include "unseq_backend_simd.h" +#include "utils.h" + +#if !defined(_OPENMP) +# error _OPENMP not defined. +#else +//# include +//# if (_OPENMP <= 200505) +//# warning OpenMP 2.5 or older +//# elif (_OPENMP == 200805) +//# warning OpenMP 3.0 +//# elif (_OPENMP == 201107) +//# warning OpenMP 3.1 +//# elif (_OPENMP == 201307) +//# warning OpenMP 4.0 +//# elif (_OPENMP == 201511) +//# warning OpenMP 4.5 +//# elif (_OPENMP == 201811) +//# warning OpenMP 5.0 +//# else +//# warning Unknown version of OpenMP. +//# endif +#endif + +namespace __pstl +{ +namespace __omp_backend +{ + +//------------------------------------------------------------------------ +// use to cancel execution +//------------------------------------------------------------------------ +inline void +__cancel_execution() +{ +} + +//------------------------------------------------------------------------ +// raw buffer +//------------------------------------------------------------------------ + +template +class __buffer +{ + std::allocator<_Tp> __allocator_; + _Tp* __ptr_; + const std::size_t __buf_size_; + __buffer(const __buffer&) = delete; + void + operator=(const __buffer&) = delete; + + public: + __buffer(std::size_t __n) : __allocator_(), __ptr_(__allocator_.allocate(__n)), __buf_size_(__n) {} + + operator bool() const { return __ptr_ != nullptr; } + + _Tp* + get() const + { + return __ptr_; + } + ~__buffer() { __allocator_.deallocate(__ptr_, __buf_size_); } +}; + +// Preliminary size of each chunk: requires further discussion +constexpr std::size_t __default_chunk_size = 512; + +// The iteration space partitioner according to __requested_chunk_size +template +void +__chunk_partitioner(_RandomAccessIterator __first, _RandomAccessIterator __last, _Size& __n_chunks, _Size& __chunk_size, + _Size& __first_chunk_size, _Size __requested_chunk_size = __default_chunk_size) +{ + /* + * This algorithm improves distribution of elements in chunks by avoiding + * small tail chunks. The leftover elements that do not fit neatly into + * the chunk size are redistributed to early chunks. This improves + * utilization of the processor's prefetch and reduces the number of + * tasks needed by 1. + */ + + const _Size __n = __last - __first; + if (__n < __requested_chunk_size) + { + __chunk_size = __n; + __first_chunk_size = __n; + __n_chunks = 1; + return; + } + + __n_chunks = (__n / __requested_chunk_size) + 1; + __chunk_size = __n / __n_chunks; + const _Size __n_leftover_items = __n % __chunk_size; + + if (__n_leftover_items == 0) + { + __first_chunk_size = __chunk_size; + return; + } + + const _Size __n_extra_items_per_chunk = __n_leftover_items / __n_chunks; + const _Size __n_final_leftover_items = __n_leftover_items % __n_chunks; + + __chunk_size += __n_extra_items_per_chunk; + __first_chunk_size = __chunk_size + __n_final_leftover_items; +} + +//------------------------------------------------------------------------ +// parallel_invoke +//------------------------------------------------------------------------ + +template +void +__parallel_invoke_body(_F1&& __f1, _F2&& __f2) +{ + _PSTL_PRAGMA(omp taskgroup) + { + _PSTL_PRAGMA(omp task) { std::forward<_F1>(__f1)(); } + _PSTL_PRAGMA(omp task) { std::forward<_F2>(__f2)(); } + } +} + +template +void +__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2) +{ + if (omp_in_parallel()) + { + __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); + } + else + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single) + __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); + } +} + +//------------------------------------------------------------------------ +// parallel_for +//------------------------------------------------------------------------ + +template +void +__parallel_for_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) +{ + std::size_t __n_chunks{0}, __chunk_size{0}, __first_chunk_size{0}; + __chunk_partitioner(__first, __last, __n_chunks, __chunk_size, __first_chunk_size); + + // To avoid over-subscription we use taskloop for the nested parallelism + _PSTL_PRAGMA(omp taskloop) + for (std::size_t __chunk = 0; __chunk < __n_chunks; ++__chunk) + { + auto __this_chunk_size = __chunk == 0 ? __first_chunk_size : __chunk_size; + auto __index = __chunk == 0 ? 0 : (__chunk * __chunk_size) + (__first_chunk_size - __chunk_size); + auto __begin = __first + __index; + auto __end = __begin + __this_chunk_size; + __f(__begin, __end); + } +} + +//------------------------------------------------------------------------ +// Notation: +// Evaluation of brick f[i,j) for each subrange [i,j) of [first, last) +//------------------------------------------------------------------------ + +template +void +__parallel_for(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, _Fp __f) +{ + if (omp_in_parallel()) + { + // we don't create a nested parallel region in an existing parallel + // region: just create tasks + __pstl::__omp_backend::__parallel_for_body(__first, __last, __f); + } + else + { + // in any case (nested or non-nested) one parallel region is created and + // only one thread creates a set of tasks + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single) { __pstl::__omp_backend::__parallel_for_body(__first, __last, __f); } + } +} + +//------------------------------------------------------------------------ +// parallel_for_each +//------------------------------------------------------------------------ + +template +void +__parallel_for_each(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Fp __f) +{ + __pstl::__omp_backend::__parallel_for(std::forward<_ExecutionPolicy>(__exec), __first, __last, __f); +} + +//------------------------------------------------------------------------ +// parallel_reduce +//------------------------------------------------------------------------ + +template +auto +__parallel_reduce_chunks(std::uint32_t start, std::uint32_t end, _ChunkReducer __reduce_chunk, _Reduction __reduce) + -> _Value +{ + _Value v1, v2; + + if (end - start == 1) + { + return __reduce_chunk(start); + } + else if (end - start == 2) + { + _PSTL_PRAGMA(omp task shared(v1)) + v1 = __reduce_chunk(start); + + _PSTL_PRAGMA(omp task shared(v2)) + v2 = __reduce_chunk(start + 1); + } + else + { + auto middle = start + ((end - start) / 2); + + _PSTL_PRAGMA(omp task shared(v1)) + v1 = __parallel_reduce_chunks<_Value>(start, middle, __reduce_chunk, __reduce); + + _PSTL_PRAGMA(omp task shared(v2)) + v2 = __parallel_reduce_chunks<_Value>(middle, end, __reduce_chunk, __reduce); + } + + _PSTL_PRAGMA(omp taskwait) + return __reduce(v1, v2); +} + +template +_Value +__parallel_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity, + _RealBody __real_body, _Reduction __reduction) +{ + + std::size_t __n_chunks{0}, __chunk_size{0}, __first_chunk_size{0}; + __omp_backend::__chunk_partitioner(__first, __last, __n_chunks, __chunk_size, __first_chunk_size); + + auto __reduce_chunk = [&](std::uint32_t __chunk) { + auto __this_chunk_size = __chunk == 0 ? __first_chunk_size : __chunk_size; + auto __index = __chunk == 0 ? 0 : (__chunk * __chunk_size) + (__first_chunk_size - __chunk_size); + auto __begin = __first + __index; + auto __end = __begin + __this_chunk_size; + + //IMPORTANT: __real_body call does a serial reduction based on an initial value; + //in case of passing an identity value, a partial result should be explicitly combined + //with the previous partial reduced value. + + return __real_body(__begin, __end, __identity); + }; + + return __parallel_reduce_chunks<_Value>(0, __n_chunks, __reduce_chunk, __reduction); +} + +//------------------------------------------------------------------------ +// Notation: +// r(i,j,init) returns reduction of init with reduction over [i,j) +// c(x,y) combines values x and y that were the result of r +//------------------------------------------------------------------------ + +template +_Value +__parallel_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity, + _RealBody __real_body, _Reduction __reduction) +{ + if (__first == __last) + return __identity; + + // Don't bother parallelizing the work if the size is too small. + if (__last - __first < static_cast(__default_chunk_size)) + { + return __real_body(__first, __last, __identity); + } + + // We don't create a nested parallel region in an existing parallel region: + // just create tasks. + if (omp_in_parallel()) + { + return __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction); + } + + // In any case (nested or non-nested) one parallel region is created and only + // one thread creates a set of tasks. + _Value __res = __identity; + + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single) + { + __res = __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction); + } + + return __res; +} + +//------------------------------------------------------------------------ +// parallel_transform_reduce +// +// Notation: +// r(i,j,init) returns reduction of init with reduction over [i,j) +// u(i) returns f(i,i+1,identity) for a hypothetical left identity element +// of r c(x,y) combines values x and y that were the result of r or u +//------------------------------------------------------------------------ + +template +auto +__transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op, _Value __init, + _Combiner __combiner, _Reduction __reduction) +{ + using _Size = std::size_t; + const _Size __num_threads = omp_get_num_threads(); + const _Size __n = __last - __first; + + if (__n >= __num_threads) + { + // Here, we cannot use OpenMP UDR because we must store the init value in + // the combiner and it will be used several times. Although there should be + // the only one; we manually generate the identity elements for each thread. + alignas(_Value) char __accums_storage[__num_threads * sizeof(_Value)]; + _Value* __accums = reinterpret_cast<_Value*>(__accums_storage); + + // initialize accumulators for all threads + for (_Size __i = 0; __i < __num_threads; ++__i) + { + ::new (__accums + __i) _Value(__unary_op(__first + __i)); + } + + // initial partition of the iteration space into chunks + std::size_t __n_chunks{0}, __chunk_size{0}, __first_chunk_size{0}; + __omp_backend::__chunk_partitioner(__first + __num_threads, __last, __n_chunks, __chunk_size, + __first_chunk_size); + + // main loop + _PSTL_PRAGMA(omp taskloop) + for (std::size_t __chunk = 0; __chunk < __n_chunks; ++__chunk) + { + auto __this_chunk_size = __chunk == 0 ? __first_chunk_size : __chunk_size; + auto __index = __chunk == 0 ? 0 : (__chunk * __chunk_size) + (__first_chunk_size - __chunk_size); + auto __begin = __first + __index + __num_threads; + auto __end = __begin + __this_chunk_size; + + auto __thread_num = omp_get_thread_num(); + __accums[__thread_num] = __reduction(__begin, __end, __accums[__thread_num]); + } + + // combine by accumulators + for (_Size __i = 0; __i < __num_threads; ++__i) + { + __init = __combiner(__init, __accums[__i]); + } + + // destroy accumulators + for (_Size __i = 0; __i < __num_threads; ++__i) + { + __accums[__i].~_Value(); + } + } + else + { // if the number of elements is less than the number of threads, we + // process them sequentially + for (_Size __i = 0; __i < __n; ++__i) + { + __init = __combiner(__init, __unary_op(__first + __i)); + } + } + + return __init; +} + +template +_Value +__parallel_transform_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, + _UnaryOp __unary_op, _Value __init, _Combiner __combiner, _Reduction __reduction) +{ + + if (__first == __last) + { + return __init; + } + + _Value __result = __init; + if (omp_in_parallel()) + { + // We don't create a nested parallel region in an existing parallel + // region: just create tasks + __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner, + __reduction); + } + else + { + // Create a parallel region, and a single thread will create tasks + // for the region. + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single) + { + __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner, + __reduction); + } + } + + return __result; +} + +//------------------------------------------------------------------------ +// parallel_scan +//------------------------------------------------------------------------ + +template +_Index +__split(_Index __m) +{ + _Index __k = 1; + while (2 * __k < __m) + __k *= 2; + return __k; +} + +template +void +__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine) +{ + if (__m == 1) + __r[0] = __reduce(__i * __tilesize, __lastsize); + else + { + _Index __k = __split(__m); + __omp_backend::__parallel_invoke_body( + [=] { __omp_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); }, + [=] { + __omp_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine); + }); + if (__m == 2 * __k) + __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]); + } +} + +template +void +__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine, + _Sp __scan) +{ + if (__m == 1) + __scan(__i * __tilesize, __lastsize, __initial); + else + { + const _Index __k = __split(__m); + __omp_backend::__parallel_invoke_body( + [=] { __omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); }, + // Assumes that __combine never throws. + // TODO: Consider adding a requirement for user functors to be constant. + [=, &__combine] { + __omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, + __combine(__initial, __r[__k - 1]), __combine, __scan); + }); + } +} + +template +void +__parallel_strict_scan_body(_Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex) +{ + _Index __p = omp_get_num_threads(); + const _Index __slack = 4; + _Index __tilesize = (__n - 1) / (__slack * __p) + 1; + _Index __m = (__n - 1) / __tilesize; + __buffer<_Tp> __buf(__m + 1); + _Tp* __r = __buf.get(); + + __omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce, __combine); + + std::size_t __k = __m + 1; + _Tp __t = __r[__k - 1]; + while ((__k &= __k - 1)) + { + __t = __combine(__r[__k - 1], __t); + } + + __apex(__combine(__initial, __t)); + __omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial, + __combine, __scan); +} + +template +void +__parallel_strict_scan(_ExecutionPolicy&& __exec, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, + _Ap __apex) +{ + if (__n <= 1) + { + __serial_backend::__parallel_strict_scan(std::forward<_ExecutionPolicy>(__exec), __n, __initial, __reduce, + __combine, __scan, __apex); + return; + } + + if (omp_in_parallel()) + { + // we don't create a nested parallel region in an existing parallel + // region: just create tasks + __pstl::__omp_backend::__parallel_strict_scan_body(__n, __initial, __reduce, __combine, __scan, __apex); + } + else + { + // in any case (nested or non-nested) one parallel region is created and + // only one thread creates a set of tasks + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single) + { + __pstl::__omp_backend::__parallel_strict_scan_body(__n, __initial, __reduce, __combine, __scan, __apex); + } + } +} + +template +_Tp +__parallel_transform_scan(_ExecutionPolicy&& __exec, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce, + _Sp __scan) +{ + + return __serial_backend::__parallel_transform_scan(std::forward<_ExecutionPolicy>(__exec), __n, __u, __init, + __combine, __brick_reduce, __scan); +} + +//------------------------------------------------------------------------ +// parallel_stable_sort +//------------------------------------------------------------------------ + +template +struct _MinKItems +{ + using _MinKVector = std::vector<_RandomAccessIterator>; + _MinKVector __smallest_k_items; + typename _MinKVector::iterator __largest_item; + + bool + __empty() + { + return __smallest_k_items.empty(); + } + + auto + __size() + { + return std::size(__smallest_k_items); + } + + void + __resize(std::size_t new_size) + { + __smallest_k_items.resize(new_size); + } +}; + +template +struct _MinKOp +{ + _MinKItems<_RandomAccessIterator>& __items; + _Compare __comp; + + _MinKOp(_MinKItems<_RandomAccessIterator>& __items_, _Compare __comp_) : __items(__items_), __comp(__comp_) {} + + void + __keep_smallest_k_items(_RandomAccessIterator __item) + { + // If the new item is larger than the largest item in the list, discard it. + if (__comp(**__items.__largest_item, *__item)) + { + return; + } + + // If thew new item is equal to the largest item in the list, discard it. + if (!__comp(*__item, **__items.__largest_item)) + { + return; + } + + // The new item is smaller than the largest item. Replace the largest item + // with the new item. + *__items.__largest_item = __item; + + // Find the new largest item. + __update_largest_item(); + }; + + void + __update_largest_item() + { + __items.__largest_item = + std::max_element(std::begin(__items.__smallest_k_items), std::end(__items.__smallest_k_items), + [this](const auto& l, const auto& r) { return __comp(*l, *r); }); + } + + void + __merge(_MinKItems<_RandomAccessIterator>& __other) + { + for (auto __it = std::begin(__other.__smallest_k_items); __it != std::end(__other.__smallest_k_items); ++__it) + { + __keep_smallest_k_items(*__it); + } + } + + void + __initialize(_RandomAccessIterator __first, _RandomAccessIterator __last, std::size_t __k) + { + __items.__resize(__k); + auto __item_it = __first; + for (auto __tracking_it = begin(__items.__smallest_k_items); + __item_it != __last && __tracking_it != end(__items.__smallest_k_items); ++__item_it, ++__tracking_it) + { + *__tracking_it = __item_it; + } + __update_largest_item(); + for (; __item_it != __last; ++__item_it) + { + __keep_smallest_k_items(__item_it); + } + } + + static auto + __reduce(_MinKItems<_RandomAccessIterator>& __v1, _MinKItems<_RandomAccessIterator>& __v2, _Compare __comp) + -> _MinKItems<_RandomAccessIterator> + { + if (__v1.__empty()) + { + return __v2; + } + + if (__v2.__empty()) + { + return __v1; + } + + if (__v1.__size() >= __v2.__size()) + { + _MinKOp<_RandomAccessIterator, _Compare> __op(__v1, __comp); + __op.__merge(__v2); + return __v1; + } + + _MinKOp<_RandomAccessIterator, _Compare> __op(__v2, __comp); + __op.__merge(__v1); + return __v2; + } +}; + +template +auto +__find_min_k(_RandomAccessIterator __first, _RandomAccessIterator __last, std::size_t __k, _Compare __comp) + -> _MinKItems<_RandomAccessIterator> +{ + _MinKItems<_RandomAccessIterator> __items; + _MinKOp op(__items, __comp); + + op.__initialize(__first, __last, __k); + return __items; +} + +template +auto +__parallel_find_pivot(_RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp, std::size_t __nsort) + -> _RandomAccessIterator +{ + using _Value = _MinKItems<_RandomAccessIterator>; + using _Op = _MinKOp<_RandomAccessIterator, _Compare>; + + std::size_t __n_chunks{0}, __chunk_size{0}, __first_chunk_size{0}; + __omp_backend::__chunk_partitioner(__first, __last, __n_chunks, __chunk_size, __first_chunk_size, + std::max(__nsort, __default_chunk_size)); + /* + * This function creates a vector of iterators to the container being operated + * on. It splits that container into fixed size chunks, just like other + * functions in this backend. For each chunk it finds the smallest k items. + * The chunks are run through a reducer which keeps the smallest k items from + * each chunk. Finally, the largest item from the merged chunks is returned as + * the pivot. + * + * The chunks are partitioned in such a way that there will always be at least + * k items in one chunk. The reducer will always produce a chunk merge so that + * the longest k items list propagates out. So even if some of the chunks are + * less than __nsort elements, at least one chunk will be and this chunk will + * end up producing a correctly sized smallest k items list. + * + * Note that the case where __nsort == distance(__first, __last) is handled by + * performing a complete sort of the container, so we don't have to handle + * that here. + */ + + auto __reduce_chunk = [&](std::uint32_t __chunk) { + auto __this_chunk_size = __chunk == 0 ? __first_chunk_size : __chunk_size; + auto __index = __chunk == 0 ? 0 : (__chunk * __chunk_size) + (__first_chunk_size - __chunk_size); + auto __begin = __first + __index; + auto __end = __begin + __this_chunk_size; + + return __find_min_k(__begin, __end, __nsort, __comp); + }; + + auto __reduce_value = [&](auto& __v1, auto& __v2) { return _Op::__reduce(__v1, __v2, __comp); }; + auto __result = __parallel_reduce_chunks<_Value>(0, __n_chunks, __reduce_chunk, __reduce_value); + + return *__result.__largest_item; +} + +template +void +__parallel_partition(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _RandomAccessIterator __pivot, + _Compare __comp, std::size_t __nsort) +{ + auto __size = static_cast(std::distance(__xs, __xe)); + std::atomic_bool* __status = new std::atomic_bool[__size]; + + /* + * First, walk through the entire array and mark items that are on the + * correct side of the pivot as true, and the others as false. + */ + _PSTL_PRAGMA(omp taskloop shared(__status)) + for (std::size_t __index = 0U; __index < __size; ++__index) + { + auto __item = std::next(__xs, __index); + if (__index < __nsort) + { + __status[__index].store(__comp(*__item, *__pivot)); + } + else + { + __status[__index].store(__comp(*__pivot, *__item)); + } + } + + /* + * Second, walk through the first __nsort items of the array and move + * any items that are not in the right place. The status array is used + * to locate places outside the partition where values can be safely + * swapped. + */ + _PSTL_PRAGMA(omp taskloop shared(__status)) + for (std::size_t __index = 0U; __index < __nsort; ++__index) + { + // If the item is already in the right place, move along. + if (__status[__index].load()) + { + continue; + } + + // Otherwise, find the an item that can be moved into this + // spot safely. + for (std::size_t __swap_index = __nsort; __swap_index < __size; ++__swap_index) + { + // Try to capture this slot by using compare and exchange. If we + // are able to capture the slot then perform a swap and exit this + // loop. + if (__status[__swap_index].load() == false && __status[__swap_index].exchange(true) == false) + { + auto __current_item = std::next(__xs, __index); + auto __swap_item = std::next(__xs, __swap_index); + std::swap(__current_item, __swap_item); + break; + } + } + } + + delete[] __status; +} + +template +void +__parallel_stable_sort_body(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp) +{ + std::size_t __size = std::distance(__xs, __xe); + if (__size == 0) + { + return; + } + + auto __left_it = __xs; + auto __right_it = __xe; + bool __is_swapped_left = false, __is_swapped_right = false; + auto __pivot = *__xs; + + auto __forward_it = __xs + 1; + while (__forward_it <= __right_it) + { + if (__comp(*__forward_it, __pivot)) + { + __is_swapped_left = true; + std::iter_swap(__left_it, __forward_it); + __left_it++; + __forward_it++; + } + else if (__comp(__pivot, *__forward_it)) + { + __is_swapped_right = true; + std::iter_swap(__right_it, __forward_it); + __right_it--; + } + else + { + __forward_it++; + } + } + + if (__size >= __default_chunk_size) + { + _PSTL_PRAGMA(omp taskgroup) + { + _PSTL_PRAGMA(omp task untied mergeable) + { + if (std::distance(__xs, __left_it) > 0 && __is_swapped_left) + { + __parallel_stable_sort_body(__xs, __left_it - 1, __comp); + } + } + + _PSTL_PRAGMA(omp task untied mergeable) + { + if (std::distance(__right_it, __xe) && __is_swapped_right) + { + __parallel_stable_sort_body(__right_it + 1, __xe, __comp); + } + } + } + } + else + { + _PSTL_PRAGMA(omp task untied mergeable) + { + if (std::distance(__xs, __left_it) > 0 && __is_swapped_left) + { + __parallel_stable_sort_body(__xs, __left_it - 1, __comp); + } + + if (std::distance(__right_it, __xe) && __is_swapped_right) + { + __parallel_stable_sort_body(__right_it + 1, __xe, __comp); + } + } + } +} + +template +void +__parallel_stable_partial_sort(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp, + _LeafSort __leaf_sort, std::size_t __nsort) +{ + auto __pivot = __parallel_find_pivot(__xs, __xe, __comp, __nsort); + __parallel_partition(__xs, __xe, __pivot, __comp, __nsort); + + if (__nsort <= __default_chunk_size) + { + __leaf_sort(__xs, __pivot, __comp); + } + else + { + __parallel_stable_sort_body(__xs, __pivot, __comp); + } +} + +template +void +__parallel_stable_sort(_ExecutionPolicy&&, _RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp, + _LeafSort __leaf_sort, std::size_t __nsort = 0) +{ + if (__xs >= __xe) + { + return; + } + + if (__nsort < __default_chunk_size) + { + __leaf_sort(__xs, __xe, __comp); + return; + } + + std::size_t __count = static_cast(std::distance(__xs, __xe)); + + if (omp_in_parallel()) + { + if (__count <= __nsort) + { + __parallel_stable_sort_body(__xs, __xe, __comp); + } + else + { + __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort); + } + } + else + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single) + if (__count <= __nsort) + { + __parallel_stable_sort_body(__xs, __xe, __comp); + } + else + { + __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort); + } + } +} + +//------------------------------------------------------------------------ +// parallel_merge +//------------------------------------------------------------------------ + +template +void +__parallel_merge(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, + _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp, + _LeafMerge __leaf_merge) + +{ + __serial_backend::__parallel_merge(std::forward<_ExecutionPolicy>(__exec), __xs, __xe, __ys, __ye, __zs, __comp, + __leaf_merge); +} + +} // namespace __omp_backend +} // namespace __pstl + +#endif // diff --git a/pstl/include/pstl/internal/pstl_config.h b/pstl/include/pstl/internal/pstl_config.h --- a/pstl/include/pstl/internal/pstl_config.h +++ b/pstl/include/pstl/internal/pstl_config.h @@ -18,7 +18,8 @@ #define _PSTL_VERSION_MINOR ((_PSTL_VERSION % 1000) / 10) #define _PSTL_VERSION_PATCH (_PSTL_VERSION % 10) -#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB) +#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB) && \ + !defined(_PSTL_PAR_BACKEND_OMP) # error "A parallel backend must be specified" #endif diff --git a/pstl/include/pstl/internal/unseq_backend_simd.h b/pstl/include/pstl/internal/unseq_backend_simd.h --- a/pstl/include/pstl/internal/unseq_backend_simd.h +++ b/pstl/include/pstl/internal/unseq_backend_simd.h @@ -528,23 +528,6 @@ return std::make_pair(__result + __n, __init); } -// As soon as we cannot call __binary_op in "combiner" we create a wrapper over _Tp to encapsulate __binary_op -template -struct _Combiner -{ - _Tp __value; - _BinaryOp* __bin_op; // Here is a pointer to function because of default ctor - - _Combiner() : __value{}, __bin_op(nullptr) {} - _Combiner(const _Tp& value, const _BinaryOp* bin_op) : __value(value), __bin_op(const_cast<_BinaryOp*>(bin_op)) {} - _Combiner(const _Combiner& __obj) : __value{}, __bin_op(__obj.__bin_op) {} - - void - operator()(const _Combiner& __obj) - { - __value = (*__bin_op)(__value, __obj.__value); - } -}; // Exclusive scan for other binary operations and types template _CombinerType; + typedef __internal::_Combiner<_Tp, _BinaryOperation> _CombinerType; _CombinerType __init_{__init, &__binary_op}; _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType) @@ -593,7 +576,7 @@ __simd_scan(_InputIterator __first, _Size __n, _OutputIterator __result, _UnaryOperation __unary_op, _Tp __init, _BinaryOperation __binary_op, std::true_type) { - typedef _Combiner<_Tp, _BinaryOperation> _CombinerType; + typedef __internal::_Combiner<_Tp, _BinaryOperation> _CombinerType; _CombinerType __init_{__init, &__binary_op}; _PSTL_PRAGMA_DECLARE_REDUCTION(__bin_op, _CombinerType) diff --git a/pstl/include/pstl/internal/utils.h b/pstl/include/pstl/internal/utils.h --- a/pstl/include/pstl/internal/utils.h +++ b/pstl/include/pstl/internal/utils.h @@ -169,6 +169,23 @@ } } +// As soon as we cannot call __binary_op in "combiner" we create a wrapper over _Tp to encapsulate __binary_op +template +struct _Combiner +{ + _Tp __value; + _BinaryOp* __bin_op; // Here is a pointer to function because of default ctor + + _Combiner() : __value{}, __bin_op(nullptr) {} + _Combiner(const _Tp& value, const _BinaryOp* bin_op) : __value(value), __bin_op(const_cast<_BinaryOp*>(bin_op)) {} + + void + operator()(const _Combiner& __obj) + { + __value = (*__bin_op)(__value, __obj.__value); + } +}; + } // namespace __internal } // namespace __pstl diff --git a/pstl/test/std/algorithms/alg.modifying.operations/replace.pass.cpp b/pstl/test/std/algorithms/alg.modifying.operations/replace.pass.cpp --- a/pstl/test/std/algorithms/alg.modifying.operations/replace.pass.cpp +++ b/pstl/test/std/algorithms/alg.modifying.operations/replace.pass.cpp @@ -25,6 +25,17 @@ int32_t copied_times = 0; constexpr explicit copy_int(int32_t val = 0) : value(val) {} + constexpr copy_int(const copy_int& other) + { + if (&other == this) + copied_times++; + else + { + value = other.value; + copied_times = other.copied_times; + } + } + constexpr copy_int& operator=(const copy_int& other) { diff --git a/pstl/test/std/numerics/numeric.ops/transform_reduce.pass.cpp b/pstl/test/std/numerics/numeric.ops/transform_reduce.pass.cpp --- a/pstl/test/std/numerics/numeric.ops/transform_reduce.pass.cpp +++ b/pstl/test/std/numerics/numeric.ops/transform_reduce.pass.cpp @@ -38,6 +38,12 @@ MyClass() { my_field = 0; } MyClass(int32_t in) { my_field = in; } MyClass(const MyClass& in) { my_field = in.my_field; } + MyClass& + operator=(const MyClass& in) + { + my_field = in.my_field; + return *this; + } friend MyClass operator+(const MyClass& x, const MyClass& y) @@ -49,11 +55,13 @@ { return MyClass(-x.my_field); } - friend MyClass operator*(const MyClass& x, const MyClass& y) + friend MyClass + operator*(const MyClass& x, const MyClass& y) { return MyClass(x.my_field * y.my_field); } - friend bool operator==(const MyClass& x, const MyClass& y) + friend bool + operator==(const MyClass& x, const MyClass& y) { return x.my_field == y.my_field; } @@ -115,9 +123,10 @@ { test_by_type(42, std::plus(), std::multiplies(), std::negate(), [](std::size_t) -> int32_t { return int32_t(rand() % 1000); }); - test_by_type(0, [](const int64_t& a, const int64_t& b) -> int64_t { return a | b; }, XOR(), - [](const int64_t& x) -> int64_t { return x * 2; }, - [](std::size_t) -> int64_t { return int64_t(rand() % 1000); }); + test_by_type( + 0, [](const int64_t& a, const int64_t& b) -> int64_t { return a | b; }, XOR(), + [](const int64_t& x) -> int64_t { return x * 2; }, + [](std::size_t) -> int64_t { return int64_t(rand() % 1000); }); test_by_type( 1.0f, std::multiplies(), [](const float32_t& a, const float32_t& b) -> float32_t { return a + b; }, [](const float32_t& x) -> float32_t { return x + 2; }, [](std::size_t) -> float32_t { return rand() % 1000; });