Index: pstl/include/__pstl_config_site.in
===================================================================
--- pstl/include/__pstl_config_site.in
+++ pstl/include/__pstl_config_site.in
@@ -11,6 +11,7 @@
 #cmakedefine _PSTL_PAR_BACKEND_SERIAL
 #cmakedefine _PSTL_PAR_BACKEND_TBB
+#cmakedefine _PSTL_PAR_BACKEND_OMP
 #cmakedefine _PSTL_HIDE_FROM_ABI_PER_TU
 #endif // __PSTL_CONFIG_SITE
Index: pstl/include/pstl/internal/omp/parallel_for.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_for.h
@@ -0,0 +1,53 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _Index, class _Fp>
+void
+__parallel_for_body(_Index __first, _Index __last, _Fp __f)
+{
+    // initial partition of the iteration space into chunks
+    auto __policy = __omp_backend::__chunk_partitioner(__first, __last);
+
+    // To avoid over-subscription we use taskloop for the nested parallelism
+    _PSTL_PRAGMA(omp taskloop untied mergeable)
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(__policy, __first, __chunk, __f);
+    }
+}
+
+//------------------------------------------------------------------------
+// Notation:
+// Evaluation of brick f[i,j) for each subrange [i,j) of [first, last)
+//------------------------------------------------------------------------
+
+template <class _ExecutionPolicy, class _Index, class _Fp>
+void
+__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
+{
+    if (__should_run_serial(__first, __last))
+    {
+        __f(__first, __last);
+        return;
+    }
+
+    if (omp_in_parallel())
+    {
+        // we don't create a nested parallel region in an existing parallel
+        // region: just create tasks
+        __pstl::__omp_backend::__parallel_for_body(__first, __last, __f);
+    }
+    else
+    {
+        // in any case (nested or non-nested) one parallel region is created and
+        // only one thread creates a set of tasks
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __pstl::__omp_backend::__parallel_for_body(__first, __last, __f);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/omp/parallel_for_each.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_for_each.h
@@ -0,0 +1,42 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _ForwardIterator, class _Fp>
+void
+__parallel_for_each_body(_ForwardIterator __first, _ForwardIterator __last, _Fp __f)
+{
+    using DifferenceType = typename ::std::iterator_traits<_ForwardIterator>::difference_type;
+    auto __size = ::std::distance(__first, __last);
+
+    _PSTL_PRAGMA(omp taskloop untied mergeable)
+    for (DifferenceType __index = 0; __index < __size; ++__index)
+    {
+        auto __iter = ::std::next(__first, __index);
+        __f(*__iter);
+    }
+}
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Fp>
+void
+__parallel_for_each(_ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last, _Fp __f)
+{
+    if (omp_in_parallel())
+    {
+        // we don't create a nested parallel region in an existing parallel
+        // region: just create tasks
+        __pstl::__omp_backend::__parallel_for_each_body(__first, __last, __f);
+    }
+    else
+    {
+        // in any case (nested or non-nested) one parallel region is created and
+        // only one thread creates a set of tasks
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __pstl::__omp_backend::__parallel_for_each_body(__first, __last, __f);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
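The two headers above rely on a pattern that recurs throughout this backend: open one parallel region, let a single thread act as the task producer, and spread the work with taskloop. As a point of reference for review, here is a minimal, self-contained sketch of that shape in plain OpenMP; the function name, the fill workload, and the fixed chunk size of 512 (mirroring __default_chunk_size defined later in parallel_backend_omp.h) are illustrative only and not part of the patch.

#include <algorithm>
#include <cstddef>
#include <vector>

// One parallel region; one thread creates a taskloop over fixed-size chunks.
void example_fill(std::vector<int>& data)
{
    const std::size_t chunk = 512;
    const std::size_t n_chunks = (data.size() + chunk - 1) / chunk;

#pragma omp parallel
#pragma omp single
    {
#pragma omp taskloop untied mergeable
        for (std::size_t c = 0; c < n_chunks; ++c)
        {
            const std::size_t first = c * chunk;
            const std::size_t last = std::min(first + chunk, data.size());
            for (std::size_t i = first; i < last; ++i)
                data[i] = static_cast<int>(i); // each chunk touches a disjoint slice
        }
    }
}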
Index: pstl/include/pstl/internal/omp/parallel_invoke.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_invoke.h
@@ -0,0 +1,35 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _F1, typename _F2>
+void
+__parallel_invoke_body(_F1&& __f1, _F2&& __f2)
+{
+    _PSTL_PRAGMA(omp taskgroup)
+    {
+        _PSTL_PRAGMA(omp task untied mergeable)
+        {
+            std::forward<_F1>(__f1)();
+        }
+        _PSTL_PRAGMA(omp task untied mergeable)
+        {
+            std::forward<_F2>(__f2)();
+        }
+    }
+}
+
+template <class _ExecutionPolicy, typename _F1, typename _F2>
+void
+__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
+{
+    if (omp_in_parallel())
+    {
+        __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2));
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2));
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
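__parallel_invoke_body is the backend's basic fork-join primitive: two sibling tasks inside a taskgroup, which waits for both children (and their descendants) before returning. A tiny stand-alone illustration of the same shape, with hypothetical names and trivial callables, assuming it is launched from a single producer thread just as the functions above do:

#include <cstdio>

template <class F1, class F2>
void invoke_pair(F1 f1, F2 f2)
{
    // The taskgroup waits for both tasks before control leaves this scope.
#pragma omp taskgroup
    {
#pragma omp task untied mergeable
        f1();
#pragma omp task untied mergeable
        f2();
    }
}

int main()
{
#pragma omp parallel
#pragma omp single
    invoke_pair([] { std::puts("left half"); }, [] { std::puts("right half"); });
}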
Index: pstl/include/pstl/internal/omp/parallel_merge.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_merge.h
@@ -0,0 +1,88 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
+          typename _Compare, typename _LeafMerge>
+void
+__parallel_merge_body(std::size_t __size_x, std::size_t __size_y, _RandomAccessIterator1 __xs,
+                      _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye,
+                      _RandomAccessIterator3 __zs, _Compare __comp, _LeafMerge __leaf_merge)
+{
+
+    if (__size_x + __size_y <= __omp_backend::__default_chunk_size)
+    {
+        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
+        return;
+    }
+
+    _RandomAccessIterator1 __xm;
+    _RandomAccessIterator2 __ym;
+
+    if (__size_x < __size_y)
+    {
+        __ym = __ys + (__size_y / 2);
+        __xm = ::std::upper_bound(__xs, __xe, *__ym, __comp);
+    }
+    else
+    {
+        __xm = __xs + (__size_x / 2);
+        __ym = ::std::lower_bound(__ys, __ye, *__xm, __comp);
+    }
+
+    auto __zm = __zs + std::distance(__xs, __xm) + std::distance(__ys, __ym);
+
+    _PSTL_PRAGMA(omp task untied mergeable default(none)
+                     firstprivate(__xs, __xm, __ys, __ym, __zs, __comp, __leaf_merge))
+    __parallel_merge_body(std::distance(__xs, __xm), std::distance(__ys, __ym), __xs, __xm, __ys, __ym, __zs, __comp,
+                          __leaf_merge);
+
+    _PSTL_PRAGMA(omp task untied mergeable default(none)
+                     firstprivate(__xm, __xe, __ym, __ye, __zm, __comp, __leaf_merge))
+    __parallel_merge_body(std::distance(__xm, __xe), std::distance(__ym, __ye), __xm, __xe, __ym, __ye, __zm, __comp,
+                          __leaf_merge);
+
+    _PSTL_PRAGMA(omp taskwait)
+}
+
+template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
+          typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
+void
+__parallel_merge(_ExecutionPolicy&& /*__exec*/, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
+                 _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs,
+                 _Compare __comp, _LeafMerge __leaf_merge)
+
+{
+    std::size_t __size_x = std::distance(__xs, __xe);
+    std::size_t __size_y = std::distance(__ys, __ye);
+
+    // If the size is too small, don't bother parallelizing it.
+    if (__size_x + __size_y <= __default_chunk_size)
+    {
+        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
+        return;
+    }
+
+    /*
+     * Run the merge in parallel by chunking it up. Use the smaller range (if any) as the iteration range, and the
+     * larger range as the search range.
+     */
+
+    if (omp_in_parallel())
+    {
+        __parallel_merge_body(__size_x, __size_y, __xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge);
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        {
+            _PSTL_PRAGMA(omp single)
+            __parallel_merge_body(__size_x, __size_y, __xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/omp/parallel_reduce.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_reduce.h
@@ -0,0 +1,67 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _RandomAccessIterator, class _Value, typename _RealBody, typename _Reduction>
+_Value
+__parallel_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity,
+                       _RealBody __real_body, _Reduction __reduce)
+{
+    if (__should_run_serial(__first, __last))
+    {
+        return __real_body(__first, __last, __identity);
+    }
+
+    auto __middle = __first + ((__last - __first) / 2);
+    _Value __v1(__identity), __v2(__identity);
+    __parallel_invoke_body(
+        [&]() { __v1 = __parallel_reduce_body(__first, __middle, __identity, __real_body, __reduce); },
+        [&]() { __v2 = __parallel_reduce_body(__middle, __last, __identity, __real_body, __reduce); });
+
+    return __reduce(__v1, __v2);
+}
+
+//------------------------------------------------------------------------
+// Notation:
+//      r(i,j,init) returns reduction of init with reduction over [i,j)
+//      c(x,y) combines values x and y that were the result of r
+//------------------------------------------------------------------------
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Value, typename _RealBody, typename _Reduction>
+_Value
+__parallel_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity,
+                  _RealBody __real_body, _Reduction __reduction)
+{
+    if (__first == __last)
+        return __identity;
+
+    // Don't bother parallelizing the work if the size is too small.
+    if (__should_run_serial(__first, __last))
+    {
+        return __real_body(__first, __last, __identity);
+    }
+
+    // We don't create a nested parallel region in an existing parallel region:
+    // just create tasks.
+    if (omp_in_parallel())
+    {
+        return __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction);
+    }
+
+    // In any case (nested or non-nested) one parallel region is created and only
+    // one thread creates a set of tasks.
+    _Value __res = __identity;
+
+    _PSTL_PRAGMA(omp parallel)
+    _PSTL_PRAGMA(omp single)
+    {
+        __res = __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction);
+    }
+
+    return __res;
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
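The co-ranging logic in __parallel_merge_body can be hard to see in the abstract: it bisects the larger input at its midpoint and binary-searches the other input, so everything routed to the left task is ordered before everything routed to the right task. A small worked example (serial, illustrative values only, exercising the "x is the larger range" branch):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> x = {1, 3, 5, 7, 9, 11, 13, 15}; // the larger, bisected range
    std::vector<int> y = {4, 8, 12};                   // the smaller, searched range

    auto xm = x.begin() + x.size() / 2;                  // points at 9
    auto ym = std::lower_bound(y.begin(), y.end(), *xm); // first y not less than 9, i.e. 12

    // The left task would merge 4 + 2 elements, the right task the remaining 4 + 1.
    std::printf("left: %td + %td, right: %td + %td\n", xm - x.begin(), ym - y.begin(), x.end() - xm, y.end() - ym);
}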
Index: pstl/include/pstl/internal/omp/parallel_scan.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_scan.h
@@ -0,0 +1,120 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _Index>
+_Index
+__split(_Index __m)
+{
+    _Index __k = 1;
+    while (2 * __k < __m)
+        __k *= 2;
+    return __k;
+}
+
+template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
+void
+__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
+{
+    if (__m == 1)
+        __r[0] = __reduce(__i * __tilesize, __lastsize);
+    else
+    {
+        _Index __k = __split(__m);
+        __omp_backend::__parallel_invoke_body(
+            [=] { __omp_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
+            [=] {
+                __omp_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
+            });
+        if (__m == 2 * __k)
+            __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
+    }
+}
+
+template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
+void
+__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
+            _Sp __scan)
+{
+    if (__m == 1)
+        __scan(__i * __tilesize, __lastsize, __initial);
+    else
+    {
+        const _Index __k = __split(__m);
+        __omp_backend::__parallel_invoke_body(
+            [=] { __omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
+            // Assumes that __combine never throws.
+            // TODO: Consider adding a requirement for user functors to be constant.
+            [=, &__combine] {
+                __omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
+                                           __combine(__initial, __r[__k - 1]), __combine, __scan);
+            });
+    }
+}
+
+template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan_body(_Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex)
+{
+    _Index __p = omp_get_num_threads();
+    const _Index __slack = 4;
+    _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
+    _Index __m = (__n - 1) / __tilesize;
+    __buffer<_Tp> __buf(__m + 1);
+    _Tp* __r = __buf.get();
+
+    __omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce, __combine);
+
+    std::size_t __k = __m + 1;
+    _Tp __t = __r[__k - 1];
+    while ((__k &= __k - 1))
+    {
+        __t = __combine(__r[__k - 1], __t);
+    }
+
+    __apex(__combine(__initial, __t));
+    __omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
+                               __combine, __scan);
+}
+
+template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
+                       _Ap __apex)
+{
+    if (__n <= __default_chunk_size)
+    {
+        _Tp __sum = __initial;
+        if (__n)
+        {
+            __sum = __combine(__sum, __reduce(_Index(0), __n));
+        }
+        __apex(__sum);
+        if (__n)
+        {
+            __scan(_Index(0), __n, __initial);
+        }
+        return;
+    }
+
+    if (omp_in_parallel())
+    {
+        __pstl::__omp_backend::__parallel_strict_scan_body<_ExecutionPolicy>(__n, __initial, __reduce, __combine,
+                                                                             __scan, __apex);
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __pstl::__omp_backend::__parallel_strict_scan_body<_ExecutionPolicy>(__n, __initial, __reduce, __combine,
+                                                                                 __scan, __apex);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
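For reference, the tile arithmetic in __parallel_strict_scan_body (which runs inside the parallel region, so omp_get_num_threads() reports the team size) works out as follows for a hypothetical input of 10000 elements on 8 threads; the numbers below only illustrate the formula and are not output of the patch:

#include <cstdio>

int main()
{
    long n = 10000, p = 8, slack = 4;
    long tilesize = (n - 1) / (slack * p) + 1; // 313 elements per tile
    long m = (n - 1) / tilesize;               // 31, so m + 1 == 32 tiles in the upsweep/downsweep tree
    long last = n - m * tilesize;              // 297 elements in the final, shorter tile
    std::printf("%ld tiles of %ld, last tile holds %ld\n", m + 1, tilesize, last);
}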
Index: pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h
@@ -0,0 +1,17 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
+void
+__parallel_stable_partial_sort(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
+                               _LeafSort __leaf_sort, std::size_t __nsort)
+{
+    // TODO: parallel partial sort needs to be implemented; fall back to a serial leaf sort for now.
+    __leaf_sort(__xs, __xe, __comp);
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/omp/parallel_stable_sort.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_stable_sort.h
@@ -0,0 +1,146 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+namespace __sort_details
+{
+template <typename Iterator1, typename Iterator2>
+struct __move_value
+{
+    void
+    operator()(Iterator1 __x, Iterator2 __z)
+    {
+        *__z = ::std::move(*__x);
+    }
+};
+
+template <typename Iterator1, typename Iterator2>
+Iterator2
+__parallel_move_range(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
+{
+    std::size_t __size = std::distance(__first1, __last1);
+
+    // Perform serial moving of small chunks
+    if (__size <= __default_chunk_size)
+    {
+        return ::std::move(__first1, __last1, __first2);
+    }
+
+    // Perform parallel moving of larger chunks
+    auto __policy = __omp_backend::__chunk_partitioner(__first1, __last1);
+
+    _PSTL_PRAGMA(omp taskloop)
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(__policy, __first1, __chunk, [&](auto __chunk_first, auto __chunk_last) {
+            auto __chunk_offset = ::std::distance(__first1, __chunk_first);
+            auto __output = __first2 + __chunk_offset;
+            ::std::move(__chunk_first, __chunk_last, __output);
+        });
+    }
+
+    return __first2 + __size;
+}
+
+template <typename Iterator1, typename Iterator2>
+struct __move_range
+{
+    Iterator2
+    operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
+    {
+        return __parallel_move_range(__first1, __last1, __first2);
+    }
+};
+} // namespace __sort_details
+
+template <typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
+void
+__parallel_stable_sort_body(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
+                            _LeafSort __leaf_sort)
+{
+    using _ValueType = typename std::iterator_traits<_RandomAccessIterator>::value_type;
+    using _VecType = typename std::vector<_ValueType>;
+    using _OutputIterator = typename _VecType::iterator;
+    using _MoveValueType = typename __omp_backend::__sort_details::__move_value<_RandomAccessIterator, _OutputIterator>;
+    using _MoveRangeType = __omp_backend::__sort_details::__move_range<_RandomAccessIterator, _OutputIterator>;
+
+    if (__should_run_serial(__xs, __xe))
+    {
+        __leaf_sort(__xs, __xe, __comp);
+    }
+    else
+    {
+        std::size_t __size = std::distance(__xs, __xe);
+        auto __mid = __xs + (__size / 2);
+        __parallel_invoke_body([&]() { __parallel_stable_sort_body(__xs, __mid, __comp, __leaf_sort); },
+                               [&]() { __parallel_stable_sort_body(__mid, __xe, __comp, __leaf_sort); });
+
+        // Perform a parallel merge of the sorted ranges into __output.
+        _VecType __output(__size);
+        _MoveValueType __move_value;
+        _MoveRangeType __move_range;
+        __utils::__serial_move_merge __merge(__size);
+        __parallel_merge_body(
+            std::distance(__xs, __mid), std::distance(__mid, __xe), __xs, __mid, __mid, __xe, __output.begin(), __comp,
+            [&__merge, &__move_value, &__move_range](_RandomAccessIterator __as, _RandomAccessIterator __ae,
+                                                     _RandomAccessIterator __bs, _RandomAccessIterator __be,
+                                                     _OutputIterator __cs, _Compare __comp) {
+                __merge(__as, __ae, __bs, __be, __cs, __comp, __move_value, __move_value, __move_range, __move_range);
+            });
+
+        // Move the values from __output back into the original source range.
+        __sort_details::__parallel_move_range(__output.begin(), __output.end(), __xs);
+    }
+}
+
+template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
+void
+__parallel_stable_sort(_ExecutionPolicy&& __exec, _RandomAccessIterator __xs, _RandomAccessIterator __xe,
+                       _Compare __comp, _LeafSort __leaf_sort, std::size_t __nsort = 0)
+{
+    if (__xs >= __xe)
+    {
+        return;
+    }
+
+    auto __count = static_cast<std::size_t>(std::distance(__xs, __xe));
+    if (__count <= __default_chunk_size || __nsort < __count)
+    {
+        __leaf_sort(__xs, __xe, __comp);
+        return;
+    }
+
+    // TODO: the partial sort implementation should be shared with the other backends.
+
+    if (omp_in_parallel())
+    {
+        if (__count <= __nsort)
+        {
+            __parallel_stable_sort_body(__xs, __xe, __comp, __leaf_sort);
+        }
+        else
+        {
+            __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort);
+        }
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        if (__count <= __nsort)
+        {
+            __parallel_stable_sort_body(__xs, __xe, __comp, __leaf_sort);
+        }
+        else
+        {
+            __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
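__parallel_stable_sort_body follows the classic top-down merge sort shape: sort the two halves as sibling tasks, merge them into a temporary buffer, then move the buffer back over the input. A serial sketch of that shape, with plain standard library calls standing in for the taskified bricks (names are illustrative, not part of the patch):

#include <algorithm>
#include <vector>

void sort_like_backend(std::vector<int>& v)
{
    auto mid = v.begin() + v.size() / 2;
    std::stable_sort(v.begin(), mid); // left child task in the real code
    std::stable_sort(mid, v.end());   // right child task in the real code

    std::vector<int> output(v.size());
    std::merge(v.begin(), mid, mid, v.end(), output.begin()); // __parallel_merge_body + __serial_move_merge
    std::move(output.begin(), output.end(), v.begin());       // __parallel_move_range back into the source
}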
Index: pstl/include/pstl/internal/omp/parallel_transform_reduce.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_transform_reduce.h
@@ -0,0 +1,102 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+//------------------------------------------------------------------------
+// parallel_transform_reduce
+//
+// Notation:
+//      r(i,j,init) returns reduction of init with reduction over [i,j)
+//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
+//      c(x,y) combines values x and y that were the result of r or u
+//------------------------------------------------------------------------
+
+template <typename _RandomAccessIterator, typename _UnaryOp, typename _Value, typename _Combiner, typename _Reduction>
+auto
+__transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op,
+                        _Value __init, _Combiner __combiner, _Reduction __reduction)
+{
+    using _Size = std::size_t;
+    const _Size __num_threads = omp_get_num_threads();
+    const _Size __size = __last - __first;
+
+    // Initial partition of the iteration space into chunks. If the range is too small,
+    // this will result in a nonsense policy, so we check on the size as well below.
+    auto __policy = __omp_backend::__chunk_partitioner(__first + __num_threads, __last);
+
+    if (__size <= __num_threads || __policy.__n_chunks < 2)
+    {
+        return __reduction(__first, __last, __init);
+    }
+
+    // Here we cannot use an OpenMP UDR because we must store the init value in
+    // the combiner and it will be used several times. Although there should be
+    // only one, we manually generate the identity elements for each thread.
+    ::std::vector<_Value> __accums;
+    __accums.reserve(__num_threads);
+
+    // initialize accumulators for all threads
+    for (_Size __i = 0; __i < __num_threads; ++__i)
+    {
+        __accums.emplace_back(__unary_op(__first + __i));
+    }
+
+    // main loop
+    _PSTL_PRAGMA(omp taskloop shared(__accums))
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(
+            __policy, __first + __num_threads, __chunk, [&](auto __chunk_first, auto __chunk_last) {
+                auto __thread_num = omp_get_thread_num();
+                __accums[__thread_num] = __reduction(__chunk_first, __chunk_last, __accums[__thread_num]);
+            });
+    }
+
+    // combine by accumulators
+    for (_Size __i = 0; __i < __num_threads; ++__i)
+    {
+        __init = __combiner(__init, __accums[__i]);
+    }
+
+    return __init;
+}
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner,
+          class _Reduction>
+_Value
+__parallel_transform_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last,
+                            _UnaryOp __unary_op, _Value __init, _Combiner __combiner, _Reduction __reduction)
+{
+    if (__first == __last)
+    {
+        return __init;
+    }
+
+    _Value __result = __init;
+    if (omp_in_parallel())
+    {
+        // We don't create a nested parallel region in an existing parallel
+        // region: just create tasks.
+        __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
+                                                                  __reduction);
+    }
+    else
+    {
+        // Create a parallel region, and a single thread will create tasks
+        // for the region.
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
+                                                                      __reduction);
+        }
+    }
+
+    return __result;
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
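The accumulator scheme above avoids an OpenMP user-defined reduction: one partial result per thread is seeded from the first __num_threads elements, the rest of the range is consumed by taskloop chunks that fold into the slot of whichever thread runs them, and the partials are combined serially at the end. A simplified sketch with long and operator+ standing in for the generic functors, assuming the input is comfortably larger than the thread count (names and the small-input cutoff are illustrative):

#include <cstddef>
#include <vector>
#include <omp.h>

long sum_like_backend(const std::vector<long>& v)
{
    if (v.size() < 64) // small inputs: just sum serially
    {
        long s = 0;
        for (long x : v)
            s += x;
        return s;
    }

    long result = 0;
#pragma omp parallel
#pragma omp single
    {
        const std::size_t p = omp_get_num_threads();
        // One partial sum per thread, seeded from the first p elements.
        std::vector<long> partial(v.begin(), v.begin() + p);

#pragma omp taskloop shared(partial)
        for (std::size_t i = p; i < v.size(); ++i)
            partial[omp_get_thread_num()] += v[i]; // each thread only touches its own slot

        for (long x : partial) // taskloop has waited for all chunks by this point
            result += x;
    }
    return result;
}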
Index: pstl/include/pstl/internal/omp/parallel_transform_scan.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_transform_scan.h
@@ -0,0 +1,16 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _ExecutionPolicy, typename _Index, typename _Up, typename _Tp, typename _Cp, typename _Rp,
+          typename _Sp>
+_Tp
+__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
+                          _Sp __scan)
+{
+    // No parallel implementation yet: run the scan brick serially over the whole range.
+    return __scan(_Index(0), __n, __init);
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/parallel_backend.h
===================================================================
--- pstl/include/pstl/internal/parallel_backend.h
+++ pstl/include/pstl/internal/parallel_backend.h
@@ -24,6 +24,12 @@
 {
 namespace __par_backend = __tbb_backend;
 }
+#elif defined(_PSTL_PAR_BACKEND_OMP)
+#    include "parallel_backend_omp.h"
+namespace __pstl
+{
+namespace __par_backend = __omp_backend;
+}
 #else
 _PSTL_PRAGMA_MESSAGE("Parallel backend was not specified");
 #endif
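With the dispatch above in place, selecting the OpenMP backend at configure time (so that _PSTL_PAR_BACKEND_OMP lands in the generated __pstl_config_site) and compiling with OpenMP enabled should route the standard parallel policies through __pstl::__omp_backend. A hypothetical smoke test, not part of the patch:

#include <execution>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    std::vector<double> v(1 << 20, 0.5);
    double s = std::transform_reduce(std::execution::par, v.begin(), v.end(), 0.0, std::plus<>{},
                                     [](double x) { return x * x; });
    return s > 0.0 ? 0 : 1;
}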
Index: pstl/include/pstl/internal/parallel_backend_omp.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/parallel_backend_omp.h
@@ -0,0 +1,220 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) Christopher Nelson
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _PSTL_PARALLEL_BACKEND_OMP_H
+#define _PSTL_PARALLEL_BACKEND_OMP_H
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include <omp.h>
+
+#include "parallel_backend_utils.h"
+#include "unseq_backend_simd.h"
+#include "utils.h"
+
+#if !defined(_OPENMP)
+#    error _OPENMP not defined.
+#endif
+
+// Portability "#pragma" definition
+#ifdef _MSC_VER
+#    define _PSTL_PRAGMA(x) __pragma(x)
+#else
+#    define _PSTL_PRAGMA(x) _Pragma(# x)
+#endif
+
+_PSTL_HIDE_FROM_ABI_PUSH
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+//------------------------------------------------------------------------
+// use to cancel execution
+//------------------------------------------------------------------------
+inline void
+__cancel_execution()
+{
+}
+
+//------------------------------------------------------------------------
+// raw buffer
+//------------------------------------------------------------------------
+
+template <typename _Tp>
+class __buffer
+{
+    std::allocator<_Tp> __allocator_;
+    _Tp* __ptr_;
+    const std::size_t __buf_size_;
+    __buffer(const __buffer&) = delete;
+    void
+    operator=(const __buffer&) = delete;
+
+  public:
+    __buffer(std::size_t __n) : __allocator_(), __ptr_(__allocator_.allocate(__n)), __buf_size_(__n) {}
+
+    operator bool() const { return __ptr_ != nullptr; }
+
+    _Tp*
+    get() const
+    {
+        return __ptr_;
+    }
+    ~__buffer() { __allocator_.deallocate(__ptr_, __buf_size_); }
+};
+
+// Preliminary size of each chunk: requires further discussion
+constexpr std::size_t __default_chunk_size = 512;
+
+// Convenience function to determine when we should run serial.
+template <typename _Iterator, std::enable_if_t<!std::is_integral<_Iterator>::value, bool> = true>
+constexpr auto
+__should_run_serial(_Iterator __first, _Iterator __last) -> bool
+{
+    using _difference_type = typename ::std::iterator_traits<_Iterator>::difference_type;
+    auto __size = ::std::distance(__first, __last);
+    return __size <= static_cast<_difference_type>(__default_chunk_size);
+}
+
+template <typename _Index, std::enable_if_t<std::is_integral<_Index>::value, bool> = true>
+constexpr auto
+__should_run_serial(_Index __first, _Index __last) -> bool
+{
+    using _difference_type = ::std::intptr_t;
+    auto __size = __last - __first;
+    return __size <= static_cast<_difference_type>(__default_chunk_size);
+}
+
+struct __chunk_policy
+{
+    std::size_t __n_chunks{0}, __chunk_size{0}, __first_chunk_size{0};
+};
+
+// The iteration space partitioner according to __requested_chunk_size
+template <class _RandomAccessIterator, class _Size = std::size_t>
+auto
+__chunk_partitioner(_RandomAccessIterator __first, _RandomAccessIterator __last,
+                    _Size __requested_chunk_size = __default_chunk_size) -> __chunk_policy
+{
+    /*
+     * This algorithm improves distribution of elements in chunks by avoiding
+     * small tail chunks. The leftover elements that do not fit neatly into
+     * the chunk size are redistributed to early chunks. This improves
+     * utilization of the processor's prefetch and reduces the number of
+     * tasks needed by 1.
+     */
+
+    const _Size __n = __last - __first;
+    _Size __n_chunks = 0;
+    _Size __chunk_size = 0;
+    _Size __first_chunk_size = 0;
+    if (__n < __requested_chunk_size)
+    {
+        __chunk_size = __n;
+        __first_chunk_size = __n;
+        __n_chunks = 1;
+        return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+    }
+
+    __n_chunks = (__n / __requested_chunk_size) + 1;
+    __chunk_size = __n / __n_chunks;
+    __first_chunk_size = __chunk_size;
+    const _Size __n_leftover_items = __n - (__n_chunks * __chunk_size);
+
+    if (__n_leftover_items == __chunk_size)
+    {
+        __n_chunks += 1;
+        return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+    }
+    else if (__n_leftover_items == 0)
+    {
+        __first_chunk_size = __chunk_size;
+        return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+    }
+
+    const _Size __n_extra_items_per_chunk = __n_leftover_items / __n_chunks;
+    const _Size __n_final_leftover_items = __n_leftover_items - (__n_extra_items_per_chunk * __n_chunks);
+
+    __chunk_size += __n_extra_items_per_chunk;
+    __first_chunk_size = __chunk_size + __n_final_leftover_items;
+
+    return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+}
+
+template <typename _Iterator, typename _Index, typename _F>
+void
+__process_chunk(const __chunk_policy& __policy, _Iterator __base, _Index __chunk_index, _F __f)
+{
+    auto __this_chunk_size = __chunk_index == 0 ? __policy.__first_chunk_size : __policy.__chunk_size;
+    auto __index = __chunk_index == 0 ? 0
+                                      : (__chunk_index * __policy.__chunk_size) +
+                                            (__policy.__first_chunk_size - __policy.__chunk_size);
+    auto __first = __base + __index;
+    auto __last = __first + __this_chunk_size;
+    __f(__first, __last);
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
+
+//------------------------------------------------------------------------
+// parallel_invoke
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_invoke.h"
+
+//------------------------------------------------------------------------
+// parallel_for
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_for.h"
+
+//------------------------------------------------------------------------
+// parallel_for_each
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_for_each.h"
+
+//------------------------------------------------------------------------
+// parallel_reduce
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_reduce.h"
+#include "./omp/parallel_transform_reduce.h"
+
+//------------------------------------------------------------------------
+// parallel_scan
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_scan.h"
+#include "./omp/parallel_transform_scan.h"
+
+//------------------------------------------------------------------------
+// parallel_stable_sort
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_stable_partial_sort.h"
+#include "./omp/parallel_stable_sort.h"
+
+//------------------------------------------------------------------------
+// parallel_merge
+//------------------------------------------------------------------------
+#include "./omp/parallel_merge.h"
+
+#endif //_PSTL_PARALLEL_BACKEND_OMP_H
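A worked example of the arithmetic in __chunk_partitioner for a range of 10007 elements with the default requested chunk size of 512: the leftover 7 elements are folded into the first chunk rather than producing a small tail chunk (the figures below only illustrate the formula, they are not output produced by the patch):

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t n = 10007, requested = 512;
    std::size_t n_chunks = n / requested + 1;                   // 20
    std::size_t chunk_size = n / n_chunks;                      // 500
    std::size_t leftover = n - n_chunks * chunk_size;           // 7 (neither 0 nor a full chunk)
    std::size_t extra = leftover / n_chunks;                    // 0 extra elements per chunk
    std::size_t final_leftover = leftover - extra * n_chunks;   // 7
    chunk_size += extra;                                        // still 500
    std::size_t first_chunk_size = chunk_size + final_leftover; // 507; 507 + 19 * 500 == 10007
    std::printf("%zu chunks: first %zu, others %zu\n", n_chunks, first_chunk_size, chunk_size);
}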
Index: pstl/include/pstl/internal/pstl_config.h
===================================================================
--- pstl/include/pstl/internal/pstl_config.h
+++ pstl/include/pstl/internal/pstl_config.h
@@ -18,7 +18,7 @@
 #define _PSTL_VERSION_MINOR ((_PSTL_VERSION % 1000) / 10)
 #define _PSTL_VERSION_PATCH (_PSTL_VERSION % 10)
 
-#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB)
+#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB) && !defined(_PSTL_PAR_BACKEND_OMP)
 #    error "A parallel backend must be specified"
 #endif
 
@@ -98,7 +98,9 @@
 #endif
 
 // Should be defined to 1 for environments with a vendor implementation of C++17 execution policies
-#define _PSTL_CPP17_EXECUTION_POLICIES_PRESENT (_MSC_VER >= 1912)
+#define _PSTL_CPP17_EXECUTION_POLICIES_PRESENT                                                                        \
+    (_MSC_VER >= 1912 && _MSVC_LANG >= 201703L) ||                                                                    \
+        (_GLIBCXX_RELEASE >= 9 && __GLIBCXX__ >= 20190503 && __cplusplus >= 201703L)
 
 #define _PSTL_CPP14_2RANGE_MISMATCH_EQUAL_PRESENT                                                                     \
     (_MSC_VER >= 1900 || __cplusplus >= 201300L || __cpp_lib_robust_nonmodifying_seq_ops == 201304)
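Finally, a hypothetical compile-time sanity check for a build configured with the new backend (not part of the patch; the include path is an assumption about how the internal header is reached in a given build tree): exactly one _PSTL_PAR_BACKEND_* macro should be defined.

#include "pstl/internal/pstl_config.h"

#if defined(_PSTL_PAR_BACKEND_OMP) && !defined(_PSTL_PAR_BACKEND_TBB) && !defined(_PSTL_PAR_BACKEND_SERIAL)
// OpenMP backend selected; the backend check in pstl_config.h passes.
#else
#    error "expected _PSTL_PAR_BACKEND_OMP to be the only backend macro defined"
#endif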