diff --git a/libcxx/cmake/Modules/DefineLinkerScript.cmake b/libcxx/cmake/Modules/DefineLinkerScript.cmake --- a/libcxx/cmake/Modules/DefineLinkerScript.cmake +++ b/libcxx/cmake/Modules/DefineLinkerScript.cmake @@ -31,7 +31,7 @@ set(link_libraries) if (interface_libs) foreach(lib IN LISTS interface_libs) - if ("${lib}" STREQUAL "cxx-headers") + if ("${lib}" MATCHES "cxx-headers|ParallelSTL") continue() endif() # If ${lib} is not a target, we use a dummy target which we know will diff --git a/pstl/CMakeLists.txt b/pstl/CMakeLists.txt --- a/pstl/CMakeLists.txt +++ b/pstl/CMakeLists.txt @@ -16,7 +16,7 @@ project(ParallelSTL VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_PATCH} LANGUAGES CXX) -set(PSTL_PARALLEL_BACKEND "serial" CACHE STRING "Threading backend to use. Valid choices are 'serial' and 'tbb'. The default is 'serial'.") +set(PSTL_PARALLEL_BACKEND "serial" CACHE STRING "Threading backend to use. Valid choices are 'serial', 'omp', and 'tbb'. The default is 'serial'.") set(PSTL_HIDE_FROM_ABI_PER_TU OFF CACHE BOOL "Whether to constrain ABI-unstable symbols to each translation unit (basically, mark them with C's static keyword).") set(_PSTL_HIDE_FROM_ABI_PER_TU ${PSTL_HIDE_FROM_ABI_PER_TU}) # For __pstl_config_site @@ -43,6 +43,10 @@ message(STATUS "Parallel STL uses TBB ${TBB_VERSION} (interface version: ${TBB_INTERFACE_VERSION})") target_link_libraries(ParallelSTL INTERFACE TBB::tbb) set(_PSTL_PAR_BACKEND_TBB ON) +elseif (PSTL_PARALLEL_BACKEND STREQUAL "omp") + message(STATUS "Parallel STL uses the omp backend") + target_compile_options(ParallelSTL INTERFACE "-fopenmp=libomp") + set(_PSTL_PAR_BACKEND_OPENMP ON) else() message(FATAL_ERROR "Requested unknown Parallel STL backend '${PSTL_PARALLEL_BACKEND}'.") endif() diff --git a/pstl/CREDITS.txt b/pstl/CREDITS.txt --- a/pstl/CREDITS.txt +++ b/pstl/CREDITS.txt @@ -15,3 +15,7 @@ N: Thomas Rodgers E: trodgers@redhat.com D: Identifier name transformation for inclusion in a Standard C++ library. + +N: Christopher Nelson +E: nadiasvertex@gmail.com +D: Add support for an OpenMP backend. diff --git a/pstl/include/__pstl_config_site.in b/pstl/include/__pstl_config_site.in --- a/pstl/include/__pstl_config_site.in +++ b/pstl/include/__pstl_config_site.in @@ -11,6 +11,7 @@ #cmakedefine _PSTL_PAR_BACKEND_SERIAL #cmakedefine _PSTL_PAR_BACKEND_TBB +#cmakedefine _PSTL_PAR_BACKEND_OPENMP #cmakedefine _PSTL_HIDE_FROM_ABI_PER_TU #endif // __PSTL_CONFIG_SITE diff --git a/pstl/include/pstl/internal/omp/parallel_for.h b/pstl/include/pstl/internal/omp/parallel_for.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_for.h @@ -0,0 +1,64 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_FOR_H +#define _PSTL_INTERNAL_OMP_PARALLEL_FOR_H + +#include + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +void +__parallel_for_body(_Index __first, _Index __last, _Fp __f) +{ + // initial partition of the iteration space into chunks + auto __policy = __omp_backend::__chunk_partitioner(__first, __last); + + // To avoid over-subscription we use taskloop for the nested parallelism + _PSTL_PRAGMA(omp taskloop untied mergeable) + for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk) + { + __omp_backend::__process_chunk(__policy, __first, __chunk, __f); + } +} + +//------------------------------------------------------------------------ +// Notation: +// Evaluation of brick f[i,j) for each subrange [i,j) of [first, last) +//------------------------------------------------------------------------ + +template +void +__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f) +{ + if (omp_in_parallel()) + { + // we don't create a nested parallel region in an existing parallel + // region: just create tasks + __pstl::__omp_backend::__parallel_for_body(__first, __last, __f); + } + else + { + // in any case (nested or non-nested) one parallel region is created and + // only one thread creates a set of tasks + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) { __pstl::__omp_backend::__parallel_for_body(__first, __last, __f); } + } +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_FOR_H diff --git a/pstl/include/pstl/internal/omp/parallel_for_each.h b/pstl/include/pstl/internal/omp/parallel_for_each.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_for_each.h @@ -0,0 +1,59 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_FOR_EACH_H +#define _PSTL_INTERNAL_OMP_PARALLEL_FOR_EACH_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +void +__parallel_for_each_body(_ForwardIterator __first, _ForwardIterator __last, _Fp __f) +{ + using DifferenceType = typename std::iterator_traits<_ForwardIterator>::difference_type; + // TODO: Think of an approach to remove the std::distance call + auto __size = std::distance(__first, __last); + + _PSTL_PRAGMA(omp taskloop untied mergeable) + for (DifferenceType __index = 0; __index < __size; ++__index) + { + // TODO: Think of an approach to remove the increment here each time. 
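+        // std::next has to walk from __first on every iteration because _ForwardIterator only
+        // guarantees single-step advancement; that repeated re-traversal is the overhead the
+        // TODO above refers to.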
+ auto __iter = std::next(__first, __index); + __f(*__iter); + } +} + +template +void +__parallel_for_each(_ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last, _Fp __f) +{ + if (omp_in_parallel()) + { + // we don't create a nested parallel region in an existing parallel + // region: just create tasks + __pstl::__omp_backend::__parallel_for_each_body(__first, __last, __f); + } + else + { + // in any case (nested or non-nested) one parallel region is created and + // only one thread creates a set of tasks + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) { __pstl::__omp_backend::__parallel_for_each_body(__first, __last, __f); } + } +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_FOR_EACH_H diff --git a/pstl/include/pstl/internal/omp/parallel_invoke.h b/pstl/include/pstl/internal/omp/parallel_invoke.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_invoke.h @@ -0,0 +1,50 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_INVOKE_H +#define _PSTL_INTERNAL_OMP_PARALLEL_INVOKE_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +void +__parallel_invoke_body(_F1&& __f1, _F2&& __f2) +{ + _PSTL_PRAGMA(omp taskgroup) + { + _PSTL_PRAGMA(omp task untied mergeable) { std::forward<_F1>(__f1)(); } + _PSTL_PRAGMA(omp task untied mergeable) { std::forward<_F2>(__f2)(); } + } +} + +template +void +__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2) +{ + if (omp_in_parallel()) + { + __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); + } + else + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2)); + } +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_INVOKE_H diff --git a/pstl/include/pstl/internal/omp/parallel_merge.h b/pstl/include/pstl/internal/omp/parallel_merge.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_merge.h @@ -0,0 +1,94 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_MERGE_H +#define _PSTL_INTERNAL_OMP_PARALLEL_MERGE_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +void +__parallel_merge_body(std::size_t __size_x, std::size_t __size_y, _RandomAccessIterator1 __xs, + _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, + _RandomAccessIterator3 __zs, _Compare __comp, _LeafMerge __leaf_merge) +{ + + if (__size_x + __size_y <= __omp_backend::__default_chunk_size) + { + __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp); + return; + } + + _RandomAccessIterator1 __xm; + _RandomAccessIterator2 __ym; + + if (__size_x < __size_y) + { + __ym = __ys + (__size_y / 2); + __xm = std::upper_bound(__xs, __xe, *__ym, __comp); + } + else + { + __xm = __xs + (__size_x / 2); + __ym = std::lower_bound(__ys, __ye, *__xm, __comp); + } + + auto __zm = __zs + (__xm - __xs) + (__ym - __ys); + + _PSTL_PRAGMA(omp task untied mergeable default(none) + firstprivate(__xs, __xm, __ys, __ym, __zs, __comp, __leaf_merge)) + __parallel_merge_body(__xm - __xs, __ym - __ys, __xs, __xm, __ys, __ym, __zs, __comp, __leaf_merge); + + _PSTL_PRAGMA(omp task untied mergeable default(none) + firstprivate(__xm, __xe, __ym, __ye, __zm, __comp, __leaf_merge)) + __parallel_merge_body(__xe - __xm, __ye - __ym, __xm, __xe, __ym, __ye, __zm, __comp, __leaf_merge); + + _PSTL_PRAGMA(omp taskwait) +} + +template +void +__parallel_merge(_ExecutionPolicy&& /*__exec*/, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe, + _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs, _Compare __comp, + _LeafMerge __leaf_merge) + +{ + std::size_t __size_x = __xe - __xs; + std::size_t __size_y = __ye - __ys; + + /* + * Run the merge in parallel by chunking it up. Use the smaller range (if any) as the iteration range, and the + * larger range as the search range. + */ + + if (omp_in_parallel()) + { + __parallel_merge_body(__size_x, __size_y, __xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge); + } + else + { + _PSTL_PRAGMA(omp parallel) + { + _PSTL_PRAGMA(omp single nowait) + __parallel_merge_body(__size_x, __size_y, __xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge); + } + } +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_MERGE_H diff --git a/pstl/include/pstl/internal/omp/parallel_reduce.h b/pstl/include/pstl/internal/omp/parallel_reduce.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_reduce.h @@ -0,0 +1,68 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_REDUCE_H +#define _PSTL_INTERNAL_OMP_PARALLEL_REDUCE_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +_Value +__parallel_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity, + _RealBody __real_body, _Reduction __reduce) +{ + auto __middle = __first + ((__last - __first) / 2); + _Value __v1(__identity), __v2(__identity); + __parallel_invoke_body( + [&]() { __v1 = __parallel_reduce_body(__first, __middle, __identity, __real_body, __reduce); }, + [&]() { __v2 = __parallel_reduce_body(__middle, __last, __identity, __real_body, __reduce); }); + + return __reduce(__v1, __v2); +} + +//------------------------------------------------------------------------ +// Notation: +// r(i,j,init) returns reduction of init with reduction over [i,j) +// c(x,y) combines values x and y that were the result of r +//------------------------------------------------------------------------ + +template +_Value +__parallel_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity, + _RealBody __real_body, _Reduction __reduction) +{ + // We don't create a nested parallel region in an existing parallel region: + // just create tasks. + if (omp_in_parallel()) + { + return __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction); + } + + // In any case (nested or non-nested) one parallel region is created and only + // one thread creates a set of tasks. + _Value __res = __identity; + + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + { + __res = __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction); + } + + return __res; +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_REDUCE_H diff --git a/pstl/include/pstl/internal/omp/parallel_scan.h b/pstl/include/pstl/internal/omp/parallel_scan.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_scan.h @@ -0,0 +1,136 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_SCAN_H +#define _PSTL_INTERNAL_OMP_PARALLEL_SCAN_H + +#include "parallel_invoke.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +_Index +__split(_Index __m) +{ + _Index __k = 1; + while (2 * __k < __m) + __k *= 2; + return __k; +} + +template +void +__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine) +{ + if (__m == 1) + __r[0] = __reduce(__i * __tilesize, __lastsize); + else + { + _Index __k = __split(__m); + __omp_backend::__parallel_invoke_body( + [=] { __omp_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); }, + [=] { + __omp_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine); + }); + if (__m == 2 * __k) + __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]); + } +} + +template +void +__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine, + _Sp __scan) +{ + if (__m == 1) + __scan(__i * __tilesize, __lastsize, __initial); + else + { + const _Index __k = __split(__m); + __omp_backend::__parallel_invoke_body( + [=] { __omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); }, + // Assumes that __combine never throws. + // TODO: Consider adding a requirement for user functors to be constant. + [=, &__combine] + { + __omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, + __combine(__initial, __r[__k - 1]), __combine, __scan); + }); + } +} + +template +void +__parallel_strict_scan_body(_Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex) +{ + _Index __p = omp_get_num_threads(); + const _Index __slack = 4; + _Index __tilesize = (__n - 1) / (__slack * __p) + 1; + _Index __m = (__n - 1) / __tilesize; + __buffer<_Tp> __buf(__m + 1); + _Tp* __r = __buf.get(); + + __omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce, __combine); + + std::size_t __k = __m + 1; + _Tp __t = __r[__k - 1]; + while ((__k &= __k - 1)) + { + __t = __combine(__r[__k - 1], __t); + } + + __apex(__combine(__initial, __t)); + __omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial, + __combine, __scan); +} + +template +void +__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, + _Ap __apex) +{ + if (__n <= __default_chunk_size) + { + _Tp __sum = __initial; + if (__n) + { + __sum = __combine(__sum, __reduce(_Index(0), __n)); + } + __apex(__sum); + if (__n) + { + __scan(_Index(0), __n, __initial); + } + return; + } + + if (omp_in_parallel()) + { + __pstl::__omp_backend::__parallel_strict_scan_body<_ExecutionPolicy>(__n, __initial, __reduce, __combine, + __scan, __apex); + } + else + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + { + __pstl::__omp_backend::__parallel_strict_scan_body<_ExecutionPolicy>(__n, __initial, __reduce, __combine, + __scan, __apex); + } + } +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_SCAN_H diff --git a/pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h b/pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h @@ -0,0 +1,32 @@ +// -*- 
C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_STABLE_PARTIAL_SORT_H +#define _PSTL_INTERNAL_OMP_PARALLEL_STABLE_PARTIAL_SORT_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +void +__parallel_stable_partial_sort(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp, + _LeafSort __leaf_sort, std::size_t /* __nsort */) +{ + // TODO: "Parallel partial sort needs to be implemented."); + __leaf_sort(__xs, __xe, __comp); +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_STABLE_PARTIAL_SORT_H diff --git a/pstl/include/pstl/internal/omp/parallel_stable_sort.h b/pstl/include/pstl/internal/omp/parallel_stable_sort.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_stable_sort.h @@ -0,0 +1,157 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_STABLE_SORT_H +#define _PSTL_INTERNAL_OMP_PARALLEL_STABLE_SORT_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +namespace __sort_details +{ +struct __move_value +{ + template + void + operator()(_Iterator __x, _OutputIterator __z) const + { + *__z = std::move(*__x); + } +}; + +template +_OutputIterator +__parallel_move_range(_RandomAccessIterator __first1, _RandomAccessIterator __last1, _OutputIterator __d_first) +{ + std::size_t __size = __last1 - __first1; + + // Perform serial moving of small chunks + + if (__size <= __default_chunk_size) + { + return std::move(__first1, __last1, __d_first); + } + + // Perform parallel moving of larger chunks + auto __policy = __omp_backend::__chunk_partitioner(__first1, __last1); + + _PSTL_PRAGMA(omp taskloop) + for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk) + { + __omp_backend::__process_chunk(__policy, __first1, __chunk, + [&](auto __chunk_first, auto __chunk_last) + { + auto __chunk_offset = __chunk_first - __first1; + auto __output_it = __d_first + __chunk_offset; + std::move(__chunk_first, __chunk_last, __output_it); + }); + } + + return __d_first + __size; +} + +struct __move_range +{ + template + _OutputIterator + operator()(_RandomAccessIterator __first1, _RandomAccessIterator __last1, _OutputIterator __d_first) const + { + return __parallel_move_range(__first1, __last1, __d_first); + } +}; +} // namespace __sort_details + +template +void +__parallel_stable_sort_body(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp, + _LeafSort __leaf_sort) +{ + using _ValueType = typename std::iterator_traits<_RandomAccessIterator>::value_type; + using _VecType = typename std::vector<_ValueType>; + using _OutputIterator = typename _VecType::iterator; + using _MoveValue = typename __omp_backend::__sort_details::__move_value; + using _MoveRange = 
__omp_backend::__sort_details::__move_range; + + if (__should_run_serial(__xs, __xe)) + { + __leaf_sort(__xs, __xe, __comp); + } + else + { + std::size_t __size = __xe - __xs; + auto __mid = __xs + (__size / 2); + __parallel_invoke_body([&]() { __parallel_stable_sort_body(__xs, __mid, __comp, __leaf_sort); }, + [&]() { __parallel_stable_sort_body(__mid, __xe, __comp, __leaf_sort); }); + + // Perform a parallel merge of the sorted ranges into __output_data. + _VecType __output_data(__size); + _MoveValue __move_value; + _MoveRange __move_range; + __utils::__serial_move_merge __merge(__size); + __parallel_merge_body( + __mid - __xs, __xe - __mid, __xs, __mid, __mid, __xe, __output_data.begin(), __comp, + [&__merge, &__move_value, &__move_range](_RandomAccessIterator __as, _RandomAccessIterator __ae, + _RandomAccessIterator __bs, _RandomAccessIterator __be, + _OutputIterator __cs, _Compare __comp) + { __merge(__as, __ae, __bs, __be, __cs, __comp, __move_value, __move_value, __move_range, __move_range); }); + + // Move the values from __output_data back in the original source range. + __omp_backend::__sort_details::__parallel_move_range(__output_data.begin(), __output_data.end(), __xs); + } +} + +template +void +__parallel_stable_sort(_ExecutionPolicy&& /*__exec*/, _RandomAccessIterator __xs, _RandomAccessIterator __xe, + _Compare __comp, _LeafSort __leaf_sort, std::size_t __nsort = 0) +{ + auto __count = static_cast(__xe - __xs); + if (__count <= __default_chunk_size || __nsort < __count) + { + __leaf_sort(__xs, __xe, __comp); + return; + } + + // TODO: the partial sort implementation should + // be shared with the other backends. + + if (omp_in_parallel()) + { + if (__count <= __nsort) + { + __parallel_stable_sort_body(__xs, __xe, __comp, __leaf_sort); + } + else + { + __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort); + } + } + else + { + _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + if (__count <= __nsort) + { + __parallel_stable_sort_body(__xs, __xe, __comp, __leaf_sort); + } + else + { + __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort); + } + } +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_STABLE_SORT_H diff --git a/pstl/include/pstl/internal/omp/parallel_transform_reduce.h b/pstl/include/pstl/internal/omp/parallel_transform_reduce.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_transform_reduce.h @@ -0,0 +1,112 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H +#define _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +//------------------------------------------------------------------------ +// parallel_transform_reduce +// +// Notation: +// r(i,j,init) returns reduction of init with reduction over [i,j) +// u(i) returns f(i,i+1,identity) for a hypothetical left identity element +// of r c(x,y) combines values x and y that were the result of r or u +//------------------------------------------------------------------------ + +template +auto +__transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op, _Value __init, + _Combiner __combiner, _Reduction __reduction) +{ + const std::size_t __num_threads = omp_get_num_threads(); + const std::size_t __size = __last - __first; + + // Initial partition of the iteration space into chunks. If the range is too small, + // this will result in a nonsense policy, so we check on the size as well below. + auto __policy = __omp_backend::__chunk_partitioner(__first + __num_threads, __last); + + if (__size <= __num_threads || __policy.__n_chunks < 2) + { + return __reduction(__first, __last, __init); + } + + // Here, we cannot use OpenMP UDR because we must store the init value in + // the combiner and it will be used several times. Although there should be + // the only one; we manually generate the identity elements for each thread. + std::vector<_Value> __accums; + __accums.reserve(__num_threads); + + // initialize accumulators for all threads + for (std::size_t __i = 0; __i < __num_threads; ++__i) + { + __accums.emplace_back(__unary_op(__first + __i)); + } + + // main loop + _PSTL_PRAGMA(omp taskloop shared(__accums)) + for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk) + { + __omp_backend::__process_chunk(__policy, __first + __num_threads, __chunk, + [&](auto __chunk_first, auto __chunk_last) + { + auto __thread_num = omp_get_thread_num(); + __accums[__thread_num] = + __reduction(__chunk_first, __chunk_last, __accums[__thread_num]); + }); + } + + // combine by accumulators + for (std::size_t __i = 0; __i < __num_threads; ++__i) + { + __init = __combiner(__init, __accums[__i]); + } + + return __init; +} + +template +_Value +__parallel_transform_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, + _UnaryOp __unary_op, _Value __init, _Combiner __combiner, _Reduction __reduction) +{ + _Value __result = __init; + if (omp_in_parallel()) + { + // We don't create a nested parallel region in an existing parallel + // region: just create tasks + __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner, + __reduction); + } + else + { + // Create a parallel region, and a single thread will create tasks + // for the region. 
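+        // With "single nowait", exactly one thread runs the task generator while the rest of
+        // the team is free to execute the generated tasks; the implicit barrier at the end of
+        // the parallel region then waits for all of those tasks to finish.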
+ _PSTL_PRAGMA(omp parallel) + _PSTL_PRAGMA(omp single nowait) + { + __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner, + __reduction); + } + } + + return __result; +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_REDUCE_H diff --git a/pstl/include/pstl/internal/omp/parallel_transform_scan.h b/pstl/include/pstl/internal/omp/parallel_transform_scan.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/parallel_transform_scan.h @@ -0,0 +1,32 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_SCAN_H +#define _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_SCAN_H + +#include "util.h" + +namespace __pstl +{ +namespace __omp_backend +{ + +template +_Tp +__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up /* __u */, _Tp __init, _Cp /* __combine */, + _Rp /* __brick_reduce */, _Sp __scan) +{ + // TODO: parallelize this function. + return __scan(_Index(0), __n, __init); +} + +} // namespace __omp_backend +} // namespace __pstl +#endif // _PSTL_INTERNAL_OMP_PARALLEL_TRANSFORM_SCAN_H diff --git a/pstl/include/pstl/internal/omp/util.h b/pstl/include/pstl/internal/omp/util.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/omp/util.h @@ -0,0 +1,173 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_INTERNAL_OMP_UTIL_H +#define _PSTL_INTERNAL_OMP_UTIL_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../parallel_backend_utils.h" +#include "../unseq_backend_simd.h" +#include "../utils.h" + +// Portability "#pragma" definition +#ifdef _MSC_VER +# define _PSTL_PRAGMA(x) __pragma(x) +#else +# define _PSTL_PRAGMA(x) _Pragma(# x) +#endif + +_PSTL_HIDE_FROM_ABI_PUSH + +namespace __pstl +{ +namespace __omp_backend +{ + +//------------------------------------------------------------------------ +// use to cancel execution +//------------------------------------------------------------------------ +inline void +__cancel_execution() +{ + // TODO: Figure out how to make cancelation work. 
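+    // One possible approach (not wired up in this patch) would be issuing
+    // "#pragma omp cancel taskgroup" from the task bodies; note that OpenMP cancellation only
+    // takes effect when it is activated at run time (OMP_CANCELLATION=true, see
+    // omp_get_cancellation()).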
+} + +//------------------------------------------------------------------------ +// raw buffer +//------------------------------------------------------------------------ + +template +class __buffer +{ + std::allocator<_Tp> __allocator_; + _Tp* __ptr_; + const std::size_t __buf_size_; + __buffer(const __buffer&) = delete; + void + operator=(const __buffer&) = delete; + + public: + __buffer(std::size_t __n) : __allocator_(), __ptr_(__allocator_.allocate(__n)), __buf_size_(__n) {} + + operator bool() const { return __ptr_ != nullptr; } + + _Tp* + get() const + { + return __ptr_; + } + ~__buffer() { __allocator_.deallocate(__ptr_, __buf_size_); } +}; + +// Preliminary size of each chunk: requires further discussion +inline constexpr std::size_t __default_chunk_size = 2048; + +// Convenience function to determine when we should run serial. +template ::value, bool> = true> +constexpr auto +__should_run_serial(_Iterator __first, _Iterator __last) -> bool +{ + using _difference_type = typename std::iterator_traits<_Iterator>::difference_type; + auto __size = std::distance(__first, __last); + return __size <= static_cast<_difference_type>(__default_chunk_size); +} + +template ::value, bool> = true> +constexpr auto +__should_run_serial(_Index __first, _Index __last) -> bool +{ + using _difference_type = _Index; + auto __size = __last - __first; + return __size <= static_cast<_difference_type>(__default_chunk_size); +} + +struct __chunk_metrics +{ + std::size_t __n_chunks; + std::size_t __chunk_size; + std::size_t __first_chunk_size; +}; + +// The iteration space partitioner according to __requested_chunk_size +template +auto +__chunk_partitioner(_RandomAccessIterator __first, _RandomAccessIterator __last, + _Size __requested_chunk_size = __default_chunk_size) -> __chunk_metrics +{ + /* + * This algorithm improves distribution of elements in chunks by avoiding + * small tail chunks. The leftover elements that do not fit neatly into + * the chunk size are redistributed to early chunks. This improves + * utilization of the processor's prefetch and reduces the number of + * tasks needed by 1. + */ + + const _Size __n = __last - __first; + _Size __n_chunks = 0; + _Size __chunk_size = 0; + _Size __first_chunk_size = 0; + if (__n < __requested_chunk_size) + { + __chunk_size = __n; + __first_chunk_size = __n; + __n_chunks = 1; + return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size}; + } + + __n_chunks = (__n / __requested_chunk_size) + 1; + __chunk_size = __n / __n_chunks; + __first_chunk_size = __chunk_size; + const _Size __n_leftover_items = __n - (__n_chunks * __chunk_size); + + if (__n_leftover_items == __chunk_size) + { + __n_chunks += 1; + return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size}; + } + else if (__n_leftover_items == 0) + { + __first_chunk_size = __chunk_size; + return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size}; + } + + const _Size __n_extra_items_per_chunk = __n_leftover_items / __n_chunks; + const _Size __n_final_leftover_items = __n_leftover_items - (__n_extra_items_per_chunk * __n_chunks); + + __chunk_size += __n_extra_items_per_chunk; + __first_chunk_size = __chunk_size + __n_final_leftover_items; + + return __chunk_metrics{__n_chunks, __chunk_size, __first_chunk_size}; +} + +template +void +__process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chunk_index, _Func __f) +{ + auto __this_chunk_size = __chunk_index == 0 ? __metrics.__first_chunk_size : __metrics.__chunk_size; + auto __index = __chunk_index == 0 ? 
0 + : (__chunk_index * __metrics.__chunk_size) + + (__metrics.__first_chunk_size - __metrics.__chunk_size); + auto __first = __base + __index; + auto __last = __first + __this_chunk_size; + __f(__first, __last); +} + +} // namespace __omp_backend +} // namespace __pstl + +#endif // _PSTL_INTERNAL_OMP_UTIL_H diff --git a/pstl/include/pstl/internal/parallel_backend.h b/pstl/include/pstl/internal/parallel_backend.h --- a/pstl/include/pstl/internal/parallel_backend.h +++ b/pstl/include/pstl/internal/parallel_backend.h @@ -24,6 +24,12 @@ { namespace __par_backend = __tbb_backend; } +#elif defined(_PSTL_PAR_BACKEND_OPENMP) +# include "parallel_backend_omp.h" +namespace __pstl +{ +namespace __par_backend = __omp_backend; +} #else _PSTL_PRAGMA_MESSAGE("Parallel backend was not specified"); #endif diff --git a/pstl/include/pstl/internal/parallel_backend_omp.h b/pstl/include/pstl/internal/parallel_backend_omp.h new file mode 100644 --- /dev/null +++ b/pstl/include/pstl/internal/parallel_backend_omp.h @@ -0,0 +1,58 @@ +// -*- C++ -*- +// -*-===----------------------------------------------------------------------===// +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// +//===----------------------------------------------------------------------===// + +#ifndef _PSTL_PARALLEL_BACKEND_OMP_H +#define _PSTL_PARALLEL_BACKEND_OMP_H + +//------------------------------------------------------------------------ +// parallel_invoke +//------------------------------------------------------------------------ + +#include "./omp/parallel_invoke.h" + +//------------------------------------------------------------------------ +// parallel_for +//------------------------------------------------------------------------ + +#include "./omp/parallel_for.h" + +//------------------------------------------------------------------------ +// parallel_for_each +//------------------------------------------------------------------------ + +#include "./omp/parallel_for_each.h" + +//------------------------------------------------------------------------ +// parallel_reduce +//------------------------------------------------------------------------ + +#include "./omp/parallel_reduce.h" +#include "./omp/parallel_transform_reduce.h" + +//------------------------------------------------------------------------ +// parallel_scan +//------------------------------------------------------------------------ + +#include "./omp/parallel_scan.h" +#include "./omp/parallel_transform_scan.h" + +//------------------------------------------------------------------------ +// parallel_stable_sort +//------------------------------------------------------------------------ + +#include "./omp/parallel_stable_partial_sort.h" +#include "./omp/parallel_stable_sort.h" + +//------------------------------------------------------------------------ +// parallel_merge +//------------------------------------------------------------------------ +#include "./omp/parallel_merge.h" + +#endif //_PSTL_PARALLEL_BACKEND_OMP_H diff --git a/pstl/include/pstl/internal/pstl_config.h b/pstl/include/pstl/internal/pstl_config.h --- a/pstl/include/pstl/internal/pstl_config.h +++ b/pstl/include/pstl/internal/pstl_config.h @@ -18,7 +18,7 @@ #define _PSTL_VERSION_MINOR ((_PSTL_VERSION % 1000) / 10) #define _PSTL_VERSION_PATCH (_PSTL_VERSION % 10) -#if !defined(_PSTL_PAR_BACKEND_SERIAL) && 
!defined(_PSTL_PAR_BACKEND_TBB) +#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB) && !defined(_PSTL_PAR_BACKEND_OPENMP) # error "A parallel backend must be specified" #endif @@ -96,7 +96,8 @@ #endif // Should be defined to 1 for environments with a vendor implementation of C++17 execution policies -#define _PSTL_CPP17_EXECUTION_POLICIES_PRESENT (_MSC_VER >= 1912) +#define _PSTL_CPP17_EXECUTION_POLICIES_PRESENT (_MSC_VER >= 1912 && _MSVC_LANG >= 201703L) || \ + (_GLIBCXX_RELEASE >= 9 && __GLIBCXX__ >= 20190503 && __cplusplus >= 201703L) #if (defined(_MSC_VER) && _MSC_VER >= 1900) || \ __cplusplus >= 201300L || \ diff --git a/pstl/include/pstl/internal/utils.h b/pstl/include/pstl/internal/utils.h --- a/pstl/include/pstl/internal/utils.h +++ b/pstl/include/pstl/internal/utils.h @@ -63,14 +63,14 @@ } template <typename _F1, typename _F2> -typename std::result_of<_F1()>::type +typename std::invoke_result<_F1>::type __invoke_if_else(std::true_type, _F1 __f1, _F2) { return __f1(); } template <typename _F1, typename _F2> -typename std::result_of<_F2()>::type +typename std::invoke_result<_F2>::type __invoke_if_else(std::false_type, _F1, _F2 __f2) { return __f2();
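
For reference, a minimal sketch of how the new backend is exercised once the patch is applied, assuming PSTL is configured with -DPSTL_PARALLEL_BACKEND=omp (which defines _PSTL_PAR_BACKEND_OPENMP in __pstl_config_site and adds -fopenmp=libomp, per the CMake hunk above). The program only uses standard parallel algorithms; the build command is illustrative and not part of the patch.

// build sketch: clang++ -std=c++17 -fopenmp=libomp example.cpp
#include <algorithm>
#include <cstdio>
#include <execution>
#include <numeric>
#include <vector>

int main()
{
    std::vector<long long> v(1 << 20);
    std::iota(v.begin(), v.end(), 0);

    // std::for_each with the par policy dispatches into the new
    // __omp_backend::__parallel_for (a taskloop over partitioned chunks).
    std::for_each(std::execution::par, v.begin(), v.end(), [](long long& x) { x *= 2; });

    // std::reduce with the par policy goes through
    // __omp_backend::__parallel_transform_reduce (per-thread accumulators combined at the end).
    long long sum = std::reduce(std::execution::par, v.begin(), v.end(), 0LL);
    std::printf("sum = %lld\n", sum);
    return 0;
}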