Index: pstl/include/__pstl_config_site.in
===================================================================
--- pstl/include/__pstl_config_site.in
+++ pstl/include/__pstl_config_site.in
@@ -11,6 +11,7 @@
 #cmakedefine _PSTL_PAR_BACKEND_SERIAL
 #cmakedefine _PSTL_PAR_BACKEND_TBB
+#cmakedefine _PSTL_PAR_BACKEND_OMP
 #cmakedefine _PSTL_HIDE_FROM_ABI_PER_TU
 #endif // __PSTL_CONFIG_SITE
Index: pstl/include/pstl/internal/omp/parallel_for.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_for.h
@@ -0,0 +1,53 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _Index, class _Fp>
+void
+__parallel_for_body(_Index __first, _Index __last, _Fp __f)
+{
+    // initial partition of the iteration space into chunks
+    auto __policy = __omp_backend::__chunk_partitioner(__first, __last);
+
+    // To avoid over-subscription we use taskloop for the nested parallelism
+    _PSTL_PRAGMA(omp taskloop untied mergeable)
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(__policy, __first, __chunk, __f);
+    }
+}
+
+//------------------------------------------------------------------------
+// Notation:
+// Evaluation of brick f[i,j) for each subrange [i,j) of [first, last)
+//------------------------------------------------------------------------
+
+template <class _ExecutionPolicy, class _Index, class _Fp>
+void
+__parallel_for(_ExecutionPolicy&&, _Index __first, _Index __last, _Fp __f)
+{
+    if (__should_run_serial(__first, __last))
+    {
+        __f(__first, __last);
+        return;
+    }
+
+    if (omp_in_parallel())
+    {
+        // we don't create a nested parallel region in an existing parallel
+        // region: just create tasks
+        __pstl::__omp_backend::__parallel_for_body(__first, __last, __f);
+    }
+    else
+    {
+        // in any case (nested or non-nested) one parallel region is created and
+        // only one thread creates a set of tasks
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __pstl::__omp_backend::__parallel_for_body(__first, __last, __f);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/omp/parallel_for_each.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_for_each.h
@@ -0,0 +1,42 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _ForwardIterator, class _Fp>
+void
+__parallel_for_each_body(_ForwardIterator __first, _ForwardIterator __last, _Fp __f)
+{
+    using DifferenceType = typename ::std::iterator_traits<_ForwardIterator>::difference_type;
+    auto __size = ::std::distance(__first, __last);
+
+    _PSTL_PRAGMA(omp taskloop untied mergeable)
+    for (DifferenceType __index = 0; __index < __size; ++__index)
+    {
+        auto __iter = ::std::next(__first, __index);
+        __f(*__iter);
+    }
+}
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Fp>
+void
+__parallel_for_each(_ExecutionPolicy&&, _ForwardIterator __first, _ForwardIterator __last, _Fp __f)
+{
+    if (omp_in_parallel())
+    {
+        // we don't create a nested parallel region in an existing parallel
+        // region: just create tasks
+        __pstl::__omp_backend::__parallel_for_each_body(__first, __last, __f);
+    }
+    else
+    {
+        // in any case (nested or non-nested) one parallel region is created and
+        // only one thread creates a set of tasks
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __pstl::__omp_backend::__parallel_for_each_body(__first, __last, __f);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
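The two headers above rely on a pattern that recurs throughout this backend: open one parallel region, let a single thread act as the task producer, and spread the work with taskloop. As a point of reference for review, here is a minimal, self-contained sketch of that shape in plain OpenMP; the function name, the fill workload, and the fixed chunk size of 512 (mirroring __default_chunk_size defined later in parallel_backend_omp.h) are illustrative only and not part of the patch.

#include <algorithm>
#include <cstddef>
#include <vector>

// One parallel region; one thread creates a taskloop over fixed-size chunks.
void example_fill(std::vector<int>& data)
{
    const std::size_t chunk = 512;
    const std::size_t n_chunks = (data.size() + chunk - 1) / chunk;

#pragma omp parallel
#pragma omp single
    {
#pragma omp taskloop untied mergeable
        for (std::size_t c = 0; c < n_chunks; ++c)
        {
            const std::size_t first = c * chunk;
            const std::size_t last = std::min(first + chunk, data.size());
            for (std::size_t i = first; i < last; ++i)
                data[i] = static_cast<int>(i); // each chunk touches a disjoint slice
        }
    }
}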
Index: pstl/include/pstl/internal/omp/parallel_invoke.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_invoke.h
@@ -0,0 +1,35 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _F1, typename _F2>
+void
+__parallel_invoke_body(_F1&& __f1, _F2&& __f2)
+{
+    _PSTL_PRAGMA(omp taskgroup)
+    {
+        _PSTL_PRAGMA(omp task untied mergeable)
+        {
+            std::forward<_F1>(__f1)();
+        }
+        _PSTL_PRAGMA(omp task untied mergeable)
+        {
+            std::forward<_F2>(__f2)();
+        }
+    }
+}
+
+template <class _ExecutionPolicy, typename _F1, typename _F2>
+void
+__parallel_invoke(_ExecutionPolicy&&, _F1&& __f1, _F2&& __f2)
+{
+    if (omp_in_parallel())
+    {
+        __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2));
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        __parallel_invoke_body(std::forward<_F1>(__f1), std::forward<_F2>(__f2));
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
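__parallel_invoke_body is the backend's basic fork-join primitive: two sibling tasks inside a taskgroup, which waits for both children (and their descendants) before returning. A tiny stand-alone illustration of the same shape, with hypothetical names and trivial callables, assuming it is launched from a single producer thread just as the functions above do:

#include <cstdio>

template <class F1, class F2>
void invoke_pair(F1 f1, F2 f2)
{
    // The taskgroup waits for both tasks before control leaves this scope.
#pragma omp taskgroup
    {
#pragma omp task untied mergeable
        f1();
#pragma omp task untied mergeable
        f2();
    }
}

int main()
{
#pragma omp parallel
#pragma omp single
    invoke_pair([] { std::puts("left half"); }, [] { std::puts("right half"); });
}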
Index: pstl/include/pstl/internal/omp/parallel_merge.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_merge.h
@@ -0,0 +1,88 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _RandomAccessIterator3,
+          typename _Compare, typename _LeafMerge>
+void
+__parallel_merge_body(std::size_t __size_x, std::size_t __size_y, _RandomAccessIterator1 __xs,
+                      _RandomAccessIterator1 __xe, _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye,
+                      _RandomAccessIterator3 __zs, _Compare __comp, _LeafMerge __leaf_merge)
+{
+
+    if (__size_x + __size_y <= __omp_backend::__default_chunk_size)
+    {
+        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
+        return;
+    }
+
+    _RandomAccessIterator1 __xm;
+    _RandomAccessIterator2 __ym;
+
+    if (__size_x < __size_y)
+    {
+        __ym = __ys + (__size_y / 2);
+        __xm = ::std::upper_bound(__xs, __xe, *__ym, __comp);
+    }
+    else
+    {
+        __xm = __xs + (__size_x / 2);
+        __ym = ::std::lower_bound(__ys, __ye, *__xm, __comp);
+    }
+
+    auto __zm = __zs + std::distance(__xs, __xm) + std::distance(__ys, __ym);
+
+    _PSTL_PRAGMA(omp task untied mergeable default(none)
+                     firstprivate(__xs, __xm, __ys, __ym, __zs, __comp, __leaf_merge))
+    __parallel_merge_body(std::distance(__xs, __xm), std::distance(__ys, __ym), __xs, __xm, __ys, __ym, __zs, __comp,
+                          __leaf_merge);
+
+    _PSTL_PRAGMA(omp task untied mergeable default(none)
+                     firstprivate(__xm, __xe, __ym, __ye, __zm, __comp, __leaf_merge))
+    __parallel_merge_body(std::distance(__xm, __xe), std::distance(__ym, __ye), __xm, __xe, __ym, __ye, __zm, __comp,
+                          __leaf_merge);
+
+    _PSTL_PRAGMA(omp taskwait)
+}
+
+template <class _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2,
+          typename _RandomAccessIterator3, typename _Compare, typename _LeafMerge>
+void
+__parallel_merge(_ExecutionPolicy&& /*__exec*/, _RandomAccessIterator1 __xs, _RandomAccessIterator1 __xe,
+                 _RandomAccessIterator2 __ys, _RandomAccessIterator2 __ye, _RandomAccessIterator3 __zs,
+                 _Compare __comp, _LeafMerge __leaf_merge)
+
+{
+    std::size_t __size_x = std::distance(__xs, __xe);
+    std::size_t __size_y = std::distance(__ys, __ye);
+
+    // If the size is too small, don't bother parallelizing it.
+    if (__size_x + __size_y <= __default_chunk_size)
+    {
+        __leaf_merge(__xs, __xe, __ys, __ye, __zs, __comp);
+        return;
+    }
+
+    /*
+     * Run the merge in parallel by chunking it up. Use the smaller range (if any) as the iteration range, and the
+     * larger range as the search range.
+     */
+
+    if (omp_in_parallel())
+    {
+        __parallel_merge_body(__size_x, __size_y, __xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge);
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        {
+            _PSTL_PRAGMA(omp single)
+            __parallel_merge_body(__size_x, __size_y, __xs, __xe, __ys, __ye, __zs, __comp, __leaf_merge);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/omp/parallel_reduce.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_reduce.h
@@ -0,0 +1,67 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _RandomAccessIterator, class _Value, typename _RealBody, typename _Reduction>
+_Value
+__parallel_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity,
+                       _RealBody __real_body, _Reduction __reduce)
+{
+    if (__should_run_serial(__first, __last))
+    {
+        return __real_body(__first, __last, __identity);
+    }
+
+    auto __middle = __first + ((__last - __first) / 2);
+    _Value __v1(__identity), __v2(__identity);
+    __parallel_invoke_body(
+        [&]() { __v1 = __parallel_reduce_body(__first, __middle, __identity, __real_body, __reduce); },
+        [&]() { __v2 = __parallel_reduce_body(__middle, __last, __identity, __real_body, __reduce); });
+
+    return __reduce(__v1, __v2);
+}
+
+//------------------------------------------------------------------------
+// Notation:
+//      r(i,j,init) returns reduction of init with reduction over [i,j)
+//      c(x,y) combines values x and y that were the result of r
+//------------------------------------------------------------------------
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Value, typename _RealBody, typename _Reduction>
+_Value
+__parallel_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last, _Value __identity,
+                  _RealBody __real_body, _Reduction __reduction)
+{
+    if (__first == __last)
+        return __identity;
+
+    // Don't bother parallelizing the work if the size is too small.
+    if (__should_run_serial(__first, __last))
+    {
+        return __real_body(__first, __last, __identity);
+    }
+
+    // We don't create a nested parallel region in an existing parallel region:
+    // just create tasks.
+    if (omp_in_parallel())
+    {
+        return __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction);
+    }
+
+    // In any case (nested or non-nested) one parallel region is created and only
+    // one thread creates a set of tasks.
+    _Value __res = __identity;
+
+    _PSTL_PRAGMA(omp parallel)
+    _PSTL_PRAGMA(omp single)
+    {
+        __res = __pstl::__omp_backend::__parallel_reduce_body(__first, __last, __identity, __real_body, __reduction);
+    }
+
+    return __res;
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
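The co-ranging logic in __parallel_merge_body can be hard to see in the abstract: it bisects the larger input at its midpoint and binary-searches the other input, so everything routed to the left task is ordered before everything routed to the right task. A small worked example (serial, illustrative values only, exercising the "x is the larger range" branch):

#include <algorithm>
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> x = {1, 3, 5, 7, 9, 11, 13, 15}; // the larger, bisected range
    std::vector<int> y = {4, 8, 12};                   // the smaller, searched range

    auto xm = x.begin() + x.size() / 2;                  // points at 9
    auto ym = std::lower_bound(y.begin(), y.end(), *xm); // first y not less than 9, i.e. 12

    // The left task would merge 4 + 2 elements, the right task the remaining 4 + 1.
    std::printf("left: %td + %td, right: %td + %td\n", xm - x.begin(), ym - y.begin(), x.end() - xm, y.end() - ym);
}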
Index: pstl/include/pstl/internal/omp/parallel_scan.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_scan.h
@@ -0,0 +1,120 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _Index>
+_Index
+__split(_Index __m)
+{
+    _Index __k = 1;
+    while (2 * __k < __m)
+        __k *= 2;
+    return __k;
+}
+
+template <typename _Index, typename _Tp, typename _Rp, typename _Cp>
+void
+__upsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Rp __reduce, _Cp __combine)
+{
+    if (__m == 1)
+        __r[0] = __reduce(__i * __tilesize, __lastsize);
+    else
+    {
+        _Index __k = __split(__m);
+        __omp_backend::__parallel_invoke_body(
+            [=] { __omp_backend::__upsweep(__i, __k, __tilesize, __r, __tilesize, __reduce, __combine); },
+            [=] {
+                __omp_backend::__upsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize, __reduce, __combine);
+            });
+        if (__m == 2 * __k)
+            __r[__m - 1] = __combine(__r[__k - 1], __r[__m - 1]);
+    }
+}
+
+template <typename _Index, typename _Tp, typename _Cp, typename _Sp>
+void
+__downsweep(_Index __i, _Index __m, _Index __tilesize, _Tp* __r, _Index __lastsize, _Tp __initial, _Cp __combine,
+            _Sp __scan)
+{
+    if (__m == 1)
+        __scan(__i * __tilesize, __lastsize, __initial);
+    else
+    {
+        const _Index __k = __split(__m);
+        __omp_backend::__parallel_invoke_body(
+            [=] { __omp_backend::__downsweep(__i, __k, __tilesize, __r, __tilesize, __initial, __combine, __scan); },
+            // Assumes that __combine never throws.
+            // TODO: Consider adding a requirement for user functors to be constant.
+            [=, &__combine] {
+                __omp_backend::__downsweep(__i + __k, __m - __k, __tilesize, __r + __k, __lastsize,
+                                           __combine(__initial, __r[__k - 1]), __combine, __scan);
+            });
+    }
+}
+
+template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan_body(_Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan, _Ap __apex)
+{
+    _Index __p = omp_get_num_threads();
+    const _Index __slack = 4;
+    _Index __tilesize = (__n - 1) / (__slack * __p) + 1;
+    _Index __m = (__n - 1) / __tilesize;
+    __buffer<_Tp> __buf(__m + 1);
+    _Tp* __r = __buf.get();
+
+    __omp_backend::__upsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __reduce, __combine);
+
+    std::size_t __k = __m + 1;
+    _Tp __t = __r[__k - 1];
+    while ((__k &= __k - 1))
+    {
+        __t = __combine(__r[__k - 1], __t);
+    }
+
+    __apex(__combine(__initial, __t));
+    __omp_backend::__downsweep(_Index(0), _Index(__m + 1), __tilesize, __r, __n - __m * __tilesize, __initial,
+                               __combine, __scan);
+}
+
+template <class _ExecutionPolicy, typename _Index, typename _Tp, typename _Rp, typename _Cp, typename _Sp,
+          typename _Ap>
+void
+__parallel_strict_scan(_ExecutionPolicy&&, _Index __n, _Tp __initial, _Rp __reduce, _Cp __combine, _Sp __scan,
+                       _Ap __apex)
+{
+    if (__n <= __default_chunk_size)
+    {
+        _Tp __sum = __initial;
+        if (__n)
+        {
+            __sum = __combine(__sum, __reduce(_Index(0), __n));
+        }
+        __apex(__sum);
+        if (__n)
+        {
+            __scan(_Index(0), __n, __initial);
+        }
+        return;
+    }
+
+    if (omp_in_parallel())
+    {
+        __pstl::__omp_backend::__parallel_strict_scan_body<_ExecutionPolicy>(__n, __initial, __reduce, __combine,
+                                                                             __scan, __apex);
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __pstl::__omp_backend::__parallel_strict_scan_body<_ExecutionPolicy>(__n, __initial, __reduce, __combine,
+                                                                                 __scan, __apex);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
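For reference, the tile arithmetic in __parallel_strict_scan_body (which runs inside the parallel region, so omp_get_num_threads() reports the team size) works out as follows for a hypothetical input of 10000 elements on 8 threads; the numbers below only illustrate the formula and are not output of the patch:

#include <cstdio>

int main()
{
    long n = 10000, p = 8, slack = 4;
    long tilesize = (n - 1) / (slack * p) + 1; // 313 elements per tile
    long m = (n - 1) / tilesize;               // 31, so m + 1 == 32 tiles in the upsweep/downsweep tree
    long last = n - m * tilesize;              // 297 elements in the final, shorter tile
    std::printf("%ld tiles of %ld, last tile holds %ld\n", m + 1, tilesize, last);
}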
Index: pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_stable_partial_sort.h
@@ -0,0 +1,17 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
+void
+__parallel_stable_partial_sort(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
+                               _LeafSort __leaf_sort, std::size_t __nsort)
+{
+    // TODO: parallel partial sort needs to be implemented; fall back to a serial leaf sort for now.
+    __leaf_sort(__xs, __xe, __comp);
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/omp/parallel_stable_sort.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_stable_sort.h
@@ -0,0 +1,146 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+namespace __sort_details
+{
+template <typename Iterator1, typename Iterator2>
+struct __move_value
+{
+    void
+    operator()(Iterator1 __x, Iterator2 __z)
+    {
+        *__z = ::std::move(*__x);
+    }
+};
+
+template <typename Iterator1, typename Iterator2>
+Iterator2
+__parallel_move_range(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
+{
+    std::size_t __size = std::distance(__first1, __last1);
+
+    // Perform serial moving of small chunks
+    if (__size <= __default_chunk_size)
+    {
+        return ::std::move(__first1, __last1, __first2);
+    }
+
+    // Perform parallel moving of larger chunks
+    auto __policy = __omp_backend::__chunk_partitioner(__first1, __last1);
+
+    _PSTL_PRAGMA(omp taskloop)
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(__policy, __first1, __chunk, [&](auto __chunk_first, auto __chunk_last) {
+            auto __chunk_offset = ::std::distance(__first1, __chunk_first);
+            auto __output = __first2 + __chunk_offset;
+            ::std::move(__chunk_first, __chunk_last, __output);
+        });
+    }
+
+    return __first2 + __size;
+}
+
+template <typename Iterator1, typename Iterator2>
+struct __move_range
+{
+    Iterator2
+    operator()(Iterator1 __first1, Iterator1 __last1, Iterator2 __first2)
+    {
+        return __parallel_move_range(__first1, __last1, __first2);
+    }
+};
+} // namespace __sort_details
+
+template <typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
+void
+__parallel_stable_sort_body(_RandomAccessIterator __xs, _RandomAccessIterator __xe, _Compare __comp,
+                            _LeafSort __leaf_sort)
+{
+    using _ValueType = typename std::iterator_traits<_RandomAccessIterator>::value_type;
+    using _VecType = typename std::vector<_ValueType>;
+    using _OutputIterator = typename _VecType::iterator;
+    using _MoveValueType = typename __omp_backend::__sort_details::__move_value<_RandomAccessIterator, _OutputIterator>;
+    using _MoveRangeType = __omp_backend::__sort_details::__move_range<_RandomAccessIterator, _OutputIterator>;
+
+    if (__should_run_serial(__xs, __xe))
+    {
+        __leaf_sort(__xs, __xe, __comp);
+    }
+    else
+    {
+        std::size_t __size = std::distance(__xs, __xe);
+        auto __mid = __xs + (__size / 2);
+        __parallel_invoke_body([&]() { __parallel_stable_sort_body(__xs, __mid, __comp, __leaf_sort); },
+                               [&]() { __parallel_stable_sort_body(__mid, __xe, __comp, __leaf_sort); });
+
+        // Perform a parallel merge of the sorted ranges into __output.
+        _VecType __output(__size);
+        _MoveValueType __move_value;
+        _MoveRangeType __move_range;
+        __utils::__serial_move_merge __merge(__size);
+        __parallel_merge_body(
+            std::distance(__xs, __mid), std::distance(__mid, __xe), __xs, __mid, __mid, __xe, __output.begin(), __comp,
+            [&__merge, &__move_value, &__move_range](_RandomAccessIterator __as, _RandomAccessIterator __ae,
+                                                     _RandomAccessIterator __bs, _RandomAccessIterator __be,
+                                                     _OutputIterator __cs, _Compare __comp) {
+                __merge(__as, __ae, __bs, __be, __cs, __comp, __move_value, __move_value, __move_range, __move_range);
+            });
+
+        // Move the values from __output back into the original source range.
+        __sort_details::__parallel_move_range(__output.begin(), __output.end(), __xs);
+    }
+}
+
+template <class _ExecutionPolicy, typename _RandomAccessIterator, typename _Compare, typename _LeafSort>
+void
+__parallel_stable_sort(_ExecutionPolicy&& __exec, _RandomAccessIterator __xs, _RandomAccessIterator __xe,
+                       _Compare __comp, _LeafSort __leaf_sort, std::size_t __nsort = 0)
+{
+    if (__xs >= __xe)
+    {
+        return;
+    }
+
+    auto __count = static_cast<std::size_t>(std::distance(__xs, __xe));
+    if (__count <= __default_chunk_size || __nsort < __count)
+    {
+        __leaf_sort(__xs, __xe, __comp);
+        return;
+    }
+
+    // TODO: the partial sort implementation should be shared with the other backends.
+
+    if (omp_in_parallel())
+    {
+        if (__count <= __nsort)
+        {
+            __parallel_stable_sort_body(__xs, __xe, __comp, __leaf_sort);
+        }
+        else
+        {
+            __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort);
+        }
+    }
+    else
+    {
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        if (__count <= __nsort)
+        {
+            __parallel_stable_sort_body(__xs, __xe, __comp, __leaf_sort);
+        }
+        else
+        {
+            __parallel_stable_partial_sort(__xs, __xe, __comp, __leaf_sort, __nsort);
+        }
+    }
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
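__parallel_stable_sort_body follows the classic top-down merge sort shape: sort the two halves as sibling tasks, merge them into a temporary buffer, then move the buffer back over the input. A serial sketch of that shape, with plain standard library calls standing in for the taskified bricks (names are illustrative, not part of the patch):

#include <algorithm>
#include <vector>

void sort_like_backend(std::vector<int>& v)
{
    auto mid = v.begin() + v.size() / 2;
    std::stable_sort(v.begin(), mid); // left child task in the real code
    std::stable_sort(mid, v.end());   // right child task in the real code

    std::vector<int> output(v.size());
    std::merge(v.begin(), mid, mid, v.end(), output.begin()); // __parallel_merge_body + __serial_move_merge
    std::move(output.begin(), output.end(), v.begin());       // __parallel_move_range back into the source
}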
Index: pstl/include/pstl/internal/omp/parallel_transform_reduce.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_transform_reduce.h
@@ -0,0 +1,102 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+//------------------------------------------------------------------------
+// parallel_transform_reduce
+//
+// Notation:
+//      r(i,j,init) returns reduction of init with reduction over [i,j)
+//      u(i) returns f(i,i+1,identity) for a hypothetical left identity element of r
+//      c(x,y) combines values x and y that were the result of r or u
+//------------------------------------------------------------------------
+
+template <typename _RandomAccessIterator, typename _UnaryOp, typename _Value, typename _Combiner, typename _Reduction>
+auto
+__transform_reduce_body(_RandomAccessIterator __first, _RandomAccessIterator __last, _UnaryOp __unary_op,
+                        _Value __init, _Combiner __combiner, _Reduction __reduction)
+{
+    using _Size = std::size_t;
+    const _Size __num_threads = omp_get_num_threads();
+    const _Size __size = __last - __first;
+
+    // Initial partition of the iteration space into chunks. If the range is too small,
+    // this will result in a nonsense policy, so we check on the size as well below.
+    auto __policy = __omp_backend::__chunk_partitioner(__first + __num_threads, __last);
+
+    if (__size <= __num_threads || __policy.__n_chunks < 2)
+    {
+        return __reduction(__first, __last, __init);
+    }
+
+    // Here we cannot use an OpenMP UDR because we must store the init value in
+    // the combiner and it will be used several times. Although there should be
+    // only one, we manually generate the identity elements for each thread.
+    ::std::vector<_Value> __accums;
+    __accums.reserve(__num_threads);
+
+    // initialize accumulators for all threads
+    for (_Size __i = 0; __i < __num_threads; ++__i)
+    {
+        __accums.emplace_back(__unary_op(__first + __i));
+    }
+
+    // main loop
+    _PSTL_PRAGMA(omp taskloop shared(__accums))
+    for (std::size_t __chunk = 0; __chunk < __policy.__n_chunks; ++__chunk)
+    {
+        __omp_backend::__process_chunk(
+            __policy, __first + __num_threads, __chunk, [&](auto __chunk_first, auto __chunk_last) {
+                auto __thread_num = omp_get_thread_num();
+                __accums[__thread_num] = __reduction(__chunk_first, __chunk_last, __accums[__thread_num]);
+            });
+    }
+
+    // combine by accumulators
+    for (_Size __i = 0; __i < __num_threads; ++__i)
+    {
+        __init = __combiner(__init, __accums[__i]);
+    }
+
+    return __init;
+}
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _UnaryOp, class _Value, class _Combiner,
+          class _Reduction>
+_Value
+__parallel_transform_reduce(_ExecutionPolicy&&, _RandomAccessIterator __first, _RandomAccessIterator __last,
+                            _UnaryOp __unary_op, _Value __init, _Combiner __combiner, _Reduction __reduction)
+{
+    if (__first == __last)
+    {
+        return __init;
+    }
+
+    _Value __result = __init;
+    if (omp_in_parallel())
+    {
+        // We don't create a nested parallel region in an existing parallel
+        // region: just create tasks.
+        __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
+                                                                  __reduction);
+    }
+    else
+    {
+        // Create a parallel region, and a single thread will create tasks
+        // for the region.
+        _PSTL_PRAGMA(omp parallel)
+        _PSTL_PRAGMA(omp single)
+        {
+            __result = __pstl::__omp_backend::__transform_reduce_body(__first, __last, __unary_op, __init, __combiner,
+                                                                      __reduction);
+        }
+    }
+
+    return __result;
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
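The accumulator scheme above avoids an OpenMP user-defined reduction: one partial result per thread is seeded from the first __num_threads elements, the rest of the range is consumed by taskloop chunks that fold into the slot of whichever thread runs them, and the partials are combined serially at the end. A simplified sketch with long and operator+ standing in for the generic functors, assuming the input is comfortably larger than the thread count (names and the small-input cutoff are illustrative):

#include <cstddef>
#include <vector>
#include <omp.h>

long sum_like_backend(const std::vector<long>& v)
{
    if (v.size() < 64) // small inputs: just sum serially
    {
        long s = 0;
        for (long x : v)
            s += x;
        return s;
    }

    long result = 0;
#pragma omp parallel
#pragma omp single
    {
        const std::size_t p = omp_get_num_threads();
        // One partial sum per thread, seeded from the first p elements.
        std::vector<long> partial(v.begin(), v.begin() + p);

#pragma omp taskloop shared(partial)
        for (std::size_t i = p; i < v.size(); ++i)
            partial[omp_get_thread_num()] += v[i]; // each thread only touches its own slot

        for (long x : partial) // taskloop has waited for all chunks by this point
            result += x;
    }
    return result;
}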
Index: pstl/include/pstl/internal/omp/parallel_transform_scan.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/omp/parallel_transform_scan.h
@@ -0,0 +1,16 @@
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+template <class _ExecutionPolicy, typename _Index, typename _Up, typename _Tp, typename _Cp, typename _Rp,
+          typename _Sp>
+_Tp
+__parallel_transform_scan(_ExecutionPolicy&&, _Index __n, _Up __u, _Tp __init, _Cp __combine, _Rp __brick_reduce,
+                          _Sp __scan)
+{
+    // No parallel implementation yet: run the scan brick serially over the whole range.
+    return __scan(_Index(0), __n, __init);
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
Index: pstl/include/pstl/internal/parallel_backend.h
===================================================================
--- pstl/include/pstl/internal/parallel_backend.h
+++ pstl/include/pstl/internal/parallel_backend.h
@@ -24,6 +24,12 @@
 {
 namespace __par_backend = __tbb_backend;
 }
+#elif defined(_PSTL_PAR_BACKEND_OMP)
+#    include "parallel_backend_omp.h"
+namespace __pstl
+{
+namespace __par_backend = __omp_backend;
+}
 #else
 _PSTL_PRAGMA_MESSAGE("Parallel backend was not specified");
 #endif
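With the dispatch above in place, selecting the OpenMP backend at configure time (so that _PSTL_PAR_BACKEND_OMP lands in the generated __pstl_config_site) and compiling with OpenMP enabled should route the standard parallel policies through __pstl::__omp_backend. A hypothetical smoke test, not part of the patch:

#include <execution>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    std::vector<double> v(1 << 20, 0.5);
    double s = std::transform_reduce(std::execution::par, v.begin(), v.end(), 0.0, std::plus<>{},
                                     [](double x) { return x * x; });
    return s > 0.0 ? 0 : 1;
}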
Index: pstl/include/pstl/internal/parallel_backend_omp.h
===================================================================
--- /dev/null
+++ pstl/include/pstl/internal/parallel_backend_omp.h
@@ -0,0 +1,220 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) Christopher Nelson
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _PSTL_PARALLEL_BACKEND_OMP_H
+#define _PSTL_PARALLEL_BACKEND_OMP_H
+
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
+#include <omp.h>
+
+#include "parallel_backend_utils.h"
+#include "unseq_backend_simd.h"
+#include "utils.h"
+
+#if !defined(_OPENMP)
+#    error _OPENMP not defined.
+#endif
+
+// Portability "#pragma" definition
+#ifdef _MSC_VER
+#    define _PSTL_PRAGMA(x) __pragma(x)
+#else
+#    define _PSTL_PRAGMA(x) _Pragma(# x)
+#endif
+
+_PSTL_HIDE_FROM_ABI_PUSH
+
+namespace __pstl
+{
+namespace __omp_backend
+{
+
+//------------------------------------------------------------------------
+// use to cancel execution
+//------------------------------------------------------------------------
+inline void
+__cancel_execution()
+{
+}
+
+//------------------------------------------------------------------------
+// raw buffer
+//------------------------------------------------------------------------
+
+template <typename _Tp>
+class __buffer
+{
+    std::allocator<_Tp> __allocator_;
+    _Tp* __ptr_;
+    const std::size_t __buf_size_;
+    __buffer(const __buffer&) = delete;
+    void
+    operator=(const __buffer&) = delete;
+
+  public:
+    __buffer(std::size_t __n) : __allocator_(), __ptr_(__allocator_.allocate(__n)), __buf_size_(__n) {}
+
+    operator bool() const { return __ptr_ != nullptr; }
+
+    _Tp*
+    get() const
+    {
+        return __ptr_;
+    }
+    ~__buffer() { __allocator_.deallocate(__ptr_, __buf_size_); }
+};
+
+// Preliminary size of each chunk: requires further discussion
+constexpr std::size_t __default_chunk_size = 512;
+
+// Convenience function to determine when we should run serial.
+template <typename _Iterator, std::enable_if_t<!std::is_integral<_Iterator>::value, bool> = true>
+constexpr auto
+__should_run_serial(_Iterator __first, _Iterator __last) -> bool
+{
+    using _difference_type = typename ::std::iterator_traits<_Iterator>::difference_type;
+    auto __size = ::std::distance(__first, __last);
+    return __size <= static_cast<_difference_type>(__default_chunk_size);
+}
+
+template <typename _Index, std::enable_if_t<std::is_integral<_Index>::value, bool> = true>
+constexpr auto
+__should_run_serial(_Index __first, _Index __last) -> bool
+{
+    using _difference_type = ::std::intptr_t;
+    auto __size = __last - __first;
+    return __size <= static_cast<_difference_type>(__default_chunk_size);
+}
+
+struct __chunk_policy
+{
+    std::size_t __n_chunks{0}, __chunk_size{0}, __first_chunk_size{0};
+};
+
+// The iteration space partitioner according to __requested_chunk_size
+template <class _RandomAccessIterator, class _Size = std::size_t>
+auto
+__chunk_partitioner(_RandomAccessIterator __first, _RandomAccessIterator __last,
+                    _Size __requested_chunk_size = __default_chunk_size) -> __chunk_policy
+{
+    /*
+     * This algorithm improves distribution of elements in chunks by avoiding
+     * small tail chunks. The leftover elements that do not fit neatly into
+     * the chunk size are redistributed to early chunks. This improves
+     * utilization of the processor's prefetch and reduces the number of
+     * tasks needed by 1.
+     */
+
+    const _Size __n = __last - __first;
+    _Size __n_chunks = 0;
+    _Size __chunk_size = 0;
+    _Size __first_chunk_size = 0;
+    if (__n < __requested_chunk_size)
+    {
+        __chunk_size = __n;
+        __first_chunk_size = __n;
+        __n_chunks = 1;
+        return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+    }
+
+    __n_chunks = (__n / __requested_chunk_size) + 1;
+    __chunk_size = __n / __n_chunks;
+    __first_chunk_size = __chunk_size;
+    const _Size __n_leftover_items = __n - (__n_chunks * __chunk_size);
+
+    if (__n_leftover_items == __chunk_size)
+    {
+        __n_chunks += 1;
+        return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+    }
+    else if (__n_leftover_items == 0)
+    {
+        __first_chunk_size = __chunk_size;
+        return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+    }
+
+    const _Size __n_extra_items_per_chunk = __n_leftover_items / __n_chunks;
+    const _Size __n_final_leftover_items = __n_leftover_items - (__n_extra_items_per_chunk * __n_chunks);
+
+    __chunk_size += __n_extra_items_per_chunk;
+    __first_chunk_size = __chunk_size + __n_final_leftover_items;
+
+    return __chunk_policy{__n_chunks, __chunk_size, __first_chunk_size};
+}
+
+template <typename _Iterator, typename _Index, typename _F>
+void
+__process_chunk(const __chunk_policy& __policy, _Iterator __base, _Index __chunk_index, _F __f)
+{
+    auto __this_chunk_size = __chunk_index == 0 ? __policy.__first_chunk_size : __policy.__chunk_size;
+    auto __index = __chunk_index == 0 ? 0
+                                      : (__chunk_index * __policy.__chunk_size) +
+                                            (__policy.__first_chunk_size - __policy.__chunk_size);
+    auto __first = __base + __index;
+    auto __last = __first + __this_chunk_size;
+    __f(__first, __last);
+}
+
+} // namespace __omp_backend
+} // namespace __pstl
+
+//------------------------------------------------------------------------
+// parallel_invoke
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_invoke.h"
+
+//------------------------------------------------------------------------
+// parallel_for
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_for.h"
+
+//------------------------------------------------------------------------
+// parallel_for_each
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_for_each.h"
+
+//------------------------------------------------------------------------
+// parallel_reduce
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_reduce.h"
+#include "./omp/parallel_transform_reduce.h"
+
+//------------------------------------------------------------------------
+// parallel_scan
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_scan.h"
+#include "./omp/parallel_transform_scan.h"
+
+//------------------------------------------------------------------------
+// parallel_stable_sort
+//------------------------------------------------------------------------
+
+#include "./omp/parallel_stable_partial_sort.h"
+#include "./omp/parallel_stable_sort.h"
+
+//------------------------------------------------------------------------
+// parallel_merge
+//------------------------------------------------------------------------
+#include "./omp/parallel_merge.h"
+
+#endif //_PSTL_PARALLEL_BACKEND_OMP_H
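A worked example of the arithmetic in __chunk_partitioner for a range of 10007 elements with the default requested chunk size of 512: the leftover 7 elements are folded into the first chunk rather than producing a small tail chunk (the figures below only illustrate the formula, they are not output produced by the patch):

#include <cstddef>
#include <cstdio>

int main()
{
    std::size_t n = 10007, requested = 512;
    std::size_t n_chunks = n / requested + 1;                   // 20
    std::size_t chunk_size = n / n_chunks;                      // 500
    std::size_t leftover = n - n_chunks * chunk_size;           // 7 (neither 0 nor a full chunk)
    std::size_t extra = leftover / n_chunks;                    // 0 extra elements per chunk
    std::size_t final_leftover = leftover - extra * n_chunks;   // 7
    chunk_size += extra;                                        // still 500
    std::size_t first_chunk_size = chunk_size + final_leftover; // 507; 507 + 19 * 500 == 10007
    std::printf("%zu chunks: first %zu, others %zu\n", n_chunks, first_chunk_size, chunk_size);
}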
Index: pstl/include/pstl/internal/pstl_config.h
===================================================================
--- pstl/include/pstl/internal/pstl_config.h
+++ pstl/include/pstl/internal/pstl_config.h
@@ -18,7 +18,7 @@
 #define _PSTL_VERSION_MINOR ((_PSTL_VERSION % 1000) / 10)
 #define _PSTL_VERSION_PATCH (_PSTL_VERSION % 10)
 
-#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB)
+#if !defined(_PSTL_PAR_BACKEND_SERIAL) && !defined(_PSTL_PAR_BACKEND_TBB) && !defined(_PSTL_PAR_BACKEND_OMP)
 #    error "A parallel backend must be specified"
 #endif
 
@@ -98,7 +98,9 @@
 #endif
 
 // Should be defined to 1 for environments with a vendor implementation of C++17 execution policies
-#define _PSTL_CPP17_EXECUTION_POLICIES_PRESENT (_MSC_VER >= 1912)
+#define _PSTL_CPP17_EXECUTION_POLICIES_PRESENT                                                                        \
+    (_MSC_VER >= 1912 && _MSVC_LANG >= 201703L) ||                                                                    \
+        (_GLIBCXX_RELEASE >= 9 && __GLIBCXX__ >= 20190503 && __cplusplus >= 201703L)
 
 #define _PSTL_CPP14_2RANGE_MISMATCH_EQUAL_PRESENT                                                                     \
     (_MSC_VER >= 1900 || __cplusplus >= 201300L || __cpp_lib_robust_nonmodifying_seq_ops == 201304)
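Finally, a hypothetical compile-time sanity check for a build configured with the new backend (not part of the patch; the include path is an assumption about how the internal header is reached in a given build tree): exactly one _PSTL_PAR_BACKEND_* macro should be defined.

#include "pstl/internal/pstl_config.h"

#if defined(_PSTL_PAR_BACKEND_OMP) && !defined(_PSTL_PAR_BACKEND_TBB) && !defined(_PSTL_PAR_BACKEND_SERIAL)
// OpenMP backend selected; the backend check in pstl_config.h passes.
#else
#    error "expected _PSTL_PAR_BACKEND_OMP to be the only backend macro defined"
#endif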