diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -177,6 +177,7 @@ algorithms/sort.bench.cpp algorithms/sort_heap.bench.cpp algorithms/stable_sort.bench.cpp + algorithms/transform_reduce.bench.cpp libcxxabi/dynamic_cast.bench.cpp allocation.bench.cpp deque.bench.cpp diff --git a/libcxx/benchmarks/algorithms/transform_reduce.bench.cpp b/libcxx/benchmarks/algorithms/transform_reduce.bench.cpp new file mode 100644 --- /dev/null +++ b/libcxx/benchmarks/algorithms/transform_reduce.bench.cpp @@ -0,0 +1,31 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +template +static void bm_transform_reduce(benchmark::State& state) { + auto size = state.range(); + std::vector a; + std::vector b; + for (size_t i = 0; i != size; ++i) { + a.emplace_back(i * 3); + b.emplace_back(i + 2); + } + + for (auto _ : state) { + benchmark::ClobberMemory(); + auto ret = std::transform_reduce(a.begin(), a.end(), b.begin(), T(1)); + benchmark::DoNotOptimize(ret); + } +} + +BENCHMARK(bm_transform_reduce)->DenseRange(1, 8)->Range(16, 1 << 20); + +BENCHMARK_MAIN(); diff --git a/libcxx/include/__config b/libcxx/include/__config --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1278,6 +1278,12 @@ # define _LIBCPP_WORKAROUND_OBJCXX_COMPILER_INTRINSICS # endif +#ifdef _LIBCPP_COMPILER_CLANG_BASED +# define _LIBCPP_CLANG_PRAGMA(...) _Pragma(__VA_ARGS__) +#else +# define _LIBCPP_CLANG_PRAGMA(...) +#endif + // TODO: Make this a proper configuration option #define _PSTL_PAR_BACKEND_SERIAL diff --git a/libcxx/include/__functional/operations.h b/libcxx/include/__functional/operations.h --- a/libcxx/include/__functional/operations.h +++ b/libcxx/include/__functional/operations.h @@ -107,6 +107,9 @@ }; _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(multiplies); +template +struct __is_trivial_multiplies_operation, _Tp, _Tp> : true_type {}; + #if _LIBCPP_STD_VER >= 14 template <> struct _LIBCPP_TEMPLATE_VIS multiplies @@ -119,6 +122,9 @@ { return _VSTD::forward<_T1>(__t) * _VSTD::forward<_T2>(__u); } typedef void is_transparent; }; + +template +struct __is_trivial_multiplies_operation, _Lhs, _Rhs> : true_type {}; #endif #if _LIBCPP_STD_VER >= 14 diff --git a/libcxx/include/__numeric/reduce.h b/libcxx/include/__numeric/reduce.h --- a/libcxx/include/__numeric/reduce.h +++ b/libcxx/include/__numeric/reduce.h @@ -24,6 +24,7 @@ template _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp reduce(_InputIterator __first, _InputIterator __last, _Tp __init, _BinaryOp __b) { + _LIBCPP_CLANG_PRAGMA("clang loop vectorize(enable)") for (; __first != __last; ++__first) __init = __b(__init, *__first); return __init; diff --git a/libcxx/include/__numeric/transform_reduce.h b/libcxx/include/__numeric/transform_reduce.h --- a/libcxx/include/__numeric/transform_reduce.h +++ b/libcxx/include/__numeric/transform_reduce.h @@ -10,8 +10,12 @@ #ifndef _LIBCPP___NUMERIC_TRANSFORM_REDUCE_H #define _LIBCPP___NUMERIC_TRANSFORM_REDUCE_H +#include <__concepts/arithmetic.h> #include <__config> #include <__functional/operations.h> +#include <__iterator/iterator_traits.h> +#include <__type_traits/is_same.h> +#include <__type_traits/operation_traits.h> #include <__utility/move.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -26,7 +30,7 @@ _InputIterator __last, _Tp __init, _BinaryOp __b, _UnaryOp __u) { for (; __first != __last; ++__first) - __init = __b(std::move(__init), __u(*__first)); + __init = __b(__init, __u(*__first)); return __init; } @@ -36,10 +40,29 @@ _InputIterator2 __first2, _Tp __init, _BinaryOp1 __b1, _BinaryOp2 __b2) { for (; __first1 != __last1; ++__first1, (void)++__first2) - __init = __b1(std::move(__init), __b2(*__first1, *__first2)); + __init = __b1(__init, __b2(*__first1, *__first2)); return __init; } +#if _LIBCPP_STD_VER >= 20 +template + requires(__is_trivial_operation<_TransformOp, _Tp, _Tp>::value && + __is_trivial_operation<_ReductionOp, _Tp, _Tp>::value && + is_same_v<__iter_value_type<_InputIterator1>, _Tp> && is_same_v<__iter_value_type<_InputIterator2>, _Tp>) +_LIBCPP_HIDE_FROM_ABI constexpr _Tp transform_reduce( + _InputIterator1 __first1, + _InputIterator1 __last1, + _InputIterator2 __first2, + _Tp __init, + _ReductionOp __b1, + _TransformOp __b2) { + _LIBCPP_CLANG_PRAGMA("clang loop vectorize(enable)") + for (; __first1 != __last1; ++__first1, (void)++__first2) + __init = __b1(__init, __b2(*__first1, *__first2)); + return __init; +} +# endif + template _LIBCPP_INLINE_VISIBILITY _LIBCPP_CONSTEXPR_SINCE_CXX20 _Tp transform_reduce(_InputIterator1 __first1, _InputIterator1 __last1, diff --git a/libcxx/include/__type_traits/operation_traits.h b/libcxx/include/__type_traits/operation_traits.h --- a/libcxx/include/__type_traits/operation_traits.h +++ b/libcxx/include/__type_traits/operation_traits.h @@ -21,6 +21,15 @@ template struct __is_trivial_plus_operation : false_type {}; +template +struct __is_trivial_multiplies_operation : false_type {}; + +template +using __is_trivial_operation = + integral_constant::value || + __is_trivial_multiplies_operation<_Pred, _Lhs, _Rhs>::value>; + _LIBCPP_END_NAMESPACE_STD #endif // _LIBCPP___TYPE_TRAITS_OPERATION_TRAITS_H diff --git a/libcxx/test/std/numerics/numeric.ops/transform.reduce/transform_reduce_iter_iter_iter_init.pass.cpp b/libcxx/test/std/numerics/numeric.ops/transform.reduce/transform_reduce_iter_iter_iter_init.pass.cpp --- a/libcxx/test/std/numerics/numeric.ops/transform.reduce/transform_reduce_iter_iter_iter_init.pass.cpp +++ b/libcxx/test/std/numerics/numeric.ops/transform.reduce/transform_reduce_iter_iter_iter_init.pass.cpp @@ -70,6 +70,13 @@ std::transform_reduce(std::begin(ia), std::end(ia), std::begin(ib), MoveOnly{0}).get()); } +TEST_CONSTEXPR_CXX20 void test_optimized_path() { + float a[] = {1.f, 2.f, 3.f, 4.f}; + float b[] = {1.f, 2.f, 3.f, 4.f}; + auto ret = std::transform_reduce(a, a + 4, b, 0); + assert(ret > 29.5f && ret < 30.5f); +} + TEST_CONSTEXPR_CXX20 bool test() { @@ -109,6 +116,7 @@ test< int*, unsigned int *>(); test_move_only_types(); + test_optimized_path(); return true; }