diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -161,6 +161,7 @@ algorithms/lower_bound.bench.cpp algorithms/make_heap.bench.cpp algorithms/make_heap_then_sort_heap.bench.cpp + algorithms/min.bench.cpp algorithms/min_max_element.bench.cpp algorithms/pop_heap.bench.cpp algorithms/push_heap.bench.cpp diff --git a/libcxx/benchmarks/algorithms/min.bench.cpp b/libcxx/benchmarks/algorithms/min.bench.cpp new file mode 100644 --- /dev/null +++ b/libcxx/benchmarks/algorithms/min.bench.cpp @@ -0,0 +1,70 @@ +#include +#include + +#include + +void run_sizes(auto benchmark) { + benchmark->Arg(1) + ->Arg(2) + ->Arg(3) + ->Arg(4) + ->Arg(5) + ->Arg(6) + ->Arg(7) + ->Arg(8) + ->Arg(9) + ->Arg(10) + ->Arg(11) + ->Arg(12) + ->Arg(13) + ->Arg(14) + ->Arg(15) + ->Arg(16) + ->Arg(17) + ->Arg(18) + ->Arg(19) + ->Arg(20) + ->Arg(21) + ->Arg(22) + ->Arg(23) + ->Arg(24) + ->Arg(25) + ->Arg(26) + ->Arg(27) + ->Arg(28) + ->Arg(29) + ->Arg(30) + ->Arg(31) + ->Arg(32) + ->Arg(64) + ->Arg(512) + ->Arg(1024) + ->Arg(4000) + ->Arg(4096) + ->Arg(5500) + ->Arg(64000) + ->Arg(65536) + ->Arg(70000); +} + +template +static void BM_std_min(benchmark::State& state) { + std::vector vec(state.range(), 3); + + for (auto _ : state) { + benchmark::DoNotOptimize(vec); + benchmark::DoNotOptimize(std::ranges::min(vec)); + } +} +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min<__int128>)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); +BENCHMARK(BM_std_min)->Apply(run_sizes); + +BENCHMARK_MAIN(); diff --git a/libcxx/include/__algorithm/ranges_min.h b/libcxx/include/__algorithm/ranges_min.h --- a/libcxx/include/__algorithm/ranges_min.h +++ b/libcxx/include/__algorithm/ranges_min.h @@ -9,7 +9,9 @@ #ifndef _LIBCPP___ALGORITHM_RANGES_MIN_H #define _LIBCPP___ALGORITHM_RANGES_MIN_H +#include <__algorithm/copy.h> #include <__algorithm/ranges_min_element.h> +#include <__algorithm/unwrap_range.h> #include <__assert> #include <__concepts/copyable.h> #include <__config> @@ -59,15 +61,21 @@ requires indirectly_copyable_storable, range_value_t<_Rp>*> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr range_value_t<_Rp> operator()(_Rp&& __r, _Comp __comp = {}, _Proj __proj = {}) const { - auto __first = ranges::begin(__r); - auto __last = ranges::end(__r); + if constexpr(is_integral_v> && is_same_v<_Comp, ranges::less> && is_same_v<_Proj, identity>) { + auto __unwrapped = std::__unwrap_range(ranges::begin(__r), ranges::end(__r)); + return __integral_impl(std::move(__unwrapped.first), std::move(__unwrapped.second), __comp, __proj); + } else + return __simple_loop_impl(ranges::begin(__r), ranges::end(__r), std::move(__comp), std::move(__proj)); + } +private: + template + static constexpr iter_value_t<_Iter> __simple_loop_impl(_Iter __first, _Sent __last, _Comp __comp, _Proj __proj) { _LIBCPP_ASSERT(__first != __last, "range must contain at least one element"); - - if constexpr (forward_range<_Rp>) { + if constexpr (forward_iterator<_Iter>) { return *ranges::__min_element_impl(__first, __last, __comp, __proj); } else { - range_value_t<_Rp> __result = *__first; + iter_value_t<_Iter> __result = *__first; while (++__first != __last) { if (std::invoke(__comp, std::invoke(__proj, *__first), std::invoke(__proj, __result))) __result = *__first; @@ -75,6 +83,36 @@ return __result; } } + + template + static constexpr iter_value_t<_Iter> __integral_impl(_Iter __first, _Sent __last, _Comp __comp, _Proj __proj) { + return __simple_loop_impl(std::move(__first), std::move(__last), std::move(__comp), std::move(__proj)); + } + + template + static constexpr _Tp __integral_impl(_Tp* __first, _Tp* __last, _Comp __comp, _Proj __proj) { + constexpr size_t __vector_size = _LIBCPP_NATIVE_SIMD_WIDTH / sizeof(_Tp); + _LIBCPP_ASSERT(__first != __last, "range must contain at least one element"); + + if (__last - __first < static_cast(3 * __vector_size)) // Avoid pessimitzing small ranges too much + return __simple_loop_impl(__first, __last, __comp, __proj); + + _Tp __mins[2 * __vector_size - 1]; + std::copy(__first, __first + __vector_size, __mins); + + size_t __loop_count = (__last - __first) & ~(__vector_size - 1); + + // Iterate over __vector_size elements at a time to motivate the compiler to generate a SIMD loop + for (size_t __i = __vector_size; __i != __loop_count; __i += __vector_size) { + for (size_t __j = 0; __j != __vector_size; ++__j) { + __mins[__j] = std::min(__first[__i + __j], __mins[__j]); + } + } + + size_t __tail_size = (__last - __first) & (__vector_size - 1); + std::copy(__last - __tail_size, __last, __mins + __vector_size); + return __simple_loop_impl(__mins, __mins + __vector_size + __tail_size, __comp, __proj); + } }; } // namespace __min diff --git a/libcxx/include/__config b/libcxx/include/__config --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1250,6 +1250,16 @@ # define _LIBCPP_CTAD_SUPPORTED_FOR_TYPE(_ClassName) static_assert(true, "") #endif +// TODO: What vector extensions exist on other platforms? +// TODO: What should be checked for AVX512? +# if defined(__AVX__) +# define _LIBCPP_NATIVE_SIMD_WIDTH 32 +# elif defined(__MMX__) +# define _LIBCPP_NATIVE_SIMD_WIDTH 16 +# else +# define _LIBCPP_NATIVE_SIMD_WIDTH 8 +# endif + #endif // __cplusplus #endif // _LIBCPP___CONFIG