diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -572,6 +572,8 @@
   add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
   add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX)
   add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F)
+  add_memcpy(memcpy_x86_64_opt_sw_prefetch_sse4 COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=nehalem REQUIRE SSE4_2)
+  add_memcpy(memcpy_x86_64_opt_sw_prefetch_avx COMPILE_OPTIONS -DLIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING -march=sandybridge REQUIRE AVX)
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -23,19 +23,24 @@
 // Memcpy
 template <size_t Size> struct Memcpy {
   static constexpr size_t SIZE = Size;
-  LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+  LIBC_INLINE static void block_offset(Ptr __restrict dst, CPtr __restrict src,
+                                       size_t offset) {
 #ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
-    return __builtin_memcpy_inline(dst, src, SIZE);
+    return __builtin_memcpy_inline(dst + offset, src + offset, SIZE);
 #else
     // The codegen may be suboptimal.
     for (size_t i = 0; i < Size; ++i)
-      dst[i] = src[i];
+      dst[i + offset] = src[i + offset];
 #endif
   }
 
+  LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+    block_offset(dst, src, 0);
+  }
+
   LIBC_INLINE static void tail(Ptr __restrict dst, CPtr __restrict src,
                                size_t count) {
-    block(dst + count - SIZE, src + count - SIZE);
+    block_offset(dst, src, count - SIZE);
   }
 
   LIBC_INLINE static void head_tail(Ptr __restrict dst, CPtr __restrict src,
@@ -44,16 +49,21 @@
     tail(dst, src, count);
   }
 
-  LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
-                                        size_t count) {
+  LIBC_INLINE static void loop_and_tail_offset(Ptr __restrict dst,
+                                               CPtr __restrict src,
+                                               size_t count, size_t offset) {
     static_assert(Size > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
     do {
-      block(dst + offset, src + offset);
+      block_offset(dst, src, offset);
      offset += SIZE;
    } while (offset < count - SIZE);
    tail(dst, src, count);
  }
+
+  LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+                                        size_t count) {
+    return loop_and_tail_offset(dst, src, count, 0);
+  }
 };
 
 ///////////////////////////////////////////////////////////////////////////////
diff --git a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
--- a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
@@ -8,6 +8,7 @@
 #ifndef LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
 #define LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCPY_IMPLEMENTATIONS_H
 
+#include "src/__support/macros/attributes.h"   // LIBC_INLINE_VAR
 #include "src/__support/macros/config.h"       // LIBC_INLINE
 #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY
 #include "src/string/memory_utils/op_builtin.h"
@@ -17,28 +18,53 @@
 #include <stddef.h> // size_t
 #include <stdint.h> // SIZE_MAX
 
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
+
+#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated, use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE instead.
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
 namespace __llvm_libc {
 
+namespace x86 {
+
+LIBC_INLINE_VAR constexpr size_t kOneCacheline = 64;
+LIBC_INLINE_VAR constexpr size_t kTwoCachelines = 2 * kOneCacheline;
+LIBC_INLINE_VAR constexpr size_t kThreeCachelines = 3 * kOneCacheline;
+
+LIBC_INLINE_VAR constexpr bool kUseSoftwarePrefetching =
+    LLVM_LIBC_IS_DEFINED(LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING);
+
+// Whether to use rep;movsb exclusively (0), not at all (SIZE_MAX), or only
+// above a certain threshold. Defaults to "do not use rep;movsb".
+#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
+#endif
+LIBC_INLINE_VAR constexpr size_t kRepMovsbThreshold =
+    LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+
+} // namespace x86
+
+// TODO: Move to a shared header when appropriate.
+[[maybe_unused]] LIBC_INLINE void prefetch_to_local_cache(const void *addr) {
+  __builtin_prefetch(addr, 0, 3);
+}
+
 [[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86_avx(Ptr __restrict dst, CPtr __restrict src, size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return builtin::Memcpy<1>::block(dst, src);
-  if (count == 2)
-    return builtin::Memcpy<2>::block(dst, src);
-  if (count == 3)
-    return builtin::Memcpy<3>::block(dst, src);
-  if (count == 4)
-    return builtin::Memcpy<4>::block(dst, src);
-  if (count < 8)
-    return builtin::Memcpy<4>::head_tail(dst, src, count);
-  if (count < 16)
-    return builtin::Memcpy<8>::head_tail(dst, src, count);
-  if (count < 32)
-    return builtin::Memcpy<16>::head_tail(dst, src, count);
-  if (count < 64)
-    return builtin::Memcpy<32>::head_tail(dst, src, count);
+inline_memcpy_x86_sse2_ge64(Ptr __restrict dst, CPtr __restrict src,
+                            size_t count) {
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64(Ptr __restrict dst, CPtr __restrict src,
+                           size_t count) {
   if (count < 128)
     return builtin::Memcpy<64>::head_tail(dst, src, count);
   if (count < 256)
@@ -48,9 +74,81 @@
   return builtin::Memcpy<64>::loop_and_tail(dst, src, count);
 }
 
-[[maybe_unused]] LIBC_INLINE void inline_memcpy_x86_no_avx(Ptr __restrict dst,
-                                                           CPtr __restrict src,
-                                                           size_t count) {
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_sse2_ge64_sw_prefetching(Ptr __restrict dst,
+                                           CPtr __restrict src, size_t count) {
+  using namespace __llvm_libc::x86;
+  prefetch_to_local_cache(src + kOneCacheline);
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  prefetch_to_local_cache(src + kTwoCachelines);
+  // Aligning 'dst' on a 32B boundary.
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  builtin::Memcpy<96>::block(dst, src);
+  size_t offset = 96;
+  // At this point:
+  // - we copied between 96B and 128B,
+  // - we prefetched cachelines at 'src + 64' and 'src + 128',
+  // - 'dst' is 32B aligned,
+  // - count >= 128.
+  if (count < 352) {
+    // Two cache lines at a time.
+    while (offset + kTwoCachelines + 32 <= count) {
+      prefetch_to_local_cache(src + offset + kOneCacheline);
+      prefetch_to_local_cache(src + offset + kTwoCachelines);
+      builtin::Memcpy<kTwoCachelines>::block_offset(dst, src, offset);
+      offset += kTwoCachelines;
+    }
+  } else {
+    // Three cache lines at a time.
+    while (offset + kThreeCachelines + 32 <= count) {
+      prefetch_to_local_cache(src + offset + kOneCacheline);
+      prefetch_to_local_cache(src + offset + kTwoCachelines);
+      prefetch_to_local_cache(src + offset + kThreeCachelines);
+      // It is likely that this copy will be turned into a 'rep;movsb' on
+      // non-AVX machines.
+      builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+      offset += kThreeCachelines;
+    }
+  }
+  return builtin::Memcpy<32>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_avx_ge64_sw_prefetching(Ptr __restrict dst,
+                                          CPtr __restrict src, size_t count) {
+  using namespace __llvm_libc::x86;
+  prefetch_to_local_cache(src + kOneCacheline);
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  prefetch_to_local_cache(src + kTwoCachelines);
+  prefetch_to_local_cache(src + kThreeCachelines);
+  if (count < 256)
+    return builtin::Memcpy<128>::head_tail(dst, src, count);
+  // Aligning 'dst' on a 32B boundary.
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  builtin::Memcpy<224>::block(dst, src);
+  size_t offset = 224;
+  // At this point:
+  // - we copied between 224B and 256B,
+  // - we prefetched cachelines at 'src + 64', 'src + 128', and 'src + 192',
+  // - 'dst' is 32B aligned,
+  // - count >= 256.
+  while (offset + kThreeCachelines + 64 <= count) {
+    // Three cache lines at a time.
+    prefetch_to_local_cache(src + offset + kOneCacheline);
+    prefetch_to_local_cache(src + offset + kTwoCachelines);
+    prefetch_to_local_cache(src + offset + kThreeCachelines);
+    builtin::Memcpy<kThreeCachelines>::block_offset(dst, src, offset);
+    offset += kThreeCachelines;
+  }
+  return builtin::Memcpy<64>::loop_and_tail_offset(dst, src, count, offset);
+}
+
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
   if (count == 0)
     return;
   if (count == 1)
@@ -69,46 +167,30 @@
     return builtin::Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
     return builtin::Memcpy<32>::head_tail(dst, src, count);
-  if (count < 128)
-    return builtin::Memcpy<64>::head_tail(dst, src, count);
-  builtin::Memcpy<32>::block(dst, src);
-  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
-  return builtin::Memcpy<32>::loop_and_tail(dst, src, count);
-}
-
-[[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
-  if constexpr (x86::kAvx)
-    return inline_memcpy_x86_avx(dst, src, count);
-  else
-    return inline_memcpy_x86_no_avx(dst, src, count);
+  if constexpr (x86::kAvx) {
+    if constexpr (x86::kUseSoftwarePrefetching) {
+      return inline_memcpy_x86_avx_ge64_sw_prefetching(dst, src, count);
+    } else {
+      return inline_memcpy_x86_avx_ge64(dst, src, count);
+    }
+  } else {
+    if constexpr (x86::kUseSoftwarePrefetching) {
+      return inline_memcpy_x86_sse2_ge64_sw_prefetching(dst, src, count);
+    } else {
+      return inline_memcpy_x86_sse2_ge64(dst, src, count);
+    }
+  }
 }
 
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
                                            CPtr __restrict src,
                                            size_t count) {
-  // Whether to use rep;movsb exclusively, not at all, or only above a certain
-  // threshold.
-#ifndef LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#define LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE SIZE_MAX
-#endif
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-#error LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB
-
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-#error LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE is deprecated use LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0 instead.
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
-  static constexpr size_t kRepMovsbThreshold =
-      LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-  if constexpr (kRepMovsbThreshold == 0) {
+  if constexpr (x86::kRepMovsbThreshold == 0) {
     return x86::Memcpy::repmovsb(dst, src, count);
-  } else if constexpr (kRepMovsbThreshold == SIZE_MAX) {
+  } else if constexpr (x86::kRepMovsbThreshold == SIZE_MAX) {
     return inline_memcpy_x86(dst, src, count);
   } else {
-    if (LIBC_UNLIKELY(count >= kRepMovsbThreshold))
+    if (LIBC_UNLIKELY(count >= x86::kRepMovsbThreshold))
       return x86::Memcpy::repmovsb(dst, src, count);
     else
       return inline_memcpy_x86(dst, src, count);
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -26,6 +26,11 @@
     "LIBC_COPT_PRINTF_DISABLE_WRITE_INT",
 ]
 
+MEMORY_COPTS = [
+    # "LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE=0",
+    # "LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING",
+]
+
 # A flag to pick which `mpfr` to use for math tests.
 # Usage: `--@llvm-project//libc:mpfr=<disabled|external|system>`.
 # Flag documentation: https://bazel.build/extending/config
@@ -445,10 +450,10 @@
         ":__support_cpp_limits",
         ":__support_cpp_optional",
         ":__support_ctype_utils",
+        ":__support_fputil_dyadic_float",
         ":__support_fputil_fenv_impl",
         ":__support_fputil_fp_bits",
         ":__support_fputil_rounding_mode",
-        ":__support_fputil_dyadic_float",
         ":__support_str_to_integer",
         ":__support_str_to_num_result",
         ":__support_uint128",
@@ -1144,8 +1149,8 @@
     hdrs = ["src/math/generic/log_range_reduction.h"],
     deps = [
         ":__support_common",
-        ":__support_uint128",
         ":__support_fputil_dyadic_float",
+        ":__support_uint128",
         ":common_constants",
     ],
 )
@@ -1999,6 +2004,7 @@
         "src/string/memory_utils/op_x86.h",
         "src/string/memory_utils/utils.h",
     ],
+    defines = MEMORY_COPTS,
     textual_hdrs = [
        "src/string/memory_utils/aarch64/memcmp_implementations.h",
        "src/string/memory_utils/aarch64/memcpy_implementations.h",
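
Reviewer note: the three software-prefetching loops added above share one pattern: issue read prefetches one to three cachelines ahead of the copy cursor, then copy whole cachelines so the next iteration's loads are already in flight when it starts. The standalone sketch below illustrates that pattern outside the libc tree, under stated assumptions: copy_block is a hypothetical stand-in for builtin::Memcpy<N>::block_offset, and the two-cacheline stride mirrors the SSE2 path for mid-sized copies. It is a minimal illustration, not the code added by this patch.

    #include <cstddef>

    constexpr std::size_t kOneCacheline = 64;
    constexpr std::size_t kTwoCachelines = 2 * kOneCacheline;

    // Hypothetical stand-in for builtin::Memcpy<N>::block_offset: copy a
    // fixed-size block located 'offset' bytes into both buffers.
    template <std::size_t N>
    inline void copy_block(char *dst, const char *src, std::size_t offset) {
      __builtin_memcpy(dst + offset, src + offset, N);
    }

    // Same semantics as prefetch_to_local_cache in the patch: a read
    // prefetch (second argument 0) with maximal temporal locality (third
    // argument 3), which typically lowers to 'prefetcht0' on x86-64.
    inline void prefetch_to_local_cache(const void *addr) {
      __builtin_prefetch(addr, 0, 3);
    }

    // Simplified main loop: prefetch up to two cachelines ahead of the
    // copy cursor, then copy two whole cachelines per iteration. Assumes
    // dst and src do not overlap.
    void copy_with_sw_prefetch(char *dst, const char *src,
                               std::size_t count) {
      std::size_t offset = 0;
      while (offset + kTwoCachelines <= count) {
        prefetch_to_local_cache(src + offset + kOneCacheline);
        prefetch_to_local_cache(src + offset + kTwoCachelines);
        copy_block<kTwoCachelines>(dst, src, offset);
        offset += kTwoCachelines;
      }
      // Copy whatever is left (less than two cachelines).
      if (offset < count)
        __builtin_memcpy(dst + offset, src + offset, count - offset);
    }

Whether the hints pay off depends on the microarchitecture and the copy size, which is presumably why the patch keeps them behind the opt-in LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHING define (and the commented-out MEMORY_COPTS entries in Bazel) rather than enabling them unconditionally.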