diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -19,6 +19,7 @@
     libc.src.__support.CPP.bit
     libc.src.__support.CPP.cstddef
     libc.src.__support.CPP.type_traits
+    libc.src.__support.macros.properties.compiler
     libc.src.__support.macros.config
     libc.src.__support.macros.optimization
 )
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -44,14 +44,20 @@
     tail(dst, src, count);
   }
 
-  LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
-                                        size_t count) {
+  // Calls 'block' at 'offset', 'offset + SIZE', ... for as long as
+  // 'offset < count - SIZE'; the remaining bytes are left to the caller.
+  // The subtraction is unsigned, so 'count >= SIZE' is required.
+  LIBC_INLINE static void loop(Ptr __restrict dst, CPtr __restrict src,
+                               size_t count, size_t offset = 0) {
     static_assert(Size > 1, "a loop of size 1 does not need tail");
-    size_t offset = 0;
-    do {
+    for (; offset < (count - SIZE); offset += SIZE)
       block(dst + offset, src + offset);
-      offset += SIZE;
-    } while (offset < count - SIZE);
+  }
+
+  // Runs 'loop' from offset 0, then 'tail'.
+  LIBC_INLINE static void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+                                        size_t count) {
+    loop(dst, src, count);
     tail(dst, src, count);
   }
 };
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -17,6 +17,7 @@
 #if defined(LIBC_TARGET_ARCH_IS_X86_64)
 
 #include "src/__support/common.h"
+#include "src/__support/macros/properties/compiler.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/op_generic.h"
 
@@ -46,14 +47,56 @@
 static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
 static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
 
+// Distance in bytes between two consecutive prefetched addresses.
+static constexpr size_t kCachelineSize = 64;
+
+// Emits a prefetch hint for the memory located at 'src'.
+LIBC_INLINE void read_prefetch_local(CPtr __restrict src) {
+#if defined(LIBC_COMPILER_IS_CLANG) || defined(LIBC_COMPILER_IS_GCC)
+  // This call has the same semantic between GCC and Clang.
+  // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
+  // https://clang.llvm.org/docs/LanguageExtensions.html#builtin-prefetch
+  // It's a read prefetch with maximum locality.
+  __builtin_prefetch(src);
+#else
+#error "Unknown compiler, can't generate prefetch instruction"
+#endif
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Memcpy repmovsb implementation
-struct Memcpy {
-  static void repmovsb(void *dst, const void *src, size_t count) {
-    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+
+// Memcpy building block that issues prefetch hints on 'src' before delegating
+// the copy to 'builtin::Memcpy'.
+template <size_t Size> struct MemcpyPrefetch {
+  static_assert(Size % kCachelineSize == 0,
+                "Size must be a multiple of cache lines");
+  static constexpr size_t kCacheLines = Size / kCachelineSize;
+
+  LIBC_INLINE static void block(Ptr __restrict dst, CPtr __restrict src) {
+    for (size_t i = 1; i < kCacheLines + 1; ++i)
+      read_prefetch_local(src + (kCachelineSize * i));
+    builtin::Memcpy<Size>::block(dst, src);
+  }
+
+  // Calls 'block' every 'Size' bytes while 'offset < count - Size', then hands
+  // over to 'builtin::Memcpy<32>::loop' and 'builtin::Memcpy<32>::tail'.
+  // The subtraction is unsigned, so 'count >= Size' is required.
+  LIBC_INLINE static void loop(Ptr __restrict dst, CPtr __restrict src,
+                               size_t count, size_t offset = 0) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    for (; offset < (count - Size); offset += Size)
+      block(dst + offset, src + offset);
+    builtin::Memcpy<32>::loop(dst, src, count, offset);
+    builtin::Memcpy<32>::tail(dst, src, count);
   }
 };
 
+// Copies 'count' bytes from 'src' to 'dst' with the 'rep movsb' instruction.
+LIBC_INLINE void repmovsb(void *dst, const void *src, size_t count) {
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
 ///////////////////////////////////////////////////////////////////////////////
 // Bcmp
 
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -12,7 +12,7 @@
 #include "src/__support/CPP/bit.h"
 #include "src/__support/CPP/cstddef.h"
 #include "src/__support/CPP/type_traits.h"
-#include "src/__support/macros/attributes.h" //LIBC_INLINE
+#include "src/__support/macros/attributes.h" // LIBC_INLINE
 #include "src/__support/macros/config.h"     // LIBC_HAS_BUILTIN
 
 #include <stddef.h> // size_t
diff --git a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
--- a/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/x86_64/memcpy_implementations.h
@@ -19,7 +19,8 @@
 namespace __llvm_libc {
 
 [[maybe_unused]] LIBC_INLINE void
-inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+inline_memcpy_x86_hw_prefetch(Ptr __restrict dst, CPtr __restrict src,
+                              size_t count) {
   if (count == 0)
     return;
   if (count == 1)
@@ -48,6 +49,61 @@
   return builtin::Memcpy<kBlockSize>::loop_and_tail(dst, src, count);
 }
 
+// memcpy implementation issuing prefetch hints on 'src' once 'count' is at
+// least 64 bytes.
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86_sw_prefetch(Ptr __restrict dst, CPtr __restrict src,
+                              size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return builtin::Memcpy<1>::block(dst, src);
+  if (count == 2)
+    return builtin::Memcpy<2>::block(dst, src);
+  if (count == 3)
+    return builtin::Memcpy<3>::block(dst, src);
+  if (count == 4)
+    return builtin::Memcpy<4>::block(dst, src);
+  if (count < 8)
+    return builtin::Memcpy<4>::head_tail(dst, src, count);
+  if (count < 16)
+    return builtin::Memcpy<8>::head_tail(dst, src, count);
+  if (count < 32)
+    return builtin::Memcpy<16>::head_tail(dst, src, count);
+  if (count < 64)
+    return builtin::Memcpy<32>::head_tail(dst, src, count);
+  x86::read_prefetch_local(src + x86::kCachelineSize);
+  if (count < 128)
+    return builtin::Memcpy<64>::head_tail(dst, src, count);
+  x86::read_prefetch_local(src + (2 * x86::kCachelineSize));
+  // Align to 32B
+  builtin::Memcpy<32>::block(dst, src);
+  align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+  // Invariant count >= 96.
+  if (count >= 128) {
+    // 'MemcpyPrefetch::loop' finishes with the 32B loop and tail itself, so
+    // return instead of falling through and copying the whole buffer again.
+    if (count < 352)
+      return x86::MemcpyPrefetch<128>::loop(dst, src, count);
+    return x86::MemcpyPrefetch<192>::loop(dst, src, count);
+  }
+  if (count >= 32)
+    builtin::Memcpy<32>::loop(dst, src, count);
+  builtin::Memcpy<32>::tail(dst, src, count);
+}
+
+// Selects 'inline_memcpy_x86_sw_prefetch' when
+// LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHER is defined and
+// 'inline_memcpy_x86_hw_prefetch' otherwise.
+[[maybe_unused]] LIBC_INLINE void
+inline_memcpy_x86(Ptr __restrict dst, CPtr __restrict src, size_t count) {
+#ifdef LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHER
+  return inline_memcpy_x86_sw_prefetch(dst, src, count);
+#else
+  return inline_memcpy_x86_hw_prefetch(dst, src, count);
+#endif // LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHER
+}
+
 [[maybe_unused]] LIBC_INLINE void
 inline_memcpy_x86_maybe_interpose_repmovsb(Ptr __restrict dst,
                                            CPtr __restrict src, size_t count) {
@@ -68,12 +124,12 @@
   static constexpr size_t kRepMovsbThreshold =
       LIBC_COPT_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
   if constexpr (kRepMovsbThreshold == 0) {
-    return x86::Memcpy::repmovsb(dst, src, count);
+    return x86::repmovsb(dst, src, count);
   } else if constexpr (kRepMovsbThreshold == size_t(-1)) {
     return inline_memcpy_x86(dst, src, count);
   } else {
     if (LIBC_UNLIKELY(count >= kRepMovsbThreshold))
-      return x86::Memcpy::repmovsb(dst, src, count);
+      return x86::repmovsb(dst, src, count);
     else
       return inline_memcpy_x86(dst, src, count);
   }
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -1399,6 +1399,7 @@
         "src/string/memory_utils/op_x86.h",
         "src/string/memory_utils/utils.h",
     ],
+    defines = ["LIBC_COPT_MEMCPY_X86_USE_SOFTWARE_PREFETCHER"],
     textual_hdrs = [
         "src/string/memory_utils/bcmp_implementations.h",
         "src/string/memory_utils/bzero_implementations.h",
@@ -1421,6 +1422,7 @@
         ":__support_macros_config",
         ":__support_macros_optimization",
         ":__support_macros_properties_architectures",
+        ":__support_macros_properties_compiler",
         ":__support_macros_properties_cpu_features",
         ":libc_root",
     ],