diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -357,7 +357,7 @@
 
 function(add_memcpy memcpy_name)
   add_implementation(memcpy ${memcpy_name}
-    SRCS ${MEMCPY_SRC}
+    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
     HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
     DEPENDS
       .memory_utils.memory_utils
@@ -369,7 +369,6 @@
 endfunction()
 
 if(${LIBC_TARGET_ARCHITECTURE_IS_X86})
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86_64/memcpy.cpp)
   add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2)
   add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2)
   add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2)
@@ -377,14 +376,12 @@
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 elseif(${LIBC_TARGET_ARCHITECTURE_IS_AARCH64})
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
   # Disable tail merging as it leads to lower performance.
   # Note that '-mllvm' needs to be prefixed with 'SHELL:' to prevent CMake flag deduplication.
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}
                              COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
   add_memcpy(memcpy COMPILE_OPTIONS "SHELL:-mllvm --tail-merge-threshold=0")
 else()
-  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
   add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE})
   add_memcpy(memcpy)
 endif()
diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
deleted file mode 100644
--- a/libc/src/string/aarch64/memcpy.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
-
-namespace __llvm_libc {
-
-using _1 = scalar::UINT8;
-using _2 = scalar::UINT16;
-using _3 = Chained<scalar::UINT16, scalar::UINT8>;
-using _4 = scalar::UINT32;
-using _8 = scalar::UINT64;
-using _16 = Repeated<scalar::UINT64, 2>;
-using _32 = Repeated<scalar::UINT64, 4>;
-using _64 = Repeated<scalar::UINT64, 8>;
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// We have used __builtin_expect to tell the compiler to favour lower sizes as
-// that will reduce the branching overhead where that would hurt most
-// proportional to total cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-// This implementation has been tuned for Neoverse-N1.
-static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
-                           size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<_1>(dst, src);
-  if (count == 2)
-    return Copy<_2>(dst, src);
-  if (count == 3)
-    return Copy<_3>(dst, src);
-  if (count == 4)
-    return Copy<_4>(dst, src);
-  if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
-}
-
-LLVM_LIBC_FUNCTION(void *, memcpy,
-                   (void *__restrict dst, const void *__restrict src,
-                    size_t size)) {
-  memcpy_aarch64(reinterpret_cast<char *>(dst),
-                 reinterpret_cast<const char *>(src), size);
-  return dst;
-}
-
-} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
--- a/libc/src/string/memcpy.cpp
+++ b/libc/src/string/memcpy.cpp
@@ -8,61 +8,15 @@
 
 #include "src/string/memcpy.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/memcpy_implementations.h"
 
 namespace __llvm_libc {
 
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_impl(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  // Use scalar strategies (_1, _2, _3 ...)
-  using namespace __llvm_libc::scalar;
-
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<_1>(dst, src);
-  if (count == 2)
-    return Copy<_2>(dst, src);
-  if (count == 3)
-    return Copy<_3>(dst, src);
-  if (count == 4)
-    return Copy<_4>(dst, src);
-  if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
-}
-
 LLVM_LIBC_FUNCTION(void *, memcpy,
                    (void *__restrict dst, const void *__restrict src,
                     size_t size)) {
-  memcpy_impl(reinterpret_cast<char *>(dst),
-              reinterpret_cast<const char *>(src), size);
+  inline_memcpy(reinterpret_cast<char *>(dst),
+                reinterpret_cast<const char *>(src), size);
   return dst;
 }
 
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -0,0 +1,159 @@
+//===-- Memcpy implementation -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
+
+#include "src/__support/architectures.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/utils.h"
+
+#include <stddef.h> // size_t
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+
+namespace __llvm_libc {
+
+// Copies `count` bytes from `src` to `dst`. The copy strategy is picked from
+// `count`; the set of strategies is fixed at compile time per architecture.
+static inline void inline_memcpy(char *__restrict dst,
+                                 const char *__restrict src, size_t count) {
+#if defined(LLVM_LIBC_ARCH_X86)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_X86
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::x86;
+
+  // Whether to use only rep;movsb.
+  constexpr bool kUseOnlyRepMovsb =
+      LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
+
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize == 0  : Only RepMovsb is used.
+  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize =
+#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE)
+      LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
+#else
+      -1;
+#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
+
+  // Whether target supports AVX instructions.
+  constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+
+#if defined(__AVX__)
+  using LoopBlockSize = _64;
+#else
+  using LoopBlockSize = _32;
+#endif
+
+  if (kUseOnlyRepMovsb)
+    return Copy<Accelerator>(dst, src, count);
+
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<_1>(dst, src);
+  if (count == 2)
+    return Copy<_2>(dst, src);
+  if (count == 3)
+    return Copy<_3>(dst, src);
+  if (count == 4)
+    return Copy<_4>(dst, src);
+  if (count < 8)
+    return Copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return Copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return Copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return Copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return Copy<HeadTail<_64>>(dst, src, count);
+  if (kHasAvx && count < 256)
+    return Copy<HeadTail<_128>>(dst, src, count);
+  if (count <= kRepMovsBSize)
+    return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
+                                                                 count);
+  return Copy<Accelerator>(dst, src, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::scalar;
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<_1>(dst, src);
+  if (count == 2)
+    return Copy<_2>(dst, src);
+  if (count == 3)
+    return Copy<_3>(dst, src);
+  if (count == 4)
+    return Copy<_4>(dst, src);
+  if (count < 8)
+    return Copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return Copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return Copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return Copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return Copy<HeadTail<_64>>(dst, src, count);
+  return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+#else
+  /////////////////////////////////////////////////////////////////////////////
+  // Default
+  /////////////////////////////////////////////////////////////////////////////
+  using namespace __llvm_libc::scalar;
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<_1>(dst, src);
+  if (count == 2)
+    return Copy<_2>(dst, src);
+  if (count == 3)
+    return Copy<_3>(dst, src);
+  if (count == 4)
+    return Copy<_4>(dst, src);
+  if (count < 8)
+    return Copy<HeadTail<_4>>(dst, src, count);
+  if (count < 16)
+    return Copy<HeadTail<_8>>(dst, src, count);
+  if (count < 32)
+    return Copy<HeadTail<_16>>(dst, src, count);
+  if (count < 64)
+    return Copy<HeadTail<_32>>(dst, src, count);
+  if (count < 128)
+    return Copy<HeadTail<_64>>(dst, src, count);
+  return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
+#endif
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_IMPLEMENTATIONS_H
diff --git a/libc/src/string/x86_64/memcpy.cpp b/libc/src/string/x86_64/memcpy.cpp
deleted file mode 100644
--- a/libc/src/string/x86_64/memcpy.cpp
+++ /dev/null
@@ -1,109 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
-
-namespace __llvm_libc {
-
-// Whether to use only rep;movsb.
-constexpr bool kUseOnlyRepMovsb =
-    LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB);
-
-// kRepMovsBSize == -1 : Only CopyAligned is used.
-// kRepMovsBSize == 0  : Only RepMovsb is used.
-// else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
-constexpr size_t kRepMovsBSize =
-#ifdef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-    LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE;
-#else
-    -1;
-#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE
-
-// Whether target supports AVX instructions.
-constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
-
-#ifdef __AVX__
-using LoopBlockSize = __llvm_libc::x86::_64;
-#else
-using LoopBlockSize = __llvm_libc::x86::_32;
-#endif
-
-static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
-                         size_t count) {
-  // FIXME: Add MSVC support with
-  // #include <intrin.h>
-  // __movsb(reinterpret_cast<unsigned char *>(dst),
-  //         reinterpret_cast<const unsigned char *>(src), count);
-  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
-}
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_x86(char *__restrict dst, const char *__restrict src,
-                       size_t count) {
-  // Use x86 strategies (_1, _2, _3 ...)
-  using namespace __llvm_libc::x86;
-
-  if (kUseOnlyRepMovsb)
-    return CopyRepMovsb(dst, src, count);
-
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<_1>(dst, src);
-  if (count == 2)
-    return Copy<_2>(dst, src);
-  if (count == 3)
-    return Copy<_3>(dst, src);
-  if (count == 4)
-    return Copy<_4>(dst, src);
-  if (count < 8)
-    return Copy<HeadTail<_4>>(dst, src, count);
-  if (count < 16)
-    return Copy<HeadTail<_8>>(dst, src, count);
-  if (count < 32)
-    return Copy<HeadTail<_16>>(dst, src, count);
-  if (count < 64)
-    return Copy<HeadTail<_32>>(dst, src, count);
-  if (count < 128)
-    return Copy<HeadTail<_64>>(dst, src, count);
-  if (kHasAvx && count < 256)
-    return Copy<HeadTail<_128>>(dst, src, count);
-  if (count <= kRepMovsBSize)
-    return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
-                                                                 count);
-  return CopyRepMovsb(dst, src, count);
-}
-
-LLVM_LIBC_FUNCTION(void *, memcpy,
-                   (void *__restrict dst, const void *__restrict src,
-                    size_t size)) {
-  memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
-             size);
-  return dst;
-}
-
-} // namespace __llvm_libc