diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -68,30 +68,17 @@
 
 # include the relevant architecture specific implementations
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
-  set(LIBC_STRING_TARGET_FOLDER "x86")
+  set(LIBC_STRING_TARGET_ARCH "x86")
 else()
-  set(LIBC_STRING_TARGET_FOLDER ${LIBC_TARGET_MACHINE})
+  set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
 endif()
 
-add_gen_header(
-  memcpy_arch_specific
-  DEF_FILE
-  memcpy_arch_specific.h.def
-  GEN_HDR
-  memcpy_arch_specific.h
-  PARAMS
-  memcpy_arch_specific=${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
-  DATA_FILES
-  ${LIBC_STRING_TARGET_FOLDER}/memcpy_arch_specific.h.inc
-)
-
 function(add_memcpy memcpy_name)
   add_implementation(memcpy ${memcpy_name}
-    SRCS ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp
+    SRCS ${LIBC_SOURCE_DIR}/src/string/${LIBC_STRING_TARGET_ARCH}/memcpy.cpp
     HDRS ${LIBC_SOURCE_DIR}/src/string/memcpy.h
     DEPENDS
       .memory_utils.memory_utils
-      .memcpy_arch_specific
       libc.include.string
     COMPILE_OPTIONS
       -fno-builtin-memcpy
@@ -144,4 +131,4 @@
 
 # Add all other relevant implementations for the native target.
 # ------------------------------------------------------------------------------
-include(${LIBC_STRING_TARGET_FOLDER}/CMakeLists.txt)
+include(${LIBC_STRING_TARGET_ARCH}/CMakeLists.txt)
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
deleted file mode 100644
--- a/libc/src/string/memcpy.cpp
+++ /dev/null
@@ -1,22 +0,0 @@
-//===-- Implementation of memcpy ------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "src/string/memcpy.h"
-#include "src/__support/common.h"
-#include "src/string/memcpy_arch_specific.h"
-
-namespace __llvm_libc {
-
-void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
-                                   const void *__restrict src, size_t size) {
-  memcpy_no_return(reinterpret_cast<char *>(dst),
-                   reinterpret_cast<const char *>(src), size);
-  return dst;
-}
-
-} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
deleted file mode 100644
--- a/libc/src/string/memcpy_arch_specific.h.def
+++ /dev/null
@@ -1,65 +0,0 @@
-//===-- Implementation of arch specific memcpy ----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
-
-%%include_file(${memcpy_arch_specific})
-
-namespace __llvm_libc {
-
-// Design rationale
-// ================
-//
-// Using a profiler to observe size distributions for calls into libc
-// functions, it was found most operations act on a small number of bytes.
-// This makes it important to favor small sizes.
-//
-// The tests for `count` are in ascending order so the cost of branching is
-// proportional to the cost of copying.
-//
-// The function is written in C++ for several reasons:
-// - The compiler can __see__ the code, this is useful when performing Profile
-//   Guided Optimization as the optimized code can take advantage of branching
-//   probabilities.
-// - It also allows for easier customization and favors testing multiple
-//   implementation parameters.
-// - As compilers and processors get better, the generated code is improved
-//   with little change on the code side.
-static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
-                             size_t count) {
-  if (count == 0)
-    return;
-  if (count == 1)
-    return Copy<1>(dst, src);
-  if (count == 2)
-    return Copy<2>(dst, src);
-  if (count == 3)
-    return Copy<3>(dst, src);
-  if (count == 4)
-    return Copy<4>(dst, src);
-  if (count < 8)
-    return CopyOverlap<4>(dst, src, count);
-  if (count == 8)
-    return Copy<8>(dst, src);
-  if (count < 16)
-    return CopyOverlap<8>(dst, src, count);
-  if (count == 16)
-    return Copy<16>(dst, src);
-  if (count < 32)
-    return CopyOverlap<16>(dst, src, count);
-  if (count < 64)
-    return CopyOverlap<32>(dst, src, count);
-  if (count < 128)
-    return CopyOverlap<64>(dst, src, count);
-  CopyGE128(dst, src, count);
-}
-
-} // namespace __llvm_libc
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
--- a/libc/src/string/memory_utils/memcpy_utils.h
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -32,7 +32,7 @@
 
 // Copies `kBlockSize` bytes from `src` to `dst`.
 template <size_t kBlockSize>
-static void Copy(char *__restrict dst, const char *__restrict src) {
+static void CopyBlock(char *__restrict dst, const char *__restrict src) {
 #if defined(LLVM_LIBC_MEMCPY_MONITOR)
   LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
 #elif defined(USE_BUILTIN_MEMCPY_INLINE)
@@ -52,7 +52,7 @@
 static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
                           size_t count) {
   const size_t offset = count - kBlockSize;
-  Copy<kBlockSize>(dst + offset, src + offset);
+  CopyBlock<kBlockSize>(dst + offset, src + offset);
 }
 
 // Copies `kBlockSize` bytes twice with an overlap between the two.
@@ -64,9 +64,9 @@
 //
 // Precondition: `count >= kBlockSize && count <= kBlockSize`.
 template <size_t kBlockSize>
-static void CopyOverlap(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  Copy<kBlockSize>(dst, src);
+static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  CopyBlock<kBlockSize>(dst, src);
   CopyLastBlock<kBlockSize>(dst, src, count);
 }
 
@@ -85,14 +85,14 @@
 // Precondition: `count > 2 * kBlockSize` for efficiency.
 //               `count >= kBlockSize` for correctness.
 template <size_t kBlockSize>
-static void CopyAligned(char *__restrict dst, const char *__restrict src,
-                        size_t count) {
-  Copy<kBlockSize>(dst, src); // Copy first block
+static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src,
+                              size_t count) {
+  CopyBlock<kBlockSize>(dst, src); // Copy first block
 
   // Copy aligned blocks
   size_t offset = kBlockSize - offset_from_last_aligned(dst);
   for (; offset + kBlockSize < count; offset += kBlockSize)
-    Copy<kBlockSize>(dst + offset, src + offset);
+    CopyBlock<kBlockSize>(dst + offset, src + offset);
 
   CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
 }
diff --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/string/x86/memcpy.cpp
@@ -0,0 +1,94 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC support with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_x86(char *__restrict dst, const char *__restrict src,
+                       size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return CopyBlock<1>(dst, src);
+  if (count == 2)
+    return CopyBlock<2>(dst, src);
+  if (count == 3)
+    return CopyBlock<3>(dst, src);
+  if (count == 4)
+    return CopyBlock<4>(dst, src);
+  if (count < 8)
+    return CopyBlockOverlap<4>(dst, src, count);
+  if (count == 8)
+    return CopyBlock<8>(dst, src);
+  if (count < 16)
+    return CopyBlockOverlap<8>(dst, src, count);
+  if (count == 16)
+    return CopyBlock<16>(dst, src);
+  if (count < 32)
+    return CopyBlockOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyBlockOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyBlockOverlap<64>(dst, src, count);
+#if defined(__AVX__)
+  if (count < 256)
+    return CopyBlockOverlap<128>(dst, src, count);
+#endif
+  // kRepMovsBSize == -1 : Only CopyAlignedBlocks is used.
+  // kRepMovsBSize == 0  : Only CopyRepMovsb is used.
+  // else CopyAlignedBlocks is used up to kRepMovsBSize and then CopyRepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAlignedBlocks<BEST_SIZE>(dst, src, count);
+  return CopyRepMovsb(dst, src, count);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_x86(reinterpret_cast<char *>(dst), reinterpret_cast<const char *>(src),
+             size);
+  return dst;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/string/x86/memcpy_arch_specific.h.inc b/libc/src/string/x86/memcpy_arch_specific.h.inc
deleted file mode 100644
--- a/libc/src/string/x86/memcpy_arch_specific.h.inc
+++ /dev/null
@@ -1,35 +0,0 @@
-#include "src/string/memory_utils/memcpy_utils.h"
-
-namespace __llvm_libc {
-
-static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
-                         size_t count) {
-  // FIXME: Add MSVC support with
-  // #include <intrin.h>
-  // __movsb(reinterpret_cast<unsigned char *>(dst),
-  //         reinterpret_cast<const unsigned char *>(src), count);
-  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
-}
-
-#if defined(__AVX__)
-#define BEST_SIZE 64
-#else
-#define BEST_SIZE 32
-#endif
-
-static void CopyGE128(char *__restrict dst, const char *__restrict src,
-                      size_t count) {
-#if defined(__AVX__)
-  if (count < 256)
-    return CopyOverlap<128>(dst, src, count);
-#endif
-  // kRepMovsBSize == -1 : Only CopyAligned is used.
-  // kRepMovsBSize == 0 : Only RepMovsb is used.
-  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
-  constexpr size_t kRepMovsBSize = -1;
-  if (count <= kRepMovsBSize)
-    return CopyAligned<BEST_SIZE>(dst, src, count);
-  CopyRepMovsb(dst, src, count);
-}
-
-} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
--- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -83,37 +83,37 @@
   auto &trace = GetTrace();
 
   trace.Clear();
-  Copy<1>(I(0), I(0));
+  CopyBlock<1>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1");
   EXPECT_STREQ(trace.Read(), "1");
 
   trace.Clear();
-  Copy<2>(I(0), I(0));
+  CopyBlock<2>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11");
   EXPECT_STREQ(trace.Read(), "11");
 
   trace.Clear();
-  Copy<4>(I(0), I(0));
+  CopyBlock<4>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1111");
   EXPECT_STREQ(trace.Read(), "1111");
 
   trace.Clear();
-  Copy<8>(I(0), I(0));
+  CopyBlock<8>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11111111");
   EXPECT_STREQ(trace.Read(), "11111111");
 
   trace.Clear();
-  Copy<16>(I(0), I(0));
+  CopyBlock<16>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "1111111111111111");
   EXPECT_STREQ(trace.Read(), "1111111111111111");
 
   trace.Clear();
-  Copy<32>(I(0), I(0));
+  CopyBlock<32>(I(0), I(0));
   EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
   EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
 
   trace.Clear();
-  Copy<64>(I(0), I(0));
+  CopyBlock<64>(I(0), I(0));
   EXPECT_STREQ(
       trace.Write(),
       "1111111111111111111111111111111111111111111111111111111111111111");
@@ -126,41 +126,41 @@
   auto &trace = GetTrace();
 
   trace.Clear();
-  Copy<1>(I(3), I(1));
+  CopyBlock<1>(I(3), I(1));
   EXPECT_STREQ(trace.Write(), "0001");
   EXPECT_STREQ(trace.Read(), "01");
 
   trace.Clear();
-  Copy<1>(I(2), I(1));
+  CopyBlock<1>(I(2), I(1));
   EXPECT_STREQ(trace.Write(), "001");
   EXPECT_STREQ(trace.Read(), "01");
 }
 
-TEST(MemcpyUtilsTest, CopyOverlap) {
+TEST(MemcpyUtilsTest, CopyBlockOverlap) {
   auto &trace = GetTrace();
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 2);
+  CopyBlockOverlap<2>(I(0), I(0), 2);
   EXPECT_STREQ(trace.Write(), "22");
   EXPECT_STREQ(trace.Read(), "22");
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 3);
+  CopyBlockOverlap<2>(I(0), I(0), 3);
   EXPECT_STREQ(trace.Write(), "121");
   EXPECT_STREQ(trace.Read(), "121");
 
   trace.Clear();
-  CopyOverlap<2>(I(0), I(0), 4);
+  CopyBlockOverlap<2>(I(0), I(0), 4);
   EXPECT_STREQ(trace.Write(), "1111");
   EXPECT_STREQ(trace.Read(), "1111");
 
   trace.Clear();
-  CopyOverlap<4>(I(2), I(1), 7);
+  CopyBlockOverlap<4>(I(2), I(1), 7);
   EXPECT_STREQ(trace.Write(), "001112111");
   EXPECT_STREQ(trace.Read(), "01112111");
 }
 
-TEST(MemcpyUtilsTest, CopyAligned) {
+TEST(MemcpyUtilsTest, CopyAlignedBlocks) {
   auto &trace = GetTrace();
   // Destination is aligned already.
   //   "1111000000000"
@@ -169,7 +169,7 @@
   // + "0000000001111"
   // = "1111111112221"
   trace.Clear();
-  CopyAligned<4>(I(0), I(0), 13);
+  CopyAlignedBlocks<4>(I(0), I(0), 13);
   EXPECT_STREQ(trace.Write(), "1111111112221");
   EXPECT_STREQ(trace.Read(), "1111111112221");
 
@@ -180,7 +180,7 @@
   // + "00000000001111"
   // = "01112111112211"
   trace.Clear();
-  CopyAligned<4>(I(1), I(0), 13);
+  CopyAlignedBlocks<4>(I(1), I(0), 13);
   EXPECT_STREQ(trace.Write(), "01112111112211");
   EXPECT_STREQ(trace.Read(), "1112111112211");
 }
@@ -191,7 +191,7 @@
   for (size_t count = 64; count < 768; ++count) {
     trace.Clear();
     // We should never reload more than twice when copying from count = 2x32.
-    CopyAligned<32>(I(alignment), I(0), count);
+    CopyAlignedBlocks<32>(I(alignment), I(0), count);
     const char *const written = trace.Write();
     // First bytes are untouched.
     for (size_t i = 0; i < alignment; ++i)
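
Editorial note, not part of the patch: the helpers renamed above all build on one trick, covering an arbitrary `count` with a small number of fixed-size copies whose lengths are compile-time constants. A minimal standalone sketch of that overlapping-copy idea follows; the function names below are hypothetical and the real implementations live in src/string/memory_utils/memcpy_utils.h.

#include <cstddef>
#include <cstdio>

// Copies exactly kBlockSize bytes; the length is a compile-time constant,
// so the compiler can emit straight-line loads and stores.
template <size_t kBlockSize>
static void copy_fixed(char *__restrict dst, const char *__restrict src) {
  for (size_t i = 0; i < kBlockSize; ++i)
    dst[i] = src[i];
}

// Covers any count in [kBlockSize, 2 * kBlockSize] with two fixed-size
// copies: one anchored at the start, one anchored at the end. The two
// regions may overlap, which is harmless for memcpy semantics and avoids
// a variable-length loop.
template <size_t kBlockSize>
static void copy_overlapping(char *__restrict dst, const char *__restrict src,
                             size_t count) {
  copy_fixed<kBlockSize>(dst, src);
  copy_fixed<kBlockSize>(dst + count - kBlockSize, src + count - kBlockSize);
}

int main() {
  const char src[13] = "0123456789ab"; // 12 characters plus the terminator
  char dst[16] = {};
  copy_overlapping<8>(dst, src, 12);   // 8 <= 12 <= 16: the two blocks overlap by 4 bytes
  std::printf("%s\n", dst);            // prints 0123456789ab
  return 0;
}

The same pattern explains the dispatch in memcpy_x86: each `count < N` branch hands off to a CopyBlockOverlap instantiation whose block size is the previous power of two, so every size up to 128 (256 with AVX) is handled by exactly two fixed-size copies.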