diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -8,6 +8,7 @@
     # string.h entrypoints
     strcpy
     strcat
+    memcpy
 
     # sys/mman.h entrypoints
     mmap
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -1,3 +1,15 @@
+add_subdirectory(memory_utils)
+
+# Checking that the raw object does not contain any undefined symbols.
+function(check_no_undefined_symbols target_name)
+add_custom_command(
+  TARGET ${target_name}
+  POST_BUILD
+  COMMENT "Check that the generated code is valid"
+  COMMAND ! (nm --undefined-only $<TARGET_OBJECTS:${target_name}> | grep U && echo "Implementation contains undefined symbols")
+)
+endfunction(check_no_undefined_symbols)
+
 add_entrypoint_object(
   strcat
   SRCS
@@ -19,4 +31,28 @@
     string_h
 )
 
-add_subdirectory(memory_utils)
+add_gen_header(
+  memcpy_arch_specific
+  DEF_FILE
+    memcpy_arch_specific.h.def
+  GEN_HDR
+    memcpy_arch_specific.h
+  PARAMS
+    memcpy_arch_specific=memcpy_arch_specific_${LIBC_TARGET_MACHINE}.h.inc
+  DATA_FILES
+    memcpy_arch_specific_${LIBC_TARGET_MACHINE}.h.inc
+)
+
+set_property(SOURCE memcpy.cpp PROPERTY COMPILE_OPTIONS -fno-builtin-memcpy)
+add_entrypoint_object(
+  memcpy
+  SRCS
+    memcpy.cpp
+  HDRS
+    memcpy.h
+  DEPENDS
+    string_h
+    memory_utils
+    memcpy_arch_specific
+)
+check_no_undefined_symbols(memcpy)
diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h
new file
--- /dev/null
+++ b/libc/src/string/memcpy.h
@@ -0,0 +1,21 @@
+//===----------------- Implementation header for memcpy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMCPY_H
+
+#include "include/string.h"
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
new file
--- /dev/null
+++ b/libc/src/string/memcpy.cpp
@@ -0,0 +1,65 @@
+//===--------------------- Implementation of memcpy -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memcpy_arch_specific.h"
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of
+// bytes. This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, which is useful when performing
+//   Profile Guided Optimization as the optimized code can take advantage of
+//   branching probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<1>(dst, src);
+  if (count == 2)
+    return Copy<2>(dst, src);
+  if (count == 3)
+    return Copy<3>(dst, src);
+  if (count == 4)
+    return Copy<4>(dst, src);
+  if (count < 8)
+    return CopyOverlap<4>(dst, src, count);
+  if (count < 16)
+    return CopyOverlap<8>(dst, src, count);
+  if (count < 32)
+    return CopyOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyOverlap<64>(dst, src, count);
+  CopyGE128(dst, src, count);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_no_return(reinterpret_cast<char *>(dst),
+                   reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc
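To make the `count < 128` branch above concrete: a 100-byte copy is handled by `CopyOverlap<64>`, i.e. two 64-byte block copies that overlap in the middle. A minimal standalone sketch of that idea (illustration only, not part of the patch; `copy64` is a stand-in for the `Copy<64>` helper defined later in memcpy_utils.h):

```cpp
#include <stddef.h>
#include <string.h> // memcmp, only used to verify the result

// Stand-in for Copy<64>: copies exactly 64 bytes.
static void copy64(char *dst, const char *src) {
  for (size_t i = 0; i < 64; ++i)
    dst[i] = src[i];
}

int main() {
  char src[100], dst[100] = {};
  for (int i = 0; i < 100; ++i)
    src[i] = static_cast<char>(i);
  // CopyOverlap<64> with count == 100: copy the first and the last 64-byte
  // blocks; bytes [36, 64) are simply written twice.
  copy64(dst, src);
  copy64(dst + 100 - 64, src + 100 - 64);
  return memcmp(dst, src, 100) == 0 ? 0 : 1;
}
```

Writing some middle bytes twice is harmless for memcpy because source and destination may not overlap, and it keeps every size class down to exactly two block copies.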
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
new file
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific.h.def
@@ -0,0 +1,21 @@
+//===-------------- Implementation of arch specific memcpy ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+%%include_file(${memcpy_arch_specific})
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
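For orientation (not part of the patch): after `add_gen_header` resolves `%%include_file(${memcpy_arch_specific})` using the `PARAMS`/`DATA_FILES` arguments from libc/src/string/CMakeLists.txt, the generated memcpy_arch_specific.h for an x86_64 build should look roughly like the sketch below, with the architecture-specific helpers pulled in from the `.h.inc` file that follows.

```cpp
// Sketch of the generated memcpy_arch_specific.h (build-time output, assuming
// LIBC_TARGET_MACHINE == x86_64; the exact expansion is done by add_gen_header).
#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H

#include "src/string/memory_utils/memcpy_utils.h"

namespace __llvm_libc {

// <contents of memcpy_arch_specific_x86_64.h.inc: CopyRepMovsb and CopyGE128>

} // namespace __llvm_libc

#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
```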
diff --git a/libc/src/string/memcpy_arch_specific_x86_64.h.inc b/libc/src/string/memcpy_arch_specific_x86_64.h.inc
new file
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific_x86_64.h.inc
@@ -0,0 +1,19 @@
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC support with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+                      size_t count) {
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize == 0  : Only RepMovsb is used.
+  // else CopyAligned is used up to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAligned<32>(dst, src, count);
+  CopyRepMovsb(dst, src, count);
+}
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,9 @@
 add_header_library(
   memory_utils
-  HDRS utils.h
-  DEPENDS cacheline_size
+  HDRS
+    utils.h
+    memcpy_utils.h
+  DEPENDS
+    cacheline_size
 )
 
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,100 @@
+//===---------------------------- Memcpy utils ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+// This is useful for testing.
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+                                         const char *__restrict, size_t);
+#endif
+
+namespace __llvm_libc {
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void Copy(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+  __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+  __builtin_memcpy(dst, src, kBlockSize);
+#else
+  for (size_t i = 0; i < kBlockSize; ++i)
+    dst[i] = src[i];
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+                          size_t count) {
+  const size_t offset = count - kBlockSize;
+  Copy<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= 2 * kBlockSize`.
+template <size_t kBlockSize>
+static void CopyOverlap(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src);
+  CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+//               `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void CopyAligned(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+  for (; offset + kBlockSize < count; offset += kBlockSize)
+    Copy<kBlockSize>(dst + offset, src + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
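The `LLVM_LIBC_MEMCPY_MONITOR` hook above is what the trace-based unit tests later in this patch build on: when the macro is defined, `Copy<kBlockSize>` forwards to the named function instead of touching memory. A minimal sketch of the mechanism (illustration only; `trace_copy` and defining the macro in source rather than via `-D` are choices made for the sketch, the real tests use `LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor` set from CMake):

```cpp
#define LLVM_LIBC_MEMCPY_MONITOR trace_copy // normally passed on the compile line
#include "src/string/memory_utils/memcpy_utils.h"

#include <stdio.h>

// Receives every block copy instead of performing it.
extern "C" void trace_copy(char *__restrict dst, const char *__restrict src,
                           size_t count) {
  (void)dst;
  (void)src;
  printf("block copy of %zu bytes\n", count);
}

int main() {
  char buffer[32] = {};
  // Prints two 8-byte block copies (offsets 0 and 4), CopyOverlap's two blocks.
  __llvm_libc::CopyOverlap<8>(buffer, buffer + 16, 12);
  return 0;
}
```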
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -43,6 +43,11 @@
   return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
 }
 
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
+  static_assert(is_power2(alignment), "alignment must be a power of 2");
+  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
+}
+
 template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
   static_assert(is_power2(alignment), "alignment must be a power of 2");
   // The logic is not straightforward and involves unsigned modulo arithmetic
@@ -51,7 +56,7 @@
 }
 
 // Returns the offset from `ptr` to the next cache line.
-static intptr_t offset_to_next_cache_line(const void *ptr) {
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
   return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
 }
 
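A quick worked example of the two alignment helpers (illustration only; the values match the OffsetFromLastAligned test at the end of the patch, and the wrapper function plus the `__llvm_libc::` qualification are assumptions of the sketch): for an address of 18 and an alignment of 16, `offset_from_last_aligned<16>` is 2 (the distance back to 16) while `offset_to_next_aligned<16>` is 14 (the distance forward to 32).

```cpp
#include "src/string/memory_utils/utils.h"
#include <assert.h>

void alignment_helpers_example() {
  const void *ptr = reinterpret_cast<const void *>(18);
  assert(__llvm_libc::offset_from_last_aligned<16>(ptr) == 2); // 18 & 15
  assert(__llvm_libc::offset_to_next_aligned<16>(ptr) == 14);  // 32 - 18
  // For addresses that are not already aligned the two offsets add up to the
  // alignment; CopyAligned uses the first form to find where the aligned
  // middle blocks start.
}
```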
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -22,3 +22,13 @@
   DEPENDS
     strcpy
 )
+
+add_libc_unittest(
+  memcpy_test
+  SUITE
+    libc_string_unittests
+  SRCS
+    memcpy_test.cpp
+  DEPENDS
+    memcpy
+)
diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp
new file
--- /dev/null
+++ b/libc/test/src/string/memcpy_test.cpp
@@ -0,0 +1,19 @@
+//===----------------------- Unittests for memcpy -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "utils/UnitTest/Test.h"
+
+TEST(MemcpyTest, EmptyDest) {
+  const char src[] = "src";
+  char dest[4];
+
+  void *result = __llvm_libc::memcpy(dest, src, 4);
+  ASSERT_EQ(dest, reinterpret_cast<char *>(result));
+  ASSERT_STREQ(dest, src);
+}
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,7 +4,14 @@
     libc_string_unittests
   SRCS
     utils_test.cpp
+    memcpy_utils_test.cpp
   DEPENDS
     memory_utils
     standalone_cpp
 )
+
+target_compile_definitions(
+  utils_test
+  PRIVATE
+  LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
+)
diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
new file
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -0,0 +1,208 @@
+//===-------------------- Unittests for memory_utils ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/memcpy_utils.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdint.h> // uintptr_t
+
+#ifndef LLVM_LIBC_MEMCPY_MONITOR
+#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
+#endif
+
+namespace __llvm_libc {
+
+struct Buffer {
+  static constexpr size_t kMaxBuffer = 1024;
+  char buffer[kMaxBuffer + 1];
+  size_t last = 0;
+
+  void Clear() {
+    last = 0;
+    for (size_t i = 0; i < kMaxBuffer; ++i)
+      buffer[i] = '0';
+    buffer[kMaxBuffer] = '\0';
+  }
+
+  void Increment(const void *ptr) {
+    const auto offset = reinterpret_cast<uintptr_t>(ptr);
+    assert(offset < kMaxBuffer);
+    ++buffer[offset];
+    if (offset > last)
+      last = offset;
+  }
+
+  char *Finish() {
+    assert(last < kMaxBuffer);
+    buffer[last + 1] = '\0';
+    return buffer;
+  }
+};
+
+struct Trace {
+  Buffer read;
+  Buffer write;
+
+  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
+    for (size_t i = 0; i < count; ++i)
+      read.Increment(src + i);
+    for (size_t i = 0; i < count; ++i)
+      write.Increment(dst + i);
+  }
+
+  void Clear() {
+    read.Clear();
+    write.Clear();
+  }
+
+  char *Read() { return read.Finish(); }
+  char *Write() { return write.Finish(); }
+};
+
+static Trace &GetTrace() {
+  static thread_local Trace events;
+  return events;
+}
+
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
+                                         const char *__restrict src,
+                                         size_t count) {
+  GetTrace().Add(dst, src, count);
+}
+
+char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }
+
+TEST(MemcpyUtilsTest, CopyTrivial) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  Copy<1>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1");
+  EXPECT_STREQ(trace.Read(), "1");
+
+  trace.Clear();
+  Copy<2>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11");
+  EXPECT_STREQ(trace.Read(), "11");
+
+  trace.Clear();
+  Copy<4>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  Copy<8>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  trace.Clear();
+  Copy<16>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111111111111111");
+  EXPECT_STREQ(trace.Read(), "1111111111111111");
+
+  trace.Clear();
+  Copy<32>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
+  EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
+
+  trace.Clear();
+  Copy<64>(I(0), I(0));
+  EXPECT_STREQ(
+      trace.Write(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+  EXPECT_STREQ(
+      trace.Read(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+}
+
+TEST(MemcpyUtilsTest, CopyOffset) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  Copy<1>(I(3), I(1));
+  EXPECT_STREQ(trace.Write(), "0001");
+  EXPECT_STREQ(trace.Read(), "01");
+
+  trace.Clear();
+  Copy<1>(I(2), I(1));
+  EXPECT_STREQ(trace.Write(), "001");
+  EXPECT_STREQ(trace.Read(), "01");
+}
+
+TEST(MemcpyUtilsTest, CopyOverlap) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 2);
+  EXPECT_STREQ(trace.Write(), "22");
+  EXPECT_STREQ(trace.Read(), "22");
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 3);
+  EXPECT_STREQ(trace.Write(), "121");
+  EXPECT_STREQ(trace.Read(), "121");
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  CopyOverlap<4>(I(2), I(1), 7);
+  EXPECT_STREQ(trace.Write(), "001112111");
+  EXPECT_STREQ(trace.Read(), "01112111");
+}
+
+TEST(MemcpyUtilsTest, CopyAligned) {
+  auto &trace = GetTrace();
+  // Destination is aligned already.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopyAligned<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned destination
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopyAligned<4>(I(1), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "01112111112211");
+  EXPECT_STREQ(trace.Read(), "1112111112211");
+}
+
+TEST(MemcpyUtilsTest, MaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopyAligned<32>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // First bytes are untouched.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // Next bytes are loaded once or twice but no more.
+      for (size_t i = alignment; i < count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -87,6 +87,14 @@
   EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
 }
 
+TEST(UtilsTest, OffsetFromLastAligned) {
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
+  EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
+}
+
 TEST(UtilsTest, OffsetToNextCacheLine) {
   EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
   EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
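The MaxReloads test above uses the trace to check that `CopyAligned<32>` touches every destination byte at least once and at most twice. The same property can also be checked against a standalone model of the block schedule (illustration only, not part of the patch; the helper names are made up):

```cpp
#include <assert.h>
#include <stddef.h>
#include <vector>

// Models the write pattern of CopyAligned<N> for a destination at address
// `dst_addr`: one unaligned first block, aligned middle blocks, and one
// unaligned last block.
template <size_t N>
static void check_copy_aligned_coverage(size_t dst_addr, size_t count) {
  std::vector<unsigned> writes(count, 0);
  auto block = [&](size_t offset) {
    for (size_t i = 0; i < N; ++i)
      ++writes[offset + i];
  };
  block(0);                           // first (unaligned) block
  size_t offset = N - (dst_addr % N); // first aligned destination offset
  for (; offset + N < count; offset += N)
    block(offset);                    // aligned middle blocks
  block(count - N);                   // last (unaligned) block
  for (unsigned w : writes)
    assert(w >= 1 && w <= 2); // written once or twice, never more
}

int main() {
  for (size_t alignment = 0; alignment < 32; ++alignment)
    for (size_t count = 64; count < 768; ++count)
      check_copy_aligned_coverage<32>(alignment, count);
  return 0;
}
```

The lower bound of 64 mirrors the dispatch in memcpy.cpp, which only reaches `CopyAligned<32>` (via `CopyGE128`) for counts of at least 128; smaller counts go through `CopyOverlap` instead.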