diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp --- a/libc/src/string/aarch64/memcpy.cpp +++ b/libc/src/string/aarch64/memcpy.cpp @@ -54,7 +54,7 @@ return CopyBlockOverlap<32>(dst, src, count); if (count < 128) return CopyBlockOverlap<64>(dst, src, count); - return CopyAlignedBlocks<64, 16>(dst, src, count); + return CopySrcAlignedBlocks<64, 16>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp --- a/libc/src/string/memcpy.cpp +++ b/libc/src/string/memcpy.cpp @@ -52,7 +52,7 @@ return CopyBlockOverlap<32>(dst, src, count); if (count < 128) return CopyBlockOverlap<64>(dst, src, count); - return CopyAlignedBlocks<32>(dst, src, count); + return CopySrcAlignedBlocks<32>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h --- a/libc/src/string/memory_utils/memcpy_utils.h +++ b/libc/src/string/memory_utils/memcpy_utils.h @@ -98,8 +98,8 @@ // `count > 2 * kBlockSize` for efficiency. // `count >= kAlignment` for correctness. template -static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src, - size_t count) { +static void CopySrcAlignedBlocks(char *__restrict dst, + const char *__restrict src, size_t count) { static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); static_assert(kAlignment <= kBlockSize, @@ -116,6 +116,25 @@ CopyLastBlock(dst, src, count); // Copy last block } +template +static void CopyDstAlignedBlocks(char *__restrict dst, + const char *__restrict src, size_t count) { + static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); + static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); + static_assert(kAlignment <= kBlockSize, + "kAlignment must be less or equal to block size"); + CopyBlock(dst, src); // Copy first block + + // Copy aligned blocks + const size_t ofla = offset_from_last_aligned(dst); + const size_t limit = count + ofla - kBlockSize; + for (size_t offset = kAlignment; offset < limit; offset += kBlockSize) + CopyBlock(assume_aligned(dst - ofla + offset), + src - ofla + offset); + + CopyLastBlock(dst, src, count); // Copy last block +} + } // namespace __llvm_libc #endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H diff --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp --- a/libc/src/string/x86/memcpy.cpp +++ b/libc/src/string/x86/memcpy.cpp @@ -87,7 +87,7 @@ if (kHasAvx && count < 256) return CopyBlockOverlap<128>(dst, src, count); if (count <= kRepMovsBSize) - return CopyAlignedBlocks(dst, src, count); + return CopyDstAlignedBlocks(dst, src, count); return CopyRepMovsb(dst, src, count); } diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp --- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp +++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp @@ -160,12 +160,12 @@ EXPECT_STREQ(trace.Read(), "01112111"); } -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) { +TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) { auto &trace = GetTrace(); // Source is aligned and multiple of alignment. // "1111" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(0), 4); + CopySrcAlignedBlocks<4>(I(0), I(0), 4); EXPECT_STREQ(trace.Write(), "2222"); EXPECT_STREQ(trace.Read(), "2222"); @@ -174,7 +174,7 @@ // + "00001111" // = "11111111" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(0), 8); + CopySrcAlignedBlocks<4>(I(0), I(0), 8); EXPECT_STREQ(trace.Write(), "11111111"); EXPECT_STREQ(trace.Read(), "11111111"); @@ -185,7 +185,7 @@ // + "0000000001111" // = "1111111112221" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(0), 13); + CopySrcAlignedBlocks<4>(I(0), I(0), 13); EXPECT_STREQ(trace.Write(), "1111111112221"); EXPECT_STREQ(trace.Read(), "1111111112221"); @@ -196,7 +196,7 @@ // + "00000000001111" // = "01112111112211" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(1), 13); + CopySrcAlignedBlocks<4>(I(0), I(1), 13); EXPECT_STREQ(trace.Write(), "1112111112211"); EXPECT_STREQ(trace.Read(), "01112111112211"); @@ -206,7 +206,7 @@ // + "000000001111" // = "011121111111" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(1), 11); + CopySrcAlignedBlocks<4>(I(0), I(1), 11); EXPECT_STREQ(trace.Write(), "11121111111"); EXPECT_STREQ(trace.Read(), "011121111111"); } @@ -216,14 +216,14 @@ // Source is aligned and multiple of alignment. // "11111111" trace.Clear(); - CopyAlignedBlocks<8, 4>(I(0), I(0), 8); + CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8); EXPECT_STREQ(trace.Write(), "22221111"); EXPECT_STREQ(trace.Read(), "22221111"); // Source is aligned and multiple of alignment. // "111111111" trace.Clear(); - CopyAlignedBlocks<8, 4>(I(0), I(0), 9); + CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9); EXPECT_STREQ(trace.Write(), "122211111"); EXPECT_STREQ(trace.Read(), "122211111"); } @@ -234,7 +234,7 @@ for (size_t count = 64; count < 768; ++count) { trace.Clear(); // We should never reload more than twice when copying from count = 2x32. - CopyAlignedBlocks<32>(I(alignment), I(0), count); + CopySrcAlignedBlocks<32>(I(alignment), I(0), count); const char *const written = trace.Write(); // First bytes are untouched. for (size_t i = 0; i < alignment; ++i) @@ -254,7 +254,7 @@ for (size_t count = 64; count < 768; ++count) { trace.Clear(); // We should never reload more than twice when copying from count = 2x32. - CopyAlignedBlocks<32, 16>(I(alignment), I(0), count); + CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count); const char *const written = trace.Write(); // First bytes are untouched. for (size_t i = 0; i < alignment; ++i)