diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp --- a/libc/src/string/aarch64/memcpy.cpp +++ b/libc/src/string/aarch64/memcpy.cpp @@ -54,7 +54,7 @@ return CopyBlockOverlap<32>(dst, src, count); if (count < 128) return CopyBlockOverlap<64>(dst, src, count); - return CopyAlignedBlocks<64, 16>(dst, src, count); + return CopySrcAlignedBlocks<64, 16>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp --- a/libc/src/string/memcpy.cpp +++ b/libc/src/string/memcpy.cpp @@ -52,7 +52,7 @@ return CopyBlockOverlap<32>(dst, src, count); if (count < 128) return CopyBlockOverlap<64>(dst, src, count); - return CopyAlignedBlocks<32>(dst, src, count); + return CopySrcAlignedBlocks<32>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h --- a/libc/src/string/memory_utils/memcpy_utils.h +++ b/libc/src/string/memory_utils/memcpy_utils.h @@ -98,8 +98,8 @@ // `count > 2 * kBlockSize` for efficiency. // `count >= kAlignment` for correctness. 
template -static void CopyAlignedBlocks(char *__restrict dst, const char *__restrict src, - size_t count) { +static void CopySrcAlignedBlocks(char *__restrict dst, + const char *__restrict src, size_t count) { static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); static_assert(kAlignment <= kBlockSize, @@ -116,6 +116,25 @@ CopyLastBlock(dst, src, count); // Copy last block } +template +static void CopyDstAlignedBlocks(char *__restrict dst, + const char *__restrict src, size_t count) { + static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); + static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); + static_assert(kAlignment <= kBlockSize, + "kAlignment must be less or equal to block size"); + CopyBlock(dst, src); // Copy first block + + // Copy aligned blocks + const size_t ofla = offset_from_last_aligned(dst); + const size_t limit = count + ofla - kBlockSize; + for (size_t offset = kAlignment; offset < limit; offset += kBlockSize) + CopyBlock(assume_aligned(dst - ofla + offset), + src - ofla + offset); + + CopyLastBlock(dst, src, count); // Copy last block +} + } // namespace __llvm_libc #endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H diff --git a/libc/src/string/x86/memcpy.cpp b/libc/src/string/x86/memcpy.cpp --- a/libc/src/string/x86/memcpy.cpp +++ b/libc/src/string/x86/memcpy.cpp @@ -87,7 +87,7 @@ if (kHasAvx && count < 256) return CopyBlockOverlap<128>(dst, src, count); if (count <= kRepMovsBSize) - return CopyAlignedBlocks(dst, src, count); + return CopyDstAlignedBlocks(dst, src, count); return CopyRepMovsb(dst, src, count); } diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp --- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp +++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp @@ -160,12 +160,12 @@ EXPECT_STREQ(trace.Read(), 
"01112111"); } -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocks) { +TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) { auto &trace = GetTrace(); // Source is aligned and multiple of alignment. // "1111" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(0), 4); + CopySrcAlignedBlocks<4>(I(0), I(0), 4); EXPECT_STREQ(trace.Write(), "2222"); EXPECT_STREQ(trace.Read(), "2222"); @@ -174,7 +174,7 @@ // + "00001111" // = "11111111" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(0), 8); + CopySrcAlignedBlocks<4>(I(0), I(0), 8); EXPECT_STREQ(trace.Write(), "11111111"); EXPECT_STREQ(trace.Read(), "11111111"); @@ -185,7 +185,7 @@ // + "0000000001111" // = "1111111112221" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(0), 13); + CopySrcAlignedBlocks<4>(I(0), I(0), 13); EXPECT_STREQ(trace.Write(), "1111111112221"); EXPECT_STREQ(trace.Read(), "1111111112221"); @@ -196,7 +196,7 @@ // + "00000000001111" // = "01112111112211" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(1), 13); + CopySrcAlignedBlocks<4>(I(0), I(1), 13); EXPECT_STREQ(trace.Write(), "1112111112211"); EXPECT_STREQ(trace.Read(), "01112111112211"); @@ -206,24 +206,89 @@ // + "000000001111" // = "011121111111" trace.Clear(); - CopyAlignedBlocks<4>(I(0), I(1), 11); + CopySrcAlignedBlocks<4>(I(0), I(1), 11); EXPECT_STREQ(trace.Write(), "11121111111"); EXPECT_STREQ(trace.Read(), "011121111111"); } +TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) { + auto &trace = GetTrace(); + // Destination is aligned and multiple of alignment. + // "1111" + trace.Clear(); + CopyDstAlignedBlocks<4>(I(0), I(0), 4); + EXPECT_STREQ(trace.Write(), "2222"); + EXPECT_STREQ(trace.Read(), "2222"); + + // Destination is aligned and multiple of alignment. + // "11110000" + // + "00001111" + // = "11111111" + trace.Clear(); + CopyDstAlignedBlocks<4>(I(0), I(0), 8); + EXPECT_STREQ(trace.Write(), "11111111"); + EXPECT_STREQ(trace.Read(), "11111111"); + + // Destination is aligned; the last block overlaps the previous one at the end. 
+ // "1111000000000" + // + "0000111100000" + // + "0000000011110" + // + "0000000001111" + // = "1111111112221" + trace.Clear(); + CopyDstAlignedBlocks<4>(I(0), I(0), 13); + EXPECT_STREQ(trace.Write(), "1111111112221"); + EXPECT_STREQ(trace.Read(), "1111111112221"); + + // Misaligned destination. + // "01111000000000" + // + "00001111000000" + // + "00000000111100" + // + "00000000001111" + // = "01112111112211" + trace.Clear(); + CopyDstAlignedBlocks<4>(I(1), I(0), 13); + EXPECT_STREQ(trace.Write(), "01112111112211"); + EXPECT_STREQ(trace.Read(), "1112111112211"); + + // Misaligned destination aligned at end. + // "011110000000" + // + "000011110000" + // + "000000001111" + // = "011121111111" + trace.Clear(); + CopyDstAlignedBlocks<4>(I(1), I(0), 11); + EXPECT_STREQ(trace.Write(), "011121111111"); + EXPECT_STREQ(trace.Read(), "11121111111"); +} + TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) { auto &trace = GetTrace(); // Source is aligned and multiple of alignment. // "11111111" trace.Clear(); - CopyAlignedBlocks<8, 4>(I(0), I(0), 8); + CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8); + EXPECT_STREQ(trace.Write(), "22221111"); + EXPECT_STREQ(trace.Read(), "22221111"); + + // Destination is aligned and multiple of alignment. + // "11111111" + trace.Clear(); + CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8); EXPECT_STREQ(trace.Write(), "22221111"); EXPECT_STREQ(trace.Read(), "22221111"); // Source is aligned and multiple of alignment. // "111111111" trace.Clear(); - CopyAlignedBlocks<8, 4>(I(0), I(0), 9); + CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9); + EXPECT_STREQ(trace.Write(), "122211111"); + EXPECT_STREQ(trace.Read(), "122211111"); + + // Destination is aligned and multiple of alignment. 
+ // "111111111" + trace.Clear(); + CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9); EXPECT_STREQ(trace.Write(), "122211111"); EXPECT_STREQ(trace.Read(), "122211111"); } @@ -234,7 +299,7 @@ for (size_t count = 64; count < 768; ++count) { trace.Clear(); // We should never reload more than twice when copying from count = 2x32. - CopyAlignedBlocks<32>(I(alignment), I(0), count); + CopySrcAlignedBlocks<32>(I(alignment), I(0), count); const char *const written = trace.Write(); // First bytes are untouched. for (size_t i = 0; i < alignment; ++i) @@ -254,7 +319,7 @@ for (size_t count = 64; count < 768; ++count) { trace.Clear(); // We should never reload more than twice when copying from count = 2x32. - CopyAlignedBlocks<32, 16>(I(alignment), I(0), count); + CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count); const char *const written = trace.Write(); // First bytes are untouched. for (size_t i = 0; i < alignment; ++i)