diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -8,6 +8,7 @@
     # string.h entrypoints
     strcpy
     strcat
+    memcpy
 
     # sys/mman.h entrypoints
     mmap
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -1,3 +1,17 @@
+add_subdirectory(memory_utils)
+
+# Checking that the raw object does not contain any undefined symbols.
+function(check_no_undefined_symbols target_name)
+if(CMAKE_BUILD_TYPE MATCHES RELEASE)
+add_custom_command(
+  TARGET ${target_name}
+  POST_BUILD
+  COMMENT "Check that the generated code is valid"
+  COMMAND ! (nm --undefined-only $<TARGET_PROPERTY:${target_name},OBJECT_FILE_RAW> | grep U && echo "Implementation contains undefined symbols") 
+)
+endif(CMAKE_BUILD_TYPE MATCHES RELEASE)
+endfunction(check_no_undefined_symbols)
+
 add_entrypoint_object(
   strcat
   SRCS
@@ -19,4 +33,28 @@
     string_h
 )
 
-add_subdirectory(memory_utils)
+add_gen_header(
+  memcpy_arch_specific
+  DEF_FILE
+    memcpy_arch_specific.h.def
+  GEN_HDR
+    memcpy_arch_specific.h
+  PARAMS
+    memcpy_arch_specific=memcpy_arch_specific_${LIBC_TARGET_MACHINE}.h.inc
+  DATA_FILES
+    memcpy_arch_specific_${LIBC_TARGET_MACHINE}.h.inc
+)
+
+set_property(SOURCE memcpy.cpp PROPERTY COMPILE_OPTIONS -fno-builtin-memcpy)
+add_entrypoint_object(
+  memcpy
+  SRCS
+    memcpy.cpp
+  HDRS
+    memcpy.h
+  DEPENDS
+    string_h
+    memory_utils
+    memcpy_arch_specific
+)
+check_no_undefined_symbols(memcpy)
diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy.h
@@ -0,0 +1,21 @@
+//===----------------- Implementation header for memcpy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMCPY_H
+
+#include "include/string.h"
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy.cpp
@@ -0,0 +1,65 @@
+//===--------------------- Implementation of memcpy -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memcpy_arch_specific.h"
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc functions,
+// it was found most operations act on a small number of bytes. This makes it
+// important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved with
+//   little change on the code side.
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<1>(dst, src);
+  if (count == 2)
+    return Copy<2>(dst, src);
+  if (count == 3)
+    return Copy<3>(dst, src);
+  if (count == 4)
+    return Copy<4>(dst, src);
+  if (count < 8)
+    return CopyOverlap<4>(dst, src, count);
+  if (count < 16)
+    return CopyOverlap<8>(dst, src, count);
+  if (count < 32)
+    return CopyOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyOverlap<64>(dst, src, count);
+  CopyGE128(dst, src, count);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_no_return(reinterpret_cast<char *>(dst),
+                   reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific.h.def
@@ -0,0 +1,21 @@
+//===-------------- Implementation of arch specific memcpy ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+%%include_file(${memcpy_arch_specific})
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
diff --git a/libc/src/string/memcpy_arch_specific_x86.inc b/libc/src/string/memcpy_arch_specific_x86.inc
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific_x86.inc
@@ -0,0 +1,19 @@
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC suppport with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+                      size_t count) {
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize ==  0 : Only RepMovsb is used.
+  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAligned<32>(dst, src, count);
+  CopyRepMovsb(dst, src, count);
+}
diff --git a/libc/src/string/memcpy_arch_specific_x86_64.h.inc b/libc/src/string/memcpy_arch_specific_x86_64.h.inc
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific_x86_64.h.inc
@@ -0,0 +1,19 @@
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC suppport with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+                      size_t count) {
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize ==  0 : Only RepMovsb is used.
+  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAligned<32>(dst, src, count);
+  CopyRepMovsb(dst, src, count);
+}
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,9 @@
 
 add_header_library(
   memory_utils
-  HDRS utils.h
-  DEPENDS cacheline_size
+  HDRS
+    utils.h
+    memcpy_utils.h
+  DEPENDS
+    cacheline_size
 )
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,100 @@
+//===---------------------------- Memcpy utils ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+// This is useful for testing.
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+                                         const char *__restrict, size_t);
+#endif
+
+namespace __llvm_libc {
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void Copy(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+  __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+  __builtin_memcpy(dst, src, kBlockSize);
+#else
+  for (size_t i = 0; i < kBlockSize; ++i)
+    dst[i] = src[i];
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+                          size_t count) {
+  const size_t offset = count - kBlockSize;
+  Copy<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyOverlap(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src);
+  CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+//               `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void CopyAligned(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+  for (; offset + kBlockSize < count; offset += kBlockSize)
+    Copy<kBlockSize>(dst + offset, src + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif //  LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -43,6 +43,11 @@
   return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
 }
 
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
+  static_assert(is_power2(alignment), "alignment must be a power of 2");
+  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
+}
+
 template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
   static_assert(is_power2(alignment), "alignment must be a power of 2");
   // The logic is not straightforward and involves unsigned modulo arithmetic
@@ -51,7 +56,7 @@
 }
 
 // Returns the offset from `ptr` to the next cache line.
-static intptr_t offset_to_next_cache_line(const void *ptr) {
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
   return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
 }
 
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -22,3 +22,13 @@
   DEPENDS
     strcpy
 )
+
+add_libc_unittest(
+  memcpy_test
+  SUITE
+    libc_string_unittests
+  SRCS
+    memcpy_test.cpp
+  DEPENDS
+    memcpy
+)
diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/string/memcpy_test.cpp
@@ -0,0 +1,53 @@
+//===----------------------- Unittests for memcpy -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "utils/CPP/ArrayRef.h"
+#include "utils/UnitTest/Test.h"
+
+using __llvm_libc::cpp::Array;
+using __llvm_libc::cpp::ArrayRef;
+using __llvm_libc::cpp::MutableArrayRef;
+using Data = Array<char, 2048>;
+
+static const ArrayRef<char> kNumbers("0123456789", 10);
+static const ArrayRef<char> kDeadcode("DEADC0DE", 8);
+
+// Returns a Data object filled with a repetition of `filler`.
+Data getData(ArrayRef<char> filler) {
+  Data out;
+  for (size_t i = 0; i < out.size(); ++i)
+    out[i] = filler[i % filler.size()];
+  return out;
+}
+
+TEST(MemcpyTest, Thorough) {
+  const Data groundtruth = getData(kNumbers);
+  const Data dirty = getData(kDeadcode);
+  for (size_t count = 0; count < 1024; ++count) {
+    for (size_t align = 0; align < 64; ++align) {
+      auto buffer = dirty;
+      const char *const src = groundtruth.data();
+      char *const dst = &buffer[align];
+      __llvm_libc::memcpy(dst, src, count);
+      // Everything before copy is untouched.
+      for (size_t i = 0; i < align; ++i)
+        ASSERT_EQ(buffer[i], dirty[i]);
+      // Everything in between is copied.
+      for (size_t i = 0; i < count; ++i)
+        ASSERT_EQ(buffer[align + i], groundtruth[i]);
+      // Everything after copy is untouched.
+      for (size_t i = align + count; i < dirty.size(); ++i)
+        ASSERT_EQ(buffer[i], dirty[i]);
+    }
+  }
+}
+
+// FIXME: Add tests with reads and writes on the boundary of a read/write
+// protected page to check we're not reading nor writing prior/past the allowed
+// regions.
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,7 +4,14 @@
     libc_string_unittests
   SRCS
     utils_test.cpp
+    memcpy_utils_test.cpp
   DEPENDS
     memory_utils
     standalone_cpp
 )
+
+target_compile_definitions(
+  utils_test
+  PRIVATE
+  LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
+)
diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -0,0 +1,208 @@
+//===-------------------- Unittests for memory_utils ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/memcpy_utils.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdint.h> // uintptr_t
+
+#ifndef LLVM_LIBC_MEMCPY_MONITOR
+#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
+#endif
+
+namespace __llvm_libc {
+
+struct Buffer {
+  static constexpr size_t kMaxBuffer = 1024;
+  char buffer[kMaxBuffer + 1];
+  size_t last = 0;
+
+  void Clear() {
+    last = 0;
+    for (size_t i = 0; i < kMaxBuffer; ++i)
+      buffer[i] = '0';
+    buffer[kMaxBuffer] = '\0';
+  }
+
+  void Increment(const void *ptr) {
+    const auto offset = reinterpret_cast<uintptr_t>(ptr);
+    assert(offset < kMaxBuffer);
+    ++buffer[offset];
+    if (offset > last)
+      last = offset;
+  }
+
+  char *Finish() {
+    assert(last < kMaxBuffer);
+    buffer[last + 1] = '\0';
+    return buffer;
+  }
+};
+
+struct Trace {
+  Buffer read;
+  Buffer write;
+
+  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
+    for (size_t i = 0; i < count; ++i)
+      read.Increment(src + i);
+    for (size_t i = 0; i < count; ++i)
+      write.Increment(dst + i);
+  }
+
+  void Clear() {
+    read.Clear();
+    write.Clear();
+  }
+
+  char *Read() { return read.Finish(); }
+  char *Write() { return write.Finish(); }
+};
+
+static Trace &GetTrace() {
+  static thread_local Trace events;
+  return events;
+}
+
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
+                                         const char *__restrict src,
+                                         size_t count) {
+  GetTrace().Add(dst, src, count);
+}
+
+char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }
+
+TEST(MemcpyUtilsTest, CopyTrivial) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  Copy<1>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1");
+  EXPECT_STREQ(trace.Read(), "1");
+
+  trace.Clear();
+  Copy<2>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11");
+  EXPECT_STREQ(trace.Read(), "11");
+
+  trace.Clear();
+  Copy<4>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  Copy<8>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  trace.Clear();
+  Copy<16>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111111111111111");
+  EXPECT_STREQ(trace.Read(), "1111111111111111");
+
+  trace.Clear();
+  Copy<32>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
+  EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
+
+  trace.Clear();
+  Copy<64>(I(0), I(0));
+  EXPECT_STREQ(
+      trace.Write(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+  EXPECT_STREQ(
+      trace.Read(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+}
+
+TEST(MemcpyUtilsTest, CopyOffset) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  Copy<1>(I(3), I(1));
+  EXPECT_STREQ(trace.Write(), "0001");
+  EXPECT_STREQ(trace.Read(), "01");
+
+  trace.Clear();
+  Copy<1>(I(2), I(1));
+  EXPECT_STREQ(trace.Write(), "001");
+  EXPECT_STREQ(trace.Read(), "01");
+}
+
+TEST(MemcpyUtilsTest, CopyOverlap) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 2);
+  EXPECT_STREQ(trace.Write(), "22");
+  EXPECT_STREQ(trace.Read(), "22");
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 3);
+  EXPECT_STREQ(trace.Write(), "121");
+  EXPECT_STREQ(trace.Read(), "121");
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  CopyOverlap<4>(I(2), I(1), 7);
+  EXPECT_STREQ(trace.Write(), "001112111");
+  EXPECT_STREQ(trace.Read(), "01112111");
+}
+
+TEST(MemcpyUtilsTest, CopyAligned) {
+  auto &trace = GetTrace();
+  // Destination is aligned already.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopyAligned<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned destination
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopyAligned<4>(I(1), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "01112111112211");
+  EXPECT_STREQ(trace.Read(), "1112111112211");
+}
+
+TEST(MemcpyUtilsTest, MaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopyAligned<32>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // First bytes are untouched.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // Next bytes are loaded once or twice but no more.
+      for (size_t i = alignment; i < count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -87,6 +87,14 @@
   EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
 }
 
+TEST(UtilsTest, OffsetFromLastAligned) {
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
+  EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
+}
+
 TEST(UtilsTest, OffsetToNextCacheLine) {
   EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
   EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));