diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt
--- a/libc/CMakeLists.txt
+++ b/libc/CMakeLists.txt
@@ -21,6 +21,7 @@
 
 include(CMakeParseArguments)
 include(LLVMLibCRules)
+include(LLVMLibCCheckCpuFeatures)
 
 add_subdirectory(src)
 add_subdirectory(config)
diff --git a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
--- a/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
+++ b/libc/cmake/modules/LLVMLibCCheckCpuFeatures.cmake
@@ -69,15 +69,19 @@
     _generate_flags_for_up_to(${feature} ${flag_name})
     set(${flag_name} ${${flag_name}} PARENT_SCOPE)
   endforeach()
+  # Populates the CPU_FEATURE_OPT_LEVELS with all opt levels.
+  list(TRANSFORM opt_levels PREPEND "CPU_FEATURE_OPT_")
+  list(TRANSFORM opt_levels TOUPPER)
+  set(CPU_FEATURE_OPT_LEVELS ${opt_levels} PARENT_SCOPE)
 endfunction()
 
 _generate_opt_levels()
 
 #------------------------------------------------------------------------------
 # Host cpu feature introspection
-#
-# Populates a HOST_CPU_FEATURES list containing the available CPU_FEATURE.
 #------------------------------------------------------------------------------
+
+# Populates a HOST_CPU_FEATURES list containing the available CPU_FEATURE.
 function(_check_host_cpu_feature feature)
   string(TOLOWER ${feature} lowercase_feature)
   try_run(
@@ -97,3 +101,33 @@
 foreach(feature IN LISTS ALL_CPU_FEATURES)
   _check_host_cpu_feature(${feature})
 endforeach()
+
+# Populates a HOST_CPU_FEATURE_OPT_LEVEL and HOST_CPU_FEATURE_OPT_FLAGS.
+function(_check_host_opt_level)
+  if(HOST_CPU_FEATURES)
+    list(GET HOST_CPU_FEATURES -1 selected_feature)
+  else()
+    set(selected_feature NONE)
+  endif()
+  set(HOST_CPU_FEATURE_OPT_LEVEL "CPU_FEATURE_OPT_${selected_feature}" PARENT_SCOPE)
+  set(HOST_CPU_FEATURE_OPT_FLAGS "${CPU_FEATURE_OPT_${selected_feature}_FLAGS}" PARENT_SCOPE)
+endfunction()
+
+_check_host_opt_level()
+
+# Convenient function to generate unique name for a particular function, architecture
+# and optimization level.
+function(generate_function_name function arch opt_level output)
+  if(NOT function)
+    message(FATAL_ERROR "function name must not be empty")
+  endif()
+  if(NOT arch)
+    message(FATAL_ERROR "architecture name must not be empty")
+  endif()
+  if(NOT ${opt_level} IN_LIST CPU_FEATURE_OPT_LEVELS)
+    message(FATAL_ERROR "Invalid opt_level, should be one of ${CPU_FEATURE_OPT_LEVELS} was ${opt_level}")
+  endif()
+  string(REPLACE "CPU_FEATURE_" "" short_opt ${opt_level})
+  string(TOLOWER ${short_opt} lower_short_opt)
+  set(${output} ${function}_${arch}_${lower_short_opt} PARENT_SCOPE)
+endfunction()
diff --git a/libc/cmake/modules/LLVMLibCRules.cmake b/libc/cmake/modules/LLVMLibCRules.cmake
--- a/libc/cmake/modules/LLVMLibCRules.cmake
+++ b/libc/cmake/modules/LLVMLibCRules.cmake
@@ -146,6 +146,18 @@
     PRIVATE
       "${LIBC_BUILD_DIR}/include;${LIBC_SOURCE_DIR};${LIBC_BUILD_DIR}"
   )
+  if(ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS)
+    target_compile_options(
+      ${target_name}_objects
+      PRIVATE ${ADD_ENTRYPOINT_OBJ_COMPILE_OPTIONS}
+    )
+  endif()
+  if(ADD_ENTRYPOINT_OBJ_COMPILE_DEFINITIONS)
+    target_compile_definitions(
+      ${target_name}_objects
+      PRIVATE ${ADD_ENTRYPOINT_OBJ_COMPILE_DEFINITIONS}
+    )
+  endif()
   add_dependencies(
     ${target_name}_objects
     support_common_h
@@ -308,6 +320,8 @@
 #      SRCS  <list of .cpp files for the test>
 #      HDRS  <list of .h files for the test>
 #      DEPENDS <list of dependencies>
+#      COMPILE_OPTIONS <list of compile options>
+#      COMPILE_DEFINITIONS <list of compile definitions>
 #    )
 function(add_libc_unittest target_name)
   if(NOT LLVM_INCLUDE_TESTS)
@@ -318,7 +332,7 @@
     "LIBC_UNITTEST"
     "" # No optional arguments
     "SUITE" # Single value arguments
-    "SRCS;HDRS;DEPENDS" # Multi-value arguments
+    "SRCS;HDRS;DEPENDS;COMPILE_OPTIONS;COMPILE_DEFINITIONS" # Multi-value arguments
     ${ARGN}
   )
   if(NOT LIBC_UNITTEST_SRCS)
@@ -356,7 +370,18 @@
       ${LIBC_BUILD_DIR}
       ${LIBC_BUILD_DIR}/include
   )
-
+  if(LIBC_UNITTEST_COMPILE_OPTIONS)
+    target_compile_options(
+      ${target_name}
+      PRIVATE ${LIBC_UNITTEST_COMPILE_OPTIONS}
+    )
+  endif()
+  if(LIBC_UNITTEST_COMPILE_DEFINITIONS)
+    target_compile_definitions(
+      ${target_name}
+      PRIVATE ${LIBC_UNITTEST_COMPILE_DEFINITIONS}
+    )
+  endif()
   if(library_deps)
     target_link_libraries(${target_name} PRIVATE ${library_deps})
   endif()
diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -8,6 +8,7 @@
     # string.h entrypoints
     strcpy
     strcat
+    memcpy
 
     # sys/mman.h entrypoints
     mmap
diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -1,3 +1,5 @@
+add_subdirectory(memory_utils)
+
 add_entrypoint_object(
   strcat
   SRCS
@@ -19,4 +21,52 @@
     string_h
 )
 
-add_subdirectory(memory_utils)
+# include the relevant architecture specific implementations
+if(${LIBC_TARGET_MACHINE} STREQUAL "x86")
+  set(LIBC_MEMCPY_IMPL_FOLDER "x86_64")
+else()
+  set(LIBC_MEMCPY_IMPL_FOLDER ${LIBC_TARGET_MACHINE})
+endif()
+
+add_subdirectory(${LIBC_MEMCPY_IMPL_FOLDER})
+
+add_gen_header(
+  memcpy_arch_specific
+  DEF_FILE
+    memcpy_arch_specific.h.def
+  GEN_HDR
+    memcpy_arch_specific.h
+  PARAMS
+    memcpy_arch_specific=${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+  DATA_FILES
+    ${LIBC_MEMCPY_IMPL_FOLDER}/memcpy_arch_specific.h.inc
+)
+
+foreach(opt_level ${LIBC_MEMCPY_OPT_LEVELS})
+  generate_function_name(memcpy ${LIBC_TARGET_MACHINE} ${opt_level} memcpy_name)
+  add_entrypoint_object(
+    ${memcpy_name}
+    SRCS memcpy.cpp
+    HDRS memcpy.h
+    DEPENDS
+      string_h
+      memory_utils
+      memcpy_arch_specific
+    COMPILE_OPTIONS
+      -fno-builtin-memcpy
+      ${${opt_level}_FLAGS}
+  )
+endforeach()
+
+add_entrypoint_object(
+  memcpy
+  SRCS memcpy.cpp
+  HDRS memcpy.h
+  DEPENDS
+    string_h
+    memory_utils
+    memcpy_arch_specific
+  COMPILE_OPTIONS
+    -fno-builtin-memcpy
+    ${HOST_CPU_FEATURE_OPT_FLAGS}
+)
diff --git a/libc/src/string/memcpy.h b/libc/src/string/memcpy.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy.h
@@ -0,0 +1,21 @@
+//===----------------- Implementation header for memcpy -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMCPY_H
+#define LLVM_LIBC_SRC_STRING_MEMCPY_H
+
+#include "include/string.h"
+#include <stddef.h> // size_t
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMCPY_H
diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy.cpp
@@ -0,0 +1,22 @@
+//===--------------------- Implementation of memcpy -----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memcpy_arch_specific.h"
+
+namespace __llvm_libc {
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_no_return(reinterpret_cast<char *>(dst),
+                   reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc
diff --git a/libc/src/string/memcpy_arch_specific.h.def b/libc/src/string/memcpy_arch_specific.h.def
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memcpy_arch_specific.h.def
@@ -0,0 +1,65 @@
+//===-------------- Implementation of arch specific memcpy ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
+
+%%include_file(${memcpy_arch_specific})
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found most operations act on a small number of bytes.
+// This makes it important to favor small sizes.
+//
+// The tests for `count` are in ascending order so the cost of branching is
+// proportional to the cost of copying.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, this is useful when performing Profile
+//   Guided Optimization as the optimized code can take advantage of branching
+//   probabilities.
+// - It also allows for easier customization and favors testing multiple
+//   implementation parameters.
+// - As compilers and processors get better, the generated code is improved
+//   with little change on the code side.
+static void memcpy_no_return(char *__restrict dst, const char *__restrict src,
+                             size_t count) {
+  if (count == 0)
+    return;
+  if (count == 1)
+    return Copy<1>(dst, src);
+  if (count == 2)
+    return Copy<2>(dst, src);
+  if (count == 3)
+    return Copy<3>(dst, src);
+  if (count == 4)
+    return Copy<4>(dst, src);
+  if (count < 8)
+    return CopyOverlap<4>(dst, src, count);
+  if (count == 8)
+    return Copy<8>(dst, src);
+  if (count < 16)
+    return CopyOverlap<8>(dst, src, count);
+  if (count == 16)
+    return Copy<16>(dst, src);
+  if (count < 32)
+    return CopyOverlap<16>(dst, src, count);
+  if (count < 64)
+    return CopyOverlap<32>(dst, src, count);
+  if (count < 128)
+    return CopyOverlap<64>(dst, src, count);
+  CopyGE128(dst, src, count);
+}
+
+} // namespace __llvm_libc
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_ARCH_H
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -12,6 +12,9 @@
 
 add_header_library(
   memory_utils
-  HDRS utils.h
-  DEPENDS cacheline_size
+  HDRS
+    utils.h
+    memcpy_utils.h
+  DEPENDS
+    cacheline_size
 )
diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/memcpy_utils.h
@@ -0,0 +1,100 @@
+//===---------------------------- Memcpy utils ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+#define LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
+
+#include "src/string/memory_utils/utils.h"
+#include <stddef.h> // size_t
+
+// __builtin_memcpy_inline guarantees to never call external functions.
+// Unfortunately it is not widely available.
+#if defined(__clang__) && __has_builtin(__builtin_memcpy_inline)
+#define USE_BUILTIN_MEMCPY_INLINE
+#elif defined(__GNUC__)
+#define USE_BUILTIN_MEMCPY
+#endif
+
+// This is useful for testing.
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict,
+                                         const char *__restrict, size_t);
+#endif
+
+namespace __llvm_libc {
+
+// Copies `kBlockSize` bytes from `src` to `dst`.
+template <size_t kBlockSize>
+static void Copy(char *__restrict dst, const char *__restrict src) {
+#if defined(LLVM_LIBC_MEMCPY_MONITOR)
+  LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY_INLINE)
+  __builtin_memcpy_inline(dst, src, kBlockSize);
+#elif defined(USE_BUILTIN_MEMCPY)
+  __builtin_memcpy(dst, src, kBlockSize);
+#else
+  for (size_t i = 0; i < kBlockSize; ++i)
+    dst[i] = src[i];
+#endif
+}
+
+// Copies `kBlockSize` bytes from `src + count - kBlockSize` to
+// `dst + count - kBlockSize`.
+// Precondition: `count >= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyLastBlock(char *__restrict dst, const char *__restrict src,
+                          size_t count) {
+  const size_t offset = count - kBlockSize;
+  Copy<kBlockSize>(dst + offset, src + offset);
+}
+
+// Copies `kBlockSize` bytes twice with an overlap between the two.
+//
+// [1234567812345678123]
+// [__XXXXXXXXXXXXXX___]
+// [__XXXXXXXX_________]
+// [________XXXXXXXX___]
+//
+// Precondition: `count >= kBlockSize && count <= kBlockSize`.
+template <size_t kBlockSize>
+static void CopyOverlap(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src);
+  CopyLastBlock<kBlockSize>(dst, src, count);
+}
+
+// Copies `count` bytes by blocks of `kBlockSize` bytes.
+// Copies at the start and end of the buffer are unaligned.
+// Copies in the middle of the buffer are aligned to `kBlockSize`.
+//
+// e.g. with
+// [12345678123456781234567812345678]
+// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+// [__XXXXXXXX______________________]
+// [________XXXXXXXX________________]
+// [________________XXXXXXXX________]
+// [_____________________XXXXXXXX___]
+//
+// Precondition: `count > 2 * kBlockSize` for efficiency.
+//               `count >= kBlockSize` for correctness.
+template <size_t kBlockSize>
+static void CopyAligned(char *__restrict dst, const char *__restrict src,
+                        size_t count) {
+  Copy<kBlockSize>(dst, src); // Copy first block
+
+  // Copy aligned blocks
+  size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst);
+  for (; offset + kBlockSize < count; offset += kBlockSize)
+    Copy<kBlockSize>(dst + offset, src + offset);
+
+  CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block
+}
+
+} // namespace __llvm_libc
+
+#endif //  LLVM_LIBC_SRC_MEMORY_UTILS_MEMCPY_UTILS_H
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -43,6 +43,11 @@
   return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1);
 }
 
+template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) {
+  static_assert(is_power2(alignment), "alignment must be a power of 2");
+  return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U);
+}
+
 template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) {
   static_assert(is_power2(alignment), "alignment must be a power of 2");
   // The logic is not straightforward and involves unsigned modulo arithmetic
@@ -51,7 +56,7 @@
 }
 
 // Returns the offset from `ptr` to the next cache line.
-static intptr_t offset_to_next_cache_line(const void *ptr) {
+static inline intptr_t offset_to_next_cache_line(const void *ptr) {
   return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr);
 }
 
diff --git a/libc/src/string/x86_64/CMakeLists.txt b/libc/src/string/x86_64/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/string/x86_64/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(LIBC_MEMCPY_OPT_LEVELS
+    CPU_FEATURE_OPT_NONE
+    CPU_FEATURE_OPT_SSE
+    CPU_FEATURE_OPT_AVX
+    CPU_FEATURE_OPT_AVX512F
+    CACHE INTERNAL LIBC_MEMCPY_OPT_LEVELS
+)
diff --git a/libc/src/string/x86_64/memcpy_arch_specific.h.inc b/libc/src/string/x86_64/memcpy_arch_specific.h.inc
new file mode 100644
--- /dev/null
+++ b/libc/src/string/x86_64/memcpy_arch_specific.h.inc
@@ -0,0 +1,35 @@
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+static void CopyRepMovsb(char *__restrict dst, const char *__restrict src,
+                         size_t count) {
+  // FIXME: Add MSVC suppport with
+  // #include <intrin.h>
+  // __movsb(reinterpret_cast<unsigned char *>(dst),
+  //         reinterpret_cast<const unsigned char *>(src), count);
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+
+#if defined(__AVX__)
+#define BEST_SIZE 64
+#else
+#define BEST_SIZE 32
+#endif
+
+static void CopyGE128(char *__restrict dst, const char *__restrict src,
+                      size_t count) {
+#if defined(__AVX__)
+  if (count < 256)
+    return CopyOverlap<128>(dst, src, count);
+#endif
+  // kRepMovsBSize == -1 : Only CopyAligned is used.
+  // kRepMovsBSize ==  0 : Only RepMovsb is used.
+  // else CopyAligned is used to to kRepMovsBSize and then RepMovsb.
+  constexpr size_t kRepMovsBSize = -1;
+  if (count <= kRepMovsBSize)
+    return CopyAligned<BEST_SIZE>(dst, src, count);
+  CopyRepMovsb(dst, src, count);
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt
--- a/libc/test/src/string/CMakeLists.txt
+++ b/libc/test/src/string/CMakeLists.txt
@@ -22,3 +22,22 @@
   DEPENDS
     strcpy
 )
+
+# Tests all implementations of memcpy.
+foreach(opt_level ${LIBC_MEMCPY_OPT_LEVELS})
+  # FIXME: remove implementations that can't be tested on the host.
+  # This needs a notion of opt_level compatibility.
+  generate_function_name(memcpy ${LIBC_TARGET_MACHINE} ${opt_level} memcpy_config_name)
+  add_libc_unittest(
+    ${memcpy_config_name}_test
+    SUITE
+      libc_string_unittests
+    SRCS
+      memcpy_test.cpp
+    DEPENDS
+      ${memcpy_config_name}
+    COMPILE_OPTIONS
+      ${HOST_CPU_FEATURE_OPT_FLAGS}
+  )
+endforeach()
+
diff --git a/libc/test/src/string/memcpy_test.cpp b/libc/test/src/string/memcpy_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/string/memcpy_test.cpp
@@ -0,0 +1,58 @@
+//===----------------------- Unittests for memcpy -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "utils/CPP/ArrayRef.h"
+#include "utils/UnitTest/Test.h"
+
+namespace __llvm_libc {
+
+void *memcpy(void *__restrict, const void *__restrict, size_t);
+
+} // namespace __llvm_libc
+
+using __llvm_libc::cpp::Array;
+using __llvm_libc::cpp::ArrayRef;
+using __llvm_libc::cpp::MutableArrayRef;
+using Data = Array<char, 2048>;
+
+static const ArrayRef<char> kNumbers("0123456789", 10);
+static const ArrayRef<char> kDeadcode("DEADC0DE", 8);
+
+// Returns a Data object filled with a repetition of `filler`.
+Data getData(ArrayRef<char> filler) {
+  Data out;
+  for (size_t i = 0; i < out.size(); ++i)
+    out[i] = filler[i % filler.size()];
+  return out;
+}
+
+TEST(MemcpyTest, Thorough) {
+  const Data groundtruth = getData(kNumbers);
+  const Data dirty = getData(kDeadcode);
+  for (size_t count = 0; count < 1024; ++count) {
+    for (size_t align = 0; align < 64; ++align) {
+      auto buffer = dirty;
+      const char *const src = groundtruth.data();
+      char *const dst = &buffer[align];
+      __llvm_libc::memcpy(dst, src, count);
+      // Everything before copy is untouched.
+      for (size_t i = 0; i < align; ++i)
+        ASSERT_EQ(buffer[i], dirty[i]);
+      // Everything in between is copied.
+      for (size_t i = 0; i < count; ++i)
+        ASSERT_EQ(buffer[align + i], groundtruth[i]);
+      // Everything after copy is untouched.
+      for (size_t i = align + count; i < dirty.size(); ++i)
+        ASSERT_EQ(buffer[i], dirty[i]);
+    }
+  }
+}
+
+// FIXME: Add tests with reads and writes on the boundary of a read/write
+// protected page to check we're not reading nor writing prior/past the allowed
+// regions.
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -4,7 +4,14 @@
     libc_string_unittests
   SRCS
     utils_test.cpp
+    memcpy_utils_test.cpp
   DEPENDS
     memory_utils
     standalone_cpp
 )
+
+target_compile_definitions(
+  utils_test
+  PRIVATE
+  LLVM_LIBC_MEMCPY_MONITOR=memcpy_monitor
+)
diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp
@@ -0,0 +1,208 @@
+//===-------------------- Unittests for memory_utils ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memory_utils/memcpy_utils.h"
+#include "utils/CPP/Array.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdint.h> // uintptr_t
+
+#ifndef LLVM_LIBC_MEMCPY_MONITOR
+#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test.
+#endif
+
+namespace __llvm_libc {
+
+struct Buffer {
+  static constexpr size_t kMaxBuffer = 1024;
+  char buffer[kMaxBuffer + 1];
+  size_t last = 0;
+
+  void Clear() {
+    last = 0;
+    for (size_t i = 0; i < kMaxBuffer; ++i)
+      buffer[i] = '0';
+    buffer[kMaxBuffer] = '\0';
+  }
+
+  void Increment(const void *ptr) {
+    const auto offset = reinterpret_cast<uintptr_t>(ptr);
+    assert(offset < kMaxBuffer);
+    ++buffer[offset];
+    if (offset > last)
+      last = offset;
+  }
+
+  char *Finish() {
+    assert(last < kMaxBuffer);
+    buffer[last + 1] = '\0';
+    return buffer;
+  }
+};
+
+struct Trace {
+  Buffer read;
+  Buffer write;
+
+  void Add(char *__restrict dst, const char *__restrict src, size_t count) {
+    for (size_t i = 0; i < count; ++i)
+      read.Increment(src + i);
+    for (size_t i = 0; i < count; ++i)
+      write.Increment(dst + i);
+  }
+
+  void Clear() {
+    read.Clear();
+    write.Clear();
+  }
+
+  char *Read() { return read.Finish(); }
+  char *Write() { return write.Finish(); }
+};
+
+static Trace &GetTrace() {
+  static thread_local Trace events;
+  return events;
+}
+
+extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst,
+                                         const char *__restrict src,
+                                         size_t count) {
+  GetTrace().Add(dst, src, count);
+}
+
+char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); }
+
+TEST(MemcpyUtilsTest, CopyTrivial) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  Copy<1>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1");
+  EXPECT_STREQ(trace.Read(), "1");
+
+  trace.Clear();
+  Copy<2>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11");
+  EXPECT_STREQ(trace.Read(), "11");
+
+  trace.Clear();
+  Copy<4>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  Copy<8>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111");
+  EXPECT_STREQ(trace.Read(), "11111111");
+
+  trace.Clear();
+  Copy<16>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "1111111111111111");
+  EXPECT_STREQ(trace.Read(), "1111111111111111");
+
+  trace.Clear();
+  Copy<32>(I(0), I(0));
+  EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111");
+  EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111");
+
+  trace.Clear();
+  Copy<64>(I(0), I(0));
+  EXPECT_STREQ(
+      trace.Write(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+  EXPECT_STREQ(
+      trace.Read(),
+      "1111111111111111111111111111111111111111111111111111111111111111");
+}
+
+TEST(MemcpyUtilsTest, CopyOffset) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  Copy<1>(I(3), I(1));
+  EXPECT_STREQ(trace.Write(), "0001");
+  EXPECT_STREQ(trace.Read(), "01");
+
+  trace.Clear();
+  Copy<1>(I(2), I(1));
+  EXPECT_STREQ(trace.Write(), "001");
+  EXPECT_STREQ(trace.Read(), "01");
+}
+
+TEST(MemcpyUtilsTest, CopyOverlap) {
+  auto &trace = GetTrace();
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 2);
+  EXPECT_STREQ(trace.Write(), "22");
+  EXPECT_STREQ(trace.Read(), "22");
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 3);
+  EXPECT_STREQ(trace.Write(), "121");
+  EXPECT_STREQ(trace.Read(), "121");
+
+  trace.Clear();
+  CopyOverlap<2>(I(0), I(0), 4);
+  EXPECT_STREQ(trace.Write(), "1111");
+  EXPECT_STREQ(trace.Read(), "1111");
+
+  trace.Clear();
+  CopyOverlap<4>(I(2), I(1), 7);
+  EXPECT_STREQ(trace.Write(), "001112111");
+  EXPECT_STREQ(trace.Read(), "01112111");
+}
+
+TEST(MemcpyUtilsTest, CopyAligned) {
+  auto &trace = GetTrace();
+  // Destination is aligned already.
+  //   "1111000000000"
+  // + "0000111100000"
+  // + "0000000011110"
+  // + "0000000001111"
+  // = "1111111112221"
+  trace.Clear();
+  CopyAligned<4>(I(0), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "1111111112221");
+  EXPECT_STREQ(trace.Read(), "1111111112221");
+
+  // Misaligned destination
+  //   "01111000000000"
+  // + "00001111000000"
+  // + "00000000111100"
+  // + "00000000001111"
+  // = "01112111112211"
+  trace.Clear();
+  CopyAligned<4>(I(1), I(0), 13);
+  EXPECT_STREQ(trace.Write(), "01112111112211");
+  EXPECT_STREQ(trace.Read(), "1112111112211");
+}
+
+TEST(MemcpyUtilsTest, MaxReloads) {
+  auto &trace = GetTrace();
+  for (size_t alignment = 0; alignment < 32; ++alignment) {
+    for (size_t count = 64; count < 768; ++count) {
+      trace.Clear();
+      // We should never reload more than twice when copying from count = 2x32.
+      CopyAligned<32>(I(alignment), I(0), count);
+      const char *const written = trace.Write();
+      // First bytes are untouched.
+      for (size_t i = 0; i < alignment; ++i)
+        EXPECT_EQ(written[i], '0');
+      // Next bytes are loaded once or twice but no more.
+      for (size_t i = alignment; i < count; ++i) {
+        EXPECT_GE(written[i], '1');
+        EXPECT_LE(written[i], '2');
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -87,6 +87,14 @@
   EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
 }
 
+TEST(UtilsTest, OffsetFromLastAligned) {
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
+  EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
+}
+
 TEST(UtilsTest, OffsetToNextCacheLine) {
   EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
   EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));