diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt
--- a/libc/src/string/CMakeLists.txt
+++ b/libc/src/string/CMakeLists.txt
@@ -81,6 +81,9 @@
 if(${LIBC_TARGET_MACHINE} STREQUAL "x86_64")
   set(LIBC_STRING_TARGET_ARCH "x86")
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/x86/memcpy.cpp)
+elseif(${LIBC_TARGET_MACHINE} STREQUAL "aarch64")
+  set(LIBC_STRING_TARGET_ARCH "aarch64")
+  set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/aarch64/memcpy.cpp)
 else()
   set(LIBC_STRING_TARGET_ARCH ${LIBC_TARGET_MACHINE})
   set(MEMCPY_SRC ${LIBC_SOURCE_DIR}/src/string/memcpy.cpp)
diff --git a/libc/src/string/aarch64/CMakeLists.txt b/libc/src/string/aarch64/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/libc/src/string/aarch64/CMakeLists.txt
@@ -0,0 +1 @@
+add_memcpy("memcpy_${LIBC_TARGET_MACHINE}")
diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp
new file mode 100644
--- /dev/null
+++ b/libc/src/string/aarch64/memcpy.cpp
@@ -0,0 +1,113 @@
+//===-- Implementation of memcpy ------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/string/memcpy.h"
+#include "src/__support/common.h"
+#include "src/string/memory_utils/memcpy_utils.h"
+
+namespace __llvm_libc {
+
+// Design rationale
+// ================
+//
+// Using a profiler to observe size distributions for calls into libc
+// functions, it was found that most operations act on a small number of
+// bytes. This makes it important to favor small sizes.
+//
+// We use __builtin_expect to tell the compiler to favor the small-size
+// paths, since branching overhead hurts most where it is large relative to
+// the total cost of the copy, i.e. for small counts.
+//
+// The function is written in C++ for several reasons:
+// - The compiler can __see__ the code, which is useful when performing
+//   Profile Guided Optimization, as the optimized code can take advantage of
+//   branching probabilities.
+// - It allows for easier customization and makes it easy to experiment with
+//   multiple implementation parameters.
+// - As compilers and processors get better, the generated code improves with
+//   little or no change on the source side.
+static void memcpy_aarch64(char *__restrict dst, const char *__restrict src,
+                           size_t count) {
+  char *dst_m = dst + count;
+  const char *src_m = src + count;
+  if (__builtin_expect(count < 128, 1)) {
+    if (__builtin_expect(count > 32, 0)) {
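+      // 33..127 bytes: copy the first and last 32 bytes, which may overlap,
+      // then up to two more 32-byte blocks in the middle. This covers the
+      // whole range without a length-dependent loop.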
+      CopyBlock<32>(dst, src);
+      CopyBlock<32>(dst_m - 32, src_m - 32);
+      if (__builtin_expect(count > 64, 0)) {
+        CopyBlock<32>(dst + 32, src + 32);
+        if (__builtin_expect(count > 96, 0)) {
+          CopyBlock<32>(dst + 64, src + 64);
+        }
+      }
+      return;
+    } else if (__builtin_expect(count < 16, 1)) {
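+      // 0..15 bytes: bit 3 of count selects the 8..15 range and bit 2 the
+      // 4..7 range; each is handled with two fixed-size copies (head and
+      // tail) that may overlap. Only 0..3 need individual cases.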
+      if (__builtin_expect((count & 0x8) != 0, 0)) {
+        CopyBlock<8>(dst, src);
+        return CopyBlock<8>(dst_m - 8, src_m - 8);
+      } else if (__builtin_expect((count & 0x4) != 0, 0)) {
+        CopyBlock<4>(dst, src);
+        return CopyBlock<4>(dst_m - 4, src_m - 4);
+      } else {
+        if (count == 0)
+          return;
+        if (count == 1)
+          return CopyBlock<1>(dst, src);
+        if (count == 2)
+          return CopyBlock<2>(dst, src);
+        if (count == 3)
+          return CopyBlock<3>(dst, src);
+      }
+    } else {
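+      // 16..32 bytes: copy the first and last 16 bytes; they overlap
+      // whenever count < 32.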
+      CopyBlock<16>(dst, src);
+      return CopyBlock<16>(dst_m - 16, src_m - 16);
+    }
+  }
+  // Large copy
+  // Copy the first 16 bytes, then realign so the rest of the copy runs from
+  // a 16-byte aligned pointer.
+  CopyBlock<16>(dst, src);
+
+  // Align subsequent copies to either the source or the destination,
+  // whichever performs best on the target. The default aligns to the source;
+  // define 'ALIGN_DST' to align to the destination instead.
+#if ALIGN_DST
+#define ALIGN_SRCDST dst
+#else
+#define ALIGN_SRCDST src
+#endif
+  size_t misalign = ((intptr_t)ALIGN_SRCDST) % 16;
+  dst -= misalign;
+  src -= misalign;
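+  // For example, if ALIGN_SRCDST % 16 == 5, then misalign == 5 and rewinding
+  // both pointers by 5 makes the chosen pointer 16-byte aligned; the few
+  // bytes re-copied by the next block were already written by the initial
+  // CopyBlock<16> above.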
+
+  // Copy 64 bytes starting at offset 16 of the now-aligned src/dst.
+  CopyBlock<32>(dst + 16, src + 16);
+  CopyBlock<32>(dst + 48, src + 48);
+
+  // Since the last 64 bytes are copied unconditionally below and we have
+  // already copied 16 + 64 - misalign bytes, the loop only has to cover the
+  // remaining count - 64 - (16 + 64 - misalign) = count - (144 - misalign)
+  // bytes. Since this difference may be negative we must use a signed type
+  // and comparison.
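+  // For example, with count == 200 and misalign == 5: count2 == 61, so the
+  // loop runs once and copies bytes [80, 144) of the realigned pointers, and
+  // the final CopyBlock<64> covers the remaining tail (with a small overlap).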
+  int64_t count2 = count - (144 - misalign);
+  while (count2 > 0) {
+    CopyBlock<32>(dst + 80, src + 80);
+    CopyBlock<32>(dst + 112, src + 112);
+    count2 -= 64;
+    dst += 64;
+    src += 64;
+  }
+  // Copy the last 64 bytes.
+  return CopyBlock<64>(dst_m - 64, src_m - 64);
+}
+
+void *LLVM_LIBC_ENTRYPOINT(memcpy)(void *__restrict dst,
+                                   const void *__restrict src, size_t size) {
+  memcpy_aarch64(reinterpret_cast<char *>(dst),
+                 reinterpret_cast<const char *>(src), size);
+  return dst;
+}
+
+} // namespace __llvm_libc