diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp
--- a/libc/src/stdio/printf_core/string_writer.cpp
+++ b/libc/src/stdio/printf_core/string_writer.cpp
@@ -33,7 +33,7 @@
     len = available_capacity;
   if (len > 0) {
-    inline_memset(cur_buffer, new_char, len);
+    inline_memset(cur_buffer, static_cast<uint8_t>(new_char), len);
     cur_buffer += len;
     available_capacity -= len;
   }
diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp
--- a/libc/src/string/bcmp.cpp
+++ b/libc/src/string/bcmp.cpp
@@ -14,8 +14,8 @@
 LLVM_LIBC_FUNCTION(int, bcmp,
                    (const void *lhs, const void *rhs, size_t count)) {
-  return inline_bcmp(static_cast<const char *>(lhs),
-                     static_cast<const char *>(rhs), count);
+  return static_cast<int>(inline_bcmp(static_cast<const char *>(lhs),
+                                      static_cast<const char *>(rhs), count));
 }

 } // namespace __llvm_libc
diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp
--- a/libc/src/string/memcmp.cpp
+++ b/libc/src/string/memcmp.cpp
@@ -15,8 +15,8 @@
 LLVM_LIBC_FUNCTION(int, memcmp,
                    (const void *lhs, const void *rhs, size_t count)) {
-  return inline_memcmp(static_cast<const char *>(lhs),
-                       static_cast<const char *>(rhs), count);
+  return static_cast<int>(inline_memcmp(static_cast<const char *>(lhs),
+                                        static_cast<const char *>(rhs), count));
 }

 } // namespace __llvm_libc
diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp
--- a/libc/src/string/memmove.cpp
+++ b/libc/src/string/memmove.cpp
@@ -9,36 +9,52 @@
 #include "src/string/memmove.h"

 #include "src/__support/common.h"
-#include "src/__support/integer_operations.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
 #include <stddef.h> // size_t, ptrdiff_t
+#include <stdint.h>
+
 namespace __llvm_libc {

 static inline void inline_memmove(char *dst, const char *src, size_t count) {
-  using namespace __llvm_libc::scalar;
+#if defined(LLVM_LIBC_ARCH_X86)
+  static constexpr size_t kMaxSize = x86::kAvx512F ? 64
+                                     : x86::kAvx   ? 32
+                                     : x86::kSse2  ? 16
+                                                   : 8;
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8;
+#else
+  static constexpr size_t kMaxSize = 8;
+#endif
   if (count == 0)
     return;
   if (count == 1)
-    return move<_1>(dst, src);
+    return generic::Memmove<1, kMaxSize>::block(dst, src);
   if (count <= 4)
-    return move<HeadTail<_2>>(dst, src, count);
+    return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count);
   if (count <= 8)
-    return move<HeadTail<_4>>(dst, src, count);
+    return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count);
   if (count <= 16)
-    return move<HeadTail<_8>>(dst, src, count);
+    return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count);
   if (count <= 32)
-    return move<HeadTail<_16>>(dst, src, count);
+    return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count);
   if (count <= 64)
-    return move<HeadTail<_32>>(dst, src, count);
+    return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count);
   if (count <= 128)
-    return move<HeadTail<_64>>(dst, src, count);
+    return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count);

-  using AlignedMoveLoop = Align<_16, Arg::Src>::Then<Loop<_64>>;
-  if (dst < src)
-    return move<AlignedMoveLoop>(dst, src, count);
-  else if (dst > src)
-    return move_backward<AlignedMoveLoop>(dst, src, count);
+  if (dst < src) {
+    generic::Memmove<32, kMaxSize>::align_forward(dst, src, count);
+    return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src,
+                                                                 count);
+  } else {
+    generic::Memmove<32, kMaxSize>::align_backward(dst, src, count);
+    return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src,
+                                                                  count);
+  }
 }

 LLVM_LIBC_FUNCTION(void *, memmove,
diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,13 +2,17 @@
 add_header_library(
   memory_utils
   HDRS
-    utils.h
-    elements.h
     bcmp_implementations.h
     bzero_implementations.h
     memcmp_implementations.h
     memcpy_implementations.h
     memset_implementations.h
+    op_aarch64.h
+    op_higher_order.h
+    op_builtin.h
+    op_generic.h
+    op_x86.h
+    utils.h
   DEPS
     libc.src.__support.CPP.bit
 )
diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/README.md
@@ -0,0 +1,97 @@
+# The mem* framework
+
+The framework handles the following mem* functions:
+ - `memcpy`
+ - `memmove`
+ - `memset`
+ - `bzero`
+ - `bcmp`
+ - `memcmp`
+
+## Building blocks
+
+These functions can be built out of a set of lower-level operations:
+ - **`block`** : operates on a block of `SIZE` bytes.
+ - **`tail`** : operates on the last `SIZE` bytes of the buffer (i.e., `[dst + count - SIZE, dst + count]`).
+ - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
+ - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handles the remaining bytes with a `tail` operation.
+
+As an illustration, let's take the example of a trivial `memset` implementation:
+
+```C++
+extern "C" void memset(char* dst, int value, size_t count) {
+  if (count == 0) return;
+  if (count == 1) return Memset<1>::block(dst, value);
+  if (count == 2) return Memset<2>::block(dst, value);
+  if (count == 3) return Memset<3>::block(dst, value);
+  if (count <= 8) return Memset<4>::head_tail(dst, value, count);  // Note that 0 to 4 bytes are written twice.
+  if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
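+  // Above 16 bytes, block() is applied to successive 16-byte chunks and a
+  // final, possibly overlapping, tail() operation covers the remainder.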
+  return Memset<16>::loop_and_tail(dst, value, count);
+}
+```
+
+Now let's have a look into the `Memset` structure:
+
+```C++
+template <size_t Size>
+struct Memset {
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t value) {
+    // Implement me
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+```
+
+As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other. Only `block` really needs to be implemented.
+In earlier designs these higher-order functions were implemented as function templates, but it turned out to be more readable to state the implementations explicitly.
+**This design is useful because it provides customization points.** For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
+
+## Scoped specializations
+
+We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation.
+
+In the following example we use the `generic` implementation for the small sizes but the `x86` implementation for the loop.
+```C++
+extern "C" void memset(char* dst, int value, size_t count) {
+  if (count == 0) return;
+  if (count == 1) return generic::Memset<1>::block(dst, value);
+  if (count == 2) return generic::Memset<2>::block(dst, value);
+  if (count == 3) return generic::Memset<3>::block(dst, value);
+  if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
+  if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
+  return x86::Memset<16>::loop_and_tail(dst, value, count);
+}
+```
+
+### The `builtin` scope
+
+Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)).
+
+### The `generic` scope
+
+In this scope we define pure C++ implementations using native integral types and Clang vector extensions.
+
+### The arch-specific scopes
+
+Then come implementations that use architecture- or microarchitecture-specific features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).
+
+The purpose here is to rely on builtins as much as possible and fall back to `asm volatile` as a last resort.
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -11,49 +11,132 @@
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"

 #include <stddef.h> // size_t

 namespace __llvm_libc {

-// Fixed-size difference between 'lhs' and 'rhs'.
-template <typename Element> bool differs(const char *lhs, const char *rhs) {
-  return !Element::equals(lhs, rhs);
-}
-// Runtime-size difference between 'lhs' and 'rhs'.
-template <typename Element>
-bool differs(const char *lhs, const char *rhs, size_t size) {
-  return !Element::equals(lhs, rhs, size);
+static inline BcmpReturnType inline_bcmp_generic_gt16(CPtr p1, CPtr p2,
+                                                      size_t count) {
+  if (count < 256)
+    return generic::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = generic::Bcmp<64>::block(p1, p2))
+    return value;
+  align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  return generic::Bcmp<64>::loop_and_tail(p1, p2, count);
 }

-static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) {
 #if defined(LLVM_LIBC_ARCH_X86)
-  using namespace ::__llvm_libc::x86;
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  using namespace ::__llvm_libc::aarch64;
+static inline BcmpReturnType inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
+                                                       size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count < 256)
+    return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count);
+  if (auto value = x86::sse2::Bcmp<16>::block(p1, p2))
+    return value;
+  align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+static inline BcmpReturnType inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
+                                                       size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx2::Bcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 256)) {
+    if (auto value = x86::avx2::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+
+static inline BcmpReturnType inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
+                                                           size_t count) {
+  if (count <= 32)
+    return x86::sse2::Bcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Bcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 256)) {
+    if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) {
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+  if (likely(count <= 32)) {
+    if (unlikely(count >= 16)) {
+      return generic::Bcmp<16>::head_tail(p1, p2, count);
+    }
+    switch (count) {
+    case 0:
+      return BcmpReturnType::ZERO();
+    case 1:
+      return generic::Bcmp<1>::block(p1, p2);
+    case 2:
+      return generic::Bcmp<2>::block(p1, p2);
+    case 3:
+      return generic::Bcmp<2>::head_tail(p1, p2, count);
+    case 4:
+      return generic::Bcmp<4>::block(p1, p2);
+    case 5 ... 7:
+      return generic::Bcmp<4>::head_tail(p1, p2, count);
+    case 8:
+      return generic::Bcmp<8>::block(p1, p2);
+    case 9 ... 15:
+      return generic::Bcmp<8>::head_tail(p1, p2, count);
+    }
+  }
+
+  if (count <= 64)
+    return generic::Bcmp<32>::head_tail(p1, p2, count);
+
+  // Aligned loop if > 256, otherwise normal loop
+  if (count > 256) {
+    if (auto value = generic::Bcmp<32>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return generic::Bcmp<32>::loop_and_tail(p1, p2, count);
 #else
-  using namespace ::__llvm_libc::scalar;
-#endif
   if (count == 0)
-    return 0;
+    return BcmpReturnType::ZERO();
   if (count == 1)
-    return differs<_1>(lhs, rhs);
+    return generic::Bcmp<1>::block(p1, p2);
   if (count == 2)
-    return differs<_2>(lhs, rhs);
-  if (count == 3)
-    return differs<_3>(lhs, rhs);
+    return generic::Bcmp<2>::block(p1, p2);
+  if (count <= 4)
+    return generic::Bcmp<2>::head_tail(p1, p2, count);
   if (count <= 8)
-    return differs<HeadTail<_4>>(lhs, rhs, count);
+    return generic::Bcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return differs<HeadTail<_8>>(lhs, rhs, count);
-  if (count <= 32)
-    return differs<HeadTail<_16>>(lhs, rhs, count);
-  if (count <= 64)
-    return differs<HeadTail<_32>>(lhs, rhs, count);
-  if (count <= 128)
-    return differs<HeadTail<_64>>(lhs, rhs, count);
-  return differs<Align<_32, Arg::Lhs>::Then<Loop<_32>>>(lhs, rhs, count);
+    return generic::Bcmp<8>::head_tail(p1, p2, count);
+#if defined(LLVM_LIBC_ARCH_X86)
+  if constexpr (x86::kAvx512BW)
+    return inline_bcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_bcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_bcmp_x86_sse2_gt16(p1, p2, count);
+  else
+    return inline_bcmp_generic_gt16(p1, p2, count);
+#else
+  return inline_bcmp_generic_gt16(p1, p2, count);
+#endif
+#endif
 }

 } // namespace __llvm_libc
diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h
deleted file mode 100644
--- a/libc/src/string/memory_utils/elements.h
+++ /dev/null
@@ -1,774 +0,0 @@
-//===-- Elementary operations to compose memory primitives ----------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
-
-#include <stddef.h> // size_t
-#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
-
-#include "src/__support/endian.h"
-#include "src/string/memory_utils/utils.h"
-
-namespace __llvm_libc {
-
-// Elementary Operations
-// --------------------------------
-// We define abstract elementary operations acting on fixed chunks of memory.
-// These are low level building blocks that are meant to be assembled to
-// compose higher order abstractions. Each function is defined twice: once
-// with fixed-size operations, and once with runtime-size operations.
-
-// Fixed-size copy from 'src' to 'dst'.
-template <typename Element>
-void copy(char *__restrict dst, const char *__restrict src) {
-  Element::copy(dst, src);
-}
-// Runtime-size copy from 'src' to 'dst'.
-template <typename Element>
-void copy(char *__restrict dst, const char *__restrict src, size_t size) {
-  Element::copy(dst, src, size);
-}
-
-// Fixed-size move from 'src' to 'dst'.
-template <typename Element> void move(char *dst, const char *src) {
-  Element::move(dst, src);
-}
-// Runtime-size move from 'src' to 'dst'.
-template <typename Element>
-void move(char *dst, const char *src, size_t size) {
-  Element::move(dst, src, size);
-}
-// Runtime-size move from 'src' to 'dst'.
-template <typename Element>
-void move_backward(char *dst, const char *src, size_t size) {
-  Element::move_backward(dst, src, size);
-}
-
-// Fixed-size equality between 'lhs' and 'rhs'.
-template <typename Element> bool equals(const char *lhs, const char *rhs) {
-  return Element::equals(lhs, rhs);
-}
-// Runtime-size equality between 'lhs' and 'rhs'.
-template <typename Element>
-bool equals(const char *lhs, const char *rhs, size_t size) {
-  return Element::equals(lhs, rhs, size);
-}
-
-// Fixed-size three-way comparison between 'lhs' and 'rhs'.
-template <typename Element>
-int three_way_compare(const char *lhs, const char *rhs) {
-  return Element::three_way_compare(lhs, rhs);
-}
-// Runtime-size three-way comparison between 'lhs' and 'rhs'.
-template <typename Element>
-int three_way_compare(const char *lhs, const char *rhs, size_t size) {
-  return Element::three_way_compare(lhs, rhs, size);
-}
-
-// Fixed-size initialization.
-template <typename Element>
-void splat_set(char *dst, const unsigned char value) {
-  Element::splat_set(dst, value);
-}
-// Runtime-size initialization.
-template <typename Element>
-void splat_set(char *dst, const unsigned char value, size_t size) {
-  Element::splat_set(dst, value, size);
-}
-
-// Stack placeholder for Move operations.
-template <typename Element> struct Storage { char bytes[Element::SIZE]; };
-
-// Fixed-size Higher-Order Operations
-// ----------------------------------
-// - Repeated: Repeat the operation several times in a row.
-// - Chained: Chain the operation of several types.
-
-// Repeat the operation several times in a row.
-template <typename Element, size_t ElementCount> struct Repeated {
-  static constexpr size_t SIZE = ElementCount * Element::SIZE;
-
-  static void copy(char *__restrict dst, const char *__restrict src) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::SIZE;
-      Element::copy(dst + offset, src + offset);
-    }
-  }
-
-  static void move(char *dst, const char *src) {
-    const auto value = load(src);
-    store(dst, value);
-  }
-
-  static bool equals(const char *lhs, const char *rhs) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::SIZE;
-      if (!Element::equals(lhs + offset, rhs + offset))
-        return false;
-    }
-    return true;
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::SIZE;
-      // We make the assumption that 'equals' is cheaper than
-      // 'three_way_compare'.
-      if (Element::equals(lhs + offset, rhs + offset))
-        continue;
-      return Element::three_way_compare(lhs + offset, rhs + offset);
-    }
-    return 0;
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    for (size_t i = 0; i < ElementCount; ++i) {
-      const size_t offset = i * Element::SIZE;
-      Element::splat_set(dst + offset, value);
-    }
-  }
-
-  static Storage<Repeated> load(const char *ptr) {
-    Storage<Repeated> value;
-    copy(reinterpret_cast<char *>(&value), ptr);
-    return value;
-  }
-
-  static void store(char *ptr, Storage<Repeated> value) {
-    copy(ptr, reinterpret_cast<const char *>(&value));
-  }
-};
-
-template <typename Element> struct Repeated<Element, 0> {
-  static void move(char *, const char *) {}
-};
-
-// Chain the operation of several types.
-// For instance, to handle a 3 bytes operation, one can use:
-// Chained<UINT16, UINT8>::Operation();
-template <typename... Types> struct Chained;
-
-template <typename Head, typename... Tail> struct Chained<Head, Tail...> {
-  static constexpr size_t SIZE = Head::SIZE + Chained<Tail...>::SIZE;
-
-  static void copy(char *__restrict dst, const char *__restrict src) {
-    Chained<Tail...>::copy(dst + Head::SIZE, src + Head::SIZE);
-    __llvm_libc::copy<Head>(dst, src);
-  }
-
-  static void move(char *dst, const char *src) {
-    const auto value = Head::load(src);
-    Chained<Tail...>::move(dst + Head::SIZE, src + Head::SIZE);
-    Head::store(dst, value);
-  }
-
-  static bool equals(const char *lhs, const char *rhs) {
-    if (!__llvm_libc::equals<Head>(lhs, rhs))
-      return false;
-    return Chained<Tail...>::equals(lhs + Head::SIZE, rhs + Head::SIZE);
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    if (__llvm_libc::equals<Head>(lhs, rhs))
-      return Chained<Tail...>::three_way_compare(lhs + Head::SIZE,
-                                                 rhs + Head::SIZE);
-    return __llvm_libc::three_way_compare<Head>(lhs, rhs);
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    Chained<Tail...>::splat_set(dst + Head::SIZE, value);
-    __llvm_libc::splat_set<Head>(dst, value);
-  }
-};
-
-template <> struct Chained<> {
-  static constexpr size_t SIZE = 0;
-  static void copy(char *__restrict, const char *__restrict) {}
-  static void move(char *, const char *) {}
-  static bool equals(const char *, const char *) { return true; }
-  static int three_way_compare(const char *, const char *) { return 0; }
-  static void splat_set(char *, const unsigned char) {}
-};
-
-// Overlap ElementA and ElementB so they span Size bytes.
-template <size_t Size, typename ElementA, typename ElementB = ElementA>
-struct Overlap {
-  static constexpr size_t SIZE = Size;
-  static_assert(ElementB::SIZE <= ElementA::SIZE, "ElementB too big");
-  static_assert(ElementA::SIZE <= Size, "ElementA too big");
-  static_assert((ElementA::SIZE + ElementB::SIZE) >= Size,
-                "Elements too small to overlap");
-  static constexpr size_t OFFSET = SIZE - ElementB::SIZE;
-
-  static void copy(char *__restrict dst, const char *__restrict src) {
-    ElementA::copy(dst, src);
-    ElementB::copy(dst + OFFSET, src + OFFSET);
-  }
-
-  static void move(char *dst, const char *src) {
-    const auto value_a = ElementA::load(src);
-    const auto value_b = ElementB::load(src + OFFSET);
-    ElementB::store(dst + OFFSET, value_b);
-    ElementA::store(dst, value_a);
-  }
-
-  static bool equals(const char *lhs, const char *rhs) {
-    if (!ElementA::equals(lhs, rhs))
-      return false;
-    if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET))
-      return false;
-    return true;
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    if (!ElementA::equals(lhs, rhs))
-      return ElementA::three_way_compare(lhs, rhs);
-    if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET))
-      return ElementB::three_way_compare(lhs + OFFSET, rhs + OFFSET);
-    return 0;
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    ElementA::splat_set(dst, value);
-    ElementB::splat_set(dst + OFFSET, value);
-  }
-};
-
-// Runtime-size Higher-Order Operations
-// ------------------------------------
-// - Tail: Perform the operation on the last 'T::SIZE' bytes of the buffer.
-// - HeadTail: Perform the operation on the first and last 'T::SIZE' bytes
-//   of the buffer.
-// - Loop: Perform a loop of fixed-sized operations.
-
-// Perform the operation on the last 'T::SIZE' bytes of the buffer.
-//
-// e.g. with
-// [1234567812345678123]
-// [__XXXXXXXXXXXXXX___]
-// [________XXXXXXXX___]
-//
-// Precondition: `size >= T::SIZE`.
-template <typename T> struct Tail {
-  static void copy(char *__restrict dst, const char *__restrict src,
-                   size_t size) {
-    return T::copy(dst + offset(size), src + offset(size));
-  }
-
-  static bool equals(const char *lhs, const char *rhs, size_t size) {
-    return T::equals(lhs + offset(size), rhs + offset(size));
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
-    return T::three_way_compare(lhs + offset(size), rhs + offset(size));
-  }
-
-  static void splat_set(char *dst, const unsigned char value, size_t size) {
-    return T::splat_set(dst + offset(size), value);
-  }
-
-  static size_t offset(size_t size) { return size - T::SIZE; }
-};
-
-// Perform the operation on the first and last 'T::SIZE' bytes of the buffer.
-// This is useful for overlapping operations.
-//
-// e.g. with
-// [1234567812345678123]
-// [__XXXXXXXXXXXXXX___]
-// [__XXXXXXXX_________]
-// [________XXXXXXXX___]
-//
-// Precondition: `size >= T::SIZE && size <= 2 x T::SIZE`.
-template <typename T> struct HeadTail {
-  static void copy(char *__restrict dst, const char *__restrict src,
-                   size_t size) {
-    T::copy(dst, src);
-    Tail<T>::copy(dst, src, size);
-  }
-
-  static void move(char *dst, const char *src, size_t size) {
-    const size_t offset = Tail<T>::offset(size);
-    const auto head_value = T::load(src);
-    const auto tail_value = T::load(src + offset);
-    T::store(dst + offset, tail_value);
-    T::store(dst, head_value);
-  }
-
-  static bool equals(const char *lhs, const char *rhs, size_t size) {
-    if (!T::equals(lhs, rhs))
-      return false;
-    return Tail<T>::equals(lhs, rhs, size);
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
-    if (!T::equals(lhs, rhs))
-      return T::three_way_compare(lhs, rhs);
-    return Tail<T>::three_way_compare(lhs, rhs, size);
-  }
-
-  static void splat_set(char *dst, const unsigned char value, size_t size) {
-    T::splat_set(dst, value);
-    Tail<T>::splat_set(dst, value, size);
-  }
-};
-
-// Simple loop ending with a Tail operation.
-//
-// e.g. with
-// [12345678123456781234567812345678]
-// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
-// [__XXXXXXXX_______________________]
-// [__________XXXXXXXX_______________]
-// [__________________XXXXXXXX_______]
-// [______________________XXXXXXXX___]
-//
-// Precondition:
-// - size >= T::SIZE
-template <typename T, typename TailT = T> struct Loop {
-  static_assert(T::SIZE == TailT::SIZE,
-                "Tail type must have the same size as T");
-
-  static void copy(char *__restrict dst, const char *__restrict src,
-                   size_t size) {
-    size_t offset = 0;
-    do {
-      T::copy(dst + offset, src + offset);
-      offset += T::SIZE;
-    } while (offset < size - T::SIZE);
-    Tail<TailT>::copy(dst, src, size);
-  }
-
-  // Move forward suitable when dst < src. We load the tail bytes before
-  // handling the loop.
-  //
-  // e.g. Moving two bytes
-  // [   |   |   |   |   |]
-  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
-  // [_________________________LLLLLLLL___]
-  // [___LLLLLLLL_________________________]
-  // [_SSSSSSSS___________________________]
-  // [___________LLLLLLLL_________________]
-  // [_________SSSSSSSS___________________]
-  // [___________________LLLLLLLL_________]
-  // [_________________SSSSSSSS___________]
-  // [_______________________SSSSSSSS_____]
-  static void move(char *dst, const char *src, size_t size) {
-    const size_t tail_offset = Tail<TailT>::offset(size);
-    const auto tail_value = TailT::load(src + tail_offset);
-    size_t offset = 0;
-    do {
-      T::move(dst + offset, src + offset);
-      offset += T::SIZE;
-    } while (offset < size - T::SIZE);
-    TailT::store(dst + tail_offset, tail_value);
-  }
-
-  // Move forward suitable when dst > src. We load the head bytes before
-  // handling the loop.
-  //
-  // e.g. Moving two bytes
-  // [   |   |   |   |   |]
-  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
-  // [___LLLLLLLL_________________________]
-  // [_________________________LLLLLLLL___]
-  // [___________________________SSSSSSSS_]
-  // [_________________LLLLLLLL___________]
-  // [___________________SSSSSSSS_________]
-  // [_________LLLLLLLL___________________]
-  // [___________SSSSSSSS_________________]
-  // [_____SSSSSSSS_______________________]
-  static void move_backward(char *dst, const char *src, size_t size) {
-    const auto head_value = TailT::load(src);
-    ptrdiff_t offset = size - T::SIZE;
-    do {
-      T::move(dst + offset, src + offset);
-      offset -= T::SIZE;
-    } while (offset >= 0);
-    TailT::store(dst, head_value);
-  }
-
-  static bool equals(const char *lhs, const char *rhs, size_t size) {
-    size_t offset = 0;
-    do {
-      if (!T::equals(lhs + offset, rhs + offset))
-        return false;
-      offset += T::SIZE;
-    } while (offset < size - T::SIZE);
-    return Tail<TailT>::equals(lhs, rhs, size);
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs, size_t size) {
-    size_t offset = 0;
-    do {
-      if (!T::equals(lhs + offset, rhs + offset))
-        return T::three_way_compare(lhs + offset, rhs + offset);
-      offset += T::SIZE;
-    } while (offset < size - T::SIZE);
-    return Tail<TailT>::three_way_compare(lhs, rhs, size);
-  }
-
-  static void splat_set(char *dst, const unsigned char value, size_t size) {
-    size_t offset = 0;
-    do {
-      T::splat_set(dst + offset, value);
-      offset += T::SIZE;
-    } while (offset < size - T::SIZE);
-    Tail<TailT>::splat_set(dst, value, size);
-  }
-};
-
-namespace internal {
-
-template <Arg arg> struct ArgSelector {};
-
-template <> struct ArgSelector<Arg::_1> {
-  template <typename T1, typename T2>
-  static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &) {
-    return p1ref;
-  }
-};
-
-template <> struct ArgSelector<Arg::_2> {
-  template <typename T1, typename T2>
-  static T2 *__restrict &Select(T1 *__restrict &, T2 *__restrict &p2ref) {
-    return p2ref;
-  }
-};
-
-// Provides a specialized bump function that adjusts pointers and size so first
-// argument (resp. second argument) gets aligned to Alignment.
-// We make sure the compiler knows about the adjusted pointer alignment.
-// The 'additional_bumps' parameter allows to reach previous / next aligned
-// pointers.
-template <size_t Alignment, Arg AlignOn> struct Align {
-  template <typename T1, typename T2>
-  static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size,
-                   int additional_bumps = 0) {
-    auto &aligned_ptr = ArgSelector<AlignOn>::Select(p1ref, p2ref);
-    auto offset = offset_to_next_aligned<Alignment>(aligned_ptr);
-    adjust(offset + additional_bumps * Alignment, p1ref, p2ref, size);
-    aligned_ptr = assume_aligned<Alignment>(aligned_ptr);
-  }
-};
-
-} // namespace internal
-
-// An alignment operation that:
-// - executes the 'AlignmentT' operation
-// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected
-//   pointer gets aligned, size is decreased accordingly.
-// - calls the 'NextT' operation.
-//
-// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as:
-// copy<Align<_16, Arg::Dst>::Then<Loop<_32>>>(dst, src, count);
-template <typename AlignmentT, Arg AlignOn = Arg::_1> struct Align {
-private:
-  static constexpr size_t ALIGNMENT = AlignmentT::SIZE;
-  static_assert(ALIGNMENT > 1, "Alignment must be more than 1");
-  static_assert(is_power2(ALIGNMENT), "Alignment must be a power of 2");
-
-public:
-  template <typename NextT> struct Then {
-    static void copy(char *__restrict dst, const char *__restrict src,
-                     size_t size) {
-      AlignmentT::copy(dst, src);
-      internal::Align<ALIGNMENT, AlignOn>::bump(dst, src, size);
-      NextT::copy(dst, src, size);
-    }
-
-    // Move forward suitable when dst < src. The alignment is performed with
-    // an HeadTail operation of size ∈ [Alignment, 2 x Alignment].
-    //
-    // e.g. Moving two bytes and making sure src is then aligned.
-    // [  |  |  |  |  ]
-    // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
-    // [____LLLLLLLL_____________________]
-    // [___________LLLLLLLL______________]
-    // [_SSSSSSSS________________________]
-    // [________SSSSSSSS_________________]
-    //
-    // e.g. Moving two bytes and making sure dst is then aligned.
-    // [  |  |  |  |  ]
-    // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
-    // [____LLLLLLLL_____________________]
-    // [______LLLLLLLL___________________]
-    // [_SSSSSSSS________________________]
-    // [___SSSSSSSS______________________]
-    static void move(char *dst, const char *src, size_t size) {
-      char *next_dst = dst;
-      const char *next_src = src;
-      size_t next_size = size;
-      internal::Align<ALIGNMENT, AlignOn>::bump(next_dst, next_src, next_size,
-                                                1);
-      HeadTail<AlignmentT>::move(dst, src, size - next_size);
-      NextT::move(next_dst, next_src, next_size);
-    }
-
-    // Move backward suitable when dst > src. The alignment is performed with
-    // an HeadTail operation of size ∈ [Alignment, 2 x Alignment].
-    //
-    // e.g. Moving two bytes backward and making sure src is then aligned.
-    // [  |  |  |  |  ]
-    // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
-    // [_________________LLLLLLLL_______]
-    // [___________________LLLLLLLL_____]
-    // [____________________SSSSSSSS_____]
-    // [______________________SSSSSSSS___]
-    //
-    // e.g. Moving two bytes and making sure dst is then aligned.
-    // [  |  |  |  |  ]
-    // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
-    // [_______________LLLLLLLL_________]
-    // [___________________LLLLLLLL_____]
-    // [__________________SSSSSSSS_______]
-    // [______________________SSSSSSSS___]
-    static void move_backward(char *dst, const char *src, size_t size) {
-      char *headtail_dst = dst + size;
-      const char *headtail_src = src + size;
-      size_t headtail_size = 0;
-      internal::Align<ALIGNMENT, AlignOn>::bump(headtail_dst, headtail_src,
-                                                headtail_size, -2);
-      HeadTail<AlignmentT>::move(headtail_dst, headtail_src, headtail_size);
-      NextT::move_backward(dst, src, size - headtail_size);
-    }
-
-    static bool equals(const char *lhs, const char *rhs, size_t size) {
-      if (!AlignmentT::equals(lhs, rhs))
-        return false;
-      internal::Align<ALIGNMENT, AlignOn>::bump(lhs, rhs, size);
-      return NextT::equals(lhs, rhs, size);
-    }
-
-    static int three_way_compare(const char *lhs, const char *rhs,
-                                 size_t size) {
-      if (!AlignmentT::equals(lhs, rhs))
-        return AlignmentT::three_way_compare(lhs, rhs);
-      internal::Align<ALIGNMENT, AlignOn>::bump(lhs, rhs, size);
-      return NextT::three_way_compare(lhs, rhs, size);
-    }
-
-    static void splat_set(char *dst, const unsigned char value, size_t size) {
-      AlignmentT::splat_set(dst, value);
-      char *dummy = nullptr;
-      internal::Align<ALIGNMENT, Arg::_1>::bump(dst, dummy, size);
-      NextT::splat_set(dst, value, size);
-    }
-  };
-};
-
-// An operation that allows to skip the specified amount of bytes.
-template <size_t Bytes> struct Skip {
-  template <typename NextT> struct Then {
-    static void copy(char *__restrict dst, const char *__restrict src,
-                     size_t size) {
-      NextT::copy(dst + Bytes, src + Bytes, size - Bytes);
-    }
-
-    static void copy(char *__restrict dst, const char *__restrict src) {
-      NextT::copy(dst + Bytes, src + Bytes);
-    }
-
-    static bool equals(const char *lhs, const char *rhs, size_t size) {
-      return NextT::equals(lhs + Bytes, rhs + Bytes, size - Bytes);
-    }
-
-    static bool equals(const char *lhs, const char *rhs) {
-      return NextT::equals(lhs + Bytes, rhs + Bytes);
-    }
-
-    static int three_way_compare(const char *lhs, const char *rhs,
-                                 size_t size) {
-      return NextT::three_way_compare(lhs + Bytes, rhs + Bytes, size - Bytes);
-    }
-
-    static int three_way_compare(const char *lhs, const char *rhs) {
-      return NextT::three_way_compare(lhs + Bytes, rhs + Bytes);
-    }
-
-    static void splat_set(char *dst, const unsigned char value, size_t size) {
-      NextT::splat_set(dst + Bytes, value, size - Bytes);
-    }
-
-    static void splat_set(char *dst, const unsigned char value) {
-      NextT::splat_set(dst + Bytes, value);
-    }
-  };
-};
-
-// Fixed-size Builtin Operations
-// -----------------------------
-// Note: Do not use 'builtin' right now as it requires the implementation of
-// the `_inline` versions of all the builtins. Theoretically, Clang can still
-// turn them into calls to the C library leading to reentrancy problems.
-namespace builtin {
-
-#ifndef __has_builtin
-#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
-#endif
-
-template <size_t Size> struct Builtin {
-  static constexpr size_t SIZE = Size;
-
-  static void copy(char *__restrict dst, const char *__restrict src) {
-#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
-    for_loop_copy(dst, src);
-#elif __has_builtin(__builtin_memcpy_inline)
-    // __builtin_memcpy_inline guarantees to never call external functions.
-    // Unfortunately it is not widely available.
-    __builtin_memcpy_inline(dst, src, SIZE);
-#else
-    for_loop_copy(dst, src);
-#endif
-  }
-
-  static void move(char *dst, const char *src) {
-#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER
-    for_loop_move(dst, src);
-#elif __has_builtin(__builtin_memmove)
-    __builtin_memmove(dst, src, SIZE);
-#else
-    for_loop_move(dst, src);
-#endif
-  }
-
-#if __has_builtin(__builtin_memcmp_inline)
-#define LLVM_LIBC_MEMCMP __builtin_memcmp_inline
-#else
-#define LLVM_LIBC_MEMCMP __builtin_memcmp
-#endif
-
-  static bool equals(const char *lhs, const char *rhs) {
-    return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE) == 0;
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE);
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    __builtin_memset(dst, value, SIZE);
-  }
-
-private:
-  // Copies `SIZE` bytes from `src` to `dst` using a for loop.
-  // This code requires the use of `-fno-builtin-memcpy` to prevent the
-  // compiler from turning the for-loop back into `__builtin_memcpy`.
-  static void for_loop_copy(char *__restrict dst, const char *__restrict src) {
-    for (size_t i = 0; i < SIZE; ++i)
-      dst[i] = src[i];
-  }
-
-  static void for_loop_move(char *dst, const char *src) {
-    for (size_t i = 0; i < SIZE; ++i)
-      dst[i] = src[i];
-  }
-};
-
-using _1 = Builtin<1>;
-using _2 = Builtin<2>;
-using _3 = Builtin<3>;
-using _4 = Builtin<4>;
-using _8 = Builtin<8>;
-using _16 = Builtin<16>;
-using _32 = Builtin<32>;
-using _64 = Builtin<64>;
-using _128 = Builtin<128>;
-
-} // namespace builtin
-
-// Fixed-size Scalar Operations
-// ----------------------------
-namespace scalar {
-
-// The Scalar type makes use of simple sized integers.
-template <typename T> struct Scalar {
-  static constexpr size_t SIZE = sizeof(T);
-
-  static void copy(char *__restrict dst, const char *__restrict src) {
-    store(dst, load(src));
-  }
-
-  static void move(char *dst, const char *src) { store(dst, load(src)); }
-
-  static bool equals(const char *lhs, const char *rhs) {
-    return load(lhs) == load(rhs);
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    return scalar_three_way_compare(load(lhs), load(rhs));
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    store(dst, get_splatted_value(value));
-  }
-
-  static int scalar_three_way_compare(T a, T b);
-
-  static T load(const char *ptr) {
-    T value;
-    builtin::Builtin<SIZE>::copy(reinterpret_cast<char *>(&value), ptr);
-    return value;
-  }
-  static void store(char *ptr, T value) {
-    builtin::Builtin<SIZE>::copy(ptr, reinterpret_cast<const char *>(&value));
-  }
-
-private:
-  static T get_splatted_value(const unsigned char value) {
-    return T(~0) / T(0xFF) * T(value);
-  }
-};
-
-template <>
-inline int Scalar<uint8_t>::scalar_three_way_compare(uint8_t a, uint8_t b) {
-  const int16_t la = Endian::to_big_endian(a);
-  const int16_t lb = Endian::to_big_endian(b);
-  return la - lb;
-}
-template <>
-inline int Scalar<uint16_t>::scalar_three_way_compare(uint16_t a, uint16_t b) {
-  const int32_t la = Endian::to_big_endian(a);
-  const int32_t lb = Endian::to_big_endian(b);
-  return la - lb;
-}
-template <>
-inline int Scalar<uint32_t>::scalar_three_way_compare(uint32_t a, uint32_t b) {
-  const uint32_t la = Endian::to_big_endian(a);
-  const uint32_t lb = Endian::to_big_endian(b);
-  return la > lb ? 1 : la < lb ? -1 : 0;
-}
-template <>
-inline int Scalar<uint64_t>::scalar_three_way_compare(uint64_t a, uint64_t b) {
-  const uint64_t la = Endian::to_big_endian(a);
-  const uint64_t lb = Endian::to_big_endian(b);
-  return la > lb ? 1 : la < lb ? -1 : 0;
-}
-
-using UINT8 = Scalar<uint8_t>;   // 1 Byte
-using UINT16 = Scalar<uint16_t>; // 2 Bytes
-using UINT32 = Scalar<uint32_t>; // 4 Bytes
-using UINT64 = Scalar<uint64_t>; // 8 Bytes
-
-using _1 = UINT8;
-using _2 = UINT16;
-using _3 = Chained<UINT16, UINT8>;
-using _4 = UINT32;
-using _8 = UINT64;
-using _16 = Repeated<_8, 2>;
-using _32 = Repeated<_8, 4>;
-using _64 = Repeated<_8, 8>;
-using _128 = Repeated<_8, 16>;
-
-} // namespace scalar
-} // namespace __llvm_libc
-
-#include <src/string/memory_utils/elements_aarch64.h>
-#include <src/string/memory_utils/elements_x86.h>
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H
diff --git a/libc/src/string/memory_utils/elements_aarch64.h b/libc/src/string/memory_utils/elements_aarch64.h
deleted file mode 100644
--- a/libc/src/string/memory_utils/elements_aarch64.h
+++ /dev/null
@@ -1,130 +0,0 @@
-//===-- Elementary operations for aarch64 --------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
-
-#include "src/__support/architectures.h"
-
-#if defined(LLVM_LIBC_ARCH_AARCH64)
-
-#include <arm_acle.h>
-#include <stddef.h> // size_t
-#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
-
-#ifdef __ARM_NEON
-#include <arm_neon.h>
-#endif
-
-namespace __llvm_libc {
-namespace aarch64_memset {
-#ifdef __ARM_NEON
-struct Splat8 {
-  static constexpr size_t SIZE = 8;
-  static void splat_set(char *dst, const unsigned char value) {
-    vst1_u8((uint8_t *)dst, vdup_n_u8(value));
-  }
-};
-
-struct Splat16 {
-  static constexpr size_t SIZE = 16;
-  static void splat_set(char *dst, const unsigned char value) {
-    vst1q_u8((uint8_t *)dst, vdupq_n_u8(value));
-  }
-};
-
-using _8 = Splat8;
-using _16 = Splat16;
-#else
-using _8 = __llvm_libc::scalar::_8;
-using _16 = Repeated<_8, 2>;
-#endif // __ARM_NEON
-
-using _1 = __llvm_libc::scalar::_1;
-using _2 = __llvm_libc::scalar::_2;
-using _3 = __llvm_libc::scalar::_3;
-using _4 = __llvm_libc::scalar::_4;
-using _32 = Chained<_16, _16>;
-using _64 = Chained<_32, _32>;
-
-struct Zva64 {
-  static constexpr size_t SIZE = 64;
-
-  static void splat_set(char *dst, const unsigned char) {
-#if __SIZEOF_POINTER__ == 4
-    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
-#else
-    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
-#endif
-  }
-};
-
-inline static bool hasZva() {
-  uint64_t zva_val;
-  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
-  // DC ZVA is permitted if DZP, bit [4] is zero.
-  // BS, bits [3:0] is log2 of the block size in words.
-  // So the next line checks whether the instruction is permitted and block
-  // size is 16 words (i.e. 64 bytes).
-  return (zva_val & 0b11111) == 0b00100;
-}
-
-} // namespace aarch64_memset
-
-namespace aarch64 {
-
-using _1 = __llvm_libc::scalar::_1;
-using _2 = __llvm_libc::scalar::_2;
-using _3 = __llvm_libc::scalar::_3;
-using _4 = __llvm_libc::scalar::_4;
-using _8 = __llvm_libc::scalar::_8;
-using _16 = __llvm_libc::scalar::_16;
-
-#ifdef __ARM_NEON
-struct N32 {
-  static constexpr size_t SIZE = 32;
-  static bool equals(const char *lhs, const char *rhs) {
-    uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
-    uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
-    uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
-    uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
-    uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
-    uint64_t res =
-        vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
-    return res == 0;
-  }
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs);
-    uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs);
-    uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16));
-    uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16));
-    uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1));
-    uint64_t res =
-        vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0);
-    if (res == 0)
-      return 0;
-    size_t index = (__builtin_ctzl(res) >> 3) << 2;
-    uint32_t l = *((const uint32_t *)(lhs + index));
-    uint32_t r = *((const uint32_t *)(rhs + index));
-    return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r);
-  }
-};
-
-using _32 = N32;
-using _64 = Repeated<_32, 2>;
-#else
-using _32 = __llvm_libc::scalar::_32;
-using _64 = __llvm_libc::scalar::_64;
-#endif // __ARM_NEON
-
-} // namespace aarch64
-} // namespace __llvm_libc
-
-#endif // defined(LLVM_LIBC_ARCH_AARCH64)
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H
diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h
deleted file mode 100644
--- a/libc/src/string/memory_utils/elements_x86.h
+++ /dev/null
@@ -1,189 +0,0 @@
-//===-- Elementary operations for x86 -------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
-#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
-
-#include "src/__support/CPP/bit.h"
-#include "src/__support/architectures.h"
-
-#if defined(LLVM_LIBC_ARCH_X86)
-
-#include <stddef.h> // size_t
-#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t
-
-#ifdef __SSE2__
-#include <immintrin.h>
-#endif // __SSE2__
-
-#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar
-
-// Fixed-size Vector Operations
-// ----------------------------
-
-namespace __llvm_libc {
-namespace x86 {
-
-#ifdef __SSE2__
-template <typename Base> struct Vector : public Base {
-  static void copy(char *__restrict dst, const char *__restrict src) {
-    Base::store(dst, Base::load(src));
-  }
-
-  static void move(char *dst, const char *src) {
-    Base::store(dst, Base::load(src));
-  }
-
-  static bool equals(const char *a, const char *b) {
-    return Base::not_equal_mask(Base::load(a), Base::load(b)) == 0;
-  }
-
-  static int three_way_compare(const char *a, const char *b) {
-    const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b));
-    if (!mask)
-      return 0;
-    return char_diff(a, b, mask);
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    Base::store(dst, Base::get_splatted_value(value));
-  }
-
-  static int char_diff(const char *a, const char *b, uint64_t mask) {
-    const size_t diff_index = __builtin_ctzll(mask);
-    const int ca = (unsigned char)a[diff_index];
-    const int cb = (unsigned char)b[diff_index];
-    return ca - cb;
-  }
-};
-
-struct M128 {
-  static constexpr size_t SIZE = 16;
-  using T = char __attribute__((__vector_size__(SIZE)));
-  static uint16_t mask(T value) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return static_cast<uint16_t>(
-        _mm_movemask_epi8(cpp::bit_cast<__m128i>(value)));
-  }
-  static uint16_t not_equal_mask(T a, T b) { return mask(a != b); }
-  static T load(const char *ptr) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return cpp::bit_cast<T>(
-        _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr)));
-  }
-  static void store(char *ptr, T value) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr),
-                            cpp::bit_cast<__m128i>(value));
-  }
-  static T get_splatted_value(const char v) {
-    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
-    return splatted;
-  }
-};
-
-using Vector128 = Vector<M128>; // 16 Bytes
-
-#ifdef __AVX2__
-struct M256 {
-  static constexpr size_t SIZE = 32;
-  using T = char __attribute__((__vector_size__(SIZE)));
-  static uint32_t mask(T value) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return _mm256_movemask_epi8(cpp::bit_cast<__m256i>(value));
-  }
-  static uint32_t not_equal_mask(T a, T b) { return mask(a != b); }
-  static T load(const char *ptr) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return cpp::bit_cast<T>(
-        _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr)));
-  }
-  static void store(char *ptr, T value) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr),
-                               cpp::bit_cast<__m256i>(value));
-  }
-  static T get_splatted_value(const char v) {
-    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
-    return splatted;
-  }
-};
-
-using Vector256 = Vector<M256>; // 32 Bytes
-
-#if defined(__AVX512F__) and defined(__AVX512BW__)
-struct M512 {
-  static constexpr size_t SIZE = 64;
-  using T = char __attribute__((__vector_size__(SIZE)));
-  static uint64_t not_equal_mask(T a, T b) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(a),
-                                   cpp::bit_cast<__m512i>(b));
-  }
-  static T load(const char *ptr) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return cpp::bit_cast<T>(_mm512_loadu_epi8(ptr));
-  }
-  static void store(char *ptr, T value) {
-    // NOLINTNEXTLINE(llvmlibc-callee-namespace)
-    return _mm512_storeu_epi8(ptr, cpp::bit_cast<__m512i>(value));
-  }
-  static T get_splatted_value(const char v) {
-    const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v,
-                        v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v};
-    return splatted;
-  }
-};
-using Vector512 = Vector<M512>;
-
-#endif // defined(__AVX512F__) and defined(__AVX512BW__)
-#endif // __AVX2__
-#endif // __SSE2__
-
-using _1 = __llvm_libc::scalar::_1;
-using _2 = __llvm_libc::scalar::_2;
-using _3 = __llvm_libc::scalar::_3;
-using _4 = __llvm_libc::scalar::_4;
-using _8 = __llvm_libc::scalar::_8;
-#if defined(__AVX512F__) && defined(__AVX512BW__)
-using _16 = __llvm_libc::x86::Vector128;
-using _32 = __llvm_libc::x86::Vector256;
-using _64 = __llvm_libc::x86::Vector512;
-using _128 = __llvm_libc::Repeated<_64, 2>;
-#elif defined(__AVX2__)
-using _16 = __llvm_libc::x86::Vector128;
-using _32 = __llvm_libc::x86::Vector256;
-using _64 = __llvm_libc::Repeated<_32, 2>;
-using _128 = __llvm_libc::Repeated<_32, 4>;
-#elif defined(__SSE2__)
-using _16 = __llvm_libc::x86::Vector128;
-using _32 = __llvm_libc::Repeated<_16, 2>;
-using _64 = __llvm_libc::Repeated<_16, 4>;
-using _128 = __llvm_libc::Repeated<_16, 8>;
-#else
-using _16 = __llvm_libc::Repeated<_8, 2>;
-using _32 = __llvm_libc::Repeated<_8, 4>;
-using _64 = __llvm_libc::Repeated<_8, 8>;
-using _128 = __llvm_libc::Repeated<_8, 16>;
-#endif
-
-struct Accelerator {
-  static void copy(char *dst, const char *src, size_t count) {
-    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
-  }
-};
-
-} // namespace x86
-} // namespace __llvm_libc
-
-#endif // defined(LLVM_LIBC_ARCH_X86)
-
-#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -11,92 +11,120 @@
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"

 #include <stddef.h> // size_t

 namespace __llvm_libc {

-static inline int inline_memcmp(const char *lhs, const char *rhs,
-                                size_t count) {
+static inline MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2,
+                                                          size_t count) {
+  if (unlikely(count >= 384)) {
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return generic::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+
 #if defined(LLVM_LIBC_ARCH_X86)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_X86
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace __llvm_libc::x86;
-  if (count == 0)
-    return 0;
-  if (count == 1)
-    return three_way_compare<_1>(lhs, rhs);
-  if (count == 2)
-    return three_way_compare<_2>(lhs, rhs);
-  if (count == 3)
-    return three_way_compare<_3>(lhs, rhs);
-  if (count <= 8)
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
-  if (count <= 16)
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
+static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2,
+                                                           size_t count) {
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::sse2::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+  }
+  return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count);
+}
+
+static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2,
+                                                           size_t count) {
   if (count <= 32)
-    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
+    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
   if (count <= 64)
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
+    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
   if (count <= 128)
-    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
-  return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
-#elif defined(LLVM_LIBC_ARCH_AARCH64)
-  /////////////////////////////////////////////////////////////////////////////
-  // LLVM_LIBC_ARCH_AARCH64
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::aarch64;
-  if (count == 0) // [0, 0]
-    return 0;
-  if (count == 1) // [1, 1]
-    return three_way_compare<_1>(lhs, rhs);
-  if (count == 2) // [2, 2]
-    return three_way_compare<_2>(lhs, rhs);
-  if (count == 3) // [3, 3]
-    return three_way_compare<_3>(lhs, rhs);
-  if (count < 8) // [4, 7]
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
-  if (count < 16) // [8, 15]
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
-  if (unlikely(count >= 128)) // [128, ∞]
-    return three_way_compare<Align<_16>::Then<Loop<_32>>>(lhs, rhs, count);
-  if (!equals<_16>(lhs, rhs)) // [16, 16]
-    return three_way_compare<_16>(lhs, rhs);
+    return x86::avx2::Memcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::avx2::Memcmp<32>::block(p1, p2))
+      return value;
+    align_to_next_boundary<32, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count);
+}
+
+static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2,
+                                                               size_t count) {
+  if (count <= 32)
+    return x86::sse2::Memcmp<16>::head_tail(p1, p2, count);
+  if (count <= 64)
+    return x86::avx2::Memcmp<32>::head_tail(p1, p2, count);
+  if (count <= 128)
+    return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count);
+  if (unlikely(count >= 384)) {
+    if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2))
+      return value;
+    align_to_next_boundary<64, Arg::P1>(p1, p2, count);
+  }
+  return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
+}
+#endif // defined(LLVM_LIBC_ARCH_X86)
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+static inline MemcmpReturnType inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2,
+                                                               size_t count) {
+  if (unlikely(count >= 128)) { // [128, ∞]
+    if (auto value = generic::Memcmp<16>::block(p1, p2))
+      return value;
+    align_to_next_boundary<16, Arg::P1>(p1, p2, count);
+    return generic::Memcmp<32>::loop_and_tail(p1, p2, count);
+  }
   if (count < 32) // [17, 31]
-    return three_way_compare<Tail<_16>>(lhs, rhs, count);
-  if (!equals<Skip<16>::Then<_16>>(lhs, rhs)) // [32, 32]
-    return three_way_compare<Skip<16>::Then<_16>>(lhs, rhs);
+    return generic::Memcmp<16>::tail(p1, p2, count);
+  if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32]
+    return generic::Memcmp<16>::block(p1 + 16, p2 + 16);
   if (count < 64) // [33, 63]
-    return three_way_compare<Tail<_32>>(lhs, rhs, count);
+    return generic::Memcmp<32>::tail(p1, p2, count);
   // [64, 127]
-  return three_way_compare<Skip<32>::Then<Loop<_16>>>(lhs, rhs, count);
-#else
-  /////////////////////////////////////////////////////////////////////////////
-  // Default
-  /////////////////////////////////////////////////////////////////////////////
-  using namespace ::__llvm_libc::scalar;
+  return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32);
+}
+#endif // defined(LLVM_LIBC_ARCH_AARCH64)

+static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) {
   if (count == 0)
-    return 0;
+    return MemcmpReturnType::ZERO();
   if (count == 1)
-    return three_way_compare<_1>(lhs, rhs);
+    return generic::Memcmp<1>::block(p1, p2);
   if (count == 2)
-    return three_way_compare<_2>(lhs, rhs);
+    return generic::Memcmp<2>::block(p1, p2);
   if (count == 3)
-    return three_way_compare<_3>(lhs, rhs);
+    return generic::Memcmp<3>::block(p1, p2);
   if (count <= 8)
-    return three_way_compare<HeadTail<_4>>(lhs, rhs, count);
+    return generic::Memcmp<4>::head_tail(p1, p2, count);
   if (count <= 16)
-    return three_way_compare<HeadTail<_8>>(lhs, rhs, count);
-  if (count <= 32)
-    return three_way_compare<HeadTail<_16>>(lhs, rhs, count);
-  if (count <= 64)
-    return three_way_compare<HeadTail<_32>>(lhs, rhs, count);
-  if (count <= 128)
-    return three_way_compare<HeadTail<_64>>(lhs, rhs, count);
-  return three_way_compare<Align<_8>::Then<Loop<_8>>>(lhs, rhs, count);
+    return generic::Memcmp<8>::head_tail(p1, p2, count);
+#if defined(LLVM_LIBC_ARCH_X86)
+  if constexpr (x86::kAvx512BW)
+    return inline_memcmp_x86_avx512bw_gt16(p1, p2, count);
+  else if constexpr (x86::kAvx2)
+    return inline_memcmp_x86_avx2_gt16(p1, p2, count);
+  else if constexpr (x86::kSse2)
+    return inline_memcmp_x86_sse2_gt16(p1, p2, count);
+  else
+    return inline_memcmp_generic_gt16(p1, p2, count);
+#elif defined(LLVM_LIBC_ARCH_AARCH64)
+  /////////////////////////////////////////////////////////////////////////////
+  // LLVM_LIBC_ARCH_AARCH64
+  /////////////////////////////////////////////////////////////////////////////
+  if constexpr (aarch64::kNeon)
+    return inline_memcmp_aarch64_neon_gt16(p1, p2, count);
+  else
+    return inline_memcmp_generic_gt16(p1, p2, count);
+#else
+  return inline_memcmp_generic_gt16(p1, p2, count);
 #endif
 }
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -11,7 +11,8 @@
 #include "src/__support/architectures.h"
 #include "src/__support/common.h"
-#include "src/string/memory_utils/elements.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_x86.h"
 #include "src/string/memory_utils/utils.h"

 #include <stddef.h> // size_t

@@ -59,44 +60,39 @@
       -1;
 #endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE

-  // Whether target supports AVX instructions.
-  constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__);
-
-#if defined(__AVX__)
-  using LoopBlockSize = _64;
-#else
-  using LoopBlockSize = _32;
-#endif
+  static constexpr size_t LoopBlockSize = x86::kAvx ? 64 : 32;
   if (USE_ONLY_REP_MOVSB)
-    return copy<Accelerator>(dst, src, count);
+    return x86::Memcpy::repmovsb(dst, src, count);

   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  if (HAS_AVX && count < 256)
-    return copy<HeadTail<_128>>(dst, src, count);
-  if (count <= REP_MOVS_B_SIZE)
-    return copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src,
-                                                                 count);
-  return copy<Accelerator>(dst, src, count);
+    return Memcpy<64>::head_tail(dst, src, count);
+  if (x86::kAvx && count < 256)
+    return Memcpy<128>::head_tail(dst, src, count);
+  if (count <= REP_MOVS_B_SIZE) {
+    Memcpy<32>::block(dst, src);
+    align_to_next_boundary<32, Arg::Dst>(dst, src, count);
+    return Memcpy<LoopBlockSize>::loop_and_tail(dst, src, count);
+  }
+  return x86::Memcpy::repmovsb(dst, src, count);
 #elif defined(LLVM_LIBC_ARCH_AARCH64)
   /////////////////////////////////////////////////////////////////////////////
   // LLVM_LIBC_ARCH_AARCH64
@@ -104,24 +100,26 @@
   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  return copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count);
+    return Memcpy<64>::head_tail(dst, src, count);
+  Memcpy<16>::block(dst, src);
+  align_to_next_boundary<16, Arg::Src>(dst, src, count);
+  return Memcpy<64>::loop_and_tail(dst, src, count);
 #else
   /////////////////////////////////////////////////////////////////////////////
   // Default
@@ -129,24 +127,26 @@
   if (count == 0)
     return;
   if (count == 1)
-    return copy<_1>(dst, src);
+    return Memcpy<1>::block(dst, src);
   if (count == 2)
-    return copy<_2>(dst, src);
+    return Memcpy<2>::block(dst, src);
   if (count == 3)
-    return copy<_3>(dst, src);
+    return Memcpy<3>::block(dst, src);
   if (count == 4)
-    return copy<_4>(dst, src);
+    return Memcpy<4>::block(dst, src);
   if (count < 8)
-    return copy<HeadTail<_4>>(dst, src, count);
+    return Memcpy<4>::head_tail(dst, src, count);
   if (count < 16)
-    return copy<HeadTail<_8>>(dst, src, count);
+    return Memcpy<8>::head_tail(dst, src, count);
   if (count < 32)
-    return copy<HeadTail<_16>>(dst, src, count);
+    return Memcpy<16>::head_tail(dst, src, count);
   if (count < 64)
-    return copy<HeadTail<_32>>(dst, src, count);
+    return Memcpy<32>::head_tail(dst, src, count);
   if (count < 128)
-    return copy<HeadTail<_64>>(dst, src, count);
-  return copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count);
count); + return Memcpy<64>::head_tail(dst, src, count); + Memcpy<32>::block(dst, src); + align_to_next_boundary<32, Arg::Src>(dst, src, count); + return Memcpy<32>::loop_and_tail(dst, src, count); #endif } diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -10,7 +10,9 @@ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H #include "src/__support/architectures.h" -#include "src/string/memory_utils/elements.h" +#include "src/string/memory_utils/op_aarch64.h" +#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_x86.h" #include "src/string/memory_utils/utils.h" #include // size_t @@ -48,88 +50,100 @@ // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not // superior for sizes that mattered. -inline static void inline_memset(char *dst, unsigned char value, size_t count) { +inline static void inline_memset(Ptr dst, uint8_t value, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) ///////////////////////////////////////////////////////////////////////////// // LLVM_LIBC_ARCH_X86 ///////////////////////////////////////////////////////////////////////////// - using namespace __llvm_libc::x86; + static constexpr size_t kMaxSize = x86::kAvx512F ? 64 + : x86::kAvx ? 32 + : x86::kSse2 ? 16 + : 8; if (count == 0) return; if (count == 1) - return splat_set<_1>(dst, value); + return generic::Memset<1, kMaxSize>::block(dst, value); if (count == 2) - return splat_set<_2>(dst, value); + return generic::Memset<2, kMaxSize>::block(dst, value); if (count == 3) - return splat_set<_3>(dst, value); + return generic::Memset<3, kMaxSize>::block(dst, value); if (count <= 8) - return splat_set>(dst, value, count); + return generic::Memset<4, kMaxSize>::head_tail(dst, value, count); if (count <= 16) - return splat_set>(dst, value, count); + return generic::Memset<8, kMaxSize>::head_tail(dst, value, count); if (count <= 32) - return splat_set>(dst, value, count); + return generic::Memset<16, kMaxSize>::head_tail(dst, value, count); if (count <= 64) - return splat_set>(dst, value, count); + return generic::Memset<32, kMaxSize>::head_tail(dst, value, count); if (count <= 128) - return splat_set>(dst, value, count); - return splat_set::Then>>(dst, value, count); + return generic::Memset<64, kMaxSize>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<32, kMaxSize>::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count); #elif defined(LLVM_LIBC_ARCH_AARCH64) ///////////////////////////////////////////////////////////////////////////// // LLVM_LIBC_ARCH_AARCH64 ///////////////////////////////////////////////////////////////////////////// - using namespace __llvm_libc::aarch64_memset; + static constexpr size_t kMaxSize = aarch64::kNeon ? 
16 : 8; if (count == 0) return; if (count <= 3) { - splat_set<_1>(dst, value); + generic::Memset<1, kMaxSize>::block(dst, value); if (count > 1) - splat_set>(dst, value, count); + generic::Memset<2, kMaxSize>::tail(dst, value, count); return; } if (count <= 8) - return splat_set>(dst, value, count); + return generic::Memset<4, kMaxSize>::head_tail(dst, value, count); if (count <= 16) - return splat_set>(dst, value, count); + return generic::Memset<8, kMaxSize>::head_tail(dst, value, count); if (count <= 32) - return splat_set>(dst, value, count); + return generic::Memset<16, kMaxSize>::head_tail(dst, value, count); if (count <= (32 + 64)) { - splat_set<_32>(dst, value); + generic::Memset<32, kMaxSize>::block(dst, value); if (count <= 64) - return splat_set>(dst, value, count); - splat_set::Then<_32>>(dst, value); - splat_set>(dst, value, count); + return generic::Memset<32, kMaxSize>::tail(dst, value, count); + generic::Memset<32, kMaxSize>::block(dst + 32, value); + generic::Memset<32, kMaxSize>::tail(dst, value, count); return; } - if (count >= 448 && value == 0 && hasZva()) - return splat_set::Then>>(dst, 0, - count); - else - return splat_set::Then>>(dst, value, count); + if (count >= 448 && value == 0 && aarch64::neon::hasZva()) { + generic::Memset<64, kMaxSize>::block(dst, 0); + align_to_next_boundary<64>(dst, count); + return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count); + } else { + generic::Memset<16, kMaxSize>::block(dst, value); + align_to_next_boundary<16>(dst, count); + return generic::Memset<64, kMaxSize>::loop_and_tail(dst, value, count); + } #else ///////////////////////////////////////////////////////////////////////////// // Default ///////////////////////////////////////////////////////////////////////////// - using namespace ::__llvm_libc::scalar; - + static constexpr size_t kMaxSize = 8; if (count == 0) return; if (count == 1) - return splat_set<_1>(dst, value); + return generic::Memset<1, kMaxSize>::block(dst, value); if (count == 2) - return splat_set<_2>(dst, value); + return generic::Memset<2, kMaxSize>::block(dst, value); if (count == 3) - return splat_set<_3>(dst, value); + return generic::Memset<3, kMaxSize>::block(dst, value); if (count <= 8) - return splat_set>(dst, value, count); + return generic::Memset<4, kMaxSize>::head_tail(dst, value, count); if (count <= 16) - return splat_set>(dst, value, count); + return generic::Memset<8, kMaxSize>::head_tail(dst, value, count); if (count <= 32) - return splat_set>(dst, value, count); + return generic::Memset<16, kMaxSize>::head_tail(dst, value, count); if (count <= 64) - return splat_set>(dst, value, count); + return generic::Memset<32, kMaxSize>::head_tail(dst, value, count); if (count <= 128) - return splat_set>(dst, value, count); - return splat_set::Then>>(dst, value, count); + return generic::Memset<64, kMaxSize>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<32, kMaxSize>::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count); #endif } diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_aarch64.h @@ -0,0 +1,172 @@ +//===-- aarch64 implementation of memory function building blocks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides aarch64 specific building blocks to compose memory +// functions. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_AARCH64) + +#include "src/string/memory_utils/op_generic.h" + +#ifdef __ARM_NEON +#include <arm_neon.h> +#endif //__ARM_NEON + +namespace __llvm_libc::aarch64 { + +static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON); + +namespace neon { + +template <size_t Size> struct BzeroCacheLine { + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t) { + static_assert(Size == 64); +#if __SIZEOF_POINTER__ == 4 + asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); +#else + asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); +#endif + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + // Unaligned store, we can't use 'dc zva' here. + static constexpr size_t kMaxSize = kNeon ? 16 : 8; + generic::Memset<64, kMaxSize>::tail(dst, value, count); + } +}; + +inline static bool hasZva() { + uint64_t zva_val; + asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); + // DC ZVA is permitted if DZP, bit [4] is zero. + // BS, bits [3:0] is log2 of the block count in words. + // So the next line checks whether the instruction is permitted and block + // count is 16 words (i.e. 64 bytes). + return (zva_val & 0b11111) == 0b00100; +} + +} // namespace neon + +/////////////////////////////////////////////////////////////////////////////// +// Memset + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +template <size_t Size> struct Bcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t BlockSize = 32; + + static const unsigned char *as_u8(CPtr ptr) { + return reinterpret_cast<const unsigned char *>(ptr); + } + + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + 16); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + 16); + uint8x16_t an = veorq_u8(a, n); + uint8x16_t bo = veorq_u8(b, o); + // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is + // a difference between the two buffers. We reduce this value down to 4 + // bytes in two steps. First, calculate the saturated move value when + // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get + // a single 32 bit nonzero value if a mismatch occurred.
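The reduction described in the comment above can be sketched in isolation. The following is a hypothetical standalone helper, not part of the patch, assuming an aarch64 target with `<arm_neon.h>`; the diff lines that follow apply the same two steps to the 32-byte case:

```C++
#include <arm_neon.h>
#include <stdint.h>

// Returns nonzero iff the 16 bytes at p1 and p2 differ.
uint32_t is_different16(const uint8_t *p1, const uint8_t *p2) {
  // XOR is nonzero exactly at mismatching byte positions.
  uint8x16_t x = veorq_u8(vld1q_u8(p1), vld1q_u8(p2));
  // Saturating-narrow the two 64-bit lanes to two 32-bit lanes: a nonzero
  // 64-bit lane saturates instead of truncating, so it stays nonzero.
  uint32x2_t reduced = vqmovn_u64(vreinterpretq_u64_u8(x));
  // The max of the two 32-bit lanes yields a single scalar result.
  return vmaxv_u32(reduced);
}
```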
+ uint8x16_t anbo = vorrq_u8(an, bo); + uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); + return vmaxv_u32(anbo_reduced); + } else if constexpr ((Size % BlockSize) == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if constexpr (Size <= 8) { + return generic::Bcmp<Size>::head_tail(p1, p2, count); + } else if constexpr (Size == 16) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + count - 16); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + count - 16); + uint8x16_t an = veorq_u8(a, n); + uint8x16_t bo = veorq_u8(b, o); + // anbo = (a ^ n) | (b ^ o) + uint8x16_t anbo = vorrq_u8(an, bo); + uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo)); + return vmaxv_u32(anbo_reduced); + } else if constexpr (Size == 32) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + 16); + uint8x16_t c = vld1q_u8(_p1 + count - 16); + uint8x16_t d = vld1q_u8(_p1 + count - 32); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + 16); + uint8x16_t p = vld1q_u8(_p2 + count - 16); + uint8x16_t q = vld1q_u8(_p2 + count - 32); + uint8x16_t an = veorq_u8(a, n); + uint8x16_t bo = veorq_u8(b, o); + uint8x16_t cp = veorq_u8(c, p); + uint8x16_t dq = veorq_u8(d, q); + uint8x16_t anbo = vorrq_u8(an, bo); + uint8x16_t cpdq = vorrq_u8(cp, dq); + // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to + // a nonzero 32 bit value if a mismatch occurred. + uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); + uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); + return vmaxv_u32(abnocpdq_reduced); + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +} // namespace __llvm_libc::aarch64 + +#endif // LLVM_LIBC_ARCH_AARCH64 + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_builtin.h @@ -0,0 +1,146 @@ +//===-- Implementation using the __builtin_XXX_inline ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides generic C++ building blocks to compose memory functions. +// They rely on the compiler to generate the best possible code through the use +// of the `__builtin_XXX_inline` builtins. These builtins are currently only +// available in Clang.
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H + +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc::builtin { + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy +template <size_t Size> struct Memcpy { + static constexpr size_t SIZE = Size; + static inline void block(Ptr __restrict dst, CPtr __restrict src) { +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE + return __builtin_memcpy_inline(dst, src, SIZE); +#else + deferred_static_assert("Missing __builtin_memcpy_inline"); + (void)dst; + (void)src; +#endif + } + + static inline void tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + block(dst + count - SIZE, src + count - SIZE); + } + + static inline void head_tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + block(dst, src); + tail(dst, src, count); + } + + static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + size_t offset = 0; + do { + block(dst + offset, src + offset); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, src, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memset +template <size_t Size> struct Memset { + using ME = Memset<Size>; + static constexpr size_t SIZE = Size; + static inline void block(Ptr dst, uint8_t value) { +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE + __builtin_memset_inline(dst, value, Size); +#else + deferred_static_assert("Missing __builtin_memset_inline"); + (void)dst; + (void)value; +#endif + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +template <size_t Size> struct Bcmp { + using ME = Bcmp<Size>; + static constexpr size_t SIZE = Size; + static inline BcmpReturnType block(CPtr, CPtr) { + deferred_static_assert("Missing __builtin_memcmp_inline"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp +template <size_t Size> struct Memcmp { + using ME = Memcmp<Size>; + static constexpr size_t SIZE = Size; + static inline MemcmpReturnType block(CPtr, CPtr) { + deferred_static_assert("Missing __builtin_memcmp_inline"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType
loop_and_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } +}; + +} // namespace __llvm_libc::builtin + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_generic.h @@ -0,0 +1,461 @@ +//===-- Generic implementation of memory function building blocks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides generic C++ building blocks. +// Depending on the requested size, the block operation uses unsigned integral +// types, vector types or an array of the type with the maximum size. +// +// The maximum size is passed as a template argument. For instance, on x86 +// platforms that only support integral types the maximum size would be 8 +// (corresponding to uint64_t). On this platform if we request the size 32, this +// would be treated as a cpp::array<uint64_t, 4>. +// +// On the other hand, if the platform is x86 with support for AVX the maximum +// size is 32 and the operation can be handled with a single native operation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/endian.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/utils.h" + +#include <stdint.h> + +namespace __llvm_libc::generic { + +// CTPair and CTMap below implement a compile time map. +// This is useful to map from a Size to a type handling this size. +// +// Example usage: +// using MyMap = CTMap<CTPair<1, uint8_t>, +// CTPair<2, uint16_t>, +// >; +// ... +// using UInt8T = MyMap::find_type<1>; +template <size_t I, typename T> struct CTPair { + using type = T; + static CTPair get_pair(cpp::integral_constant<size_t, I>) { return {}; } +}; +template <typename... Pairs> struct CTMap : public Pairs... { + using Pairs::get_pair...; + template <size_t I> + using find_type = + typename decltype(get_pair(cpp::integral_constant<size_t, I>{}))::type; +}; + +// Helper to test if a type is void. +template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>; + +// Implements load, store and splat for unsigned integral types. +template <typename T> struct ScalarType { + using Type = T; + static_assert(cpp::is_integral_v<T> && !cpp::is_signed_v<T>); + + static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); } + static inline void store(Ptr dst, Type value) { + ::__llvm_libc::store<Type>(dst, value); + } + static inline Type splat(uint8_t value) { + return Type(~0) / Type(0xFF) * Type(value); + } +}; + +// Implements load, store and splat for vector types. +template <size_t Size> struct VectorType { + using Type = uint8_t __attribute__((__vector_size__(Size))); + static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); } + static inline void store(Ptr dst, Type value) { + ::__llvm_libc::store<Type>(dst, value); + } + static inline Type splat(uint8_t value) { + Type Out; + // This for loop is optimized out for vector types.
+ for (size_t i = 0; i < Size; ++i) + Out[i] = static_cast<uint8_t>(value); + return Out; + } +}; + +// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit. +static_assert((UINTPTR_MAX == 4294967295U) || + (UINTPTR_MAX == 18446744073709551615UL)); + +// Map from sizes to structures offering static load, store and splat methods. +// Note: On platforms lacking vector support, we use the ArrayType below and +// decompose the operation into smaller pieces. +using NativeTypeMap = + CTMap<CTPair<1, ScalarType<uint8_t>>, // + CTPair<2, ScalarType<uint16_t>>, // + CTPair<4, ScalarType<uint32_t>>, // +#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) + CTPair<8, ScalarType<uint64_t>>, // Not available on 32bit +#endif // + CTPair<16, VectorType<16>>, // + CTPair<32, VectorType<32>>, // + CTPair<64, VectorType<64>>>; + +// Implements load, store and splat for sizes not natively supported by the +// platform. SubType is either ScalarType or VectorType. +template <typename SubType, size_t ArraySize> struct ArrayType { + using Type = cpp::array<typename SubType::Type, ArraySize>; + static constexpr size_t SizeOfElement = sizeof(typename SubType::Type); + static inline Type load(CPtr src) { + Type Value; + for (size_t I = 0; I < ArraySize; ++I) + Value[I] = SubType::load(src + (I * SizeOfElement)); + return Value; + } + static inline void store(Ptr dst, Type Value) { + for (size_t I = 0; I < ArraySize; ++I) + SubType::store(dst + (I * SizeOfElement), Value[I]); + } + static inline Type splat(uint8_t value) { + Type Out; + for (size_t I = 0; I < ArraySize; ++I) + Out[I] = SubType::splat(value); + return Out; + } +}; + +// Checks whether we should use an ArrayType. +template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() { + return (Size > MaxSize) && ((Size % MaxSize) == 0) && + !is_void_v<NativeTypeMap::find_type<MaxSize>>; +} + +// Compute the type to handle an operation of Size bytes knowing that the +// underlying platform only supports native types up to MaxSize bytes. +template <size_t Size, size_t MaxSize> +using getTypeFor = cpp::conditional_t< + useArrayType<Size, MaxSize>(), + ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>, + NativeTypeMap::find_type<Size>>; + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy +// When building with clang we can delegate to the builtin implementation. +/////////////////////////////////////////////////////////////////////////////// + +template <size_t Size> using Memcpy = builtin::Memcpy<Size>; + +/////////////////////////////////////////////////////////////////////////////// +// Memset +// The MaxSize template argument gives the maximum size handled natively by the +// platform. For instance on x86 with AVX support this would be 32. If a size +// greater than MaxSize is requested we break the operation down into smaller +// pieces of size MaxSize.
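Concretely, with MaxSize = 8 a 32-byte request resolves to ArrayType<ScalarType<uint64_t>, 4>. The following hedged sketch (a hypothetical helper, not the library code) shows the equivalent decomposition together with the splat formula used by ScalarType:

```C++
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Roughly what Memset<32, /*MaxSize=*/8>::block amounts to on a platform
// whose widest native store is 8 bytes.
template <size_t Size> void memset_by_u64(char *dst, uint8_t value) {
  static_assert(Size % sizeof(uint64_t) == 0, "decomposed in 8-byte pieces");
  // ScalarType splat: ~0 / 0xFF is 0x0101010101010101, so multiplying by
  // `value` repeats that byte in every position (e.g. 0x2A2A...2A for 0x2A).
  const uint64_t splat = ~uint64_t(0) / 0xFF * value;
  for (size_t i = 0; i < Size; i += sizeof(uint64_t))
    memcpy(dst + i, &splat, sizeof(splat)); // unaligned-safe 8-byte store
}
```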
+/////////////////////////////////////////////////////////////////////////////// +template <size_t Size, size_t MaxSize> struct Memset { + static_assert(is_power2(MaxSize)); + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t value) { + if constexpr (Size == 3) { + Memset<1, MaxSize>::block(dst + 2, value); + Memset<2, MaxSize>::block(dst, value); + } else { + using T = getTypeFor<Size, MaxSize>; + if constexpr (is_void_v<T>) { + deferred_static_assert("Unimplemented Size"); + } else { + T::store(dst, T::splat(value)); + } + } + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +/////////////////////////////////////////////////////////////////////////////// +template <size_t Size> struct Bcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t MaxSize = 8; + + template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) { + return load<T>(p1) ^ load<T>(p2); + } + + template <typename T> + static inline uint32_t load_not_equal(CPtr p1, CPtr p2) { + return load<T>(p1) != load<T>(p2); + } + + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == 1) { + return load_xor<uint8_t>(p1, p2); + } else if constexpr (Size == 2) { + return load_xor<uint16_t>(p1, p2); + } else if constexpr (Size == 4) { + return load_xor<uint32_t>(p1, p2); + } else if constexpr (Size == 8) { + return load_not_equal<uint64_t>(p1, p2); + } else if constexpr (useArrayType<Size, MaxSize>()) { + for (size_t offset = 0; offset < Size; offset += MaxSize) + if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("Unimplemented Size"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + return block(p1, p2) | tail(p1, p2, count); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp +/////////////////////////////////////////////////////////////////////////////// +template <size_t Size> struct Memcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t MaxSize = 8; + + template <typename T> static inline T load_be(CPtr ptr) { + return Endian::to_big_endian(load<T>(ptr)); + } + + template <typename T> + static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) { + return load_be<T>(p1) - load_be<T>(p2); + } + + template <typename T> + static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) { + const auto la = load_be<T>(p1); + const auto lb = load_be<T>(p2); + return la > lb ? 1 : la < lb ?
-1 : 0; + } + + static inline MemcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == 1) { + return load_be_diff<uint8_t>(p1, p2); + } else if constexpr (Size == 2) { + return load_be_diff<uint16_t>(p1, p2); + } else if constexpr (Size == 4) { + return load_be_cmp<uint32_t>(p1, p2); + } else if constexpr (Size == 8) { + return load_be_cmp<uint64_t>(p1, p2); + } else if constexpr (useArrayType<Size, MaxSize>()) { + for (size_t offset = 0; offset < Size; offset += MaxSize) + if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset)) + return Memcmp<MaxSize>::block(p1 + offset, p2 + offset); + return MemcmpReturnType::ZERO(); + } else if constexpr (Size == 3) { + if (auto value = Memcmp<2>::block(p1, p2)) + return value; + return Memcmp<1>::block(p1 + 2, p2 + 2); + } else { + deferred_static_assert("Unimplemented Size"); + } + } + + static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (auto value = block(p1, p2)) + return value; + return tail(p1, p2, count); + } + + static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memmove +/////////////////////////////////////////////////////////////////////////////// + +template <size_t Size, size_t MaxSize> struct Memmove { + static_assert(is_power2(MaxSize)); + using T = getTypeFor<Size, MaxSize>; + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, CPtr src) { + if constexpr (is_void_v<T>) { + deferred_static_assert("Unimplemented Size"); + } else { + T::store(dst, T::load(src)); + } + } + + static inline void head_tail(Ptr dst, CPtr src, size_t count) { + const size_t offset = count - Size; + if constexpr (is_void_v<T>) { + deferred_static_assert("Unimplemented Size"); + } else { + // The load and store operations can be performed in any order as long as + // they are not interleaved. More investigations are needed to determine + // the best order. + const auto head = T::load(src); + const auto tail = T::load(src + offset); + T::store(dst, head); + T::store(dst + offset, tail); + } + } + + // Align forward suitable when dst < src. The alignment is performed with + // a HeadTail operation of count ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes forward, we make sure src is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [___________LLLLLLLA______________] + // [_SSSSSSSS________________________] + // [________SSSSSSSS_________________] + // + // e.g. Moving two bytes forward, we make sure dst is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [______LLLLLLLL___________________] + // [_SSSSSSSS________________________] + // [___SSSSSSSA______________________] + template <Arg AlignOn> + static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) { + Ptr prev_dst = dst; + CPtr prev_src = src; + size_t prev_count = count; + align_to_next_boundary<Size, AlignOn>(dst, src, count); + adjust(Size, dst, src, count); + head_tail(prev_dst, prev_src, prev_count - count); + } + + // Align backward suitable when dst > src. The alignment is performed with + // a HeadTail operation of count ∈ [Alignment, 2 x Alignment]. + // + // e.g.
Moving two bytes backward, we make sure src is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _________________ALLLLLLL_______] + // [ ___________________LLLLLLLL_____] + // [____________________SSSSSSSS_____] + // [______________________SSSSSSSS___] + // + // e.g. Moving two bytes backward, we make sure dst is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _______________LLLLLLLL_________] + // [ ___________________LLLLLLLL_____] + // [__________________ASSSSSSS_______] + // [______________________SSSSSSSS___] + template <Arg AlignOn> + static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) { + Ptr headtail_dst = dst + count; + CPtr headtail_src = src + count; + size_t headtail_size = 0; + align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src, + headtail_size); + adjust(-2 * Size, headtail_dst, headtail_src, headtail_size); + head_tail(headtail_dst, headtail_src, headtail_size); + count -= headtail_size; + } + + // Move forward suitable when dst < src. We load the tail bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [_________________________LLLLLLLL___] + // [___LLLLLLLL_________________________] + // [_SSSSSSSS___________________________] + // [___________LLLLLLLL_________________] + // [_________SSSSSSSS___________________] + // [___________________LLLLLLLL_________] + // [_________________SSSSSSSS___________] + // [_______________________SSSSSSSS_____] + static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) { + const size_t tail_offset = count - Size; + const auto tail_value = T::load(src + tail_offset); + size_t offset = 0; +#pragma nounroll + do { + block(dst + offset, src + offset); + offset += Size; + } while (offset < count - Size); + T::store(dst + tail_offset, tail_value); + } + + // Move backward suitable when dst > src. We load the head bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [___LLLLLLLL_________________________] + // [_________________________LLLLLLLL___] + // [___________________________SSSSSSSS_] + // [_________________LLLLLLLL___________] + // [___________________SSSSSSSS_________] + // [_________LLLLLLLL___________________] + // [___________SSSSSSSS_________________] + // [_____SSSSSSSS_______________________] + static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) { + const auto head_value = T::load(src); + ptrdiff_t offset = count - Size; +#pragma nounroll + do { + block(dst + offset, src + offset); + offset -= Size; + } while (offset >= 0); + T::store(dst, head_value); + } +}; + +} // namespace __llvm_libc::generic + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_x86.h @@ -0,0 +1,217 @@ +//===-- x86 implementation of memory function building blocks -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides x86 specific building blocks to compose memory functions.
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_X86_64) + +#include "src/__support/common.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_generic.h" + +#ifdef __SSE2__ +#include <immintrin.h> +#else +// Define fake functions to prevent the compiler from failing on undefined +// functions in case SSE2 is not present. +#define _mm512_cmpneq_epi8_mask(A, B) 0 +#define _mm_movemask_epi8(A) 0 +#define _mm256_movemask_epi8(A) 0 +#endif // __SSE2__ + +namespace __llvm_libc::x86 { + +// A set of constants to check compile time features. +static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__); +static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__); +static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__); +static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__); +static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy repmovsb implementation +struct Memcpy { + static void repmovsb(char *dst, const char *src, size_t count) { + asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp + +// Base implementation for the Bcmp specializations. +// - BlockSize is either 16, 32 or 64 depending on the available compile time +// features, it is used to switch between "single native operation" or a +// "sequence of native operations". +// - BlockBcmp is the function that implements the bcmp logic. +template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl { + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + return BlockBcmp(p1, p2); + } else if constexpr (Size % BlockSize == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = BlockBcmp(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - Size, p2 + count - Size); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + return block(p1, p2) | tail(p1, p2, count); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += Size; + } while (offset < count - Size); + return tail(p1, p2, count); + } +}; + +namespace sse2 { +static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(16))); + // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. + const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)); + return static_cast<uint32_t>(mask); +} +template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>; +} // namespace sse2 + +namespace avx2 { +static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(32))); + // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. + const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)); + // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit + // mask.
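The movemask idiom used by bcmp16 above, and by the 32-byte variant being defined here, can be shown standalone with explicit intrinsics instead of the vector-extension `!=` (a sketch assuming an SSE2 target; bcmp16_sse2 is a hypothetical name):

```C++
#include <immintrin.h>
#include <stdint.h>

// Nonzero iff the 16 bytes at p1 and p2 differ; set bits mark mismatches.
uint32_t bcmp16_sse2(const char *p1, const char *p2) {
  const __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p1));
  const __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i *>(p2));
  // _mm_cmpeq_epi8 yields 0xFF per equal byte; movemask gathers the MSBs,
  // so 0xFFFF means "all 16 bytes equal". Inverting gives the difference
  // mask that the vector-extension `load<T>(p1) != load<T>(p2)` form
  // produces directly.
  const int eq = _mm_movemask_epi8(_mm_cmpeq_epi8(a, b));
  return static_cast<uint32_t>(eq ^ 0xFFFF);
}
```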
+ return static_cast<uint32_t>(mask); +} +template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>; +} // namespace avx2 + +namespace avx512bw { +static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(64))); + // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. + const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)); + const bool mask_is_set = mask != 0; + return static_cast<uint32_t>(mask_is_set); +} +template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>; +} // namespace avx512bw + +// Assuming that the mask is non zero, the index of the first mismatching byte +// is the number of trailing zeros in the mask. Trailing zeros and not leading +// zeros because the x86 architecture is little endian. +static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, + uint64_t mask) { + const size_t diff_index = __builtin_ctzll(mask); + const int16_t ca = static_cast<uint8_t>(p1[diff_index]); + const int16_t cb = static_cast<uint8_t>(p2[diff_index]); + return ca - cb; +} + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp + +// Base implementation for the Memcmp specializations. +// - BlockSize is either 16, 32 or 64 depending on the available compile time +// features, it is used to switch between "single native operation" or a +// "sequence of native operations". +// - BlockMemcmp is the function that implements the memcmp logic. +// - BlockBcmp is the function that implements the bcmp logic. +template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp> +struct MemcmpImpl { + static inline MemcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + return BlockMemcmp(p1, p2); + } else if constexpr (Size % BlockSize == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = BlockBcmp(p1 + offset, p2 + offset)) + return BlockMemcmp(p1 + offset, p2 + offset); + } else { + deferred_static_assert("SIZE not implemented"); + } + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - Size, p2 + count - Size); + } + + static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (auto value = block(p1, p2)) + return value; + return tail(p1, p2, count); + } + + static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += Size; + } while (offset < count - Size); + return tail(p1, p2, count); + } +}; + +namespace sse2 { +static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(16))); + // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. + if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>; +} // namespace sse2 + +namespace avx2 { +static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(32))); + // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. + if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>; +} // namespace avx2 + +namespace avx512bw { +static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(64))); + // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
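As a worked example of the char_diff_no_zero logic defined above (a hypothetical standalone restatement): with p1 = "abXd" and p2 = "abYd" the difference mask has bit 2 set, __builtin_ctzll returns 2, and the result is 'X' - 'Y', which is negative.

```C++
#include <stdint.h>

// Given a nonzero mask where bit i set means byte i differs, return the
// memcmp-style result for the first mismatch. x86 is little-endian, so the
// lowest set bit corresponds to the lowest address.
int first_diff(const unsigned char *p1, const unsigned char *p2,
               uint64_t mask) {
  const unsigned i = static_cast<unsigned>(__builtin_ctzll(mask));
  return static_cast<int>(p1[i]) - static_cast<int>(p2[i]);
}
```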
+ if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>; +} // namespace avx512bw + +} // namespace __llvm_libc::x86 + +#endif // LLVM_LIBC_ARCH_X86_64 + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -9,19 +9,8 @@ #ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H #define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H -#include "src/__support/architectures.h" - -// Cache line sizes for ARM: These values are not strictly correct since -// cache line sizes depend on implementations, not architectures. There -// are even implementations with cache line sizes configurable at boot -// time. -#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86) -#define LLVM_LIBC_CACHELINE_SIZE 64 -#elif defined(LLVM_LIBC_ARCH_ARM) -#define LLVM_LIBC_CACHELINE_SIZE 32 -#else -#error "Unsupported platform for memory functions." -#endif +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/type_traits.h" #include <stddef.h> // size_t #include <stdint.h> // intptr_t / uintptr_t @@ -62,32 +51,46 @@ return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1); } -template <size_t alignment> intptr_t offset_from_last_aligned(const void *ptr) { +// Returns the number of bytes to subtract from ptr to get to the previous +// multiple of alignment. If ptr is already aligned returns 0. +template <size_t alignment> intptr_t offset_to_align_up(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); return reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U); } -template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) { +// Returns the number of bytes to add to ptr to get to the next multiple of +// alignment. If ptr is already aligned returns 0. +template <size_t alignment> +intptr_t offset_to_next_aligned_or_zero(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); // The logic is not straightforward and involves unsigned modulo arithmetic // but the generated code is as fast as it can be. return -reinterpret_cast<uintptr_t>(ptr) & (alignment - 1U); } -// Returns the offset from `ptr` to the next cache line. -static inline intptr_t offset_to_next_cache_line(const void *ptr) { - return offset_to_next_aligned<LLVM_LIBC_CACHELINE_SIZE>(ptr); +// Returns the number of bytes to add to ptr to get to the next multiple of +// alignment. If ptr is already aligned returns alignment. +template <size_t alignment> intptr_t offset_to_next_aligned(const void *ptr) { + return alignment - offset_to_align_up<alignment>(ptr); } +// Returns the same pointer but notifies the compiler that it is aligned. template <size_t alignment, typename T> static T *assume_aligned(T *ptr) { return reinterpret_cast<T *>(__builtin_assume_aligned(ptr, alignment)); } + #if defined __has_builtin #if __has_builtin(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif #endif +#if defined __has_builtin +#if __has_builtin(__builtin_memset_inline) +#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE +#endif +#endif + // Performs a constant count copy. template <size_t Size> static inline void memcpy_inline(void *__restrict dst, @@ -103,28 +106,56 @@ using Ptr = char *; // Pointer to raw data. using CPtr = const char *; // Const pointer to raw data. -// Loads bytes from memory (possibly unaligned) and materializes them as type. +// This type makes sure that we don't accidentally promote an integral type to +// another one.
It is only constructible from the exact T type. +template <typename T> struct StrictIntegralType { + static_assert(cpp::is_integral_v<T>); + + // Can only be constructed from a T. + template <typename U, cpp::enable_if_t<cpp::is_same_v<T, U>, bool> = 0> + StrictIntegralType(U value) : value(value) {} + + // Allows using the type in an if statement. + explicit operator bool() const { return value; } + + // If type is unsigned (bcmp) we allow bitwise OR operations. + StrictIntegralType operator|(const StrictIntegralType &Rhs) const { + static_assert(!cpp::is_signed_v<T>); + return value | Rhs.value; + } + + // For interaction with the C API we allow explicit conversion back to the + // `int` type. + explicit operator int() const { + // bit_cast makes sure that T and int have the same size. + return cpp::bit_cast<int>(value); + } + + // Helper to get the zero value. + static inline constexpr StrictIntegralType ZERO() { return {T(0)}; } + +private: + T value; +}; + +using MemcmpReturnType = StrictIntegralType<int32_t>; +using BcmpReturnType = StrictIntegralType<uint32_t>; + +// Loads bytes from memory (possibly unaligned) and materializes them as +// type. template <typename T> static inline T load(CPtr ptr) { T Out; memcpy_inline<sizeof(T)>(&Out, ptr); return Out; } -// Stores a value of type T in memory (possibly unaligned) +// Stores a value of type T in memory (possibly unaligned). template <typename T> static inline void store(Ptr ptr, T value) { memcpy_inline<sizeof(T)>(ptr, &value); } -// For an operation like memset that operates on a pointer and a count, advances -// the pointer by offset bytes and decrease count by the same amount. -static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) { - ptr += offset; - count -= offset; -} - -// For an operation like memcpy or memcmp that operates on two pointers and a -// count, advances the pointers by offset bytes and decrease count by the same -// amount. +// Advances the pointers p1 and p2 by offset bytes and decreases count by the +// same amount. template <typename T1, typename T2> static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) { @@ -133,31 +164,37 @@ count -= offset; } -// For an operation like memset that operates on a pointer and a count, advances -// the pointer so it is aligned to SIZE bytes and decrease count by the same -// amount. +// Advances p1 and p2 so p1 gets aligned to the next SIZE-byte boundary +// and decreases count by the same amount. // We make sure the compiler knows about the adjusted pointer alignment. -template <size_t SIZE> void align(Ptr &ptr, size_t &count) { - adjust(offset_to_next_aligned<SIZE>(ptr), ptr, count); - ptr = assume_aligned<SIZE>(ptr); +template <size_t SIZE, typename T1, typename T2> +void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2, + size_t &count) { + adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count); + p1 = assume_aligned<SIZE>(p1); } -// For an operation like memcpy or memcmp that operates on two pointers and a -// count, advances the pointers so one of them gets aligned to SIZE bytes and -// decrease count by the same amount. -// We make sure the compiler knows about the adjusted pointer alignment. -enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 }; +// Same as align_p1_to_next_boundary above but with a single pointer instead. +template <size_t SIZE, typename T1> +void align_to_next_boundary(T1 *&p1, size_t &count) { + CPtr dummy; + align_p1_to_next_boundary<SIZE>(p1, dummy, count); +} + +// An enum class that discriminates between the first and second pointer. +enum class Arg { P1, P2, Dst = P1, Src = P2 }; + +// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1.
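The pointer arithmetic behind offset_to_next_aligned_or_zero and these align_* helpers is compact enough to check by hand; a short sketch with hypothetical values:

```C++
#include <assert.h>
#include <stdint.h>

int main() {
  // For a power-of-two alignment, -ptr & (alignment - 1) is the distance to
  // the next boundary, and 0 when ptr is already aligned.
  const uintptr_t ptr = 0x1003;
  assert((-ptr & uintptr_t(15)) == 13); // 0x1003 + 13 == 0x1010
  const uintptr_t aligned = 0x1010;
  assert((-aligned & uintptr_t(15)) == 0); // already aligned
  return 0;
}
```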
+// Precondition: &p1 != &p2 template <size_t SIZE, Arg AlignOn, typename T1, typename T2> -void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) { - if constexpr (AlignOn == Arg::_1) { - adjust(offset_to_next_aligned<SIZE>(p1), p1, p2, count); - p1 = assume_aligned<SIZE>(p1); - } else if constexpr (AlignOn == Arg::_2) { - adjust(offset_to_next_aligned<SIZE>(p2), p1, p2, count); - p2 = assume_aligned<SIZE>(p2); - } else { - deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2"); - } +void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2, + size_t &count) { + if constexpr (AlignOn == Arg::P1) + align_p1_to_next_boundary<SIZE>(p1, p2, count); + else if constexpr (AlignOn == Arg::P2) + align_p1_to_next_boundary<SIZE>(p2, p1, count); // swapping p1 and p2. + else + deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2"); } } // namespace __llvm_libc diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp --- a/libc/src/string/memset.cpp +++ b/libc/src/string/memset.cpp @@ -13,8 +13,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) { - inline_memset(reinterpret_cast<char *>(dst), - static_cast<unsigned char>(value), count); + inline_memset(reinterpret_cast<char *>(dst), static_cast<uint8_t>(value), + count); return dst; } diff --git a/libc/test/src/string/bcmp_test.cpp b/libc/test/src/string/bcmp_test.cpp --- a/libc/test/src/string/bcmp_test.cpp +++ b/libc/test/src/string/bcmp_test.cpp @@ -12,25 +12,25 @@ TEST(LlvmLibcBcmpTest, CmpZeroByte) { const char *lhs = "ab"; const char *rhs = "bc"; - EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); + ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); } TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) { const char *lhs = "ab"; const char *rhs = "ab"; - EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); + ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) { const char *lhs = "ab"; const char *rhs = "ac"; - EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) { const char *lhs = "ac"; const char *rhs = "ab"; - EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, Sweep) { @@ -46,13 +46,13 @@ reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) - EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); + ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) { rhs[i] = 'b'; - EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); + ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); rhs[i] = 'a'; } } diff --git a/libc/test/src/string/memmove_test.cpp b/libc/test/src/string/memmove_test.cpp --- a/libc/test/src/string/memmove_test.cpp +++ b/libc/test/src/string/memmove_test.cpp @@ -20,7 +20,7 @@ void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) { @@ -29,7 +29,7 @@ void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) { @@ -40,7 +40,7 @@ void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) { @@ -49,7 +49,7 @@ void *const Dst =
Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } // e.g. `Dst` follow `src`. @@ -62,7 +62,7 @@ void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstFollowSrc) { @@ -71,7 +71,7 @@ void *const Dst = Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } static constexpr int kMaxSize = 512; @@ -106,7 +106,7 @@ void *const Ret = __llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } } } diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -3,8 +3,6 @@ SUITE libc_string_unittests SRCS - elements_test.cpp - memory_access_test.cpp utils_test.cpp COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp deleted file mode 100644 --- a/libc/test/src/string/memory_utils/elements_test.cpp +++ /dev/null @@ -1,137 +0,0 @@ -//===-- Unittests for memory_utils ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/__support/CPP/array.h" -#include "src/__support/CPP/span.h" -#include "src/string/memory_utils/elements.h" -#include "utils/UnitTest/Test.h" - -namespace __llvm_libc { - -// Registering Types -using FixedSizeTypes = testing::TypeList< -#if defined(__SSE2__) - x86::Vector128, // -#endif // __SSE2__ -#if defined(__AVX2__) - x86::Vector256, // -#endif // __AVX2__ -#if defined(__AVX512F__) and defined(__AVX512BW__) - x86::Vector512, // -#endif // defined(__AVX512F__) and defined(__AVX512BW__) - scalar::UINT8, // - scalar::UINT16, // - scalar::UINT32, // - scalar::UINT64, // - Repeated, // - Repeated, // - Repeated, // - Repeated, // - Repeated, // - Chained, // - Chained, // - builtin::_1, // - builtin::_2, // - builtin::_3, // - builtin::_4, // - builtin::_8 // - >; - -char GetRandomChar() { - static constexpr const uint64_t a = 1103515245; - static constexpr const uint64_t c = 12345; - static constexpr const uint64_t m = 1ULL << 31; - static uint64_t seed = 123456789; - seed = (a * seed + c) % m; - return seed; -} - -void Randomize(cpp::span buffer) { - for (auto ¤t : buffer) - current = GetRandomChar(); -} - -template using Buffer = cpp::array; - -template Buffer GetRandomBuffer() { - Buffer buffer; - Randomize(buffer); - return buffer; -} - -TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) { - Buffer Dst; - const auto buffer = GetRandomBuffer(); - copy(Dst.data(), buffer.data()); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(Dst[i], buffer[i]); -} - -template T copy(const T &Input) { - T Output; - for (size_t I = 0; I < Input.size(); ++I) - Output[I] = Input[I]; - return Output; -} - -TYPED_TEST(LlvmLibcMemoryElements, Move, 
FixedSizeTypes) { - constexpr size_t SIZE = ParamType::SIZE; - using LargeBuffer = cpp::array; - LargeBuffer GroundTruth; - Randomize(GroundTruth); - // Forward, we move the SIZE first bytes from offset 0 to SIZE. - for (size_t Offset = 0; Offset < SIZE; ++Offset) { - LargeBuffer Buffer = copy(GroundTruth); - move(&Buffer[Offset], &Buffer[0]); - for (size_t I = 0; I < SIZE; ++I) - EXPECT_EQ(Buffer[I + Offset], GroundTruth[I]); - } - // Backward, we move the SIZE last bytes from offset 0 to SIZE. - for (size_t Offset = 0; Offset < SIZE; ++Offset) { - LargeBuffer Buffer = copy(GroundTruth); - move(&Buffer[Offset], &Buffer[SIZE]); - for (size_t I = 0; I < SIZE; ++I) - EXPECT_EQ(Buffer[I + Offset], GroundTruth[SIZE + I]); - } -} - -TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) { - const auto buffer = GetRandomBuffer(); - EXPECT_TRUE(equals(buffer.data(), buffer.data())); -} - -TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) { - Buffer initial; - for (auto &c : initial) - c = 5; - - // Testing equality - EXPECT_EQ(three_way_compare(initial.data(), initial.data()), 0); - - // Testing all mismatching positions - for (size_t i = 0; i < ParamType::SIZE; ++i) { - auto copy = initial; - ++copy[i]; // copy is now lexicographycally greated than initial - const auto *less = initial.data(); - const auto *greater = copy.data(); - EXPECT_LT(three_way_compare(less, greater), 0); - EXPECT_GT(three_way_compare(greater, less), 0); - } -} - -TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) { - Buffer Dst; - const cpp::array values = {char(0x00), char(0x7F), char(0xFF)}; - for (char value : values) { - splat_set(Dst.data(), value); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(Dst[i], value); - } -} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp deleted file mode 100644 --- a/libc/test/src/string/memory_utils/memory_access_test.cpp +++ /dev/null @@ -1,228 +0,0 @@ -//===-- Unittests for memory_utils ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define LLVM_LIBC_UNITTEST_OBSERVE 1 - -#include "src/__support/CPP/array.h" -#include "src/string/memory_utils/elements.h" -#include "utils/UnitTest/Test.h" - -#include -#include - -namespace __llvm_libc { - -static constexpr const size_t kMaxBuffer = 32; - -struct BufferAccess : cpp::array { - BufferAccess() { Reset(); } - void Reset() { - for (auto &value : *this) - value = '0'; - this->operator[](kMaxBuffer) = '\0'; - } - void Touch(ptrdiff_t offset, size_t size) { - if (offset < 0) - return; - for (size_t i = 0; i < size; ++i) - ++(*this)[offset + i]; - } - operator const char *() const { return this->data(); } -}; - -struct Buffer { - ptrdiff_t Offset(const char *ptr) const { - const bool contained = ptr >= data.begin() && ptr < data.end(); - return contained ? 
diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp
deleted file mode 100644
--- a/libc/test/src/string/memory_utils/memory_access_test.cpp
+++ /dev/null
@@ -1,228 +0,0 @@
-//===-- Unittests for memory_utils ----------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#define LLVM_LIBC_UNITTEST_OBSERVE 1
-
-#include "src/__support/CPP/array.h"
-#include "src/string/memory_utils/elements.h"
-#include "utils/UnitTest/Test.h"
-
-#include
-#include
-
-namespace __llvm_libc {
-
-static constexpr const size_t kMaxBuffer = 32;
-
-struct BufferAccess : cpp::array<char, kMaxBuffer + 1> {
-  BufferAccess() { Reset(); }
-  void Reset() {
-    for (auto &value : *this)
-      value = '0';
-    this->operator[](kMaxBuffer) = '\0';
-  }
-  void Touch(ptrdiff_t offset, size_t size) {
-    if (offset < 0)
-      return;
-    for (size_t i = 0; i < size; ++i)
-      ++(*this)[offset + i];
-  }
-  operator const char *() const { return this->data(); }
-};
-
-struct Buffer {
-  ptrdiff_t Offset(const char *ptr) const {
-    const bool contained = ptr >= data.begin() && ptr < data.end();
-    return contained ? ptr - data.begin() : -1;
-  }
-  void Reset() {
-    reads.Reset();
-    writes.Reset();
-  }
-  cpp::array<char, kMaxBuffer> data;
-  BufferAccess __attribute__((aligned(64))) reads;
-  BufferAccess __attribute__((aligned(64))) writes;
-};
-
-struct MemoryAccessObserver {
-  void ObserveRead(const char *ptr, size_t size) {
-    Buffer1.reads.Touch(Buffer1.Offset(ptr), size);
-    Buffer2.reads.Touch(Buffer2.Offset(ptr), size);
-  }
-
-  void ObserveWrite(const char *ptr, size_t size) {
-    Buffer1.writes.Touch(Buffer1.Offset(ptr), size);
-    Buffer2.writes.Touch(Buffer2.Offset(ptr), size);
-  }
-
-  void Reset() {
-    Buffer1.Reset();
-    Buffer2.Reset();
-  }
-
-  Buffer Buffer1;
-  Buffer Buffer2;
-};
-
-MemoryAccessObserver Observer;
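The scaffolding above records, per byte, how often each offset is read or written. The core of that technique, reduced to a standalone sketch (hypothetical `AccessCounter`, independent of the deleted file):

```C++
#include <array>
#include <cstddef>

// Count per-byte accesses the way BufferAccess::Touch does; a test can
// then assert that no byte is accessed more often than expected.
struct AccessCounter {
  static constexpr std::size_t kSize = 32;
  std::array<int, kSize> counts{};
  void touch(std::ptrdiff_t offset, std::size_t size) {
    if (offset < 0)
      return; // pointer was outside the observed buffer
    for (std::size_t i = 0; i < size && offset + i < kSize; ++i)
      ++counts[offset + i];
  }
  bool within(int max) const {
    for (int count : counts)
      if (count > max)
        return false;
    return true;
  }
};
```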
-
-template <size_t Size> struct TestingElement {
-  static constexpr size_t SIZE = Size;
-
-  static void copy(char *__restrict dst, const char *__restrict src) {
-    Observer.ObserveRead(src, SIZE);
-    Observer.ObserveWrite(dst, SIZE);
-  }
-
-  static bool equals(const char *lhs, const char *rhs) {
-    Observer.ObserveRead(lhs, SIZE);
-    Observer.ObserveRead(rhs, SIZE);
-    return true;
-  }
-
-  static int three_way_compare(const char *lhs, const char *rhs) {
-    Observer.ObserveRead(lhs, SIZE);
-    Observer.ObserveRead(rhs, SIZE);
-    return 0;
-  }
-
-  static void splat_set(char *dst, const unsigned char value) {
-    Observer.ObserveWrite(dst, SIZE);
-  }
-};
-
-using Types = testing::TypeList<
-    TestingElement<1>,              // 1 Byte
-    TestingElement<2>,              // 2 Bytes
-    TestingElement<4>,              // 4 Bytes
-    Repeated<TestingElement<2>, 3>, // 6 Bytes
-    Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes
-    >;
-
-struct LlvmLibcTestAccessBase : public testing::Test {
-
-  template <typename HigherOrder, size_t Size, size_t Offset = 0>
-  void checkOperations(const BufferAccess &expected) {
-    static const BufferAccess untouched;
-
-    Observer.Reset();
-    HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size);
-    ASSERT_STREQ(src().writes, untouched);
-    ASSERT_STREQ(dst().reads, untouched);
-    ASSERT_STREQ(src().reads, expected);
-    ASSERT_STREQ(dst().writes, expected);
-    Observer.Reset();
-    HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size);
-    ASSERT_STREQ(lhs().writes, untouched);
-    ASSERT_STREQ(rhs().writes, untouched);
-    ASSERT_STREQ(lhs().reads, expected);
-    ASSERT_STREQ(rhs().reads, expected);
-    Observer.Reset();
-    HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset,
-                                   Size);
-    ASSERT_STREQ(lhs().writes, untouched);
-    ASSERT_STREQ(rhs().writes, untouched);
-    ASSERT_STREQ(lhs().reads, expected);
-    ASSERT_STREQ(rhs().reads, expected);
-    Observer.Reset();
-    HigherOrder::splat_set(dst_ptr() + Offset, 5, Size);
-    ASSERT_STREQ(src().reads, untouched);
-    ASSERT_STREQ(src().writes, untouched);
-    ASSERT_STREQ(dst().reads, untouched);
-    ASSERT_STREQ(dst().writes, expected);
-  }
-
-  void checkMaxAccess(const BufferAccess &expected, int max) {
-    for (size_t i = 0; i < kMaxBuffer; ++i) {
-      int value = (int)expected[i] - '0';
-      ASSERT_GE(value, 0);
-      ASSERT_LE(value, max);
-    }
-  }
-
-private:
-  const Buffer &lhs() const { return Observer.Buffer1; }
-  const Buffer &rhs() const { return Observer.Buffer2; }
-  const Buffer &src() const { return Observer.Buffer2; }
-  const Buffer &dst() const { return Observer.Buffer1; }
-  Buffer &dst() { return Observer.Buffer1; }
-
-  char *dst_ptr() { return dst().data.begin(); }
-  const char *src_ptr() { return src().data.begin(); }
-  const char *lhs_ptr() { return lhs().data.begin(); }
-  const char *rhs_ptr() { return rhs().data.begin(); }
-};
-
-template <typename ParamType>
-struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase {
-
-  void TearDown() override {
-    static constexpr size_t Size = 10;
-
-    BufferAccess expected;
-    expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);
-
-    checkMaxAccess(expected, 1);
-    checkOperations<Tail<ParamType>, Size>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {}
-
-template <typename ParamType>
-struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase {
-  void TearDown() override {
-    static constexpr size_t Size = 10;
-
-    BufferAccess expected;
-    expected.Touch(0, ParamType::SIZE);
-    expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);
-
-    checkMaxAccess(expected, 2);
-    checkOperations<HeadTail<ParamType>, Size>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {}
-
-template <typename ParamType>
-struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase {
-  void TearDown() override {
-    static constexpr size_t Size = 20;
-
-    BufferAccess expected;
-    for (size_t i = 0; i < Size - ParamType::SIZE; i += ParamType::SIZE)
-      expected.Touch(i, ParamType::SIZE);
-    expected.Touch(Size - ParamType::SIZE, ParamType::SIZE);
-
-    checkMaxAccess(expected, 2);
-    checkOperations<Loop<ParamType>, Size>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {}
-
-template <typename ParamType>
-struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase {
-  void TearDown() override {
-    static constexpr size_t Size = 10;
-    static constexpr size_t Offset = 2;
-    using AlignmentT = TestingElement<4>;
-
-    BufferAccess expected;
-    expected.Touch(Offset, AlignmentT::SIZE);
-    expected.Touch(AlignmentT::SIZE, ParamType::SIZE);
-    expected.Touch(Offset + Size - ParamType::SIZE, ParamType::SIZE);
-
-    checkMaxAccess(expected, 3);
-    checkOperations<Align<AlignmentT, Arg::_1>::Then<Loop<ParamType>>, Size,
-                    Offset>(expected);
-    checkOperations<Align<AlignmentT, Arg::_2>::Then<Loop<ParamType>>, Size,
-                    Offset>(expected);
-  }
-};
-TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {}
-
-} // namespace __llvm_libc
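The `utils_test.cpp` hunks below encode a deliberate semantic split: `offset_to_next_aligned` is now strictly positive (a full `ALIGNMENT` for an already aligned pointer), while the new `offset_to_next_aligned_or_zero` keeps the old return-0-when-aligned behavior. A sketch of that contract as the updated expectations describe it (reimplementation assumed; only the names come from the patch):

```C++
#include <stdint.h>

// ALIGNMENT must be a power of two.
template <uint64_t ALIGNMENT>
int64_t sketch_offset_to_next_aligned_or_zero(uintptr_t ptr) {
  return (int64_t)(-ptr & (ALIGNMENT - 1)); // 0 when already aligned
}
template <uint64_t ALIGNMENT>
int64_t sketch_offset_to_next_aligned(uintptr_t ptr) {
  return (int64_t)(ALIGNMENT - (ptr & (ALIGNMENT - 1))); // never 0
}
// e.g. ALIGNMENT = 16: ptr = 0 -> 0 vs 16, ptr = 1 -> 15 vs 15,
// ptr = 16 -> 0 vs 16, ptr = 15 -> 1 vs 1.
```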
diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp
--- a/libc/test/src/string/memory_utils/utils_test.cpp
+++ b/libc/test/src/string/memory_utils/utils_test.cpp
@@ -80,41 +80,27 @@
 }
 
 TEST(LlvmLibcUtilsTest, OffsetToNextAligned) {
-  EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(16));
   EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15));
-  EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(16));
   EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1));
   EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16));
 }
 
-TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) {
-  EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0));
-  EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1));
-  EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0));
-  EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15));
-  EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16));
+TEST(LlvmLibcUtilsTest, OffsetToNextAlignedOrZero) {
+  EXPECT_EQ(offset_to_next_aligned_or_zero<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_to_next_aligned_or_zero<16>(forge(1)), I(15));
+  EXPECT_EQ(offset_to_next_aligned_or_zero<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_to_next_aligned_or_zero<16>(forge(15)), I(1));
+  EXPECT_EQ(offset_to_next_aligned_or_zero<32>(forge(16)), I(16));
 }
 
-TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) {
-  EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0);
-  EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0));
-  EXPECT_EQ(offset_to_next_cache_line(forge(1)),
-            I(LLVM_LIBC_CACHELINE_SIZE - 1));
-  EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0));
-  EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)),
-            I(1));
-}
-
-TEST(LlvmLibcUtilsTest, Adjust1) {
-  char a;
-  const size_t base_size = 10;
-  for (size_t I = -2; I < 2; ++I) {
-    auto *ptr = &a;
-    size_t size = base_size;
-    adjust(I, ptr, size);
-    EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I));
-    EXPECT_EQ(size, base_size - I);
-  }
+TEST(LlvmLibcUtilsTest, OffsetFromLastAlignedOrZero) {
+  EXPECT_EQ(offset_to_align_up<16>(forge(0)), I(0));
+  EXPECT_EQ(offset_to_align_up<16>(forge(1)), I(1));
+  EXPECT_EQ(offset_to_align_up<16>(forge(16)), I(0));
+  EXPECT_EQ(offset_to_align_up<16>(forge(15)), I(15));
+  EXPECT_EQ(offset_to_align_up<32>(forge(16)), I(16));
 }
 
 TEST(LlvmLibcUtilsTest, Adjust2) {
@@ -131,19 +117,6 @@
   }
 }
 
-TEST(LlvmLibcUtilsTest, Align1) {
-  char a;
-  const size_t base_size = 10;
-  {
-    auto *ptr = &a;
-    size_t size = base_size;
-    align<128>(ptr, size);
-    EXPECT_TRUE(uintptr_t(ptr) % 128 == 0);
-    EXPECT_GE(ptr, &a);
-    EXPECT_EQ(size_t(ptr - &a), base_size - size);
-  }
-}
-
 TEST(LlvmLibcUtilsTest, Align2) {
   char a, b;
   const size_t base_size = 10;
@@ -151,10 +124,10 @@
     auto *p1 = &a;
     auto *p2 = &b;
     size_t size = base_size;
-    align<128, Arg::_1>(p1, p2, size);
+    align_to_next_boundary<128, Arg::P1>(p1, p2, size);
     EXPECT_TRUE(uintptr_t(p1) % 128 == 0);
-    EXPECT_GE(p1, &a);
-    EXPECT_GE(p2, &b);
+    EXPECT_GT(p1, &a);
+    EXPECT_GT(p2, &b);
     EXPECT_EQ(size_t(p1 - &a), base_size - size);
     EXPECT_EQ(size_t(p2 - &b), base_size - size);
   }
@@ -162,10 +135,10 @@
     auto *p1 = &a;
     auto *p2 = &b;
     size_t size = base_size;
-    align<128, Arg::_2>(p1, p2, size);
+    align_to_next_boundary<128, Arg::P2>(p1, p2, size);
     EXPECT_TRUE(uintptr_t(p2) % 128 == 0);
-    EXPECT_GE(p1, &a);
-    EXPECT_GE(p2, &b);
+    EXPECT_GT(p1, &a);
+    EXPECT_GT(p2, &b);
     EXPECT_EQ(size_t(p1 - &a), base_size - size);
     EXPECT_EQ(size_t(p2 - &b), base_size - size);
   }