diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,13 +2,17 @@
 add_header_library(
   memory_utils
   HDRS
-    utils.h
-    elements.h
     bcmp_implementations.h
     bzero_implementations.h
+    elements.h
     memcmp_implementations.h
     memcpy_implementations.h
     memset_implementations.h
+    op_aarch64.h
+    op_builtin.h
+    op_generic.h
+    op_x86.h
+    utils.h
   DEPS
     libc.src.__support.CPP.bit
 )
diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/README.md
@@ -0,0 +1,97 @@
+# The mem* framework
+
+The framework handles the following mem* functions:
+ - `memcpy`
+ - `memmove`
+ - `memset`
+ - `bzero`
+ - `bcmp`
+ - `memcmp`
+
+## Building blocks
+
+These functions can be built out of a set of lower-level operations:
+ - **`block`** : operates on a block of `SIZE` bytes.
+ - **`tail`** : operates on the last `SIZE` bytes of the buffer (e.g., `[dst + count - SIZE, dst + count]`)
+ - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
+ - **`loop_and_tail`** : calls `block` in a loop to consume as much of the `count` bytes as possible, then handles the remaining bytes with a `tail` operation.
+
+As an illustration, let's take the example of a trivial `memset` implementation:
+
+```C++
+extern "C" void memset(char* dst, int value, size_t count) {
+    if (count == 0) return;
+    if (count == 1) return Memset<1>::block(dst, value);
+    if (count == 2) return Memset<2>::block(dst, value);
+    if (count == 3) return Memset<3>::block(dst, value);
+    if (count <= 8) return Memset<4>::head_tail(dst, value, count);  // Note that 0 to 4 bytes are written twice.
+    if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
+    return Memset<16>::loop_and_tail(dst, value, count);
+}
+```
+
+Now let's have a look at the `Memset` structure:
+
+```C++
+template <size_t Size>
+struct Memset {
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t value) {
+    // Implement me
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+```
+
+As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other; only `block` really needs to be implemented.
+In earlier designs we implemented these higher-order functions with templated helpers, but it turned out to be more readable to state the implementations explicitly.
+**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
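+
+As a concrete illustration, here is a condensed sketch of that `aarch64` `head_tail` customization (the full version lives in `op_aarch64.h` in this patch; the helper name is illustrative):
+
+```C++
+// Sketch only: compares the first and last 16 bytes of [p1, p1 + count) and
+// [p2, p2 + count) with NEON, returning zero iff all these bytes are equal.
+static inline uint32_t bcmp16_head_tail(const char *p1, const char *p2,
+                                        size_t count) {
+  const uint8_t *u1 = reinterpret_cast<const uint8_t *>(p1);
+  const uint8_t *u2 = reinterpret_cast<const uint8_t *>(p2);
+  uint8x16_t a = vld1q_u8(u1);              // head of p1
+  uint8x16_t b = vld1q_u8(u1 + count - 16); // tail of p1
+  uint8x16_t n = vld1q_u8(u2);              // head of p2
+  uint8x16_t o = vld1q_u8(u2 + count - 16); // tail of p2
+  // (a ^ n) | (b ^ o) is nonzero iff any byte differs.
+  uint8x16_t anbo = vorrq_u8(veorq_u8(a, n), veorq_u8(b, o));
+  // Reduce the 16 bytes down to a single 32-bit value.
+  return vmaxv_u32(vqmovn_u64(vreinterpretq_u64_u8(anbo)));
+}
+```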
+
+## Scoped specializations
+
+We can have several specializations of the `Memset` structure. Depending on the target requirements, we can use one or several scopes within the same implementation.
+
+In the following example we use the `generic` implementation for the small sizes but the `x86` implementation for the loop.
+```C++
+extern "C" void memset(char* dst, int value, size_t count) {
+    if (count == 0) return;
+    if (count == 1) return generic::Memset<1>::block(dst, value);
+    if (count == 2) return generic::Memset<2>::block(dst, value);
+    if (count == 3) return generic::Memset<3>::block(dst, value);
+    if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count);
+    if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count);
+    return x86::Memset<16>::loop_and_tail(dst, value, count);
+}
+```
+
+### The `builtin` scope
+
+Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)).
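+
+As a minimal sketch (the full version in `op_builtin.h` from this patch also guards against the builtin being unavailable), the `block` operation of this scope is essentially a one-liner:
+
+```C++
+template <size_t Size> struct Memcpy {
+  static constexpr size_t SIZE = Size;
+  // The compiler is guaranteed to inline a copy of exactly SIZE bytes.
+  static inline void block(char *__restrict dst, const char *__restrict src) {
+    __builtin_memcpy_inline(dst, src, SIZE);
+  }
+  // tail, head_tail and loop_and_tail are layered on top of block as shown
+  // above.
+};
+```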
+
+### The `generic` scope
+
+In this scope we define pure C++ implementations using native integral types and Clang vector extensions.
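+
+For example, here is roughly how `op_generic.h` implements `splat` (turning a byte value into a full-width pattern) in both flavors; the helper names are illustrative:
+
+```C++
+#include <stddef.h>
+#include <stdint.h>
+
+// Integral splat: replicate the byte into every byte of the integer.
+template <typename T> T splat_scalar(uint8_t value) {
+  return T(~0) / T(0xFF) * T(value);
+}
+
+// Vector splat using the Clang vector extension; the loop is optimized out.
+using uint8x16 = uint8_t __attribute__((__vector_size__(16)));
+uint8x16 splat_vector(uint8_t value) {
+  uint8x16 out;
+  for (size_t i = 0; i < 16; ++i)
+    out[i] = value;
+  return out;
+}
+```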
+
+### The arch-specific scopes
+
+Then come the implementations that use architecture- or microarchitecture-specific features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`).
+
+The purpose here is to rely on builtins as much as possible and fall back to `asm volatile` as a last resort.
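+
+For example, the `x86` scope in `op_x86.h` from this patch copies a whole buffer with `rep movsb` through inline assembly:
+
+```C++
+static void repmovsb(char *dst, const char *src, size_t count) {
+  // "+D", "+S" and "+c" bind dst, src and count to the rdi, rsi and rcx
+  // registers that `rep movsb` reads and updates.
+  asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+}
+```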
diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/op_aarch64.h
@@ -0,0 +1,178 @@
+//===-- aarch64 implementation of memory function building blocks ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides aarch64 specific building blocks to compose memory
+// functions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
+
+#include "src/__support/architectures.h"
+
+#if defined(LLVM_LIBC_ARCH_AARCH64)
+
+#include "src/__support/common.h"
+#include "src/string/memory_utils/op_generic.h"
+
+#ifdef __ARM_NEON
+#include <arm_neon.h>
+#endif //__ARM_NEON
+
+namespace __llvm_libc::aarch64 {
+
+static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON);
+
+namespace neon {
+
+template <size_t Size> struct BzeroCacheLine {
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t) {
+    static_assert(Size == 64);
+#if __SIZEOF_POINTER__ == 4
+    asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory");
+#else
+    asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory");
+#endif
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    // Unaligned store; we can't use 'dc zva' here.
+    static constexpr size_t kMaxSize = kNeon ? 16 : 8;
+    generic::Memset<Size, kMaxSize>::tail(dst, value, count);
+  }
+};
+
+inline static bool hasZva() {
+  uint64_t zva_val;
+  asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val));
+  // DC ZVA is permitted if DZP, bit [4], is zero.
+  // BS, bits [3:0], is the log2 of the block size in (4-byte) words.
+  // So the next line checks whether the instruction is permitted and the
+  // block size is 16 words (i.e. 64 bytes).
+  return (zva_val & 0b11111) == 0b00100;
+}
+
+} // namespace neon
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+template <size_t Size> struct Bcmp {
+  static constexpr size_t SIZE = Size;
+  static constexpr size_t BlockSize = 32;
+
+  static const unsigned char *as_u8(CPtr ptr) {
+    return reinterpret_cast<const unsigned char *>(ptr);
+  }
+
+  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == 16) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t an = veorq_u8(a, n);
+      uint32x2_t an_reduced = vqmovn_u64(vreinterpretq_u64_u8(an));
+      return vmaxv_u32(an_reduced);
+    } else if constexpr (Size == 32) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t b = vld1q_u8(_p1 + 16);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t o = vld1q_u8(_p2 + 16);
+      uint8x16_t an = veorq_u8(a, n);
+      uint8x16_t bo = veorq_u8(b, o);
+      // anbo = (a ^ n) | (b ^ o).  At least one byte is nonzero if there is
+      // a difference between the two buffers.  We reduce this value down to 4
+      // bytes in two steps. First, calculate the saturated move value when
+      // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get
+      // a single 32 bit nonzero value if a mismatch occurred.
+      uint8x16_t anbo = vorrq_u8(an, bo);
+      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
+      return vmaxv_u32(anbo_reduced);
+    } else if constexpr ((Size % BlockSize) == 0) {
+      for (size_t offset = 0; offset < Size; offset += BlockSize)
+        if (auto value = Bcmp<BlockSize>::block(p1 + offset, p2 + offset))
+          return value;
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - SIZE, p2 + count - SIZE);
+  }
+
+  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if constexpr (Size == 16) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t b = vld1q_u8(_p1 + count - 16);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t o = vld1q_u8(_p2 + count - 16);
+      uint8x16_t an = veorq_u8(a, n);
+      uint8x16_t bo = veorq_u8(b, o);
+      // anbo = (a ^ n) | (b ^ o)
+      uint8x16_t anbo = vorrq_u8(an, bo);
+      uint32x2_t anbo_reduced = vqmovn_u64(vreinterpretq_u64_u8(anbo));
+      return vmaxv_u32(anbo_reduced);
+    } else if constexpr (Size == 32) {
+      auto _p1 = as_u8(p1);
+      auto _p2 = as_u8(p2);
+      uint8x16_t a = vld1q_u8(_p1);
+      uint8x16_t b = vld1q_u8(_p1 + 16);
+      uint8x16_t c = vld1q_u8(_p1 + count - 16);
+      uint8x16_t d = vld1q_u8(_p1 + count - 32);
+      uint8x16_t n = vld1q_u8(_p2);
+      uint8x16_t o = vld1q_u8(_p2 + 16);
+      uint8x16_t p = vld1q_u8(_p2 + count - 16);
+      uint8x16_t q = vld1q_u8(_p2 + count - 32);
+      uint8x16_t an = veorq_u8(a, n);
+      uint8x16_t bo = veorq_u8(b, o);
+      uint8x16_t cp = veorq_u8(c, p);
+      uint8x16_t dq = veorq_u8(d, q);
+      uint8x16_t anbo = vorrq_u8(an, bo);
+      uint8x16_t cpdq = vorrq_u8(cp, dq);
+      // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)).  Reduce this to
+      // a nonzero 32 bit value if a mismatch occurred.
+      uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq);
+      uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq);
+      return vmaxv_u32(abnocpdq_reduced);
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    return tail(p1, p2, count);
+  }
+};
+
+} // namespace __llvm_libc::aarch64
+
+#endif // LLVM_LIBC_ARCH_AARCH64
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -0,0 +1,148 @@
+//===-- Implementation using the __builtin_XXX_inline ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides generic C++ building blocks to compose memory functions.
+// They rely on the compiler to generate the best possible code through the use
+// of the `__builtin_XXX_inline` builtins. These builtins are currently only
+// available in Clang.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
+
+#include "src/string/memory_utils/utils.h"
+
+namespace __llvm_libc::builtin {
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcpy
+template <size_t Size> struct Memcpy {
+  static constexpr size_t SIZE = Size;
+  static inline void block(Ptr __restrict dst, CPtr __restrict src) {
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
+    return __builtin_memcpy_inline(dst, src, SIZE);
+#else
+    deferred_static_assert("Missing __builtin_memcpy_inline");
+    (void)dst;
+    (void)src;
+#endif
+  }
+
+  static inline void tail(Ptr __restrict dst, CPtr __restrict src,
+                          size_t count) {
+    block(dst + count - SIZE, src + count - SIZE);
+  }
+
+  static inline void head_tail(Ptr __restrict dst, CPtr __restrict src,
+                               size_t count) {
+    block(dst, src);
+    tail(dst, src, count);
+  }
+
+  static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src,
+                                   size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      block(dst + offset, src + offset);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, src, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Memset
+template <size_t Size> struct Memset {
+  using ME = Memset;
+  static constexpr size_t SIZE = Size;
+  static inline void block(Ptr dst, uint8_t value) {
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
+    __builtin_memset_inline(dst, value, Size);
+#else
+    deferred_static_assert("Missing __builtin_memset_inline");
+    (void)dst;
+    (void)value;
+#endif
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+template <size_t Size> struct Bcmp {
+  using ME = Bcmp;
+  static constexpr size_t SIZE = Size;
+  static inline BcmpReturnType block(CPtr, CPtr) {
+    deferred_static_assert("Missing __builtin_memcmp_inline");
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return BcmpReturnType::ZERO();
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcmp
+template <size_t Size> struct Memcmp {
+  using ME = Memcmp;
+  static constexpr size_t SIZE = Size;
+  static inline MemcmpReturnType block(CPtr, CPtr) {
+    deferred_static_assert("Missing __builtin_memcmp_inline");
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) {
+    deferred_static_assert("Not implemented");
+    return MemcmpReturnType::ZERO();
+  }
+};
+
+} // namespace __llvm_libc::builtin
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -0,0 +1,466 @@
+//===-- Generic implementation of memory function building blocks ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides generic C++ building blocks.
+// Depending on the requested size, the block operation uses unsigned integral
+// types, vector types or an array of the type with the maximum size.
+//
+// The maximum size is passed as a template argument. For instance, on x86
+// platforms that only support integral types, the maximum size would be 8
+// (corresponding to uint64_t). On such a platform, a requested size of 32 is
+// treated as a cpp::array<uint64_t, 4>.
+//
+// On the other hand, if the platform is x86 with support for AVX, the maximum
+// size is 32 and the operation can be handled with a single native operation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
+
+#include "src/__support/CPP/array.h"
+#include "src/__support/CPP/type_traits.h"
+#include "src/__support/endian.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/utils.h"
+
+#include <stdint.h>
+
+namespace __llvm_libc::generic {
+
+// CTPair and CTMap below implement a compile time map.
+// This is useful to map from a Size to a type handling this size.
+//
+// Example usage:
+// using MyMap = CTMap<CTPair<1, uint8_t>,
+//                     CTPair<2, uint16_t>>;
+// ...
+// using UInt8T = MyMap::find_type<1>;
+template <size_t I, typename T> struct CTPair {
+  using type = T;
+  static CTPair get_pair(cpp::integral_constant<size_t, I>) { return {}; }
+};
+template <typename... Pairs> struct CTMap : public Pairs... {
+  using Pairs::get_pair...;
+  template <size_t I>
+  using find_type =
+      typename decltype(get_pair(cpp::integral_constant<size_t, I>{}))::type;
+};
+
+// Helper to test if a type is void.
+template <typename T> inline constexpr bool is_void_v = cpp::is_same_v<T, void>;
+
+// Implements load, store and splat for unsigned integral types.
+template <typename T> struct ScalarType {
+  using Type = T;
+  static_assert(cpp::is_integral_v<Type> && !cpp::is_signed_v<Type>);
+
+  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
+  static inline void store(Ptr dst, Type value) {
+    ::__llvm_libc::store<Type>(dst, value);
+  }
+  static inline Type splat(uint8_t value) {
+    return Type(~0) / Type(0xFF) * Type(value);
+  }
+};
+
+// Implements load, store and splat for vector types.
+template <size_t Size> struct VectorType {
+  using Type = uint8_t __attribute__((__vector_size__(Size)));
+  static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
+  static inline void store(Ptr dst, Type value) {
+    ::__llvm_libc::store<Type>(dst, value);
+  }
+  static inline Type splat(uint8_t value) {
+    Type Out;
+    // This for loop is optimized out for vector types.
+    for (size_t i = 0; i < Size; ++i)
+      Out[i] = static_cast<uint8_t>(value);
+    return Out;
+  }
+};
+
+static_assert((UINTPTR_MAX == 4294967295U) ||
+                  (UINTPTR_MAX == 18446744073709551615UL),
+              "We currently only support 32- or 64-bit platforms");
+
+// Map from sizes to structures offering static load, store and splat methods.
+// Note: On platforms lacking vector support, we use the ArrayType below and
+// decompose the operation into smaller pieces.
+using NativeTypeMap =
+    CTMap<CTPair<1, ScalarType<uint8_t>>,  //
+          CTPair<2, ScalarType<uint16_t>>, //
+          CTPair<4, ScalarType<uint32_t>>, //
+#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
+          CTPair<8, ScalarType<uint64_t>>, // Not available on 32bit
+#endif                                     //
+          CTPair<16, VectorType<16>>,      //
+          CTPair<32, VectorType<32>>,      //
+          CTPair<64, VectorType<64>>>;
+
+// Implements load, store and splat for sizes not natively supported by the
+// platform. SubType is either ScalarType or VectorType.
+template <typename SubType, size_t ArraySize> struct ArrayType {
+  using Type = cpp::array<typename SubType::Type, ArraySize>;
+  static constexpr size_t SizeOfElement = sizeof(typename SubType::Type);
+  static inline Type load(CPtr src) {
+    Type Value;
+    for (size_t I = 0; I < ArraySize; ++I)
+      Value[I] = SubType::load(src + (I * SizeOfElement));
+    return Value;
+  }
+  static inline void store(Ptr dst, Type Value) {
+    for (size_t I = 0; I < ArraySize; ++I)
+      SubType::store(dst + (I * SizeOfElement), Value[I]);
+  }
+  static inline Type splat(uint8_t value) {
+    Type Out;
+    for (size_t I = 0; I < ArraySize; ++I)
+      Out[I] = SubType::splat(value);
+    return Out;
+  }
+};
+
+// Checks whether we should use an ArrayType.
+template <size_t Size, size_t MaxSize> static constexpr bool useArrayType() {
+  return (Size > MaxSize) && ((Size % MaxSize) == 0) &&
+         !is_void_v<NativeTypeMap::find_type<MaxSize>>;
+}
+
+// Compute the type to handle an operation of Size bytes knowing that the
+// underlying platform only supports native types up to MaxSize bytes.
+template <size_t Size, size_t MaxSize>
+using getTypeFor = cpp::conditional_t<
+    useArrayType<Size, MaxSize>(),
+    ArrayType<NativeTypeMap::find_type<MaxSize>, Size / MaxSize>,
+    NativeTypeMap::find_type<Size>>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcpy
+// When building with Clang we can delegate to the builtin implementation.
+///////////////////////////////////////////////////////////////////////////////
+
+template <size_t Size> using Memcpy = builtin::Memcpy<Size>;
+
+///////////////////////////////////////////////////////////////////////////////
+// Memset
+// The MaxSize template argument gives the maximum size handled natively by the
+// platform. For instance, on x86 with AVX support this would be 32. If a size
+// greater than MaxSize is requested, we break the operation down into smaller
+// pieces of size MaxSize.
+///////////////////////////////////////////////////////////////////////////////
+template <size_t Size, size_t MaxSize> struct Memset {
+  static_assert(is_power2(MaxSize));
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t value) {
+    if constexpr (Size == 3) {
+      Memset<1, MaxSize>::block(dst + 2, value);
+      Memset<2, MaxSize>::block(dst, value);
+    } else {
+      using T = getTypeFor<Size, MaxSize>;
+      if constexpr (is_void_v<T>) {
+        deferred_static_assert("Unimplemented Size");
+      } else {
+        T::store(dst, T::splat(value));
+      }
+    }
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    static_assert(SIZE > 1);
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+///////////////////////////////////////////////////////////////////////////////
+template <size_t Size> struct Bcmp {
+  static constexpr size_t SIZE = Size;
+  static constexpr size_t MaxSize = 8;
+
+  template <typename T> static inline uint32_t load_xor(CPtr p1, CPtr p2) {
+    return load<T>(p1) ^ load<T>(p2);
+  }
+
+  template <typename T>
+  static inline uint32_t load_not_equal(CPtr p1, CPtr p2) {
+    return load<T>(p1) != load<T>(p2);
+  }
+
+  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == 1) {
+      return load_xor<uint8_t>(p1, p2);
+    } else if constexpr (Size == 2) {
+      return load_xor<uint16_t>(p1, p2);
+    } else if constexpr (Size == 4) {
+      return load_xor<uint32_t>(p1, p2);
+    } else if constexpr (Size == 8) {
+      return load_not_equal<uint64_t>(p1, p2);
+    } else if constexpr (useArrayType<Size, MaxSize>()) {
+      for (size_t offset = 0; offset < Size; offset += MaxSize)
+        if (auto value = Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
+          return value;
+    } else {
+      deferred_static_assert("Unimplemented Size");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - SIZE, p2 + count - SIZE);
+  }
+
+  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1, p2) | tail(p1, p2, count);
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    return tail(p1, p2, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcmp
+///////////////////////////////////////////////////////////////////////////////
+template <size_t Size> struct Memcmp {
+  static constexpr size_t SIZE = Size;
+  static constexpr size_t MaxSize = 8;
+
+  template <typename T> static inline T load_be(CPtr ptr) {
+    return Endian::to_big_endian(load<T>(ptr));
+  }
+
+  template <typename T>
+  static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) {
+    return load_be<T>(p1) - load_be<T>(p2);
+  }
+
+  template <typename T>
+  static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) {
+    const auto la = load_be<T>(p1);
+    const auto lb = load_be<T>(p2);
+    return la > lb ? 1 : la < lb ? -1 : 0;
+  }
+
+  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == 1) {
+      return load_be_diff<uint8_t>(p1, p2);
+    } else if constexpr (Size == 2) {
+      return load_be_diff<uint16_t>(p1, p2);
+    } else if constexpr (Size == 4) {
+      return load_be_cmp<uint32_t>(p1, p2);
+    } else if constexpr (Size == 8) {
+      return load_be_cmp<uint64_t>(p1, p2);
+    } else if constexpr (useArrayType<Size, MaxSize>()) {
+      for (size_t offset = 0; offset < Size; offset += MaxSize)
+        if (Bcmp<MaxSize>::block(p1 + offset, p2 + offset))
+          return Memcmp<MaxSize>::block(p1 + offset, p2 + offset);
+      return MemcmpReturnType::ZERO();
+    } else if constexpr (Size == 3) {
+      if (auto value = Memcmp<2>::block(p1, p2))
+        return value;
+      return Memcmp<1>::block(p1 + 2, p2 + 2);
+    } else {
+      deferred_static_assert("Unimplemented Size");
+    }
+  }
+
+  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - SIZE, p2 + count - SIZE);
+  }
+
+  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if (auto value = block(p1, p2))
+      return value;
+    return tail(p1, p2, count);
+  }
+
+  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    return tail(p1, p2, count);
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Memmove
+///////////////////////////////////////////////////////////////////////////////
+
+template <size_t Size, size_t MaxSize> struct Memmove {
+  static_assert(is_power2(MaxSize));
+  using T = getTypeFor<Size, MaxSize>;
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, CPtr src) {
+    if constexpr (is_void_v<T>) {
+      deferred_static_assert("Unimplemented Size");
+    } else {
+      T::store(dst, T::load(src));
+    }
+  }
+
+  static inline void head_tail(Ptr dst, CPtr src, size_t count) {
+    const size_t offset = count - Size;
+    if constexpr (is_void_v<T>) {
+      deferred_static_assert("Unimplemented Size");
+    } else {
+      // The load and store operations can be performed in any order as long as
+      // they are not interleaved. More investigations are needed to determine
+      // the best order.
+      const auto head = T::load(src);
+      const auto tail = T::load(src + offset);
+      T::store(dst, head);
+      T::store(dst + offset, tail);
+    }
+  }
+
+  // Align forward is suitable when dst < src. The alignment is performed with
+  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
+  //
+  // e.g. Moving two bytes forward, we make sure src is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
+  // [____LLLLLLLL_____________________]
+  // [___________LLLLLLLA______________]
+  // [_SSSSSSSS________________________]
+  // [________SSSSSSSS_________________]
+  //
+  // e.g. Moving two bytes forward, we make sure dst is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_]
+  // [____LLLLLLLL_____________________]
+  // [______LLLLLLLL___________________]
+  // [_SSSSSSSS________________________]
+  // [___SSSSSSSA______________________]
+  template <Arg AlignOn>
+  static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) {
+    Ptr prev_dst = dst;
+    CPtr prev_src = src;
+    size_t prev_count = count;
+    align_to_next_boundary<Size, AlignOn>(dst, src, count);
+    adjust(Size, dst, src, count);
+    head_tail(prev_dst, prev_src, prev_count - count);
+  }
+
+  // Align backward is suitable when dst > src. The alignment is performed with
+  // a HeadTail operation of count ∈ [Alignment, 2 x Alignment].
+  //
+  // e.g. Moving two bytes backward, we make sure src is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
+  // [ _________________ALLLLLLL_______]
+  // [ ___________________LLLLLLLL_____]
+  // [____________________SSSSSSSS_____]
+  // [______________________SSSSSSSS___]
+  //
+  // e.g. Moving two bytes backward, we make sure dst is aligned.
+  // [  |       |       |       |      ]
+  // [____XXXXXXXXXXXXXXXXXXXXXXXX_____]
+  // [ _______________LLLLLLLL_________]
+  // [ ___________________LLLLLLLL_____]
+  // [__________________ASSSSSSS_______]
+  // [______________________SSSSSSSS___]
+  template <Arg AlignOn>
+  static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) {
+    Ptr headtail_dst = dst + count;
+    CPtr headtail_src = src + count;
+    size_t headtail_size = 0;
+    align_to_next_boundary<Size, AlignOn>(headtail_dst, headtail_src,
+                                          headtail_size);
+    adjust(-2 * Size, headtail_dst, headtail_src, headtail_size);
+    head_tail(headtail_dst, headtail_src, headtail_size);
+    count -= headtail_size;
+  }
+
+  // Move forward is suitable when dst < src. We load the tail bytes before
+  // handling the loop.
+  //
+  // e.g. Moving two bytes
+  // [   |       |       |       |       |]
+  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+  // [_________________________LLLLLLLL___]
+  // [___LLLLLLLL_________________________]
+  // [_SSSSSSSS___________________________]
+  // [___________LLLLLLLL_________________]
+  // [_________SSSSSSSS___________________]
+  // [___________________LLLLLLLL_________]
+  // [_________________SSSSSSSS___________]
+  // [_______________________SSSSSSSS_____]
+  static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    const size_t tail_offset = count - Size;
+    const auto tail_value = T::load(src + tail_offset);
+    size_t offset = 0;
+#pragma nounroll
+    do {
+      block(dst + offset, src + offset);
+      offset += Size;
+    } while (offset < count - Size);
+    T::store(dst + tail_offset, tail_value);
+  }
+
+  // Move backward is suitable when dst > src. We load the head bytes before
+  // handling the loop.
+  //
+  // e.g. Moving two bytes
+  // [   |       |       |       |       |]
+  // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___]
+  // [___LLLLLLLL_________________________]
+  // [_________________________LLLLLLLL___]
+  // [___________________________SSSSSSSS_]
+  // [_________________LLLLLLLL___________]
+  // [___________________SSSSSSSS_________]
+  // [_________LLLLLLLL___________________]
+  // [___________SSSSSSSS_________________]
+  // [_____SSSSSSSS_______________________]
+  static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    const auto head_value = T::load(src);
+    ptrdiff_t offset = count - Size;
+#pragma nounroll
+    do {
+      block(dst + offset, src + offset);
+      offset -= Size;
+    } while (offset >= 0);
+    T::store(dst, head_value);
+  }
+};
+
+} // namespace __llvm_libc::generic
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -0,0 +1,221 @@
+//===-- x86 implementation of memory function building blocks -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides x86 specific building blocks to compose memory functions.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
+#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
+
+#include "src/__support/architectures.h"
+
+#if defined(LLVM_LIBC_ARCH_X86_64)
+
+#include "src/__support/common.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+
+#ifdef __SSE2__
+#include <immintrin.h>
+#else
+// Define stub macros to prevent the compiler from failing on undefined
+// intrinsics when SSE2 is not present.
+#define _mm512_cmpneq_epi8_mask(A, B) 0
+#define _mm_movemask_epi8(A) 0
+#define _mm256_movemask_epi8(A) 0
+#endif //  __SSE2__
+
+namespace __llvm_libc::x86 {
+
+// A set of constants to check compile-time features.
+static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__);
+static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__);
+static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__);
+static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__);
+static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__);
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcpy repmovsb implementation
+struct Memcpy {
+  static void repmovsb(char *dst, const char *src, size_t count) {
+    asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory");
+  }
+};
+
+///////////////////////////////////////////////////////////////////////////////
+// Bcmp
+
+// Base implementation for the Bcmp specializations.
+//  - BlockSize is either 16, 32 or 64 depending on the available compile-time
+// features; it is used to switch between a "single native operation" and a
+// "sequence of native operations".
+//  - BlockBcmp is the function that implements the bcmp logic.
+template <size_t Size, size_t BlockSize, auto BlockBcmp> struct BcmpImpl {
+  static constexpr size_t SIZE = Size;
+  static inline BcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == BlockSize) {
+      return BlockBcmp(p1, p2);
+    } else if constexpr (Size % BlockSize == 0) {
+      for (size_t offset = 0; offset < Size; offset += BlockSize)
+        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
+          return value;
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return BcmpReturnType::ZERO();
+  }
+
+  static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - Size, p2 + count - Size);
+  }
+
+  static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1, p2) | tail(p1, p2, count);
+  }
+
+  static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += Size;
+    } while (offset < count - Size);
+    return tail(p1, p2, count);
+  }
+};
+
+namespace sse2 {
+static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(16)));
+  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
+  const int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2));
+  return static_cast<uint32_t>(mask);
+}
+template <size_t Size> using Bcmp = BcmpImpl<Size, 16, bcmp16>;
+} // namespace sse2
+
+namespace avx2 {
+static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(32)));
+  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
+  const int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2));
+  // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
+  // mask.
+  return static_cast<uint32_t>(mask);
+}
+template <size_t Size> using Bcmp = BcmpImpl<Size, 32, bcmp32>;
+} // namespace avx2
+
+namespace avx512bw {
+static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(64)));
+  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
+  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
+  const bool mask_is_set = mask != 0;
+  return static_cast<uint32_t>(mask_is_set);
+}
+template <size_t Size> using Bcmp = BcmpImpl<Size, 64, bcmp64>;
+} // namespace avx512bw
+
+// Assuming that the mask is nonzero, the index of the first mismatching byte
+// is the number of trailing zeros in the mask. We count trailing zeros rather
+// than leading zeros because the x86 architecture is little-endian.
+static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2,
+                                                 uint64_t mask) {
+  const size_t diff_index = __builtin_ctzll(mask);
+  const int16_t ca = cpp::bit_cast<uint8_t>(p1[diff_index]);
+  const int16_t cb = cpp::bit_cast<uint8_t>(p2[diff_index]);
+  return ca - cb;
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Memcmp
+
+// Base implementation for the Memcmp specializations.
+//  - BlockSize is either 16, 32 or 64 depending on the available compile-time
+// features; it is used to switch between a "single native operation" and a
+// "sequence of native operations".
+//  - BlockMemcmp is the function that implements the memcmp logic.
+//  - BlockBcmp is the function that implements the bcmp logic.
+template <size_t Size, size_t BlockSize, auto BlockMemcmp, auto BlockBcmp>
+struct MemcmpImpl {
+  static constexpr size_t SIZE = Size;
+  static inline MemcmpReturnType block(CPtr p1, CPtr p2) {
+    if constexpr (Size == BlockSize) {
+      return BlockMemcmp(p1, p2);
+    } else if constexpr (Size % BlockSize == 0) {
+      for (size_t offset = 0; offset < Size; offset += BlockSize)
+        if (auto value = BlockBcmp(p1 + offset, p2 + offset))
+          return BlockMemcmp(p1 + offset, p2 + offset);
+    } else {
+      deferred_static_assert("SIZE not implemented");
+    }
+    return MemcmpReturnType::ZERO();
+  }
+
+  static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) {
+    return block(p1 + count - Size, p2 + count - Size);
+  }
+
+  static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) {
+    if (auto value = block(p1, p2))
+      return value;
+    return tail(p1, p2, count);
+  }
+
+  static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) {
+    static_assert(Size > 1, "a loop of size 1 does not need tail");
+    size_t offset = 0;
+    do {
+      if (auto value = block(p1 + offset, p2 + offset))
+        return value;
+      offset += Size;
+    } while (offset < count - Size);
+    return tail(p1, p2, count);
+  }
+};
+
+namespace sse2 {
+static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(16)));
+  // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
+  if (int mask = _mm_movemask_epi8(load<T>(p1) != load<T>(p2)))
+    return char_diff_no_zero(p1, p2, mask);
+  return MemcmpReturnType::ZERO();
+}
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 16, memcmp16, bcmp16>;
+} // namespace sse2
+
+namespace avx2 {
+static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(32)));
+  // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
+  if (int mask = _mm256_movemask_epi8(load<T>(p1) != load<T>(p2)))
+    return char_diff_no_zero(p1, p2, mask);
+  return MemcmpReturnType::ZERO();
+}
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 32, memcmp32, bcmp32>;
+} // namespace avx2
+
+namespace avx512bw {
+static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
+  using T = char __attribute__((__vector_size__(64)));
+  // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
+  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
+    return char_diff_no_zero(p1, p2, mask);
+  return MemcmpReturnType::ZERO();
+}
+template <size_t Size> using Memcmp = MemcmpImpl<Size, 64, memcmp64, bcmp64>;
+} // namespace avx512bw
+
+} // namespace __llvm_libc::x86
+
+#endif // LLVM_LIBC_ARCH_X86_64
+
+#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H
diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h
--- a/libc/src/string/memory_utils/utils.h
+++ b/libc/src/string/memory_utils/utils.h
@@ -106,6 +106,41 @@
 using Ptr = char *;        // Pointer to raw data.
 using CPtr = const char *; // Const pointer to raw data.
 
+// This type makes sure that we don't accidentally promote an integral type to
+// another one. It is only constructible from the exact T type.
+template <typename T> struct StrictIntegralType {
+  static_assert(cpp::is_integral_v<T>);
+
+  // Can only be constructed from a T.
+  template <typename U, cpp::enable_if_t<cpp::is_same_v<U, T>, bool> = 0>
+  StrictIntegralType(U value) : value(value) {}
+
+  // Allows using the type in an if statement.
+  explicit operator bool() const { return value; }
+
+  // If the type is unsigned (bcmp) we allow bitwise OR operations.
+  StrictIntegralType operator|(const StrictIntegralType &Rhs) const {
+    static_assert(!cpp::is_signed_v<T>);
+    return value | Rhs.value;
+  }
+
+  // For interaction with the C API we allow explicit conversion back to the
+  // `int` type.
+  explicit operator int() const {
+    // bit_cast makes sure that T and int have the same size.
+    return cpp::bit_cast<int>(value);
+  }
+
+  // Helper to get the zero value.
+  static inline constexpr StrictIntegralType ZERO() { return {T(0)}; }
+
+private:
+  T value;
+};
+
+using MemcmpReturnType = StrictIntegralType<int32_t>;
+using BcmpReturnType = StrictIntegralType<uint32_t>;
+
 // Loads bytes from memory (possibly unaligned) and materializes them as
 // type.
 template <typename T> static inline T load(CPtr ptr) {
diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt
--- a/libc/test/src/string/memory_utils/CMakeLists.txt
+++ b/libc/test/src/string/memory_utils/CMakeLists.txt
@@ -5,6 +5,7 @@
   SRCS
     elements_test.cpp
     memory_access_test.cpp
+    op_tests.cpp
     utils_test.cpp
   COMPILE_OPTIONS
     ${LIBC_COMPILE_OPTIONS_NATIVE}
diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp
new file mode 100644
--- /dev/null
+++ b/libc/test/src/string/memory_utils/op_tests.cpp
@@ -0,0 +1,452 @@
+//===-- Unittests for op_ files -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/limits.h"
+#include "src/__support/CPP/span.h"
+#include "src/string/memory_utils/op_aarch64.h"
+#include "src/string/memory_utils/op_builtin.h"
+#include "src/string/memory_utils/op_generic.h"
+#include "src/string/memory_utils/op_x86.h"
+#include "src/string/memory_utils/utils.h"
+#include "utils/UnitTest/Test.h"
+
+#include <assert.h>
+#include <stdlib.h>
+
+// Per the sanitizer interface, user code should use these macros instead of
+// calling the __asan_* functions directly.
+#if __has_feature(address_sanitizer) || defined(__SANITIZE_ADDRESS__)
+#include <sanitizer/asan_interface.h>
+#define ASAN_POISON_MEMORY_REGION(addr, size)                                  \
+  __asan_poison_memory_region((addr), (size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size)                                \
+  __asan_unpoison_memory_region((addr), (size))
+#else
+#define ASAN_POISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#define ASAN_UNPOISON_MEMORY_REGION(addr, size) ((void)(addr), (void)(size))
+#endif
+
+#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64)
+#define LLVM_LIBC_HAS_UINT64
+#endif
+
+namespace __llvm_libc {
+
+static char GetRandomChar() {
+  static constexpr const uint64_t a = 1103515245;
+  static constexpr const uint64_t c = 12345;
+  static constexpr const uint64_t m = 1ULL << 31;
+  static uint64_t seed = 123456789;
+  seed = (a * seed + c) % m;
+  return seed;
+}
+
+// Randomize the content of the buffer.
+static void Randomize(cpp::span<char> buffer) {
+  for (auto &current : buffer)
+    current = GetRandomChar();
+}
+
+// Copy one span to another.
+static void Copy(cpp::span<char> dst, const cpp::span<char> src) {
+  assert(dst.size() == src.size());
+  for (size_t i = 0; i < dst.size(); ++i)
+    dst[i] = src[i];
+}
+
+// Simple structure to allocate a buffer of a particular size.
+struct PoisonedBuffer {
+  PoisonedBuffer(size_t size) : ptr((char *)malloc(size)) {
+    assert(ptr);
+    ASAN_POISON_MEMORY_REGION(ptr, size);
+  }
+  ~PoisonedBuffer() { free(ptr); }
+
+protected:
+  char *ptr = nullptr;
+};
+
+// Simple structure to allocate a buffer (aligned or not) of a particular size.
+// It is backed by a wider buffer that is marked poisoned when ASAN is present.
+// The requested region is unpoisoned; this allows catching out-of-bounds
+// accesses.
+enum class Aligned : bool { NO = false, YES = true };
+struct Buffer : private PoisonedBuffer {
+  static constexpr size_t kAlign = 64;
+  static constexpr size_t kLeeway = 2 * kAlign;
+  Buffer(size_t size, Aligned aligned = Aligned::YES)
+      : PoisonedBuffer(size + kLeeway), size(size) {
+    offset_ptr = ptr;
+    offset_ptr += distance_to_next_aligned<kAlign>(ptr);
+    assert((uintptr_t)(offset_ptr) % kAlign == 0);
+    if (aligned == Aligned::NO)
+      ++offset_ptr;
+    assert(offset_ptr > ptr);
+    assert((offset_ptr + size) < (ptr + size + kLeeway));
+    ASAN_UNPOISON_MEMORY_REGION(offset_ptr, size);
+  }
+  cpp::span<char> span() { return cpp::span<char>(offset_ptr, size); }
+
+private:
+  size_t size = 0;
+  char *offset_ptr = nullptr;
+};
+
+// Allocates two Buffers and extracts two spans out of them, one
+// aligned and one misaligned. Tests are run on both spans.
+struct Buffers {
+  Buffers(size_t size)
+      : aligned_buffer(size, Aligned::YES),
+        misaligned_buffer(size, Aligned::NO) {}
+
+  // Returns two spans of 'size' bytes. The first is aligned on
+  // Buffer::kAlign and the second one is unaligned.
+  cpp::array<cpp::span<char>, 2> spans() {
+    return {aligned_buffer.span(), misaligned_buffer.span()};
+  }
+
+  Buffer aligned_buffer;
+  Buffer misaligned_buffer;
+};
+
+using MemcpyImplementations = testing::TypeList<
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
+    builtin::Memcpy<1>,  //
+    builtin::Memcpy<2>,  //
+    builtin::Memcpy<3>,  //
+    builtin::Memcpy<4>,  //
+    builtin::Memcpy<8>,  //
+    builtin::Memcpy<16>, //
+    builtin::Memcpy<32>, //
+    builtin::Memcpy<64>
+#endif // LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
+    >;
+
+template <auto FnImpl>
+bool CheckMemcpy(cpp::span<char> dst, cpp::span<char> src, size_t size) {
+  assert(dst.size() == src.size());
+  assert(dst.size() == size);
+  Randomize(dst);
+  FnImpl(dst.data(), src.data(), size);
+  for (size_t i = 0; i < size; ++i)
+    if (dst[i] != src[i])
+      return false;
+  return true;
+}
+
+template <typename T>
+static void MemcpyAdaptor(Ptr dst, CPtr src, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(dst, src);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Memcpy, MemcpyImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers SrcBuffer(kSize);
+    Buffers DstBuffer(kSize);
+    for (auto src : SrcBuffer.spans()) {
+      Randomize(src);
+      for (auto dst : DstBuffer.spans()) {
+        ASSERT_TRUE(CheckMemcpy<MemcpyAdaptor<Impl>>(dst, src, kSize));
+      }
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    Buffer SrcBuffer(2 * kSize);
+    Buffer DstBuffer(2 * kSize);
+    Randomize(SrcBuffer.span());
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      auto src = SrcBuffer.span().subspan(0, size);
+      auto dst = DstBuffer.span().subspan(0, size);
+      ASSERT_TRUE(CheckMemcpy<Impl::head_tail>(dst, src, size));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      Buffer SrcBuffer(3 * kSize);
+      Buffer DstBuffer(3 * kSize);
+      Randomize(SrcBuffer.span());
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        auto src = SrcBuffer.span().subspan(0, size);
+        auto dst = DstBuffer.span().subspan(0, size);
+        ASSERT_TRUE(CheckMemcpy<Impl::loop_and_tail>(dst, src, size));
+      }
+    }
+  }
+}
+
+using MemsetImplementations = testing::TypeList<
+#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE
+    builtin::Memset<1>,  //
+    builtin::Memset<2>,  //
+    builtin::Memset<3>,  //
+    builtin::Memset<4>,  //
+    builtin::Memset<8>,  //
+    builtin::Memset<16>, //
+    builtin::Memset<32>, //
+    builtin::Memset<64>,
+#endif
+#ifdef LLVM_LIBC_HAS_UINT64
+    generic::Memset<8, 8>,  //
+    generic::Memset<16, 8>, //
+    generic::Memset<32, 8>, //
+    generic::Memset<64, 8>, //
+#endif
+#ifdef __AVX512F__
+    generic::Memset<64, 64>, // prevents warning about avx512f
+#endif
+    generic::Memset<1, 1>,   //
+    generic::Memset<2, 1>,   //
+    generic::Memset<2, 2>,   //
+    generic::Memset<4, 2>,   //
+    generic::Memset<4, 4>,   //
+    generic::Memset<16, 16>, //
+    generic::Memset<32, 16>, //
+    generic::Memset<64, 16>, //
+    generic::Memset<32, 32>, //
+    generic::Memset<64, 32>  //
+    >;
+
+template <auto FnImpl>
+bool CheckMemset(cpp::span<char> dst, uint8_t value, size_t size) {
+  Randomize(dst);
+  FnImpl(dst.data(), value, size);
+  for (char c : dst)
+    if (c != (char)value)
+      return false;
+  return true;
+}
+
+template <typename T>
+static void MemsetAdaptor(Ptr dst, uint8_t value, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(dst, value);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers DstBuffer(kSize);
+    for (uint8_t value : cpp::array<uint8_t, 3>{0, 1, 255}) {
+      for (auto dst : DstBuffer.spans()) {
+        ASSERT_TRUE(CheckMemset<MemsetAdaptor<Impl>>(dst, value, kSize));
+      }
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    Buffer DstBuffer(2 * kSize);
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      const char value = size % 10;
+      auto dst = DstBuffer.span().subspan(0, size);
+      ASSERT_TRUE(CheckMemset<Impl::head_tail>(dst, value, size));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      Buffer DstBuffer(3 * kSize);
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        const char value = size % 10;
+        auto dst = DstBuffer.span().subspan(0, size);
+        ASSERT_TRUE((CheckMemset<Impl::loop_and_tail>(dst, value, size)));
+      }
+    }
+  }
+}
+
+using BcmpImplementations = testing::TypeList<
+#ifdef __SSE2__
+    x86::sse2::Bcmp<16>,  //
+    x86::sse2::Bcmp<32>,  //
+    x86::sse2::Bcmp<64>,  //
+    x86::sse2::Bcmp<128>, //
+#endif
+#ifdef __AVX2__
+    x86::avx2::Bcmp<32>,  //
+    x86::avx2::Bcmp<64>,  //
+    x86::avx2::Bcmp<128>, //
+#endif
+#ifdef __AVX512BW__
+    x86::avx512bw::Bcmp<64>,  //
+    x86::avx512bw::Bcmp<128>, //
+#endif
+#ifdef __ARM_NEON
+    aarch64::Bcmp<16>, //
+    aarch64::Bcmp<32>, //
+#endif
+#ifdef LLVM_LIBC_HAS_UINT64
+    generic::Bcmp<8>, //
+#endif
+    generic::Bcmp<1>,  //
+    generic::Bcmp<2>,  //
+    generic::Bcmp<4>,  //
+    generic::Bcmp<16>, //
+    generic::Bcmp<32>, //
+    generic::Bcmp<64>  //
+    >;
+
+template <auto FnImpl>
+bool CheckBcmp(cpp::span<char> span1, cpp::span<char> span2, size_t size) {
+  assert(span1.size() == span2.size());
+  Copy(span2, span1);
+  // Compare equal
+  if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp != 0)
+    return false;
+  // Compare not equal if any byte differs
+  for (size_t i = 0; i < size; ++i) {
+    ++span2[i];
+    if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp == 0)
+      return false;
+    if (int cmp = (int)FnImpl(span2.data(), span1.data(), size); cmp == 0)
+      return false;
+    --span2[i];
+  }
+  return true;
+}
+
+template <typename T>
+static BcmpReturnType BcmpAdaptor(CPtr p1, CPtr p2, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(p1, p2);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers Buffer1(kSize);
+    Buffers Buffer2(kSize);
+    for (auto span1 : Buffer1.spans()) {
+      Randomize(span1);
+      for (auto span2 : Buffer2.spans())
+        ASSERT_TRUE((CheckBcmp<BcmpAdaptor<Impl>>(span1, span2, kSize)));
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    Buffer Buffer1(2 * kSize);
+    Buffer Buffer2(2 * kSize);
+    Randomize(Buffer1.span());
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      auto span1 = Buffer1.span().subspan(0, size);
+      auto span2 = Buffer2.span().subspan(0, size);
+      ASSERT_TRUE((CheckBcmp<Impl::head_tail>(span1, span2, size)));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      Buffer Buffer1(3 * kSize);
+      Buffer Buffer2(3 * kSize);
+      Randomize(Buffer1.span());
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        auto span1 = Buffer1.span().subspan(0, size);
+        auto span2 = Buffer2.span().subspan(0, size);
+        ASSERT_TRUE((CheckBcmp<Impl::loop_and_tail>(span1, span2, size)));
+      }
+    }
+  }
+}
+
+using MemcmpImplementations = testing::TypeList<
+#ifdef __SSE2__
+    x86::sse2::Memcmp<16>,  //
+    x86::sse2::Memcmp<32>,  //
+    x86::sse2::Memcmp<64>,  //
+    x86::sse2::Memcmp<128>, //
+#endif
+#ifdef __AVX2__
+    x86::avx2::Memcmp<32>,  //
+    x86::avx2::Memcmp<64>,  //
+    x86::avx2::Memcmp<128>, //
+#endif
+#ifdef __AVX512BW__
+    x86::avx512bw::Memcmp<64>,  //
+    x86::avx512bw::Memcmp<128>, //
+#endif
+#ifdef LLVM_LIBC_HAS_UINT64
+    generic::Memcmp<8>, //
+#endif
+    generic::Memcmp<1>,  //
+    generic::Memcmp<2>,  //
+    generic::Memcmp<3>,  //
+    generic::Memcmp<4>,  //
+    generic::Memcmp<16>, //
+    generic::Memcmp<32>, //
+    generic::Memcmp<64>  //
+    >;
+
+template <auto FnImpl>
+bool CheckMemcmp(cpp::span<char> span1, cpp::span<char> span2, size_t size) {
+  assert(span1.size() == span2.size());
+  Copy(span2, span1);
+  // Compare equal
+  if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp != 0)
+    return false;
+  // Compare not equal if any byte differs
+  for (size_t i = 0; i < size; ++i) {
+    ++span2[i];
+    int ground_truth = __builtin_memcmp(span1.data(), span2.data(), size);
+    if (ground_truth > 0) {
+      if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp <= 0)
+        return false;
+      if (int cmp = (int)FnImpl(span2.data(), span1.data(), size); cmp >= 0)
+        return false;
+    } else {
+      if (int cmp = (int)FnImpl(span1.data(), span2.data(), size); cmp >= 0)
+        return false;
+      if (int cmp = (int)FnImpl(span2.data(), span1.data(), size); cmp <= 0)
+        return false;
+    }
+    --span2[i];
+  }
+  return true;
+}
+
+template <typename T>
+static MemcmpReturnType MemcmpAdaptor(CPtr p1, CPtr p2, size_t size) {
+  assert(size == T::SIZE);
+  return T::block(p1, p2);
+}
+
+TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) {
+  using Impl = ParamType;
+  constexpr size_t kSize = Impl::SIZE;
+  { // Test block operation
+    Buffers Buffer1(kSize);
+    Buffers Buffer2(kSize);
+    for (auto span1 : Buffer1.spans()) {
+      Randomize(span1);
+      for (auto span2 : Buffer2.spans())
+        ASSERT_TRUE((CheckMemcmp<MemcmpAdaptor<Impl>>(span1, span2, kSize)));
+    }
+  }
+  { // Test head tail operations from kSize to 2 * kSize.
+    Buffer Buffer1(2 * kSize);
+    Buffer Buffer2(2 * kSize);
+    Randomize(Buffer1.span());
+    for (size_t size = kSize; size < 2 * kSize; ++size) {
+      auto span1 = Buffer1.span().subspan(0, size);
+      auto span2 = Buffer2.span().subspan(0, size);
+      ASSERT_TRUE((CheckMemcmp<Impl::head_tail>(span1, span2, size)));
+    }
+  }
+  { // Test loop operations from kSize to 3 * kSize.
+    if constexpr (kSize > 1) {
+      Buffer Buffer1(3 * kSize);
+      Buffer Buffer2(3 * kSize);
+      Randomize(Buffer1.span());
+      for (size_t size = kSize; size < 3 * kSize; ++size) {
+        auto span1 = Buffer1.span().subspan(0, size);
+        auto span2 = Buffer2.span().subspan(0, size);
+        ASSERT_TRUE((CheckMemcmp<Impl::loop_and_tail>(span1, span2, size)));
+      }
+    }
+  }
+}
+
+} // namespace __llvm_libc
diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel
@@ -973,9 +973,13 @@
 cc_library(
     name = "string_memory_utils",
     hdrs = [
-        "src/string/memory_utils/elements.h",
         "src/string/memory_utils/elements_aarch64.h",
         "src/string/memory_utils/elements_x86.h",
+        "src/string/memory_utils/elements.h",
+        "src/string/memory_utils/op_aarch64.h",
+        "src/string/memory_utils/op_builtin.h",
+        "src/string/memory_utils/op_generic.h",
+        "src/string/memory_utils/op_x86.h",
         "src/string/memory_utils/utils.h",
     ],
     textual_hdrs = [
@@ -989,6 +993,7 @@
         ":__support_common",
         ":__support_cpp_bit",
         ":__support_cpp_type_traits",
+        ":__support_cpp_array",
         ":libc_root",
     ],
 )