diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt
--- a/libc/src/string/memory_utils/CMakeLists.txt
+++ b/libc/src/string/memory_utils/CMakeLists.txt
@@ -2,13 +2,17 @@
 add_header_library(
   memory_utils
   HDRS
-    utils.h
-    elements.h
     bcmp_implementations.h
     bzero_implementations.h
+    elements.h
     memcmp_implementations.h
     memcpy_implementations.h
     memset_implementations.h
+    op_aarch64.h
+    op_builtin.h
+    op_generic.h
+    op_x86.h
+    utils.h
   DEPS
     libc.src.__support.CPP.bit
 )
diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md
new file mode 100644
--- /dev/null
+++ b/libc/src/string/memory_utils/README.md
@@ -0,0 +1,97 @@
+# The mem* framework
+
+The framework handles the following mem* functions:
+ - `memcpy`
+ - `memmove`
+ - `memset`
+ - `bzero`
+ - `bcmp`
+ - `memcmp`
+
+## Building blocks
+
+These functions can be built out of a set of lower-level operations:
+ - **`block`** : operates on a block of `SIZE` bytes.
+ - **`tail`** : operates on the last `SIZE` bytes of the buffer (i.e., `[dst + count - SIZE, dst + count]`).
+ - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`.
+ - **`loop_and_tail`** : calls `block` in a loop to consume as much of the `count` bytes as possible and handles the remaining bytes with a `tail` operation.
+
+As an illustration, let's take the example of a trivial `memset` implementation:
+
+```C++
+extern "C" void memset(char* dst, int value, size_t count) {
+  if (count == 0) return;
+  if (count == 1) return Memset<1>::block(dst, value);
+  if (count == 2) return Memset<2>::block(dst, value);
+  if (count == 3) return Memset<3>::block(dst, value);
+  if (count <= 8) return Memset<4>::head_tail(dst, value, count);  // Note that 0 to 4 bytes are written twice.
+  if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here.
+  return Memset<16>::loop_and_tail(dst, value, count);
+}
+```
+
+Now let's have a look into the `Memset` structure:
+
+```C++
+template <size_t Size>
+struct Memset {
+  static constexpr size_t SIZE = Size;
+
+  static inline void block(Ptr dst, uint8_t value) {
+    // Implement me
+  }
+
+  static inline void tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst + count - SIZE, value);
+  }
+
+  static inline void head_tail(Ptr dst, uint8_t value, size_t count) {
+    block(dst, value);
+    tail(dst, value, count);
+  }
+
+  static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) {
+    size_t offset = 0;
+    do {
+      block(dst + offset, value);
+      offset += SIZE;
+    } while (offset < count - SIZE);
+    tail(dst, value, count);
+  }
+};
+```
+
+As you can see, `tail`, `head_tail` and `loop_and_tail` are higher-order functions that build on each other. Only `block` really needs to be implemented.
+Earlier designs implemented these higher-order functions as function templates, but spelling the implementation out explicitly turned out to be more readable.
+**This design is useful because it provides customization points.** For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics.
+
+## Scoped specializations
+
+We can have several specializations of the `Memset` structure. Depending on the target requirements, one or several scopes can be used for the same implementation.
+
+In the following example we use the `generic` implementation for the small sizes but the `x86` implementation for the loop.
+
+```C++ + extern "C" void memset(const char* dst, int value, size_t count) { + if (count == 0) return; + if (count == 1) return generic::Memset<1>::block(dst, value); + if (count == 2) return generic::Memset<2>::block(dst, value); + if (count == 3) return generic::Memset<3>::block(dst, value); + if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count); + if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count); + return x86::Memset<16>::loop_and_tail(dst, value, count); +} +``` + +### The `builtin` scope + +Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)) + +### The `generic` scope + +In this scope we define pure C++ implementations using native integral types and clang vector extensions. + +### The arch specific scopes + +Then comes implementations that are using specific architectures or microarchitectures features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`). + +The purpose here is to rely on builtins as much as possible and fallback to `asm volatile` as a last resort. diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_aarch64.h @@ -0,0 +1,175 @@ +//===-- aarch64 implementation of memory function building blocks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides aarch64 specific building blocks to compose memory +// functions. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_AARCH64) + +#include "src/__support/common.h" +#include "src/string/memory_utils/op_generic.h" + +#ifdef __ARM_NEON +#include +#endif //__ARM_NEON + +namespace __llvm_libc::aarch64 { + +static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON); + +namespace neon { + +template struct BzeroCacheLine { + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t) { + static_assert(Size == 64); +#if __SIZEOF_POINTER__ == 4 + asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); +#else + asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); +#endif + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + // Unaligned store, we can't use 'dc zva' here. + static constexpr size_t kMaxSize = kNeon ? 16 : 8; + generic::Memset::tail(dst, value, count); + } +}; + +inline static bool hasZva() { + uint64_t zva_val; + asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); + // DC ZVA is permitted if DZP, bit [4] is zero. + // BS, bits [3:0] is log2 of the block count in words. + // So the next line checks whether the instruction is permitted and block + // count is 16 words (i.e. 64 bytes). 
+ return (zva_val & 0b11111) == 0b00100; +} + +} // namespace neon + +/////////////////////////////////////////////////////////////////////////////// +// Memset + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +template struct Bcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t BlockSize = 32; + + static const unsigned char *as_u8(CPtr ptr) { + return reinterpret_cast(ptr); + } + + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + 16); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + 16); + uint8x16_t an = veorq_u8(a, n); + uint8x16_t bo = veorq_u8(b, o); + // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is + // a difference between the two buffers. We reduce this value down to 4 + // bytes in two steps. First, calculate the saturated move value when + // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get + // a single 32 bit nonzero value if a mismatch occurred. + uint8x16_t anbo = vorrq_u8(an, bo); + uint32x2_t anbo_reduced = vqmovn_u64(anbo); + return vmaxv_u32(anbo_reduced); + } else if constexpr ((Size % BlockSize) == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = Bcmp::block(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if constexpr (Size <= 8) { + return generic::Bcmp::head_tail(p1, p2, count); + } else if constexpr (Size == 16) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + count - 16); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + count - 16); + uint8x16_t an = veorq_s8(a, n); + uint8x16_t bo = veorq_s8(b, o); + // anbo = (a ^ n) | (b ^ o) + uint8x16_t anbo = vorrq_s8(an, bo); + uint32x2_t anbo_reduced = vqmovn_u64(anbo); + return vmaxv_u32(anbo_reduced); + } else if constexpr (Size == 32) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + 16); + uint8x16_t c = vld1q_u8(_p1 + count - 16); + uint8x16_t d = vld1q_u8(_p1 + count - 32); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + 16); + uint8x16_t p = vld1q_u8(_p2 + count - 16); + uint8x16_t q = vld1q_u8(_p2 + count - 32); + uint8x16_t an = veorq_s8(a, n); + uint8x16_t bo = veorq_s8(b, o); + uint8x16_t cp = veorq_s8(c, p); + uint8x16_t dq = veorq_s8(d, q); + uint8x16_t anbo = vorrq_s8(an, bo); + uint8x16_t cpdq = vorrq_s8(cp, dq); + // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to + // a nonzero 32 bit value if a mismatch occurred. 
+ uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); + uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); + return vmaxv_u32(abnocpdq_reduced); + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +} // namespace __llvm_libc::aarch64 + +#endif // LLVM_LIBC_ARCH_AARCH64 + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_builtin.h @@ -0,0 +1,148 @@ +//===-- Implementation using the __builtin_XXX_inline ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides generic C++ building blocks to compose memory functions. +// They rely on the compiler to generate the best possible code through the use +// of the `__builtin_XXX_inline` builtins. These builtins are currently only +// available in Clang. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H + +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc::builtin { + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy +template struct Memcpy { + static constexpr size_t SIZE = Size; + static inline void block(Ptr __restrict dst, CPtr __restrict src) { +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE + return __builtin_memcpy_inline(dst, src, SIZE); +#else + deferred_static_assert("Missing __builtin_memcpy_inline"); + (void)dst; + (void)src; +#endif + } + + static inline void tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + block(dst + count - SIZE, src + count - SIZE); + } + + static inline void head_tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + block(dst, src); + tail(dst, src, count); + } + + static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + block(dst + offset, src + offset); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, src, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memset +template struct Memset { + using ME = Memset; + static constexpr size_t SIZE = Size; + static inline void block(Ptr dst, uint8_t value) { +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE + __builtin_memset_inline(dst, value, Size); +#else + deferred_static_assert("Missing __builtin_memset_inline"); + (void)dst; + (void)value; +#endif + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + static_assert(Size > 
1); + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +template struct Bcmp { + using ME = Bcmp; + static constexpr size_t SIZE = Size; + static inline BcmpReturnType block(CPtr, CPtr) { + deferred_static_assert("Missing __builtin_memcmp_inline"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp +template struct Memcmp { + using ME = Memcmp; + static constexpr size_t SIZE = Size; + static inline MemcmpReturnType block(CPtr, CPtr) { + deferred_static_assert("Missing __builtin_memcmp_inline"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } +}; + +} // namespace __llvm_libc::builtin + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_generic.h @@ -0,0 +1,466 @@ +//===-- Generic implementation of memory function building blocks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides generic C++ building blocks. +// Depending on the requested size, the block operation uses unsigned integral +// types, vector types or an array of the type with the maximum size. +// +// The maximum size is passed as a template argument. For instance, on x86 +// platforms that only supports integral types the maximum size would be 8 +// (corresponding to uint64_t). On this platform if we request the size 32, this +// would be treated as a cpp::array. +// +// On the other hand, if the platform is x86 with support for AVX the maximum +// size is 32 and the operation can be handled with a single native operation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/endian.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/utils.h" + +#include + +namespace __llvm_libc::generic { + +// CTPair and CTMap below implement a compile time map. 
+// This is useful to map from a Size to a type handling this size. +// +// Example usage: +// using MyMap = CTMap, +// CTPair<2, uint16_t>, +// >; +// ... +// using UInt8T = MyMap::find_type<1>; +template struct CTPair { + using type = T; + static CTPair get_pair(cpp::integral_constant) { return {}; } +}; +template struct CTMap : public Pairs... { + using Pairs::get_pair...; + template + using find_type = + typename decltype(get_pair(cpp::integral_constant{}))::type; +}; + +// Helper to test if a type is void. +template inline constexpr bool is_void_v = cpp::is_same_v; + +// Implements load, store and splat for unsigned integral types. +template struct ScalarType { + using Type = T; + static_assert(cpp::is_integral_v && !cpp::is_signed_v); + + static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } + static inline void store(Ptr dst, Type value) { + ::__llvm_libc::store(dst, value); + } + static inline Type splat(uint8_t value) { + return Type(~0) / Type(0xFF) * Type(value); + } +}; + +// Implements load, store and splat for vector types. +template struct VectorType { + using Type = uint8_t __attribute__((__vector_size__(Size))); + static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } + static inline void store(Ptr dst, Type value) { + ::__llvm_libc::store(dst, value); + } + static inline Type splat(uint8_t value) { + Type Out; + // This for loop is optimized out for vector types. + for (size_t i = 0; i < Size; ++i) + Out[i] = static_cast(value); + return Out; + } +}; + +// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit. +static_assert((UINTPTR_MAX == 4294967295U) || + (UINTPTR_MAX == 18446744073709551615UL)); + +// Map from sizes to structures offering static load, store and splat methods. +// Note: On platforms lacking vector support, we use the ArrayType below and +// decompose the operation in smaller pieces. +using NativeTypeMap = + CTMap>, // + CTPair<2, ScalarType>, // + CTPair<4, ScalarType>, // +#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) + CTPair<8, ScalarType>, // Not available on 32bit +#endif // + CTPair<16, VectorType<16>>, // + CTPair<32, VectorType<32>>, // + CTPair<64, VectorType<64>>>; + +// Implements load, store and splat for sizes not natively supported by the +// platform. SubType is either ScalarType or VectorType. +template struct ArrayType { + using Type = cpp::array; + static constexpr size_t SizeOfElement = sizeof(typename SubType::Type); + static inline Type load(CPtr src) { + Type Value; + for (size_t I = 0; I < ArraySize; ++I) + Value[I] = SubType::load(src + (I * SizeOfElement)); + return Value; + } + static inline void store(Ptr dst, Type Value) { + for (size_t I = 0; I < ArraySize; ++I) + SubType::store(dst + (I * SizeOfElement), Value[I]); + } + static inline Type splat(uint8_t value) { + Type Out; + for (size_t I = 0; I < ArraySize; ++I) + Out[I] = SubType::splat(value); + return Out; + } +}; + +// Checks whether we should use an ArrayType. +template static constexpr bool useArrayType() { + return (Size > MaxSize) && ((Size % MaxSize) == 0) && + !is_void_v>; +} + +// Compute the type to handle an operation of Size bytes knowing that the +// underlying platform only support native types up to MaxSize bytes. 
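+// For instance, with MaxSize = 8 on a 64-bit target, getTypeFor<4, 8> resolves
+// to ScalarType<uint32_t> (a single native operation), while getTypeFor<32, 8>
+// resolves to ArrayType<ScalarType<uint64_t>, 4>, i.e. four 8-byte operations
+// emulating one 32-byte operation.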
+template +using getTypeFor = cpp::conditional_t< + useArrayType(), + ArrayType, Size / MaxSize>, + NativeTypeMap::find_type>; + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy +// When building with clang we can delegate to the builtin implementation. +/////////////////////////////////////////////////////////////////////////////// + +template using Memcpy = builtin::Memcpy; + +/////////////////////////////////////////////////////////////////////////////// +// Memset +// The MaxSize template argument gives the maximum size handled natively by the +// platform. For instance on x86 with AVX support this would be 32. If a size +// greater than MaxSize is requested we break the operation down in smaller +// pieces of size MaxSize. +/////////////////////////////////////////////////////////////////////////////// +template struct Memset { + static_assert(is_power2(MaxSize)); + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t value) { + if constexpr (Size == 3) { + Memset<1, MaxSize>::block(dst + 2, value); + Memset<2, MaxSize>::block(dst, value); + } else { + using T = getTypeFor; + if constexpr (is_void_v) { + deferred_static_assert("Unimplemented Size"); + } else { + T::store(dst, T::splat(value)); + } + } + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + static_assert(SIZE > 1); + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +/////////////////////////////////////////////////////////////////////////////// +template struct Bcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t MaxSize = 8; + + template static inline uint32_t load_xor(CPtr p1, CPtr p2) { + return load(p1) ^ load(p2); + } + + template + static inline uint32_t load_not_equal(CPtr p1, CPtr p2) { + return load(p1) != load(p2); + } + + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + static constexpr size_t MaxSize = 8; + if constexpr (Size == 1) { + return load_xor(p1, p2); + } else if constexpr (Size == 2) { + return load_xor(p1, p2); + } else if constexpr (Size == 4) { + return load_xor(p1, p2); + } else if constexpr (Size == 8) { + return load_not_equal(p1, p2); + } else if constexpr (useArrayType()) { + for (size_t offset = 0; offset < Size; offset += MaxSize) + if (auto value = Bcmp::block(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("Unimplemented Size"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + return block(p1, p2) | tail(p1, p2, count); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp 
+/////////////////////////////////////////////////////////////////////////////// +template struct Memcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t MaxSize = 8; + + template static inline T load_be(CPtr ptr) { + return Endian::to_big_endian(load(ptr)); + } + + template + static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) { + return load_be(p1) - load_be(p2); + } + + template + static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) { + const auto la = load_be(p1); + const auto lb = load_be(p2); + return la > lb ? 1 : la < lb ? -1 : 0; + } + + static inline MemcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == 1) { + return load_be_diff(p1, p2); + } else if constexpr (Size == 2) { + return load_be_diff(p1, p2); + } else if constexpr (Size == 4) { + return load_be_cmp(p1, p2); + } else if constexpr (Size == 8) { + return load_be_cmp(p1, p2); + } else if constexpr (useArrayType()) { + for (size_t offset = 0; offset < Size; offset += MaxSize) + if (Bcmp::block(p1 + offset, p2 + offset)) + return Memcmp::block(p1 + offset, p2 + offset); + return MemcmpReturnType::ZERO(); + } else if constexpr (Size == 3) { + if (auto value = Memcmp<2>::block(p1, p2)) + return value; + return Memcmp<1>::block(p1 + 2, p2 + 2); + } else { + deferred_static_assert("Unimplemented Size"); + } + } + + static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (auto value = block(p1, p2)) + return value; + return tail(p1, p2, count); + } + + static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memmove +/////////////////////////////////////////////////////////////////////////////// + +template struct Memmove { + static_assert(is_power2(MaxSize)); + using T = getTypeFor; + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, CPtr src) { + if constexpr (is_void_v) { + deferred_static_assert("Unimplemented Size"); + } else { + T::store(dst, T::load(src)); + } + } + + static inline void head_tail(Ptr dst, CPtr src, size_t count) { + const size_t offset = count - Size; + if constexpr (is_void_v) { + deferred_static_assert("Unimplemented Size"); + } else { + // The load and store operations can be performed in any order as long as + // they are not interleaved. More investigations are needed to determine + // the best order. + const auto head = T::load(src); + const auto tail = T::load(src + offset); + T::store(dst, head); + T::store(dst + offset, tail); + } + } + + // Align forward suitable when dst < src. The alignment is performed with + // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes forward, we make sure src is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [___________LLLLLLLA______________] + // [_SSSSSSSS________________________] + // [________SSSSSSSS_________________] + // + // e.g. Moving two bytes forward, we make sure dst is aligned. 
+ // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [______LLLLLLLL___________________] + // [_SSSSSSSS________________________] + // [___SSSSSSSA______________________] + template + static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) { + Ptr prev_dst = dst; + CPtr prev_src = src; + size_t prev_count = count; + align_to_next_boundary(dst, src, count); + adjust(Size, dst, src, count); + head_tail(prev_dst, prev_src, prev_count - count); + } + + // Align backward suitable when dst > src. The alignment is performed with + // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes backward, we make sure src is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _________________ALLLLLLL_______] + // [ ___________________LLLLLLLL_____] + // [____________________SSSSSSSS_____] + // [______________________SSSSSSSS___] + // + // e.g. Moving two bytes backward, we make sure dst is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _______________LLLLLLLL_________] + // [ ___________________LLLLLLLL_____] + // [__________________ASSSSSSS_______] + // [______________________SSSSSSSS___] + template + static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) { + Ptr headtail_dst = dst + count; + CPtr headtail_src = src + count; + size_t headtail_size = 0; + align_to_next_boundary(headtail_dst, headtail_src, + headtail_size); + adjust(-2 * Size, headtail_dst, headtail_src, headtail_size); + head_tail(headtail_dst, headtail_src, headtail_size); + count -= headtail_size; + } + + // Move forward suitable when dst < src. We load the tail bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [_________________________LLLLLLLL___] + // [___LLLLLLLL_________________________] + // [_SSSSSSSS___________________________] + // [___________LLLLLLLL_________________] + // [_________SSSSSSSS___________________] + // [___________________LLLLLLLL_________] + // [_________________SSSSSSSS___________] + // [_______________________SSSSSSSS_____] + static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) { + static_assert(Size > 1); + const size_t tail_offset = count - Size; + const auto tail_value = T::load(src + tail_offset); + size_t offset = 0; +#pragma nounroll + do { + block(dst + offset, src + offset); + offset += Size; + } while (offset < count - Size); + T::store(dst + tail_offset, tail_value); + } + + // Move backward suitable when dst > src. We load the head bytes before + // handling the loop. + // + // e.g. 
Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [___LLLLLLLL_________________________] + // [_________________________LLLLLLLL___] + // [___________________________SSSSSSSS_] + // [_________________LLLLLLLL___________] + // [___________________SSSSSSSS_________] + // [_________LLLLLLLL___________________] + // [___________SSSSSSSS_________________] + // [_____SSSSSSSS_______________________] + static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) { + static_assert(Size > 1); + const auto head_value = T::load(src); + ptrdiff_t offset = count - Size; +#pragma nounroll + do { + block(dst + offset, src + offset); + offset -= Size; + } while (offset >= 0); + T::store(dst, head_value); + } +}; + +} // namespace __llvm_libc::generic + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_x86.h @@ -0,0 +1,221 @@ +//===-- x86 implementation of memory function building blocks -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides x86 specific building blocks to compose memory functions. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_X86_64) + +#include "src/__support/common.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_generic.h" + +#ifdef __SSE2__ +#include +#else +// Define fake functions to prevent the compiler from failing on undefined +// functions in case SSE2 is not present. +#define _mm512_cmpneq_epi8_mask(A, B) 0 +#define _mm_movemask_epi8(A) 0 +#define _mm256_movemask_epi8(A) 0 +#endif // __SSE2__ + +namespace __llvm_libc::x86 { + +// A set of constants to check compile time features. +static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__); +static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__); +static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__); +static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__); +static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy repmovsb implementation +struct Memcpy { + static void repmovsb(char *dst, const char *src, size_t count) { + asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp + +// Base implementation for the Bcmp specializations. +// - BlockSize is either 16, 32 or 64 depending on the available compile time +// features, it is used to switch between "single native operation" or a +// "sequence of native operations". +// - BlockBcmp is the function that implements the bcmp logic. 
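+// For instance, with a 16-byte BlockSize (SSE2), a Size of 64 is handled as
+// four consecutive 16-byte block comparisons that bail out on the first
+// mismatching block.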
+template struct BcmpImpl { + static constexpr size_t SIZE = Size; + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + return BlockBcmp(p1, p2); + } else if constexpr (Size % BlockSize == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = BlockBcmp(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - Size, p2 + count - Size); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + return block(p1, p2) | tail(p1, p2, count); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += Size; + } while (offset < count - Size); + return tail(p1, p2, count); + } +}; + +namespace sse2 { +static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(16))); + // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. + const int mask = _mm_movemask_epi8(load(p1) != load(p2)); + return static_cast(mask); +} +template using Bcmp = BcmpImpl; +} // namespace sse2 + +namespace avx2 { +static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(32))); + // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. + const int mask = _mm256_movemask_epi8(load(p1) != load(p2)); + // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit + // mask. + return static_cast(mask); +} +template using Bcmp = BcmpImpl; +} // namespace avx2 + +namespace avx512bw { +static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(64))); + // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. + const uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2)); + const bool mask_is_set = mask != 0; + return static_cast(mask_is_set); +} +template using Bcmp = BcmpImpl; +} // namespace avx512bw + +// Assuming that the mask is non zero, the index of the first mismatching byte +// is the number of trailing zeros in the mask. Trailing zeros and not leading +// zeros because the x86 architecture is little endian. +static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, + uint64_t mask) { + const size_t diff_index = __builtin_ctzll(mask); + const int16_t ca = cpp::bit_cast(p1[diff_index]); + const int16_t cb = cpp::bit_cast(p2[diff_index]); + return ca - cb; +} + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp + +// Base implementation for the Memcmp specializations. +// - BlockSize is either 16, 32 or 64 depending on the available compile time +// features, it is used to switch between "single native operation" or a +// "sequence of native operations". +// - BlockMemcmp is the function that implements the memcmp logic. +// - BlockBcmp is the function that implements the bcmp logic. 
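+// Note that for sizes spanning several blocks, `block` first uses the cheaper
+// BlockBcmp to locate a mismatching block and only then runs the more
+// expensive BlockMemcmp on that block to compute the ordering.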
+template +struct MemcmpImpl { + static constexpr size_t SIZE = Size; + static inline MemcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + return BlockMemcmp(p1, p2); + } else if constexpr (Size % BlockSize == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = BlockBcmp(p1 + offset, p2 + offset)) + return BlockMemcmp(p1 + offset, p2 + offset); + } else { + deferred_static_assert("SIZE not implemented"); + } + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - Size, p2 + count - Size); + } + + static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (auto value = block(p1, p2)) + return value; + return tail(p1, p2, count); + } + + static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + static_assert(Size > 1); + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += Size; + } while (offset < count - Size); + return tail(p1, p2, count); + } +}; + +namespace sse2 { +static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(16))); + // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. + if (int mask = _mm_movemask_epi8(load(p1) != load(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template using Memcmp = MemcmpImpl; +} // namespace sse2 + +namespace avx2 { +static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(32))); + // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. + if (int mask = _mm256_movemask_epi8(load(p1) != load(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template using Memcmp = MemcmpImpl; +} // namespace avx2 + +namespace avx512bw { +static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(64))); + // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. + if (uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template using Memcmp = MemcmpImpl; +} // namespace avx512bw + +} // namespace __llvm_libc::x86 + +#endif // LLVM_LIBC_ARCH_X86_64 + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -106,6 +106,41 @@ using Ptr = char *; // Pointer to raw data. using CPtr = const char *; // Const pointer to raw data. +// This type makes sure that we don't accidentally promote an integral type to +// another one. It is only constructible from the exact T type. +template struct StrictIntegralType { + static_assert(cpp::is_integral_v); + + // Can only be constructed from a T. + template , bool> = 0> + StrictIntegralType(U value) : value(value) {} + + // Allows using the type in an if statement. + explicit operator bool() const { return value; } + + // If type is unsigned (bcmp) we allow bitwise OR operations. + StrictIntegralType operator|(const StrictIntegralType &Rhs) const { + static_assert(!cpp::is_signed_v); + return value | Rhs.value; + } + + // For interation with the C API we allow explicit conversion back to the + // `int` type. 
+ explicit operator int() const { + // bit_cast makes sure that T and int have the same size. + return cpp::bit_cast(value); + } + + // Helper to get the zero value. + static inline constexpr StrictIntegralType ZERO() { return {T(0)}; } + +private: + T value; +}; + +using MemcmpReturnType = StrictIntegralType; +using BcmpReturnType = StrictIntegralType; + // Loads bytes from memory (possibly unaligned) and materializes them as // type. template static inline T load(CPtr ptr) { diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -5,6 +5,7 @@ SRCS elements_test.cpp memory_access_test.cpp + op_tests.cpp utils_test.cpp COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memory_utils/op_tests.cpp @@ -0,0 +1,416 @@ +//===-- Unittests for op_ files -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/CPP/limits.h" +#include "src/__support/CPP/span.h" +#include "src/string/memory_utils/op_aarch64.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/utils.h" +#include "utils/UnitTest/Test.h" + +#include +#include + +#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) +#define LLVM_LIBC_HAS_UINT64 +#endif + +namespace __llvm_libc { + +static char GetRandomChar() { + static constexpr const uint64_t a = 1103515245; + static constexpr const uint64_t c = 12345; + static constexpr const uint64_t m = 1ULL << 31; + static uint64_t seed = 123456789; + seed = (a * seed + c) % m; + return seed; +} + +// Randomize the content of the buffer. +static void Randomize(cpp::span buffer) { + for (auto ¤t : buffer) + current = GetRandomChar(); +} + +// Copy one span to another. +static void Copy(cpp::span dst, const cpp::span src) { + assert(dst.size() == src.size()); + for (size_t i = 0; i < dst.size(); ++i) + dst[i] = src[i]; +} + +// Simple structure to allocate an aligned buffer of a particular size. +// By allocating exactly the right size, we can leverage asan to detect whether +// we perform out of bounds accesses. +struct RawAlignedBuffer { + static constexpr size_t kAlign = 64; + RawAlignedBuffer(size_t size) + : ptr((char *)aligned_alloc(kAlign, size)), size(size) { + assert(ptr); + assert((uintptr_t)(ptr) % kAlign == 0); + } + ~RawAlignedBuffer() { free(ptr); } + cpp::span span() { return cpp::span(ptr, size); } + +private: + char *ptr = nullptr; + size_t size = 0; +}; + +// Allocates two RawAlignedBuffer and extracts two spans out of them, one +// aligned and one misaligned. Tests are run on both spans. +struct Buffers { + Buffers(size_t size) + : size(size), aligned_buffer(size), misaligned_buffer(size + 1) {} + + // Returns two spans of 'size' bytes. The first is aligned on + // RawAlignedBuffer::kAlign and the second one is unaligned. 
+ cpp::array, 2> spans() { + return {aligned_buffer.span(), misaligned_buffer.span().subspan(1)}; + } + + size_t size; + RawAlignedBuffer aligned_buffer; + RawAlignedBuffer misaligned_buffer; +}; + +using MemcpyImplementations = testing::TypeList< +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE + builtin::Memcpy<1>, // + builtin::Memcpy<2>, // + builtin::Memcpy<3>, // + builtin::Memcpy<4>, // + builtin::Memcpy<8>, // + builtin::Memcpy<16>, // + builtin::Memcpy<32>, // + builtin::Memcpy<64> +#endif // LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE + >; + +template +bool CheckMemcpy(cpp::span dst, cpp::span src, size_t size) { + assert(dst.size() == src.size()); + assert(dst.size() == size); + Randomize(dst); + Foo(dst.data(), src.data(), size); + for (size_t i = 0; i < size; ++i) + if (dst[i] != src[i]) + return false; + return true; +} + +template static void MemcpyAdaptor(Ptr dst, CPtr src, size_t) { + return T::block(dst, src); +} + +TYPED_TEST(LlvmLibcOpTest, Memcpy, MemcpyImplementations) { + using Impl = ParamType; + constexpr size_t kSize = Impl::SIZE; + { // Test block operation + Buffers SrcBuffer(kSize); + Buffers DstBuffer(kSize); + for (auto src : SrcBuffer.spans()) { + Randomize(src); + for (auto dst : DstBuffer.spans()) { + ASSERT_TRUE(CheckMemcpy>(dst, src, kSize)); + } + } + } + { // Test head tail operations + RawAlignedBuffer SrcBuffer(2 * kSize); + RawAlignedBuffer DstBuffer(2 * kSize); + Randomize(SrcBuffer.span()); + for (size_t size = kSize; size < 2 * kSize; ++size) { + auto src = SrcBuffer.span().subspan(0, size); + auto dst = DstBuffer.span().subspan(0, size); + ASSERT_TRUE(CheckMemcpy(dst, src, size)); + } + } + { // Test loop operations + if constexpr (kSize > 1) { + RawAlignedBuffer SrcBuffer(3 * kSize); + RawAlignedBuffer DstBuffer(3 * kSize); + Randomize(SrcBuffer.span()); + for (size_t size = kSize; size < 3 * kSize; ++size) { + auto src = SrcBuffer.span().subspan(0, size); + auto dst = DstBuffer.span().subspan(0, size); + ASSERT_TRUE(CheckMemcpy(dst, src, size)); + } + } + } +} + +using MemsetImplementations = testing::TypeList< +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE + builtin::Memset<1>, // + builtin::Memset<2>, // + builtin::Memset<3>, // + builtin::Memset<4>, // + builtin::Memset<8>, // + builtin::Memset<16>, // + builtin::Memset<32>, // + builtin::Memset<64>, +#endif +#ifdef LLVM_LIBC_HAS_UINT64 + generic::Memset<8, 8>, // + generic::Memset<16, 8>, // + generic::Memset<32, 8>, // + generic::Memset<64, 8>, // +#endif +#ifdef __AVX512F__ + generic::Memset<64, 64>, // prevents warning about avx512f + generic::Memset<128, 64>, // prevents warning about avx512f +#endif + generic::Memset<1, 1>, // + generic::Memset<2, 1>, // + generic::Memset<2, 2>, // + generic::Memset<4, 2>, // + generic::Memset<4, 4>, // + generic::Memset<16, 16>, // + generic::Memset<32, 16>, // + generic::Memset<64, 16>, // + generic::Memset<32, 32>, // + generic::Memset<64, 32> // + >; + +template +bool CheckMemset(cpp::span dst, uint8_t value, size_t size) { + Randomize(dst); + Foo(dst.data(), value, size); + for (char c : dst) + if (c != (char)value) + return false; + return true; +} + +template +static void MemsetAdaptor(Ptr dst, uint8_t value, size_t) { + return T::block(dst, value); +} + +TYPED_TEST(LlvmLibcOpTest, Memset, MemsetImplementations) { + using Impl = ParamType; + constexpr size_t kSize = Impl::SIZE; + { // Test block operation + Buffers DstBuffer(kSize); + for (uint8_t value : cpp::array{0, 1, 255}) { + for (auto dst : DstBuffer.spans()) { + ASSERT_TRUE(CheckMemset>(dst, value, 
kSize)); + } + } + } + { // Test head tail operations + RawAlignedBuffer DstBuffer(2 * kSize); + for (size_t size = kSize; size < 2 * kSize; ++size) { + const char value = size % 10; + auto dst = DstBuffer.span().subspan(0, size); + ASSERT_TRUE(CheckMemset(dst, value, size)); + } + } + { // Test loop operations + if constexpr (kSize > 1) { + RawAlignedBuffer DstBuffer(3 * kSize); + for (size_t size = kSize; size < 3 * kSize; ++size) { + const char value = size % 10; + auto dst = DstBuffer.span().subspan(0, size); + ASSERT_TRUE((CheckMemset(dst, value, size))); + } + } + } +} + +using BcmpImplementations = testing::TypeList< +#ifdef __SSE2__ + x86::sse2::Bcmp<16>, // + x86::sse2::Bcmp<32>, // + x86::sse2::Bcmp<64>, // + x86::sse2::Bcmp<128>, // +#endif +#ifdef __AVX2__ + x86::avx2::Bcmp<32>, // + x86::avx2::Bcmp<64>, // + x86::avx2::Bcmp<128>, // +#endif +#ifdef __AVX512BW__ + x86::avx512bw::Bcmp<64>, // + x86::avx512bw::Bcmp<128>, // +#endif +#ifdef __ARM_NEON + aarch64::neon::Bcmp<32>, // + aarch64::neon::Bcmp<64>, // +#endif +#ifdef LLVM_LIBC_HAS_UINT64 + generic::Bcmp<8>, // +#endif + generic::Bcmp<1>, // + generic::Bcmp<2>, // + generic::Bcmp<4>, // + generic::Bcmp<16>, // + generic::Bcmp<32>, // + generic::Bcmp<64> // + >; + +template +bool CheckBcmp(cpp::span span1, cpp::span span2, size_t size) { + assert(span1.size() == span2.size()); + Copy(span2, span1); + // Compare equal + if (int cmp = (int)Foo(span1.data(), span2.data(), size); cmp != 0) + return false; + // Compare not equal if any byte differs + for (size_t i = 0; i < size; ++i) { + ++span2[i]; + if (int cmp = (int)Foo(span1.data(), span2.data(), size); cmp == 0) + return false; + if (int cmp = (int)Foo(span2.data(), span1.data(), size); cmp == 0) + return false; + --span2[i]; + } + return true; +} + +template +static BcmpReturnType BcmpAdaptor(CPtr p1, CPtr p2, size_t) { + return T::block(p1, p2); +} + +TYPED_TEST(LlvmLibcOpTest, Bcmp, BcmpImplementations) { + using Impl = ParamType; + constexpr size_t kSize = Impl::SIZE; + { // Test block operation + Buffers Buffer1(kSize); + Buffers Buffer2(kSize); + for (auto span1 : Buffer1.spans()) { + Randomize(span1); + for (auto span2 : Buffer2.spans()) + ASSERT_TRUE((CheckBcmp>(span1, span2, kSize))); + } + } + { // Test head tail operations + RawAlignedBuffer Buffer1(2 * kSize); + RawAlignedBuffer Buffer2(2 * kSize); + Randomize(Buffer1.span()); + for (size_t size = kSize; size < 2 * kSize; ++size) { + auto span1 = Buffer1.span().subspan(0, size); + auto span2 = Buffer2.span().subspan(0, size); + ASSERT_TRUE((CheckBcmp(span1, span2, size))); + } + } + { // Test loop operations + if constexpr (kSize > 1) { + RawAlignedBuffer Buffer1(3 * kSize); + RawAlignedBuffer Buffer2(3 * kSize); + Randomize(Buffer1.span()); + for (size_t size = kSize; size < 3 * kSize; ++size) { + auto span1 = Buffer1.span().subspan(0, size); + auto span2 = Buffer2.span().subspan(0, size); + ASSERT_TRUE((CheckBcmp(span1, span2, size))); + } + } + } +} + +using MemcmpImplementations = testing::TypeList< +#ifdef __SSE2__ + x86::sse2::Memcmp<16>, // + x86::sse2::Memcmp<32>, // + x86::sse2::Memcmp<64>, // + x86::sse2::Memcmp<128>, // +#endif +#ifdef __AVX2__ + x86::avx2::Memcmp<32>, // + x86::avx2::Memcmp<64>, // + x86::avx2::Memcmp<128>, // +#endif +#ifdef __AVX512BW__ + x86::avx512bw::Memcmp<64>, // + x86::avx512bw::Memcmp<128>, // +#endif +#ifdef LLVM_LIBC_HAS_UINT64 + generic::Memcmp<8>, // +#endif + generic::Memcmp<1>, // + generic::Memcmp<2>, // + generic::Memcmp<3>, // + generic::Memcmp<4>, // + 
generic::Memcmp<16>, // + generic::Memcmp<32>, // + generic::Memcmp<64> // + >; + +template +bool CheckMemcmp(cpp::span span1, cpp::span span2, size_t size) { + assert(span1.size() == span2.size()); + Copy(span2, span1); + // Compare equal + if (int cmp = (int)Foo(span1.data(), span2.data(), size); cmp != 0) + return false; + // Compare not equal if any byte differs + for (size_t i = 0; i < size; ++i) { + ++span2[i]; + int ground_truth = __builtin_memcmp(span1.data(), span2.data(), size); + if (ground_truth > 0) { + if (int cmp = (int)Foo(span1.data(), span2.data(), size); cmp <= 0) + return false; + if (int cmp = (int)Foo(span2.data(), span1.data(), size); cmp >= 0) + return false; + } else { + if (int cmp = (int)Foo(span1.data(), span2.data(), size); cmp >= 0) + return false; + if (int cmp = (int)Foo(span2.data(), span1.data(), size); cmp <= 0) + return false; + } + --span2[i]; + } + return true; +} + +template +static MemcmpReturnType MemcmpAdaptor(CPtr p1, CPtr p2, size_t) { + return T::block(p1, p2); +} + +TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) { + using Impl = ParamType; + constexpr size_t kSize = Impl::SIZE; + { // Test block operation + Buffers Buffer1(kSize); + Buffers Buffer2(kSize); + for (auto span1 : Buffer1.spans()) { + Randomize(span1); + for (auto span2 : Buffer2.spans()) + ASSERT_TRUE((CheckMemcmp>(span1, span2, kSize))); + } + } + { // Test head tail operations + RawAlignedBuffer Buffer1(2 * kSize); + RawAlignedBuffer Buffer2(2 * kSize); + Randomize(Buffer1.span()); + for (size_t size = kSize; size < 2 * kSize; ++size) { + auto span1 = Buffer1.span().subspan(0, size); + auto span2 = Buffer2.span().subspan(0, size); + ASSERT_TRUE((CheckMemcmp(span1, span2, size))); + } + } + { // Test loop operations + if constexpr (kSize > 1) { + RawAlignedBuffer Buffer1(3 * kSize); + RawAlignedBuffer Buffer2(3 * kSize); + Randomize(Buffer1.span()); + for (size_t size = kSize; size < 3 * kSize; ++size) { + auto span1 = Buffer1.span().subspan(0, size); + auto span2 = Buffer2.span().subspan(0, size); + ASSERT_TRUE((CheckMemcmp(span1, span2, size))); + } + } + } +} + +} // namespace __llvm_libc diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -973,9 +973,13 @@ cc_library( name = "string_memory_utils", hdrs = [ - "src/string/memory_utils/elements.h", "src/string/memory_utils/elements_aarch64.h", "src/string/memory_utils/elements_x86.h", + "src/string/memory_utils/elements.h", + "src/string/memory_utils/op_aarch64.h", + "src/string/memory_utils/op_builtin.h", + "src/string/memory_utils/op_generic.h", + "src/string/memory_utils/op_x86.h", "src/string/memory_utils/utils.h", ], textual_hdrs = [ @@ -989,6 +993,7 @@ ":__support_common", ":__support_cpp_bit", ":__support_cpp_type_traits", + ":__support_cpp_array", ":libc_root", ], )
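
As a closing illustration of how the pieces introduced in this patch fit together, here is a minimal sketch of a size-dispatched `memcpy` built on the `builtin` scope, in the same spirit as the README's `memset` example. The function name and the size thresholds are made up for the illustration; the dispatch actually used by the library lives in `memcpy_implementations.h` and is tuned per target.

```C++
#include <stddef.h>

#include "src/string/memory_utils/op_builtin.h"
#include "src/string/memory_utils/utils.h"

namespace __llvm_libc {

// Illustrative dispatch only: the thresholds are arbitrary, the real
// implementation picks them per target and mixes several scopes.
static inline void inline_memcpy_sketch(Ptr __restrict dst, CPtr __restrict src,
                                        size_t count) {
  using namespace builtin;
  if (count == 0)
    return;
  if (count == 1)
    return Memcpy<1>::block(dst, src);
  if (count == 2)
    return Memcpy<2>::block(dst, src);
  if (count == 3)
    return Memcpy<3>::block(dst, src);
  if (count <= 8)
    return Memcpy<4>::head_tail(dst, src, count);
  if (count <= 16)
    return Memcpy<8>::head_tail(dst, src, count);
  if (count <= 32)
    return Memcpy<16>::head_tail(dst, src, count);
  if (count <= 64)
    return Memcpy<32>::head_tail(dst, src, count);
  return Memcpy<64>::loop_and_tail(dst, src, count);
}

} // namespace __llvm_libc
```

Note how only the thresholds and the scope (`builtin`, `generic`, `x86`, `aarch64`) change from one target to another; the structure of the dispatch stays the same.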