diff --git a/libc/src/stdio/printf_core/string_writer.cpp b/libc/src/stdio/printf_core/string_writer.cpp --- a/libc/src/stdio/printf_core/string_writer.cpp +++ b/libc/src/stdio/printf_core/string_writer.cpp @@ -33,7 +33,7 @@ len = available_capacity; if (len > 0) { - inline_memset(cur_buffer, new_char, len); + inline_memset(cur_buffer, static_cast(new_char), len); cur_buffer += len; available_capacity -= len; } diff --git a/libc/src/string/bcmp.cpp b/libc/src/string/bcmp.cpp --- a/libc/src/string/bcmp.cpp +++ b/libc/src/string/bcmp.cpp @@ -14,8 +14,8 @@ LLVM_LIBC_FUNCTION(int, bcmp, (const void *lhs, const void *rhs, size_t count)) { - return inline_bcmp(static_cast(lhs), - static_cast(rhs), count); + return static_cast(inline_bcmp(static_cast(lhs), + static_cast(rhs), count)); } } // namespace __llvm_libc diff --git a/libc/src/string/memcmp.cpp b/libc/src/string/memcmp.cpp --- a/libc/src/string/memcmp.cpp +++ b/libc/src/string/memcmp.cpp @@ -15,8 +15,8 @@ LLVM_LIBC_FUNCTION(int, memcmp, (const void *lhs, const void *rhs, size_t count)) { - return inline_memcmp(static_cast(lhs), - static_cast(rhs), count); + return static_cast(inline_memcmp(static_cast(lhs), + static_cast(rhs), count)); } } // namespace __llvm_libc diff --git a/libc/src/string/memmove.cpp b/libc/src/string/memmove.cpp --- a/libc/src/string/memmove.cpp +++ b/libc/src/string/memmove.cpp @@ -9,36 +9,52 @@ #include "src/string/memmove.h" #include "src/__support/common.h" -#include "src/__support/integer_operations.h" -#include "src/string/memory_utils/elements.h" +#include "src/string/memory_utils/op_aarch64.h" +#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_x86.h" #include // size_t, ptrdiff_t +#include + namespace __llvm_libc { static inline void inline_memmove(char *dst, const char *src, size_t count) { - using namespace __llvm_libc::scalar; +#if defined(LLVM_LIBC_ARCH_X86) + static constexpr size_t kMaxSize = x86::kAvx512F ? 64 + : x86::kAvx ? 32 + : x86::kSse2 ? 16 + : 8; +#elif defined(LLVM_LIBC_ARCH_AARCH64) + static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8; +#else + static constexpr size_t kMaxSize = 8; +#endif if (count == 0) return; if (count == 1) - return move<_1>(dst, src); + return generic::Memmove<1, kMaxSize>::block(dst, src); if (count <= 4) - return move>(dst, src, count); + return generic::Memmove<2, kMaxSize>::head_tail(dst, src, count); if (count <= 8) - return move>(dst, src, count); + return generic::Memmove<4, kMaxSize>::head_tail(dst, src, count); if (count <= 16) - return move>(dst, src, count); + return generic::Memmove<8, kMaxSize>::head_tail(dst, src, count); if (count <= 32) - return move>(dst, src, count); + return generic::Memmove<16, kMaxSize>::head_tail(dst, src, count); if (count <= 64) - return move>(dst, src, count); + return generic::Memmove<32, kMaxSize>::head_tail(dst, src, count); if (count <= 128) - return move>(dst, src, count); + return generic::Memmove<64, kMaxSize>::head_tail(dst, src, count); - using AlignedMoveLoop = Align<_16, Arg::Src>::Then>; - if (dst < src) - return move(dst, src, count); - else if (dst > src) - return move_backward(dst, src, count); + if (dst < src) { + generic::Memmove<32, kMaxSize>::align_forward(dst, src, count); + return generic::Memmove<64, kMaxSize>::loop_and_tail_forward(dst, src, + count); + } else { + generic::Memmove<32, kMaxSize>::align_backward(dst, src, count); + return generic::Memmove<64, kMaxSize>::loop_and_tail_backward(dst, src, + count); + } } LLVM_LIBC_FUNCTION(void *, memmove, diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -2,13 +2,17 @@ add_header_library( memory_utils HDRS - utils.h - elements.h bcmp_implementations.h bzero_implementations.h memcmp_implementations.h memcpy_implementations.h memset_implementations.h + op_aarch64.h + op_higher_order.h + op_builtin.h + op_generic.h + op_x86.h + utils.h DEPS libc.src.__support.CPP.bit ) diff --git a/libc/src/string/memory_utils/README.md b/libc/src/string/memory_utils/README.md new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/README.md @@ -0,0 +1,97 @@ +# The mem* framework + +The framework handles the following mem* functions: + - `memcpy` + - `memmove` + - `memset` + - `bzero` + - `bcmp` + - `memcmp` + +## Building blocks + +These functions can be built out of a set of lower-level operations: + - **`block`** : operates on a block of `SIZE` bytes. + - **`tail`** : operates on the last `SIZE` bytes of the buffer (e.g., `[dst + count - SIZE, dst + count]`) + - **`head_tail`** : operates on the first and last `SIZE` bytes. This is the same as calling `block` and `tail`. + - **`loop_and_tail`** : calls `block` in a loop to consume as much as possible of the `count` bytes and handle the remaining bytes with a `tail` operation. + +As an illustration, let's take the example of a trivial `memset` implementation: + + ```C++ + extern "C" void memset(const char* dst, int value, size_t count) { + if (count == 0) return; + if (count == 1) return Memset<1>::block(dst, value); + if (count == 2) return Memset<2>::block(dst, value); + if (count == 3) return Memset<3>::block(dst, value); + if (count <= 8) return Memset<4>::head_tail(dst, value, count); // Note that 0 to 4 bytes are written twice. + if (count <= 16) return Memset<8>::head_tail(dst, value, count); // Same here. + return Memset<16>::loop_and_tail(dst, value, count); +} + ``` + +Now let's have a look into the `Memset` structure: + +```C++ +template +struct Memset { + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t value) { + // Implement me + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; +``` + +As you can see, the `tail`, `head_tail` and `loop_and_tail` are higher order functions that build on each others. Only `block` really needs to be implemented. +In earlier designs we were implementing these higher order functions with templated functions but it appears that it is more readable to have the implementation explicitly stated. +**This design is useful because it provides customization points**. For instance, for `bcmp` on `aarch64` we can provide a better implementation of `head_tail` using vector reduction intrinsics. + +## Scoped specializations + +We can have several specializations of the `Memset` structure. Depending on the target requirements we can use one or several scopes for the same implementation. + +In the following example we use the `generic` implementation for the small sizes but use the `x86` implementation for the loop. +```C++ + extern "C" void memset(const char* dst, int value, size_t count) { + if (count == 0) return; + if (count == 1) return generic::Memset<1>::block(dst, value); + if (count == 2) return generic::Memset<2>::block(dst, value); + if (count == 3) return generic::Memset<3>::block(dst, value); + if (count <= 8) return generic::Memset<4>::head_tail(dst, value, count); + if (count <= 16) return generic::Memset<8>::head_tail(dst, value, count); + return x86::Memset<16>::loop_and_tail(dst, value, count); +} +``` + +### The `builtin` scope + +Ultimately we would like the compiler to provide the code for the `block` function. For this we rely on dedicated builtins available in Clang (e.g., [`__builtin_memset_inline`](https://clang.llvm.org/docs/LanguageExtensions.html#guaranteed-inlined-memset)) + +### The `generic` scope + +In this scope we define pure C++ implementations using native integral types and clang vector extensions. + +### The arch specific scopes + +Then comes implementations that are using specific architectures or microarchitectures features (e.g., `rep;movsb` for `x86` or `dc zva` for `aarch64`). + +The purpose here is to rely on builtins as much as possible and fallback to `asm volatile` as a last resort. diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -11,49 +11,132 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/elements.h" +#include "src/string/memory_utils/op_aarch64.h" +#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_x86.h" #include // size_t namespace __llvm_libc { -// Fixed-size difference between 'lhs' and 'rhs'. -template bool differs(const char *lhs, const char *rhs) { - return !Element::equals(lhs, rhs); -} -// Runtime-size difference between 'lhs' and 'rhs'. -template -bool differs(const char *lhs, const char *rhs, size_t size) { - return !Element::equals(lhs, rhs, size); +static inline BcmpReturnType inline_bcmp_generic_gt16(CPtr p1, CPtr p2, + size_t count) { + if (count < 256) + return generic::Bcmp<16>::loop_and_tail(p1, p2, count); + if (auto value = generic::Bcmp<64>::block(p1, p2)) + return value; + align_to_next_boundary<64, Arg::P1>(p1, p2, count); + return generic::Bcmp<64>::loop_and_tail(p1, p2, count); } -static inline int inline_bcmp(const char *lhs, const char *rhs, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) - using namespace ::__llvm_libc::x86; -#elif defined(LLVM_LIBC_ARCH_AARCH64) - using namespace ::__llvm_libc::aarch64; +static inline BcmpReturnType inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, + size_t count) { + if (count <= 32) + return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); + if (count < 256) + return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count); + if (auto value = x86::sse2::Bcmp<16>::block(p1, p2)) + return value; + align_to_next_boundary<16, Arg::P1>(p1, p2, count); + return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count); +} + +static inline BcmpReturnType inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, + size_t count) { + if (count <= 32) + return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); + if (count <= 64) + return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); + if (count <= 128) + return x86::avx2::Bcmp<64>::head_tail(p1, p2, count); + if (unlikely(count >= 256)) { + if (auto value = x86::avx2::Bcmp<64>::block(p1, p2)) + return value; + align_to_next_boundary<64, Arg::P1>(p1, p2, count); + } + return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count); +} + +static inline BcmpReturnType inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, + size_t count) { + if (count <= 32) + return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); + if (count <= 64) + return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); + if (count <= 128) + return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count); + if (unlikely(count >= 256)) { + if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2)) + return value; + align_to_next_boundary<64, Arg::P1>(p1, p2, count); + } + return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count); +} +#endif // defined(LLVM_LIBC_ARCH_X86) + +static inline BcmpReturnType inline_bcmp(CPtr p1, CPtr p2, size_t count) { +#if defined(LLVM_LIBC_ARCH_AARCH64) + if (likely(count <= 32)) { + if (unlikely(count >= 16)) { + return generic::Bcmp<16>::head_tail(p1, p2, count); + } + switch (count) { + case 0: + return BcmpReturnType::ZERO(); + case 1: + return generic::Bcmp<1>::block(p1, p2); + case 2: + return generic::Bcmp<2>::block(p1, p2); + case 3: + return generic::Bcmp<2>::head_tail(p1, p2, count); + case 4: + return generic::Bcmp<4>::block(p1, p2); + case 5 ... 7: + return generic::Bcmp<4>::head_tail(p1, p2, count); + case 8: + return generic::Bcmp<8>::block(p1, p2); + case 9 ... 15: + return generic::Bcmp<8>::head_tail(p1, p2, count); + } + } + + if (count <= 64) + return generic::Bcmp<32>::head_tail(p1, p2, count); + + // Aligned loop if > 256, otherwise normal loop + if (count > 256) { + if (auto value = generic::Bcmp<32>::block(p1, p2)) + return value; + align_to_next_boundary<16, Arg::P1>(p1, p2, count); + } + return generic::Bcmp<32>::loop_and_tail(p1, p2, count); #else - using namespace ::__llvm_libc::scalar; -#endif if (count == 0) - return 0; + return BcmpReturnType::ZERO(); if (count == 1) - return differs<_1>(lhs, rhs); + return generic::Bcmp<1>::block(p1, p2); if (count == 2) - return differs<_2>(lhs, rhs); - if (count == 3) - return differs<_3>(lhs, rhs); + return generic::Bcmp<2>::block(p1, p2); + if (count <= 4) + return generic::Bcmp<2>::head_tail(p1, p2, count); if (count <= 8) - return differs>(lhs, rhs, count); + return generic::Bcmp<4>::head_tail(p1, p2, count); if (count <= 16) - return differs>(lhs, rhs, count); - if (count <= 32) - return differs>(lhs, rhs, count); - if (count <= 64) - return differs>(lhs, rhs, count); - if (count <= 128) - return differs>(lhs, rhs, count); - return differs::Then>>(lhs, rhs, count); + return generic::Bcmp<8>::head_tail(p1, p2, count); +#if defined(LLVM_LIBC_ARCH_X86) + if constexpr (x86::kAvx512BW) + return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); + else if constexpr (x86::kAvx2) + return inline_bcmp_x86_avx2_gt16(p1, p2, count); + else if constexpr (x86::kSse2) + return inline_bcmp_x86_sse2_gt16(p1, p2, count); + else + return inline_bcmp_generic_gt16(p1, p2, count); +#else + return inline_bcmp_generic_gt16(p1, p2, count); +#endif +#endif } } // namespace __llvm_libc diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h deleted file mode 100644 --- a/libc/src/string/memory_utils/elements.h +++ /dev/null @@ -1,774 +0,0 @@ -//===-- Elementary operations to compose memory primitives ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H - -#include // size_t -#include // uint8_t, uint16_t, uint32_t, uint64_t - -#include "src/__support/endian.h" -#include "src/string/memory_utils/utils.h" - -namespace __llvm_libc { - -// Elementary Operations -// -------------------------------- -// We define abstract elementary operations acting on fixed chunks of memory. -// These are low level building blocks that are meant to be assembled to compose -// higher order abstractions. Each function is defined twice: once with -// fixed-size operations, and once with runtime-size operations. - -// Fixed-size copy from 'src' to 'dst'. -template -void copy(char *__restrict dst, const char *__restrict src) { - Element::copy(dst, src); -} -// Runtime-size copy from 'src' to 'dst'. -template -void copy(char *__restrict dst, const char *__restrict src, size_t size) { - Element::copy(dst, src, size); -} - -// Fixed-size move from 'src' to 'dst'. -template void move(char *dst, const char *src) { - Element::move(dst, src); -} -// Runtime-size move from 'src' to 'dst'. -template void move(char *dst, const char *src, size_t size) { - Element::move(dst, src, size); -} -// Runtime-size move from 'src' to 'dst'. -template -void move_backward(char *dst, const char *src, size_t size) { - Element::move_backward(dst, src, size); -} - -// Fixed-size equality between 'lhs' and 'rhs'. -template bool equals(const char *lhs, const char *rhs) { - return Element::equals(lhs, rhs); -} -// Runtime-size equality between 'lhs' and 'rhs'. -template -bool equals(const char *lhs, const char *rhs, size_t size) { - return Element::equals(lhs, rhs, size); -} - -// Fixed-size three-way comparison between 'lhs' and 'rhs'. -template -int three_way_compare(const char *lhs, const char *rhs) { - return Element::three_way_compare(lhs, rhs); -} -// Runtime-size three-way comparison between 'lhs' and 'rhs'. -template -int three_way_compare(const char *lhs, const char *rhs, size_t size) { - return Element::three_way_compare(lhs, rhs, size); -} - -// Fixed-size initialization. -template -void splat_set(char *dst, const unsigned char value) { - Element::splat_set(dst, value); -} -// Runtime-size initialization. -template -void splat_set(char *dst, const unsigned char value, size_t size) { - Element::splat_set(dst, value, size); -} - -// Stack placeholder for Move operations. -template struct Storage { char bytes[Element::SIZE]; }; - -// Fixed-size Higher-Order Operations -// ---------------------------------- -// - Repeated: Repeat the operation several times in a row. -// - Chained: Chain the operation of several types. - -// Repeat the operation several times in a row. -template struct Repeated { - static constexpr size_t SIZE = ElementCount * Element::SIZE; - - static void copy(char *__restrict dst, const char *__restrict src) { - for (size_t i = 0; i < ElementCount; ++i) { - const size_t offset = i * Element::SIZE; - Element::copy(dst + offset, src + offset); - } - } - - static void move(char *dst, const char *src) { - const auto value = load(src); - store(dst, value); - } - - static bool equals(const char *lhs, const char *rhs) { - for (size_t i = 0; i < ElementCount; ++i) { - const size_t offset = i * Element::SIZE; - if (!Element::equals(lhs + offset, rhs + offset)) - return false; - } - return true; - } - - static int three_way_compare(const char *lhs, const char *rhs) { - for (size_t i = 0; i < ElementCount; ++i) { - const size_t offset = i * Element::SIZE; - // We make the assumption that 'equals' is cheaper than - // 'three_way_compare'. - if (Element::equals(lhs + offset, rhs + offset)) - continue; - return Element::three_way_compare(lhs + offset, rhs + offset); - } - return 0; - } - - static void splat_set(char *dst, const unsigned char value) { - for (size_t i = 0; i < ElementCount; ++i) { - const size_t offset = i * Element::SIZE; - Element::splat_set(dst + offset, value); - } - } - - static Storage load(const char *ptr) { - Storage value; - copy(reinterpret_cast(&value), ptr); - return value; - } - - static void store(char *ptr, Storage value) { - copy(ptr, reinterpret_cast(&value)); - } -}; - -template struct Repeated { - static void move(char *, const char *) {} -}; - -// Chain the operation of several types. -// For instance, to handle a 3 bytes operation, one can use: -// Chained::Operation(); -template struct Chained; - -template struct Chained { - static constexpr size_t SIZE = Head::SIZE + Chained::SIZE; - - static void copy(char *__restrict dst, const char *__restrict src) { - Chained::copy(dst + Head::SIZE, src + Head::SIZE); - __llvm_libc::copy(dst, src); - } - - static void move(char *dst, const char *src) { - const auto value = Head::load(src); - Chained::move(dst + Head::SIZE, src + Head::SIZE); - Head::store(dst, value); - } - - static bool equals(const char *lhs, const char *rhs) { - if (!__llvm_libc::equals(lhs, rhs)) - return false; - return Chained::equals(lhs + Head::SIZE, rhs + Head::SIZE); - } - - static int three_way_compare(const char *lhs, const char *rhs) { - if (__llvm_libc::equals(lhs, rhs)) - return Chained::three_way_compare(lhs + Head::SIZE, - rhs + Head::SIZE); - return __llvm_libc::three_way_compare(lhs, rhs); - } - - static void splat_set(char *dst, const unsigned char value) { - Chained::splat_set(dst + Head::SIZE, value); - __llvm_libc::splat_set(dst, value); - } -}; - -template <> struct Chained<> { - static constexpr size_t SIZE = 0; - static void copy(char *__restrict, const char *__restrict) {} - static void move(char *, const char *) {} - static bool equals(const char *, const char *) { return true; } - static int three_way_compare(const char *, const char *) { return 0; } - static void splat_set(char *, const unsigned char) {} -}; - -// Overlap ElementA and ElementB so they span Size bytes. -template -struct Overlap { - static constexpr size_t SIZE = Size; - static_assert(ElementB::SIZE <= ElementA::SIZE, "ElementB too big"); - static_assert(ElementA::SIZE <= Size, "ElementA too big"); - static_assert((ElementA::SIZE + ElementB::SIZE) >= Size, - "Elements too small to overlap"); - static constexpr size_t OFFSET = SIZE - ElementB::SIZE; - - static void copy(char *__restrict dst, const char *__restrict src) { - ElementA::copy(dst, src); - ElementB::copy(dst + OFFSET, src + OFFSET); - } - - static void move(char *dst, const char *src) { - const auto value_a = ElementA::load(src); - const auto value_b = ElementB::load(src + OFFSET); - ElementB::store(dst + OFFSET, value_b); - ElementA::store(dst, value_a); - } - - static bool equals(const char *lhs, const char *rhs) { - if (!ElementA::equals(lhs, rhs)) - return false; - if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET)) - return false; - return true; - } - - static int three_way_compare(const char *lhs, const char *rhs) { - if (!ElementA::equals(lhs, rhs)) - return ElementA::three_way_compare(lhs, rhs); - if (!ElementB::equals(lhs + OFFSET, rhs + OFFSET)) - return ElementB::three_way_compare(lhs + OFFSET, rhs + OFFSET); - return 0; - } - - static void splat_set(char *dst, const unsigned char value) { - ElementA::splat_set(dst, value); - ElementB::splat_set(dst + OFFSET, value); - } -}; - -// Runtime-size Higher-Order Operations -// ------------------------------------ -// - Tail: Perform the operation on the last 'T::SIZE' bytes of the buffer. -// - HeadTail: Perform the operation on the first and last 'T::SIZE' bytes -// of the buffer. -// - Loop: Perform a loop of fixed-sized operations. - -// Perform the operation on the last 'T::SIZE' bytes of the buffer. -// -// e.g. with -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [________XXXXXXXX___] -// -// Precondition: `size >= T::SIZE`. -template struct Tail { - static void copy(char *__restrict dst, const char *__restrict src, - size_t size) { - return T::copy(dst + offset(size), src + offset(size)); - } - - static bool equals(const char *lhs, const char *rhs, size_t size) { - return T::equals(lhs + offset(size), rhs + offset(size)); - } - - static int three_way_compare(const char *lhs, const char *rhs, size_t size) { - return T::three_way_compare(lhs + offset(size), rhs + offset(size)); - } - - static void splat_set(char *dst, const unsigned char value, size_t size) { - return T::splat_set(dst + offset(size), value); - } - - static size_t offset(size_t size) { return size - T::SIZE; } -}; - -// Perform the operation on the first and last 'T::SIZE' bytes of the buffer. -// This is useful for overlapping operations. -// -// e.g. with -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [__XXXXXXXX_________] -// [________XXXXXXXX___] -// -// Precondition: `size >= T::SIZE && size <= 2 x T::SIZE`. -template struct HeadTail { - static void copy(char *__restrict dst, const char *__restrict src, - size_t size) { - T::copy(dst, src); - Tail::copy(dst, src, size); - } - - static void move(char *dst, const char *src, size_t size) { - const size_t offset = Tail::offset(size); - const auto head_value = T::load(src); - const auto tail_value = T::load(src + offset); - T::store(dst + offset, tail_value); - T::store(dst, head_value); - } - - static bool equals(const char *lhs, const char *rhs, size_t size) { - if (!T::equals(lhs, rhs)) - return false; - return Tail::equals(lhs, rhs, size); - } - - static int three_way_compare(const char *lhs, const char *rhs, size_t size) { - if (!T::equals(lhs, rhs)) - return T::three_way_compare(lhs, rhs); - return Tail::three_way_compare(lhs, rhs, size); - } - - static void splat_set(char *dst, const unsigned char value, size_t size) { - T::splat_set(dst, value); - Tail::splat_set(dst, value, size); - } -}; - -// Simple loop ending with a Tail operation. -// -// e.g. with -// [12345678123456781234567812345678] -// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] -// [__XXXXXXXX_______________________] -// [__________XXXXXXXX_______________] -// [__________________XXXXXXXX_______] -// [______________________XXXXXXXX___] -// -// Precondition: -// - size >= T::SIZE -template struct Loop { - static_assert(T::SIZE == TailT::SIZE, - "Tail type must have the same size as T"); - - static void copy(char *__restrict dst, const char *__restrict src, - size_t size) { - size_t offset = 0; - do { - T::copy(dst + offset, src + offset); - offset += T::SIZE; - } while (offset < size - T::SIZE); - Tail::copy(dst, src, size); - } - - // Move forward suitable when dst < src. We load the tail bytes before - // handling the loop. - // - // e.g. Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [_________________________LLLLLLLL___] - // [___LLLLLLLL_________________________] - // [_SSSSSSSS___________________________] - // [___________LLLLLLLL_________________] - // [_________SSSSSSSS___________________] - // [___________________LLLLLLLL_________] - // [_________________SSSSSSSS___________] - // [_______________________SSSSSSSS_____] - static void move(char *dst, const char *src, size_t size) { - const size_t tail_offset = Tail::offset(size); - const auto tail_value = TailT::load(src + tail_offset); - size_t offset = 0; - do { - T::move(dst + offset, src + offset); - offset += T::SIZE; - } while (offset < size - T::SIZE); - TailT::store(dst + tail_offset, tail_value); - } - - // Move forward suitable when dst > src. We load the head bytes before - // handling the loop. - // - // e.g. Moving two bytes - // [ | | | | |] - // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] - // [___LLLLLLLL_________________________] - // [_________________________LLLLLLLL___] - // [___________________________SSSSSSSS_] - // [_________________LLLLLLLL___________] - // [___________________SSSSSSSS_________] - // [_________LLLLLLLL___________________] - // [___________SSSSSSSS_________________] - // [_____SSSSSSSS_______________________] - static void move_backward(char *dst, const char *src, size_t size) { - const auto head_value = TailT::load(src); - ptrdiff_t offset = size - T::SIZE; - do { - T::move(dst + offset, src + offset); - offset -= T::SIZE; - } while (offset >= 0); - TailT::store(dst, head_value); - } - - static bool equals(const char *lhs, const char *rhs, size_t size) { - size_t offset = 0; - do { - if (!T::equals(lhs + offset, rhs + offset)) - return false; - offset += T::SIZE; - } while (offset < size - T::SIZE); - return Tail::equals(lhs, rhs, size); - } - - static int three_way_compare(const char *lhs, const char *rhs, size_t size) { - size_t offset = 0; - do { - if (!T::equals(lhs + offset, rhs + offset)) - return T::three_way_compare(lhs + offset, rhs + offset); - offset += T::SIZE; - } while (offset < size - T::SIZE); - return Tail::three_way_compare(lhs, rhs, size); - } - - static void splat_set(char *dst, const unsigned char value, size_t size) { - size_t offset = 0; - do { - T::splat_set(dst + offset, value); - offset += T::SIZE; - } while (offset < size - T::SIZE); - Tail::splat_set(dst, value, size); - } -}; - -namespace internal { - -template struct ArgSelector {}; - -template <> struct ArgSelector { - template - static T1 *__restrict &Select(T1 *__restrict &p1ref, T2 *__restrict &) { - return p1ref; - } -}; - -template <> struct ArgSelector { - template - static T2 *__restrict &Select(T1 *__restrict &, T2 *__restrict &p2ref) { - return p2ref; - } -}; - -// Provides a specialized bump function that adjusts pointers and size so first -// argument (resp. second argument) gets aligned to Alignment. -// We make sure the compiler knows about the adjusted pointer alignment. -// The 'additional_bumps' parameter allows to reach previous / next aligned -// pointers. -template struct Align { - template - static void bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size, - int additional_bumps = 0) { - auto &aligned_ptr = ArgSelector::Select(p1ref, p2ref); - auto offset = offset_to_next_aligned(aligned_ptr); - adjust(offset + additional_bumps * Alignment, p1ref, p2ref, size); - aligned_ptr = assume_aligned(aligned_ptr); - } -}; - -} // namespace internal - -// An alignment operation that: -// - executes the 'AlignmentT' operation -// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected -// pointer gets aligned, size is decreased accordingly. -// - calls the 'NextT' operation. -// -// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as: -// copy::Then>>(dst, src, count); -template struct Align { -private: - static constexpr size_t ALIGNMENT = AlignmentT::SIZE; - static_assert(ALIGNMENT > 1, "Alignment must be more than 1"); - static_assert(is_power2(ALIGNMENT), "Alignment must be a power of 2"); - -public: - template struct Then { - static void copy(char *__restrict dst, const char *__restrict src, - size_t size) { - AlignmentT::copy(dst, src); - internal::Align::bump(dst, src, size); - NextT::copy(dst, src, size); - } - - // Move forward suitable when dst < src. The alignment is performed with an - // HeadTail operation of size ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes and making sure src is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [___________LLLLLLLL______________] - // [_SSSSSSSS________________________] - // [________SSSSSSSS_________________] - // - // e.g. Moving two bytes and making sure dst is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] - // [____LLLLLLLL_____________________] - // [______LLLLLLLL___________________] - // [_SSSSSSSS________________________] - // [___SSSSSSSS______________________] - static void move(char *dst, const char *src, size_t size) { - char *next_dst = dst; - const char *next_src = src; - size_t next_size = size; - internal::Align::bump(next_dst, next_src, next_size, - 1); - HeadTail::move(dst, src, size - next_size); - NextT::move(next_dst, next_src, next_size); - } - - // Move backward suitable when dst > src. The alignment is performed with an - // HeadTail operation of size ∈ [Alignment, 2 x Alignment]. - // - // e.g. Moving two bytes backward and making sure src is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _________________LLLLLLLL_______] - // [ ___________________LLLLLLLL_____] - // [____________________SSSSSSSS_____] - // [______________________SSSSSSSS___] - // - // e.g. Moving two bytes and making sure dst is then aligned. - // [ | | | | ] - // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] - // [ _______________LLLLLLLL_________] - // [ ___________________LLLLLLLL_____] - // [__________________SSSSSSSS_______] - // [______________________SSSSSSSS___] - static void move_backward(char *dst, const char *src, size_t size) { - char *headtail_dst = dst + size; - const char *headtail_src = src + size; - size_t headtail_size = 0; - internal::Align::bump(headtail_dst, headtail_src, - headtail_size, -2); - HeadTail::move(headtail_dst, headtail_src, headtail_size); - NextT::move_backward(dst, src, size - headtail_size); - } - - static bool equals(const char *lhs, const char *rhs, size_t size) { - if (!AlignmentT::equals(lhs, rhs)) - return false; - internal::Align::bump(lhs, rhs, size); - return NextT::equals(lhs, rhs, size); - } - - static int three_way_compare(const char *lhs, const char *rhs, - size_t size) { - if (!AlignmentT::equals(lhs, rhs)) - return AlignmentT::three_way_compare(lhs, rhs); - internal::Align::bump(lhs, rhs, size); - return NextT::three_way_compare(lhs, rhs, size); - } - - static void splat_set(char *dst, const unsigned char value, size_t size) { - AlignmentT::splat_set(dst, value); - char *dummy = nullptr; - internal::Align::bump(dst, dummy, size); - NextT::splat_set(dst, value, size); - } - }; -}; - -// An operation that allows to skip the specified amount of bytes. -template struct Skip { - template struct Then { - static void copy(char *__restrict dst, const char *__restrict src, - size_t size) { - NextT::copy(dst + Bytes, src + Bytes, size - Bytes); - } - - static void copy(char *__restrict dst, const char *__restrict src) { - NextT::copy(dst + Bytes, src + Bytes); - } - - static bool equals(const char *lhs, const char *rhs, size_t size) { - return NextT::equals(lhs + Bytes, rhs + Bytes, size - Bytes); - } - - static bool equals(const char *lhs, const char *rhs) { - return NextT::equals(lhs + Bytes, rhs + Bytes); - } - - static int three_way_compare(const char *lhs, const char *rhs, - size_t size) { - return NextT::three_way_compare(lhs + Bytes, rhs + Bytes, size - Bytes); - } - - static int three_way_compare(const char *lhs, const char *rhs) { - return NextT::three_way_compare(lhs + Bytes, rhs + Bytes); - } - - static void splat_set(char *dst, const unsigned char value, size_t size) { - NextT::splat_set(dst + Bytes, value, size - Bytes); - } - - static void splat_set(char *dst, const unsigned char value) { - NextT::splat_set(dst + Bytes, value); - } - }; -}; - -// Fixed-size Builtin Operations -// ----------------------------- -// Note: Do not use 'builtin' right now as it requires the implementation of the -// `_inline` versions of all the builtins. Theoretically, Clang can still turn -// them into calls to the C library leading to reentrancy problems. -namespace builtin { - -#ifndef __has_builtin -#define __has_builtin(x) 0 // Compatibility with non-clang compilers. -#endif - -template struct Builtin { - static constexpr size_t SIZE = Size; - - static void copy(char *__restrict dst, const char *__restrict src) { -#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER - for_loop_copy(dst, src); -#elif __has_builtin(__builtin_memcpy_inline) - // __builtin_memcpy_inline guarantees to never call external functions. - // Unfortunately it is not widely available. - __builtin_memcpy_inline(dst, src, SIZE); -#else - for_loop_copy(dst, src); -#endif - } - - static void move(char *dst, const char *src) { -#if LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER - for_loop_move(dst, src); -#elif __has_builtin(__builtin_memmove) - __builtin_memmove(dst, src, SIZE); -#else - for_loop_move(dst, src); -#endif - } - -#if __has_builtin(__builtin_memcmp_inline) -#define LLVM_LIBC_MEMCMP __builtin_memcmp_inline -#else -#define LLVM_LIBC_MEMCMP __builtin_memcmp -#endif - - static bool equals(const char *lhs, const char *rhs) { - return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE) == 0; - } - - static int three_way_compare(const char *lhs, const char *rhs) { - return LLVM_LIBC_MEMCMP(lhs, rhs, SIZE); - } - - static void splat_set(char *dst, const unsigned char value) { - __builtin_memset(dst, value, SIZE); - } - -private: - // Copies `SIZE` bytes from `src` to `dst` using a for loop. - // This code requires the use of `-fno-builtin-memcpy` to prevent the compiler - // from turning the for-loop back into `__builtin_memcpy`. - static void for_loop_copy(char *__restrict dst, const char *__restrict src) { - for (size_t i = 0; i < SIZE; ++i) - dst[i] = src[i]; - } - - static void for_loop_move(char *dst, const char *src) { - for (size_t i = 0; i < SIZE; ++i) - dst[i] = src[i]; - } -}; - -using _1 = Builtin<1>; -using _2 = Builtin<2>; -using _3 = Builtin<3>; -using _4 = Builtin<4>; -using _8 = Builtin<8>; -using _16 = Builtin<16>; -using _32 = Builtin<32>; -using _64 = Builtin<64>; -using _128 = Builtin<128>; - -} // namespace builtin - -// Fixed-size Scalar Operations -// ---------------------------- -namespace scalar { - -// The Scalar type makes use of simple sized integers. -template struct Scalar { - static constexpr size_t SIZE = sizeof(T); - - static void copy(char *__restrict dst, const char *__restrict src) { - store(dst, load(src)); - } - - static void move(char *dst, const char *src) { store(dst, load(src)); } - - static bool equals(const char *lhs, const char *rhs) { - return load(lhs) == load(rhs); - } - - static int three_way_compare(const char *lhs, const char *rhs) { - return scalar_three_way_compare(load(lhs), load(rhs)); - } - - static void splat_set(char *dst, const unsigned char value) { - store(dst, get_splatted_value(value)); - } - - static int scalar_three_way_compare(T a, T b); - - static T load(const char *ptr) { - T value; - builtin::Builtin::copy(reinterpret_cast(&value), ptr); - return value; - } - static void store(char *ptr, T value) { - builtin::Builtin::copy(ptr, reinterpret_cast(&value)); - } - -private: - static T get_splatted_value(const unsigned char value) { - return T(~0) / T(0xFF) * T(value); - } -}; - -template <> -inline int Scalar::scalar_three_way_compare(uint8_t a, uint8_t b) { - const int16_t la = Endian::to_big_endian(a); - const int16_t lb = Endian::to_big_endian(b); - return la - lb; -} -template <> -inline int Scalar::scalar_three_way_compare(uint16_t a, uint16_t b) { - const int32_t la = Endian::to_big_endian(a); - const int32_t lb = Endian::to_big_endian(b); - return la - lb; -} -template <> -inline int Scalar::scalar_three_way_compare(uint32_t a, uint32_t b) { - const uint32_t la = Endian::to_big_endian(a); - const uint32_t lb = Endian::to_big_endian(b); - return la > lb ? 1 : la < lb ? -1 : 0; -} -template <> -inline int Scalar::scalar_three_way_compare(uint64_t a, uint64_t b) { - const uint64_t la = Endian::to_big_endian(a); - const uint64_t lb = Endian::to_big_endian(b); - return la > lb ? 1 : la < lb ? -1 : 0; -} - -using UINT8 = Scalar; // 1 Byte -using UINT16 = Scalar; // 2 Bytes -using UINT32 = Scalar; // 4 Bytes -using UINT64 = Scalar; // 8 Bytes - -using _1 = UINT8; -using _2 = UINT16; -using _3 = Chained; -using _4 = UINT32; -using _8 = UINT64; -using _16 = Repeated<_8, 2>; -using _32 = Repeated<_8, 4>; -using _64 = Repeated<_8, 8>; -using _128 = Repeated<_8, 16>; - -} // namespace scalar -} // namespace __llvm_libc - -#include -#include - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H diff --git a/libc/src/string/memory_utils/elements_aarch64.h b/libc/src/string/memory_utils/elements_aarch64.h deleted file mode 100644 --- a/libc/src/string/memory_utils/elements_aarch64.h +++ /dev/null @@ -1,130 +0,0 @@ -//===-- Elementary operations for aarch64 --------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H - -#include "src/__support/architectures.h" - -#if defined(LLVM_LIBC_ARCH_AARCH64) - -#include -#include // size_t -#include // uint8_t, uint16_t, uint32_t, uint64_t - -#ifdef __ARM_NEON -#include -#endif - -namespace __llvm_libc { -namespace aarch64_memset { -#ifdef __ARM_NEON -struct Splat8 { - static constexpr size_t SIZE = 8; - static void splat_set(char *dst, const unsigned char value) { - vst1_u8((uint8_t *)dst, vdup_n_u8(value)); - } -}; - -struct Splat16 { - static constexpr size_t SIZE = 16; - static void splat_set(char *dst, const unsigned char value) { - vst1q_u8((uint8_t *)dst, vdupq_n_u8(value)); - } -}; - -using _8 = Splat8; -using _16 = Splat16; -#else -using _8 = __llvm_libc::scalar::_8; -using _16 = Repeated<_8, 2>; -#endif // __ARM_NEON - -using _1 = __llvm_libc::scalar::_1; -using _2 = __llvm_libc::scalar::_2; -using _3 = __llvm_libc::scalar::_3; -using _4 = __llvm_libc::scalar::_4; -using _32 = Chained<_16, _16>; -using _64 = Chained<_32, _32>; - -struct Zva64 { - static constexpr size_t SIZE = 64; - - static void splat_set(char *dst, const unsigned char) { -#if __SIZEOF_POINTER__ == 4 - asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); -#else - asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); -#endif - } -}; - -inline static bool hasZva() { - uint64_t zva_val; - asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); - // DC ZVA is permitted if DZP, bit [4] is zero. - // BS, bits [3:0] is log2 of the block size in words. - // So the next line checks whether the instruction is permitted and block size - // is 16 words (i.e. 64 bytes). - return (zva_val & 0b11111) == 0b00100; -} - -} // namespace aarch64_memset - -namespace aarch64 { - -using _1 = __llvm_libc::scalar::_1; -using _2 = __llvm_libc::scalar::_2; -using _3 = __llvm_libc::scalar::_3; -using _4 = __llvm_libc::scalar::_4; -using _8 = __llvm_libc::scalar::_8; -using _16 = __llvm_libc::scalar::_16; - -#ifdef __ARM_NEON -struct N32 { - static constexpr size_t SIZE = 32; - static bool equals(const char *lhs, const char *rhs) { - uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs); - uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs); - uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16)); - uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16)); - uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1)); - uint64_t res = - vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0); - return res == 0; - } - static int three_way_compare(const char *lhs, const char *rhs) { - uint8x16_t l_0 = vld1q_u8((const uint8_t *)lhs); - uint8x16_t r_0 = vld1q_u8((const uint8_t *)rhs); - uint8x16_t l_1 = vld1q_u8((const uint8_t *)(lhs + 16)); - uint8x16_t r_1 = vld1q_u8((const uint8_t *)(rhs + 16)); - uint8x16_t temp = vpmaxq_u8(veorq_u8(l_0, r_0), veorq_u8(l_1, r_1)); - uint64_t res = - vgetq_lane_u64(vreinterpretq_u64_u8(vpmaxq_u8(temp, temp)), 0); - if (res == 0) - return 0; - size_t index = (__builtin_ctzl(res) >> 3) << 2; - uint32_t l = *((const uint32_t *)(lhs + index)); - uint32_t r = *((const uint32_t *)(rhs + index)); - return __llvm_libc::scalar::_4::scalar_three_way_compare(l, r); - } -}; - -using _32 = N32; -using _64 = Repeated<_32, 2>; -#else -using _32 = __llvm_libc::scalar::_32; -using _64 = __llvm_libc::scalar::_64; -#endif // __ARM_NEON - -} // namespace aarch64 -} // namespace __llvm_libc - -#endif // defined(LLVM_LIBC_ARCH_AARCH64) - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_AARCH64_H diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h deleted file mode 100644 --- a/libc/src/string/memory_utils/elements_x86.h +++ /dev/null @@ -1,189 +0,0 @@ -//===-- Elementary operations for x86 -------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H -#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H - -#include "src/__support/CPP/bit.h" -#include "src/__support/architectures.h" - -#if defined(LLVM_LIBC_ARCH_X86) - -#include // size_t -#include // uint8_t, uint16_t, uint32_t, uint64_t - -#ifdef __SSE2__ -#include -#endif // __SSE2__ - -#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar - -// Fixed-size Vector Operations -// ---------------------------- - -namespace __llvm_libc { -namespace x86 { - -#ifdef __SSE2__ -template struct Vector : public Base { - static void copy(char *__restrict dst, const char *__restrict src) { - Base::store(dst, Base::load(src)); - } - - static void move(char *dst, const char *src) { - Base::store(dst, Base::load(src)); - } - - static bool equals(const char *a, const char *b) { - return Base::not_equal_mask(Base::load(a), Base::load(b)) == 0; - } - - static int three_way_compare(const char *a, const char *b) { - const auto mask = Base::not_equal_mask(Base::load(a), Base::load(b)); - if (!mask) - return 0; - return char_diff(a, b, mask); - } - - static void splat_set(char *dst, const unsigned char value) { - Base::store(dst, Base::get_splatted_value(value)); - } - - static int char_diff(const char *a, const char *b, uint64_t mask) { - const size_t diff_index = __builtin_ctzll(mask); - const int ca = (unsigned char)a[diff_index]; - const int cb = (unsigned char)b[diff_index]; - return ca - cb; - } -}; - -struct M128 { - static constexpr size_t SIZE = 16; - using T = char __attribute__((__vector_size__(SIZE))); - static uint16_t mask(T value) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return static_cast( - _mm_movemask_epi8(cpp::bit_cast<__m128i>(value))); - } - static uint16_t not_equal_mask(T a, T b) { return mask(a != b); } - static T load(const char *ptr) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return cpp::bit_cast( - _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr))); - } - static void store(char *ptr, T value) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), - cpp::bit_cast<__m128i>(value)); - } - static T get_splatted_value(const char v) { - const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; - return splatted; - } -}; - -using Vector128 = Vector; // 16 Bytes - -#ifdef __AVX2__ -struct M256 { - static constexpr size_t SIZE = 32; - using T = char __attribute__((__vector_size__(SIZE))); - static uint32_t mask(T value) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return _mm256_movemask_epi8(cpp::bit_cast<__m256i>(value)); - } - static uint32_t not_equal_mask(T a, T b) { return mask(a != b); } - static T load(const char *ptr) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return cpp::bit_cast( - _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr))); - } - static void store(char *ptr, T value) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), - cpp::bit_cast<__m256i>(value)); - } - static T get_splatted_value(const char v) { - const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, - v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; - return splatted; - } -}; - -using Vector256 = Vector; // 32 Bytes - -#if defined(__AVX512F__) and defined(__AVX512BW__) -struct M512 { - static constexpr size_t SIZE = 64; - using T = char __attribute__((__vector_size__(SIZE))); - static uint64_t not_equal_mask(T a, T b) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(a), - cpp::bit_cast<__m512i>(b)); - } - static T load(const char *ptr) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return cpp::bit_cast(_mm512_loadu_epi8(ptr)); - } - static void store(char *ptr, T value) { - // NOLINTNEXTLINE(llvmlibc-callee-namespace) - return _mm512_storeu_epi8(ptr, cpp::bit_cast<__m512i>(value)); - } - static T get_splatted_value(const char v) { - const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, - v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, - v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, - v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; - return splatted; - } -}; -using Vector512 = Vector; - -#endif // defined(__AVX512F__) and defined(__AVX512BW__) -#endif // __AVX2__ -#endif // __SSE2__ - -using _1 = __llvm_libc::scalar::_1; -using _2 = __llvm_libc::scalar::_2; -using _3 = __llvm_libc::scalar::_3; -using _4 = __llvm_libc::scalar::_4; -using _8 = __llvm_libc::scalar::_8; -#if defined(__AVX512F__) && defined(__AVX512BW__) -using _16 = __llvm_libc::x86::Vector128; -using _32 = __llvm_libc::x86::Vector256; -using _64 = __llvm_libc::x86::Vector512; -using _128 = __llvm_libc::Repeated<_64, 2>; -#elif defined(__AVX2__) -using _16 = __llvm_libc::x86::Vector128; -using _32 = __llvm_libc::x86::Vector256; -using _64 = __llvm_libc::Repeated<_32, 2>; -using _128 = __llvm_libc::Repeated<_32, 4>; -#elif defined(__SSE2__) -using _16 = __llvm_libc::x86::Vector128; -using _32 = __llvm_libc::Repeated<_16, 2>; -using _64 = __llvm_libc::Repeated<_16, 4>; -using _128 = __llvm_libc::Repeated<_16, 8>; -#else -using _16 = __llvm_libc::Repeated<_8, 2>; -using _32 = __llvm_libc::Repeated<_8, 4>; -using _64 = __llvm_libc::Repeated<_8, 8>; -using _128 = __llvm_libc::Repeated<_8, 16>; -#endif - -struct Accelerator { - static void copy(char *dst, const char *src, size_t count) { - asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); - } -}; - -} // namespace x86 -} // namespace __llvm_libc - -#endif // defined(LLVM_LIBC_ARCH_X86) - -#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h --- a/libc/src/string/memory_utils/memcmp_implementations.h +++ b/libc/src/string/memory_utils/memcmp_implementations.h @@ -11,92 +11,120 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/elements.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_x86.h" +#include "src/string/memory_utils/utils.h" #include // size_t namespace __llvm_libc { -static inline int inline_memcmp(const char *lhs, const char *rhs, - size_t count) { +static inline MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2, + size_t count) { + if (unlikely(count >= 384)) { + if (auto value = generic::Memcmp<16>::block(p1, p2)) + return value; + align_to_next_boundary<16, Arg::P1>(p1, p2, count); + } + return generic::Memcmp<16>::loop_and_tail(p1, p2, count); +} + #if defined(LLVM_LIBC_ARCH_X86) - ///////////////////////////////////////////////////////////////////////////// - // LLVM_LIBC_ARCH_X86 - ///////////////////////////////////////////////////////////////////////////// - using namespace __llvm_libc::x86; - if (count == 0) - return 0; - if (count == 1) - return three_way_compare<_1>(lhs, rhs); - if (count == 2) - return three_way_compare<_2>(lhs, rhs); - if (count == 3) - return three_way_compare<_3>(lhs, rhs); - if (count <= 8) - return three_way_compare>(lhs, rhs, count); - if (count <= 16) - return three_way_compare>(lhs, rhs, count); +static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, + size_t count) { + if (unlikely(count >= 384)) { + if (auto value = x86::sse2::Memcmp<16>::block(p1, p2)) + return value; + align_to_next_boundary<16, Arg::P1>(p1, p2, count); + } + return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count); +} + +static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, + size_t count) { if (count <= 32) - return three_way_compare>(lhs, rhs, count); + return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); if (count <= 64) - return three_way_compare>(lhs, rhs, count); + return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); if (count <= 128) - return three_way_compare>(lhs, rhs, count); - return three_way_compare::Then>>(lhs, rhs, count); -#elif defined(LLVM_LIBC_ARCH_AARCH64) - ///////////////////////////////////////////////////////////////////////////// - // LLVM_LIBC_ARCH_AARCH64 - ///////////////////////////////////////////////////////////////////////////// - using namespace ::__llvm_libc::aarch64; - if (count == 0) // [0, 0] - return 0; - if (count == 1) // [1, 1] - return three_way_compare<_1>(lhs, rhs); - if (count == 2) // [2, 2] - return three_way_compare<_2>(lhs, rhs); - if (count == 3) // [3, 3] - return three_way_compare<_3>(lhs, rhs); - if (count < 8) // [4, 7] - return three_way_compare>(lhs, rhs, count); - if (count < 16) // [8, 15] - return three_way_compare>(lhs, rhs, count); - if (unlikely(count >= 128)) // [128, ∞] - return three_way_compare::Then>>(lhs, rhs, count); - if (!equals<_16>(lhs, rhs)) // [16, 16] - return three_way_compare<_16>(lhs, rhs); + return x86::avx2::Memcmp<64>::head_tail(p1, p2, count); + if (unlikely(count >= 384)) { + if (auto value = x86::avx2::Memcmp<32>::block(p1, p2)) + return value; + align_to_next_boundary<32, Arg::P1>(p1, p2, count); + } + return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count); +} + +static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, + size_t count) { + if (count <= 32) + return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); + if (count <= 64) + return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); + if (count <= 128) + return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count); + if (unlikely(count >= 384)) { + if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2)) + return value; + align_to_next_boundary<64, Arg::P1>(p1, p2, count); + } + return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count); +} +#endif // defined(LLVM_LIBC_ARCH_X86) + +#if defined(LLVM_LIBC_ARCH_AARCH64) +static inline MemcmpReturnType inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, + size_t count) { + if (unlikely(count >= 128)) { // [128, ∞] + if (auto value = generic::Memcmp<16>::block(p1, p2)) + return value; + align_to_next_boundary<16, Arg::P1>(p1, p2, count); + return generic::Memcmp<32>::loop_and_tail(p1, p2, count); + } if (count < 32) // [17, 31] - return three_way_compare>(lhs, rhs, count); - if (!equals::Then<_16>>(lhs, rhs)) // [32, 32] - return three_way_compare::Then<_16>>(lhs, rhs); + return generic::Memcmp<16>::tail(p1, p2, count); + if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32] + return generic::Memcmp<16>::block(p1 + 16, p2 + 16); if (count < 64) // [33, 63] - return three_way_compare>(lhs, rhs, count); + return generic::Memcmp<32>::tail(p1, p2, count); // [64, 127] - return three_way_compare::Then>>(lhs, rhs, count); -#else - ///////////////////////////////////////////////////////////////////////////// - // Default - ///////////////////////////////////////////////////////////////////////////// - using namespace ::__llvm_libc::scalar; + return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32); +} +#endif // defined(LLVM_LIBC_ARCH_AARCH64) +static inline MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) { if (count == 0) - return 0; + return MemcmpReturnType::ZERO(); if (count == 1) - return three_way_compare<_1>(lhs, rhs); + return generic::Memcmp<1>::block(p1, p2); if (count == 2) - return three_way_compare<_2>(lhs, rhs); + return generic::Memcmp<2>::block(p1, p2); if (count == 3) - return three_way_compare<_3>(lhs, rhs); + return generic::Memcmp<3>::block(p1, p2); if (count <= 8) - return three_way_compare>(lhs, rhs, count); + return generic::Memcmp<4>::head_tail(p1, p2, count); if (count <= 16) - return three_way_compare>(lhs, rhs, count); - if (count <= 32) - return three_way_compare>(lhs, rhs, count); - if (count <= 64) - return three_way_compare>(lhs, rhs, count); - if (count <= 128) - return three_way_compare>(lhs, rhs, count); - return three_way_compare::Then>>(lhs, rhs, count); + return generic::Memcmp<8>::head_tail(p1, p2, count); +#if defined(LLVM_LIBC_ARCH_X86) + if constexpr (x86::kAvx512BW) + return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); + else if constexpr (x86::kAvx2) + return inline_memcmp_x86_avx2_gt16(p1, p2, count); + else if constexpr (x86::kSse2) + return inline_memcmp_x86_sse2_gt16(p1, p2, count); + else + return inline_memcmp_generic_gt16(p1, p2, count); +#elif defined(LLVM_LIBC_ARCH_AARCH64) + ///////////////////////////////////////////////////////////////////////////// + // LLVM_LIBC_ARCH_AARCH64 + ///////////////////////////////////////////////////////////////////////////// + if constexpr (aarch64::kNeon) + return inline_memcmp_aarch64_neon_gt16(p1, p2, count); + else + return inline_memcmp_generic_gt16(p1, p2, count); +#else + return inline_memcmp_generic_gt16(p1, p2, count); #endif } diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h --- a/libc/src/string/memory_utils/memcpy_implementations.h +++ b/libc/src/string/memory_utils/memcpy_implementations.h @@ -11,7 +11,8 @@ #include "src/__support/architectures.h" #include "src/__support/common.h" -#include "src/string/memory_utils/elements.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_x86.h" #include "src/string/memory_utils/utils.h" #include // size_t @@ -45,58 +46,49 @@ // LLVM_LIBC_ARCH_X86 ///////////////////////////////////////////////////////////////////////////// - // Whether to use only rep;movsb. - constexpr bool USE_ONLY_REP_MOVSB = - LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); + // Whether to use rep;movsb exclusively, not at all, or only above a certain + // threshold. + // TODO: Use only a single preprocessor definition to simplify the code. +#ifndef LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE +#define LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE -1 +#endif - // kRepMovsBSize == -1 : Only CopyAligned is used. - // kRepMovsBSize == 0 : Only RepMovsb is used. - // else CopyAligned is used up to kRepMovsBSize and then RepMovsb. - constexpr size_t REP_MOVS_B_SIZE = -#if defined(LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE) + static constexpr bool kUseOnlyRepMovsb = + LLVM_LIBC_IS_DEFINED(LLVM_LIBC_MEMCPY_X86_USE_ONLY_REPMOVSB); + static constexpr size_t kRepMovsbThreshold = LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE; -#else - -1; -#endif // LLVM_LIBC_MEMCPY_X86_USE_REPMOVSB_FROM_SIZE - - // Whether target supports AVX instructions. - constexpr bool HAS_AVX = LLVM_LIBC_IS_DEFINED(__AVX__); - -#if defined(__AVX__) - using LoopBlockSize = _64; -#else - using LoopBlockSize = _32; -#endif - if (USE_ONLY_REP_MOVSB) - return copy(dst, src, count); + if constexpr (kUseOnlyRepMovsb) + return x86::Memcpy::repmovsb(dst, src, count); if (count == 0) return; if (count == 1) - return copy<_1>(dst, src); + return Memcpy<1>::block(dst, src); if (count == 2) - return copy<_2>(dst, src); + return Memcpy<2>::block(dst, src); if (count == 3) - return copy<_3>(dst, src); + return Memcpy<3>::block(dst, src); if (count == 4) - return copy<_4>(dst, src); + return Memcpy<4>::block(dst, src); if (count < 8) - return copy>(dst, src, count); + return Memcpy<4>::head_tail(dst, src, count); if (count < 16) - return copy>(dst, src, count); + return Memcpy<8>::head_tail(dst, src, count); if (count < 32) - return copy>(dst, src, count); + return Memcpy<16>::head_tail(dst, src, count); if (count < 64) - return copy>(dst, src, count); + return Memcpy<32>::head_tail(dst, src, count); if (count < 128) - return copy>(dst, src, count); - if (HAS_AVX && count < 256) - return copy>(dst, src, count); - if (count <= REP_MOVS_B_SIZE) - return copy::Then>>(dst, src, - count); - return copy(dst, src, count); + return Memcpy<64>::head_tail(dst, src, count); + if (x86::kAvx && count < 256) + return Memcpy<128>::head_tail(dst, src, count); + if (count <= kRepMovsbThreshold) { + Memcpy<32>::block(dst, src); + align_to_next_boundary<32, Arg::Dst>(dst, src, count); + return Memcpy < x86::kAvx ? 64 : 32 > ::loop_and_tail(dst, src, count); + } + return x86::Memcpy::repmovsb(dst, src, count); #elif defined(LLVM_LIBC_ARCH_AARCH64) ///////////////////////////////////////////////////////////////////////////// // LLVM_LIBC_ARCH_AARCH64 @@ -104,24 +96,26 @@ if (count == 0) return; if (count == 1) - return copy<_1>(dst, src); + return Memcpy<1>::block(dst, src); if (count == 2) - return copy<_2>(dst, src); + return Memcpy<2>::block(dst, src); if (count == 3) - return copy<_3>(dst, src); + return Memcpy<3>::block(dst, src); if (count == 4) - return copy<_4>(dst, src); + return Memcpy<4>::block(dst, src); if (count < 8) - return copy>(dst, src, count); + return Memcpy<4>::head_tail(dst, src, count); if (count < 16) - return copy>(dst, src, count); + return Memcpy<8>::head_tail(dst, src, count); if (count < 32) - return copy>(dst, src, count); + return Memcpy<16>::head_tail(dst, src, count); if (count < 64) - return copy>(dst, src, count); + return Memcpy<32>::head_tail(dst, src, count); if (count < 128) - return copy>(dst, src, count); - return copy::Then>>(dst, src, count); + return Memcpy<64>::head_tail(dst, src, count); + Memcpy<16>::block(dst, src); + align_to_next_boundary<16, Arg::Src>(dst, src, count); + return Memcpy<64>::loop_and_tail(dst, src, count); #else ///////////////////////////////////////////////////////////////////////////// // Default @@ -129,24 +123,26 @@ if (count == 0) return; if (count == 1) - return copy<_1>(dst, src); + return Memcpy<1>::block(dst, src); if (count == 2) - return copy<_2>(dst, src); + return Memcpy<2>::block(dst, src); if (count == 3) - return copy<_3>(dst, src); + return Memcpy<3>::block(dst, src); if (count == 4) - return copy<_4>(dst, src); + return Memcpy<4>::block(dst, src); if (count < 8) - return copy>(dst, src, count); + return Memcpy<4>::head_tail(dst, src, count); if (count < 16) - return copy>(dst, src, count); + return Memcpy<8>::head_tail(dst, src, count); if (count < 32) - return copy>(dst, src, count); + return Memcpy<16>::head_tail(dst, src, count); if (count < 64) - return copy>(dst, src, count); + return Memcpy<32>::head_tail(dst, src, count); if (count < 128) - return copy>(dst, src, count); - return copy::Then>>(dst, src, count); + return Memcpy<64>::head_tail(dst, src, count); + Memcpy<32>::block(dst, src); + align_to_next_boundary<32, Arg::Src>(dst, src, count); + return Memcpy<32>::loop_and_tail(dst, src, count); #endif } diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -10,7 +10,9 @@ #define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_IMPLEMENTATIONS_H #include "src/__support/architectures.h" -#include "src/string/memory_utils/elements.h" +#include "src/string/memory_utils/op_aarch64.h" +#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_x86.h" #include "src/string/memory_utils/utils.h" #include // size_t @@ -48,88 +50,100 @@ // advance. SetAlignedBlocks<64> may waste up to 63 Bytes, SetAlignedBlocks<32> // may waste up to 31 Bytes. Benchmarks showed that SetAlignedBlocks<64> was not // superior for sizes that mattered. -inline static void inline_memset(char *dst, unsigned char value, size_t count) { +inline static void inline_memset(Ptr dst, uint8_t value, size_t count) { #if defined(LLVM_LIBC_ARCH_X86) ///////////////////////////////////////////////////////////////////////////// // LLVM_LIBC_ARCH_X86 ///////////////////////////////////////////////////////////////////////////// - using namespace __llvm_libc::x86; + static constexpr size_t kMaxSize = x86::kAvx512F ? 64 + : x86::kAvx ? 32 + : x86::kSse2 ? 16 + : 8; if (count == 0) return; if (count == 1) - return splat_set<_1>(dst, value); + return generic::Memset<1, kMaxSize>::block(dst, value); if (count == 2) - return splat_set<_2>(dst, value); + return generic::Memset<2, kMaxSize>::block(dst, value); if (count == 3) - return splat_set<_3>(dst, value); + return generic::Memset<3, kMaxSize>::block(dst, value); if (count <= 8) - return splat_set>(dst, value, count); + return generic::Memset<4, kMaxSize>::head_tail(dst, value, count); if (count <= 16) - return splat_set>(dst, value, count); + return generic::Memset<8, kMaxSize>::head_tail(dst, value, count); if (count <= 32) - return splat_set>(dst, value, count); + return generic::Memset<16, kMaxSize>::head_tail(dst, value, count); if (count <= 64) - return splat_set>(dst, value, count); + return generic::Memset<32, kMaxSize>::head_tail(dst, value, count); if (count <= 128) - return splat_set>(dst, value, count); - return splat_set::Then>>(dst, value, count); + return generic::Memset<64, kMaxSize>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<32, kMaxSize>::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count); #elif defined(LLVM_LIBC_ARCH_AARCH64) ///////////////////////////////////////////////////////////////////////////// // LLVM_LIBC_ARCH_AARCH64 ///////////////////////////////////////////////////////////////////////////// - using namespace __llvm_libc::aarch64_memset; + static constexpr size_t kMaxSize = aarch64::kNeon ? 16 : 8; if (count == 0) return; if (count <= 3) { - splat_set<_1>(dst, value); + generic::Memset<1, kMaxSize>::block(dst, value); if (count > 1) - splat_set>(dst, value, count); + generic::Memset<2, kMaxSize>::tail(dst, value, count); return; } if (count <= 8) - return splat_set>(dst, value, count); + return generic::Memset<4, kMaxSize>::head_tail(dst, value, count); if (count <= 16) - return splat_set>(dst, value, count); + return generic::Memset<8, kMaxSize>::head_tail(dst, value, count); if (count <= 32) - return splat_set>(dst, value, count); + return generic::Memset<16, kMaxSize>::head_tail(dst, value, count); if (count <= (32 + 64)) { - splat_set<_32>(dst, value); + generic::Memset<32, kMaxSize>::block(dst, value); if (count <= 64) - return splat_set>(dst, value, count); - splat_set::Then<_32>>(dst, value); - splat_set>(dst, value, count); + return generic::Memset<32, kMaxSize>::tail(dst, value, count); + generic::Memset<32, kMaxSize>::block(dst + 32, value); + generic::Memset<32, kMaxSize>::tail(dst, value, count); return; } - if (count >= 448 && value == 0 && hasZva()) - return splat_set::Then>>(dst, 0, - count); - else - return splat_set::Then>>(dst, value, count); + if (count >= 448 && value == 0 && aarch64::neon::hasZva()) { + generic::Memset<64, kMaxSize>::block(dst, 0); + align_to_next_boundary<64>(dst, count); + return aarch64::neon::BzeroCacheLine<64>::loop_and_tail(dst, 0, count); + } else { + generic::Memset<16, kMaxSize>::block(dst, value); + align_to_next_boundary<16>(dst, count); + return generic::Memset<64, kMaxSize>::loop_and_tail(dst, value, count); + } #else ///////////////////////////////////////////////////////////////////////////// // Default ///////////////////////////////////////////////////////////////////////////// - using namespace ::__llvm_libc::scalar; - + static constexpr size_t kMaxSize = 8; if (count == 0) return; if (count == 1) - return splat_set<_1>(dst, value); + return generic::Memset<1, kMaxSize>::block(dst, value); if (count == 2) - return splat_set<_2>(dst, value); + return generic::Memset<2, kMaxSize>::block(dst, value); if (count == 3) - return splat_set<_3>(dst, value); + return generic::Memset<3, kMaxSize>::block(dst, value); if (count <= 8) - return splat_set>(dst, value, count); + return generic::Memset<4, kMaxSize>::head_tail(dst, value, count); if (count <= 16) - return splat_set>(dst, value, count); + return generic::Memset<8, kMaxSize>::head_tail(dst, value, count); if (count <= 32) - return splat_set>(dst, value, count); + return generic::Memset<16, kMaxSize>::head_tail(dst, value, count); if (count <= 64) - return splat_set>(dst, value, count); + return generic::Memset<32, kMaxSize>::head_tail(dst, value, count); if (count <= 128) - return splat_set>(dst, value, count); - return splat_set::Then>>(dst, value, count); + return generic::Memset<64, kMaxSize>::head_tail(dst, value, count); + // Aligned loop + generic::Memset<32, kMaxSize>::block(dst, value); + align_to_next_boundary<32>(dst, count); + return generic::Memset<32, kMaxSize>::loop_and_tail(dst, value, count); #endif } diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_aarch64.h @@ -0,0 +1,172 @@ +//===-- aarch64 implementation of memory function building blocks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides aarch64 specific building blocks to compose memory +// functions. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_AARCH64) + +#include "src/string/memory_utils/op_generic.h" + +#ifdef __ARM_NEON +#include +#endif //__ARM_NEON + +namespace __llvm_libc::aarch64 { + +static inline constexpr bool kNeon = LLVM_LIBC_IS_DEFINED(__ARM_NEON); + +namespace neon { + +template struct BzeroCacheLine { + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t) { + static_assert(Size == 64); +#if __SIZEOF_POINTER__ == 4 + asm("dc zva, %w[dst]" : : [dst] "r"(dst) : "memory"); +#else + asm("dc zva, %[dst]" : : [dst] "r"(dst) : "memory"); +#endif + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + // Unaligned store, we can't use 'dc zva' here. + static constexpr size_t kMaxSize = kNeon ? 16 : 8; + generic::Memset::tail(dst, value, count); + } +}; + +inline static bool hasZva() { + uint64_t zva_val; + asm("mrs %[zva_val], dczid_el0" : [zva_val] "=r"(zva_val)); + // DC ZVA is permitted if DZP, bit [4] is zero. + // BS, bits [3:0] is log2 of the block count in words. + // So the next line checks whether the instruction is permitted and block + // count is 16 words (i.e. 64 bytes). + return (zva_val & 0b11111) == 0b00100; +} + +} // namespace neon + +/////////////////////////////////////////////////////////////////////////////// +// Memset + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +template struct Bcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t BlockSize = 32; + + static const unsigned char *as_u8(CPtr ptr) { + return reinterpret_cast(ptr); + } + + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + 16); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + 16); + uint8x16_t an = veorq_u8(a, n); + uint8x16_t bo = veorq_u8(b, o); + // anbo = (a ^ n) | (b ^ o). At least one byte is nonzero if there is + // a difference between the two buffers. We reduce this value down to 4 + // bytes in two steps. First, calculate the saturated move value when + // going from 2x64b to 2x32b. Second, compute the max of the 2x32b to get + // a single 32 bit nonzero value if a mismatch occurred. + uint8x16_t anbo = vorrq_u8(an, bo); + uint32x2_t anbo_reduced = vqmovn_u64(anbo); + return vmaxv_u32(anbo_reduced); + } else if constexpr ((Size % BlockSize) == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = Bcmp::block(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if constexpr (Size <= 8) { + return generic::Bcmp::head_tail(p1, p2, count); + } else if constexpr (Size == 16) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + count - 16); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + count - 16); + uint8x16_t an = veorq_s8(a, n); + uint8x16_t bo = veorq_s8(b, o); + // anbo = (a ^ n) | (b ^ o) + uint8x16_t anbo = vorrq_s8(an, bo); + uint32x2_t anbo_reduced = vqmovn_u64(anbo); + return vmaxv_u32(anbo_reduced); + } else if constexpr (Size == 32) { + auto _p1 = as_u8(p1); + auto _p2 = as_u8(p2); + uint8x16_t a = vld1q_u8(_p1); + uint8x16_t b = vld1q_u8(_p1 + 16); + uint8x16_t c = vld1q_u8(_p1 + count - 16); + uint8x16_t d = vld1q_u8(_p1 + count - 32); + uint8x16_t n = vld1q_u8(_p2); + uint8x16_t o = vld1q_u8(_p2 + 16); + uint8x16_t p = vld1q_u8(_p2 + count - 16); + uint8x16_t q = vld1q_u8(_p2 + count - 32); + uint8x16_t an = veorq_s8(a, n); + uint8x16_t bo = veorq_s8(b, o); + uint8x16_t cp = veorq_s8(c, p); + uint8x16_t dq = veorq_s8(d, q); + uint8x16_t anbo = vorrq_s8(an, bo); + uint8x16_t cpdq = vorrq_s8(cp, dq); + // abnocpdq = ((a ^ n) | (b ^ o)) | ((c ^ p) | (d ^ q)). Reduce this to + // a nonzero 32 bit value if a mismatch occurred. + uint64x2_t abnocpdq = vreinterpretq_u64_u8(anbo | cpdq); + uint32x2_t abnocpdq_reduced = vqmovn_u64(abnocpdq); + return vmaxv_u32(abnocpdq_reduced); + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +} // namespace __llvm_libc::aarch64 + +#endif // LLVM_LIBC_ARCH_AARCH64 + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_builtin.h @@ -0,0 +1,146 @@ +//===-- Implementation using the __builtin_XXX_inline ---------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides generic C++ building blocks to compose memory functions. +// They rely on the compiler to generate the best possible code through the use +// of the `__builtin_XXX_inline` builtins. These builtins are currently only +// available in Clang. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H + +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc::builtin { + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy +template struct Memcpy { + static constexpr size_t SIZE = Size; + static inline void block(Ptr __restrict dst, CPtr __restrict src) { +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE + return __builtin_memcpy_inline(dst, src, SIZE); +#else + deferred_static_assert("Missing __builtin_memcpy_inline"); + (void)dst; + (void)src; +#endif + } + + static inline void tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + block(dst + count - SIZE, src + count - SIZE); + } + + static inline void head_tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + block(dst, src); + tail(dst, src, count); + } + + static inline void loop_and_tail(Ptr __restrict dst, CPtr __restrict src, + size_t count) { + size_t offset = 0; + do { + block(dst + offset, src + offset); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, src, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memset +template struct Memset { + using ME = Memset; + static constexpr size_t SIZE = Size; + static inline void block(Ptr dst, uint8_t value) { +#ifdef LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE + __builtin_memset_inline(dst, value, Size); +#else + deferred_static_assert("Missing __builtin_memset_inline"); + (void)dst; + (void)value; +#endif + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +template struct Bcmp { + using ME = Bcmp; + static constexpr size_t SIZE = Size; + static inline BcmpReturnType block(CPtr, CPtr) { + deferred_static_assert("Missing __builtin_memcmp_inline"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType head_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return BcmpReturnType::ZERO(); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp +template struct Memcmp { + using ME = Memcmp; + static constexpr size_t SIZE = Size; + static inline MemcmpReturnType block(CPtr, CPtr) { + deferred_static_assert("Missing __builtin_memcmp_inline"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType head_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType loop_and_tail(CPtr, CPtr, size_t) { + deferred_static_assert("Not implemented"); + return MemcmpReturnType::ZERO(); + } +}; + +} // namespace __llvm_libc::builtin + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_BUILTIN_H diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_generic.h @@ -0,0 +1,461 @@ +//===-- Generic implementation of memory function building blocks ---------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides generic C++ building blocks. +// Depending on the requested size, the block operation uses unsigned integral +// types, vector types or an array of the type with the maximum size. +// +// The maximum size is passed as a template argument. For instance, on x86 +// platforms that only supports integral types the maximum size would be 8 +// (corresponding to uint64_t). On this platform if we request the size 32, this +// would be treated as a cpp::array. +// +// On the other hand, if the platform is x86 with support for AVX the maximum +// size is 32 and the operation can be handled with a single native operation. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H + +#include "src/__support/CPP/array.h" +#include "src/__support/CPP/type_traits.h" +#include "src/__support/endian.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/utils.h" + +#include + +namespace __llvm_libc::generic { + +// CTPair and CTMap below implement a compile time map. +// This is useful to map from a Size to a type handling this size. +// +// Example usage: +// using MyMap = CTMap, +// CTPair<2, uint16_t>, +// >; +// ... +// using UInt8T = MyMap::find_type<1>; +template struct CTPair { + using type = T; + static CTPair get_pair(cpp::integral_constant) { return {}; } +}; +template struct CTMap : public Pairs... { + using Pairs::get_pair...; + template + using find_type = + typename decltype(get_pair(cpp::integral_constant{}))::type; +}; + +// Helper to test if a type is void. +template inline constexpr bool is_void_v = cpp::is_same_v; + +// Implements load, store and splat for unsigned integral types. +template struct ScalarType { + using Type = T; + static_assert(cpp::is_integral_v && !cpp::is_signed_v); + + static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } + static inline void store(Ptr dst, Type value) { + ::__llvm_libc::store(dst, value); + } + static inline Type splat(uint8_t value) { + return Type(~0) / Type(0xFF) * Type(value); + } +}; + +// Implements load, store and splat for vector types. +template struct VectorType { + using Type = uint8_t __attribute__((__vector_size__(Size))); + static inline Type load(CPtr src) { return ::__llvm_libc::load(src); } + static inline void store(Ptr dst, Type value) { + ::__llvm_libc::store(dst, value); + } + static inline Type splat(uint8_t value) { + Type Out; + // This for loop is optimized out for vector types. + for (size_t i = 0; i < Size; ++i) + Out[i] = static_cast(value); + return Out; + } +}; + +// We currently don't support 8- or 16-bit platforms, it must be 32- or 64-bit. +static_assert((UINTPTR_MAX == 4294967295U) || + (UINTPTR_MAX == 18446744073709551615UL)); + +// Map from sizes to structures offering static load, store and splat methods. +// Note: On platforms lacking vector support, we use the ArrayType below and +// decompose the operation in smaller pieces. +using NativeTypeMap = + CTMap>, // + CTPair<2, ScalarType>, // + CTPair<4, ScalarType>, // +#if defined(LLVM_LIBC_ARCH_X86_64) || defined(LLVM_LIBC_ARCH_AARCH64) + CTPair<8, ScalarType>, // Not available on 32bit +#endif // + CTPair<16, VectorType<16>>, // + CTPair<32, VectorType<32>>, // + CTPair<64, VectorType<64>>>; + +// Implements load, store and splat for sizes not natively supported by the +// platform. SubType is either ScalarType or VectorType. +template struct ArrayType { + using Type = cpp::array; + static constexpr size_t SizeOfElement = sizeof(typename SubType::Type); + static inline Type load(CPtr src) { + Type Value; + for (size_t I = 0; I < ArraySize; ++I) + Value[I] = SubType::load(src + (I * SizeOfElement)); + return Value; + } + static inline void store(Ptr dst, Type Value) { + for (size_t I = 0; I < ArraySize; ++I) + SubType::store(dst + (I * SizeOfElement), Value[I]); + } + static inline Type splat(uint8_t value) { + Type Out; + for (size_t I = 0; I < ArraySize; ++I) + Out[I] = SubType::splat(value); + return Out; + } +}; + +// Checks whether we should use an ArrayType. +template static constexpr bool useArrayType() { + return (Size > MaxSize) && ((Size % MaxSize) == 0) && + !is_void_v>; +} + +// Compute the type to handle an operation of Size bytes knowing that the +// underlying platform only support native types up to MaxSize bytes. +template +using getTypeFor = cpp::conditional_t< + useArrayType(), + ArrayType, Size / MaxSize>, + NativeTypeMap::find_type>; + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy +// When building with clang we can delegate to the builtin implementation. +/////////////////////////////////////////////////////////////////////////////// + +template using Memcpy = builtin::Memcpy; + +/////////////////////////////////////////////////////////////////////////////// +// Memset +// The MaxSize template argument gives the maximum size handled natively by the +// platform. For instance on x86 with AVX support this would be 32. If a size +// greater than MaxSize is requested we break the operation down in smaller +// pieces of size MaxSize. +/////////////////////////////////////////////////////////////////////////////// +template struct Memset { + static_assert(is_power2(MaxSize)); + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, uint8_t value) { + if constexpr (Size == 3) { + Memset<1, MaxSize>::block(dst + 2, value); + Memset<2, MaxSize>::block(dst, value); + } else { + using T = getTypeFor; + if constexpr (is_void_v) { + deferred_static_assert("Unimplemented Size"); + } else { + T::store(dst, T::splat(value)); + } + } + } + + static inline void tail(Ptr dst, uint8_t value, size_t count) { + block(dst + count - SIZE, value); + } + + static inline void head_tail(Ptr dst, uint8_t value, size_t count) { + block(dst, value); + tail(dst, value, count); + } + + static inline void loop_and_tail(Ptr dst, uint8_t value, size_t count) { + size_t offset = 0; + do { + block(dst + offset, value); + offset += SIZE; + } while (offset < count - SIZE); + tail(dst, value, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +/////////////////////////////////////////////////////////////////////////////// +template struct Bcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t MaxSize = 8; + + template static inline uint32_t load_xor(CPtr p1, CPtr p2) { + return load(p1) ^ load(p2); + } + + template + static inline uint32_t load_not_equal(CPtr p1, CPtr p2) { + return load(p1) != load(p2); + } + + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + static constexpr size_t MaxSize = 8; + if constexpr (Size == 1) { + return load_xor(p1, p2); + } else if constexpr (Size == 2) { + return load_xor(p1, p2); + } else if constexpr (Size == 4) { + return load_xor(p1, p2); + } else if constexpr (Size == 8) { + return load_not_equal(p1, p2); + } else if constexpr (useArrayType()) { + for (size_t offset = 0; offset < Size; offset += MaxSize) + if (auto value = Bcmp::block(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("Unimplemented Size"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + return block(p1, p2) | tail(p1, p2, count); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp +/////////////////////////////////////////////////////////////////////////////// +template struct Memcmp { + static constexpr size_t SIZE = Size; + static constexpr size_t MaxSize = 8; + + template static inline T load_be(CPtr ptr) { + return Endian::to_big_endian(load(ptr)); + } + + template + static inline MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) { + return load_be(p1) - load_be(p2); + } + + template + static inline MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) { + const auto la = load_be(p1); + const auto lb = load_be(p2); + return la > lb ? 1 : la < lb ? -1 : 0; + } + + static inline MemcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == 1) { + return load_be_diff(p1, p2); + } else if constexpr (Size == 2) { + return load_be_diff(p1, p2); + } else if constexpr (Size == 4) { + return load_be_cmp(p1, p2); + } else if constexpr (Size == 8) { + return load_be_cmp(p1, p2); + } else if constexpr (useArrayType()) { + for (size_t offset = 0; offset < Size; offset += MaxSize) + if (Bcmp::block(p1 + offset, p2 + offset)) + return Memcmp::block(p1 + offset, p2 + offset); + return MemcmpReturnType::ZERO(); + } else if constexpr (Size == 3) { + if (auto value = Memcmp<2>::block(p1, p2)) + return value; + return Memcmp<1>::block(p1 + 2, p2 + 2); + } else { + deferred_static_assert("Unimplemented Size"); + } + } + + static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - SIZE, p2 + count - SIZE); + } + + static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (auto value = block(p1, p2)) + return value; + return tail(p1, p2, count); + } + + static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += SIZE; + } while (offset < count - SIZE); + return tail(p1, p2, count); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Memmove +/////////////////////////////////////////////////////////////////////////////// + +template struct Memmove { + static_assert(is_power2(MaxSize)); + using T = getTypeFor; + static constexpr size_t SIZE = Size; + + static inline void block(Ptr dst, CPtr src) { + if constexpr (is_void_v) { + deferred_static_assert("Unimplemented Size"); + } else { + T::store(dst, T::load(src)); + } + } + + static inline void head_tail(Ptr dst, CPtr src, size_t count) { + const size_t offset = count - Size; + if constexpr (is_void_v) { + deferred_static_assert("Unimplemented Size"); + } else { + // The load and store operations can be performed in any order as long as + // they are not interleaved. More investigations are needed to determine + // the best order. + const auto head = T::load(src); + const auto tail = T::load(src + offset); + T::store(dst, head); + T::store(dst + offset, tail); + } + } + + // Align forward suitable when dst < src. The alignment is performed with + // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes forward, we make sure src is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [___________LLLLLLLA______________] + // [_SSSSSSSS________________________] + // [________SSSSSSSS_________________] + // + // e.g. Moving two bytes forward, we make sure dst is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXXXXXX_] + // [____LLLLLLLL_____________________] + // [______LLLLLLLL___________________] + // [_SSSSSSSS________________________] + // [___SSSSSSSA______________________] + template + static inline void align_forward(Ptr &dst, CPtr &src, size_t &count) { + Ptr prev_dst = dst; + CPtr prev_src = src; + size_t prev_count = count; + align_to_next_boundary(dst, src, count); + adjust(Size, dst, src, count); + head_tail(prev_dst, prev_src, prev_count - count); + } + + // Align backward suitable when dst > src. The alignment is performed with + // an HeadTail operation of count ∈ [Alignment, 2 x Alignment]. + // + // e.g. Moving two bytes backward, we make sure src is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _________________ALLLLLLL_______] + // [ ___________________LLLLLLLL_____] + // [____________________SSSSSSSS_____] + // [______________________SSSSSSSS___] + // + // e.g. Moving two bytes backward, we make sure dst is aligned. + // [ | | | | ] + // [____XXXXXXXXXXXXXXXXXXXXXXXX_____] + // [ _______________LLLLLLLL_________] + // [ ___________________LLLLLLLL_____] + // [__________________ASSSSSSS_______] + // [______________________SSSSSSSS___] + template + static inline void align_backward(Ptr &dst, CPtr &src, size_t &count) { + Ptr headtail_dst = dst + count; + CPtr headtail_src = src + count; + size_t headtail_size = 0; + align_to_next_boundary(headtail_dst, headtail_src, + headtail_size); + adjust(-2 * Size, headtail_dst, headtail_src, headtail_size); + head_tail(headtail_dst, headtail_src, headtail_size); + count -= headtail_size; + } + + // Move forward suitable when dst < src. We load the tail bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [_________________________LLLLLLLL___] + // [___LLLLLLLL_________________________] + // [_SSSSSSSS___________________________] + // [___________LLLLLLLL_________________] + // [_________SSSSSSSS___________________] + // [___________________LLLLLLLL_________] + // [_________________SSSSSSSS___________] + // [_______________________SSSSSSSS_____] + static inline void loop_and_tail_forward(Ptr dst, CPtr src, size_t count) { + const size_t tail_offset = count - Size; + const auto tail_value = T::load(src + tail_offset); + size_t offset = 0; +#pragma nounroll + do { + block(dst + offset, src + offset); + offset += Size; + } while (offset < count - Size); + T::store(dst + tail_offset, tail_value); + } + + // Move backward suitable when dst > src. We load the head bytes before + // handling the loop. + // + // e.g. Moving two bytes + // [ | | | | |] + // [___XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX___] + // [___LLLLLLLL_________________________] + // [_________________________LLLLLLLL___] + // [___________________________SSSSSSSS_] + // [_________________LLLLLLLL___________] + // [___________________SSSSSSSS_________] + // [_________LLLLLLLL___________________] + // [___________SSSSSSSS_________________] + // [_____SSSSSSSS_______________________] + static inline void loop_and_tail_backward(Ptr dst, CPtr src, size_t count) { + const auto head_value = T::load(src); + ptrdiff_t offset = count - Size; +#pragma nounroll + do { + block(dst + offset, src + offset); + offset -= Size; + } while (offset >= 0); + T::store(dst, head_value); + } +}; + +} // namespace __llvm_libc::generic + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_x86.h @@ -0,0 +1,217 @@ +//===-- x86 implementation of memory function building blocks -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides x86 specific building blocks to compose memory functions. +// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H + +#include "src/__support/architectures.h" + +#if defined(LLVM_LIBC_ARCH_X86_64) + +#include "src/__support/common.h" +#include "src/string/memory_utils/op_builtin.h" +#include "src/string/memory_utils/op_generic.h" + +#ifdef __SSE2__ +#include +#else +// Define fake functions to prevent the compiler from failing on undefined +// functions in case SSE2 is not present. +#define _mm512_cmpneq_epi8_mask(A, B) 0 +#define _mm_movemask_epi8(A) 0 +#define _mm256_movemask_epi8(A) 0 +#endif // __SSE2__ + +namespace __llvm_libc::x86 { + +// A set of constants to check compile time features. +static inline constexpr bool kSse2 = LLVM_LIBC_IS_DEFINED(__SSE2__); +static inline constexpr bool kAvx = LLVM_LIBC_IS_DEFINED(__AVX__); +static inline constexpr bool kAvx2 = LLVM_LIBC_IS_DEFINED(__AVX2__); +static inline constexpr bool kAvx512F = LLVM_LIBC_IS_DEFINED(__AVX512F__); +static inline constexpr bool kAvx512BW = LLVM_LIBC_IS_DEFINED(__AVX512BW__); + +/////////////////////////////////////////////////////////////////////////////// +// Memcpy repmovsb implementation +struct Memcpy { + static void repmovsb(char *dst, const char *src, size_t count) { + asm volatile("rep movsb" : "+D"(dst), "+S"(src), "+c"(count) : : "memory"); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Bcmp + +// Base implementation for the Bcmp specializations. +// - BlockSize is either 16, 32 or 64 depending on the available compile time +// features, it is used to switch between "single native operation" or a +// "sequence of native operations". +// - BlockBcmp is the function that implements the bcmp logic. +template struct BcmpImpl { + static inline BcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + return BlockBcmp(p1, p2); + } else if constexpr (Size % BlockSize == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = BlockBcmp(p1 + offset, p2 + offset)) + return value; + } else { + deferred_static_assert("SIZE not implemented"); + } + return BcmpReturnType::ZERO(); + } + + static inline BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - Size, p2 + count - Size); + } + + static inline BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + return block(p1, p2) | tail(p1, p2, count); + } + + static inline BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += Size; + } while (offset < count - Size); + return tail(p1, p2, count); + } +}; + +namespace sse2 { +static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(16))); + // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. + const int mask = _mm_movemask_epi8(load(p1) != load(p2)); + return static_cast(mask); +} +template using Bcmp = BcmpImpl; +} // namespace sse2 + +namespace avx2 { +static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(32))); + // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. + const int mask = _mm256_movemask_epi8(load(p1) != load(p2)); + // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit + // mask. + return static_cast(mask); +} +template using Bcmp = BcmpImpl; +} // namespace avx2 + +namespace avx512bw { +static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(64))); + // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. + const uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2)); + const bool mask_is_set = mask != 0; + return static_cast(mask_is_set); +} +template using Bcmp = BcmpImpl; +} // namespace avx512bw + +// Assuming that the mask is non zero, the index of the first mismatching byte +// is the number of trailing zeros in the mask. Trailing zeros and not leading +// zeros because the x86 architecture is little endian. +static inline MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, + uint64_t mask) { + const size_t diff_index = __builtin_ctzll(mask); + const int16_t ca = p1[diff_index]; + const int16_t cb = p2[diff_index]; + return ca - cb; +} + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp + +// Base implementation for the Memcmp specializations. +// - BlockSize is either 16, 32 or 64 depending on the available compile time +// features, it is used to switch between "single native operation" or a +// "sequence of native operations". +// - BlockMemcmp is the function that implements the memcmp logic. +// - BlockBcmp is the function that implements the bcmp logic. +template +struct MemcmpImpl { + static inline MemcmpReturnType block(CPtr p1, CPtr p2) { + if constexpr (Size == BlockSize) { + return BlockMemcmp(p1, p2); + } else if constexpr (Size % BlockSize == 0) { + for (size_t offset = 0; offset < Size; offset += BlockSize) + if (auto value = BlockBcmp(p1 + offset, p2 + offset)) + return BlockMemcmp(p1 + offset, p2 + offset); + } else { + deferred_static_assert("SIZE not implemented"); + } + return MemcmpReturnType::ZERO(); + } + + static inline MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block(p1 + count - Size, p2 + count - Size); + } + + static inline MemcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (auto value = block(p1, p2)) + return value; + return tail(p1, p2, count); + } + + static inline MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, size_t count) { + size_t offset = 0; + do { + if (auto value = block(p1 + offset, p2 + offset)) + return value; + offset += Size; + } while (offset < count - Size); + return tail(p1, p2, count); + } +}; + +namespace sse2 { +static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(16))); + // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. + if (int mask = _mm_movemask_epi8(load(p1) != load(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template using Memcmp = MemcmpImpl; +} // namespace sse2 + +namespace avx2 { +static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(32))); + // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. + if (int mask = _mm256_movemask_epi8(load(p1) != load(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template using Memcmp = MemcmpImpl; +} // namespace avx2 + +namespace avx512bw { +static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { + using T = char __attribute__((__vector_size__(64))); + // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. + if (uint64_t mask = _mm512_cmpneq_epi8_mask(load(p1), load(p2))) + return char_diff_no_zero(p1, p2, mask); + return MemcmpReturnType::ZERO(); +} +template using Memcmp = MemcmpImpl; +} // namespace avx512bw + +} // namespace __llvm_libc::x86 + +#endif // LLVM_LIBC_ARCH_X86_64 + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_X86_H diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -9,19 +9,8 @@ #ifndef LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H #define LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H -#include "src/__support/architectures.h" - -// Cache line sizes for ARM: These values are not strictly correct since -// cache line sizes depend on implementations, not architectures. There -// are even implementations with cache line sizes configurable at boot -// time. -#if defined(LLVM_LIBC_ARCH_AARCH64) || defined(LLVM_LIBC_ARCH_X86) -#define LLVM_LIBC_CACHELINE_SIZE 64 -#elif defined(LLVM_LIBC_ARCH_ARM) -#define LLVM_LIBC_CACHELINE_SIZE 32 -#else -#error "Unsupported platform for memory functions." -#endif +#include "src/__support/CPP/bit.h" +#include "src/__support/CPP/type_traits.h" #include // size_t #include // intptr_t / uintptr_t @@ -62,32 +51,46 @@ return is_power2_or_zero(value) ? value : 1ULL << (log2(value) + 1); } -template intptr_t offset_from_last_aligned(const void *ptr) { +// Returns the number of bytes to substract from ptr to get to the previous +// multiple of alignment. If ptr is already aligned returns 0. +template uintptr_t distance_to_align_down(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); return reinterpret_cast(ptr) & (alignment - 1U); } -template intptr_t offset_to_next_aligned(const void *ptr) { +// Returns the number of bytes to add to ptr to get to the next multiple of +// alignment. If ptr is already aligned returns 0. +template uintptr_t distance_to_align_up(const void *ptr) { static_assert(is_power2(alignment), "alignment must be a power of 2"); // The logic is not straightforward and involves unsigned modulo arithmetic // but the generated code is as fast as it can be. return -reinterpret_cast(ptr) & (alignment - 1U); } -// Returns the offset from `ptr` to the next cache line. -static inline intptr_t offset_to_next_cache_line(const void *ptr) { - return offset_to_next_aligned(ptr); +// Returns the number of bytes to add to ptr to get to the next multiple of +// alignment. If ptr is already aligned returns alignment. +template +uintptr_t distance_to_next_aligned(const void *ptr) { + return alignment - distance_to_align_down(ptr); } +// Returns the same pointer but notifies the compiler that it is aligned. template static T *assume_aligned(T *ptr) { return reinterpret_cast(__builtin_assume_aligned(ptr, alignment)); } + #if defined __has_builtin #if __has_builtin(__builtin_memcpy_inline) #define LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE #endif #endif +#if defined __has_builtin +#if __has_builtin(__builtin_memset_inline) +#define LLVM_LIBC_HAS_BUILTIN_MEMSET_INLINE +#endif +#endif + // Performs a constant count copy. template static inline void memcpy_inline(void *__restrict dst, @@ -103,28 +106,56 @@ using Ptr = char *; // Pointer to raw data. using CPtr = const char *; // Const pointer to raw data. -// Loads bytes from memory (possibly unaligned) and materializes them as type. +// This type makes sure that we don't accidentally promote an integral type to +// another one. It is only constructible from the exact T type. +template struct StrictIntegralType { + static_assert(cpp::is_integral_v); + + // Can only be constructed from a T. + template , bool> = 0> + StrictIntegralType(U value) : value(value) {} + + // Allows using the type in an if statement. + explicit operator bool() const { return value; } + + // If type is unsigned (bcmp) we allow bitwise OR operations. + StrictIntegralType operator|(const StrictIntegralType &Rhs) const { + static_assert(!cpp::is_signed_v); + return value | Rhs.value; + } + + // For interation with the C API we allow explicit conversion back to the + // `int` type. + explicit operator int() const { + // bit_cast makes sure that T and int have the same size. + return cpp::bit_cast(value); + } + + // Helper to get the zero value. + static inline constexpr StrictIntegralType ZERO() { return {T(0)}; } + +private: + T value; +}; + +using MemcmpReturnType = StrictIntegralType; +using BcmpReturnType = StrictIntegralType; + +// Loads bytes from memory (possibly unaligned) and materializes them as +// type. template static inline T load(CPtr ptr) { T Out; memcpy_inline(&Out, ptr); return Out; } -// Stores a value of type T in memory (possibly unaligned) +// Stores a value of type T in memory (possibly unaligned). template static inline void store(Ptr ptr, T value) { memcpy_inline(ptr, &value); } -// For an operation like memset that operates on a pointer and a count, advances -// the pointer by offset bytes and decrease count by the same amount. -static inline void adjust(ptrdiff_t offset, Ptr &ptr, size_t &count) { - ptr += offset; - count -= offset; -} - -// For an operation like memcpy or memcmp that operates on two pointers and a -// count, advances the pointers by offset bytes and decrease count by the same -// amount. +// Advances the pointers p1 and p2 by offset bytes and decrease count by the +// same amount. template static inline void adjust(ptrdiff_t offset, T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) { @@ -133,31 +164,37 @@ count -= offset; } -// For an operation like memset that operates on a pointer and a count, advances -// the pointer so it is aligned to SIZE bytes and decrease count by the same -// amount. +// Advances p1 and p2 so p1 gets aligned to the next SIZE bytes boundary +// and decrease count by the same amount. // We make sure the compiler knows about the adjusted pointer alignment. -template void align(Ptr &ptr, size_t &count) { - adjust(offset_to_next_aligned(ptr), ptr, count); - ptr = assume_aligned(ptr); +template +void align_p1_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2, + size_t &count) { + adjust(distance_to_next_aligned(p1), p1, p2, count); + p1 = assume_aligned(p1); } -// For an operation like memcpy or memcmp that operates on two pointers and a -// count, advances the pointers so one of them gets aligned to SIZE bytes and -// decrease count by the same amount. -// We make sure the compiler knows about the adjusted pointer alignment. -enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 }; +// Same as align_p1_to_next_boundary above but with a single pointer instead. +template +void align_to_next_boundary(T1 *&p1, size_t &count) { + CPtr dummy; + align_p1_to_next_boundary(p1, dummy, count); +} + +// An enum class that discriminates between the first and second pointer. +enum class Arg { P1, P2, Dst = P1, Src = P2 }; + +// Same as align_p1_to_next_boundary but allows for aligning p2 instead of p1. +// Precondition: &p1 != &p2 template -void align(T1 *__restrict &p1, T2 *__restrict &p2, size_t &count) { - if constexpr (AlignOn == Arg::_1) { - adjust(offset_to_next_aligned(p1), p1, p2, count); - p1 = assume_aligned(p1); - } else if constexpr (AlignOn == Arg::_2) { - adjust(offset_to_next_aligned(p2), p1, p2, count); - p2 = assume_aligned(p2); - } else { - deferred_static_assert("AlignOn must be either Arg::_1 or Arg::_2"); - } +void align_to_next_boundary(T1 *__restrict &p1, T2 *__restrict &p2, + size_t &count) { + if constexpr (AlignOn == Arg::P1) + align_p1_to_next_boundary(p1, p2, count); + else if constexpr (AlignOn == Arg::P2) + align_p1_to_next_boundary(p2, p1, count); // swapping p1 and p2. + else + deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2"); } } // namespace __llvm_libc diff --git a/libc/src/string/memset.cpp b/libc/src/string/memset.cpp --- a/libc/src/string/memset.cpp +++ b/libc/src/string/memset.cpp @@ -13,8 +13,8 @@ namespace __llvm_libc { LLVM_LIBC_FUNCTION(void *, memset, (void *dst, int value, size_t count)) { - inline_memset(reinterpret_cast(dst), - static_cast(value), count); + inline_memset(reinterpret_cast(dst), static_cast(value), + count); return dst; } diff --git a/libc/test/src/string/bcmp_test.cpp b/libc/test/src/string/bcmp_test.cpp --- a/libc/test/src/string/bcmp_test.cpp +++ b/libc/test/src/string/bcmp_test.cpp @@ -12,25 +12,25 @@ TEST(LlvmLibcBcmpTest, CmpZeroByte) { const char *lhs = "ab"; const char *rhs = "bc"; - EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); + ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 0), 0); } TEST(LlvmLibcBcmpTest, LhsRhsAreTheSame) { const char *lhs = "ab"; const char *rhs = "ab"; - EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); + ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsBeforeRhsLexically) { const char *lhs = "ab"; const char *rhs = "ac"; - EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, LhsAfterRhsLexically) { const char *lhs = "ac"; const char *rhs = "ab"; - EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); + ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, 2), 0); } TEST(LlvmLibcBcmpTest, Sweep) { @@ -46,13 +46,13 @@ reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) - EXPECT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); + ASSERT_EQ(__llvm_libc::bcmp(lhs, rhs, i), 0); reset(lhs); reset(rhs); for (size_t i = 0; i < K_MAX_SIZE; ++i) { rhs[i] = 'b'; - EXPECT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); + ASSERT_NE(__llvm_libc::bcmp(lhs, rhs, K_MAX_SIZE), 0); rhs[i] = 'a'; } } diff --git a/libc/test/src/string/memmove_test.cpp b/libc/test/src/string/memmove_test.cpp --- a/libc/test/src/string/memmove_test.cpp +++ b/libc/test/src/string/memmove_test.cpp @@ -20,7 +20,7 @@ void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 0); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstAndSrcPointToSameAddress) { @@ -29,7 +29,7 @@ void *const Dst = Buffer; void *const Ret = __llvm_libc::memmove(Dst, Buffer, 1); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsBeforeSrc) { @@ -40,7 +40,7 @@ void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 2); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstStartsAfterSrc) { @@ -49,7 +49,7 @@ void *const Dst = Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 2); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } // e.g. `Dst` follow `src`. @@ -62,7 +62,7 @@ void *const Dst = Buffer + 1; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 2, 1); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } TEST(LlvmLibcMemmoveTest, DstFollowSrc) { @@ -71,7 +71,7 @@ void *const Dst = Buffer + 2; void *const Ret = __llvm_libc::memmove(Dst, Buffer + 1, 1); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } static constexpr int kMaxSize = 512; @@ -106,7 +106,7 @@ void *const Ret = __llvm_libc::memmove(Dst, Buffer.data() + SrcOffset, Size); EXPECT_EQ(Ret, Dst); - EXPECT_MEM_EQ(Buffer, Expected); + ASSERT_MEM_EQ(Buffer, Expected); } } } diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -3,8 +3,6 @@ SUITE libc_string_unittests SRCS - elements_test.cpp - memory_access_test.cpp utils_test.cpp COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE} diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp deleted file mode 100644 --- a/libc/test/src/string/memory_utils/elements_test.cpp +++ /dev/null @@ -1,137 +0,0 @@ -//===-- Unittests for memory_utils ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/__support/CPP/array.h" -#include "src/__support/CPP/span.h" -#include "src/string/memory_utils/elements.h" -#include "utils/UnitTest/Test.h" - -namespace __llvm_libc { - -// Registering Types -using FixedSizeTypes = testing::TypeList< -#if defined(__SSE2__) - x86::Vector128, // -#endif // __SSE2__ -#if defined(__AVX2__) - x86::Vector256, // -#endif // __AVX2__ -#if defined(__AVX512F__) and defined(__AVX512BW__) - x86::Vector512, // -#endif // defined(__AVX512F__) and defined(__AVX512BW__) - scalar::UINT8, // - scalar::UINT16, // - scalar::UINT32, // - scalar::UINT64, // - Repeated, // - Repeated, // - Repeated, // - Repeated, // - Repeated, // - Chained, // - Chained, // - builtin::_1, // - builtin::_2, // - builtin::_3, // - builtin::_4, // - builtin::_8 // - >; - -char GetRandomChar() { - static constexpr const uint64_t a = 1103515245; - static constexpr const uint64_t c = 12345; - static constexpr const uint64_t m = 1ULL << 31; - static uint64_t seed = 123456789; - seed = (a * seed + c) % m; - return seed; -} - -void Randomize(cpp::span buffer) { - for (auto ¤t : buffer) - current = GetRandomChar(); -} - -template using Buffer = cpp::array; - -template Buffer GetRandomBuffer() { - Buffer buffer; - Randomize(buffer); - return buffer; -} - -TYPED_TEST(LlvmLibcMemoryElements, copy, FixedSizeTypes) { - Buffer Dst; - const auto buffer = GetRandomBuffer(); - copy(Dst.data(), buffer.data()); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(Dst[i], buffer[i]); -} - -template T copy(const T &Input) { - T Output; - for (size_t I = 0; I < Input.size(); ++I) - Output[I] = Input[I]; - return Output; -} - -TYPED_TEST(LlvmLibcMemoryElements, Move, FixedSizeTypes) { - constexpr size_t SIZE = ParamType::SIZE; - using LargeBuffer = cpp::array; - LargeBuffer GroundTruth; - Randomize(GroundTruth); - // Forward, we move the SIZE first bytes from offset 0 to SIZE. - for (size_t Offset = 0; Offset < SIZE; ++Offset) { - LargeBuffer Buffer = copy(GroundTruth); - move(&Buffer[Offset], &Buffer[0]); - for (size_t I = 0; I < SIZE; ++I) - EXPECT_EQ(Buffer[I + Offset], GroundTruth[I]); - } - // Backward, we move the SIZE last bytes from offset 0 to SIZE. - for (size_t Offset = 0; Offset < SIZE; ++Offset) { - LargeBuffer Buffer = copy(GroundTruth); - move(&Buffer[Offset], &Buffer[SIZE]); - for (size_t I = 0; I < SIZE; ++I) - EXPECT_EQ(Buffer[I + Offset], GroundTruth[SIZE + I]); - } -} - -TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) { - const auto buffer = GetRandomBuffer(); - EXPECT_TRUE(equals(buffer.data(), buffer.data())); -} - -TYPED_TEST(LlvmLibcMemoryElements, three_way_compare, FixedSizeTypes) { - Buffer initial; - for (auto &c : initial) - c = 5; - - // Testing equality - EXPECT_EQ(three_way_compare(initial.data(), initial.data()), 0); - - // Testing all mismatching positions - for (size_t i = 0; i < ParamType::SIZE; ++i) { - auto copy = initial; - ++copy[i]; // copy is now lexicographycally greated than initial - const auto *less = initial.data(); - const auto *greater = copy.data(); - EXPECT_LT(three_way_compare(less, greater), 0); - EXPECT_GT(three_way_compare(greater, less), 0); - } -} - -TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) { - Buffer Dst; - const cpp::array values = {char(0x00), char(0x7F), char(0xFF)}; - for (char value : values) { - splat_set(Dst.data(), value); - for (size_t i = 0; i < ParamType::SIZE; ++i) - EXPECT_EQ(Dst[i], value); - } -} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp deleted file mode 100644 --- a/libc/test/src/string/memory_utils/memory_access_test.cpp +++ /dev/null @@ -1,228 +0,0 @@ -//===-- Unittests for memory_utils ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#define LLVM_LIBC_UNITTEST_OBSERVE 1 - -#include "src/__support/CPP/array.h" -#include "src/string/memory_utils/elements.h" -#include "utils/UnitTest/Test.h" - -#include -#include - -namespace __llvm_libc { - -static constexpr const size_t kMaxBuffer = 32; - -struct BufferAccess : cpp::array { - BufferAccess() { Reset(); } - void Reset() { - for (auto &value : *this) - value = '0'; - this->operator[](kMaxBuffer) = '\0'; - } - void Touch(ptrdiff_t offset, size_t size) { - if (offset < 0) - return; - for (size_t i = 0; i < size; ++i) - ++(*this)[offset + i]; - } - operator const char *() const { return this->data(); } -}; - -struct Buffer { - ptrdiff_t Offset(const char *ptr) const { - const bool contained = ptr >= data.begin() && ptr < data.end(); - return contained ? ptr - data.begin() : -1; - } - void Reset() { - reads.Reset(); - writes.Reset(); - } - cpp::array data; - BufferAccess __attribute__((aligned(64))) reads; - BufferAccess __attribute__((aligned(64))) writes; -}; - -struct MemoryAccessObserver { - void ObserveRead(const char *ptr, size_t size) { - Buffer1.reads.Touch(Buffer1.Offset(ptr), size); - Buffer2.reads.Touch(Buffer2.Offset(ptr), size); - } - - void ObserveWrite(const char *ptr, size_t size) { - Buffer1.writes.Touch(Buffer1.Offset(ptr), size); - Buffer2.writes.Touch(Buffer2.Offset(ptr), size); - } - - void Reset() { - Buffer1.Reset(); - Buffer2.Reset(); - } - - Buffer Buffer1; - Buffer Buffer2; -}; - -MemoryAccessObserver Observer; - -template struct TestingElement { - static constexpr size_t SIZE = Size; - - static void copy(char *__restrict dst, const char *__restrict src) { - Observer.ObserveRead(src, SIZE); - Observer.ObserveWrite(dst, SIZE); - } - - static bool equals(const char *lhs, const char *rhs) { - Observer.ObserveRead(lhs, SIZE); - Observer.ObserveRead(rhs, SIZE); - return true; - } - - static int three_way_compare(const char *lhs, const char *rhs) { - Observer.ObserveRead(lhs, SIZE); - Observer.ObserveRead(rhs, SIZE); - return 0; - } - - static void splat_set(char *dst, const unsigned char value) { - Observer.ObserveWrite(dst, SIZE); - } -}; - -using Types = testing::TypeList< - TestingElement<1>, // 1 Byte - TestingElement<2>, // 2 Bytes - TestingElement<4>, // 4 Bytes - Repeated, 3>, // 6 Bytes - Chained, TestingElement<2>, TestingElement<1>> // 7 Bytes - >; - -struct LlvmLibcTestAccessBase : public testing::Test { - - template - void checkOperations(const BufferAccess &expected) { - static const BufferAccess untouched; - - Observer.Reset(); - HigherOrder::copy(dst_ptr() + Offset, src_ptr() + Offset, Size); - ASSERT_STREQ(src().writes, untouched); - ASSERT_STREQ(dst().reads, untouched); - ASSERT_STREQ(src().reads, expected); - ASSERT_STREQ(dst().writes, expected); - Observer.Reset(); - HigherOrder::equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); - ASSERT_STREQ(lhs().writes, untouched); - ASSERT_STREQ(rhs().writes, untouched); - ASSERT_STREQ(lhs().reads, expected); - ASSERT_STREQ(rhs().reads, expected); - Observer.Reset(); - HigherOrder::three_way_compare(lhs_ptr() + Offset, rhs_ptr() + Offset, - Size); - ASSERT_STREQ(lhs().writes, untouched); - ASSERT_STREQ(rhs().writes, untouched); - ASSERT_STREQ(lhs().reads, expected); - ASSERT_STREQ(rhs().reads, expected); - Observer.Reset(); - HigherOrder::splat_set(dst_ptr() + Offset, 5, Size); - ASSERT_STREQ(src().reads, untouched); - ASSERT_STREQ(src().writes, untouched); - ASSERT_STREQ(dst().reads, untouched); - ASSERT_STREQ(dst().writes, expected); - } - - void checkMaxAccess(const BufferAccess &expected, int max) { - for (size_t i = 0; i < kMaxBuffer; ++i) { - int value = (int)expected[i] - '0'; - ASSERT_GE(value, 0); - ASSERT_LE(value, max); - } - } - -private: - const Buffer &lhs() const { return Observer.Buffer1; } - const Buffer &rhs() const { return Observer.Buffer2; } - const Buffer &src() const { return Observer.Buffer2; } - const Buffer &dst() const { return Observer.Buffer1; } - Buffer &dst() { return Observer.Buffer1; } - - char *dst_ptr() { return dst().data.begin(); } - const char *src_ptr() { return src().data.begin(); } - const char *lhs_ptr() { return lhs().data.begin(); } - const char *rhs_ptr() { return rhs().data.begin(); } -}; - -template -struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase { - - void TearDown() override { - static constexpr size_t Size = 10; - - BufferAccess expected; - expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); - - checkMaxAccess(expected, 1); - checkOperations, Size>(expected); - } -}; -TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {} - -template -struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase { - void TearDown() override { - static constexpr size_t Size = 10; - - BufferAccess expected; - expected.Touch(0, ParamType::SIZE); - expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); - - checkMaxAccess(expected, 2); - checkOperations, Size>(expected); - } -}; -TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {} - -template -struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase { - void TearDown() override { - static constexpr size_t Size = 20; - - BufferAccess expected; - for (size_t i = 0; i < Size - ParamType::SIZE; i += ParamType::SIZE) - expected.Touch(i, ParamType::SIZE); - expected.Touch(Size - ParamType::SIZE, ParamType::SIZE); - - checkMaxAccess(expected, 2); - checkOperations, Size>(expected); - } -}; -TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {} - -template -struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase { - void TearDown() override { - static constexpr size_t Size = 10; - static constexpr size_t Offset = 2; - using AlignmentT = TestingElement<4>; - - BufferAccess expected; - expected.Touch(Offset, AlignmentT::SIZE); - expected.Touch(AlignmentT::SIZE, ParamType::SIZE); - expected.Touch(Offset + Size - ParamType::SIZE, ParamType::SIZE); - - checkMaxAccess(expected, 3); - checkOperations::Then>, Size, - Offset>(expected); - checkOperations::Then>, Size, - Offset>(expected); - } -}; -TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/utils_test.cpp b/libc/test/src/string/memory_utils/utils_test.cpp --- a/libc/test/src/string/memory_utils/utils_test.cpp +++ b/libc/test/src/string/memory_utils/utils_test.cpp @@ -72,55 +72,41 @@ EXPECT_EQ(ge_power2(i), kExpectedValues[i]); } -using I = intptr_t; +using UINT = uintptr_t; // Converts an offset into a pointer. const void *forge(size_t offset) { return reinterpret_cast(offset); } -TEST(LlvmLibcUtilsTest, OffsetToNextAligned) { - EXPECT_EQ(offset_to_next_aligned<16>(forge(0)), I(0)); - EXPECT_EQ(offset_to_next_aligned<16>(forge(1)), I(15)); - EXPECT_EQ(offset_to_next_aligned<16>(forge(16)), I(0)); - EXPECT_EQ(offset_to_next_aligned<16>(forge(15)), I(1)); - EXPECT_EQ(offset_to_next_aligned<32>(forge(16)), I(16)); +TEST(LlvmLibcUtilsTest, DistanceToNextAligned) { + EXPECT_EQ(distance_to_next_aligned<16>(forge(0)), UINT(16)); + EXPECT_EQ(distance_to_next_aligned<16>(forge(1)), UINT(15)); + EXPECT_EQ(distance_to_next_aligned<16>(forge(16)), UINT(16)); + EXPECT_EQ(distance_to_next_aligned<16>(forge(15)), UINT(1)); + EXPECT_EQ(distance_to_next_aligned<32>(forge(16)), UINT(16)); } -TEST(LlvmLibcUtilsTest, OffsetFromLastAligned) { - EXPECT_EQ(offset_from_last_aligned<16>(forge(0)), I(0)); - EXPECT_EQ(offset_from_last_aligned<16>(forge(1)), I(1)); - EXPECT_EQ(offset_from_last_aligned<16>(forge(16)), I(0)); - EXPECT_EQ(offset_from_last_aligned<16>(forge(15)), I(15)); - EXPECT_EQ(offset_from_last_aligned<32>(forge(16)), I(16)); +TEST(LlvmLibcUtilsTest, DistanceToAlignUp) { + EXPECT_EQ(distance_to_align_up<16>(forge(0)), UINT(0)); + EXPECT_EQ(distance_to_align_up<16>(forge(1)), UINT(15)); + EXPECT_EQ(distance_to_align_up<16>(forge(16)), UINT(0)); + EXPECT_EQ(distance_to_align_up<16>(forge(15)), UINT(1)); + EXPECT_EQ(distance_to_align_up<32>(forge(16)), UINT(16)); } -TEST(LlvmLibcUtilsTest, OffsetToNextCacheLine) { - EXPECT_GT(LLVM_LIBC_CACHELINE_SIZE, 0); - EXPECT_EQ(offset_to_next_cache_line(forge(0)), I(0)); - EXPECT_EQ(offset_to_next_cache_line(forge(1)), - I(LLVM_LIBC_CACHELINE_SIZE - 1)); - EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE)), I(0)); - EXPECT_EQ(offset_to_next_cache_line(forge(LLVM_LIBC_CACHELINE_SIZE - 1)), - I(1)); -} - -TEST(LlvmLibcUtilsTest, Adjust1) { - char a; - const size_t base_size = 10; - for (size_t I = -2; I < 2; ++I) { - auto *ptr = &a; - size_t size = base_size; - adjust(I, ptr, size); - EXPECT_EQ(intptr_t(ptr), intptr_t(&a + I)); - EXPECT_EQ(size, base_size - I); - } +TEST(LlvmLibcUtilsTest, DistanceToAlignDown) { + EXPECT_EQ(distance_to_align_down<16>(forge(0)), UINT(0)); + EXPECT_EQ(distance_to_align_down<16>(forge(1)), UINT(1)); + EXPECT_EQ(distance_to_align_down<16>(forge(16)), UINT(0)); + EXPECT_EQ(distance_to_align_down<16>(forge(15)), UINT(15)); + EXPECT_EQ(distance_to_align_down<32>(forge(16)), UINT(16)); } TEST(LlvmLibcUtilsTest, Adjust2) { char a, b; const size_t base_size = 10; - for (size_t I = -2; I < 2; ++I) { + for (ptrdiff_t I = -2; I < 2; ++I) { auto *p1 = &a; auto *p2 = &b; size_t size = base_size; @@ -131,19 +117,6 @@ } } -TEST(LlvmLibcUtilsTest, Align1) { - char a; - const size_t base_size = 10; - { - auto *ptr = &a; - size_t size = base_size; - align<128>(ptr, size); - EXPECT_TRUE(uintptr_t(ptr) % 128 == 0); - EXPECT_GE(ptr, &a); - EXPECT_EQ(size_t(ptr - &a), base_size - size); - } -} - TEST(LlvmLibcUtilsTest, Align2) { char a, b; const size_t base_size = 10; @@ -151,10 +124,10 @@ auto *p1 = &a; auto *p2 = &b; size_t size = base_size; - align<128, Arg::_1>(p1, p2, size); + align_to_next_boundary<128, Arg::P1>(p1, p2, size); EXPECT_TRUE(uintptr_t(p1) % 128 == 0); - EXPECT_GE(p1, &a); - EXPECT_GE(p2, &b); + EXPECT_GT(p1, &a); + EXPECT_GT(p2, &b); EXPECT_EQ(size_t(p1 - &a), base_size - size); EXPECT_EQ(size_t(p2 - &b), base_size - size); } @@ -162,10 +135,10 @@ auto *p1 = &a; auto *p2 = &b; size_t size = base_size; - align<128, Arg::_2>(p1, p2, size); + align_to_next_boundary<128, Arg::P2>(p1, p2, size); EXPECT_TRUE(uintptr_t(p2) % 128 == 0); - EXPECT_GE(p1, &a); - EXPECT_GE(p2, &b); + EXPECT_GT(p1, &a); + EXPECT_GT(p2, &b); EXPECT_EQ(size_t(p1 - &a), base_size - size); EXPECT_EQ(size_t(p2 - &b), base_size - size); } diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -973,9 +973,10 @@ cc_library( name = "string_memory_utils", hdrs = [ - "src/string/memory_utils/elements.h", - "src/string/memory_utils/elements_aarch64.h", - "src/string/memory_utils/elements_x86.h", + "src/string/memory_utils/op_aarch64.h", + "src/string/memory_utils/op_builtin.h", + "src/string/memory_utils/op_generic.h", + "src/string/memory_utils/op_x86.h", "src/string/memory_utils/utils.h", ], textual_hdrs = [ @@ -988,6 +989,8 @@ deps = [ ":__support_common", ":__support_cpp_bit", + ":__support_cpp_type_traits", + ":__support_cpp_array", ":libc_root", ], )