diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -194,7 +194,7 @@ SRCS ${ADD_IMPL_SRCS} HDRS ${ADD_IMPL_HDRS} DEPENDS ${ADD_IMPL_DEPENDS} - COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} + COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} "SHELL:-mllvm -combiner-global-alias-analysis" ) get_fq_target_name(${impl_name} fq_target_name) set_target_properties(${fq_target_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_IMPL_REQUIRE}") diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp --- a/libc/src/string/aarch64/memcpy.cpp +++ b/libc/src/string/aarch64/memcpy.cpp @@ -8,10 +8,19 @@ #include "src/string/memcpy.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memcpy_utils.h" +#include "src/string/memory_utils/elements.h" namespace __llvm_libc { +using _1 = scalar::UINT8; +using _2 = scalar::UINT16; +using _3 = Chained; +using _4 = scalar::UINT32; +using _8 = scalar::UINT64; +using _16 = Repeated; +using _32 = Repeated; +using _64 = Repeated; + // Design rationale // ================ // @@ -37,24 +46,24 @@ if (count == 0) return; if (count == 1) - return CopyBlock<1>(dst, src); + return Copy<_1>(dst, src); if (count == 2) - return CopyBlock<2>(dst, src); + return Copy<_2>(dst, src); if (count == 3) - return CopyBlock<3>(dst, src); + return Copy<_3>(dst, src); if (count == 4) - return CopyBlock<4>(dst, src); + return Copy<_4>(dst, src); if (count < 8) - return CopyBlockOverlap<4>(dst, src, count); + return Copy>(dst, src, count); if (count < 16) - return CopyBlockOverlap<8>(dst, src, count); + return Copy>(dst, src, count); if (count < 32) - return CopyBlockOverlap<16>(dst, src, count); + return Copy>(dst, src, count); if (count < 64) - return CopyBlockOverlap<32>(dst, src, count); + return Copy>(dst, src, count); if (count < 128) - return CopyBlockOverlap<64>(dst, src, count); - return CopySrcAlignedBlocks<64, 16>(dst, src, count); + return Copy>(dst, src, count); + return Copy::Then>>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp --- a/libc/src/string/memcpy.cpp +++ b/libc/src/string/memcpy.cpp @@ -8,7 +8,7 @@ #include "src/string/memcpy.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memcpy_utils.h" +#include "src/string/memory_utils/elements.h" namespace __llvm_libc { @@ -35,24 +35,24 @@ if (count == 0) return; if (count == 1) - return CopyBlock<1>(dst, src); + return Copy<_1>(dst, src); if (count == 2) - return CopyBlock<2>(dst, src); + return Copy<_2>(dst, src); if (count == 3) - return CopyBlock<3>(dst, src); + return Copy<_3>(dst, src); if (count == 4) - return CopyBlock<4>(dst, src); + return Copy<_4>(dst, src); if (count < 8) - return CopyBlockOverlap<4>(dst, src, count); + return Copy>(dst, src, count); if (count < 16) - return CopyBlockOverlap<8>(dst, src, count); + return Copy>(dst, src, count); if (count < 32) - return CopyBlockOverlap<16>(dst, src, count); + return Copy>(dst, src, count); if (count < 64) - return CopyBlockOverlap<32>(dst, src, count); + return Copy>(dst, src, count); if (count < 128) - return CopyBlockOverlap<64>(dst, src, count); - return CopySrcAlignedBlocks<32>(dst, src, count); + return Copy>(dst, src, count); + return Copy::Then>>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -2,6 +2,5 @@ memory_utils HDRS utils.h - memcpy_utils.h - memset_utils.h + elements.h ) diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/elements.h @@ -0,0 +1,475 @@ +//===-- Elementary operations to compose memory primitives ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H + +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#include "src/__support/endian.h" +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc { + +// Elementary Operations +// -------------------------------- +// We define abstract elementary operations acting on fixed chunks of memory. +// These are low level building blocks that are meant to be assembled to compose +// higher order abstractions. Each function is defined twice: once with +// fixed-size operations, and once with runtime-size operations. + +// Fixed-size copies from 'src' to 'dst'. +template +void Copy(char *__restrict dst, const char *__restrict src) { + Element::Copy(dst, src); +} +// Runtime-size copies from 'src' to 'dst'. +template +void Copy(char *__restrict dst, const char *__restrict src, size_t size) { + Element::Copy(dst, src, size); +} + +// Fixed-size equality between 'lhs' and 'rhs'. +template bool Equals(const char *lhs, const char *rhs) { + return Element::Equals(lhs, rhs); +} +// Runtime-size equality between 'lhs' and 'rhs'. +template +bool Equals(const char *lhs, const char *rhs, size_t size) { + return Element::Equals(lhs, rhs, size); +} + +// Fixed-size three-way comparison between 'lhs' and 'rhs'. +template +int ThreeWayCompare(const char *lhs, const char *rhs) { + return Element::ThreeWayCompare(lhs, rhs); +} +// Runtime-size three-way comparison between 'lhs' and 'rhs'. +template +int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + return Element::ThreeWayCompare(lhs, rhs, size); +} + +// Fixed-size initialization. +template +void SplatSet(char *dst, const unsigned char value) { + Element::SplatSet(dst, value); +} +// Runtime-size initialization. +template +void SplatSet(char *dst, const unsigned char value, size_t size) { + Element::SplatSet(dst, value, size); +} + +// Fixed-size Higher-Order Operations +// ---------------------------------- +// - Repeated: Repeat the operation several times in a row. +// - Chained: Chain the operation of several types. + +// Repeat the operation several times in a row. +template struct Repeated { + static constexpr size_t kSize = ElementCount * Element::kSize; + + static void Copy(char *__restrict dst, const char *__restrict src) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + Element::Copy(dst + offset, src + offset); + } + } + + static bool Equals(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + if (!Element::Equals(lhs + offset, rhs + offset)) + return false; + } + return true; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + // We make the assumption that 'Equals' si cheaper than 'ThreeWayCompare'. + if (Element::Equals(lhs + offset, rhs + offset)) + continue; + return Element::ThreeWayCompare(lhs + offset, rhs + offset); + } + return 0; + } + + static void SplatSet(char *dst, const unsigned char value) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + Element::SplatSet(dst + offset, value); + } + } +}; + +// Chain the operation of several types. +// For instance, to handle a 3 bytes operation, one can use: +// Chained::Operation(); +template struct Chained; + +template struct Chained { + static constexpr size_t kSize = Head::kSize + Chained::kSize; + + static void Copy(char *__restrict dst, const char *__restrict src) { + Chained::Copy(dst + Head::kSize, src + Head::kSize); + __llvm_libc::Copy(dst, src); + } + + static bool Equals(const char *lhs, const char *rhs) { + if (!__llvm_libc::Equals(lhs, rhs)) + return false; + return Chained::Equals(lhs + Head::kSize, rhs + Head::kSize); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + if (__llvm_libc::Equals(lhs, rhs)) + return Chained::ThreeWayCompare(lhs + Head::kSize, + rhs + Head::kSize); + return __llvm_libc::ThreeWayCompare(lhs, rhs); + } + + static void SplatSet(char *dst, const unsigned char value) { + Chained::SplatSet(dst + Head::kSize, value); + __llvm_libc::SplatSet(dst, value); + } +}; + +template <> struct Chained<> { + static constexpr size_t kSize = 0; + static void Copy(char *__restrict dst, const char *__restrict src) {} + static bool Equals(const char *lhs, const char *rhs) { return true; } + static int ThreeWayCompare(const char *lhs, const char *rhs) { return 0; } + static void SplatSet(char *dst, const unsigned char value) {} +}; + +// Runtime-size Higher-Order Operations +// ------------------------------------ +// - Tail: Perform the operation on the last 'T::kSize' bytes of the buffer. +// - HeadTail: Perform the operation on the first and last 'T::kSize' bytes +// of the buffer. +// - Loop: Perform a loop of fixed-sized operations. + +// Perform the operation on the last 'T::kSize' bytes of the buffer. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::kSize`. +template struct Tail { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + return T::Copy(dst + offset(size), src + offset(size)); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + return T::Equals(lhs + offset(size), rhs + offset(size)); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + return T::ThreeWayCompare(lhs + offset(size), rhs + offset(size)); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + return T::SplatSet(dst + offset(size), value); + } + + static size_t offset(size_t size) { return size - T::kSize; } +}; + +// Perform the operation on the first and last 'T::kSize' bytes of the buffer. +// This is useful for overlapping operations. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [__XXXXXXXX_________] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::kSize && size <= 2 x T::kSize`. +template struct HeadTail { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + T::Copy(dst, src); + Tail::Copy(dst, src, size); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + if (!T::Equals(lhs, rhs)) + return false; + return Tail::Equals(lhs, rhs, size); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + if (const int result = T::ThreeWayCompare(lhs, rhs)) + return result; + return Tail::ThreeWayCompare(lhs, rhs, size); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + T::SplatSet(dst, value); + Tail::SplatSet(dst, value, size); + } +}; + +// Simple loop ending with a Tail operation. +// +// e.g. with +// [12345678123456781234567812345678] +// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] +// [__XXXXXXXX_______________________] +// [__________XXXXXXXX_______________] +// [__________________XXXXXXXX_______] +// [______________________XXXXXXXX___] +// +// Precondition: +// - size >= T::kSize +template struct Loop { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + T::Copy(dst + offset, src + offset); + Tail::Copy(dst, src, size); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + if (!T::Equals(lhs + offset, rhs + offset)) + return false; + return Tail::Equals(lhs, rhs, size); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + if (const int result = T::ThreeWayCompare(lhs + offset, rhs + offset)) + return result; + return Tail::ThreeWayCompare(lhs, rhs, size); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + T::SplatSet(dst + offset, value); + Tail::SplatSet(dst, value, size); + } +}; + +enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 }; + +namespace internal { + +// Provides a specialized Bump function that adjusts pointers and size so first +// argument (resp. second argument) gets aligned to Alignment. +// We make sure the compiler knows about the adjusted pointer alignment. +template struct AlignHelper {}; + +template struct AlignHelper { + template + static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) { + const intptr_t offset = offset_to_next_aligned(p1ref); + p1ref += offset; + p2ref += offset; + size -= offset; + p1ref = assume_aligned(p1ref); + } +}; + +template struct AlignHelper { + template + static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) { + const intptr_t offset = offset_to_next_aligned(p2ref); + p1ref += offset; + p2ref += offset; + size -= offset; + p2ref = assume_aligned(p2ref); + } +}; + +} // namespace internal + +// An alignment operation that: +// - executes the 'AlignmentT' operation +// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected +// pointer gets aligned, size is decreased accordingly. +// - calls the 'NextT' operation. +// +// e.g. A 16-byte Destination Aligned 32-byte Loop Copy can be written as: +// Copy::Then>>(dst, src, count); +template struct Align { +private: + static constexpr size_t Alignment = AlignmentT::kSize; + static_assert(Alignment > 1, "Alignment must be more than 1"); + static_assert(is_power2(Alignment), "Alignment must be a power of 2"); + +public: + template struct Then { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + AlignmentT::Copy(dst, src); + internal::AlignHelper::Bump(dst, src, size); + NextT::Copy(dst, src, size); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + if (!AlignmentT::Equals(lhs, rhs)) + return false; + internal::AlignHelper::Bump(lhs, rhs, size); + return NextT::Equals(lhs, rhs, size); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + if (const int result = AlignmentT::ThreeWayCompare(lhs, rhs)) + return result; + internal::AlignHelper::Bump(lhs, rhs, size); + return NextT::ThreeWayCompare(lhs, rhs, size); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + AlignmentT::SplatSet(dst, value); + char *dummy = nullptr; + internal::AlignHelper::Bump(dst, dummy, size); + NextT::SplatSet(dst, value, size); + } + }; +}; + +// Fixed-size Builtin Operations +// ----------------------------- +// Note: Do not use 'builtin' right now as it requires the implementation of the +// `_inline` versions of all the builtins. Theoretically, Clang can still turn +// them into calls to the C library leading to reentrancy problems. +namespace builtin { +template struct Builtin { + static constexpr size_t kSize = Size; + + static void Copy(char *__restrict dst, const char *__restrict src) { + __builtin_memcpy_inline(dst, src, kSize); + } + + static bool Equals(const char *lhs, const char *rhs) { + return __builtin_memcmp(lhs, rhs, kSize) == 0; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + return __builtin_memcmp(lhs, rhs, kSize); + } + + static void SplatSet(char *dst, const unsigned char value) { + __builtin_memset(dst, value, kSize); + } +}; + +using _1 = Builtin<1>; +using _2 = Builtin<2>; +using _3 = Builtin<3>; +using _4 = Builtin<4>; +using _8 = Builtin<8>; +using _16 = Builtin<16>; +using _32 = Builtin<32>; +using _64 = Builtin<64>; +using _128 = Builtin<128>; + +} // namespace builtin + +// Fixed-size Scalar Operations +// ---------------------------- +namespace scalar { + +// The Scalar type makes use of simple sized integers. +template struct Scalar { + static constexpr size_t kSize = sizeof(T); + + static void Copy(char *__restrict dst, const char *__restrict src) { + Store(dst, Load(src)); + } + + static bool Equals(const char *lhs, const char *rhs) { + return Load(lhs) == Load(rhs); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + return ScalarThreeWayCompare(Load(lhs), Load(rhs)); + } + + static void SplatSet(char *dst, const unsigned char value) { + Store(dst, GetSplattedValue(value)); + } + +private: + static T Load(const char *ptr) { + T value; + __builtin_memcpy_inline(&value, ptr, kSize); + return value; + } + static void Store(char *ptr, T value) { + __builtin_memcpy_inline(ptr, &value, kSize); + } + static T GetSplattedValue(const unsigned char value) { + return T(~0) / T(0xFF) * T(value); + } + static int ScalarThreeWayCompare(T a, T b); +}; + +template <> +inline int Scalar::ScalarThreeWayCompare(uint8_t a, uint8_t b) { + const int16_t la = Endian::ToBigEndian(a); + const int16_t lb = Endian::ToBigEndian(b); + return la - lb; +} +template <> +inline int Scalar::ScalarThreeWayCompare(uint16_t a, uint16_t b) { + const int32_t la = Endian::ToBigEndian(a); + const int32_t lb = Endian::ToBigEndian(b); + return la - lb; +} +template <> +inline int Scalar::ScalarThreeWayCompare(uint32_t a, uint32_t b) { + const int64_t la = Endian::ToBigEndian(a); + const int64_t lb = Endian::ToBigEndian(b); + if (la < lb) + return -1; + if (la > lb) + return 1; + return 0; +} +template <> +inline int Scalar::ScalarThreeWayCompare(uint64_t a, uint64_t b) { + const __int128_t la = Endian::ToBigEndian(a); + const __int128_t lb = Endian::ToBigEndian(b); + if (la < lb) + return -1; + if (la > lb) + return 1; + return 0; +} + +using UINT8 = Scalar; // 1 Byte +using UINT16 = Scalar; // 2 Bytes +using UINT32 = Scalar; // 4 Bytes +using UINT64 = Scalar; // 8 Bytes + +using _1 = UINT8; +using _2 = UINT16; +using _3 = Chained; +using _4 = UINT32; +using _8 = UINT64; +using _16 = Repeated<_8, 2>; +using _32 = Repeated<_8, 4>; +using _64 = Repeated<_8, 8>; +using _128 = Repeated<_8, 16>; + +} // namespace scalar +} // namespace __llvm_libc + +#include + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/elements_x86.h @@ -0,0 +1,149 @@ +//===-- Elementary operations for x86 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H + +#include // size_t +#include // uint8_t, uint16_t, uint32_t, uint64_t + +#ifdef __SSE2__ +#include +#endif // __SSE2__ + +// Fixed-size Vector Operations +// ---------------------------- + +namespace __llvm_libc { +namespace x86 { + +#ifdef __SSE2__ +template struct Vector : public Base { + static void Copy(char *dst, const char *src) { + Base::Store(dst, Base::Load(src)); + } + + static bool Equals(const char *a, const char *b) { + return Base::NotEqualMask(Base::Load(a), Base::Load(b)) == 0; + } + + static int ThreeWayCompare(const char *a, const char *b) { + const auto mask = Base::NotEqualMask(Base::Load(a), Base::Load(b)); + if (!mask) + return 0; + return CharDiff(a, b, mask); + } + + static void SplatSet(char *dst, const unsigned char value) { + Base::Store(dst, Base::GetSplattedValue(value)); + } + + static int CharDiff(const char *a, const char *b, uint64_t mask) { + const size_t diff_index = __builtin_ctzl(mask); + const int ca = (unsigned char)a[diff_index]; + const int cb = (unsigned char)b[diff_index]; + return ca - cb; + } +}; + +struct M128 { + static constexpr size_t kSize = 16; + using T = char __attribute__((__vector_size__(kSize))); + static uint16_t mask(T value) { return _mm_movemask_epi8(value); } + static uint16_t NotEqualMask(T a, T b) { return mask(a != b); } + static T Load(const char *ptr) { + return _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr)); + } + static void Store(char *ptr, T value) { + return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), value); + } + static T GetSplattedValue(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector128 = Vector; // 16 Bytes + +#ifdef __AVX2__ +struct M256 { + static constexpr size_t kSize = 32; + using T = char __attribute__((__vector_size__(kSize))); + static uint32_t mask(T value) { return _mm256_movemask_epi8(value); } + static uint32_t NotEqualMask(T a, T b) { return mask(a != b); } + static T Load(const char *ptr) { + return _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr)); + } + static void Store(char *ptr, T value) { + return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), value); + } + static T GetSplattedValue(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector256 = Vector; // 32 Bytes + +#if defined(__AVX512F__) and defined(__AVX512BW__) +struct M512 { + static constexpr size_t kSize = 64; + using T = char __attribute__((__vector_size__(kSize))); + static uint64_t NotEqualMask(T a, T b) { + return _mm512_cmpneq_epi8_mask(a, b); + } + static T Load(const char *ptr) { return _mm512_loadu_epi8(ptr); } + static void Store(char *ptr, T value) { + return _mm512_storeu_epi8(ptr, value); + } + static T GetSplattedValue(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; +using Vector512 = Vector; + +#endif // defined(__AVX512F__) and defined(__AVX512BW__) +#endif // __AVX2__ +#endif // __SSE2__ + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _8 = __llvm_libc::scalar::_8; +#if defined(__AVX512F__) && defined(__AVX512BW__) +using _16 = x86::Vector128; +using _32 = x86::Vector256; +using _64 = x86::Vector512; +using _128 = Repeated<_64, 2>; +#elif defined(__AVX2__) +using _16 = x86::Vector128; +using _32 = x86::Vector256; +using _64 = Repeated<_32, 2>; +using _128 = Repeated<_32, 4>; +#elif defined(__SSE2__) +using _16 = x86::Vector128; +using _32 = Repeated<_16, 2>; +using _64 = Repeated<_16, 4>; +using _128 = Repeated<_16, 8>; +#else +using _16 = Repeated<_8, 2>; +using _32 = Repeated<_8, 4>; +using _64 = Repeated<_8, 8>; +using _128 = Repeated<_8, 16>; +#endif + +} // namespace x86 +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h deleted file mode 100644 --- a/libc/src/string/memory_utils/memcpy_utils.h +++ /dev/null @@ -1,140 +0,0 @@ -//===-- Memcpy utils --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H - -#include "src/__support/sanitizer.h" -#include "src/string/memory_utils/utils.h" -#include // size_t - -// __builtin_memcpy_inline guarantees to never call external functions. -// Unfortunately it is not widely available. -#ifdef __clang__ -#if __has_builtin(__builtin_memcpy_inline) -#define USE_BUILTIN_MEMCPY_INLINE -#endif -#elif defined(__GNUC__) -#define USE_BUILTIN_MEMCPY -#endif - -namespace __llvm_libc { - -// This is useful for testing. -#if defined(LLVM_LIBC_MEMCPY_MONITOR) -extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict, - const char *__restrict, size_t); -#endif - -// Copies `kBlockSize` bytes from `src` to `dst` using a for loop. -// This code requires the use of `-fno-buitin-memcpy` to prevent the compiler -// from turning the for-loop back into `__builtin_memcpy`. -template -static void ForLoopCopy(char *__restrict dst, const char *__restrict src) { - for (size_t i = 0; i < kBlockSize; ++i) - dst[i] = src[i]; -} - -// Copies `kBlockSize` bytes from `src` to `dst`. -template -static void CopyBlock(char *__restrict dst, const char *__restrict src) { -#if defined(LLVM_LIBC_MEMCPY_MONITOR) - LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize); -#elif LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER - ForLoopCopy(dst, src); -#elif defined(USE_BUILTIN_MEMCPY_INLINE) - __builtin_memcpy_inline(dst, src, kBlockSize); -#elif defined(USE_BUILTIN_MEMCPY) - __builtin_memcpy(dst, src, kBlockSize); -#else - ForLoopCopy(dst, src); -#endif -} - -// Copies `kBlockSize` bytes from `src + count - kBlockSize` to -// `dst + count - kBlockSize`. -// Precondition: `count >= kBlockSize`. -template -static void CopyLastBlock(char *__restrict dst, const char *__restrict src, - size_t count) { - const size_t offset = count - kBlockSize; - CopyBlock(dst + offset, src + offset); -} - -// Copies `kBlockSize` bytes twice with an overlap between the two. -// -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [__XXXXXXXX_________] -// [________XXXXXXXX___] -// -// Precondition: `count >= kBlockSize && count <= kBlockSize`. -template -static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src, - size_t count) { - CopyBlock(dst, src); - CopyLastBlock(dst, src, count); -} - -// Copies `count` bytes by blocks of `kBlockSize` bytes. -// Copies at the start and end of the buffer are unaligned. -// Copies in the middle of the buffer are aligned to `kAlignment`. -// -// e.g. with -// [12345678123456781234567812345678] -// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] -// [__XXXX___________________________] -// [_____XXXXXXXX____________________] -// [_____________XXXXXXXX____________] -// [_____________________XXXXXXXX____] -// [______________________XXXXXXXX___] -// -// Precondition: `kAlignment <= kBlockSize` -// `count > 2 * kBlockSize` for efficiency. -// `count >= kAlignment` for correctness. -template -static void CopySrcAlignedBlocks(char *__restrict dst, - const char *__restrict src, size_t count) { - static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); - static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); - static_assert(kAlignment <= kBlockSize, - "kAlignment must be less or equal to block size"); - CopyBlock(dst, src); // Copy first block - - // Copy aligned blocks - const size_t ofla = offset_from_last_aligned(src); - const size_t limit = count + ofla - kBlockSize; - for (size_t offset = kAlignment; offset < limit; offset += kBlockSize) - CopyBlock(dst - ofla + offset, - assume_aligned(src - ofla + offset)); - - CopyLastBlock(dst, src, count); // Copy last block -} - -template -static void CopyDstAlignedBlocks(char *__restrict dst, - const char *__restrict src, size_t count) { - static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); - static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); - static_assert(kAlignment <= kBlockSize, - "kAlignment must be less or equal to block size"); - CopyBlock(dst, src); // Copy first block - - // Copy aligned blocks - const size_t ofla = offset_from_last_aligned(dst); - const size_t limit = count + ofla - kBlockSize; - for (size_t offset = kAlignment; offset < limit; offset += kBlockSize) - CopyBlock(assume_aligned(dst - ofla + offset), - src - ofla + offset); - - CopyLastBlock(dst, src, count); // Copy last block -} - -} // namespace __llvm_libc - -#endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H diff --git a/libc/src/string/memory_utils/memset_utils.h b/libc/src/string/memory_utils/memset_utils.h --- a/libc/src/string/memory_utils/memset_utils.h +++ b/libc/src/string/memory_utils/memset_utils.h @@ -6,70 +6,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include // size_t namespace __llvm_libc { -// Sets `kBlockSize` bytes starting from `src` to `value`. -template static void SetBlock(char *dst, unsigned value) { - // Theoretically the compiler is allowed to call memset here and end up with a - // recursive call, practically it doesn't happen, however this should be - // replaced with a __builtin_memset_inline once it's available in clang. - __builtin_memset(dst, value, kBlockSize); -} - -// Sets `kBlockSize` bytes from `src + count - kBlockSize` to `value`. -// Precondition: `count >= kBlockSize`. -template -static void SetLastBlock(char *dst, unsigned value, size_t count) { - SetBlock(dst + count - kBlockSize, value); -} - -// Sets `kBlockSize` bytes twice with an overlap between the two. -// -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [__XXXXXXXX_________] -// [________XXXXXXXX___] -// -// Precondition: `count >= kBlockSize && count <= kBlockSize`. -template -static void SetBlockOverlap(char *dst, unsigned value, size_t count) { - SetBlock(dst, value); - SetLastBlock(dst, value, count); -} - -// Sets `count` bytes by blocks of `kBlockSize` bytes. -// Sets at the start and end of the buffer are unaligned. -// Sets in the middle of the buffer are aligned to `kBlockSize`. -// -// e.g. with -// [12345678123456781234567812345678] -// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___] -// [__XXXXXXXX______________________] -// [________XXXXXXXX________________] -// [________________XXXXXXXX________] -// [_____________________XXXXXXXX___] -// -// Precondition: `count > 2 * kBlockSize` for efficiency. -// `count >= kBlockSize` for correctness. -template -static void SetAlignedBlocks(char *dst, unsigned value, size_t count) { - SetBlock(dst, value); // Set first block - - // Set aligned blocks - size_t offset = kBlockSize - offset_from_last_aligned(dst); - for (; offset + kBlockSize < count; offset += kBlockSize) - SetBlock(dst + offset, value); - - SetLastBlock(dst, value, count); // Set last block -} - // A general purpose implementation assuming cheap unaligned writes for sizes: // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 // or 64 Bytes at a time, the compiler will expand them as needed. @@ -106,26 +52,27 @@ if (count == 0) return; if (count == 1) - return SetBlock<1>(dst, value); + return SplatSet(dst, value); if (count == 2) - return SetBlock<2>(dst, value); + return SplatSet(dst, value); if (count == 3) - return SetBlock<3>(dst, value); + return SplatSet(dst, value); if (count == 4) - return SetBlock<4>(dst, value); + return SplatSet(dst, value); if (count <= 8) - return SetBlockOverlap<4>(dst, value, count); + return SplatSet>(dst, value, count); if (count <= 16) - return SetBlockOverlap<8>(dst, value, count); + return SplatSet>(dst, value, count); if (count <= 32) - return SetBlockOverlap<16>(dst, value, count); + return SplatSet>(dst, value, count); if (count <= 64) - return SetBlockOverlap<32>(dst, value, count); + return SplatSet>(dst, value, count); if (count <= 128) - return SetBlockOverlap<64>(dst, value, count); - return SetAlignedBlocks<32>(dst, value, count); + return SplatSet>(dst, value, count); + return SplatSet::Then>>( + dst, value, count); } } // namespace __llvm_libc -#endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H diff --git a/libc/src/string/x86_64/memcpy.cpp b/libc/src/string/x86_64/memcpy.cpp --- a/libc/src/string/x86_64/memcpy.cpp +++ b/libc/src/string/x86_64/memcpy.cpp @@ -8,9 +8,10 @@ #include "src/string/memcpy.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memcpy_utils.h" +#include "src/string/memory_utils/elements.h" namespace __llvm_libc { +namespace x86 { // Whether to use only rep;movsb. constexpr bool kUseOnlyRepMovsb = @@ -29,8 +30,11 @@ // Whether target supports AVX instructions. constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__); -// The chunk size used for the loop copy strategy. -constexpr size_t kLoopCopyBlockSize = kHasAvx ? 64 : 32; +#ifdef __AVX__ +using LoopBlockSize = _64; +#else +using LoopBlockSize = _32; +#endif static void CopyRepMovsb(char *__restrict dst, const char *__restrict src, size_t count) { @@ -67,35 +71,37 @@ if (count == 0) return; if (count == 1) - return CopyBlock<1>(dst, src); + return Copy<_1>(dst, src); if (count == 2) - return CopyBlock<2>(dst, src); + return Copy<_2>(dst, src); if (count == 3) - return CopyBlock<3>(dst, src); + return Copy<_3>(dst, src); if (count == 4) - return CopyBlock<4>(dst, src); + return Copy<_4>(dst, src); if (count < 8) - return CopyBlockOverlap<4>(dst, src, count); + return Copy>(dst, src, count); if (count < 16) - return CopyBlockOverlap<8>(dst, src, count); + return Copy>(dst, src, count); if (count < 32) - return CopyBlockOverlap<16>(dst, src, count); + return Copy>(dst, src, count); if (count < 64) - return CopyBlockOverlap<32>(dst, src, count); + return Copy>(dst, src, count); if (count < 128) - return CopyBlockOverlap<64>(dst, src, count); + return Copy>(dst, src, count); if (kHasAvx && count < 256) - return CopyBlockOverlap<128>(dst, src, count); + return Copy>(dst, src, count); if (count <= kRepMovsBSize) - return CopyDstAlignedBlocks(dst, src, count); + return Copy::Then>>(dst, src, + count); return CopyRepMovsb(dst, src, count); } +} // namespace x86 LLVM_LIBC_FUNCTION(void *, memcpy, (void *__restrict dst, const void *__restrict src, size_t size)) { - memcpy_x86(reinterpret_cast(dst), reinterpret_cast(src), - size); + x86::memcpy_x86(reinterpret_cast(dst), + reinterpret_cast(src), size); return dst; } diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -3,11 +3,14 @@ SUITE libc_string_unittests SRCS + elements_test.cpp + memory_access_test.cpp utils_test.cpp - memcpy_utils_test.cpp DEPENDS libc.src.string.memory_utils.memory_utils libc.utils.CPP.standalone_cpp + COMPILE_OPTIONS + -march=native ) target_compile_definitions( diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memory_utils/elements_test.cpp @@ -0,0 +1,103 @@ +//===-- Unittests for memory_utils ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memory_utils/elements.h" +#include "utils/CPP/Array.h" +#include "utils/UnitTest/Test.h" + +namespace __llvm_libc { + +// Registering Types +using FixedSizeTypes = testing::TypeList< +#ifdef __SSE2__ + x86::Vector128, // +#endif // __SSE2__ +#ifdef __AVX2__ + x86::Vector256, // +#endif // __AVX2__ +#if defined(__AVX512F__) and defined(__AVX512BW__) + x86::Vector512, // +#endif // defined(__AVX512F__) and defined(__AVX512BW__) + scalar::UINT8, // + scalar::UINT16, // + scalar::UINT32, // + scalar::UINT64, // + Repeated, // + Repeated, // + Repeated, // + Repeated, // + Repeated, // + Chained, // + Chained, // + builtin::_1, // + builtin::_2, // + builtin::_3, // + builtin::_4, // + builtin::_8 // + >; + +char GetRandomChar() { + static constexpr const uint64_t a = 1103515245; + static constexpr const uint64_t c = 12345; + static constexpr const uint64_t m = 1ULL << 31; + static uint64_t seed = 123456789; + seed = (a * seed + c) % m; + return seed; +} + +template using Buffer = cpp::Array; +template Buffer GetRandomBuffer() { + Buffer buffer; + for (auto ¤t : buffer) + current = GetRandomChar(); + return buffer; +} + +TYPED_TEST(LlvmLibcMemoryElements, Copy, FixedSizeTypes) { + Buffer Dst; + const auto buffer = GetRandomBuffer(); + Copy(Dst.data(), buffer.data()); + for (size_t i = 0; i < ParamType::kSize; ++i) + EXPECT_EQ(Dst[i], buffer[i]); +} + +TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) { + const auto buffer = GetRandomBuffer(); + EXPECT_TRUE(Equals(buffer.data(), buffer.data())); +} + +TYPED_TEST(LlvmLibcMemoryElements, ThreeWayCompare, FixedSizeTypes) { + Buffer initial; + for (auto &c : initial) + c = 5; + + // Testing equality + EXPECT_EQ(ThreeWayCompare(initial.data(), initial.data()), 0); + + // Testing all mismatching positions + for (size_t i = 0; i < ParamType::kSize; ++i) { + auto copy = initial; + ++copy[i]; // Copy is now lexicographycally greated than initial + const auto *less = initial.data(); + const auto *greater = copy.data(); + EXPECT_LT(ThreeWayCompare(less, greater), 0); + EXPECT_GT(ThreeWayCompare(greater, less), 0); + } +} + +TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) { + Buffer Dst; + const cpp::Array values = {char(0x00), char(0x7F), char(0xFF)}; + for (char value : values) { + SplatSet(Dst.data(), value); + for (size_t i = 0; i < ParamType::kSize; ++i) + EXPECT_EQ(Dst[i], value); + } +} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp deleted file mode 100644 --- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp +++ /dev/null @@ -1,336 +0,0 @@ -//===-- Unittests for memory_utils ----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/string/memory_utils/memcpy_utils.h" -#include "utils/CPP/Array.h" -#include "utils/UnitTest/Test.h" - -#include -#include // uintptr_t - -#ifndef LLVM_LIBC_MEMCPY_MONITOR -#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test. -#endif - -namespace __llvm_libc { - -struct Buffer { - static constexpr size_t kMaxBuffer = 1024; - char buffer[kMaxBuffer + 1]; - size_t last = 0; - - void Clear() { - last = 0; - for (size_t i = 0; i < kMaxBuffer; ++i) - buffer[i] = '0'; - buffer[kMaxBuffer] = '\0'; - } - - void Increment(const void *ptr) { - const auto offset = reinterpret_cast(ptr); - assert(offset < kMaxBuffer); - ++buffer[offset]; - if (offset > last) - last = offset; - } - - char *Finish() { - assert(last < kMaxBuffer); - buffer[last + 1] = '\0'; - return buffer; - } -}; - -struct Trace { - Buffer read; - Buffer write; - - void Add(char *__restrict dst, const char *__restrict src, size_t count) { - for (size_t i = 0; i < count; ++i) - read.Increment(src + i); - for (size_t i = 0; i < count; ++i) - write.Increment(dst + i); - } - - void Clear() { - read.Clear(); - write.Clear(); - } - - char *Read() { return read.Finish(); } - char *Write() { return write.Finish(); } -}; - -static Trace &GetTrace() { - static thread_local Trace events; - return events; -} - -extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst, - const char *__restrict src, - size_t count) { - GetTrace().Add(dst, src, count); -} - -char *I(uintptr_t offset) { return reinterpret_cast(offset); } - -TEST(LlvmLibcMemcpyUtilsTest, CopyTrivial) { - auto &trace = GetTrace(); - - trace.Clear(); - CopyBlock<1>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "1"); - EXPECT_STREQ(trace.Read(), "1"); - - trace.Clear(); - CopyBlock<2>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "11"); - EXPECT_STREQ(trace.Read(), "11"); - - trace.Clear(); - CopyBlock<4>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "1111"); - EXPECT_STREQ(trace.Read(), "1111"); - - trace.Clear(); - CopyBlock<8>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "11111111"); - EXPECT_STREQ(trace.Read(), "11111111"); - - trace.Clear(); - CopyBlock<16>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "1111111111111111"); - EXPECT_STREQ(trace.Read(), "1111111111111111"); - - trace.Clear(); - CopyBlock<32>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111"); - EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111"); - - trace.Clear(); - CopyBlock<64>(I(0), I(0)); - EXPECT_STREQ( - trace.Write(), - "1111111111111111111111111111111111111111111111111111111111111111"); - EXPECT_STREQ( - trace.Read(), - "1111111111111111111111111111111111111111111111111111111111111111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyOffset) { - auto &trace = GetTrace(); - - trace.Clear(); - CopyBlock<1>(I(3), I(1)); - EXPECT_STREQ(trace.Write(), "0001"); - EXPECT_STREQ(trace.Read(), "01"); - - trace.Clear(); - CopyBlock<1>(I(2), I(1)); - EXPECT_STREQ(trace.Write(), "001"); - EXPECT_STREQ(trace.Read(), "01"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyBlockOverlap) { - auto &trace = GetTrace(); - - trace.Clear(); - CopyBlockOverlap<2>(I(0), I(0), 2); - EXPECT_STREQ(trace.Write(), "22"); - EXPECT_STREQ(trace.Read(), "22"); - - trace.Clear(); - CopyBlockOverlap<2>(I(0), I(0), 3); - EXPECT_STREQ(trace.Write(), "121"); - EXPECT_STREQ(trace.Read(), "121"); - - trace.Clear(); - CopyBlockOverlap<2>(I(0), I(0), 4); - EXPECT_STREQ(trace.Write(), "1111"); - EXPECT_STREQ(trace.Read(), "1111"); - - trace.Clear(); - CopyBlockOverlap<4>(I(2), I(1), 7); - EXPECT_STREQ(trace.Write(), "001112111"); - EXPECT_STREQ(trace.Read(), "01112111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) { - auto &trace = GetTrace(); - // Source is aligned and multiple of alignment. - // "1111" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(0), 4); - EXPECT_STREQ(trace.Write(), "2222"); - EXPECT_STREQ(trace.Read(), "2222"); - - // Source is aligned and multiple of alignment. - // "11110000" - // + "00001111" - // = "11111111" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "11111111"); - EXPECT_STREQ(trace.Read(), "11111111"); - - // Source is aligned already overlap at end. - // "1111000000000" - // + "0000111100000" - // + "0000000011110" - // + "0000000001111" - // = "1111111112221" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(0), 13); - EXPECT_STREQ(trace.Write(), "1111111112221"); - EXPECT_STREQ(trace.Read(), "1111111112221"); - - // Misaligned source. - // "01111000000000" - // + "00001111000000" - // + "00000000111100" - // + "00000000001111" - // = "01112111112211" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(1), 13); - EXPECT_STREQ(trace.Write(), "1112111112211"); - EXPECT_STREQ(trace.Read(), "01112111112211"); - - // Misaligned source aligned at end. - // "011110000000" - // + "000011110000" - // + "000000001111" - // = "011121111111" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(1), 11); - EXPECT_STREQ(trace.Write(), "11121111111"); - EXPECT_STREQ(trace.Read(), "011121111111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) { - auto &trace = GetTrace(); - // Destination is aligned and multiple of alignment. - // "1111" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(0), I(0), 4); - EXPECT_STREQ(trace.Write(), "2222"); - EXPECT_STREQ(trace.Read(), "2222"); - - // Destination is aligned and multiple of alignment. - // "11110000" - // + "00001111" - // = "11111111" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "11111111"); - EXPECT_STREQ(trace.Read(), "11111111"); - - // Destination is aligned already overlap at end. - // "1111000000000" - // + "0000111100000" - // + "0000000011110" - // + "0000000001111" - // = "1111111112221" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(0), I(0), 13); - EXPECT_STREQ(trace.Write(), "1111111112221"); - EXPECT_STREQ(trace.Read(), "1111111112221"); - - // Misaligned destination. - // "01111000000000" - // + "00001111000000" - // + "00000000111100" - // + "00000000001111" - // = "01112111112211" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(1), I(0), 13); - EXPECT_STREQ(trace.Write(), "01112111112211"); - EXPECT_STREQ(trace.Read(), "1112111112211"); - - // Misaligned destination aligned at end. - // "011110000000" - // + "000011110000" - // + "000000001111" - // = "011121111111" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(1), I(0), 11); - EXPECT_STREQ(trace.Write(), "011121111111"); - EXPECT_STREQ(trace.Read(), "11121111111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) { - auto &trace = GetTrace(); - // Source is aligned and multiple of alignment. - // "11111111" - trace.Clear(); - CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "22221111"); - EXPECT_STREQ(trace.Read(), "22221111"); - - // Destination is aligned and multiple of alignment. - // "11111111" - trace.Clear(); - CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "22221111"); - EXPECT_STREQ(trace.Read(), "22221111"); - - // Source is aligned and multiple of alignment. - // "111111111" - trace.Clear(); - CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9); - EXPECT_STREQ(trace.Write(), "122211111"); - EXPECT_STREQ(trace.Read(), "122211111"); - - // Destination is aligned and multiple of alignment. - // "111111111" - trace.Clear(); - CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9); - EXPECT_STREQ(trace.Write(), "122211111"); - EXPECT_STREQ(trace.Read(), "122211111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksMaxReloads) { - auto &trace = GetTrace(); - for (size_t alignment = 0; alignment < 32; ++alignment) { - for (size_t count = 64; count < 768; ++count) { - trace.Clear(); - // We should never reload more than twice when copying from count = 2x32. - CopySrcAlignedBlocks<32>(I(alignment), I(0), count); - const char *const written = trace.Write(); - // First bytes are untouched. - for (size_t i = 0; i < alignment; ++i) - EXPECT_EQ(written[i], '0'); - // Next bytes are loaded once or twice but no more. - for (size_t i = alignment; i < count; ++i) { - EXPECT_GE(written[i], '1'); - EXPECT_LE(written[i], '2'); - } - } - } -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignmentMaxReloads) { - auto &trace = GetTrace(); - for (size_t alignment = 0; alignment < 32; ++alignment) { - for (size_t count = 64; count < 768; ++count) { - trace.Clear(); - // We should never reload more than twice when copying from count = 2x32. - CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count); - const char *const written = trace.Write(); - // First bytes are untouched. - for (size_t i = 0; i < alignment; ++i) - EXPECT_EQ(written[i], '0'); - // Next bytes are loaded once or twice but no more. - for (size_t i = alignment; i < count; ++i) { - EXPECT_GE(written[i], '1'); - EXPECT_LE(written[i], '2'); - } - } - } -} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memory_utils/memory_access_test.cpp @@ -0,0 +1,231 @@ +//===-- Unittests for memory_utils ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define LLVM_LIBC_UNITTEST_OBSERVE 1 + +#include "src/string/memory_utils/elements.h" +#include "utils/CPP/Array.h" +#include "utils/CPP/ArrayRef.h" +#include "utils/UnitTest/Test.h" + +#include +#include + +namespace __llvm_libc { + +static constexpr const size_t kMaxBuffer = 32; + +struct BufferAccess : cpp::Array { + BufferAccess() { Reset(); } + void Reset() { + for (auto &value : *this) + value = '0'; + this->operator[](kMaxBuffer) = '\0'; + } + void Touch(ptrdiff_t offset, size_t size) { + if (offset < 0) + return; + for (size_t i = 0; i < size; ++i) + ++(*this)[offset + i]; + } + operator const char *() const { return this->data(); } +}; + +struct Buffer { + ptrdiff_t Offset(const char *ptr) const { + const bool contained = ptr >= data.begin() && ptr < data.end(); + return contained ? ptr - data.begin() : -1; + } + void Reset() { + reads.Reset(); + writes.Reset(); + } + cpp::Array data; + BufferAccess __attribute__((aligned(64))) reads; + BufferAccess __attribute__((aligned(64))) writes; +}; + +struct MemoryAccessObserver { + void ObserveRead(const char *ptr, size_t size) { + Buffer1.reads.Touch(Buffer1.Offset(ptr), size); + Buffer2.reads.Touch(Buffer2.Offset(ptr), size); + } + + void ObserveWrite(const char *ptr, size_t size) { + Buffer1.writes.Touch(Buffer1.Offset(ptr), size); + Buffer2.writes.Touch(Buffer2.Offset(ptr), size); + } + + void Reset() { + Buffer1.Reset(); + Buffer2.Reset(); + } + + Buffer Buffer1; + Buffer Buffer2; +}; + +MemoryAccessObserver Observer; + +template struct TestingElement { + static constexpr size_t kSize = Size; + + static void Copy(char *__restrict dst, const char *__restrict src) { + Observer.ObserveRead(src, kSize); + Observer.ObserveWrite(dst, kSize); + } + + static bool Equals(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, kSize); + Observer.ObserveRead(rhs, kSize); + return true; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, kSize); + Observer.ObserveRead(rhs, kSize); + return 0; + } + + static void SplatSet(char *dst, const unsigned char value) { + Observer.ObserveWrite(dst, kSize); + } +}; + +using Types = testing::TypeList< + TestingElement<1>, // 1 Byte + TestingElement<2>, // 2 Bytes + TestingElement<4>, // 4 Bytes + Repeated, 3>, // 6 Bytes + Chained, TestingElement<2>, TestingElement<1>> // 7 Bytes + >; + +struct LlvmLibcTestAccessBase : public testing::Test { + + template + void checkOperations(const BufferAccess &expected) { + static const BufferAccess untouched; + + Observer.Reset(); + HigherOrder::Copy(dst_ptr() + Offset, src_ptr() + Offset, Size); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(src().reads, expected); + ASSERT_STREQ(dst().writes, expected); + Observer.Reset(); + HigherOrder::Equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::ThreeWayCompare(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::SplatSet(dst_ptr() + Offset, 5, Size); + ASSERT_STREQ(src().reads, untouched); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(dst().writes, expected); + } + + void checkMaxAccess(const BufferAccess &expected, int max) { + for (size_t i = 0; i < kMaxBuffer; ++i) { + int value = (int)expected[i] - '0'; + if (value < 0 || value > max) { + printf("expected no more than %d access, was '%s'\n", max, + (const char *)expected); + ASSERT_LE(value, max); + } + } + } + +private: + const Buffer &lhs() const { return Observer.Buffer1; } + const Buffer &rhs() const { return Observer.Buffer2; } + const Buffer &src() const { return Observer.Buffer2; } + const Buffer &dst() const { return Observer.Buffer1; } + Buffer &dst() { return Observer.Buffer1; } + + char *dst_ptr() { return dst().data.begin(); } + const char *src_ptr() { return src().data.begin(); } + const char *lhs_ptr() { return lhs().data.begin(); } + const char *rhs_ptr() { return rhs().data.begin(); } +}; + +template +struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase { + + void TearDown() override { + static constexpr size_t Size = 10; + + BufferAccess expected; + expected.Touch(Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 1); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {} + +template +struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + + BufferAccess expected; + expected.Touch(0, ParamType::kSize); + expected.Touch(Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 2); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {} + +template +struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 20; + + BufferAccess expected; + for (size_t i = 0; i < Size - ParamType::kSize; i += ParamType::kSize) + expected.Touch(i, ParamType::kSize); + expected.Touch(Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 2); + checkOperations, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {} + +template +struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + static constexpr size_t Offset = 2; + using AlignmentT = TestingElement<4>; + + BufferAccess expected; + expected.Touch(Offset, AlignmentT::kSize); + expected.Touch(AlignmentT::kSize, ParamType::kSize); + expected.Touch(Offset + Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 3); + checkOperations::Then>, Size, + Offset>(expected); + checkOperations::Then>, Size, + Offset>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {} + +} // namespace __llvm_libc