diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -194,7 +194,7 @@ SRCS ${ADD_IMPL_SRCS} HDRS ${ADD_IMPL_HDRS} DEPENDS ${ADD_IMPL_DEPENDS} - COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} + COMPILE_OPTIONS ${ADD_IMPL_COMPILE_OPTIONS} "SHELL:-mllvm -combiner-global-alias-analysis" ) get_fq_target_name(${impl_name} fq_target_name) set_target_properties(${fq_target_name} PROPERTIES REQUIRE_CPU_FEATURES "${ADD_IMPL_REQUIRE}") diff --git a/libc/src/string/aarch64/memcpy.cpp b/libc/src/string/aarch64/memcpy.cpp --- a/libc/src/string/aarch64/memcpy.cpp +++ b/libc/src/string/aarch64/memcpy.cpp @@ -8,10 +8,19 @@ #include "src/string/memcpy.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memcpy_utils.h" +#include "src/string/memory_utils/elements.h" namespace __llvm_libc { +using _1 = scalar::UINT8; +using _2 = scalar::UINT16; +using _3 = Chained<scalar::UINT16, scalar::UINT8>; +using _4 = scalar::UINT32; +using _8 = scalar::UINT64; +using _16 = Repeated<scalar::UINT64, 2>; +using _32 = Repeated<scalar::UINT64, 4>; +using _64 = Repeated<scalar::UINT64, 8>; + // Design rationale // ================ // @@ -37,24 +46,24 @@ if (count == 0) return; if (count == 1) - return CopyBlock<1>(dst, src); + return Copy<_1>(dst, src); if (count == 2) - return CopyBlock<2>(dst, src); + return Copy<_2>(dst, src); if (count == 3) - return CopyBlock<3>(dst, src); + return Copy<_3>(dst, src); if (count == 4) - return CopyBlock<4>(dst, src); + return Copy<_4>(dst, src); if (count < 8) - return CopyBlockOverlap<4>(dst, src, count); + return Copy<HeadTail<_4>>(dst, src, count); if (count < 16) - return CopyBlockOverlap<8>(dst, src, count); + return Copy<HeadTail<_8>>(dst, src, count); if (count < 32) - return CopyBlockOverlap<16>(dst, src, count); + return Copy<HeadTail<_16>>(dst, src, count); if (count < 64) - return CopyBlockOverlap<32>(dst, src, count); + return 
Copy<HeadTail<_32>>(dst, src, count); if (count < 128) - return CopyBlockOverlap<64>(dst, src, count); - return CopySrcAlignedBlocks<64, 16>(dst, src, count); + return Copy<HeadTail<_64>>(dst, src, count); + return Copy<Align<_16, Arg::Src>::Then<Loop<_64>>>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memcpy.cpp b/libc/src/string/memcpy.cpp --- a/libc/src/string/memcpy.cpp +++ b/libc/src/string/memcpy.cpp @@ -8,7 +8,7 @@ #include "src/string/memcpy.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memcpy_utils.h" +#include "src/string/memory_utils/elements.h" namespace __llvm_libc { @@ -32,27 +32,30 @@ // with little change on the code side. static void memcpy_impl(char *__restrict dst, const char *__restrict src, size_t count) { + // Use scalar strategies (_1, _2, _3 ...) + using namespace __llvm_libc::scalar; + if (count == 0) return; if (count == 1) - return CopyBlock<1>(dst, src); + return Copy<_1>(dst, src); if (count == 2) - return CopyBlock<2>(dst, src); + return Copy<_2>(dst, src); if (count == 3) - return CopyBlock<3>(dst, src); + return Copy<_3>(dst, src); if (count == 4) - return CopyBlock<4>(dst, src); + return Copy<_4>(dst, src); if (count < 8) - return CopyBlockOverlap<4>(dst, src, count); + return Copy<HeadTail<_4>>(dst, src, count); if (count < 16) - return CopyBlockOverlap<8>(dst, src, count); + return Copy<HeadTail<_8>>(dst, src, count); if (count < 32) - return CopyBlockOverlap<16>(dst, src, count); + return Copy<HeadTail<_16>>(dst, src, count); if (count < 64) - return CopyBlockOverlap<32>(dst, src, count); + return Copy<HeadTail<_32>>(dst, src, count); if (count < 128) - return CopyBlockOverlap<64>(dst, src, count); - return CopySrcAlignedBlocks<32>(dst, src, count); + return Copy<HeadTail<_64>>(dst, src, count); + return Copy<Align<_32, Arg::Src>::Then<Loop<_32>>>(dst, src, count); } LLVM_LIBC_FUNCTION(void *, memcpy, diff --git a/libc/src/string/memory_utils/CMakeLists.txt 
b/libc/src/string/memory_utils/CMakeLists.txt --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -2,6 +2,5 @@ memory_utils HDRS utils.h - memcpy_utils.h - memset_utils.h + elements.h ) diff --git a/libc/src/string/memory_utils/elements.h b/libc/src/string/memory_utils/elements.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/elements.h @@ -0,0 +1,475 @@ +//===-- Elementary operations to compose memory primitives ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H + +#include <stddef.h> // size_t +#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t + +#include "src/__support/endian.h" +#include "src/string/memory_utils/utils.h" + +namespace __llvm_libc { + +// Elementary Operations +// -------------------------------- +// We define abstract elementary operations acting on fixed chunks of memory. +// These are low level building blocks that are meant to be assembled to compose +// higher order abstractions. Each function is defined twice: once with +// fixed-size operations, and once with runtime-size operations. + +// Fixed-size copies from 'src' to 'dst'. +template <typename Element> +void Copy(char *__restrict dst, const char *__restrict src) { + Element::Copy(dst, src); +} +// Runtime-size copies from 'src' to 'dst'. +template <typename Element> +void Copy(char *__restrict dst, const char *__restrict src, size_t size) { + Element::Copy(dst, src, size); +} + +// Fixed-size equality between 'lhs' and 'rhs'. 
+template <typename Element> bool Equals(const char *lhs, const char *rhs) { + return Element::Equals(lhs, rhs); +} +// Runtime-size equality between 'lhs' and 'rhs'. +template <typename Element> +bool Equals(const char *lhs, const char *rhs, size_t size) { + return Element::Equals(lhs, rhs, size); +} + +// Fixed-size three-way comparison between 'lhs' and 'rhs'. +template <typename Element> +int ThreeWayCompare(const char *lhs, const char *rhs) { + return Element::ThreeWayCompare(lhs, rhs); +} +// Runtime-size three-way comparison between 'lhs' and 'rhs'. +template <typename Element> +int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + return Element::ThreeWayCompare(lhs, rhs, size); +} + +// Fixed-size initialization. +template <typename Element> +void SplatSet(char *dst, const unsigned char value) { + Element::SplatSet(dst, value); +} +// Runtime-size initialization. +template <typename Element> +void SplatSet(char *dst, const unsigned char value, size_t size) { + Element::SplatSet(dst, value, size); +} + +// Fixed-size Higher-Order Operations +// ---------------------------------- +// - Repeated<Type, ElementCount>: Repeat the operation several times in a row. +// - Chained<Types...>: Chain the operation of several types. + +// Repeat the operation several times in a row. 
+template <typename Element, size_t ElementCount> struct Repeated { + static constexpr size_t kSize = ElementCount * Element::kSize; + + static void Copy(char *__restrict dst, const char *__restrict src) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + Element::Copy(dst + offset, src + offset); + } + } + + static bool Equals(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + if (!Element::Equals(lhs + offset, rhs + offset)) + return false; + } + return true; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + // We make the assumption that 'Equals' is cheaper than 'ThreeWayCompare'. + if (Element::Equals(lhs + offset, rhs + offset)) + continue; + return Element::ThreeWayCompare(lhs + offset, rhs + offset); + } + return 0; + } + + static void SplatSet(char *dst, const unsigned char value) { + for (size_t i = 0; i < ElementCount; ++i) { + const size_t offset = i * Element::kSize; + Element::SplatSet(dst + offset, value); + } + } +}; + +// Chain the operation of several types. +// For instance, to handle a 3 bytes operation, one can use: +// Chained<UINT16, UINT8>::Operation(); +template <typename... Types> struct Chained; + +template <typename Head, typename...
Tail> struct Chained<Head, Tail...> { + static constexpr size_t kSize = Head::kSize + Chained<Tail...>::kSize; + + static void Copy(char *__restrict dst, const char *__restrict src) { + Chained<Tail...>::Copy(dst + Head::kSize, src + Head::kSize); + __llvm_libc::Copy<Head>(dst, src); + } + + static bool Equals(const char *lhs, const char *rhs) { + if (!__llvm_libc::Equals<Head>(lhs, rhs)) + return false; + return Chained<Tail...>::Equals(lhs + Head::kSize, rhs + Head::kSize); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + if (__llvm_libc::Equals<Head>(lhs, rhs)) + return Chained<Tail...>::ThreeWayCompare(lhs + Head::kSize, + rhs + Head::kSize); + return __llvm_libc::ThreeWayCompare<Head>(lhs, rhs); + } + + static void SplatSet(char *dst, const unsigned char value) { + Chained<Tail...>::SplatSet(dst + Head::kSize, value); + __llvm_libc::SplatSet<Head>(dst, value); + } +}; + +template <> struct Chained<> { + static constexpr size_t kSize = 0; + static void Copy(char *__restrict dst, const char *__restrict src) {} + static bool Equals(const char *lhs, const char *rhs) { return true; } + static int ThreeWayCompare(const char *lhs, const char *rhs) { return 0; } + static void SplatSet(char *dst, const unsigned char value) {} +}; + +// Runtime-size Higher-Order Operations +// ------------------------------------ +// - Tail<T>: Perform the operation on the last 'T::kSize' bytes of the buffer. +// - HeadTail<T>: Perform the operation on the first and last 'T::kSize' bytes +// of the buffer. +// - Loop<T>: Perform a loop of fixed-sized operations. + +// Perform the operation on the last 'T::kSize' bytes of the buffer. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::kSize`. 
+template <typename T> struct Tail { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + return T::Copy(dst + offset(size), src + offset(size)); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + return T::Equals(lhs + offset(size), rhs + offset(size)); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + return T::ThreeWayCompare(lhs + offset(size), rhs + offset(size)); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + return T::SplatSet(dst + offset(size), value); + } + + static size_t offset(size_t size) { return size - T::kSize; } +}; + +// Perform the operation on the first and last 'T::kSize' bytes of the buffer. +// This is useful for overlapping operations. +// +// e.g. with +// [1234567812345678123] +// [__XXXXXXXXXXXXXX___] +// [__XXXXXXXX_________] +// [________XXXXXXXX___] +// +// Precondition: `size >= T::kSize && size <= 2 x T::kSize`. +template <typename T> struct HeadTail { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + T::Copy(dst, src); + Tail<T>::Copy(dst, src, size); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + if (!T::Equals(lhs, rhs)) + return false; + return Tail<T>::Equals(lhs, rhs, size); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + if (const int result = T::ThreeWayCompare(lhs, rhs)) + return result; + return Tail<T>::ThreeWayCompare(lhs, rhs, size); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + T::SplatSet(dst, value); + Tail<T>::SplatSet(dst, value, size); + } +}; + +// Simple loop ending with a Tail operation. +// +// e.g. 
with +// [12345678123456781234567812345678] +// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] +// [__XXXXXXXX_______________________] +// [__________XXXXXXXX_______________] +// [__________________XXXXXXXX_______] +// [______________________XXXXXXXX___] +// +// Precondition: +// - size >= T::kSize +template <typename T> struct Loop { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + T::Copy(dst + offset, src + offset); + Tail<T>::Copy(dst, src, size); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + if (!T::Equals(lhs + offset, rhs + offset)) + return false; + return Tail<T>::Equals(lhs, rhs, size); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + if (const int result = T::ThreeWayCompare(lhs + offset, rhs + offset)) + return result; + return Tail<T>::ThreeWayCompare(lhs, rhs, size); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + for (size_t offset = 0; offset < size - T::kSize; offset += T::kSize) + T::SplatSet(dst + offset, value); + Tail<T>::SplatSet(dst, value, size); + } +}; + +enum class Arg { _1, _2, Dst = _1, Src = _2, Lhs = _1, Rhs = _2 }; + +namespace internal { + +// Provides a specialized Bump function that adjusts pointers and size so first +// argument (resp. second argument) gets aligned to Alignment. +// We make sure the compiler knows about the adjusted pointer alignment. 
+template <Arg arg, size_t Alignment> struct AlignHelper {}; + +template <size_t Alignment> struct AlignHelper<Arg::_1, Alignment> { + template <typename T1, typename T2> + static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) { + const intptr_t offset = offset_to_next_aligned<Alignment>(p1ref); + p1ref += offset; + p2ref += offset; + size -= offset; + p1ref = assume_aligned<Alignment>(p1ref); + } +}; + +template <size_t Alignment> struct AlignHelper<Arg::_2, Alignment> { + template <typename T1, typename T2> + static void Bump(T1 *__restrict &p1ref, T2 *__restrict &p2ref, size_t &size) { + const intptr_t offset = offset_to_next_aligned<Alignment>(p2ref); + p1ref += offset; + p2ref += offset; + size -= offset; + p2ref = assume_aligned<Alignment>(p2ref); + } +}; + +} // namespace internal + +// An alignment operation that: +// - executes the 'AlignmentT' operation +// - bumps 'dst' or 'src' (resp. 'lhs' or 'rhs') pointers so that the selected +// pointer gets aligned, size is decreased accordingly. +// - calls the 'NextT' operation. +// +// e.g. 
A 16-byte Destination Aligned 32-byte Loop Copy can be written as: +// Copy<Align<_16, Arg::Dst>::Then<Loop<_32>>>(dst, src, count); +template <typename AlignmentT, Arg AlignOn> struct Align { +private: + static constexpr size_t Alignment = AlignmentT::kSize; + static_assert(Alignment > 1, "Alignment must be more than 1"); + static_assert(is_power2(Alignment), "Alignment must be a power of 2"); + +public: + template <typename NextT> struct Then { + static void Copy(char *__restrict dst, const char *__restrict src, + size_t size) { + AlignmentT::Copy(dst, src); + internal::AlignHelper<AlignOn, Alignment>::Bump(dst, src, size); + NextT::Copy(dst, src, size); + } + + static bool Equals(const char *lhs, const char *rhs, size_t size) { + if (!AlignmentT::Equals(lhs, rhs)) + return false; + internal::AlignHelper<AlignOn, Alignment>::Bump(lhs, rhs, size); + return NextT::Equals(lhs, rhs, size); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs, size_t size) { + if (const int result = AlignmentT::ThreeWayCompare(lhs, rhs)) + return result; + internal::AlignHelper<AlignOn, Alignment>::Bump(lhs, rhs, size); + return NextT::ThreeWayCompare(lhs, rhs, size); + } + + static void SplatSet(char *dst, const unsigned char value, size_t size) { + AlignmentT::SplatSet(dst, value); + char *dummy = nullptr; + internal::AlignHelper<Arg::_1, Alignment>::Bump(dst, dummy, size); + NextT::SplatSet(dst, value, size); + } + }; +}; + +// Fixed-size Builtin Operations +// ----------------------------- +// Note: Do not use 'builtin' right now as it requires the implementation of the +// `_inline` versions of all the builtins. Theoretically, Clang can still turn +// them into calls to the C library leading to reentrancy problems. 
+namespace builtin { +template <size_t Size> struct Builtin { + static constexpr size_t kSize = Size; + + static void Copy(char *__restrict dst, const char *__restrict src) { + __builtin_memcpy_inline(dst, src, kSize); + } + + static bool Equals(const char *lhs, const char *rhs) { + return __builtin_memcmp(lhs, rhs, kSize) == 0; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + return __builtin_memcmp(lhs, rhs, kSize); + } + + static void SplatSet(char *dst, const unsigned char value) { + __builtin_memset(dst, value, kSize); + } +}; + +using _1 = Builtin<1>; +using _2 = Builtin<2>; +using _3 = Builtin<3>; +using _4 = Builtin<4>; +using _8 = Builtin<8>; +using _16 = Builtin<16>; +using _32 = Builtin<32>; +using _64 = Builtin<64>; +using _128 = Builtin<128>; + +} // namespace builtin + +// Fixed-size Scalar Operations +// ---------------------------- +namespace scalar { + +// The Scalar type makes use of simple sized integers. +template <typename T> struct Scalar { + static constexpr size_t kSize = sizeof(T); + + static void Copy(char *__restrict dst, const char *__restrict src) { + Store(dst, Load(src)); + } + + static bool Equals(const char *lhs, const char *rhs) { + return Load(lhs) == Load(rhs); + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + return ScalarThreeWayCompare(Load(lhs), Load(rhs)); + } + + static void SplatSet(char *dst, const unsigned char value) { + Store(dst, GetSplattedValue(value)); + } + +private: + static T Load(const char *ptr) { + T value; + __builtin_memcpy_inline(&value, ptr, kSize); + return value; + } + static void Store(char *ptr, T value) { + __builtin_memcpy_inline(ptr, &value, kSize); + } + static T GetSplattedValue(const unsigned char value) { + return T(~0) / T(0xFF) * T(value); + } + static int ScalarThreeWayCompare(T a, T b); +}; + +template <> +inline int Scalar<uint8_t>::ScalarThreeWayCompare(uint8_t a, uint8_t b) { + const int16_t la = Endian::ToBigEndian(a); + const int16_t lb = 
Endian::ToBigEndian(b); + return la - lb; +} +template <> +inline int Scalar<uint16_t>::ScalarThreeWayCompare(uint16_t a, uint16_t b) { + const int32_t la = Endian::ToBigEndian(a); + const int32_t lb = Endian::ToBigEndian(b); + return la - lb; +} +template <> +inline int Scalar<uint32_t>::ScalarThreeWayCompare(uint32_t a, uint32_t b) { + const int64_t la = Endian::ToBigEndian(a); + const int64_t lb = Endian::ToBigEndian(b); + if (la < lb) + return -1; + if (la > lb) + return 1; + return 0; +} +template <> +inline int Scalar<uint64_t>::ScalarThreeWayCompare(uint64_t a, uint64_t b) { + const __int128_t la = Endian::ToBigEndian(a); + const __int128_t lb = Endian::ToBigEndian(b); + if (la < lb) + return -1; + if (la > lb) + return 1; + return 0; +} + +using UINT8 = Scalar<uint8_t>; // 1 Byte +using UINT16 = Scalar<uint16_t>; // 2 Bytes +using UINT32 = Scalar<uint32_t>; // 4 Bytes +using UINT64 = Scalar<uint64_t>; // 8 Bytes + +using _1 = UINT8; +using _2 = UINT16; +using _3 = Chained<UINT16, UINT8>; +using _4 = UINT32; +using _8 = UINT64; +using _16 = Repeated<_8, 2>; +using _32 = Repeated<_8, 4>; +using _64 = Repeated<_8, 8>; +using _128 = Repeated<_8, 16>; + +} // namespace scalar +} // namespace __llvm_libc + +#include <src/string/memory_utils/elements_x86.h> + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_H diff --git a/libc/src/string/memory_utils/elements_x86.h b/libc/src/string/memory_utils/elements_x86.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/elements_x86.h @@ -0,0 +1,151 @@ +//===-- Elementary operations for x86 -------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H + +#include <stddef.h> // size_t +#include <stdint.h> // uint8_t, uint16_t, uint32_t, uint64_t + +#ifdef __SSE2__ +#include <immintrin.h> +#endif // __SSE2__ + +#include "src/string/memory_utils/elements.h" // __llvm_libc::scalar + +// Fixed-size Vector Operations +// ---------------------------- + +namespace __llvm_libc { +namespace x86 { + +#ifdef __SSE2__ +template <typename Base> struct Vector : public Base { + static void Copy(char *dst, const char *src) { + Base::Store(dst, Base::Load(src)); + } + + static bool Equals(const char *a, const char *b) { + return Base::NotEqualMask(Base::Load(a), Base::Load(b)) == 0; + } + + static int ThreeWayCompare(const char *a, const char *b) { + const auto mask = Base::NotEqualMask(Base::Load(a), Base::Load(b)); + if (!mask) + return 0; + return CharDiff(a, b, mask); + } + + static void SplatSet(char *dst, const unsigned char value) { + Base::Store(dst, Base::GetSplattedValue(value)); + } + + static int CharDiff(const char *a, const char *b, uint64_t mask) { + const size_t diff_index = __builtin_ctzl(mask); + const int ca = (unsigned char)a[diff_index]; + const int cb = (unsigned char)b[diff_index]; + return ca - cb; + } +}; + +struct M128 { + static constexpr size_t kSize = 16; + using T = char __attribute__((__vector_size__(kSize))); + static uint16_t mask(T value) { return _mm_movemask_epi8(value); } + static uint16_t NotEqualMask(T a, T b) { return mask(a != b); } + static T Load(const char *ptr) { + return _mm_loadu_si128(reinterpret_cast<__m128i_u const *>(ptr)); + } + static void Store(char *ptr, T value) { + return _mm_storeu_si128(reinterpret_cast<__m128i_u *>(ptr), value); + } + static T GetSplattedValue(const char v) { + const T splatted = {v, v, v, v, v, 
v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector128 = Vector<M128>; // 16 Bytes + +#ifdef __AVX2__ +struct M256 { + static constexpr size_t kSize = 32; + using T = char __attribute__((__vector_size__(kSize))); + static uint32_t mask(T value) { return _mm256_movemask_epi8(value); } + static uint32_t NotEqualMask(T a, T b) { return mask(a != b); } + static T Load(const char *ptr) { + return _mm256_loadu_si256(reinterpret_cast<__m256i const *>(ptr)); + } + static void Store(char *ptr, T value) { + return _mm256_storeu_si256(reinterpret_cast<__m256i *>(ptr), value); + } + static T GetSplattedValue(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; + +using Vector256 = Vector<M256>; // 32 Bytes + +#if defined(__AVX512F__) and defined(__AVX512BW__) +struct M512 { + static constexpr size_t kSize = 64; + using T = char __attribute__((__vector_size__(kSize))); + static uint64_t NotEqualMask(T a, T b) { + return _mm512_cmpneq_epi8_mask(a, b); + } + static T Load(const char *ptr) { return _mm512_loadu_epi8(ptr); } + static void Store(char *ptr, T value) { + return _mm512_storeu_epi8(ptr, value); + } + static T GetSplattedValue(const char v) { + const T splatted = {v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, + v, v, v, v, v, v, v, v, v, v, v, v, v, v, v, v}; + return splatted; + } +}; +using Vector512 = Vector<M512>; + +#endif // defined(__AVX512F__) and defined(__AVX512BW__) +#endif // __AVX2__ +#endif // __SSE2__ + +using _1 = __llvm_libc::scalar::_1; +using _2 = __llvm_libc::scalar::_2; +using _3 = __llvm_libc::scalar::_3; +using _4 = __llvm_libc::scalar::_4; +using _8 = __llvm_libc::scalar::_8; +#if defined(__AVX512F__) && defined(__AVX512BW__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::x86::Vector256; 
+using _64 = __llvm_libc::x86::Vector512; +using _128 = __llvm_libc::Repeated<_64, 2>; +#elif defined(__AVX2__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::x86::Vector256; +using _64 = __llvm_libc::Repeated<_32, 2>; +using _128 = __llvm_libc::Repeated<_32, 4>; +#elif defined(__SSE2__) +using _16 = __llvm_libc::x86::Vector128; +using _32 = __llvm_libc::Repeated<_16, 2>; +using _64 = __llvm_libc::Repeated<_16, 4>; +using _128 = __llvm_libc::Repeated<_16, 8>; +#else +using _16 = __llvm_libc::Repeated<_8, 2>; +using _32 = __llvm_libc::Repeated<_8, 4>; +using _64 = __llvm_libc::Repeated<_8, 8>; +using _128 = __llvm_libc::Repeated<_8, 16>; +#endif + +} // namespace x86 +} // namespace __llvm_libc + +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_ELEMENTS_X86_H diff --git a/libc/src/string/memory_utils/memcpy_utils.h b/libc/src/string/memory_utils/memcpy_utils.h deleted file mode 100644 --- a/libc/src/string/memory_utils/memcpy_utils.h +++ /dev/null @@ -1,140 +0,0 @@ -//===-- Memcpy utils --------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H - -#include "src/__support/sanitizer.h" -#include "src/string/memory_utils/utils.h" -#include <stddef.h> // size_t - -// __builtin_memcpy_inline guarantees to never call external functions. -// Unfortunately it is not widely available. -#ifdef __clang__ -#if __has_builtin(__builtin_memcpy_inline) -#define USE_BUILTIN_MEMCPY_INLINE -#endif -#elif defined(__GNUC__) -#define USE_BUILTIN_MEMCPY -#endif - -namespace __llvm_libc { - -// This is useful for testing. 
-#if defined(LLVM_LIBC_MEMCPY_MONITOR) -extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict, - const char *__restrict, size_t); -#endif - -// Copies `kBlockSize` bytes from `src` to `dst` using a for loop. -// This code requires the use of `-fno-buitin-memcpy` to prevent the compiler -// from turning the for-loop back into `__builtin_memcpy`. -template <size_t kBlockSize> -static void ForLoopCopy(char *__restrict dst, const char *__restrict src) { - for (size_t i = 0; i < kBlockSize; ++i) - dst[i] = src[i]; -} - -// Copies `kBlockSize` bytes from `src` to `dst`. -template <size_t kBlockSize> -static void CopyBlock(char *__restrict dst, const char *__restrict src) { -#if defined(LLVM_LIBC_MEMCPY_MONITOR) - LLVM_LIBC_MEMCPY_MONITOR(dst, src, kBlockSize); -#elif LLVM_LIBC_HAVE_MEMORY_SANITIZER || LLVM_LIBC_HAVE_ADDRESS_SANITIZER - ForLoopCopy<kBlockSize>(dst, src); -#elif defined(USE_BUILTIN_MEMCPY_INLINE) - __builtin_memcpy_inline(dst, src, kBlockSize); -#elif defined(USE_BUILTIN_MEMCPY) - __builtin_memcpy(dst, src, kBlockSize); -#else - ForLoopCopy<kBlockSize>(dst, src); -#endif -} - -// Copies `kBlockSize` bytes from `src + count - kBlockSize` to -// `dst + count - kBlockSize`. -// Precondition: `count >= kBlockSize`. -template <size_t kBlockSize> -static void CopyLastBlock(char *__restrict dst, const char *__restrict src, - size_t count) { - const size_t offset = count - kBlockSize; - CopyBlock<kBlockSize>(dst + offset, src + offset); -} - -// Copies `kBlockSize` bytes twice with an overlap between the two. -// -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [__XXXXXXXX_________] -// [________XXXXXXXX___] -// -// Precondition: `count >= kBlockSize && count <= kBlockSize`. -template <size_t kBlockSize> -static void CopyBlockOverlap(char *__restrict dst, const char *__restrict src, - size_t count) { - CopyBlock<kBlockSize>(dst, src); - CopyLastBlock<kBlockSize>(dst, src, count); -} - -// Copies `count` bytes by blocks of `kBlockSize` bytes. 
-// Copies at the start and end of the buffer are unaligned. -// Copies in the middle of the buffer are aligned to `kAlignment`. -// -// e.g. with -// [12345678123456781234567812345678] -// [__XXXXXXXXXXXXXXXXXXXXXXXXXXXX___] -// [__XXXX___________________________] -// [_____XXXXXXXX____________________] -// [_____________XXXXXXXX____________] -// [_____________________XXXXXXXX____] -// [______________________XXXXXXXX___] -// -// Precondition: `kAlignment <= kBlockSize` -// `count > 2 * kBlockSize` for efficiency. -// `count >= kAlignment` for correctness. -template <size_t kBlockSize, size_t kAlignment = kBlockSize> -static void CopySrcAlignedBlocks(char *__restrict dst, - const char *__restrict src, size_t count) { - static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); - static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); - static_assert(kAlignment <= kBlockSize, - "kAlignment must be less or equal to block size"); - CopyBlock<kAlignment>(dst, src); // Copy first block - - // Copy aligned blocks - const size_t ofla = offset_from_last_aligned<kAlignment>(src); - const size_t limit = count + ofla - kBlockSize; - for (size_t offset = kAlignment; offset < limit; offset += kBlockSize) - CopyBlock<kBlockSize>(dst - ofla + offset, - assume_aligned<kAlignment>(src - ofla + offset)); - - CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block -} - -template <size_t kBlockSize, size_t kAlignment = kBlockSize> -static void CopyDstAlignedBlocks(char *__restrict dst, - const char *__restrict src, size_t count) { - static_assert(is_power2(kAlignment), "kAlignment must be a power of two"); - static_assert(is_power2(kBlockSize), "kBlockSize must be a power of two"); - static_assert(kAlignment <= kBlockSize, - "kAlignment must be less or equal to block size"); - CopyBlock<kAlignment>(dst, src); // Copy first block - - // Copy aligned blocks - const size_t ofla = offset_from_last_aligned<kAlignment>(dst); - const size_t limit = 
count + ofla - kBlockSize; - for (size_t offset = kAlignment; offset < limit; offset += kBlockSize) - CopyBlock<kBlockSize>(assume_aligned<kAlignment>(dst - ofla + offset), - src - ofla + offset); - - CopyLastBlock<kBlockSize>(dst, src, count); // Copy last block -} - -} // namespace __llvm_libc - -#endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMCPY_UTILS_H diff --git a/libc/src/string/memory_utils/memset_utils.h b/libc/src/string/memory_utils/memset_utils.h --- a/libc/src/string/memory_utils/memset_utils.h +++ b/libc/src/string/memory_utils/memset_utils.h @@ -6,70 +6,16 @@ // //===----------------------------------------------------------------------===// -#ifndef LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H -#define LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#include "src/string/memory_utils/elements.h" #include "src/string/memory_utils/utils.h" #include <stddef.h> // size_t namespace __llvm_libc { -// Sets `kBlockSize` bytes starting from `src` to `value`. -template <size_t kBlockSize> static void SetBlock(char *dst, unsigned value) { - // Theoretically the compiler is allowed to call memset here and end up with a - // recursive call, practically it doesn't happen, however this should be - // replaced with a __builtin_memset_inline once it's available in clang. - __builtin_memset(dst, value, kBlockSize); -} - -// Sets `kBlockSize` bytes from `src + count - kBlockSize` to `value`. -// Precondition: `count >= kBlockSize`. -template <size_t kBlockSize> -static void SetLastBlock(char *dst, unsigned value, size_t count) { - SetBlock<kBlockSize>(dst + count - kBlockSize, value); -} - -// Sets `kBlockSize` bytes twice with an overlap between the two. -// -// [1234567812345678123] -// [__XXXXXXXXXXXXXX___] -// [__XXXXXXXX_________] -// [________XXXXXXXX___] -// -// Precondition: `count >= kBlockSize && count <= kBlockSize`. 
-template <size_t kBlockSize> -static void SetBlockOverlap(char *dst, unsigned value, size_t count) { - SetBlock<kBlockSize>(dst, value); - SetLastBlock<kBlockSize>(dst, value, count); -} - -// Sets `count` bytes by blocks of `kBlockSize` bytes. -// Sets at the start and end of the buffer are unaligned. -// Sets in the middle of the buffer are aligned to `kBlockSize`. -// -// e.g. with -// [12345678123456781234567812345678] -// [__XXXXXXXXXXXXXXXXXXXXXXXXXXX___] -// [__XXXXXXXX______________________] -// [________XXXXXXXX________________] -// [________________XXXXXXXX________] -// [_____________________XXXXXXXX___] -// -// Precondition: `count > 2 * kBlockSize` for efficiency. -// `count >= kBlockSize` for correctness. -template <size_t kBlockSize> -static void SetAlignedBlocks(char *dst, unsigned value, size_t count) { - SetBlock<kBlockSize>(dst, value); // Set first block - - // Set aligned blocks - size_t offset = kBlockSize - offset_from_last_aligned<kBlockSize>(dst); - for (; offset + kBlockSize < count; offset += kBlockSize) - SetBlock<kBlockSize>(dst + offset, value); - - SetLastBlock<kBlockSize>(dst, value, count); // Set last block -} - // A general purpose implementation assuming cheap unaligned writes for sizes: // 1, 2, 4, 8, 16, 32 and 64 Bytes. Note that some architecture can't store 32 // or 64 Bytes at a time, the compiler will expand them as needed. 
@@ -106,26 +52,27 @@ if (count == 0) return; if (count == 1) - return SetBlock<1>(dst, value); + return SplatSet<scalar::_1>(dst, value); if (count == 2) - return SetBlock<2>(dst, value); + return SplatSet<scalar::_2>(dst, value); if (count == 3) - return SetBlock<3>(dst, value); + return SplatSet<scalar::_3>(dst, value); if (count == 4) - return SetBlock<4>(dst, value); + return SplatSet<scalar::_4>(dst, value); if (count <= 8) - return SetBlockOverlap<4>(dst, value, count); + return SplatSet<HeadTail<scalar::_4>>(dst, value, count); if (count <= 16) - return SetBlockOverlap<8>(dst, value, count); + return SplatSet<HeadTail<scalar::_8>>(dst, value, count); if (count <= 32) - return SetBlockOverlap<16>(dst, value, count); + return SplatSet<HeadTail<scalar::_16>>(dst, value, count); if (count <= 64) - return SetBlockOverlap<32>(dst, value, count); + return SplatSet<HeadTail<scalar::_32>>(dst, value, count); if (count <= 128) - return SetBlockOverlap<64>(dst, value, count); - return SetAlignedBlocks<32>(dst, value, count); + return SplatSet<HeadTail<scalar::_64>>(dst, value, count); + return SplatSet<Align<scalar::_32, Arg::Dst>::Then<Loop<scalar::_32>>>( + dst, value, count); } } // namespace __llvm_libc -#endif // LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_MEMSET_UTILS_H diff --git a/libc/src/string/x86_64/memcpy.cpp b/libc/src/string/x86_64/memcpy.cpp --- a/libc/src/string/x86_64/memcpy.cpp +++ b/libc/src/string/x86_64/memcpy.cpp @@ -8,7 +8,7 @@ #include "src/string/memcpy.h" #include "src/__support/common.h" -#include "src/string/memory_utils/memcpy_utils.h" +#include "src/string/memory_utils/elements.h" namespace __llvm_libc { @@ -29,8 +29,11 @@ // Whether target supports AVX instructions. constexpr bool kHasAvx = LLVM_LIBC_IS_DEFINED(__AVX__); -// The chunk size used for the loop copy strategy. -constexpr size_t kLoopCopyBlockSize = kHasAvx ? 
64 : 32; +#ifdef __AVX__ +using LoopBlockSize = __llvm_libc::x86::_64; +#else +using LoopBlockSize = __llvm_libc::x86::_32; +#endif static void CopyRepMovsb(char *__restrict dst, const char *__restrict src, size_t count) { @@ -61,33 +64,37 @@ // with little change on the code side. static void memcpy_x86(char *__restrict dst, const char *__restrict src, size_t count) { + // Use x86 strategies (_1, _2, _3 ...) + using namespace __llvm_libc::x86; + if (kUseOnlyRepMovsb) return CopyRepMovsb(dst, src, count); if (count == 0) return; if (count == 1) - return CopyBlock<1>(dst, src); + return Copy<_1>(dst, src); if (count == 2) - return CopyBlock<2>(dst, src); + return Copy<_2>(dst, src); if (count == 3) - return CopyBlock<3>(dst, src); + return Copy<_3>(dst, src); if (count == 4) - return CopyBlock<4>(dst, src); + return Copy<_4>(dst, src); if (count < 8) - return CopyBlockOverlap<4>(dst, src, count); + return Copy<HeadTail<_4>>(dst, src, count); if (count < 16) - return CopyBlockOverlap<8>(dst, src, count); + return Copy<HeadTail<_8>>(dst, src, count); if (count < 32) - return CopyBlockOverlap<16>(dst, src, count); + return Copy<HeadTail<_16>>(dst, src, count); if (count < 64) - return CopyBlockOverlap<32>(dst, src, count); + return Copy<HeadTail<_32>>(dst, src, count); if (count < 128) - return CopyBlockOverlap<64>(dst, src, count); + return Copy<HeadTail<_64>>(dst, src, count); if (kHasAvx && count < 256) - return CopyBlockOverlap<128>(dst, src, count); + return Copy<HeadTail<_128>>(dst, src, count); if (count <= kRepMovsBSize) - return CopyDstAlignedBlocks<kLoopCopyBlockSize, 32>(dst, src, count); + return Copy<Align<_32, Arg::Dst>::Then<Loop<LoopBlockSize>>>(dst, src, + count); return CopyRepMovsb(dst, src, count); } diff --git a/libc/test/src/string/memory_utils/CMakeLists.txt b/libc/test/src/string/memory_utils/CMakeLists.txt --- a/libc/test/src/string/memory_utils/CMakeLists.txt +++ b/libc/test/src/string/memory_utils/CMakeLists.txt @@ -3,11 +3,14 @@ SUITE 
libc_string_unittests SRCS + elements_test.cpp + memory_access_test.cpp utils_test.cpp - memcpy_utils_test.cpp DEPENDS libc.src.string.memory_utils.memory_utils libc.utils.CPP.standalone_cpp + COMPILE_OPTIONS + ${LIBC_COMPILE_OPTIONS_NATIVE} ) target_compile_definitions( diff --git a/libc/test/src/string/memory_utils/elements_test.cpp b/libc/test/src/string/memory_utils/elements_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memory_utils/elements_test.cpp @@ -0,0 +1,103 @@ +//===-- Unittests for memory_utils ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/string/memory_utils/elements.h" +#include "utils/CPP/Array.h" +#include "utils/UnitTest/Test.h" + +namespace __llvm_libc { + +// Registering Types +using FixedSizeTypes = testing::TypeList< +#ifdef __SSE2__ + x86::Vector128, // +#endif // __SSE2__ +#ifdef __AVX2__ + x86::Vector256, // +#endif // __AVX2__ +#if defined(__AVX512F__) and defined(__AVX512BW__) + x86::Vector512, // +#endif // defined(__AVX512F__) and defined(__AVX512BW__) + scalar::UINT8, // + scalar::UINT16, // + scalar::UINT32, // + scalar::UINT64, // + Repeated<scalar::UINT64, 2>, // + Repeated<scalar::UINT64, 4>, // + Repeated<scalar::UINT64, 8>, // + Repeated<scalar::UINT64, 16>, // + Repeated<scalar::UINT64, 32>, // + Chained<scalar::UINT16, scalar::UINT8>, // + Chained<scalar::UINT32, scalar::UINT16, scalar::UINT8>, // + builtin::_1, // + builtin::_2, // + builtin::_3, // + builtin::_4, // + builtin::_8 // + >; + +char GetRandomChar() { + static constexpr const uint64_t a = 1103515245; + static constexpr const uint64_t c = 12345; + static constexpr const uint64_t m = 1ULL << 31; + static uint64_t seed = 123456789; + 
seed = (a * seed + c) % m; + return seed; +} + +template <typename Element> using Buffer = cpp::Array<char, Element::kSize>; +template <typename Element> Buffer<Element> GetRandomBuffer() { + Buffer<Element> buffer; + for (auto &current : buffer) + current = GetRandomChar(); + return buffer; +} + +TYPED_TEST(LlvmLibcMemoryElements, Copy, FixedSizeTypes) { + Buffer<ParamType> Dst; + const auto buffer = GetRandomBuffer<ParamType>(); + Copy<ParamType>(Dst.data(), buffer.data()); + for (size_t i = 0; i < ParamType::kSize; ++i) + EXPECT_EQ(Dst[i], buffer[i]); +} + +TYPED_TEST(LlvmLibcMemoryElements, Equals, FixedSizeTypes) { + const auto buffer = GetRandomBuffer<ParamType>(); + EXPECT_TRUE(Equals<ParamType>(buffer.data(), buffer.data())); +} + +TYPED_TEST(LlvmLibcMemoryElements, ThreeWayCompare, FixedSizeTypes) { + Buffer<ParamType> initial; + for (auto &c : initial) + c = 5; + + // Testing equality + EXPECT_EQ(ThreeWayCompare<ParamType>(initial.data(), initial.data()), 0); + + // Testing all mismatching positions + for (size_t i = 0; i < ParamType::kSize; ++i) { + auto copy = initial; + ++copy[i]; // Copy is now lexicographically greater than initial + const auto *less = initial.data(); + const auto *greater = copy.data(); + EXPECT_LT(ThreeWayCompare<ParamType>(less, greater), 0); + EXPECT_GT(ThreeWayCompare<ParamType>(greater, less), 0); + } +} + +TYPED_TEST(LlvmLibcMemoryElements, Splat, FixedSizeTypes) { + Buffer<ParamType> Dst; + const cpp::Array<char, 3> values = {char(0x00), char(0x7F), char(0xFF)}; + for (char value : values) { + SplatSet<ParamType>(Dst.data(), value); + for (size_t i = 0; i < ParamType::kSize; ++i) + EXPECT_EQ(Dst[i], value); + } +} + +} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp b/libc/test/src/string/memory_utils/memcpy_utils_test.cpp deleted file mode 100644 --- a/libc/test/src/string/memory_utils/memcpy_utils_test.cpp +++ /dev/null @@ -1,336 +0,0 @@ -//===-- Unittests for memory_utils 
----------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "src/string/memory_utils/memcpy_utils.h" -#include "utils/CPP/Array.h" -#include "utils/UnitTest/Test.h" - -#include <assert.h> -#include <stdint.h> // uintptr_t - -#ifndef LLVM_LIBC_MEMCPY_MONITOR -#error LLVM_LIBC_MEMCPY_MONITOR must be defined for this test. -#endif - -namespace __llvm_libc { - -struct Buffer { - static constexpr size_t kMaxBuffer = 1024; - char buffer[kMaxBuffer + 1]; - size_t last = 0; - - void Clear() { - last = 0; - for (size_t i = 0; i < kMaxBuffer; ++i) - buffer[i] = '0'; - buffer[kMaxBuffer] = '\0'; - } - - void Increment(const void *ptr) { - const auto offset = reinterpret_cast<uintptr_t>(ptr); - assert(offset < kMaxBuffer); - ++buffer[offset]; - if (offset > last) - last = offset; - } - - char *Finish() { - assert(last < kMaxBuffer); - buffer[last + 1] = '\0'; - return buffer; - } -}; - -struct Trace { - Buffer read; - Buffer write; - - void Add(char *__restrict dst, const char *__restrict src, size_t count) { - for (size_t i = 0; i < count; ++i) - read.Increment(src + i); - for (size_t i = 0; i < count; ++i) - write.Increment(dst + i); - } - - void Clear() { - read.Clear(); - write.Clear(); - } - - char *Read() { return read.Finish(); } - char *Write() { return write.Finish(); } -}; - -static Trace &GetTrace() { - static thread_local Trace events; - return events; -} - -extern "C" void LLVM_LIBC_MEMCPY_MONITOR(char *__restrict dst, - const char *__restrict src, - size_t count) { - GetTrace().Add(dst, src, count); -} - -char *I(uintptr_t offset) { return reinterpret_cast<char *>(offset); } - -TEST(LlvmLibcMemcpyUtilsTest, CopyTrivial) { - auto &trace = GetTrace(); - - trace.Clear(); - 
CopyBlock<1>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "1"); - EXPECT_STREQ(trace.Read(), "1"); - - trace.Clear(); - CopyBlock<2>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "11"); - EXPECT_STREQ(trace.Read(), "11"); - - trace.Clear(); - CopyBlock<4>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "1111"); - EXPECT_STREQ(trace.Read(), "1111"); - - trace.Clear(); - CopyBlock<8>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "11111111"); - EXPECT_STREQ(trace.Read(), "11111111"); - - trace.Clear(); - CopyBlock<16>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "1111111111111111"); - EXPECT_STREQ(trace.Read(), "1111111111111111"); - - trace.Clear(); - CopyBlock<32>(I(0), I(0)); - EXPECT_STREQ(trace.Write(), "11111111111111111111111111111111"); - EXPECT_STREQ(trace.Read(), "11111111111111111111111111111111"); - - trace.Clear(); - CopyBlock<64>(I(0), I(0)); - EXPECT_STREQ( - trace.Write(), - "1111111111111111111111111111111111111111111111111111111111111111"); - EXPECT_STREQ( - trace.Read(), - "1111111111111111111111111111111111111111111111111111111111111111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyOffset) { - auto &trace = GetTrace(); - - trace.Clear(); - CopyBlock<1>(I(3), I(1)); - EXPECT_STREQ(trace.Write(), "0001"); - EXPECT_STREQ(trace.Read(), "01"); - - trace.Clear(); - CopyBlock<1>(I(2), I(1)); - EXPECT_STREQ(trace.Write(), "001"); - EXPECT_STREQ(trace.Read(), "01"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyBlockOverlap) { - auto &trace = GetTrace(); - - trace.Clear(); - CopyBlockOverlap<2>(I(0), I(0), 2); - EXPECT_STREQ(trace.Write(), "22"); - EXPECT_STREQ(trace.Read(), "22"); - - trace.Clear(); - CopyBlockOverlap<2>(I(0), I(0), 3); - EXPECT_STREQ(trace.Write(), "121"); - EXPECT_STREQ(trace.Read(), "121"); - - trace.Clear(); - CopyBlockOverlap<2>(I(0), I(0), 4); - EXPECT_STREQ(trace.Write(), "1111"); - EXPECT_STREQ(trace.Read(), "1111"); - - trace.Clear(); - CopyBlockOverlap<4>(I(2), I(1), 7); - EXPECT_STREQ(trace.Write(), "001112111"); - EXPECT_STREQ(trace.Read(), 
"01112111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopySrcAlignedBlocks) { - auto &trace = GetTrace(); - // Source is aligned and multiple of alignment. - // "1111" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(0), 4); - EXPECT_STREQ(trace.Write(), "2222"); - EXPECT_STREQ(trace.Read(), "2222"); - - // Source is aligned and multiple of alignment. - // "11110000" - // + "00001111" - // = "11111111" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "11111111"); - EXPECT_STREQ(trace.Read(), "11111111"); - - // Source is aligned already overlap at end. - // "1111000000000" - // + "0000111100000" - // + "0000000011110" - // + "0000000001111" - // = "1111111112221" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(0), 13); - EXPECT_STREQ(trace.Write(), "1111111112221"); - EXPECT_STREQ(trace.Read(), "1111111112221"); - - // Misaligned source. - // "01111000000000" - // + "00001111000000" - // + "00000000111100" - // + "00000000001111" - // = "01112111112211" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(1), 13); - EXPECT_STREQ(trace.Write(), "1112111112211"); - EXPECT_STREQ(trace.Read(), "01112111112211"); - - // Misaligned source aligned at end. - // "011110000000" - // + "000011110000" - // + "000000001111" - // = "011121111111" - trace.Clear(); - CopySrcAlignedBlocks<4>(I(0), I(1), 11); - EXPECT_STREQ(trace.Write(), "11121111111"); - EXPECT_STREQ(trace.Read(), "011121111111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyDstAlignedBlocks) { - auto &trace = GetTrace(); - // Destination is aligned and multiple of alignment. - // "1111" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(0), I(0), 4); - EXPECT_STREQ(trace.Write(), "2222"); - EXPECT_STREQ(trace.Read(), "2222"); - - // Destination is aligned and multiple of alignment. 
- // "11110000" - // + "00001111" - // = "11111111" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "11111111"); - EXPECT_STREQ(trace.Read(), "11111111"); - - // Destination is aligned already overlap at end. - // "1111000000000" - // + "0000111100000" - // + "0000000011110" - // + "0000000001111" - // = "1111111112221" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(0), I(0), 13); - EXPECT_STREQ(trace.Write(), "1111111112221"); - EXPECT_STREQ(trace.Read(), "1111111112221"); - - // Misaligned destination. - // "01111000000000" - // + "00001111000000" - // + "00000000111100" - // + "00000000001111" - // = "01112111112211" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(1), I(0), 13); - EXPECT_STREQ(trace.Write(), "01112111112211"); - EXPECT_STREQ(trace.Read(), "1112111112211"); - - // Misaligned destination aligned at end. - // "011110000000" - // + "000011110000" - // + "000000001111" - // = "011121111111" - trace.Clear(); - CopyDstAlignedBlocks<4>(I(1), I(0), 11); - EXPECT_STREQ(trace.Write(), "011121111111"); - EXPECT_STREQ(trace.Read(), "11121111111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignment) { - auto &trace = GetTrace(); - // Source is aligned and multiple of alignment. - // "11111111" - trace.Clear(); - CopySrcAlignedBlocks<8, 4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "22221111"); - EXPECT_STREQ(trace.Read(), "22221111"); - - // Destination is aligned and multiple of alignment. - // "11111111" - trace.Clear(); - CopyDstAlignedBlocks<8, 4>(I(0), I(0), 8); - EXPECT_STREQ(trace.Write(), "22221111"); - EXPECT_STREQ(trace.Read(), "22221111"); - - // Source is aligned and multiple of alignment. - // "111111111" - trace.Clear(); - CopySrcAlignedBlocks<8, 4>(I(0), I(0), 9); - EXPECT_STREQ(trace.Write(), "122211111"); - EXPECT_STREQ(trace.Read(), "122211111"); - - // Destination is aligned and multiple of alignment. 
- // "111111111" - trace.Clear(); - CopyDstAlignedBlocks<8, 4>(I(0), I(0), 9); - EXPECT_STREQ(trace.Write(), "122211111"); - EXPECT_STREQ(trace.Read(), "122211111"); -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksMaxReloads) { - auto &trace = GetTrace(); - for (size_t alignment = 0; alignment < 32; ++alignment) { - for (size_t count = 64; count < 768; ++count) { - trace.Clear(); - // We should never reload more than twice when copying from count = 2x32. - CopySrcAlignedBlocks<32>(I(alignment), I(0), count); - const char *const written = trace.Write(); - // First bytes are untouched. - for (size_t i = 0; i < alignment; ++i) - EXPECT_EQ(written[i], '0'); - // Next bytes are loaded once or twice but no more. - for (size_t i = alignment; i < count; ++i) { - EXPECT_GE(written[i], '1'); - EXPECT_LE(written[i], '2'); - } - } - } -} - -TEST(LlvmLibcMemcpyUtilsTest, CopyAlignedBlocksWithAlignmentMaxReloads) { - auto &trace = GetTrace(); - for (size_t alignment = 0; alignment < 32; ++alignment) { - for (size_t count = 64; count < 768; ++count) { - trace.Clear(); - // We should never reload more than twice when copying from count = 2x32. - CopySrcAlignedBlocks<32, 16>(I(alignment), I(0), count); - const char *const written = trace.Write(); - // First bytes are untouched. - for (size_t i = 0; i < alignment; ++i) - EXPECT_EQ(written[i], '0'); - // Next bytes are loaded once or twice but no more. - for (size_t i = alignment; i < count; ++i) { - EXPECT_GE(written[i], '1'); - EXPECT_LE(written[i], '2'); - } - } - } -} - -} // namespace __llvm_libc diff --git a/libc/test/src/string/memory_utils/memory_access_test.cpp b/libc/test/src/string/memory_utils/memory_access_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/string/memory_utils/memory_access_test.cpp @@ -0,0 +1,231 @@ +//===-- Unittests for memory_utils ----------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#define LLVM_LIBC_UNITTEST_OBSERVE 1 + +#include "src/string/memory_utils/elements.h" +#include "utils/CPP/Array.h" +#include "utils/CPP/ArrayRef.h" +#include "utils/UnitTest/Test.h" + +#include <stdio.h> +#include <string.h> + +namespace __llvm_libc { + +static constexpr const size_t kMaxBuffer = 32; + +struct BufferAccess : cpp::Array<char, kMaxBuffer + 1> { + BufferAccess() { Reset(); } + void Reset() { + for (auto &value : *this) + value = '0'; + this->operator[](kMaxBuffer) = '\0'; + } + void Touch(ptrdiff_t offset, size_t size) { + if (offset < 0) + return; + for (size_t i = 0; i < size; ++i) + ++(*this)[offset + i]; + } + operator const char *() const { return this->data(); } +}; + +struct Buffer { + ptrdiff_t Offset(const char *ptr) const { + const bool contained = ptr >= data.begin() && ptr < data.end(); + return contained ? 
ptr - data.begin() : -1; + } + void Reset() { + reads.Reset(); + writes.Reset(); + } + cpp::Array<char, kMaxBuffer> data; + BufferAccess __attribute__((aligned(64))) reads; + BufferAccess __attribute__((aligned(64))) writes; +}; + +struct MemoryAccessObserver { + void ObserveRead(const char *ptr, size_t size) { + Buffer1.reads.Touch(Buffer1.Offset(ptr), size); + Buffer2.reads.Touch(Buffer2.Offset(ptr), size); + } + + void ObserveWrite(const char *ptr, size_t size) { + Buffer1.writes.Touch(Buffer1.Offset(ptr), size); + Buffer2.writes.Touch(Buffer2.Offset(ptr), size); + } + + void Reset() { + Buffer1.Reset(); + Buffer2.Reset(); + } + + Buffer Buffer1; + Buffer Buffer2; +}; + +MemoryAccessObserver Observer; + +template <size_t Size> struct TestingElement { + static constexpr size_t kSize = Size; + + static void Copy(char *__restrict dst, const char *__restrict src) { + Observer.ObserveRead(src, kSize); + Observer.ObserveWrite(dst, kSize); + } + + static bool Equals(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, kSize); + Observer.ObserveRead(rhs, kSize); + return true; + } + + static int ThreeWayCompare(const char *lhs, const char *rhs) { + Observer.ObserveRead(lhs, kSize); + Observer.ObserveRead(rhs, kSize); + return 0; + } + + static void SplatSet(char *dst, const unsigned char value) { + Observer.ObserveWrite(dst, kSize); + } +}; + +using Types = testing::TypeList< + TestingElement<1>, // 1 Byte + TestingElement<2>, // 2 Bytes + TestingElement<4>, // 4 Bytes + Repeated<TestingElement<2>, 3>, // 6 Bytes + Chained<TestingElement<4>, TestingElement<2>, TestingElement<1>> // 7 Bytes + >; + +struct LlvmLibcTestAccessBase : public testing::Test { + + template <typename HigherOrder, size_t Size, size_t Offset = 0> + void checkOperations(const BufferAccess &expected) { + static const BufferAccess untouched; + + Observer.Reset(); + HigherOrder::Copy(dst_ptr() + Offset, src_ptr() + Offset, Size); + ASSERT_STREQ(src().writes, untouched); + 
ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(src().reads, expected); + ASSERT_STREQ(dst().writes, expected); + Observer.Reset(); + HigherOrder::Equals(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::ThreeWayCompare(lhs_ptr() + Offset, rhs_ptr() + Offset, Size); + ASSERT_STREQ(lhs().writes, untouched); + ASSERT_STREQ(rhs().writes, untouched); + ASSERT_STREQ(lhs().reads, expected); + ASSERT_STREQ(rhs().reads, expected); + Observer.Reset(); + HigherOrder::SplatSet(dst_ptr() + Offset, 5, Size); + ASSERT_STREQ(src().reads, untouched); + ASSERT_STREQ(src().writes, untouched); + ASSERT_STREQ(dst().reads, untouched); + ASSERT_STREQ(dst().writes, expected); + } + + void checkMaxAccess(const BufferAccess &expected, int max) { + for (size_t i = 0; i < kMaxBuffer; ++i) { + int value = (int)expected[i] - '0'; + if (value < 0 || value > max) { + printf("expected no more than %d access, was '%s'\n", max, + (const char *)expected); + ASSERT_LE(value, max); + } + } + } + +private: + const Buffer &lhs() const { return Observer.Buffer1; } + const Buffer &rhs() const { return Observer.Buffer2; } + const Buffer &src() const { return Observer.Buffer2; } + const Buffer &dst() const { return Observer.Buffer1; } + Buffer &dst() { return Observer.Buffer1; } + + char *dst_ptr() { return dst().data.begin(); } + const char *src_ptr() { return src().data.begin(); } + const char *lhs_ptr() { return lhs().data.begin(); } + const char *rhs_ptr() { return rhs().data.begin(); } +}; + +template <typename ParamType> +struct LlvmLibcTestAccessTail : public LlvmLibcTestAccessBase { + + void TearDown() override { + static constexpr size_t Size = 10; + + BufferAccess expected; + expected.Touch(Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 1); + checkOperations<Tail<ParamType>, 
Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessTail, Operations, Types) {} + +template <typename ParamType> +struct LlvmLibcTestAccessHeadTail : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + + BufferAccess expected; + expected.Touch(0, ParamType::kSize); + expected.Touch(Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 2); + checkOperations<HeadTail<ParamType>, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessHeadTail, Operations, Types) {} + +template <typename ParamType> +struct LlvmLibcTestAccessLoop : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 20; + + BufferAccess expected; + for (size_t i = 0; i < Size - ParamType::kSize; i += ParamType::kSize) + expected.Touch(i, ParamType::kSize); + expected.Touch(Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 2); + checkOperations<Loop<ParamType>, Size>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessLoop, Operations, Types) {} + +template <typename ParamType> +struct LlvmLibcTestAccessAlignedAccess : public LlvmLibcTestAccessBase { + void TearDown() override { + static constexpr size_t Size = 10; + static constexpr size_t Offset = 2; + using AlignmentT = TestingElement<4>; + + BufferAccess expected; + expected.Touch(Offset, AlignmentT::kSize); + expected.Touch(AlignmentT::kSize, ParamType::kSize); + expected.Touch(Offset + Size - ParamType::kSize, ParamType::kSize); + + checkMaxAccess(expected, 3); + checkOperations<Align<AlignmentT, Arg::_1>::Then<HeadTail<ParamType>>, Size, + Offset>(expected); + checkOperations<Align<AlignmentT, Arg::_2>::Then<HeadTail<ParamType>>, Size, + Offset>(expected); + } +}; +TYPED_TEST_F(LlvmLibcTestAccessAlignedAccess, Operations, Types) {} + +} // namespace __llvm_libc