diff --git a/libc/src/__support/macros/properties/architectures.h b/libc/src/__support/macros/properties/architectures.h --- a/libc/src/__support/macros/properties/architectures.h +++ b/libc/src/__support/macros/properties/architectures.h @@ -45,6 +45,10 @@ #define LIBC_TARGET_ARCH_IS_AARCH64 #endif +#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM)) +#define LIBC_TARGET_ARCH_IS_ANY_ARM +#endif + #if defined(__riscv) && (__riscv_xlen == 64) #define LIBC_TARGET_ARCH_IS_RISCV64 #endif @@ -53,8 +57,9 @@ #define LIBC_TARGET_ARCH_IS_RISCV32 #endif -#if (defined(LIBC_TARGET_ARCH_IS_AARCH64) || defined(LIBC_TARGET_ARCH_IS_ARM)) -#define LIBC_TARGET_ARCH_IS_ANY_ARM +#if (defined(LIBC_TARGET_ARCH_IS_RISCV64) || \ + defined(LIBC_TARGET_ARCH_IS_RISCV32)) +#define LIBC_TARGET_ARCH_IS_ANY_RISCV #endif #endif // LLVM_LIBC_SUPPORT_MACROS_PROPERTIES_ARCHITECTURES_H diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -450,6 +450,12 @@ endforeach() endif() + if("${CMAKE_CXX_COMPILER_ID}" MATCHES "GNU") + # Prevent warning when passing x86 SIMD types as template arguments. + # e.g. 
"warning: ignoring attributes on template argument ‘__m128i’ [-Wignored-attributes]" + list(APPEND ADD_IMPL_COMPILE_OPTIONS "-Wno-ignored-attributes") + endif() + add_entrypoint_object(${impl_name} NAME ${name} SRCS ${ADD_IMPL_SRCS} @@ -564,7 +570,7 @@ if(${LIBC_TARGET_ARCHITECTURE_IS_X86}) add_memcpy(memcpy_x86_64_opt_sse2 COMPILE_OPTIONS -march=k8 REQUIRE SSE2) add_memcpy(memcpy_x86_64_opt_sse4 COMPILE_OPTIONS -march=nehalem REQUIRE SSE4_2) - add_memcpy(memcpy_x86_64_opt_avx2 COMPILE_OPTIONS -march=haswell REQUIRE AVX2) + add_memcpy(memcpy_x86_64_opt_avx COMPILE_OPTIONS -march=sandybridge REQUIRE AVX) add_memcpy(memcpy_x86_64_opt_avx512 COMPILE_OPTIONS -march=skylake-avx512 REQUIRE AVX512F) add_memcpy(memcpy_opt_host COMPILE_OPTIONS ${LIBC_COMPILE_OPTIONS_NATIVE}) add_memcpy(memcpy) diff --git a/libc/src/string/memory_utils/CMakeLists.txt b/libc/src/string/memory_utils/CMakeLists.txt --- a/libc/src/string/memory_utils/CMakeLists.txt +++ b/libc/src/string/memory_utils/CMakeLists.txt @@ -24,6 +24,7 @@ libc.src.__support.CPP.type_traits libc.src.__support.macros.config libc.src.__support.macros.optimization + libc.src.__support.macros.properties.architectures ) add_header_library( diff --git a/libc/src/string/memory_utils/aarch64/memcmp_implementations.h b/libc/src/string/memory_utils/aarch64/memcmp_implementations.h --- a/libc/src/string/memory_utils/aarch64/memcmp_implementations.h +++ b/libc/src/string/memory_utils/aarch64/memcmp_implementations.h @@ -19,31 +19,32 @@ [[maybe_unused]] LIBC_INLINE MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { if (LIBC_UNLIKELY(count >= 384)) { - if (auto value = generic::Memcmp<16>::block(p1, p2)) + if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2)) return value; align_to_next_boundary<16, Arg::P1>(p1, p2, count); } - return generic::Memcmp<16>::loop_and_tail(p1, p2, count); + return generic::Memcmp<uint8x16_t>::loop_and_tail(p1, p2, count); } [[maybe_unused]] LIBC_INLINE MemcmpReturnType 
inline_memcmp_aarch64_neon_gt16(CPtr p1, CPtr p2, size_t count) { if (LIBC_UNLIKELY(count >= 128)) { // [128, ∞] - if (auto value = generic::Memcmp<16>::block(p1, p2)) + if (auto value = generic::Memcmp<uint8x16_t>::block(p1, p2)) return value; align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return generic::Memcmp<32>::loop_and_tail(p1, p2, count); + return generic::Memcmp<uint8x16x2_t>::loop_and_tail(p1, p2, count); } - if (generic::Bcmp<16>::block(p1, p2)) // [16, 16] - return generic::Memcmp<16>::block(p1, p2); + if (generic::Bcmp<uint8x16_t>::block(p1, p2)) // [16, 16] + return generic::Memcmp<uint8x16_t>::block(p1, p2); if (count < 32) // [17, 31] - return generic::Memcmp<16>::tail(p1, p2, count); - if (generic::Bcmp<16>::block(p1 + 16, p2 + 16)) // [32, 32] - return generic::Memcmp<16>::block(p1 + 16, p2 + 16); + return generic::Memcmp<uint8x16_t>::tail(p1, p2, count); + if (generic::Bcmp<uint8x16_t>::block(p1 + 16, p2 + 16)) // [32, 32] + return generic::Memcmp<uint8x16_t>::block(p1 + 16, p2 + 16); if (count < 64) // [33, 63] - return generic::Memcmp<32>::tail(p1, p2, count); + return generic::Memcmp<uint8x16x2_t>::tail(p1, p2, count); // [64, 127] - return generic::Memcmp<16>::loop_and_tail(p1 + 32, p2 + 32, count - 32); + return generic::Memcmp<uint8x16_t>::loop_and_tail(p1 + 32, p2 + 32, + count - 32); } LIBC_INLINE MemcmpReturnType inline_memcmp_aarch64(CPtr p1, CPtr p2, @@ -51,15 +52,15 @@ if (count == 0) return MemcmpReturnType::ZERO(); if (count == 1) - return generic::Memcmp<1>::block(p1, p2); + return generic::Memcmp<uint8_t>::block(p1, p2); if (count == 2) - return generic::Memcmp<2>::block(p1, p2); + return generic::Memcmp<uint16_t>::block(p1, p2); if (count == 3) - return generic::Memcmp<3>::block(p1, p2); + return generic::MemcmpSequence<uint16_t, uint8_t>::block(p1, p2); if (count <= 8) - return generic::Memcmp<4>::head_tail(p1, p2, count); + return generic::Memcmp<uint32_t>::head_tail(p1, p2, count); if (count <= 16) - return generic::Memcmp<8>::head_tail(p1, p2, count); + return generic::Memcmp<uint64_t>::head_tail(p1, p2, count); if constexpr (aarch64::kNeon) return 
inline_memcmp_aarch64_neon_gt16(p1, p2, count); else diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -15,6 +15,7 @@ #include "src/string/memory_utils/op_aarch64.h" #include "src/string/memory_utils/op_builtin.h" #include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_riscv.h" #include "src/string/memory_utils/op_x86.h" #include // size_t @@ -22,21 +23,17 @@ namespace __llvm_libc { [[maybe_unused]] LIBC_INLINE BcmpReturnType -inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) { - LIBC_LOOP_NOUNROLL - for (; offset < count; ++offset) - if (p1[offset] != p2[offset]) - return BcmpReturnType::NONZERO(); - return BcmpReturnType::ZERO(); +inline_bcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) { + return generic::Bcmp::loop_and_tail_offset(p1, p2, count, offset); } [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) { constexpr size_t kAlign = sizeof(uint64_t); if (count <= 2 * kAlign) - return inline_bcmp_byte_per_byte(p1, p2, 0, count); + return inline_bcmp_byte_per_byte(p1, p2, count); size_t bytes_to_p1_align = distance_to_align_up(p1); - if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align)) + if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align)) return value; size_t offset = bytes_to_p1_align; size_t p2_alignment = distance_to_align_down(p2 + offset); @@ -55,16 +52,16 @@ if (a != b) return BcmpReturnType::NONZERO(); } - return inline_bcmp_byte_per_byte(p1, p2, offset, count); + return inline_bcmp_byte_per_byte(p1, p2, count, offset); } [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) { constexpr size_t kAlign = sizeof(uint32_t); if (count <= 2 * kAlign) - 
return inline_bcmp_byte_per_byte(p1, p2, 0, count); + return inline_bcmp_byte_per_byte(p1, p2, count); size_t bytes_to_p1_align = distance_to_align_up(p1); - if (auto value = inline_bcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align)) + if (auto value = inline_bcmp_byte_per_byte(p1, p2, bytes_to_p1_align)) return value; size_t offset = bytes_to_p1_align; size_t p2_alignment = distance_to_align_down(p2 + offset); @@ -80,89 +77,82 @@ if (a != b) return BcmpReturnType::NONZERO(); } - return inline_bcmp_byte_per_byte(p1, p2, offset, count); + return inline_bcmp_byte_per_byte(p1, p2, count, offset); } #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64) [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (count < 256) - return generic::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = generic::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - return generic::Bcmp<64>::loop_and_tail(p1, p2, count); + return generic::Bcmp::loop_and_tail_align_above(256, p1, p2, count); } #endif // defined(LIBC_TARGET_ARCH_IS_X86) || // defined(LIBC_TARGET_ARCH_IS_AARCH64) #if defined(LIBC_TARGET_ARCH_IS_X86) +#if defined(__SSE4_1__) [[maybe_unused]] LIBC_INLINE BcmpReturnType -inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { +inline_bcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); - if (count < 256) - return x86::sse2::Bcmp<16>::loop_and_tail(p1, p2, count); - if (auto value = x86::sse2::Bcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count); + return generic::Bcmp<__m128i>::head_tail(p1, p2, count); + return generic::Bcmp<__m128i>::loop_and_tail_align_above(256, p1, p2, count); } +#endif // __SSE4_1__ +#if defined(__AVX__) [[maybe_unused]] LIBC_INLINE BcmpReturnType 
-inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { +inline_bcmp_x86_avx_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); + return generic::Bcmp<__m128i>::head_tail(p1, p2, count); if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Bcmp<64>::head_tail(p1, p2, count); - if (LIBC_UNLIKELY(count >= 256)) { - if (auto value = x86::avx2::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count); + return generic::Bcmp<__m256i>::head_tail(p1, p2, count); + return generic::Bcmp<__m256i>::loop_and_tail_align_above(256, p1, p2, count); } +#endif // __AVX__ +#if defined(__AVX512BW__) [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return x86::sse2::Bcmp<16>::head_tail(p1, p2, count); + return generic::Bcmp<__m128i>::head_tail(p1, p2, count); if (count <= 64) - return x86::avx2::Bcmp<32>::head_tail(p1, p2, count); + return generic::Bcmp<__m256i>::head_tail(p1, p2, count); if (count <= 128) - return x86::avx512bw::Bcmp<64>::head_tail(p1, p2, count); - if (LIBC_UNLIKELY(count >= 256)) { - if (auto value = x86::avx512bw::Bcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count); + return generic::Bcmp<__m512i>::head_tail(p1, p2, count); + return generic::Bcmp<__m512i>::loop_and_tail_align_above(256, p1, p2, count); } +#endif // __AVX512BW__ [[maybe_unused]] LIBC_INLINE BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2, size_t count) { if (count == 0) return BcmpReturnType::ZERO(); if (count == 1) - return generic::Bcmp<1>::block(p1, p2); + return generic::Bcmp::block(p1, p2); if (count == 2) - return generic::Bcmp<2>::block(p1, p2); - if (count <= 4) - return 
generic::Bcmp<2>::head_tail(p1, p2, count); - if (count <= 8) - return generic::Bcmp<4>::head_tail(p1, p2, count); + return generic::Bcmp<uint16_t>::block(p1, p2); + if (count == 3) + return generic::BcmpSequence<uint16_t, uint8_t>::block(p1, p2); + if (count == 4) + return generic::Bcmp<uint32_t>::block(p1, p2); + if (count == 5) + return generic::BcmpSequence<uint32_t, uint8_t>::block(p1, p2); + if (count == 6) + return generic::BcmpSequence<uint32_t, uint16_t>::block(p1, p2); + if (count == 7) + return generic::BcmpSequence<uint32_t, uint16_t, uint8_t>::block(p1, p2); + if (count == 8) + return generic::Bcmp<uint64_t>::block(p1, p2); if (count <= 16) - return generic::Bcmp<8>::head_tail(p1, p2, count); - if constexpr (x86::kAvx512BW) - return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_bcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_bcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_bcmp_generic_gt16(p1, p2, count); + return generic::Bcmp<uint64_t>::head_tail(p1, p2, count); +#if defined(__AVX512BW__) + return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); +#elif defined(__AVX__) + return inline_bcmp_x86_avx_gt16(p1, p2, count); +#elif defined(__SSE4_1__) + return inline_bcmp_x86_sse41_gt16(p1, p2, count); +#else + return inline_bcmp_generic_gt16(p1, p2, count); +#endif } #endif // defined(LIBC_TARGET_ARCH_IS_X86) @@ -178,19 +168,19 @@ case 0: return BcmpReturnType::ZERO(); case 1: - return generic::Bcmp<1>::block(p1, p2); + return generic::Bcmp<uint8_t>::block(p1, p2); case 2: - return generic::Bcmp<2>::block(p1, p2); + return generic::Bcmp<uint16_t>::block(p1, p2); case 3: - return generic::Bcmp<2>::head_tail(p1, p2, count); + return generic::Bcmp<uint16_t>::head_tail(p1, p2, count); case 4: - return generic::Bcmp<4>::block(p1, p2); + return generic::Bcmp<uint32_t>::block(p1, p2); case 5: case 6: case 7: - return generic::Bcmp<4>::head_tail(p1, p2, count); + return generic::Bcmp<uint32_t>::head_tail(p1, p2, count); case 8: - return generic::Bcmp<8>::block(p1, p2); + return generic::Bcmp<uint64_t>::block(p1, p2); case 9: case 10: case 11: @@ -198,7 
+188,7 @@ case 13: case 14: case 15: - return generic::Bcmp<8>::head_tail(p1, p2, count); + return generic::Bcmp::head_tail(p1, p2, count); } } @@ -225,7 +215,7 @@ #elif defined(LIBC_TARGET_ARCH_IS_RISCV32) return inline_bcmp_aligned_access_32bit(p1, p2, count); #else - return inline_bcmp_byte_per_byte(p1, p2, 0, count); + return inline_bcmp_byte_per_byte(p1, p2, count); #endif } diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h --- a/libc/src/string/memory_utils/memcmp_implementations.h +++ b/libc/src/string/memory_utils/memcmp_implementations.h @@ -13,6 +13,7 @@ #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY LIBC_LOOP_NOUNROLL #include "src/__support/macros/properties/architectures.h" #include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_riscv.h" #include "src/string/memory_utils/utils.h" // CPtr MemcmpReturnType #include // size_t @@ -26,21 +27,17 @@ namespace __llvm_libc { [[maybe_unused]] LIBC_INLINE MemcmpReturnType -inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t offset, size_t count) { - LIBC_LOOP_NOUNROLL - for (; offset < count; ++offset) - if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset)) - return value; - return MemcmpReturnType::ZERO(); +inline_memcmp_byte_per_byte(CPtr p1, CPtr p2, size_t count, size_t offset = 0) { + return generic::Memcmp::loop_and_tail_offset(p1, p2, count, offset); } [[maybe_unused]] LIBC_INLINE MemcmpReturnType inline_memcmp_aligned_access_64bit(CPtr p1, CPtr p2, size_t count) { constexpr size_t kAlign = sizeof(uint64_t); if (count <= 2 * kAlign) - return inline_memcmp_byte_per_byte(p1, p2, 0, count); + return inline_memcmp_byte_per_byte(p1, p2, count); size_t bytes_to_p1_align = distance_to_align_up(p1); - if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align)) + if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align)) return value; size_t offset = 
bytes_to_p1_align; size_t p2_alignment = distance_to_align_down(p2 + offset); @@ -56,21 +53,20 @@ b = load64_aligned( p2, offset); uint64_t a = load64_aligned(p1, offset); - if (a != b) { - // TODO use cmp_neq_uint64_t from D148717 once it's submitted. - return Endian::to_big_endian(a) < Endian::to_big_endian(b) ? -1 : 1; - } + if (a != b) + return cmp_neq_uint64_t(Endian::to_big_endian(a), + Endian::to_big_endian(b)); } - return inline_memcmp_byte_per_byte(p1, p2, offset, count); + return inline_memcmp_byte_per_byte(p1, p2, count, offset); } [[maybe_unused]] LIBC_INLINE MemcmpReturnType inline_memcmp_aligned_access_32bit(CPtr p1, CPtr p2, size_t count) { constexpr size_t kAlign = sizeof(uint32_t); if (count <= 2 * kAlign) - return inline_memcmp_byte_per_byte(p1, p2, 0, count); + return inline_memcmp_byte_per_byte(p1, p2, count); size_t bytes_to_p1_align = distance_to_align_up(p1); - if (auto value = inline_memcmp_byte_per_byte(p1, p2, 0, bytes_to_p1_align)) + if (auto value = inline_memcmp_byte_per_byte(p1, p2, bytes_to_p1_align)) return value; size_t offset = bytes_to_p1_align; size_t p2_alignment = distance_to_align_down(p2 + offset); @@ -83,16 +79,10 @@ else b = load32_aligned(p2, offset); uint32_t a = load32_aligned(p1, offset); - if (a != b) { - // TODO use cmp_uint32_t from D148717 once it's submitted. - // We perform the difference as an uint64_t. - const int64_t diff = static_cast(Endian::to_big_endian(a)) - - static_cast(Endian::to_big_endian(b)); - // And reduce the uint64_t into an uint32_t. 
- return static_cast((diff >> 1) | (diff & 0xFFFF)); - } + if (a != b) + return cmp_uint32_t(Endian::to_big_endian(a), Endian::to_big_endian(b)); } - return inline_memcmp_byte_per_byte(p1, p2, offset, count); + return inline_memcmp_byte_per_byte(p1, p2, count, offset); } LIBC_INLINE MemcmpReturnType inline_memcmp(CPtr p1, CPtr p2, size_t count) { @@ -105,7 +95,7 @@ #elif defined(LIBC_TARGET_ARCH_IS_RISCV32) return inline_memcmp_aligned_access_32bit(p1, p2, count); #else - return inline_memcmp_byte_per_byte(p1, p2, 0, count); + return inline_memcmp_byte_per_byte(p1, p2, count); #endif } diff --git a/libc/src/string/memory_utils/memmove_implementations.h b/libc/src/string/memory_utils/memmove_implementations.h --- a/libc/src/string/memory_utils/memmove_implementations.h +++ b/libc/src/string/memory_utils/memmove_implementations.h @@ -38,17 +38,17 @@ #if defined(LIBC_TARGET_ARCH_IS_X86) || defined(LIBC_TARGET_ARCH_IS_AARCH64) #if defined(LIBC_TARGET_ARCH_IS_X86) #if defined(__AVX512F__) - using uint128_t = uint8x16_t; - using uint256_t = uint8x32_t; - using uint512_t = uint8x64_t; + using uint128_t = generic_v128; + using uint256_t = generic_v256; + using uint512_t = generic_v512; #elif defined(__AVX__) - using uint128_t = uint8x16_t; - using uint256_t = uint8x32_t; - using uint512_t = cpp::array; + using uint128_t = generic_v128; + using uint256_t = generic_v256; + using uint512_t = cpp::array; #elif defined(__SSE2__) - using uint128_t = uint8x16_t; - using uint256_t = cpp::array; - using uint512_t = cpp::array; + using uint128_t = generic_v128; + using uint256_t = cpp::array; + using uint512_t = cpp::array; #else using uint128_t = cpp::array; using uint256_t = cpp::array; @@ -56,9 +56,9 @@ #endif #elif defined(LIBC_TARGET_ARCH_IS_AARCH64) static_assert(aarch64::kNeon, "aarch64 supports vector types"); - using uint128_t = uint8x16_t; - using uint256_t = uint8x32_t; - using uint512_t = uint8x64_t; + using uint128_t = generic_v128; + using uint256_t = generic_v256; + 
using uint512_t = generic_v512; #endif if (count == 0) return; diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -60,17 +60,17 @@ [[maybe_unused]] LIBC_INLINE static void inline_memset_x86(Ptr dst, uint8_t value, size_t count) { #if defined(__AVX512F__) - using uint128_t = uint8x16_t; - using uint256_t = uint8x32_t; - using uint512_t = uint8x64_t; + using uint128_t = generic_v128; + using uint256_t = generic_v256; + using uint512_t = generic_v512; #elif defined(__AVX__) - using uint128_t = uint8x16_t; - using uint256_t = uint8x32_t; - using uint512_t = cpp::array; + using uint128_t = generic_v128; + using uint256_t = generic_v256; + using uint512_t = cpp::array; #elif defined(__SSE2__) - using uint128_t = uint8x16_t; - using uint256_t = cpp::array; - using uint512_t = cpp::array; + using uint128_t = generic_v128; + using uint256_t = cpp::array; + using uint512_t = cpp::array; #else using uint128_t = cpp::array; using uint256_t = cpp::array; @@ -106,9 +106,9 @@ [[maybe_unused]] LIBC_INLINE static void inline_memset_aarch64(Ptr dst, uint8_t value, size_t count) { static_assert(aarch64::kNeon, "aarch64 supports vector types"); - using uint128_t = uint8x16_t; - using uint256_t = uint8x32_t; - using uint512_t = uint8x64_t; + using uint128_t = generic_v128; + using uint256_t = generic_v256; + using uint512_t = generic_v512; if (count == 0) return; if (count <= 3) { diff --git a/libc/src/string/memory_utils/op_aarch64.h b/libc/src/string/memory_utils/op_aarch64.h --- a/libc/src/string/memory_utils/op_aarch64.h +++ b/libc/src/string/memory_utils/op_aarch64.h @@ -48,7 +48,7 @@ offset += SIZE; } while (offset < count - SIZE); // Unaligned store, we can't use 'dc zva' here. 
- generic::Memset::tail(dst, value, count); + generic::Memset::tail(dst, value, count); } }; @@ -171,6 +171,100 @@ } // namespace __llvm_libc::aarch64 +namespace __llvm_libc::generic { + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint16_t +template <> struct cmp_is_expensive : public cpp::false_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + return static_cast(load_be(p1, offset)) - + static_cast(load_be(p2, offset)); +} + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint32_t +template <> struct cmp_is_expensive : cpp::false_type {}; +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + const auto a = load_be(p1, offset); + const auto b = load_be(p2, offset); + return a > b ? 1 : a < b ? -1 : 0; +} + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint64_t +template <> struct cmp_is_expensive : cpp::false_type {}; +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) != load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + const auto a = load_be(p1, offset); + const auto b = load_be(p2, offset); + if (a != b) + return a > b ? 
1 : -1; + return MemcmpReturnType::ZERO(); +} + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint8x16_t +template <> struct is_vector : cpp::true_type {}; +template <> struct cmp_is_expensive : cpp::false_type {}; +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + for (size_t i = 0; i < 2; ++i) { + auto a = load(p1, offset); + auto b = load(p2, offset); + uint32_t cond = a != b; + if (cond) + return cond; + offset += sizeof(uint64_t); + } + return 0; +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + for (size_t i = 0; i < 2; ++i) { + auto a = load_be(p1, offset); + auto b = load_be(p2, offset); + if (a != b) + return cmp_neq_uint64_t(a, b); + offset += sizeof(uint64_t); + } + return MemcmpReturnType::ZERO(); +} + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint8x16x2_t +template <> struct is_vector : cpp::true_type {}; +template <> struct cmp_is_expensive : cpp::false_type {}; +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, + size_t offset) { + for (size_t i = 0; i < 4; ++i) { + auto a = load_be(p1, offset); + auto b = load_be(p2, offset); + if (a != b) + return cmp_neq_uint64_t(a, b); + offset += sizeof(uint64_t); + } + return MemcmpReturnType::ZERO(); +} +} // namespace __llvm_libc::generic + #endif // LIBC_TARGET_ARCH_IS_AARCH64 #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_AARCH64_H diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -33,31 +33,43 @@ #include +static_assert((UINTPTR_MAX == 4294967295U) || + (UINTPTR_MAX == 18446744073709551615UL), + "We currently only support 32- or 64-bit platforms"); + +#if defined(UINT64_MAX) +#define LLVM_LIBC_HAS_UINT64 +#endif + namespace __llvm_libc { // Compiler types using the 
vector attributes. -using uint8x1_t = uint8_t __attribute__((__vector_size__(1))); -using uint8x2_t = uint8_t __attribute__((__vector_size__(2))); -using uint8x4_t = uint8_t __attribute__((__vector_size__(4))); -using uint8x8_t = uint8_t __attribute__((__vector_size__(8))); -using uint8x16_t = uint8_t __attribute__((__vector_size__(16))); -using uint8x32_t = uint8_t __attribute__((__vector_size__(32))); -using uint8x64_t = uint8_t __attribute__((__vector_size__(64))); +using generic_v128 = uint8_t __attribute__((__vector_size__(16))); +using generic_v256 = uint8_t __attribute__((__vector_size__(32))); +using generic_v512 = uint8_t __attribute__((__vector_size__(64))); } // namespace __llvm_libc namespace __llvm_libc::generic { + // We accept three types of values as elements for generic operations: -// - scalar : unsigned integral types -// - vector : compiler types using the vector attributes +// - scalar : unsigned integral types, +// - vector : compiler types using the vector attributes or platform builtins, // - array : a cpp::array where T is itself either a scalar or a vector. // The following traits help discriminate between these cases. 
-template -constexpr bool is_scalar_v = cpp::is_integral_v && cpp::is_unsigned_v; -template -constexpr bool is_vector_v = - cpp::details::is_unqualified_any_of(); +template struct is_scalar : cpp::false_type {}; +template <> struct is_scalar : cpp::true_type {}; +template <> struct is_scalar : cpp::true_type {}; +template <> struct is_scalar : cpp::true_type {}; +#ifdef LLVM_LIBC_HAS_UINT64 +template <> struct is_scalar : cpp::true_type {}; +#endif // LLVM_LIBC_HAS_UINT64 +template constexpr bool is_scalar_v = is_scalar::value; + +template struct is_vector : cpp::false_type {}; +template <> struct is_vector : cpp::true_type {}; +template <> struct is_vector : cpp::true_type {}; +template <> struct is_vector : cpp::true_type {}; +template constexpr bool is_vector_v = is_vector::value; template struct is_array : cpp::false_type {}; template struct is_array> { @@ -69,7 +81,7 @@ constexpr bool is_element_type_v = is_scalar_v || is_vector_v || is_array_v; -// +// Helper struct to retrieve the number of elements of an array. template struct array_size {}; template struct array_size> : cpp::integral_constant {}; @@ -114,105 +126,15 @@ } } -static_assert((UINTPTR_MAX == 4294967295U) || - (UINTPTR_MAX == 18446744073709551615UL), - "We currently only support 32- or 64-bit platforms"); - -#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64) -#define LLVM_LIBC_HAS_UINT64 -#endif - -namespace details { -// Checks that each type is sorted in strictly decreasing order of size. -// i.e. sizeof(First) > sizeof(Second) > ... 
> sizeof(Last) -template constexpr bool is_decreasing_size() { - return sizeof(First) == 1; -} -template -constexpr bool is_decreasing_size() { - if constexpr (sizeof...(Next) > 0) - return sizeof(First) > sizeof(Second) && is_decreasing_size(); - else - return sizeof(First) > sizeof(Second) && is_decreasing_size(); -} - -template struct Largest; -template struct Largest : cpp::type_identity {}; -template -struct Largest { - using next = Largest; - using type = cpp::conditional_t<(Size >= sizeof(T)), T, typename next::type>; -}; - -} // namespace details - -// 'SupportedTypes' holds a list of natively supported types. -// The types are instanciations of ScalarType or VectorType. -// They should be ordered in strictly decreasing order. -// The 'TypeFor' type retrieves is the largest supported type that can -// handle 'Size' bytes. e.g. -// -// using ST = SupportedTypes, ScalarType>; -// using Type = ST::TypeFor<10>; -// static_assert(cpp:is_same_v>); - -template struct SupportedTypes { - static_assert(details::is_decreasing_size()); - - using MaxType = First; - - template - using TypeFor = typename details::Largest::type; -}; - -// Map from sizes to structures offering static load, store and splat methods. -// Note: On platforms lacking vector support, we use the ArrayType below and -// decompose the operation in smaller pieces. - -// Lists a generic native types to use for Memset and Memmove operations. -// TODO: Inject the native types within Memset and Memmove depending on the -// target architectures and derive MaxSize from it. -using NativeTypeMap = SupportedTypes; - -namespace details { - -// Helper to test if a type is void. -template inline constexpr bool is_void_v = cpp::is_same_v; - -// In case the 'Size' is not supported we can fall back to a sequence of smaller -// operations using the largest natively supported type. 
-template static constexpr bool useArrayType() { - return (Size > MaxSize) && ((Size % MaxSize) == 0) && - !details::is_void_v>; -} - -// Compute the type to handle an operation of 'Size' bytes knowing that the -// underlying platform only support native types up to MaxSize bytes. -template -using getTypeFor = cpp::conditional_t< - useArrayType(), - cpp::array, Size / MaxSize>, - NativeTypeMap::TypeFor>; - -} // namespace details - /////////////////////////////////////////////////////////////////////////////// // Memset /////////////////////////////////////////////////////////////////////////////// template struct Memset { + static_assert(is_element_type_v); static constexpr size_t SIZE = sizeof(T); LIBC_INLINE static void block(Ptr dst, uint8_t value) { - static_assert(is_element_type_v); if constexpr (is_scalar_v || is_vector_v) { store(dst, splat(value)); } else if constexpr (is_array_v) { @@ -247,9 +169,8 @@ static constexpr size_t SIZE = (sizeof(T) + ... + sizeof(TS)); LIBC_INLINE static void block(Ptr dst, uint8_t value) { Memset::block(dst, value); - if constexpr (sizeof...(TS) > 0) { + if constexpr (sizeof...(TS) > 0) return MemsetSequence::block(dst + sizeof(T), value); - } } }; @@ -258,6 +179,7 @@ /////////////////////////////////////////////////////////////////////////////// template struct Memmove { + static_assert(is_element_type_v); static constexpr size_t SIZE = sizeof(T); LIBC_INLINE static void block(Ptr dst, CPtr src) { @@ -390,136 +312,257 @@ }; /////////////////////////////////////////////////////////////////////////////// -// Bcmp +// Low level operations for Bcmp and Memcmp that operate on memory locations. /////////////////////////////////////////////////////////////////////////////// -template struct Bcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t MaxSize = LLVM_LIBC_IS_DEFINED(LLVM_LIBC_HAS_UINT64) - ? 
sizeof(uint64_t) - : sizeof(uint32_t); - - template LIBC_INLINE static uint32_t load_xor(CPtr p1, CPtr p2) { - static_assert(sizeof(T) <= sizeof(uint32_t)); - return load(p1) ^ load(p2); - } - template - LIBC_INLINE static uint32_t load_not_equal(CPtr p1, CPtr p2) { - return load(p1) != load(p2); - } +// Same as load above but with an offset to the pointer. +// Making the offset explicit hints the compiler to use relevant addressing mode +// consistently. +template LIBC_INLINE static T load(CPtr ptr, size_t offset) { + return ::__llvm_libc::load(ptr + offset); +} - LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == 1) { - return load_xor(p1, p2); - } else if constexpr (Size == 2) { - return load_xor(p1, p2); - } else if constexpr (Size == 4) { - return load_xor(p1, p2); - } else if constexpr (Size == 8) { - return load_not_equal(p1, p2); - } else if constexpr (details::useArrayType()) { - for (size_t offset = 0; offset < Size; offset += MaxSize) - if (auto value = Bcmp::block(p1 + offset, p2 + offset)) - return value; +// Same as above but also makes sure the loaded value is in big endian format. +// This is useful when implementing lexicograhic comparisons as big endian +// scalar comparison directly maps to lexicographic byte comparisons. +template LIBC_INLINE static T load_be(CPtr ptr, size_t offset) { + return Endian::to_big_endian(load(ptr, offset)); +} + +// Equality: returns true iff values at locations (p1 + offset) and (p2 + +// offset) compare equal. +template static bool eq(CPtr p1, CPtr p2, size_t offset); + +// Not equals: returns non-zero iff values at locations (p1 + offset) and (p2 + +// offset) differ. +template static uint32_t neq(CPtr p1, CPtr p2, size_t offset); + +// Lexicographic comparison: +// - returns 0 iff values at locations (p1 + offset) and (p2 + offset) compare +// equal. +// - returns a negative value if value at location (p1 + offset) is +// lexicographically less than value at (p2 + offset). 
+// - returns a positive value if value at location (p1 + offset) is +// lexicographically greater than value at (p2 + offset). +template +static MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset); + +// Lexicographic comparison of non-equal values: +// - returns a negative value if value at location (p1 + offset) is +// lexicographically less than value at (p2 + offset). +// - returns a positive value if value at location (p1 + offset) is +// lexicographically greater than value at (p2 + offset). +template +static MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset); + +/////////////////////////////////////////////////////////////////////////////// +// Memcmp implementation +// +// When building memcmp, not all types are considered equal. +// +// For instance, the lexicographic comparison of two uint8_t can be implemented +// as a simple subtraction, but for wider operations the logic can be much more +// involved, especially on little endian platforms. +// +// For such wider types it is a good strategy to test for equality first and +// only do the expensive lexicographic comparison if necessary. +// +// Decomposing the algorithm like this for wider types allows us to have +// efficient implementation of higher order functions like 'head_tail' or +// 'loop_and_tail'. +/////////////////////////////////////////////////////////////////////////////// + +// Type traits to decide whether we can use 'cmp' directly or if we need to +// split the computation. 
+template struct cmp_is_expensive; + +template struct Memcmp { + static_assert(is_element_type_v); + static constexpr size_t SIZE = sizeof(T); + +private: + LIBC_INLINE static MemcmpReturnType block_offset(CPtr p1, CPtr p2, + size_t offset) { + if constexpr (cmp_is_expensive::value) { + if (!eq(p1, p2, offset)) + return cmp_neq(p1, p2, offset); + return MemcmpReturnType::ZERO(); } else { - deferred_static_assert("Unimplemented Size"); + return cmp(p1, p2, offset); } - return BcmpReturnType::ZERO(); } - LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); +public: + LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) { + return block_offset(p1, p2, 0); } - LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - return block(p1, p2) | tail(p1, p2, count); + LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + return block_offset(p1, p2, count - SIZE); } - LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, - size_t count) { - static_assert(Size > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) + LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2, + size_t count) { + if constexpr (cmp_is_expensive::value) { + if (!eq(p1, p2, 0)) + return cmp_neq(p1, p2, 0); + } else { + if (const auto value = cmp(p1, p2, 0)) return value; - offset += SIZE; - } while (offset < count - SIZE); + } return tail(p1, p2, count); } -}; -/////////////////////////////////////////////////////////////////////////////// -// Memcmp -/////////////////////////////////////////////////////////////////////////////// -template struct Memcmp { - static constexpr size_t SIZE = Size; - static constexpr size_t MaxSize = LLVM_LIBC_IS_DEFINED(LLVM_LIBC_HAS_UINT64) - ? 
sizeof(uint64_t) - : sizeof(uint32_t); - - template LIBC_INLINE static T load_be(CPtr ptr) { - return Endian::to_big_endian(load(ptr)); + LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, + size_t count) { + return loop_and_tail_offset(p1, p2, count, 0); } - template - LIBC_INLINE static MemcmpReturnType load_be_diff(CPtr p1, CPtr p2) { - return load_be(p1) - load_be(p2); + LIBC_INLINE static MemcmpReturnType + loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) { + if constexpr (SIZE > 1) { + const size_t limit = count - SIZE; + LIBC_LOOP_NOUNROLL + for (; offset < limit; offset += SIZE) { + if constexpr (cmp_is_expensive::value) { + if (!eq(p1, p2, offset)) + return cmp_neq(p1, p2, offset); + } else { + if (const auto value = cmp(p1, p2, offset)) + return value; + } + } + return block_offset(p1, p2, limit); // tail + } else { + // No need for a tail operation when SIZE == 1. + LIBC_LOOP_NOUNROLL + for (; offset < count; offset += SIZE) + if (auto value = cmp(p1, p2, offset)) + return value; + return MemcmpReturnType::ZERO(); + } } - template - LIBC_INLINE static MemcmpReturnType load_be_cmp(CPtr p1, CPtr p2) { - const auto la = load_be(p1); - const auto lb = load_be(p2); - return la > lb ? 1 : la < lb ? -1 : 0; + LIBC_INLINE static MemcmpReturnType + loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) { + const AlignHelper helper(p1); + if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) { + if (auto value = block(p1, p2)) + return value; + adjust(helper.offset(), p1, p2, count); + } + return loop_and_tail(p1, p2, count); } +}; +template struct MemcmpSequence { + static constexpr size_t SIZE = (sizeof(T) + ... 
+ sizeof(TS)); LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == 1) { - return load_be_diff(p1, p2); - } else if constexpr (Size == 2) { - return load_be_diff(p1, p2); - } else if constexpr (Size == 4) { - return load_be_cmp(p1, p2); - } else if constexpr (Size == 8) { - return load_be_cmp(p1, p2); - } else if constexpr (details::useArrayType()) { - for (size_t offset = 0; offset < Size; offset += MaxSize) - if (Bcmp::block(p1 + offset, p2 + offset)) - return Memcmp::block(p1 + offset, p2 + offset); - return MemcmpReturnType::ZERO(); - } else if constexpr (Size == 3) { - if (auto value = Memcmp<2>::block(p1, p2)) - return value; - return Memcmp<1>::block(p1 + 2, p2 + 2); + // TODO: test suggestion in + // https://reviews.llvm.org/D148717?id=515724#inline-1446890 + // once we have a proper way to check memory operation latency. + if constexpr (cmp_is_expensive::value) { + if (!eq(p1, p2, 0)) + return cmp_neq(p1, p2, 0); } else { - deferred_static_assert("Unimplemented Size"); + if (auto value = cmp(p1, p2, 0)) + return value; } + if constexpr (sizeof...(TS) > 0) + return MemcmpSequence::block(p1 + sizeof(T), p2 + sizeof(T)); + else + return MemcmpReturnType::ZERO(); } +}; - LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - SIZE, p2 + count - SIZE); +/////////////////////////////////////////////////////////////////////////////// +// Bcmp +/////////////////////////////////////////////////////////////////////////////// +template struct Bcmp { + static_assert(is_element_type_v); + static constexpr size_t SIZE = sizeof(T); + + LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) { + return neq(p1, p2, 0); } - LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2, - size_t count) { - if (auto value = block(p1, p2)) + LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { + const size_t tail_offset = count - SIZE; + return neq(p1, p2, tail_offset); + } + 
+ LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { + if (const auto value = neq(p1, p2, 0)) return value; return tail(p1, p2, count); } - LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, - size_t count) { - static_assert(Size > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) + LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, + size_t count) { + return loop_and_tail_offset(p1, p2, count, 0); + } + + LIBC_INLINE static BcmpReturnType + loop_and_tail_offset(CPtr p1, CPtr p2, size_t count, size_t offset) { + if constexpr (SIZE > 1) { + const size_t limit = count - SIZE; + LIBC_LOOP_NOUNROLL + for (; offset < limit; offset += SIZE) + if (const auto value = neq(p1, p2, offset)) + return value; + return tail(p1, p2, count); + } else { + // No need for a tail operation when SIZE == 1. + LIBC_LOOP_NOUNROLL + for (; offset < count; offset += SIZE) + if (const auto value = neq(p1, p2, offset)) + return value; + return BcmpReturnType::ZERO(); + } + } + + LIBC_INLINE static BcmpReturnType + loop_and_tail_align_above(size_t threshold, CPtr p1, CPtr p2, size_t count) { + static_assert(SIZE > 1, + "No need to align when processing one byte at a time"); + const AlignHelper helper(p1); + if (LIBC_UNLIKELY(count >= threshold) && helper.not_aligned()) { + if (auto value = block(p1, p2)) return value; - offset += SIZE; - } while (offset < count - SIZE); - return tail(p1, p2, count); + adjust(helper.offset(), p1, p2, count); + } + return loop_and_tail(p1, p2, count); } }; +template struct BcmpSequence { + static constexpr size_t SIZE = (sizeof(T) + ... 
+ sizeof(TS)); + LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) { + if (auto value = neq(p1, p2, 0)) + return value; + if constexpr (sizeof...(TS) > 0) + return BcmpSequence::block(p1 + sizeof(T), p2 + sizeof(T)); + else + return BcmpReturnType::ZERO(); + } +}; + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint8_t +template <> struct cmp_is_expensive : public cpp::false_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + return static_cast(load(p1, offset)) - + static_cast(load(p2, offset)); +} +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset); } // namespace __llvm_libc::generic #endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_GENERIC_H diff --git a/libc/src/string/memory_utils/op_riscv.h b/libc/src/string/memory_utils/op_riscv.h new file mode 100644 --- /dev/null +++ b/libc/src/string/memory_utils/op_riscv.h @@ -0,0 +1,84 @@ +//===-- RISC-V implementation of memory function building blocks ----------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file provides RISC-V specific building blocks to compose memory functions. 
+// +//===----------------------------------------------------------------------===// +#ifndef LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H +#define LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H + +#include "src/__support/macros/properties/architectures.h" + +#if defined(LIBC_TARGET_ARCH_IS_ANY_RISCV) + +#include "src/__support/common.h" +#include "src/string/memory_utils/op_generic.h" + +namespace __llvm_libc::generic { + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint16_t +template <> struct cmp_is_expensive : public cpp::false_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + return static_cast(load_be(p1, offset)) - + static_cast(load_be(p2, offset)); +} +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset); + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint32_t +template <> struct cmp_is_expensive : public cpp::false_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + const auto a = load_be(p1, offset); + const auto b = load_be(p2, offset); + return cmp_uint32_t(a, b); +} +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset); + +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint64_t +template <> struct cmp_is_expensive : public cpp::true_type {}; +template <> 
LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return !eq(p1, p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset); +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, + size_t offset) { + const auto a = load_be(p1, offset); + const auto b = load_be(p2, offset); + return cmp_neq_uint64_t(a, b); +} + +} // namespace __llvm_libc::generic + +#endif // LIBC_TARGET_ARCH_IS_ANY_RISCV +#endif // LLVM_LIBC_SRC_STRING_MEMORY_UTILS_OP_RISCV_H diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h --- a/libc/src/string/memory_utils/op_x86.h +++ b/libc/src/string/memory_utils/op_x86.h @@ -55,220 +55,199 @@ } }; -/////////////////////////////////////////////////////////////////////////////// -// Bcmp - -// Base implementation for the Bcmp specializations. -// - BlockSize is either 16, 32 or 64 depending on the available compile time -// features, it is used to switch between "single native operation" or a -// "sequence of native operations". -// - BlockBcmp is the function that implements the bcmp logic. 
-template struct BcmpImpl { - static constexpr size_t SIZE = Size; - LIBC_INLINE static BcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - return BlockBcmp(p1, p2); - } else if constexpr (Size % BlockSize == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = BlockBcmp(p1 + offset, p2 + offset)) - return value; - } else { - deferred_static_assert("SIZE not implemented"); - } - return BcmpReturnType::ZERO(); - } - - LIBC_INLINE static BcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - Size, p2 + count - Size); - } - - LIBC_INLINE static BcmpReturnType head_tail(CPtr p1, CPtr p2, size_t count) { - return block(p1, p2) | tail(p1, p2, count); - } +} // namespace __llvm_libc::x86 - LIBC_INLINE static BcmpReturnType loop_and_tail(CPtr p1, CPtr p2, - size_t count) { - static_assert(Size > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return value; - offset += Size; - } while (offset < count - Size); - return tail(p1, p2, count); - } -}; +namespace __llvm_libc::generic { -namespace sse2 { -LIBC_INLINE BcmpReturnType bcmp16(CPtr p1, CPtr p2) { -#if defined(__SSE2__) - using T = char __attribute__((__vector_size__(16))); - // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. 
- const int mask = - _mm_movemask_epi8(cpp::bit_cast<__m128i>(load(p1) != load(p2))); - return static_cast(mask); -#else - (void)p1; - (void)p2; - return BcmpReturnType::ZERO(); -#endif // defined(__SSE2__) +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint16_t +template <> struct cmp_is_expensive : public cpp::false_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); } -template using Bcmp = BcmpImpl; -} // namespace sse2 - -namespace avx2 { -LIBC_INLINE BcmpReturnType bcmp32(CPtr p1, CPtr p2) { -#if defined(__AVX2__) - using T = char __attribute__((__vector_size__(32))); - // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. - const int mask = - _mm256_movemask_epi8(cpp::bit_cast<__m256i>(load(p1) != load(p2))); - // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit - // mask. - return static_cast(mask); -#else - (void)p1; - (void)p2; - return BcmpReturnType::ZERO(); -#endif // defined(__AVX2__) +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); } -template using Bcmp = BcmpImpl; -} // namespace avx2 - -namespace avx512bw { -LIBC_INLINE BcmpReturnType bcmp64(CPtr p1, CPtr p2) { -#if defined(__AVX512BW__) - using T = char __attribute__((__vector_size__(64))); - // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. 
- const uint64_t mask = _mm512_cmpneq_epi8_mask( - cpp::bit_cast<__m512i>(load(p1)), cpp::bit_cast<__m512i>(load(p2))); - const bool mask_is_set = mask != 0; - return static_cast(mask_is_set); -#else - (void)p1; - (void)p2; - return BcmpReturnType::ZERO(); -#endif // defined(__AVX512BW__) +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + return static_cast(load_be(p1, offset)) - + static_cast(load_be(p2, offset)); } -template using Bcmp = BcmpImpl; -} // namespace avx512bw +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset); -// Assuming that the mask is non zero, the index of the first mismatching byte -// is the number of trailing zeros in the mask. Trailing zeros and not leading -// zeros because the x86 architecture is little endian. -LIBC_INLINE MemcmpReturnType char_diff_no_zero(CPtr p1, CPtr p2, - uint64_t mask) { - const size_t diff_index = __builtin_ctzll(mask); - const int16_t ca = cpp::to_integer(p1[diff_index]); - const int16_t cb = cpp::to_integer(p2[diff_index]); - return ca - cb; +/////////////////////////////////////////////////////////////////////////////// +// Specializations for uint32_t +template <> struct cmp_is_expensive : public cpp::false_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) ^ load(p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset) { + const auto a = load_be(p1, offset); + const auto b = load_be(p2, offset); + return cmp_uint32_t(a, b); } +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, size_t offset); /////////////////////////////////////////////////////////////////////////////// -// Memcmp - -// Base implementation for the Memcmp specializations. 
-// - BlockSize is either 16, 32 or 64 depending on the available compile time -// features, it is used to switch between "single native operation" or a -// "sequence of native operations". -// - BlockMemcmp is the function that implements the memcmp logic. -// - BlockBcmp is the function that implements the bcmp logic. -template -struct MemcmpImpl { - static constexpr size_t SIZE = Size; - LIBC_INLINE static MemcmpReturnType block(CPtr p1, CPtr p2) { - if constexpr (Size == BlockSize) { - return BlockMemcmp(p1, p2); - } else if constexpr (Size % BlockSize == 0) { - for (size_t offset = 0; offset < Size; offset += BlockSize) - if (auto value = BlockBcmp(p1 + offset, p2 + offset)) - return BlockMemcmp(p1 + offset, p2 + offset); - } else { - deferred_static_assert("SIZE not implemented"); - } - return MemcmpReturnType::ZERO(); - } - - LIBC_INLINE static MemcmpReturnType tail(CPtr p1, CPtr p2, size_t count) { - return block(p1 + count - Size, p2 + count - Size); - } - - LIBC_INLINE static MemcmpReturnType head_tail(CPtr p1, CPtr p2, - size_t count) { - if (auto value = block(p1, p2)) - return value; - return tail(p1, p2, count); - } +// Specializations for uint64_t +template <> struct cmp_is_expensive : public cpp::true_type {}; +template <> LIBC_INLINE bool eq(CPtr p1, CPtr p2, size_t offset) { + return load(p1, offset) == load(p2, offset); +} +template <> +LIBC_INLINE uint32_t neq(CPtr p1, CPtr p2, size_t offset) { + return !eq(p1, p2, offset); +} +template <> +LIBC_INLINE MemcmpReturnType cmp(CPtr p1, CPtr p2, size_t offset); +template <> +LIBC_INLINE MemcmpReturnType cmp_neq(CPtr p1, CPtr p2, + size_t offset) { + const auto a = load_be(p1, offset); + const auto b = load_be(p2, offset); + return cmp_neq_uint64_t(a, b); +} - LIBC_INLINE static MemcmpReturnType loop_and_tail(CPtr p1, CPtr p2, - size_t count) { - static_assert(Size > 1, "a loop of size 1 does not need tail"); - size_t offset = 0; - do { - if (auto value = block(p1 + offset, p2 + offset)) - return 
value; - offset += Size; - } while (offset < count - Size); - return tail(p1, p2, count); - } -}; +/////////////////////////////////////////////////////////////////////////////// +// Specializations for __m128i +#if defined(__SSE4_1__) +template <> struct is_vector<__m128i> : cpp::true_type {}; +template <> struct cmp_is_expensive<__m128i> : cpp::true_type {}; +LIBC_INLINE __m128i bytewise_max(__m128i a, __m128i b) { + return _mm_max_epu8(a, b); +} +LIBC_INLINE __m128i bytewise_reverse(__m128i value) { + return _mm_shuffle_epi8(value, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, // + 8, 9, 10, 11, 12, 13, 14, 15)); +} +LIBC_INLINE uint16_t big_endian_cmp_mask(__m128i max, __m128i value) { + return _mm_movemask_epi8(bytewise_reverse(_mm_cmpeq_epi8(max, value))); +} +template <> LIBC_INLINE bool eq<__m128i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m128i>(p1, offset); + const auto b = load<__m128i>(p2, offset); + const auto xored = _mm_xor_si128(a, b); + return _mm_testz_si128(xored, xored) == 1; // 1 iff xored == 0 +} +template <> LIBC_INLINE uint32_t neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m128i>(p1, offset); + const auto b = load<__m128i>(p2, offset); + const auto xored = _mm_xor_si128(a, b); + return _mm_testz_si128(xored, xored) == 0; // 0 iff xored != 0 +} +template <> +LIBC_INLINE MemcmpReturnType cmp_neq<__m128i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m128i>(p1, offset); + const auto b = load<__m128i>(p2, offset); + const auto vmax = bytewise_max(a, b); + const auto le = big_endian_cmp_mask(vmax, b); + const auto ge = big_endian_cmp_mask(vmax, a); + static_assert(cpp::is_same_v, uint16_t>); + return static_cast(ge) - static_cast(le); +} +#endif // __SSE4_1__ -namespace sse2 { -LIBC_INLINE MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { -#if defined(__SSE2__) - using T = char __attribute__((__vector_size__(16))); - // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. 
- if (int mask = - _mm_movemask_epi8(cpp::bit_cast<__m128i>(load(p1) != load(p2)))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -#else - (void)p1; - (void)p2; - return MemcmpReturnType::ZERO(); -#endif // defined(__SSE2__) +/////////////////////////////////////////////////////////////////////////////// +// Specializations for __m256i +#if defined(__AVX__) +template <> struct is_vector<__m256i> : cpp::true_type {}; +template <> struct cmp_is_expensive<__m256i> : cpp::true_type {}; +template <> LIBC_INLINE bool eq<__m256i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m256i>(p1, offset); + const auto b = load<__m256i>(p2, offset); + const auto xored = _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); + return _mm256_testz_si256(xored, xored) == 1; // 1 iff xored == 0 +} +template <> LIBC_INLINE uint32_t neq<__m256i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m256i>(p1, offset); + const auto b = load<__m256i>(p2, offset); + const auto xored = _mm256_castps_si256( + _mm256_xor_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); + return _mm256_testz_si256(xored, xored) == 0; // 0 iff xored != 0 } -template using Memcmp = MemcmpImpl; -} // namespace sse2 +#endif // __AVX__ -namespace avx2 { -LIBC_INLINE MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { #if defined(__AVX2__) - using T = char __attribute__((__vector_size__(32))); - // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. 
- if (int mask = _mm256_movemask_epi8( - cpp::bit_cast<__m256i>(load(p1) != load(p2)))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -#else - (void)p1; - (void)p2; - return MemcmpReturnType::ZERO(); -#endif // defined(__AVX2__) +LIBC_INLINE __m256i bytewise_max(__m256i a, __m256i b) { + return _mm256_max_epu8(a, b); +} +LIBC_INLINE __m256i bytewise_reverse(__m256i value) { + return _mm256_shuffle_epi8(value, + _mm256_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, // + 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, // + 24, 25, 26, 27, 28, 29, 30, 31)); } -template using Memcmp = MemcmpImpl; -} // namespace avx2 +LIBC_INLINE uint32_t big_endian_cmp_mask(__m256i max, __m256i value) { + return _mm256_movemask_epi8(bytewise_reverse(_mm256_cmpeq_epi8(max, value))); +} +template <> +LIBC_INLINE MemcmpReturnType cmp_neq<__m256i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m256i>(p1, offset); + const auto b = load<__m256i>(p2, offset); + const auto vmax = bytewise_max(a, b); + const auto le = big_endian_cmp_mask(vmax, b); + const auto ge = big_endian_cmp_mask(vmax, a); + static_assert(cpp::is_same_v, uint32_t>); + return cmp_uint32_t(ge, le); +} +#endif // __AVX2__ -namespace avx512bw { -LIBC_INLINE MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { +/////////////////////////////////////////////////////////////////////////////// +// Specializations for __m512i #if defined(__AVX512BW__) - using T = char __attribute__((__vector_size__(64))); - // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. 
- if (uint64_t mask = - _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load(p1)), - cpp::bit_cast<__m512i>(load(p2)))) - return char_diff_no_zero(p1, p2, mask); - return MemcmpReturnType::ZERO(); -#else - (void)p1; - (void)p2; - return MemcmpReturnType::ZERO(); -#endif // defined(__AVX512BW__) +template <> struct is_vector<__m512i> : cpp::true_type {}; +template <> struct cmp_is_expensive<__m512i> : cpp::true_type {}; +LIBC_INLINE __m512i bytewise_max(__m512i a, __m512i b) { + return _mm512_max_epu8(a, b); } -template using Memcmp = MemcmpImpl; -} // namespace avx512bw +LIBC_INLINE __m512i bytewise_reverse(__m512i value) { + return _mm512_shuffle_epi8(value, + _mm512_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, // + 8, 9, 10, 11, 12, 13, 14, 15, // + 16, 17, 18, 19, 20, 21, 22, 23, // + 24, 25, 26, 27, 28, 29, 30, 31, // + 32, 33, 34, 35, 36, 37, 38, 39, // + 40, 41, 42, 43, 44, 45, 46, 47, // + 48, 49, 50, 51, 52, 53, 54, 55, // + 56, 57, 58, 59, 60, 61, 62, 63)); +} +LIBC_INLINE uint64_t big_endian_cmp_mask(__m512i max, __m512i value) { + return _mm512_cmpeq_epi8_mask(bytewise_reverse(max), bytewise_reverse(value)); +} +template <> LIBC_INLINE bool eq<__m512i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m512i>(p1, offset); + const auto b = load<__m512i>(p2, offset); + return _mm512_cmpneq_epi8_mask(a, b) == 0; +} +template <> LIBC_INLINE uint32_t neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m512i>(p1, offset); + const auto b = load<__m512i>(p2, offset); + const uint64_t xored = _mm512_cmpneq_epi8_mask(a, b); + return (xored >> 32) | (xored & 0xFFFFFFFF); +} +template <> +LIBC_INLINE MemcmpReturnType cmp_neq<__m512i>(CPtr p1, CPtr p2, size_t offset) { + const auto a = load<__m512i>(p1, offset); + const auto b = load<__m512i>(p2, offset); + const auto vmax = bytewise_max(a, b); + const auto le = big_endian_cmp_mask(vmax, b); + const auto ge = big_endian_cmp_mask(vmax, a); + static_assert(cpp::is_same_v, uint64_t>); + return 
cmp_neq_uint64_t(ge, le); +} +#endif // __AVX512BW__ -} // namespace __llvm_libc::x86 +} // namespace __llvm_libc::generic #endif // LIBC_TARGET_ARCH_IS_X86_64 diff --git a/libc/src/string/memory_utils/utils.h b/libc/src/string/memory_utils/utils.h --- a/libc/src/string/memory_utils/utils.h +++ b/libc/src/string/memory_utils/utils.h @@ -15,9 +15,10 @@ #include "src/__support/endian.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" // LIBC_HAS_BUILTIN +#include "src/__support/macros/properties/architectures.h" #include // size_t -#include // intptr_t / uintptr_t +#include // intptr_t / uintptr_t / INT32_MAX / INT32_MIN namespace __llvm_libc { @@ -164,6 +165,68 @@ using MemcmpReturnType = StrictIntegralType; using BcmpReturnType = StrictIntegralType; +// This implements the semantic of 'memcmp' returning a negative value when 'a' +// is less than 'b', '0' when 'a' equals 'b' and a positive number otherwise. +LIBC_INLINE MemcmpReturnType cmp_uint32_t(uint32_t a, uint32_t b) { + // We perform the difference as an int64_t. + const int64_t diff = static_cast(a) - static_cast(b); + // For the int64_t to int32_t conversion we want the following properties: + // - int32_t[31:31] == 1 iff diff < 0 + // - int32_t[31:0] == 0 iff diff == 0 + + // We also observe that: + // - When diff < 0: diff[63:32] == 0xffffffff and diff[31:0] != 0 + // - When diff > 0: diff[63:32] == 0 and diff[31:0] != 0 + // - When diff == 0: diff[63:32] == 0 and diff[31:0] == 0 + // - https://godbolt.org/z/8W7qWP6e5 + // - This implies that we can only look at diff[32:32] for determining the + // sign bit for the returned int32_t. + + // So, we do the following: + // - int32_t[31:31] = diff[32:32] + // - int32_t[30:0] = diff[31:0] == 0 ? 0 : non-0. + + // And, we can achieve the above by the expression below. We could have also + // used (diff64 >> 1) | (diff64 & 0x1) but (diff64 & 0xFFFF) is faster than + // (diff64 & 0x1). 
https://godbolt.org/z/j3b569rW1 + return static_cast((diff >> 1) | (diff & 0xFFFF)); +} + +// Returns a negative value if 'a' is less than 'b' and a positive value +// otherwise. This implements the semantic of 'memcmp' when we know that 'a' and +// 'b' differ. +LIBC_INLINE MemcmpReturnType cmp_neq_uint64_t(uint64_t a, uint64_t b) { +#if defined(LIBC_TARGET_ARCH_IS_X86_64) + // On x86, the best strategy would be to use 'INT32_MAX' and 'INT32_MIN' for + // positive and negative value respectively as they are one value apart: + // xor eax, eax <- free + // cmp rdi, rsi <- serializing + // adc eax, 2147483647 <- serializing + + // Unfortunately we found instances of client code that negate the result of + // 'memcmp' to reverse ordering. Because signed integers are not symmetric + // (e.g., int8_t ∈ [-128, 127]) returning 'INT_MIN' would break such code as + // `-INT_MIN` is not representable as an int32_t. + + // As a consequence, we use 5 and -5 which is still OK in terms of + // latency. + // cmp rdi, rsi <- serializing + // mov ecx, -5 <- can be done in parallel + // mov eax, 5 <- can be done in parallel + // cmovb eax, ecx <- serializing + static constexpr int32_t POSITIVE = 5; + static constexpr int32_t NEGATIVE = -5; +#else + // On RISC-V we simply use '1' and '-1' as it leads to branchless code. + // On ARMv8, both strategies lead to the same performance. + static constexpr int32_t POSITIVE = 1; + static constexpr int32_t NEGATIVE = -1; +#endif + static_assert(POSITIVE > 0); + static_assert(NEGATIVE < 0); + return a < b ? NEGATIVE : POSITIVE; +} + +// Loads bytes from memory (possibly unaligned) and materializes them as +// type. 
template LIBC_INLINE T load(CPtr ptr) { @@ -295,6 +358,16 @@ deferred_static_assert("AlignOn must be either Arg::P1 or Arg::P2"); } +template struct AlignHelper { + AlignHelper(CPtr ptr) : offset_(distance_to_next_aligned(ptr)) {} + + LIBC_INLINE bool not_aligned() const { return offset_ != SIZE; } + LIBC_INLINE uintptr_t offset() const { return offset_; } + +private: + uintptr_t offset_; +}; + } // namespace __llvm_libc #endif // LLVM_LIBC_SRC_MEMORY_UTILS_UTILS_H diff --git a/libc/src/string/memory_utils/x86_64/memcmp_implementations.h b/libc/src/string/memory_utils/x86_64/memcmp_implementations.h --- a/libc/src/string/memory_utils/x86_64/memcmp_implementations.h +++ b/libc/src/string/memory_utils/x86_64/memcmp_implementations.h @@ -18,79 +18,76 @@ [[maybe_unused]] LIBC_INLINE MemcmpReturnType inline_memcmp_generic_gt16(CPtr p1, CPtr p2, size_t count) { - if (LIBC_UNLIKELY(count >= 384)) { - if (auto value = generic::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return generic::Memcmp<16>::loop_and_tail(p1, p2, count); + return generic::Memcmp::loop_and_tail_align_above(384, p1, p2, + count); } +#if defined(__SSE4_1__) [[maybe_unused]] LIBC_INLINE MemcmpReturnType -inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { - if (LIBC_UNLIKELY(count >= 384)) { - if (auto value = x86::sse2::Memcmp<16>::block(p1, p2)) - return value; - align_to_next_boundary<16, Arg::P1>(p1, p2, count); - } - return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count); +inline_memcmp_x86_sse41_gt16(CPtr p1, CPtr p2, size_t count) { + return generic::Memcmp<__m128i>::loop_and_tail_align_above(384, p1, p2, + count); } +#endif // __SSE4_1__ +#if defined(__AVX2__) [[maybe_unused]] LIBC_INLINE MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); + return generic::Memcmp<__m128i>::head_tail(p1, p2, count); if (count <= 64) - return 
x86::avx2::Memcmp<32>::head_tail(p1, p2, count); - if (count <= 128) - return x86::avx2::Memcmp<64>::head_tail(p1, p2, count); - if (LIBC_UNLIKELY(count >= 384)) { - if (auto value = x86::avx2::Memcmp<32>::block(p1, p2)) - return value; - align_to_next_boundary<32, Arg::P1>(p1, p2, count); - } - return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count); + return generic::Memcmp<__m256i>::head_tail(p1, p2, count); + return generic::Memcmp<__m256i>::loop_and_tail_align_above(384, p1, p2, + count); } +#endif // __AVX2__ +#if defined(__AVX512BW__) [[maybe_unused]] LIBC_INLINE MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) - return x86::sse2::Memcmp<16>::head_tail(p1, p2, count); + return generic::Memcmp<__m128i>::head_tail(p1, p2, count); if (count <= 64) - return x86::avx2::Memcmp<32>::head_tail(p1, p2, count); + return generic::Memcmp<__m256i>::head_tail(p1, p2, count); if (count <= 128) - return x86::avx512bw::Memcmp<64>::head_tail(p1, p2, count); - if (LIBC_UNLIKELY(count >= 384)) { - if (auto value = x86::avx512bw::Memcmp<64>::block(p1, p2)) - return value; - align_to_next_boundary<64, Arg::P1>(p1, p2, count); - } - return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count); + return generic::Memcmp<__m512i>::head_tail(p1, p2, count); + return generic::Memcmp<__m512i>::loop_and_tail_align_above(384, p1, p2, + count); } +#endif // __AVX512BW__ LIBC_INLINE MemcmpReturnType inline_memcmp_x86(CPtr p1, CPtr p2, size_t count) { - if (count == 0) return MemcmpReturnType::ZERO(); if (count == 1) - return generic::Memcmp<1>::block(p1, p2); + return generic::Memcmp::block(p1, p2); if (count == 2) - return generic::Memcmp<2>::block(p1, p2); + return generic::Memcmp::block(p1, p2); if (count == 3) - return generic::Memcmp<3>::block(p1, p2); - if (count <= 8) - return generic::Memcmp<4>::head_tail(p1, p2, count); + return generic::MemcmpSequence::block(p1, p2); + if (count == 4) + return generic::Memcmp::block(p1, p2); 
+ if (count == 5) + return generic::MemcmpSequence::block(p1, p2); + if (count == 6) + return generic::MemcmpSequence::block(p1, p2); + if (count == 7) + return generic::Memcmp::head_tail(p1, p2, 7); + if (count == 8) + return generic::Memcmp::block(p1, p2); if (count <= 16) - return generic::Memcmp<8>::head_tail(p1, p2, count); - if constexpr (x86::kAvx512BW) - return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_memcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_memcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); + return generic::Memcmp::head_tail(p1, p2, count); +#if defined(__AVX512BW__) + return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); +#elif defined(__AVX2__) + return inline_memcmp_x86_avx2_gt16(p1, p2, count); +#elif defined(__SSE4_1__) + return inline_memcmp_x86_sse41_gt16(p1, p2, count); +#else + return inline_memcmp_generic_gt16(p1, p2, count); +#endif } + } // namespace __llvm_libc #endif // LIBC_SRC_STRING_MEMORY_UTILS_X86_64_MEMCMP_IMPLEMENTATIONS_H diff --git a/libc/test/src/string/memory_utils/op_tests.cpp b/libc/test/src/string/memory_utils/op_tests.cpp --- a/libc/test/src/string/memory_utils/op_tests.cpp +++ b/libc/test/src/string/memory_utils/op_tests.cpp @@ -9,14 +9,11 @@ #include "memory_check_utils.h" #include "src/string/memory_utils/op_aarch64.h" #include "src/string/memory_utils/op_builtin.h" -#include "src/string/memory_utils/op_generic.h" +#include "src/string/memory_utils/op_generic.h" // LLVM_LIBC_HAS_UINT64 +#include "src/string/memory_utils/op_riscv.h" #include "src/string/memory_utils/op_x86.h" #include "test/UnitTest/Test.h" -#if defined(LIBC_TARGET_ARCH_IS_X86_64) || defined(LIBC_TARGET_ARCH_IS_AARCH64) -#define LLVM_LIBC_HAS_UINT64 -#endif - namespace __llvm_libc { template struct has_head_tail { @@ -131,13 +128,13 @@ generic::Memset, generic::Memset>, #endif #ifdef __AVX512F__ - generic::Memset, 
generic::Memset>, + generic::Memset, generic::Memset>, #endif #ifdef __AVX__ - generic::Memset, generic::Memset>, + generic::Memset, generic::Memset>, #endif #ifdef __SSE2__ - generic::Memset, generic::Memset>, + generic::Memset, generic::Memset>, #endif generic::Memset, generic::Memset>, // generic::Memset, generic::Memset>, // @@ -194,35 +191,36 @@ } using BcmpImplementations = testing::TypeList< -#ifdef __SSE2__ - x86::sse2::Bcmp<16>, // - x86::sse2::Bcmp<32>, // - x86::sse2::Bcmp<64>, // - x86::sse2::Bcmp<128>, // -#endif +#ifdef LIBC_TARGET_ARCH_IS_X86_64 +#ifdef __SSE4_1__ + generic::Bcmp<__m128i>, +#endif // __SSE4_1__ #ifdef __AVX2__ - x86::avx2::Bcmp<32>, // - x86::avx2::Bcmp<64>, // - x86::avx2::Bcmp<128>, // -#endif + generic::Bcmp<__m256i>, +#endif // __AVX2__ #ifdef __AVX512BW__ - x86::avx512bw::Bcmp<64>, // - x86::avx512bw::Bcmp<128>, // -#endif + generic::Bcmp<__m512i>, +#endif // __AVX512BW__ + +#endif // LIBC_TARGET_ARCH_IS_X86_64 #ifdef LIBC_TARGET_ARCH_IS_AARCH64 aarch64::Bcmp<16>, // - aarch64::Bcmp<32>, // + aarch64::Bcmp<32>, #endif +#ifndef LIBC_TARGET_ARCH_IS_ARM // Removing non uint8_t types for ARM + generic::Bcmp, + generic::Bcmp, // #ifdef LLVM_LIBC_HAS_UINT64 - generic::Bcmp<8>, // -#endif - generic::Bcmp<1>, // - generic::Bcmp<2>, // - generic::Bcmp<4>, // - generic::Bcmp<16>, // - generic::Bcmp<32>, // - generic::Bcmp<64> // - >; + generic::Bcmp, +#endif // LLVM_LIBC_HAS_UINT64 + generic::BcmpSequence, + generic::BcmpSequence, // + generic::BcmpSequence, // + generic::BcmpSequence, +#endif // LIBC_TARGET_ARCH_IS_ARM + generic::BcmpSequence, + generic::BcmpSequence, // + generic::Bcmp>; // Adapt CheckBcmp signature to op implementation signatures. template @@ -247,7 +245,8 @@ ASSERT_TRUE((CheckBcmp(span1, span2, kSize))); } } - { // Test head tail operations from kSize to 2 * kSize. + if constexpr (has_head_tail::value) { + // Test head tail operations from kSize to 2 * kSize. 
static constexpr auto HeadTailImpl = CmpAdaptor; Buffer Buffer1(2 * kSize); Buffer Buffer2(2 * kSize); @@ -258,7 +257,8 @@ ASSERT_TRUE((CheckBcmp(span1, span2, size))); } } - { // Test loop operations from kSize to 3 * kSize. + if constexpr (has_loop_and_tail::value) { + // Test loop operations from kSize to 3 * kSize. if constexpr (kSize > 1) { static constexpr auto LoopImpl = CmpAdaptor; Buffer Buffer1(3 * kSize); @@ -274,32 +274,33 @@ } using MemcmpImplementations = testing::TypeList< +#ifdef LIBC_TARGET_ARCH_IS_X86_64 #ifdef __SSE2__ - x86::sse2::Memcmp<16>, // - x86::sse2::Memcmp<32>, // - x86::sse2::Memcmp<64>, // - x86::sse2::Memcmp<128>, // + generic::Memcmp<__m128i>, // #endif #ifdef __AVX2__ - x86::avx2::Memcmp<32>, // - x86::avx2::Memcmp<64>, // - x86::avx2::Memcmp<128>, // + generic::Memcmp<__m256i>, // #endif #ifdef __AVX512BW__ - x86::avx512bw::Memcmp<64>, // - x86::avx512bw::Memcmp<128>, // + generic::Memcmp<__m512i>, // #endif -#ifdef LLVM_LIBC_HAS_UINT64 - generic::Memcmp<8>, // +#endif // LIBC_TARGET_ARCH_IS_X86_64 +#ifdef LIBC_TARGET_ARCH_IS_AARCH64 + generic::Memcmp, // + generic::Memcmp, #endif - generic::Memcmp<1>, // - generic::Memcmp<2>, // - generic::Memcmp<3>, // - generic::Memcmp<4>, // - generic::Memcmp<16>, // - generic::Memcmp<32>, // - generic::Memcmp<64> // - >; +#ifndef LIBC_TARGET_ARCH_IS_ARM // Removing non uint8_t types for ARM + generic::Memcmp, + generic::Memcmp, // +#ifdef LLVM_LIBC_HAS_UINT64 + generic::Memcmp, +#endif // LLVM_LIBC_HAS_UINT64 + generic::MemcmpSequence, + generic::MemcmpSequence, // +#endif // LIBC_TARGET_ARCH_IS_ARM + generic::MemcmpSequence, + generic::MemcmpSequence, + generic::Memcmp>; TYPED_TEST(LlvmLibcOpTest, Memcmp, MemcmpImplementations) { using Impl = ParamType; @@ -314,7 +315,8 @@ ASSERT_TRUE((CheckMemcmp(span1, span2, kSize))); } } - { // Test head tail operations from kSize to 2 * kSize. + if constexpr (has_head_tail::value) { + // Test head tail operations from kSize to 2 * kSize. 
static constexpr auto HeadTailImpl = CmpAdaptor; Buffer Buffer1(2 * kSize); Buffer Buffer2(2 * kSize); @@ -325,7 +327,8 @@ ASSERT_TRUE((CheckMemcmp(span1, span2, size))); } } - { // Test loop operations from kSize to 3 * kSize. + if constexpr (has_loop_and_tail::value) { + // Test loop operations from kSize to 3 * kSize. if constexpr (kSize > 1) { static constexpr auto LoopImpl = CmpAdaptor; Buffer Buffer1(3 * kSize); diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1995,6 +1995,7 @@ "src/string/memory_utils/op_aarch64.h", "src/string/memory_utils/op_builtin.h", "src/string/memory_utils/op_generic.h", + "src/string/memory_utils/op_riscv.h", "src/string/memory_utils/op_x86.h", "src/string/memory_utils/utils.h", ],