diff --git a/libc/src/__support/compiler_features.h b/libc/src/__support/compiler_features.h --- a/libc/src/__support/compiler_features.h +++ b/libc/src/__support/compiler_features.h @@ -38,4 +38,12 @@ #define LLVM_LIBC_HAS_FEATURE(FEATURE) 0 #endif +#if defined(LLVM_LIBC_COMPILER_CLANG) +#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("nounroll") +#elif defined(LLVM_LIBC_COMPILER_GCC) +#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("GCC unroll 0") +#else +#define LLVM_LIBC_LOOP_NOUNROLL +#endif + #endif // LLVM_LIBC_SUPPORT_COMPILER_FEATURES_H diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h --- a/libc/src/string/memory_utils/bcmp_implementations.h +++ b/libc/src/string/memory_utils/bcmp_implementations.h @@ -22,7 +22,7 @@ [[maybe_unused]] static inline BcmpReturnType inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL for (size_t offset = 0; offset < count; ++offset) if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset)) return value; @@ -42,6 +42,7 @@ #endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) #if defined(LLVM_LIBC_ARCH_X86) +#if defined(__SSE2__) [[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) @@ -53,7 +54,9 @@ align_to_next_boundary<16, Arg::P1>(p1, p2, count); return x86::sse2::Bcmp<64>::loop_and_tail(p1, p2, count); } +#endif // defined(__SSE2__) +#if defined(__AVX2__) [[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86_avx2_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) @@ -69,7 +72,9 @@ } return x86::avx2::Bcmp<64>::loop_and_tail(p1, p2, count); } +#endif // defined(__AVX2__) +#if defined(__AVX512BW__) [[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) @@ -85,6 +90,7 @@ } return x86::avx512bw::Bcmp<64>::loop_and_tail(p1, p2, count); } +#endif 
// defined(__AVX512BW__) [[maybe_unused]] static inline BcmpReturnType inline_bcmp_x86(CPtr p1, CPtr p2, size_t count) { @@ -100,14 +106,15 @@ return generic::Bcmp<4>::head_tail(p1, p2, count); if (count <= 16) return generic::Bcmp<8>::head_tail(p1, p2, count); - if constexpr (x86::kAvx512BW) - return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_bcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_bcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_bcmp_generic_gt16(p1, p2, count); +#if defined(__AVX512BW__) + return inline_bcmp_x86_avx512bw_gt16(p1, p2, count); +#elif defined(__AVX2__) + return inline_bcmp_x86_avx2_gt16(p1, p2, count); +#elif defined(__SSE2__) + return inline_bcmp_x86_sse2_gt16(p1, p2, count); +#else + return inline_bcmp_generic_gt16(p1, p2, count); +#endif } #endif // defined(LLVM_LIBC_ARCH_X86) diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h --- a/libc/src/string/memory_utils/memcmp_implementations.h +++ b/libc/src/string/memory_utils/memcmp_implementations.h @@ -22,7 +22,7 @@ namespace __llvm_libc { [[maybe_unused]] static inline MemcmpReturnType inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) { -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL for (size_t offset = 0; offset < count; ++offset) if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset)) return value; @@ -42,6 +42,7 @@ #endif // defined(LLVM_LIBC_ARCH_X86) || defined(LLVM_LIBC_ARCH_AARCH64) #if defined(LLVM_LIBC_ARCH_X86) +#if defined(__SSE2__) [[maybe_unused]] static inline MemcmpReturnType inline_memcmp_x86_sse2_gt16(CPtr p1, CPtr p2, size_t count) { if (unlikely(count >= 384)) { @@ -51,7 +52,9 @@ } return x86::sse2::Memcmp<16>::loop_and_tail(p1, p2, count); } +#endif // defined(__SSE2__) +#if defined(__AVX2__) [[maybe_unused]] static inline MemcmpReturnType inline_memcmp_x86_avx2_gt16(CPtr p1, CPtr p2, 
size_t count) { if (count <= 32) @@ -67,7 +70,9 @@ } return x86::avx2::Memcmp<32>::loop_and_tail(p1, p2, count); } +#endif // defined(__AVX2__) +#if defined(__AVX512BW__) [[maybe_unused]] static inline MemcmpReturnType inline_memcmp_x86_avx512bw_gt16(CPtr p1, CPtr p2, size_t count) { if (count <= 32) @@ -83,6 +88,8 @@ } return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count); } +#endif // defined(__AVX512BW__) + #endif // defined(LLVM_LIBC_ARCH_X86) #if defined(LLVM_LIBC_ARCH_AARCH64) @@ -122,14 +129,15 @@ if (count <= 16) return generic::Memcmp<8>::head_tail(p1, p2, count); #if defined(LLVM_LIBC_ARCH_X86) - if constexpr (x86::kAvx512BW) - return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); - else if constexpr (x86::kAvx2) - return inline_memcmp_x86_avx2_gt16(p1, p2, count); - else if constexpr (x86::kSse2) - return inline_memcmp_x86_sse2_gt16(p1, p2, count); - else - return inline_memcmp_generic_gt16(p1, p2, count); +#if defined(__AVX512BW__) + return inline_memcmp_x86_avx512bw_gt16(p1, p2, count); +#elif defined(__AVX2__) + return inline_memcmp_x86_avx2_gt16(p1, p2, count); +#elif defined(__SSE2__) + return inline_memcmp_x86_sse2_gt16(p1, p2, count); +#else + return inline_memcmp_generic_gt16(p1, p2, count); +#endif #elif defined(LLVM_LIBC_ARCH_AARCH64) if constexpr (aarch64::kNeon) return inline_memcmp_aarch64_neon_gt16(p1, p2, count); diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h --- a/libc/src/string/memory_utils/memcpy_implementations.h +++ b/libc/src/string/memory_utils/memcpy_implementations.h @@ -24,7 +24,7 @@ [[maybe_unused]] static inline void inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src, size_t count) { -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL for (size_t offset = 0; offset < count; ++offset) builtin::Memcpy<1>::block(dst + offset, src + offset); } diff --git a/libc/src/string/memory_utils/memmove_implementations.h 
b/libc/src/string/memory_utils/memmove_implementations.h --- a/libc/src/string/memory_utils/memmove_implementations.h +++ b/libc/src/string/memory_utils/memmove_implementations.h @@ -23,11 +23,11 @@ if ((count == 0) || (dst == src)) return; if (dst < src) { -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL for (size_t offset = 0; offset < count; ++offset) builtin::Memcpy<1>::block(dst + offset, src + offset); } else { -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL for (ptrdiff_t offset = count - 1; offset >= 0; --offset) builtin::Memcpy<1>::block(dst + offset, src + offset); } diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h --- a/libc/src/string/memory_utils/memset_implementations.h +++ b/libc/src/string/memory_utils/memset_implementations.h @@ -22,7 +22,7 @@ [[maybe_unused]] inline static void inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) { -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL for (size_t offset = 0; offset < count; ++offset) generic::Memset<1, 1>::block(dst + offset, value); } diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h --- a/libc/src/string/memory_utils/op_builtin.h +++ b/libc/src/string/memory_utils/op_builtin.h @@ -27,9 +27,9 @@ #ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE return __builtin_memcpy_inline(dst, src, SIZE); #else - deferred_static_assert("Missing __builtin_memcpy_inline"); - (void)dst; - (void)src; + // The codegen may be suboptimal. 
+ for (size_t i = 0; i < Size; ++i) + dst[i] = src[i]; #endif } diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h --- a/libc/src/string/memory_utils/op_generic.h +++ b/libc/src/string/memory_utils/op_generic.h @@ -26,6 +26,7 @@ #include "src/__support/CPP/array.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/common.h" +#include "src/__support/compiler_features.h" #include "src/__support/endian.h" #include "src/string/memory_utils/op_builtin.h" #include "src/string/memory_utils/utils.h" @@ -71,9 +72,34 @@ } }; +// GCC can only take literals as __vector_size__ argument so we have to use +// template specialization. +template <size_t Size> struct VectorValueType {}; +template <> struct VectorValueType<1> { + using type = uint8_t __attribute__((__vector_size__(1))); +}; +template <> struct VectorValueType<2> { + using type = uint8_t __attribute__((__vector_size__(2))); +}; +template <> struct VectorValueType<4> { + using type = uint8_t __attribute__((__vector_size__(4))); +}; +template <> struct VectorValueType<8> { + using type = uint8_t __attribute__((__vector_size__(8))); +}; +template <> struct VectorValueType<16> { + using type = uint8_t __attribute__((__vector_size__(16))); +}; +template <> struct VectorValueType<32> { + using type = uint8_t __attribute__((__vector_size__(32))); +}; +template <> struct VectorValueType<64> { + using type = uint8_t __attribute__((__vector_size__(64))); +}; + // Implements load, store and splat for vector types.
template <size_t Size> struct VectorType { - using Type = uint8_t __attribute__((__vector_size__(Size))); + using Type = typename VectorValueType<Size>::type; static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); } static inline void store(Ptr dst, Type value) { ::__llvm_libc::store<Type>(dst, value); @@ -434,7 +460,7 @@ const size_t tail_offset = count - Size; const auto tail_value = T::load(src + tail_offset); size_t offset = 0; -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL do { block(dst + offset, src + offset); offset += Size; @@ -460,7 +486,7 @@ static_assert(Size > 1, "a loop of size 1 does not need tail"); const auto head_value = T::load(src); ptrdiff_t offset = count - Size; -#pragma nounroll + LLVM_LIBC_LOOP_NOUNROLL do { block(dst + offset, src + offset); offset -= Size; diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h --- a/libc/src/string/memory_utils/op_x86.h +++ b/libc/src/string/memory_utils/op_x86.h @@ -98,6 +98,7 @@ }; namespace sse2 { +#if defined(__SSE2__) static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) { using T = char __attribute__((__vector_size__(16))); // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. @@ -106,9 +107,11 @@ return static_cast<uint32_t>(mask); } template <size_t Size> using Bcmp = BcmpImpl<Size, bcmp16>; +#endif // defined(__SSE2__) } // namespace sse2 namespace avx2 { +#if defined(__AVX2__) static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) { using T = char __attribute__((__vector_size__(32))); // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. @@ -119,17 +122,21 @@ return static_cast<uint32_t>(mask); } template <size_t Size> using Bcmp = BcmpImpl<Size, bcmp32>; +#endif // defined(__AVX2__) } // namespace avx2 namespace avx512bw { +#if defined(__AVX512BW__) static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) { using T = char __attribute__((__vector_size__(64))); // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
- const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)); + const uint64_t mask = _mm512_cmpneq_epi8_mask( + cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2))); const bool mask_is_set = mask != 0; return static_cast<uint32_t>(mask_is_set); } template <size_t Size> using Bcmp = BcmpImpl<Size, bcmp64>; +#endif // defined(__AVX512BW__) } // namespace avx512bw // Assuming that the mask is non zero, the index of the first mismatching byte @@ -191,6 +198,7 @@ }; namespace sse2 { +#if defined(__SSE2__) static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) { using T = char __attribute__((__vector_size__(16))); // A mask indicating which bytes differ after loading 16 bytes from p1 and p2. @@ -200,9 +208,11 @@ return MemcmpReturnType::ZERO(); } template <size_t Size> using Memcmp = MemcmpImpl<Size, memcmp16>; +#endif // defined(__SSE2__) } // namespace sse2 namespace avx2 { +#if defined(__AVX2__) static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) { using T = char __attribute__((__vector_size__(32))); // A mask indicating which bytes differ after loading 32 bytes from p1 and p2. @@ -212,17 +222,22 @@ return MemcmpReturnType::ZERO(); } template <size_t Size> using Memcmp = MemcmpImpl<Size, memcmp32>; +#endif // defined(__AVX2__) } // namespace avx2 namespace avx512bw { +#if defined(__AVX512BW__) static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) { using T = char __attribute__((__vector_size__(64))); // A mask indicating which bytes differ after loading 64 bytes from p1 and p2. - if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2))) + if (uint64_t mask = + _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)), + cpp::bit_cast<__m512i>(load<T>(p2)))) return char_diff_no_zero<T>(p1, p2, mask); return MemcmpReturnType::ZERO(); } template <size_t Size> using Memcmp = MemcmpImpl<Size, memcmp64>; +#endif // defined(__AVX512BW__) } // namespace avx512bw } // namespace __llvm_libc::x86