diff --git a/libc/src/__support/compiler_features.h b/libc/src/__support/compiler_features.h
--- a/libc/src/__support/compiler_features.h
+++ b/libc/src/__support/compiler_features.h
@@ -38,4 +38,12 @@
 #define LLVM_LIBC_HAS_FEATURE(FEATURE) 0
 #endif
 
+#if defined(LLVM_LIBC_COMPILER_CLANG)
+#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("nounroll")
+#elif defined(LLVM_LIBC_COMPILER_GCC)
+#define LLVM_LIBC_LOOP_NOUNROLL _Pragma("GCC unroll 0")
+#else
+#define LLVM_LIBC_LOOP_NOUNROLL
+#endif
+
 #endif // LLVM_LIBC_SUPPORT_COMPILER_FEATURES_H
diff --git a/libc/src/string/memory_utils/bcmp_implementations.h b/libc/src/string/memory_utils/bcmp_implementations.h
--- a/libc/src/string/memory_utils/bcmp_implementations.h
+++ b/libc/src/string/memory_utils/bcmp_implementations.h
@@ -22,7 +22,7 @@
 
 [[maybe_unused]] static inline BcmpReturnType
 inline_bcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     if (auto value = generic::Bcmp<1>::block(p1 + offset, p2 + offset))
       return value;
diff --git a/libc/src/string/memory_utils/memcmp_implementations.h b/libc/src/string/memory_utils/memcmp_implementations.h
--- a/libc/src/string/memory_utils/memcmp_implementations.h
+++ b/libc/src/string/memory_utils/memcmp_implementations.h
@@ -22,7 +22,7 @@
 namespace __llvm_libc {
 [[maybe_unused]] static inline MemcmpReturnType
 inline_memcmp_embedded_tiny(CPtr p1, CPtr p2, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     if (auto value = generic::Memcmp<1>::block(p1 + offset, p2 + offset))
       return value;
@@ -83,6 +83,7 @@
   }
   return x86::avx512bw::Memcmp<64>::loop_and_tail(p1, p2, count);
 }
+
 #endif // defined(LLVM_LIBC_ARCH_X86)
 
 #if defined(LLVM_LIBC_ARCH_AARCH64)
diff --git a/libc/src/string/memory_utils/memcpy_implementations.h b/libc/src/string/memory_utils/memcpy_implementations.h
--- a/libc/src/string/memory_utils/memcpy_implementations.h
+++ b/libc/src/string/memory_utils/memcpy_implementations.h
@@ -24,7 +24,7 @@
 [[maybe_unused]] static inline void
 inline_memcpy_embedded_tiny(Ptr __restrict dst, CPtr __restrict src,
                             size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     builtin::Memcpy<1>::block(dst + offset, src + offset);
 }
diff --git a/libc/src/string/memory_utils/memmove_implementations.h b/libc/src/string/memory_utils/memmove_implementations.h
--- a/libc/src/string/memory_utils/memmove_implementations.h
+++ b/libc/src/string/memory_utils/memmove_implementations.h
@@ -23,11 +23,11 @@
   if ((count == 0) || (dst == src))
     return;
   if (dst < src) {
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     for (size_t offset = 0; offset < count; ++offset)
       builtin::Memcpy<1>::block(dst + offset, src + offset);
   } else {
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     for (ptrdiff_t offset = count - 1; offset >= 0; --offset)
       builtin::Memcpy<1>::block(dst + offset, src + offset);
   }
diff --git a/libc/src/string/memory_utils/memset_implementations.h b/libc/src/string/memory_utils/memset_implementations.h
--- a/libc/src/string/memory_utils/memset_implementations.h
+++ b/libc/src/string/memory_utils/memset_implementations.h
@@ -22,7 +22,7 @@
 
 [[maybe_unused]] inline static void
 inline_memset_embedded_tiny(Ptr dst, uint8_t value, size_t count) {
-#pragma nounroll
+  LLVM_LIBC_LOOP_NOUNROLL
   for (size_t offset = 0; offset < count; ++offset)
     generic::Memset<1, 1>::block(dst + offset, value);
 }
diff --git a/libc/src/string/memory_utils/op_builtin.h b/libc/src/string/memory_utils/op_builtin.h
--- a/libc/src/string/memory_utils/op_builtin.h
+++ b/libc/src/string/memory_utils/op_builtin.h
@@ -27,9 +27,9 @@
 #ifdef LLVM_LIBC_HAS_BUILTIN_MEMCPY_INLINE
     return __builtin_memcpy_inline(dst, src, SIZE);
 #else
-    deferred_static_assert("Missing __builtin_memcpy_inline");
-    (void)dst;
-    (void)src;
+    // The codegen may be suboptimal.
+    for (size_t i = 0; i < Size; ++i)
+      dst[i] = src[i];
 #endif
   }
 
diff --git a/libc/src/string/memory_utils/op_generic.h b/libc/src/string/memory_utils/op_generic.h
--- a/libc/src/string/memory_utils/op_generic.h
+++ b/libc/src/string/memory_utils/op_generic.h
@@ -26,6 +26,7 @@
 #include "src/__support/CPP/array.h"
 #include "src/__support/CPP/type_traits.h"
 #include "src/__support/common.h"
+#include "src/__support/compiler_features.h"
 #include "src/__support/endian.h"
 #include "src/string/memory_utils/op_builtin.h"
 #include "src/string/memory_utils/utils.h"
@@ -71,9 +72,34 @@
   }
 };
 
+// GCC can only take literals as __vector_size__ argument so we have to use
+// template specialization.
+template <size_t Size> struct VectorValueType {};
+template <> struct VectorValueType<1> {
+  using type = uint8_t __attribute__((__vector_size__(1)));
+};
+template <> struct VectorValueType<2> {
+  using type = uint8_t __attribute__((__vector_size__(2)));
+};
+template <> struct VectorValueType<4> {
+  using type = uint8_t __attribute__((__vector_size__(4)));
+};
+template <> struct VectorValueType<8> {
+  using type = uint8_t __attribute__((__vector_size__(8)));
+};
+template <> struct VectorValueType<16> {
+  using type = uint8_t __attribute__((__vector_size__(16)));
+};
+template <> struct VectorValueType<32> {
+  using type = uint8_t __attribute__((__vector_size__(32)));
+};
+template <> struct VectorValueType<64> {
+  using type = uint8_t __attribute__((__vector_size__(64)));
+};
+
 // Implements load, store and splat for vector types.
 template <size_t Size> struct VectorType {
-  using Type = uint8_t __attribute__((__vector_size__(Size)));
+  using Type = typename VectorValueType<Size>::type;
   static inline Type load(CPtr src) { return ::__llvm_libc::load<Type>(src); }
   static inline void store(Ptr dst, Type value) {
     ::__llvm_libc::store<Type>(dst, value);
@@ -434,7 +460,7 @@
     const size_t tail_offset = count - Size;
     const auto tail_value = T::load(src + tail_offset);
     size_t offset = 0;
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
     do {
       block(dst + offset, src + offset);
       offset += Size;
@@ -460,7 +486,7 @@
     static_assert(Size > 1, "a loop of size 1 does not need tail");
     const auto head_value = T::load(src);
     ptrdiff_t offset = count - Size;
-#pragma nounroll
+    LLVM_LIBC_LOOP_NOUNROLL
    do {
       block(dst + offset, src + offset);
       offset -= Size;
diff --git a/libc/src/string/memory_utils/op_x86.h b/libc/src/string/memory_utils/op_x86.h
--- a/libc/src/string/memory_utils/op_x86.h
+++ b/libc/src/string/memory_utils/op_x86.h
@@ -99,17 +99,22 @@
 
 namespace sse2 {
 static inline BcmpReturnType bcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
   using T = char __attribute__((__vector_size__(16)));
   // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
   const int mask =
       _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2)));
   return static_cast<uint32_t>(mask);
+#else
+  return BcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
 }
 template <size_t Size> using Bcmp = BcmpImpl<Size, bcmp16>;
 } // namespace sse2
 
 namespace avx2 {
 static inline BcmpReturnType bcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
   using T = char __attribute__((__vector_size__(32)));
   // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
   const int mask =
@@ -117,17 +122,25 @@
   // _mm256_movemask_epi8 returns an int but it is to be interpreted as a 32-bit
   // mask.
   return static_cast<uint32_t>(mask);
+#else
+  return BcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
 }
 template <size_t Size> using Bcmp = BcmpImpl<Size, bcmp32>;
 } // namespace avx2
 
 namespace avx512bw {
 static inline BcmpReturnType bcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
   using T = char __attribute__((__vector_size__(64)));
   // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
-  const uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2));
+  const uint64_t mask = _mm512_cmpneq_epi8_mask(
+      cpp::bit_cast<__m512i>(load<T>(p1)), cpp::bit_cast<__m512i>(load<T>(p2)));
   const bool mask_is_set = mask != 0;
   return static_cast<uint32_t>(mask_is_set);
+#else
+  return BcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
 }
 template <size_t Size> using Bcmp = BcmpImpl<Size, bcmp64>;
 } // namespace avx512bw
@@ -192,35 +205,49 @@
 
 namespace sse2 {
 static inline MemcmpReturnType memcmp16(CPtr p1, CPtr p2) {
+#if defined(__SSE2__)
   using T = char __attribute__((__vector_size__(16)));
   // A mask indicating which bytes differ after loading 16 bytes from p1 and p2.
   if (int mask =
           _mm_movemask_epi8(cpp::bit_cast<__m128i>(load<T>(p1) != load<T>(p2))))
     return char_diff_no_zero(p1, p2, mask);
   return MemcmpReturnType::ZERO();
+#else
+  return MemcmpReturnType::ZERO();
+#endif // defined(__SSE2__)
 }
 template <size_t Size> using Memcmp = MemcmpImpl<Size, memcmp16>;
 } // namespace sse2
 
 namespace avx2 {
 static inline MemcmpReturnType memcmp32(CPtr p1, CPtr p2) {
+#if defined(__AVX2__)
   using T = char __attribute__((__vector_size__(32)));
   // A mask indicating which bytes differ after loading 32 bytes from p1 and p2.
   if (int mask = _mm256_movemask_epi8(
           cpp::bit_cast<__m256i>(load<T>(p1) != load<T>(p2))))
     return char_diff_no_zero(p1, p2, mask);
   return MemcmpReturnType::ZERO();
+#else
+  return MemcmpReturnType::ZERO();
+#endif // defined(__AVX2__)
 }
 template <size_t Size> using Memcmp = MemcmpImpl<Size, memcmp32>;
 } // namespace avx2
 
 namespace avx512bw {
 static inline MemcmpReturnType memcmp64(CPtr p1, CPtr p2) {
+#if defined(__AVX512BW__)
   using T = char __attribute__((__vector_size__(64)));
   // A mask indicating which bytes differ after loading 64 bytes from p1 and p2.
-  if (uint64_t mask = _mm512_cmpneq_epi8_mask(load<T>(p1), load<T>(p2)))
+  if (uint64_t mask =
+          _mm512_cmpneq_epi8_mask(cpp::bit_cast<__m512i>(load<T>(p1)),
+                                  cpp::bit_cast<__m512i>(load<T>(p2))))
    return char_diff_no_zero(p1, p2, mask);
   return MemcmpReturnType::ZERO();
+#else
+  return MemcmpReturnType::ZERO();
+#endif // defined(__AVX512BW__)
 }
 template <size_t Size> using Memcmp = MemcmpImpl<Size, memcmp64>;
 } // namespace avx512bw