diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -33,6 +33,13 @@ # Flags to pass down to the compiler while building the libc functions. set(LIBC_COMPILE_OPTIONS_DEFAULT "" CACHE STRING "Architecture to tell clang to optimize for (e.g. -march=... or -mcpu=...)") +option(LIBC_FAST_UNSAFE_WORD_READ "Use the faster strlen algorithm that depends on undefined behavior, and may fail on some systems." OFF) +if(LIBC_FAST_UNSAFE_WORD_READ) + list(APPEND LIBC_COMPILE_OPTIONS_DEFAULT "-DLIBC_FAST_UNSAFE_WORD_READ") +endif() + + + # Check --print-resource-dir to find the compiler resource dir if this flag # is supported by the compiler. execute_process( diff --git a/libc/src/string/CMakeLists.txt b/libc/src/string/CMakeLists.txt --- a/libc/src/string/CMakeLists.txt +++ b/libc/src/string/CMakeLists.txt @@ -186,6 +186,7 @@ HDRS strlen.h DEPENDS + .string_utils libc.include.string ) diff --git a/libc/src/string/string_utils.h b/libc/src/string/string_utils.h --- a/libc/src/string/string_utils.h +++ b/libc/src/string/string_utils.h @@ -19,22 +19,133 @@ namespace __llvm_libc { namespace internal { -// Returns the length of a string, denoted by the first occurrence -// of a null terminator. -static inline size_t string_length(const char *src) { +template constexpr T repeat_byte(T byte) { + constexpr size_t BITS_IN_BYTE = 8; + constexpr size_t BYTE_MASK = 0xff; + T result = 0; + byte = byte & BYTE_MASK; + for (size_t i = 0; i < sizeof(T); ++i) + result = (result << BITS_IN_BYTE) | byte; + return result; +} + +// The goal of this function is to take in a block of arbitrary size and return +// if it has any bytes equal to zero without branching. This is done by +// transforming the block such that zero bytes become non-zero and non-zero +// bytes become zero. +// The first transformation relies on the properties of carrying in arithmetic +// subtraction. Specifically, if 0x01 is subtracted from a byte that is 0x00, +// then the result for that byte must be equal to 0xff (or 0xfe if the next byte +// needs a carry as well). +// The next transformation is a simple mask. All zero bytes will have the high +// bit set after the subtraction, so each byte is masked with 0x80. This narrows +// the set of bytes that result in a non-zero value to only zero bytes and bytes +// with the high bit and any other bit set. +// The final transformation masks the result of the previous transformations +// with the inverse of the original byte. This means that any byte that had the +// high bit set will no longer have it set, narrowing the list of bytes which +// result in non-zero values to just the zero byte. +template constexpr bool has_zeroes(T block) { + constexpr T LOW_BITS = repeat_byte(0x01); + constexpr T HIGH_BITS = repeat_byte(0x80); + T subtracted = block - LOW_BITS; + T inverted = ~block; + return (subtracted & inverted & HIGH_BITS) != 0; +} + +template +static inline size_t string_length_unsafe(const char *src) { + const char *char_ptr = src; + // Step 1: read 1 byte at a time to align to block size + for (; reinterpret_cast(char_ptr) % sizeof(T) != 0; ++char_ptr) { + if (*char_ptr == 0) + return char_ptr - src; + } + // Step 2: read blocks + for (const T *block_ptr = reinterpret_cast(char_ptr); + !has_zeroes(*block_ptr); ++block_ptr) { + char_ptr = reinterpret_cast(block_ptr); + } + // Step 3: find the zero in the block + for (; *char_ptr != 0; ++char_ptr) { + ; + } + return char_ptr - src; +} + +static inline size_t string_length_safe(const char *src) { size_t length; for (length = 0; *src; ++src, ++length) ; return length; } +// Returns the length of a string, denoted by the first occurrence +// of a null terminator. +static inline size_t string_length(const char *src) { +#ifdef LIBC_FAST_UNSAFE_WORD_READ + // TODO: pick size of var based on requested size. + return string_length_unsafe(src); +#else + return string_length_safe(src); +#endif +} + +template +static inline void *find_first_character_unsafe(const unsigned char *src, + unsigned char ch, size_t n) { + const unsigned char *char_ptr = src; + size_t cur = 0; + + // Step 1: read 1 byte at a time to align to block size + for (; reinterpret_cast(char_ptr) % sizeof(T) != 0 && cur < n; + ++char_ptr, ++cur) { + if (*char_ptr == ch) + return const_cast(char_ptr); + } + + const T ch_mask = repeat_byte(ch); + + // Step 2: read blocks + for (const T *block_ptr = reinterpret_cast(char_ptr); + !has_zeroes((*block_ptr) ^ ch_mask) && cur < n; + ++block_ptr, cur += sizeof(T)) { + char_ptr = reinterpret_cast(block_ptr); + } + + // Step 3: find the zero in the block + for (; *char_ptr != ch && cur < n; ++char_ptr, ++cur) { + ; + } + + if (*char_ptr != ch || cur >= n) + return static_cast(nullptr); + + return const_cast(char_ptr); +} + +static inline void *find_first_character_safe(const unsigned char *src, + unsigned char ch, size_t n) { + for (; n && *src != ch; --n, ++src) + ; + return n ? const_cast(src) : nullptr; +} + // Returns the first occurrence of 'ch' within the first 'n' characters of // 'src'. If 'ch' is not found, returns nullptr. static inline void *find_first_character(const unsigned char *src, unsigned char ch, size_t n) { - for (; n && *src != ch; --n, ++src) - ; - return n ? const_cast(src) : nullptr; +#ifdef LIBC_FAST_UNSAFE_WORD_READ + // If n is small, the overhead of generating the ch_mask may be greater than + // the gains from reading larger chunks. + // The number 4 is a guess, but it seems reasonable. + if (n > (sizeof(uintptr_t) * 4)) { + // TODO: pick size of var based on requested size. + return find_first_character_unsafe(src, ch, n); + } +#else + return find_first_character_safe(src, ch, n); +#endif } // Returns the maximum length span that contains only characters not found in