diff --git a/libc/src/__support/swisstable/asimd.h b/libc/src/__support/swisstable/asimd.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/swisstable/asimd.h
@@ -0,0 +1,86 @@
+//===-- SwissTable ASIMD Specialization -------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/swisstable/common.h"
+#include <arm_neon.h>
+
+namespace __llvm_libc::cpp::swisstable {
+
+// According to abseil-cpp, ARM's 16-byte ASIMD operations may introduce
+// too much latency, so we operate on 8-byte groups instead.
+// Reference:
+// https://github.com/abseil/abseil-cpp/commit/6481443560a92d0a3a55a31807de0cd712cd4f88
+// With ASIMD, some bitmasks are not iterable, because we do not want to
+// clear the lower bits in each stride with an extra `AND` operation.
+using BitMask = BitMaskAdaptor<uint64_t, 0x8080808080808080ULL, 8>;
+using IteratableBitMask = IteratableBitMaskAdaptor<BitMask>;
+
+struct Group {
+  int8x8_t data;
+
+  // Load a group of control words from an arbitrary address.
+  static Group load(const void *__restrict addr) {
+    return {vld1_s8(static_cast<const int8_t *>(addr))};
+  }
+
+  // Load a group of control words from an aligned address.
+  // Notice that aligned and unaligned loads are the same
+  // instruction in ASIMD.
+  static Group aligned_load(const void *__restrict addr) {
+    return {vld1_s8(static_cast<const int8_t *>(addr))};
+  }
+
+  // Store a group of control words to an aligned address.
+  void aligned_store(void *addr) const {
+    vst1_s8(static_cast<int8_t *>(addr), data);
+  }
+
+  // Find out the lanes equal to the given byte and return the bitmask
+  // with corresponding bits set.
+  IteratableBitMask match_byte(uint8_t byte) const {
+    auto duplicated = vdup_n_s8(static_cast<int8_t>(byte));
+    auto cmp = vceq_s8(data, duplicated);
+    auto converted = vget_lane_u64(vreinterpret_u64_u8(cmp), 0);
+    return {converted & BitMask::MASK};
+  }
+
+  // Find out the lanes equal to EMPTY and return the bitmask
+  // with corresponding bits set.
+  BitMask mask_empty() const {
+    auto duplicated = vdup_n_s8(static_cast<int8_t>(EMPTY));
+    auto cmp = vceq_s8(data, duplicated);
+    auto converted = vget_lane_u64(vreinterpret_u64_u8(cmp), 0);
+    return {converted};
+  }
+
+  // Find out the lanes equal to EMPTY or DELETED (highest bit set) and
+  // return the bitmask with corresponding bits set.
+  BitMask mask_empty_or_deleted() const {
+    auto converted = vget_lane_u64(vreinterpret_u64_s8(data), 0);
+    return {converted & BitMask::MASK};
+  }
+
+  // Find out the lanes representing full cells (highest bit clear) and
+  // return the bitmask with corresponding bits set.
+  BitMask mask_full() const { return mask_empty_or_deleted().invert(); }
+
+  // Performs the following transformation on all bytes in the group:
+  // - `EMPTY => EMPTY`
+  // - `DELETED => EMPTY`
+  // - `FULL => DELETED`
+  Group convert_special_to_empty_and_full_to_deleted() const {
+    auto dup = vdup_n_s8(static_cast<int8_t>(0x80));
+    auto zero = vdup_n_s8(0x00);
+    auto special = vcgt_s8(zero, data);
+    return {vorr_s8(dup, vreinterpret_s8_u8(special))};
+  }
+};
+
+} // namespace __llvm_libc::cpp::swisstable
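Under the 8-bit stride above, vceq_s8 turns every matching lane into 0xFF, the AND with BitMask::MASK keeps only bit 7 of each lane, and recovering the slot index is a count-trailing-zeros divided by 8. A scalar sketch of that index arithmetic, independent of NEON (first_match_index is a hypothetical helper for illustration, not part of this patch):

#include <stdint.h>
#include <stdio.h>

// Scalar model: each 8-bit stride of `mask` holds the comparison result
// for one control byte; only bit 7 of each stride survives the AND with
// 0x8080808080808080.
static int first_match_index(uint64_t mask) {
  // __builtin_ctzll plays the role of unsafe_ctz; mask must be nonzero.
  return __builtin_ctzll(mask) / 8;
}

int main() {
  // Lanes 1 and 6 matched: bits 15 and 55 remain set after masking.
  uint64_t mask = (1ULL << 15) | (1ULL << 55);
  printf("%d\n", first_match_index(mask)); // prints 1
}
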
diff --git a/libc/src/__support/swisstable/common.h b/libc/src/__support/swisstable/common.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/swisstable/common.h
@@ -0,0 +1,89 @@
+//===-- SwissTable Common Definitions ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include <stddef.h>
+#include <stdint.h>
+
+namespace __llvm_libc::cpp::swisstable {
+// Special values of the control byte.
+using CtrlWord = uint8_t;
+using HashWord = uint64_t;
+constexpr static inline CtrlWord EMPTY = 0b11111111u;
+constexpr static inline CtrlWord DELETED = 0b10000000u;
+
+// Implementations of the bitmask.
+// The backend word type may vary depending on different microarchitectures.
+// For example, with x86 SSE2, the bitmask is just the 16-bit unsigned integer
+// corresponding to lanes in a SIMD register.
+template <typename T, T WORD_MASK, size_t WORD_STRIDE> struct BitMaskAdaptor {
+  // A mask covering every bit that may be set in a bitmask word.
+  constexpr static inline T MASK = WORD_MASK;
+  // A stride in the bitmask may use multiple bits.
+  constexpr static inline size_t STRIDE = WORD_STRIDE;
+
+  T word;
+
+  // Invert zeros and ones inside the word.
+  BitMaskAdaptor invert() const { return {static_cast<T>(word ^ WORD_MASK)}; }
+
+  // Operator helper to do bit manipulations.
+  BitMaskAdaptor operator^(T value) const {
+    return {static_cast<T>(this->word ^ value)};
+  }
+
+  // Check if any bit is set inside the word.
+  bool any_bit_set() const { return word != 0; }
+
+  // Count trailing zeros with respect to stride.
+  size_t trailing_zeros() const { return safe_ctz(word) / WORD_STRIDE; }
+
+  // Count trailing zeros with respect to stride. (Assumes the bitmask is
+  // nonzero.)
+  size_t lowest_set_bit_nonzero() const {
+    return unsafe_ctz(word) / WORD_STRIDE;
+  }
+
+  // Count leading zeros with respect to stride.
+  size_t leading_zeros() const { return safe_clz(word) / WORD_STRIDE; }
+};
+
+template <class BitMask> struct IteratableBitMaskAdaptor : public BitMask {
+  // Use the bitmask as an iterator. Update the state and return the current
+  // lowest set bit. To make the bitmask iterable, each stride must contain
+  // zero or exactly one set bit.
+  void remove_lowest_bit() {
+    // Remove the lowest set bit inside the word:
+    //   word              = 011110100 (original value)
+    //   word - 1          = 011110011 (invert all bits up to the lowest set bit)
+    //   word & (word - 1) = 011110000 (value with the lowest set bit cleared)
+    this->word = this->word & (this->word - 1);
+  }
+  using value_type = size_t;
+  using iterator = BitMask;
+  using const_iterator = BitMask;
+  size_t operator*() { return this->lowest_set_bit_nonzero(); }
+  IteratableBitMaskAdaptor &operator++() {
+    this->remove_lowest_bit();
+    return *this;
+  }
+  IteratableBitMaskAdaptor begin() { return *this; }
+  IteratableBitMaskAdaptor end() { return {0}; }
+  bool operator==(const IteratableBitMaskAdaptor &other) {
+    return this->word == other.word;
+  }
+  bool operator!=(const IteratableBitMaskAdaptor &other) {
+    return this->word != other.word;
+  }
+};
+
+} // namespace __llvm_libc::cpp::swisstable
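The begin/end/operator* surface above exists so that probing code can consume a match bitmask with a plain range-for. A minimal scalar sketch of the same protocol, using a stride-1 word like the SSE2 backend produces (self-contained; it mirrors the adaptor rather than instantiating it):

#include <stdint.h>
#include <stdio.h>

int main() {
  // A stride-1 bitmask as the SSE2 backend would produce: bits 2 and 9 set.
  uint16_t word = (1u << 2) | (1u << 9);
  // Equivalent to `for (size_t idx : group.match_byte(...))`:
  // operator* is lowest_set_bit_nonzero, operator++ is remove_lowest_bit,
  // and end() compares equal once the word reaches zero.
  while (word != 0) {
    int idx = __builtin_ctz(word); // lowest_set_bit_nonzero for STRIDE == 1
    printf("match at slot %d\n", idx);
    word &= (word - 1); // remove_lowest_bit
  }
}
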
diff --git a/libc/src/__support/swisstable/dispatch.h b/libc/src/__support/swisstable/dispatch.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/swisstable/dispatch.h
@@ -0,0 +1,16 @@
+//===-- SwissTable Platform Dispatch ----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/__support/architectures.h"
+#if defined(LLVM_LIBC_ARCH_X86_64) && defined(__SSE2__)
+#include "src/__support/swisstable/sse2.h"
+#elif defined(LLVM_LIBC_ARCH_ANY_ARM) && defined(__ARM_NEON) &&               \
+    (__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__)
+#include "src/__support/swisstable/asimd.h"
+#else
+#include "src/__support/swisstable/generic.h"
+#endif
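Consumers are expected to include only dispatch.h; every backend exports the same Group, BitMask, and IteratableBitMask names. A sketch of how a lookup routine might drive this interface (the table layout, the h2 fragment, and find_slot itself are assumptions for illustration, not part of this patch):

#include <stddef.h>
#include <stdint.h>

#include "src/__support/swisstable/dispatch.h"

using namespace __llvm_libc::cpp::swisstable;

// Hypothetical lookup: `ctrl` is the control-byte array of a power-of-two
// sized table, `mask` its size minus one, and `h2` the 7-bit hash fragment
// stored in full control bytes.
size_t find_slot(const CtrlWord *ctrl, size_t mask, uint8_t h2, size_t hash) {
  for (size_t pos = hash & mask;; pos = (pos + sizeof(Group)) & mask) {
    Group group = Group::load(ctrl + pos);
    // Probe every lane whose control byte equals h2.
    for (size_t offset : group.match_byte(h2)) {
      size_t slot = (pos + offset) & mask;
      // A real table would compare keys here and continue on mismatch.
      return slot;
    }
    // An EMPTY lane terminates the probe chain: the key is absent.
    if (group.mask_empty().any_bit_set())
      return SIZE_MAX;
  }
}
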
diff --git a/libc/src/__support/swisstable/generic.h b/libc/src/__support/swisstable/generic.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/swisstable/generic.h
@@ -0,0 +1,149 @@
+//===-- SwissTable Generic Fallback -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/__support/endian.h"
+#include "src/__support/swisstable/common.h"
+#include "src/string/memory_utils/memcpy_implementations.h"
+
+namespace __llvm_libc::cpp::swisstable {
+
+// Helper function to spread a byte across the whole word.
+// Cumulatively, the procedure looks like:
+//   byte                = 0x00000000000000ff
+//   byte | (byte << 8)  = 0x000000000000ffff
+//   byte | (byte << 16) = 0x00000000ffffffff
+//   byte | (byte << 32) = 0xffffffffffffffff
+constexpr static inline uintptr_t repeat(uintptr_t byte) {
+  size_t shift_amount = 8;
+  while (shift_amount < sizeof(uintptr_t) * 8) {
+    byte |= byte << shift_amount;
+    shift_amount <<= 1;
+  }
+  return byte;
+}
+
+using BitMask = BitMaskAdaptor<uintptr_t, repeat(0x80), 8>;
+using IteratableBitMask = IteratableBitMaskAdaptor<BitMask>;
+
+struct Group {
+  uintptr_t data;
+
+  // Load a group of control words from an arbitrary address.
+  static Group load(const void *__restrict addr) {
+    uintptr_t data;
+    inline_memcpy(reinterpret_cast<char *>(&data),
+                  static_cast<const char *>(addr), sizeof(data));
+    return {data};
+  }
+
+  // Load a group of control words from an aligned address.
+  // The address must be aligned to the group size so that the
+  // direct word load below is well-defined.
+  static Group aligned_load(const void *__restrict addr) {
+    uintptr_t data = *static_cast<const uintptr_t *>(addr);
+    return {data};
+  }
+
+  // Store a group of control words to an aligned address.
+  void aligned_store(void *addr) const {
+    *static_cast<uintptr_t *>(addr) = data;
+  }
+
+  // Find out the lanes equal to the given byte and return the bitmask
+  // with corresponding bits set.
+  IteratableBitMask match_byte(uint8_t byte) const {
+    // Given byte = 0x10, suppose the data is:
+    //
+    //       data = [ 0x10 | 0x10 | 0x00 | 0xF1 | ... ]
+    //
+    // First, we compare the byte using the XOR operation:
+    //
+    //       [ 0x10 | 0x10 | 0x10 | 0x10 | ... ]      (0)
+    //     ^ [ 0x10 | 0x10 | 0x00 | 0xF1 | ... ]      (1)
+    //     = [ 0x00 | 0x00 | 0x10 | 0xE1 | ... ]      (2)
+    //
+    // Notice that the equal positions are now 0x00, so if we subtract 0x01
+    // from every byte, those positions have to borrow from the upper
+    // bits (assume no borrow from the hidden parts):
+    //
+    //       [ 0x00 | 0x00 | 0x10 | 0xE1 | ... ]      (2)
+    //     - [ 0x01 | 0x01 | 0x01 | 0x01 | ... ]      (3)
+    //     = [ 0xFE | 0xFF | 0x0F | 0xE0 | ... ]      (4)
+    //
+    // But there may be some bytes whose highest bit is already set after
+    // the XOR operation. To rule out these positions, we AND the result
+    // with the NOT of the XOR result:
+    //
+    //       [ 0xFF | 0xFF | 0xEF | 0x1E | ... ]      (5, NOT (2))
+    //     & [ 0xFE | 0xFF | 0x0F | 0xE0 | ... ]      (4)
+    //     = [ 0xFE | 0xFF | 0x0F | 0x00 | ... ]      (6)
+    //
+    // To make the bitmask iterable, only one bit may be set in each
+    // stride, so we AND each byte with 0x80 and keep only the highest bit:
+    //
+    //       [ 0xFE | 0xFF | 0x0F | 0x00 | ... ]      (6)
+    //     & [ 0x80 | 0x80 | 0x80 | 0x80 | ... ]      (7)
+    //     = [ 0x80 | 0x80 | 0x00 | 0x00 | ... ]      (8)
+    //
+    // However, false positives are possible. For example, if the data is
+    // [ 0x10 | 0x11 | 0x10 | 0xF1 | ... ], the second lane is also
+    // reported as a match. This only happens when a key differs from the
+    // searched byte in nothing but its lowest bit. The claims are:
+    //
+    // - This never happens for `EMPTY` and `DELETED`, only for full
+    //   entries.
+    // - The check for key equality will catch these.
+    // - This only happens if there is at least 1 true match.
+    // - The chance of this happening is very low (< 1% chance per byte).
+    auto cmp = data ^ repeat(byte);
+    auto result =
+        Endian::to_little_endian((cmp - repeat(0x01)) & ~cmp & repeat(0x80));
+    return {result};
+  }
+
+  // Find out the lanes equal to EMPTY and return the bitmask
+  // with corresponding bits set.
+  BitMask mask_empty() const {
+    // If the high bit is set, the byte must be either
+    // 1111_1111 (EMPTY) or 1000_0000 (DELETED), so we can isolate EMPTY
+    // by checking that the top two bits are both 1, i.e. by ANDing the
+    // word with itself shifted left by one.
+    return {Endian::to_little_endian(data & (data << 1) & repeat(0x80))};
+  }
+
+  // Find out the lanes equal to EMPTY or DELETED (highest bit set) and
+  // return the bitmask with corresponding bits set.
+  BitMask mask_empty_or_deleted() const {
+    return {Endian::to_little_endian(data) & repeat(0x80)};
+  }
+
+  // Find out the lanes representing full cells (highest bit clear) and
+  // return the bitmask with corresponding bits set.
+  BitMask mask_full() const { return mask_empty_or_deleted().invert(); }
+
+  // Performs the following transformation on all bytes in the group:
+  // - `EMPTY => EMPTY`
+  // - `DELETED => EMPTY`
+  // - `FULL => DELETED`
+  Group convert_special_to_empty_and_full_to_deleted() const {
+    // Set the highest bit only for positions whose highest bit was not
+    // set before, i.e. the full positions:
+    //
+    //   data  = [ 00000000 | 11111111 | 10000000 | ... ]
+    //   ~data = [ 11111111 | 00000000 | 01111111 | ... ]
+    //   full  = [ 10000000 | 00000000 | 00000000 | ... ]
+    auto full = (~data) & repeat(0x80);
+
+    // Invert the bits and convert `01111111` back to `10000000` by
+    // adding `1` at that bit; `full >> 7` puts a 1 in the lowest bit of
+    // exactly the full bytes, and the carry will not propagate outside
+    // the byte:
+    //   ~full     = [ 01111111 | 11111111 | 11111111 | ... ]
+    //   full >> 7 = [ 00000001 | 00000000 | 00000000 | ... ]
+    //   result    = [ 10000000 | 11111111 | 11111111 | ... ]
+    return {~full + (full >> 7)};
+  }
+};
+
+} // namespace __llvm_libc::cpp::swisstable
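The SWAR trick in match_byte is easy to validate in isolation. A self-contained reproduction on a little-endian uint64_t, mirroring the worked example in the comment (repeat is re-declared locally so the snippet compiles on its own):

#include <stdint.h>
#include <stdio.h>

static uint64_t repeat(uint64_t byte) {
  for (unsigned s = 8; s < 64; s <<= 1)
    byte |= byte << s;
  return byte;
}

int main() {
  // Control bytes 0x10, 0x10, 0x00, 0xF1 in slots 0..3 (little endian).
  uint64_t data = 0x00000000F1001010ULL;
  uint64_t cmp = data ^ repeat(0x10);
  uint64_t mask = (cmp - repeat(0x01)) & ~cmp & repeat(0x80);
  // Bits 7 and 15 are set: slots 0 and 1 match.
  for (uint64_t m = mask; m != 0; m &= m - 1)
    printf("match at slot %d\n", (int)(__builtin_ctzll(m) / 8));
}
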
diff --git a/libc/src/__support/swisstable/sse2.h b/libc/src/__support/swisstable/sse2.h
new file mode 100644
--- /dev/null
+++ b/libc/src/__support/swisstable/sse2.h
@@ -0,0 +1,78 @@
+//===-- SwissTable SSE2 Specialization --------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/swisstable/common.h"
+#include <immintrin.h>
+
+namespace __llvm_libc::cpp::swisstable {
+
+// With SSE2, every bitmask is iterable, because `_mm_movemask_epi8`
+// encodes each lane with a single bit.
+using BitMask = BitMaskAdaptor<uint16_t, 0xffffu, 1>;
+using IteratableBitMask = IteratableBitMaskAdaptor<BitMask>;
+
+struct Group {
+  __m128i data;
+
+  // Load a group of control words from an arbitrary address.
+  static Group load(const void *__restrict addr) {
+    return {_mm_loadu_si128(static_cast<const __m128i *>(addr))};
+  }
+
+  // Load a group of control words from an aligned address.
+  static Group aligned_load(const void *__restrict addr) {
+    return {_mm_load_si128(static_cast<const __m128i *>(addr))};
+  }
+
+  // Store a group of control words to an aligned address.
+  void aligned_store(void *addr) const {
+    _mm_store_si128(static_cast<__m128i *>(addr), data);
+  }
+
+  // Find out the lanes equal to the given byte and return the bitmask
+  // with corresponding bits set.
+  IteratableBitMask match_byte(uint8_t byte) const {
+    auto cmp = _mm_cmpeq_epi8(data, _mm_set1_epi8(byte));
+    auto bitmask = static_cast<uint16_t>(_mm_movemask_epi8(cmp));
+    return {bitmask};
+  }
+
+  // Find out the lanes equal to EMPTY and return the bitmask
+  // with corresponding bits set.
+  BitMask mask_empty() const { return match_byte(EMPTY); }
+
+  // Find out the lanes equal to EMPTY or DELETED (highest bit set) and
+  // return the bitmask with corresponding bits set.
+  BitMask mask_empty_or_deleted() const {
+    auto bitmask = static_cast<uint16_t>(_mm_movemask_epi8(data));
+    return {bitmask};
+  }
+
+  // Find out the lanes representing full cells (highest bit clear) and
+  // return the bitmask with corresponding bits set.
+  BitMask mask_full() const { return mask_empty_or_deleted().invert(); }
+
+  // Performs the following transformation on all bytes in the group:
+  // - `EMPTY => EMPTY`
+  // - `DELETED => EMPTY`
+  // - `FULL => DELETED`
+  Group convert_special_to_empty_and_full_to_deleted() const {
+    // Recall that EMPTY and DELETED are distinguished from full cells by
+    // their highest bit, which makes them negative when considered as
+    // signed integers, while full cells are non-negative. So we first
+    // identify the strictly negative lanes, then OR the comparison result
+    // with 0x80: special lanes (all ones) remain EMPTY, while full lanes
+    // (all zeros) become DELETED.
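+    //
+    // A worked example on three lanes (shown as unsigned bytes):
+    //   data      = [ 0x41 | 0xFF | 0x80 ]  (FULL, EMPTY, DELETED)
+    //   special   = [ 0x00 | 0xFF | 0xFF ]  (0 > lane, as signed bytes)
+    //   converted = [ 0x80 | 0xFF | 0xFF ]  (DELETED, EMPTY, EMPTY)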
+    __m128i zero = _mm_setzero_si128();
+    __m128i special = _mm_cmpgt_epi8(zero, data);
+    __m128i converted =
+        _mm_or_si128(special, _mm_set1_epi8(static_cast<char>(0x80)));
+    return {converted};
+  }
+};
+
+} // namespace __llvm_libc::cpp::swisstable
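Since all three specializations must agree on semantics, the SSE2 group operations can be sanity-checked with a small host program compiled with -msse2 (a standalone sketch; the expected mask values follow from the control bytes defined in common.h):

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

int main() {
  // One EMPTY (0xFF), one DELETED (0x80), fourteen FULL (0x23) lanes.
  alignas(16) uint8_t ctrl[16] = {0xFF, 0x80, 0x23, 0x23, 0x23, 0x23,
                                  0x23, 0x23, 0x23, 0x23, 0x23, 0x23,
                                  0x23, 0x23, 0x23, 0x23};
  __m128i data = _mm_load_si128(reinterpret_cast<const __m128i *>(ctrl));

  // match_byte(0x23): movemask keeps bit 7 of each comparison lane.
  __m128i cmp = _mm_cmpeq_epi8(data, _mm_set1_epi8(0x23));
  printf("full matches:     %04x\n", _mm_movemask_epi8(cmp)); // fffc

  // mask_empty_or_deleted(): lanes whose highest bit is set.
  printf("empty or deleted: %04x\n", _mm_movemask_epi8(data)); // 0003
}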