diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -152,6 +152,15 @@ .uint ) +add_header_library( + wyhash + HDRS + wyhash.h + DEPENDS + .common + .uint128 +) + add_subdirectory(FPUtil) add_subdirectory(OSUtil) add_subdirectory(StringUtil) diff --git a/libc/src/__support/wyhash.h b/libc/src/__support/wyhash.h new file mode 100644 --- /dev/null +++ b/libc/src/__support/wyhash.h @@ -0,0 +1,108 @@ +//===-- A 64-bit Hash Function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/UInt128.h" +#include "src/__support/common.h" + +// WyHash comes from https://github.com/wangyi-fudan/wyhash/ (which has been +// released into public domain). It is also the default hash function +// of Go, Nim and Zig. +// +// According to SMHasher's results, it is one of the fastest hash functions +// without primary quality issues. This hash function can be used with +// swisstable as it has the Avalanche effect to encode the data into full +// 64-bits; thus the table can effectively utilize two level hash technique +// to effectively locate candidate entries without too many false positives. +namespace __llvm_libc::internal::wyhash { +// default secret parameters from WyHash +// commit hash: ea3b25e1aef55d90f707c3a292eeb9162e2615d8 +static constexpr inline uint64_t DEFAULT_SECRET[4] = { + 0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, + 0x589965cc75374cc3ull}; +template class WyHash { + static void multiply(uint64_t &a, uint64_t &b) { + UInt128 data{a}; + data *= b; + if constexpr (EntropyProtection) { + a ^= static_cast(data); + b ^= static_cast(data >> 64); + } else { + a = static_cast(data); + b = static_cast(data >> 64); + } + } + static uint64_t mix(uint64_t a, uint64_t b) { + multiply(a, b); + return a ^ b; + } + template static uint64_t read(const uint8_t *p) { + uint64_t value = 0; + for (size_t i = 0; i < N; ++i) { + value |= (static_cast(p[i]) << (i * 8)); + } + return value; + } + + static uint64_t read3(const uint8_t *p, size_t k) { + auto a = static_cast(p[0]) << 16; + auto b = static_cast(p[k / 2]) << 8; + auto c = static_cast(p[k - 1]); + return a | b | c; + } + + static uint64_t wyhash(const void *key, size_t length, uint64_t seed, + const uint64_t *secret) { + const uint8_t *p = static_cast(key); + seed ^= mix(seed ^ secret[0], secret[1]); + uint64_t a = 0, b = 0; + if (likely(length <= 16)) { + if (likely(length >= 4)) { + a = (read<4>(p) << 32) | read<4>(p + ((length >> 3) << 2)); + b = (read<4>(p + length - 4) << 32) | + read<4>(p + length - 4 - ((length >> 3) << 2)); + } else if (likely(length > 0)) { + a = read3(p, length); + b = 0; + } + } else { + size_t i = length; + if (likely(i > 48)) { + uint64_t s1 = seed, s2 = seed; + do { + seed = mix(read<8>(p) ^ secret[1], read<8>(p + 8) ^ seed); + s1 = mix(read<8>(p + 16) ^ secret[2], read<8>(p + 24) ^ s1); + s2 = mix(read<8>(p + 32) ^ secret[3], read<8>(p + 40) ^ s2); + p += 48; + i -= 48; + } while (likely(i > 48)); + seed ^= s1 ^ s2; + } + while (unlikely(i > 16)) { + seed = mix(read<8>(p) ^ secret[1], read<8>(p + 8) ^ seed); + i -= 16; + p += 16; + } + a = read<8>(p + i - 16); + b = read<8>(p + i - 8); + } + a ^= secret[1]; + b ^= seed; + multiply(a, b); + return mix(a ^ secret[0] ^ length, b ^ secret[1]); + } + +public: + static uint64_t hash(const void *key, size_t length, uint64_t seed) { + return wyhash(key, length, seed, DEFAULT_SECRET); + } +}; + +// Follow the practice in Golang to disable low-entropy protection by default. +using DefaultHash = WyHash; + +} // namespace __llvm_libc::internal::wyhash diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -20,6 +20,19 @@ libc.src.__support.common ) +add_libc_unittest( + wyhash_test + SUITE + libc_support_unittests + SRCS + wyhash_test.cpp + DEPENDS + libc.src.__support.wyhash + libc.src.string.strlen + libc.src.string.memset + libc.src.stdlib.rand +) + add_libc_unittest( high_precision_decimal_test SUITE diff --git a/libc/test/src/__support/wyhash_test.cpp b/libc/test/src/__support/wyhash_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/__support/wyhash_test.cpp @@ -0,0 +1,151 @@ +//===-- Unittests for wyhash ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/wyhash.h" +#include "src/stdlib/rand.h" +#include "src/string/memset.h" +#include "src/string/strlen.h" +#include "utils/UnitTest/LibcTest.h" + +#if defined __has_builtin +#if __has_builtin(__builtin_popcountll) +#define LLVM_LIBC_WYHASH_TEST_HAS_BUILTIN_POPCOUNTLL +#endif +#endif + +static inline size_t random_string(char *buf) { + static constexpr size_t LOWER_LIMIT = 16; + static constexpr size_t UPPER_LIMIT = 64; + size_t length = + LOWER_LIMIT + __llvm_libc::rand() % (UPPER_LIMIT - LOWER_LIMIT); + // 1 << (8 + 16) ~ 1 << (8 + 64) should be of good randomness + for (size_t i = 0; i < length; ++i) { + buf[i] = __llvm_libc::rand(); + } + return length; +} + +static inline size_t hamming_distance(uint64_t x, uint64_t y) { +#if defined LLVM_LIBC_WYHASH_TEST_HAS_BUILTIN_POPCOUNTLL + return __builtin_popcountll(x ^ y); +#else + size_t sum = 0; + uint64_t current = x ^ y; + while (current != 0) { + sum += current & 0b1; + current >>= 1; + } + return sum; +#endif +} + +// We expect the hash value to be of low bias at each byte +// so that we can select a section of hash value as a second-level hash +TEST(LlvmLibcWyHashTest, LowBias) { + using namespace __llvm_libc::internal::wyhash; + static constexpr size_t SLOT_NUM = 1u << 8u; + static constexpr size_t SCALE = 1000; + static constexpr uint64_t SEED = 0x0123'4567'89AB'CDEFull; + // Iterate through each byte + for (size_t i = 0; i < sizeof(uint64_t) * 8; ++i) { + size_t counter[SLOT_NUM]; + __llvm_libc::memset(counter, 0, sizeof(counter)); + size_t total = SCALE * SLOT_NUM; + while (total--) { + char buf[64]; + size_t length = random_string(buf); + uint64_t hash = DefaultHash::hash(buf, length, SEED); + // Get the section corresponds to the byte being examined + counter[(hash >> (i * 8)) & 0xFF]++; + } + + // Sum up to get the variance + size_t sum = 0; + for (size_t i = 0; i < SLOT_NUM; ++i) + sum += static_cast((counter[i] - SCALE) * (counter[i] - SCALE)); + + // We hope that averaged bias at each slot is less than 10%. + // Use square to avoid SQRT. + ASSERT_LE(sum / SLOT_NUM, (SCALE / 10) * (SCALE / 10)); + } +} + +// In real world, user may insert similar data into a hash set. +// To make the hash function robust, we hope a single-bit change is +// enough to change the hash value by a large extend. +TEST(LlvmLibcWyHashTest, AvalancheRandomString) { + using namespace __llvm_libc::internal::wyhash; + static constexpr size_t TEST_GROUP_NUM = 10000; + static constexpr size_t TEST_GROUP_SIZE = 100; + static constexpr uint64_t SEED = 0x89AB'CDEF'0123'4567ull; + for (size_t i = 0; i < TEST_GROUP_NUM; ++i) { + char buf[64]; + size_t length = random_string(buf); + size_t original_hash = DefaultHash::hash(buf, length, SEED); + for (size_t j = 0; j < TEST_GROUP_SIZE; ++j) { + // Randomly select a byte + size_t byte = __llvm_libc::rand() % length; + // Randomly select a bit + size_t bit = __llvm_libc::rand() % 8; + // Alter the bit + char backup = buf[byte]; + buf[byte] ^= (1u << bit); + size_t new_hash = DefaultHash::hash(buf, length, SEED); + // The hamming distance of the hash is larger than 20% of the hash volume. + ASSERT_GE(hamming_distance(original_hash, new_hash), + (8 * sizeof(uint64_t)) * 2 / 10); + // Recover the string + buf[byte] = backup; + } + } +} + +TEST(LlvmLibcWyHashTest, AvalancheSmallValue) { + using namespace __llvm_libc::internal::wyhash; + static constexpr uint64_t SEED = 0xCDEF'89AB'4567'0123ull; + for (size_t i = 0; i < 4096; ++i) { + for (size_t j = i + 1; j < 4096; ++j) { + size_t hash_i = DefaultHash::hash(&i, sizeof(i), SEED); + size_t hash_j = DefaultHash::hash(&j, sizeof(j), SEED); + // The hamming distance of the hash is larger than 20% of the hash volume. + ASSERT_GE(hamming_distance(hash_i, hash_j), + (8 * sizeof(uint64_t)) * 2 / 10); + } + } +} + +// Sanity check +TEST(LlvmLibcWyHashTest, DefaultValues) { + using namespace __llvm_libc::internal::wyhash; + // clang-format off + const char *data[] = { + "", + "a", + "abc", + "message digest", + "abcdefghijklmnopqrstuvwxyz", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", + "12345678901234567890123456789012345678901234567890123456789012345678901234567890" + }; + // generated with original WyHash implementation (ea3b25e1aef55d90f707c3a292eeb9162e2615d8) + uint64_t hash[] = { + 0x0409638ee2bde459, + 0xa8412d091b5fe0a9, + 0x32dd92e4b2915153, + 0x8619124089a3a16b, + 0x7a43afb61d7f5f40, + 0xff42329b90e50d58, + 0xc39cab13b115aad3, + + }; + // clang-format on + for (size_t i = 0; i < 7; ++i) { + ASSERT_EQ(DefaultHash::hash(data[i], __llvm_libc::strlen(data[i]), i), + hash[i]); + } +}