diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -152,6 +152,15 @@ .uint ) +add_header_library( + wyhash + HDRS + wyhash.h + DEPENDS + .common + .uint128 +) + add_subdirectory(FPUtil) add_subdirectory(OSUtil) add_subdirectory(StringUtil) diff --git a/libc/src/__support/wyhash.h b/libc/src/__support/wyhash.h new file mode 100644 --- /dev/null +++ b/libc/src/__support/wyhash.h @@ -0,0 +1,108 @@ +//===-- A 64-bit Hash Function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/UInt128.h" +#include "src/__support/common.h" + +// WyHash comes from https://github.com/wangyi-fudan/wyhash/ (which has been +// released into public domain). It is also the default hash function +// of Go, Nim and Zig. +// +// According to SMHasher's results, it is one of the fastest hash functions +// without primary quality issues. This hash function can be used with +// swisstable as it has the Avalanche effect to encode the data into full +// 64-bits; thus the table can effectively utilize two level hash technique +// to effectively locate candidate entries without too many false positives. +namespace __llvm_libc::internal::wyhash { +// default secret parameters from WyHash +// commit hash: ea3b25e1aef55d90f707c3a292eeb9162e2615d8 +static constexpr inline uint64_t DEFAULT_SECRET[4] = { + 0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, + 0x589965cc75374cc3ull}; +template class WyHash { + static void multiply(uint64_t &a, uint64_t &b) { + UInt128 data{a}; + data *= b; + if constexpr (EntropyProtection) { + a ^= static_cast(data); + b ^= static_cast(data >> 64); + } else { + a = static_cast(data); + b = static_cast(data >> 64); + } + } + static uint64_t mix(uint64_t a, uint64_t b) { + multiply(a, b); + return a ^ b; + } + template static uint64_t read(const uint8_t *p) { + uint64_t value = 0; + for (size_t i = 0; i < N; ++i) { + value |= (static_cast(p[i]) << (i * 8)); + } + return value; + } + + static uint64_t read3(const uint8_t *p, size_t k) { + auto a = static_cast(p[0]) << 16; + auto b = static_cast(p[k / 2]) << 8; + auto c = static_cast(p[k - 1]); + return a | b | c; + } + + static uint64_t wyhash(const void *key, size_t length, uint64_t seed, + const uint64_t *secret) { + const uint8_t *p = static_cast(key); + seed ^= mix(seed ^ secret[0], secret[1]); + uint64_t a = 0, b = 0; + if (likely(length <= 16)) { + if (likely(length >= 4)) { + a = (read<4>(p) << 32) | read<4>(p + ((length >> 3) << 2)); + b = (read<4>(p + length - 4) << 32) | + read<4>(p + length - 4 - ((length >> 3) << 2)); + } else if (likely(length > 0)) { + a = read3(p, length); + b = 0; + } + } else { + size_t i = length; + if (likely(i > 48)) { + uint64_t s1 = seed, s2 = seed; + do { + seed = mix(read<8>(p) ^ secret[1], read<8>(p + 8) ^ seed); + s1 = mix(read<8>(p + 16) ^ secret[2], read<8>(p + 24) ^ s1); + s2 = mix(read<8>(p + 32) ^ secret[3], read<8>(p + 40) ^ s2); + p += 48; + i -= 48; + } while (likely(i > 48)); + seed ^= s1 ^ s2; + } + while (unlikely(i > 16)) { + seed = mix(read<8>(p) ^ secret[1], read<8>(p + 8) ^ seed); + i -= 16; + p += 16; + } + a = read<8>(p + i - 16); + b = read<8>(p + i - 8); + } + a ^= secret[1]; + b ^= seed; + multiply(a, b); + return mix(a ^ secret[0] ^ length, b ^ secret[1]); + } + +public: + static uint64_t hash(const void *key, size_t length, uint64_t seed) { + return wyhash(key, length, seed, DEFAULT_SECRET); + } +}; + +// Follow the practice in Golang to disable low-entropy protection by default. +using DefaultHash = WyHash; + +} // namespace __llvm_libc::internal::wyhash diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -20,6 +20,17 @@ libc.src.__support.common ) +add_libc_unittest( + wyhash_test + SUITE + libc_support_unittests + SRCS + wyhash_test.cpp + DEPENDS + libc.src.__support.wyhash + libc.src.string.strlen +) + add_libc_unittest( high_precision_decimal_test SUITE diff --git a/libc/test/src/__support/wyhash_test.cpp b/libc/test/src/__support/wyhash_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/__support/wyhash_test.cpp @@ -0,0 +1,42 @@ +//===-- Unittests for wyhash ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/wyhash.h" +#include "src/string/strlen.h" +#include "utils/UnitTest/LibcTest.h" + +// Sanity check +TEST(LlvmLibcWyHashTest, DefaultValues) { + using namespace __llvm_libc::internal::wyhash; + // clang-format off + const char *data[] = { + "", + "a", + "abc", + "message digest", + "abcdefghijklmnopqrstuvwxyz", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", + "12345678901234567890123456789012345678901234567890123456789012345678901234567890" + }; + // generated with original WyHash implementation (ea3b25e1aef55d90f707c3a292eeb9162e2615d8) + uint64_t hash[] = { + 0x0409638ee2bde459, + 0xa8412d091b5fe0a9, + 0x32dd92e4b2915153, + 0x8619124089a3a16b, + 0x7a43afb61d7f5f40, + 0xff42329b90e50d58, + 0xc39cab13b115aad3, + + }; + // clang-format on + for (size_t i = 0; i < 7; ++i) { + ASSERT_EQ(DefaultHash::hash(data[i], __llvm_libc::strlen(data[i]), i), + hash[i]); + } +}