diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -129,6 +129,16 @@ .uint ) +add_header_library( + wyhash + HDRS + wyhash.h + DEPENDS + .common + .uint128 + libc.src.string.memcpy +) + add_subdirectory(FPUtil) add_subdirectory(OSUtil) add_subdirectory(StringUtil) diff --git a/libc/src/__support/wyhash.h b/libc/src/__support/wyhash.h new file mode 100644 --- /dev/null +++ b/libc/src/__support/wyhash.h @@ -0,0 +1,107 @@ +//===-- A 64-bit Hash Function ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/UInt128.h" +#include "src/__support/common.h" +#include "src/__support/endian.h" +#include "src/string/memory_utils/memcpy_implementations.h" + +// WyHash comes from https://github.com/wangyi-fudan/wyhash/ (which has been +// release into public domain). It is also the default hash function +// of Go, Nim and Zig. +// +// According to SMHasher's results, it is one of the fastest hash functions +// without primary quality issues. This hash function can be used with +// swisstable as it has the Avalanche effect to encode the data into full +// 64-bits; thus the table can effectively utilize two level hash technique +// to effectively locate candidate entries without too many false positives. +namespace __llvm_libc::cpp::wyhash { +static constexpr inline uint64_t DEFAULT_SECRET[4] = { + 0xa0761d6478bd642full, 0xe7037ed1a0b428dbull, 0x8ebc6af09c88c6e3ull, + 0x589965cc75374cc3ull}; +template class WyHash { + static void multiply(uint64_t &a, uint64_t &b) { + UInt128 data{a}; + data *= b; + if constexpr (EntropyProtection) { + a ^= static_cast(data); + b ^= static_cast(data >> 64); + } else { + a = static_cast(data); + b = static_cast(data >> 64); + } + } + static uint64_t mix(uint64_t a, uint64_t b) { + multiply(a, b); + return a ^ b; + } + template static uint64_t read(const uint8_t *p) { + uint64_t value; + inline_memcpy(reinterpret_cast(&value), + reinterpret_cast(p), N); + return Endian::to_little_endian(value); + } + + static uint64_t read3(const uint8_t *p, size_t k) { + auto a = static_cast(p[0]) << 16; + auto b = static_cast(p[k / 2]) << 8; + auto c = static_cast(p[k - 1]); + return a | b | c; + } + + static uint64_t wyhash(const void *key, size_t length, uint64_t seed, + const uint64_t *secret) { + const uint8_t *p = static_cast(key); + seed ^= secret[0]; + uint64_t a = 0, b = 0; + if (likely(length <= 16)) { + if (likely(length >= 4)) { + a = (read<4>(p) << 32) | read<4>(p + ((length >> 3) << 2)); + b = (read<4>(p + length - 4) << 32) | + read<4>(p + length - 4 - ((length >> 3) << 2)); + } else if (likely(length > 0)) { + a = read3(p, length); + b = 0; + } + } else { + size_t i = length; + if (likely(i > 48)) { + uint64_t s1 = seed, s2 = seed; + do { + seed = mix(read<8>(p) ^ secret[1], read<8>(p + 8) ^ seed); + s1 = mix(read<8>(p + 16) ^ secret[2], read<8>(p + 24) ^ s1); + s2 = mix(read<8>(p + 32) ^ secret[3], read<8>(p + 40) ^ s2); + p += 48; + i -= 48; + } while (likely(i > 48)); + seed ^= s1 ^ s2; + } + while (unlikely(i > 16)) { + seed = mix(read<8>(p) ^ secret[1], read<8>(p + 8) ^ seed); + i -= 16; + p += 16; + } + a = read<8>(p + i - 16); + b = read<8>(p + i - 8); + } + // Golang's implementation mixes a different secret here, but we keep it the + // same as the original wyhash to make it easier to examine the correctness + // via testing known values. + return mix(secret[1] ^ length, mix(a ^ secret[1], b ^ seed)); + } + +public: + static uint64_t hash(const void *key, size_t length, uint64_t seed) { + return wyhash(key, length, seed, DEFAULT_SECRET); + } +}; + +// Follow the practice in Golang to disable low-entropy protection by default. +using DefaultHash = WyHash; + +} // namespace __llvm_libc::cpp::wyhash diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -20,6 +20,17 @@ libc.src.__support.common ) +add_libc_unittest( + wyhash_test + SUITE + libc_support_unittests + SRCS + wyhash_test.cpp + DEPENDS + libc.src.__support.wyhash + libc.src.string.strlen +) + add_libc_unittest( high_precision_decimal_test SUITE diff --git a/libc/test/src/__support/wyhash_test.cpp b/libc/test/src/__support/wyhash_test.cpp new file mode 100644 --- /dev/null +++ b/libc/test/src/__support/wyhash_test.cpp @@ -0,0 +1,40 @@ +//===-- Unittests for wyhash ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/__support/wyhash.h" +#include "src/string/strlen.h" +#include "utils/UnitTest/LibcTest.h" + +// Examination Data from SMHasher +TEST(LlvmLibcWyHashTest, DefaultValues) { + using namespace __llvm_libc::cpp::wyhash; + // clang-format off + const char *data[] = { + "", + "a", + "abc", + "message digest", + "abcdefghijklmnopqrstuvwxyz", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", + "12345678901234567890123456789012345678901234567890123456789012345678901234567890" + }; + uint64_t hash[] = { + 0x42bc986dc5eec4d3, + 0x84508dc903c31551, + 0xbc54887cfc9ecb1, + 0x6e2ff3298208a67c, + 0x9a64e42e897195b9, + 0x9199383239c32554, + 0x7c1ccf6bba30f5a5, + }; + // clang-format on + for (size_t i = 0; i < 7; ++i) { + ASSERT_EQ(DefaultHash::hash(data[i], __llvm_libc::strlen(data[i]), i), + hash[i]); + } +}