Index: include/llvm/ADT/Hashing.h =================================================================== --- include/llvm/ADT/Hashing.h +++ include/llvm/ADT/Hashing.h @@ -34,6 +34,9 @@ // a single hash_code for their object. They should only logically be used // within the implementation of a 'hash_value' routine or similar context. // +// -- 'hash_stream' is a class for combining a stream of data into a single +// hash_code. +// // Note that 'hash_combine_range' contains very special logic for hashing // a contiguous array of integers or pointers. This logic is *extremely* fast, // on a modern Intel "Gainestown" Xeon (Nehalem uarch) @2.2 GHz, these were @@ -626,6 +629,170 @@ } // namespace detail } // namespace hashing +/// \brief Combines the data that was written to it into a single hash code. +/// +/// First, this class needs to be provided with all data that should be hashed +/// by calling the write() methods and related operators. Afterwards, +/// compute_hash() should be called to retrieve the hash code for the data. +/// +/// Note that this class can produce different hash codes than hash_value or +/// hash_combine for the same input data. Also, the generated hash codes only +/// depend on the data itself and not on the way the data was provided (e.g. +/// one write call with a huge chunk of data and multiple write calls with small +/// chunks of the same data will produce the same hash code). +class hash_stream { + + /// The block size that the hash_state requires. + static const size_t block_size = 64; + /// Buffer that stores the next block of data that should be hashed. + char block_buffer[block_size]; + + /// The total number of bytes written to this hash_stream instance so far. + size_t length; + /// The internal hash_state which does the actual hashing. + hashing::detail::hash_state state; + /// The seed that is used when hashing. + size_t seed; + + /// \brief Returns the next free position in the block_buffer. + size_t next_offset_in_buffer() const { + return length % block_size; + } + + /// \brief Forwards the filled block_buffer to the hash function. + void hash_buffer() { + // Check if we are currently forwarding the first block of data. + // If no, just forward the buffer to the hash_state. + if (length != block_size) + return state.mix(block_buffer); + // Otherwise we need to create the hash_state from this first block of data. + state = hashing::detail::hash_state::create(block_buffer, seed); + } + + /// \brief Appends the given data to the end of the block_buffer and hashes + /// the block_buffer if it is full. + /// + /// Note that the given data must fit into the block_buffer. + void append_to_buffer(const char *c, size_t size) { + size_t offset = next_offset_in_buffer(); + assert(offset + size <= block_size); + std::memcpy(block_buffer + offset, c, size); + + length += size; + if (next_offset_in_buffer() == 0) + hash_buffer(); + } + +public: + hash_stream() { + reset(); + } + + /// \brief Clears the internal state of this hash_stream so that it can be + /// reused to calculate a new hash code. + void reset() { + length = 0; + seed = hashing::detail::get_execution_seed(); + } + + /// \brief Computes the hash code for the data that was provided so far. + /// \return The calculated hash value. + /// + /// This function will also reset this hash_stream instance. + hash_code compute_hash() { + // If hash_stream got not more than one block of data, we need to use + // hash_short. + if (length <= block_size) { + hash_code result(hashing::detail::hash_short(block_buffer, length, seed)); + reset(); + return result; + } + + // If the buffer is partially filled, the remaining data in the buffer first + // needs to be hashed. + if (next_offset_in_buffer() != 0) + hash_buffer(); + + hash_code result(state.finalize(length)); + reset(); + return result; + } + + /// \brief Adds data that should be used to compute the final hash value. + /// \param c The pointer to the start of the array. + /// \param size The length of the array \p c in bytes. + /// \return This hash_stream instance for convenience purposes. + hash_stream &write(const char *c, size_t size) { + // First, fill the rest of the current block_buffer if there is enough data. + // If there isn't any data left afterwards, this write call is done. If the + // block_buffer gets filled, it will be directly hashed and emptied. + size_t remaining_size = block_size - next_offset_in_buffer(); + append_to_buffer(c, std::min(size, remaining_size)); + if (size <= remaining_size) + return *this; + + // Now that the buffer is empty, we consider the rest of the array as a list + // of 64 byte blocks and some remaining bytes at the end. + + // All but the last 64 byte block can be directly hashed. They don't need to + // be written into the block_buffer as we know that they would be + // overwritten by the very next 64 byte block in this write call. + size_t position = remaining_size; + for (; (position + block_size * 2) <= size; position += block_size) { + state.mix(c + position); + length += block_size; + } + // The last 64 byte block needs to be in the buffer as there is a chance + // that it won't be overwritten in the future and could influence the final + // hash code. + if (position + block_size <= size) { + append_to_buffer(c + position, block_size); + position += block_size; + } + + // If there are any remaining bytes at the end, append them to the buffer + // so that they will be hashed later. + if (position < size) { + append_to_buffer(c + position, size - position); + } + + return *this; + } + + /// \brief Writes each trivially copyable object in the given iterator range. + template + hash_stream &write(InputIteratorT first, InputIteratorT last) { + while (first != last) { + *this << *first; + ++first; + } + return *this; + } + + /// \brief Writes the contents of a string. + template + hash_stream &operator<<(const std::basic_string &string) { + static_assert(std::is_trivial::value, + "Can only write strings of trivially copyable objects"); + return write(reinterpret_cast(string.data()), + string.size() * sizeof(T)); + } + + /// \brief Writes the contents of a string. + hash_stream &operator<<(StringRef string) { + return write(string.data(), string.size()); + } + + /// \brief Writes a trivially copyable object. + template + hash_stream &operator<<(T const &value) { + static_assert(std::is_trivial::value, + "Can only write strings of trivially copyable objects"); + return write(reinterpret_cast(&value), sizeof(T)); + } + +}; + // Declared and documented above, but defined here so that any of the hashing // infrastructure is available. template Index: unittests/ADT/HashingTest.cpp =================================================================== --- unittests/ADT/HashingTest.cpp +++ unittests/ADT/HashingTest.cpp @@ -119,6 +119,59 @@ hash_value(ws.substr(1, ws.size() - 2))); } +TEST(HashingTest, HashStream) { + hash_stream streamA, streamB; + + // Test hashing of short strings + streamA << std::string("Hello World! ") + << std::string("Another Hello World!"); + streamB << std::string("Hello World! Another Hello World!"); + EXPECT_EQ(streamA.compute_hash(), streamB.compute_hash()); + + streamA << std::string("Hello World! ") + << std::string("Another Hello World!"); + streamB << std::string("Hello World! Different Text!"); + EXPECT_NE(streamA.compute_hash(), streamB.compute_hash()); + + // Test hashing of long strings that require multiple 64 byte blocks. + streamA << std::string("Lorem ipsum dolor sit amet, consectetur adipiscing " + "elit. Donec tempus vestibulum metus, a vehicula enim " + "placerat at. Sed commodo cursus posuere."); + streamB << std::string("Lorem ipsum dolor sit amet, consectetur adipiscing ") + << std::string("elit. Donec tempus vestibulum metus, a vehicula enim") + << std::string(" placerat at. Sed commodo cursus posuere."); + EXPECT_EQ(streamA.compute_hash(), streamB.compute_hash()); + + // Test that hashing a string and hashing each character once is the same. + streamA << std::string("ab"); + streamB << 'a' << 'b'; + EXPECT_EQ(streamA.compute_hash(), streamB.compute_hash()); + + // Test hashing of integers. + streamA << 56; + streamB << 55; + EXPECT_NE(streamA.compute_hash(), streamB.compute_hash()); + + streamA << 55; + streamB << 55; + EXPECT_EQ(streamA.compute_hash(), streamB.compute_hash()); + + // Test that data type size matters for hash_stream. + streamA << 56l; + streamB << 55; + EXPECT_NE(streamA.compute_hash(), streamB.compute_hash()); + + // Test hashing with iterator pairs. + std::vector data = {1, 2, 3, 4}; + + streamA.write(data.begin(), data.end()); + + streamB.write(data.begin(), data.begin() + 2); + streamB.write(data.begin() + 2, data.end()); + EXPECT_EQ(streamA.compute_hash(), streamB.compute_hash()); + +} + template T *begin(T (&arr)[N]) { return arr; } template T *end(T (&arr)[N]) { return arr + N; }