diff --git a/compiler-rt/lib/sanitizer_common/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/CMakeLists.txt --- a/compiler-rt/lib/sanitizer_common/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/CMakeLists.txt @@ -155,6 +155,7 @@ sanitizer_linux.h sanitizer_list.h sanitizer_local_address_space_view.h + sanitizer_lzw.h sanitizer_mac.h sanitizer_malloc_mac.inc sanitizer_mutex.h diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h b/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h new file mode 100644 --- /dev/null +++ b/compiler-rt/lib/sanitizer_common/sanitizer_lzw.h @@ -0,0 +1,159 @@ +//===-- sanitizer_lzw.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Lempel–Ziv–Welch encoding/decoding +// +//===----------------------------------------------------------------------===// + +#ifndef SANITIZER_LZW_H +#define SANITIZER_LZW_H + +#include "sanitizer_dense_map.h" + +namespace __sanitizer { + +using LzwCodeType = u32; + +template +ItOut LzwEncode(ItIn begin, ItIn end, ItOut out) { + using Substring = + detail::DenseMapPair; + + // Sentinel value for substrings of len 1. + static constexpr LzwCodeType kNoPrefix = + Min(DenseMapInfo::getEmptyKey().first, + DenseMapInfo::getTombstoneKey().first) - + 1; + DenseMap prefix_to_code; + { + // Add all substring of len 1 as initial dictionary. + InternalMmapVector dict_len1; + for (auto it = begin; it != end; ++it) + if (prefix_to_code.try_emplace({kNoPrefix, *it}, 0).second) + dict_len1.push_back(*it); + + // Slightly helps with later delta encoding. + Sort(dict_len1.data(), dict_len1.size()); + + // For large sizeof(T) we have to store dict_len1. Smaller types like u8 can + // just generate them. + *out = dict_len1.size(); + ++out; + + for (uptr i = 0; i != dict_len1.size(); ++i) { + // Remap after the Sort. + prefix_to_code[{kNoPrefix, dict_len1[i]}] = i; + *out = dict_len1[i]; + ++out; + } + CHECK_EQ(prefix_to_code.size(), dict_len1.size()); + } + + if (begin == end) + return out; + + // Main LZW encoding loop. + LzwCodeType match = prefix_to_code.find({kNoPrefix, *begin})->second; + ++begin; + for (auto it = begin; it != end; ++it) { + // Extend match with the new item. + auto ins = prefix_to_code.try_emplace({match, *it}, prefix_to_code.size()); + if (ins.second) { + // This is a new substring, but emit the code for the current match + // (before extend). This allows LZW decoder to recover the dictionary. + *out = match; + ++out; + // Reset the match to a single item, which must be already in the map. + match = prefix_to_code.find({kNoPrefix, *it})->second; + } else { + // Already known, use as the current match. + match = ins.first->second; + } + } + + *out = match; + ++out; + + return out; +} + +template +ItOut LzwDecode(ItIn begin, ItIn end, ItOut out) { + if (begin == end) + return out; + + // Load dictionary of len 1 substrings. Theses correspont to lowest codes. + InternalMmapVector dict_len1(*begin); + ++begin; + + if (begin == end) + return out; + + for (auto& v : dict_len1) { + v = *begin; + ++begin; + } + + // Substrings of len 2 and up. Indexes are shifted because [0, + // dict_len1.size()) stored in dict_len1. Substings get here after being + // emitted to the output, so we can use output position. + InternalMmapVector> + code_to_substr; + + // Copies already emitted substrings into the output again. + auto copy = [&code_to_substr, &dict_len1](LzwCodeType code, ItOut out) { + if (code < dict_len1.size()) { + *out = dict_len1[code]; + ++out; + return out; + } + const auto& s = code_to_substr[code - dict_len1.size()]; + + for (ItOut it = s.first; it != s.second; ++it, ++out) *out = *it; + return out; + }; + + // Returns lens of the substring with the given code. + auto code_to_len = [&code_to_substr, &dict_len1](LzwCodeType code) -> uptr { + if (code < dict_len1.size()) + return 1; + const auto& s = code_to_substr[code - dict_len1.size()]; + return s.second - s.first; + }; + + // Main LZW decoding loop. + LzwCodeType prev_code = *begin; + ++begin; + out = copy(prev_code, out); + for (auto it = begin; it != end; ++it) { + LzwCodeType code = *it; + auto start = out; + if (code == dict_len1.size() + code_to_substr.size()) { + // Special LZW case. The code is not in the dictionary yet. This is + // possible only when the new substring is the same as previous one plus + // the first item of the previous substring. We can emit that in two + // steps. + out = copy(prev_code, out); + *out = *start; + ++out; + } else { + out = copy(code, out); + } + + // Every time encoded emits the code, it also creates substing of len + 1 + // including the first item of the just emmited substring. Do the same here. + uptr len = code_to_len(prev_code); + code_to_substr.push_back({start - len, start + 1}); + + prev_code = code; + } + return out; +} + +} // namespace __sanitizer +#endif diff --git a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt --- a/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt +++ b/compiler-rt/lib/sanitizer_common/tests/CMakeLists.txt @@ -27,6 +27,7 @@ sanitizer_libc_test.cpp sanitizer_linux_test.cpp sanitizer_list_test.cpp + sanitizer_lzw_test.cpp sanitizer_mac_test.cpp sanitizer_mutex_test.cpp sanitizer_nolibc_test.cpp diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp new file mode 100644 --- /dev/null +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_lzw_test.cpp @@ -0,0 +1,89 @@ +//===-- sanitizer_lzw_test.cpp ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_lzw.h" + +#include + +#include "gtest/gtest.h" +#include "sanitizer_hash.h" + +namespace __sanitizer { + +template +struct LzwTest : public ::testing::Test { + template + void Run(size_t n, Generator gen) { + std::vector data(n); + std::generate(data.begin(), data.end(), gen); + + std::vector lzw; + LzwEncode(data.begin(), data.end(), std::back_inserter(lzw)); + + std::vector unlzw(data.size() * 2); + auto unlzw_end = LzwDecode(lzw.begin(), lzw.end(), unlzw.data()); + unlzw.resize(unlzw_end - unlzw.data()); + + EXPECT_EQ(data, unlzw); + } +}; + +static constexpr size_t kSizes[] = {0, 1, 2, 7, 13, 32, 129, 10000}; + +using LzwTestTypes = ::testing::Types; +TYPED_TEST_SUITE(LzwTest, LzwTestTypes, ); + +TYPED_TEST(LzwTest, Same) { + MurMur2Hash64Builder h(0); + for (size_t sz : kSizes) { + u64 v = 0; + for (size_t i = 0; i < 100 && !this->HasFailure(); ++i) { + this->Run(sz, [&] { return v; }); + h.add(i); + v = h.get(); + } + } +} + +TYPED_TEST(LzwTest, Increment) { + MurMur2Hash64Builder h(0); + for (size_t sz : kSizes) { + u64 v = 0; + for (size_t i = 0; i < 100 && !this->HasFailure(); ++i) { + this->Run(sz, [&v] { return v++; }); + h.add(i); + v = h.get(); + } + } +} + +TYPED_TEST(LzwTest, IncrementMod) { + MurMur2Hash64Builder h(0); + for (size_t sz : kSizes) { + u64 v = 0; + for (size_t i = 1; i < 16 && !this->HasFailure(); ++i) { + this->Run(sz, [&] { return v++ % i; }); + h.add(i); + v = h.get(); + } + } +} + +TYPED_TEST(LzwTest, RandomLimited) { + for (size_t sz : kSizes) { + for (size_t i = 1; i < 1000 && !this->HasFailure(); i *= 2) { + u64 v = 0; + this->Run(sz, [&] { + MurMur2Hash64Builder h(v % i /* Keep unique set limited */); + v = h.get(); + return v; + }); + } + } +} + +} // namespace __sanitizer