Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -76,6 +76,7 @@ add_subdirectory(tool) add_subdirectory(indexer) add_subdirectory(index/dex/dexp) +add_subdirectory(index/dex/fuzzer) if (LLVM_INCLUDE_BENCHMARKS) add_subdirectory(benchmarks) Index: clang-tools-extra/clangd/index/dex/Dex.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Dex.cpp +++ clang-tools-extra/clangd/index/dex/Dex.cpp @@ -122,8 +122,8 @@ // Convert lists of items to posting lists. for (const auto &TokenToPostingList : TempInvertedIndex) - InvertedIndex.insert({TokenToPostingList.first, - PostingList(move(TokenToPostingList.second))}); + InvertedIndex.insert( + {TokenToPostingList.first, PostingList(TokenToPostingList.second)}); vlog("Built Dex with estimated memory usage {0} bytes.", estimateMemoryUsage()); Index: clang-tools-extra/clangd/index/dex/PostingList.h =================================================================== --- clang-tools-extra/clangd/index/dex/PostingList.h +++ clang-tools-extra/clangd/index/dex/PostingList.h @@ -6,13 +6,19 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This defines posting list interface: a storage for identifiers of symbols -// which can be characterized by a specific feature (such as fuzzy-find trigram, -// scope, type or any other Search Token). Posting lists can be traversed in -// order using an iterator and are values for inverted index, which maps search -// tokens to corresponding posting lists. -// +/// +/// \file +/// This defines posting list interface: a storage for identifiers of symbols +/// which can be characterized by a specific feature (such as fuzzy-find +/// trigram, scope, type or any other Search Token). Posting lists can be +/// traversed in order using an iterator and are values for inverted index, +/// which maps search tokens to corresponding posting lists. +/// +/// In order to decrease size of Index in-memory representation, Variable Byte +/// Encoding (VByte) is used for PostingLists compression. An overview of VByte +/// algorithm can be found in "Introduction to Information Retrieval" book: +/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html +/// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H @@ -29,20 +35,43 @@ class Iterator; +/// Chunk is a fixed-width piece of PostingList which contains the first DocID +/// in uncompressed format (Head) and delta-encoded Payload. It can be +/// decompressed upon request. +struct Chunk { + // Keep sizeof(Chunk) == 32. + static constexpr size_t PayloadSize = 32 - sizeof(DocID) - sizeof(uint8_t); + + std::vector decompress() const; + + /// The first element of + DocID Head; + /// VByte-encoded deltas. + std::array Payload = std::array(); + /// Number of elements encoded into Payload + 1. + uint8_t Size; +}; + /// PostingList is the storage of DocIDs which can be inserted to the Query -/// Tree as a leaf by constructing Iterator over the PostingList object. -// FIXME(kbobyrev): Use VByte algorithm to compress underlying data. +/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs +/// are stored in underlying chunks. While avoiding compression would reflect +/// positively on the Index performance, current Dex implementation has a large +/// performance gap compared to MemIndex which allows memory consumption +/// reduction at the cost of some performance. class PostingList { public: - explicit PostingList(const std::vector &&Documents) - : Documents(std::move(Documents)) {} + explicit PostingList(const std::vector &Documents); + /// Constructs DocumentIterator over given posting list. DocumentIterator will + /// go through the chunks and decompress them on-the-fly when necessary. std::unique_ptr iterator() const; - size_t bytes() const { return Documents.size() * sizeof(DocID); } + /// Returns in-memory size. + size_t bytes() const { return Chunks.size() * sizeof(Chunk); } private: - const std::vector Documents; + const std::vector Chunks; + size_t Size; }; } // namespace dex Index: clang-tools-extra/clangd/index/dex/PostingList.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/PostingList.cpp +++ clang-tools-extra/clangd/index/dex/PostingList.cpp @@ -9,6 +9,7 @@ #include "PostingList.h" #include "Iterator.h" +#include namespace clang { namespace clangd { @@ -16,21 +17,33 @@ namespace { -/// Implements Iterator over std::vector. This is the most basic -/// iterator and is simply a wrapper around -/// std::vector::const_iterator. -class PlainIterator : public Iterator { +/// Implements iterator of PostingList chunks. This requires iterating over two +/// levels: the first level iterator iterates over the chunks and decompresses +/// them on-the-fly when the contents of chunk are to be seen. +class ChunkIterator : public Iterator { public: - explicit PlainIterator(llvm::ArrayRef Documents) - : Documents(Documents), Index(std::begin(Documents)) {} + explicit ChunkIterator(const std::vector &Chunks, size_t Size) + : Chunks(Chunks), Size(Size), ChunkIndex(begin(Chunks)) { + if (!Chunks.empty()) { + DecompressedChunk = ChunkIndex->decompress(); + InnerIndex = begin(DecompressedChunk); + } + } - bool reachedEnd() const override { return Index == std::end(Documents); } + bool reachedEnd() const override { return ChunkIndex == end(Chunks); } /// Advances cursor to the next item. void advance() override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); - ++Index; + if (++InnerIndex != end(DecompressedChunk)) + return; // Return if current chunk is not exhausted. + ++ChunkIndex; + if (reachedEnd()) + return; // Can't advance to the next chunk at the end. + // Decompress next chunk and reset inner iterator. + DecompressedChunk = ChunkIndex->decompress(); + InnerIndex = begin(DecompressedChunk); } /// Applies binary search to advance cursor to the next item with DocID @@ -38,16 +51,41 @@ void advanceTo(DocID ID) override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); - // If current ID is beyond requested one, iterator is already in the right - // state. - if (peek() < ID) - Index = std::lower_bound(Index, std::end(Documents), ID); + if (ID <= peek()) + return; + // If current chunk doesn't contain needed element, find the chunk which + // does. + if ((ChunkIndex != end(Chunks) - 1) && ((ChunkIndex + 1)->Head <= ID)) { + // Find the next chunk with Head >= ID. + ChunkIndex = std::lower_bound( + ChunkIndex, end(Chunks), ID, + [](const Chunk &C, const DocID ID) { return C.Head < ID; }); + if (reachedEnd()) + return; + // Look for ID in the previous chunk if the current Head > ID and + // therefore needed position is either in previous Chunk or in the + // beginning of the current chunk. + if (ChunkIndex != begin(Chunks) && ID < ChunkIndex->Head) + --ChunkIndex; + DecompressedChunk = ChunkIndex->decompress(); + InnerIndex = begin(DecompressedChunk); + } + // Try to find ID within current chunk. + InnerIndex = std::lower_bound(InnerIndex, std::end(DecompressedChunk), ID); + // Return if the position was found in current chunk. + if (InnerIndex != std::end(DecompressedChunk)) + return; + // Otherwise, the iterator should point to the first element of the next + // chunk (if there is any). + ++ChunkIndex; + if (!reachedEnd()) + DecompressedChunk = ChunkIndex->decompress(); + InnerIndex = begin(DecompressedChunk); } DocID peek() const override { - assert(!reachedEnd() && - "Posting List iterator can't peek() at the end."); - return *Index; + assert(!reachedEnd() && "Posting List iterator can't peek() at the end."); + return *InnerIndex; } float consume() override { @@ -56,27 +94,179 @@ return DEFAULT_BOOST_SCORE; } - size_t estimateSize() const override { return Documents.size(); } + size_t estimateSize() const override { return Size; } private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << '['; - if (Index != std::end(Documents)) - OS << *Index; - else - OS << "END"; + if (ChunkIndex != begin(Chunks) || InnerIndex != begin(DecompressedChunk)) + OS << "... "; + OS << (reachedEnd() ? "END" : std::to_string(*InnerIndex)); + if (!reachedEnd() && InnerIndex < std::end(DecompressedChunk) - 1) + OS << " ..."; OS << ']'; return OS; } - llvm::ArrayRef Documents; - llvm::ArrayRef::const_iterator Index; + const std::vector &Chunks; + // Cache information about PostingList size. + size_t Size; + // Iterator over chunks. + std::vector::const_iterator ChunkIndex; + std::vector DecompressedChunk; + // Iterator over DecompressedChunk. + std::vector::iterator InnerIndex; }; +/// Single-byte masks used for VByte compression bit manipulations. +constexpr uint8_t SevenBytesMask = 0x7f; // 0b01111111 +constexpr uint8_t FourBytesMask = 0xf; // 0b00001111 +constexpr uint8_t ContinuationBit = 0x80; // 0b10000000 + +/// Fills chunk with the maximum number of bits available. +Chunk createChunk(DocID Head, std::queue &Payload, + size_t DocumentsCount, size_t MeaningfulBytes) { + assert(DocumentsCount > 0 && "Can't create chunk without Head."); + Chunk Result; + Result.Head = Head; + Result.Size = DocumentsCount; + for (size_t I = 0; I < MeaningfulBytes; ++I) { + Result.Payload[I] = Payload.front(); + Payload.pop(); + } + return Result; +} + +/// Byte offsets of Payload contents within DocID. +const size_t Offsets[] = {0, 7, 7 * 2, 7 * 3, 7 * 4}; + +/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of +/// DocIDs. The compression stores deltas (differences) between subsequent +/// DocIDs and encodes these deltas utilizing the least possible number of +/// bytes. +/// +/// Each encoding byte consists of two parts: the first bit (continuation bit) +/// indicates whether this is the last byte of current encoding and seven bytes +/// a piece of DocID (payload). DocID contains 32 bits and therefore it takes +/// up to 5 bytes to encode it (4 full 7-bit payloads and one 4-bit payload), +/// but in practice it is expected that gaps (deltas) between subsequent DocIDs +/// are not large enough to require 5 bytes. In very dense posting lists (with +/// average gaps less than 128) this representation would be 4 times more +/// efficient than raw DocID array. +/// +/// PostingList encoding example: +/// +/// DocIDs 42 47 7000 +/// gaps 5 6958 +/// Encoding (raw number) 10000101 10110110 00101110 +std::vector encodeStream(llvm::ArrayRef Documents) { + // Masks are used to perform bit manipulations over DocID. Each mask + // represents + static const std::vector Masks = { + SevenBytesMask, // First 7 bytes: 0b1111111 + SevenBytesMask << 7U, // Next 7 bytes + SevenBytesMask << 7U * 2, // ... + SevenBytesMask << 7U * 3, // ... + static_cast(FourBytesMask << 7U * 4), // 4 last bytes + }; + + // These limits are used to calculate the width of DocID encoding: when + // ID > Limits[I], it takes at least I + 1 bytes. + static const DocID Limits[] = { + 1U << 7U, + 1U << 7U * 2, + 1U << 7U * 3, + 1U << 7U * 4, + }; + + std::vector Result; + std::queue Payload; + size_t HeadIndex = 0; + // Keep track of the last Payload size which doesn't exceed the limit. + size_t LastEncodingEnd = 0; + for (size_t I = 0; I < Documents.size(); ++I) { + // Don't encode Heads. + if (HeadIndex == I) + continue; + const DocID Delta = Documents[I] - Documents[I - 1]; + // Encode Delta. + for (size_t I = 0; I < Masks.size(); ++I) { + uint8_t Encoding = (Delta & Masks[I]) >> Offsets[I]; + bool HasNextByte = I < Masks.size() - 1 ? Delta >= Limits[I] : false; + // If !HasNextByte, mark the end of encoding stream. + Payload.push(!HasNextByte ? Encoding | ContinuationBit : Encoding); + if (!HasNextByte) + break; + } + if (Payload.size() <= Chunk::PayloadSize) + LastEncodingEnd = Payload.size(); + // Read stream until Payload overflows. + if (Payload.size() < Chunk::PayloadSize) + continue; + // If Payload contains exactly Chunk::PayloadSize bytes, use all of them to + // fill the next Chunk. Otherwise, use the last valid size. + Result.push_back(createChunk(Documents[HeadIndex], Payload, + Payload.size() == Chunk::PayloadSize + ? I - HeadIndex + 1 + : I - HeadIndex, + LastEncodingEnd)); + // Next head is the next item if Payload contains exactly Chunk::PayloadSize + // bytes, otherwise it is the current item. + HeadIndex = Payload.empty() ? I + 1 : I; + // If overflow happened, Payload contains encoding of the next Head: discard + // it. + while (!Payload.empty()) + Payload.pop(); + LastEncodingEnd = 0; + } + // Add another chunk if there are some bytes left in Payload or if there's a + // trailing Head. + if (!Payload.empty() || HeadIndex + 1 == Documents.size()) + Result.push_back(createChunk(Documents[HeadIndex], Payload, + Documents.size() - HeadIndex, + LastEncodingEnd)); + return Result; +} + } // namespace +std::vector Chunk::decompress() const { + std::vector Result{Head}; + if (Size == 1) + return Result; + // Store sum of Head and all deltas to keep track of the last ID. + DocID Current = Head; + DocID Delta = 0; + uint8_t Length = 0; + // Decode bytes from Payload into Delta. + for (const auto &Byte : Payload) { + assert(Length <= 5 && "Can't decode sequences longer than 5 bytes"); + // Write meaningful bits to the correct place in the document decoding. + Delta |= (Byte & (Length < 4 ? SevenBytesMask : FourBytesMask)) + << Offsets[Length]; + ++Length; + // Add document decoding to the result as soon as END marker is seen. + if ((Byte & ContinuationBit) != 0) { + Current += Delta; + Result.push_back(Current); + Length = 0; + Delta = 0; + } + // Stop when all meaningful bytes are decoded. + if (Result.size() == Size) + break; + } + assert(Result.size() == 1 || + Length == 0 && + "Unterminated byte sequence at the end of input stream."); + return Result; +} + +PostingList::PostingList(const std::vector &Documents) + : Chunks(encodeStream(Documents)), Size(Documents.size()) {} + std::unique_ptr PostingList::iterator() const { - return llvm::make_unique(Documents); + return llvm::make_unique(Chunks, Size); } } // namespace dex Index: clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt @@ -0,0 +1,19 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) + +set(LLVM_LINK_COMPONENTS Support) + +if(LLVM_USE_SANITIZE_COVERAGE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer") +endif() + +add_clang_executable(clangd-vbyte-fuzzer + EXCLUDE_FROM_ALL + VByteFuzzer.cpp + ) + +target_link_libraries(clangd-vbyte-fuzzer + PRIVATE + clangBasic + clangDaemon + ${LLVM_LIB_FUZZING_ENGINE} + ) Index: clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp @@ -0,0 +1,64 @@ +//===-- VByteFuzzer.cpp - Fuzz VByte Posting List encoding ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a function that runs clangd on a single input. +/// This function is then linked into the Fuzzer library. +/// +//===----------------------------------------------------------------------===// + +#include "../Iterator.h" +#include "../PostingList.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using DocID = clang::clangd::dex::DocID; + +/// Transform raw byte sequence into list of DocIDs. +std::vector generateDocuments(uint8_t *Data, size_t Size) { + std::vector Result; + DocID ID = 0; + for (size_t I = 0; I < Size; ++I) { + size_t Offset = I % 4; + if (Offset == 0 && I != 0) { + ID = 0; + Result.push_back(ID); + } + ID |= (Data[I] << Offset); + } + if (Size > 4 && Size % 4 != 0) + Result.push_back(ID); + return Result; +} + +/// This fuzzer checks that compressed PostingList contains can be successfully +/// decoded into the original sequence. +extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { + if (Size == 0) + return 0; + const auto OriginalDocuments = generateDocuments(Data, Size); + if (OriginalDocuments.empty()) + return 0; + // Ensure that given sequence of DocIDs is sorted. + for (size_t I = 1; I < OriginalDocuments.size(); ++I) + if (OriginalDocuments[I] <= OriginalDocuments[I - 1]) + return 0; + const clang::clangd::dex::PostingList List(OriginalDocuments); + const auto DecodedDocuments = clang::clangd::dex::consume(*List.iterator()); + // Compare decoded sequence against the original PostingList contents. + if (DecodedDocuments.size() != OriginalDocuments.size()) + LLVM_BUILTIN_TRAP; + for (size_t I = 0; I < DecodedDocuments.size(); ++I) + if (DecodedDocuments[I].first != OriginalDocuments[I]) + LLVM_BUILTIN_TRAP; + return 0; +}