Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -76,6 +76,7 @@ add_subdirectory(tool) add_subdirectory(indexer) add_subdirectory(index/dex/dexp) +add_subdirectory(index/dex/fuzzer) if (LLVM_INCLUDE_BENCHMARKS) add_subdirectory(benchmarks) Index: clang-tools-extra/clangd/index/dex/Dex.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Dex.cpp +++ clang-tools-extra/clangd/index/dex/Dex.cpp @@ -128,8 +128,8 @@ // Convert lists of items to posting lists. for (const auto &TokenToPostingList : TempInvertedIndex) - InvertedIndex.insert({TokenToPostingList.first, - PostingList(move(TokenToPostingList.second))}); + InvertedIndex.insert( + {TokenToPostingList.first, PostingList(TokenToPostingList.second)}); vlog("Built Dex with estimated memory usage {0} bytes.", estimateMemoryUsage()); Index: clang-tools-extra/clangd/index/dex/PostingList.h =================================================================== --- clang-tools-extra/clangd/index/dex/PostingList.h +++ clang-tools-extra/clangd/index/dex/PostingList.h @@ -6,13 +6,19 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This defines posting list interface: a storage for identifiers of symbols -// which can be characterized by a specific feature (such as fuzzy-find trigram, -// scope, type or any other Search Token). Posting lists can be traversed in -// order using an iterator and are values for inverted index, which maps search -// tokens to corresponding posting lists. -// +/// +/// \file +/// This defines posting list interface: a storage for identifiers of symbols +/// which can be characterized by a specific feature (such as fuzzy-find +/// trigram, scope, type or any other Search Token). Posting lists can be +/// traversed in order using an iterator and are values for inverted index, +/// which maps search tokens to corresponding posting lists. +/// +/// In order to decrease size of Index in-memory representation, Variable Byte +/// Encoding (VByte) is used for PostingLists compression. An overview of VByte +/// algorithm can be found in "Introduction to Information Retrieval" book: +/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html +/// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H @@ -20,6 +26,7 @@ #include "Iterator.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include #include @@ -29,20 +36,41 @@ class Iterator; +/// Chunk is a fixed-width piece of PostingList which contains the first DocID +/// in uncompressed format (Head) and delta-encoded Payload. It can be +/// decompressed upon request. +struct Chunk { + /// Keep sizeof(Chunk) == 32. + static constexpr size_t PayloadSize = 32 - sizeof(DocID); + /// Number of DocID bits in each encoding byte. + static constexpr size_t BitsPerEncodingByte = 7; + + llvm::SmallVector decompress() const; + + /// The first element of + DocID Head; + /// VByte-encoded deltas. + std::array Payload = std::array(); +}; +static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory."); + /// PostingList is the storage of DocIDs which can be inserted to the Query -/// Tree as a leaf by constructing Iterator over the PostingList object. -// FIXME(kbobyrev): Use VByte algorithm to compress underlying data. +/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs +/// are stored in underlying chunks. Compression saves memory at a small cost +/// in access time, which is still fast enough in practice. class PostingList { public: - explicit PostingList(const std::vector &&Documents) - : Documents(std::move(Documents)) {} + explicit PostingList(llvm::ArrayRef Documents); + /// Constructs DocumentIterator over given posting list. DocumentIterator will + /// go through the chunks and decompress them on-the-fly when necessary. std::unique_ptr iterator() const; - size_t bytes() const { return Documents.size() * sizeof(DocID); } + /// Returns in-memory size. + size_t bytes() const { return Chunks.capacity() * sizeof(Chunk); } private: - const std::vector Documents; + const llvm::SmallVector Chunks; }; } // namespace dex Index: clang-tools-extra/clangd/index/dex/PostingList.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/PostingList.cpp +++ clang-tools-extra/clangd/index/dex/PostingList.cpp @@ -9,6 +9,9 @@ #include "PostingList.h" #include "Iterator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include namespace clang { namespace clangd { @@ -16,21 +19,31 @@ namespace { -/// Implements Iterator over std::vector. This is the most basic -/// iterator and is simply a wrapper around -/// std::vector::const_iterator. -class PlainIterator : public Iterator { +/// Implements iterator of PostingList chunks. This requires iterating over two +/// levels: the first level iterator iterates over the chunks and decompresses +/// them on-the-fly when the contents of chunk are to be seen. +class ChunkIterator : public Iterator { public: - explicit PlainIterator(llvm::ArrayRef Documents) - : Documents(Documents), Index(std::begin(Documents)) {} + explicit ChunkIterator(llvm::ArrayRef Chunks) + : Chunks(Chunks), CurrentChunk(Chunks.begin()) { + if (!Chunks.empty()) { + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); + } + } - bool reachedEnd() const override { return Index == std::end(Documents); } + bool reachedEnd() const override { return CurrentChunk == Chunks.end(); } /// Advances cursor to the next item. void advance() override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); - ++Index; + if (++CurrentID == DecompressedChunk.end()) { // reached end of chunk + if (++CurrentChunk == Chunks.end()) // reached the end + return; + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); + } } /// Applies binary search to advance cursor to the next item with DocID @@ -38,16 +51,26 @@ void advanceTo(DocID ID) override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); - // If current ID is beyond requested one, iterator is already in the right - // state. - if (peek() < ID) - Index = std::lower_bound(Index, std::end(Documents), ID); + if (ID <= peek()) + return; + advanceToChunk(ID); + // Try to find ID within current chunk. + CurrentID = std::lower_bound(CurrentID, std::end(DecompressedChunk), ID); + // Return if the position was found in current chunk. + if (CurrentID != std::end(DecompressedChunk)) + return; + // Otherwise, the iterator should point to the first element of the next + // chunk (if there is any). + ++CurrentChunk; + if (CurrentChunk == Chunks.end()) + return; + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); } DocID peek() const override { - assert(!reachedEnd() && - "Posting List iterator can't peek() at the end."); - return *Index; + assert(!reachedEnd() && "Posting List iterator can't peek() at the end."); + return *CurrentID; } float consume() override { @@ -56,27 +79,154 @@ return DEFAULT_BOOST_SCORE; } - size_t estimateSize() const override { return Documents.size(); } + size_t estimateSize() const override { + return Chunks.size() * ApproxEntriesPerChunk; + } private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << '['; - if (Index != std::end(Documents)) - OS << *Index; - else - OS << "END"; + if (CurrentChunk != Chunks.begin() || + (CurrentID != DecompressedChunk.begin() && !DecompressedChunk.empty())) + OS << "... "; + OS << (reachedEnd() ? "END" : std::to_string(*CurrentID)); + if (!reachedEnd() && CurrentID < DecompressedChunk.end() - 1) + OS << " ..."; OS << ']'; return OS; } - llvm::ArrayRef Documents; - llvm::ArrayRef::const_iterator Index; + /// Advances CurrentChunk to the chunk which might contain ID. + void advanceToChunk(DocID ID) { + if ((CurrentChunk != Chunks.end() - 1) && + ((CurrentChunk + 1)->Head <= ID)) { + // Find the next chunk with Head >= ID. + CurrentChunk = std::lower_bound( + CurrentChunk + 1, Chunks.end(), ID, + [](const Chunk &C, const DocID ID) { return C.Head <= ID; }); + --CurrentChunk; + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); + } + } + + llvm::ArrayRef Chunks; + // Iterator over chunks. + decltype(Chunks)::const_iterator CurrentChunk; + llvm::SmallVector DecompressedChunk; + // Iterator over DecompressedChunk. + decltype(DecompressedChunk)::iterator CurrentID; + + static constexpr size_t ApproxEntriesPerChunk = 15; }; +/// Writes a variable length DocID into the buffer and updates the buffer size. +/// If it doesn't fit, returns false and doesn't write to the buffer. +bool encodeVByte(DocID Delta, llvm::MutableArrayRef &Payload) { + // Calculate number of bytes Delta encoding would take by rounding up number + // of meaningful bytes divided by Chunk::BitsPerEncodingByte. + size_t Width = (sizeof(DocID) * 8 - llvm::countLeadingZeros(Delta) + + Chunk::BitsPerEncodingByte - 1) / + Chunk::BitsPerEncodingByte; + if (Width > Payload.size()) + return false; + + static constexpr size_t MaxBytes = 5; + for (size_t I = 0; I < MaxBytes; ++I) { + DocID Mask = (I + 1 != MaxBytes ? 0x7f : 0xf) + << Chunk::BitsPerEncodingByte * I; + uint8_t Encoding = (Delta & Mask) >> (Chunk::BitsPerEncodingByte * I); + bool HasNextByte = I + 1 != Width; + // If there is next byte, apply continuation bit. + Payload.front() = HasNextByte ? Encoding | 0x80 : Encoding; + Payload = Payload.drop_front(); + if (!HasNextByte) + break; + } + return true; +} + +/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of +/// DocIDs. The compression stores deltas (differences) between subsequent +/// DocIDs and encodes these deltas utilizing the least possible number of +/// bytes. +/// +/// Each encoding byte consists of two parts: the first bit (continuation bit) +/// indicates whether this is the last byte (0 if this byte is the last) of +/// current encoding and seven bytes a piece of DocID (payload). DocID contains +/// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit +/// payloads and one 4-bit payload), but in practice it is expected that gaps +/// (deltas) between subsequent DocIDs are not large enough to require 5 bytes. +/// In very dense posting lists (with average gaps less than 128) this +/// representation would be 4 times more efficient than raw DocID array. +/// +/// PostingList encoding example: +/// +/// DocIDs 42 47 7000 +/// gaps 5 6958 +/// Encoding (raw number) 00000101 10110110 00101110 +llvm::SmallVector encodeStream(llvm::ArrayRef Documents) { + assert(!Documents.empty() && "Can't encode empty sequence."); + llvm::SmallVector Result; + std::array Payload; + Payload.fill(0); + llvm::MutableArrayRef PayloadRef(Payload); + size_t HeadIndex = 0; + for (size_t I = 0; I < Documents.size(); ++I) { + if (Documents.size() != 1 && + (I == 0 || (encodeVByte(Documents[I] - Documents[I - 1], PayloadRef) && + I + 1 != Documents.size()))) + continue; + Chunk NextChunk; + NextChunk.Head = Documents[HeadIndex]; + std::copy(Payload.begin(), Payload.end(), NextChunk.Payload.begin()); + Result.push_back(NextChunk); + HeadIndex = I; + Payload.fill(0); + PayloadRef = Payload; + } + return llvm::SmallVector(Result); // no move, shrink-to-fit +} + +/// Reads variable length DocID from the buffer and updates the buffer size. If +/// the stream is terminated, return None. +llvm::Optional readVByte(llvm::ArrayRef &Bytes) { + if (Bytes.front() == 0 || Bytes.empty()) + return llvm::None; + DocID Result = 0; + bool HasNextByte = true; + for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) { + assert(Length <= 5 && "Malformed VByte encoding sequence."); + // Write meaningful bits to the correct place in the document decoding. + Result |= (Bytes.front() & 0x7f) << (Chunk::BitsPerEncodingByte * Length); + if ((Bytes.front() & 0x80) == 0) + HasNextByte = false; + Bytes = Bytes.drop_front(); + } + return Result; +} + } // namespace +llvm::SmallVector Chunk::decompress() const { + llvm::SmallVector Result{Head}; + llvm::ArrayRef Bytes(Payload); + DocID Delta; + for (DocID Current = Head; !Bytes.empty(); Current += Delta) { + auto MaybeDelta = readVByte(Bytes); + if (!MaybeDelta) + break; + Delta = *MaybeDelta; + Result.push_back(Current + Delta); + } + return llvm::SmallVector{Result}; +} + +PostingList::PostingList(llvm::ArrayRef Documents) + : Chunks(encodeStream(Documents)) {} + std::unique_ptr PostingList::iterator() const { - return llvm::make_unique(Documents); + return llvm::make_unique(Chunks); } } // namespace dex Index: clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/fuzzer/CMakeLists.txt @@ -0,0 +1,19 @@ +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/..) + +set(LLVM_LINK_COMPONENTS Support) + +if(LLVM_USE_SANITIZE_COVERAGE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=fuzzer") +endif() + +add_clang_executable(clangd-vbyte-fuzzer + EXCLUDE_FROM_ALL + VByteFuzzer.cpp + ) + +target_link_libraries(clangd-vbyte-fuzzer + PRIVATE + clangBasic + clangDaemon + ${LLVM_LIB_FUZZING_ENGINE} + ) Index: clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/fuzzer/VByteFuzzer.cpp @@ -0,0 +1,64 @@ +//===-- VByteFuzzer.cpp - Fuzz VByte Posting List encoding ----------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file +/// \brief This file implements a function that runs clangd on a single input. +/// This function is then linked into the Fuzzer library. +/// +//===----------------------------------------------------------------------===// + +#include "../Iterator.h" +#include "../PostingList.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + +using DocID = clang::clangd::dex::DocID; + +/// Transform raw byte sequence into list of DocIDs. +std::vector generateDocuments(uint8_t *Data, size_t Size) { + std::vector Result; + DocID ID = 0; + for (size_t I = 0; I < Size; ++I) { + size_t Offset = I % 4; + if (Offset == 0 && I != 0) { + ID = 0; + Result.push_back(ID); + } + ID |= (Data[I] << Offset); + } + if (Size > 4 && Size % 4 != 0) + Result.push_back(ID); + return Result; +} + +/// This fuzzer checks that compressed PostingList contains can be successfully +/// decoded into the original sequence. +extern "C" int LLVMFuzzerTestOneInput(uint8_t *Data, size_t Size) { + if (Size == 0) + return 0; + const auto OriginalDocuments = generateDocuments(Data, Size); + if (OriginalDocuments.empty()) + return 0; + // Ensure that given sequence of DocIDs is sorted. + for (size_t I = 1; I < OriginalDocuments.size(); ++I) + if (OriginalDocuments[I] <= OriginalDocuments[I - 1]) + return 0; + const clang::clangd::dex::PostingList List(OriginalDocuments); + const auto DecodedDocuments = clang::clangd::dex::consume(*List.iterator()); + // Compare decoded sequence against the original PostingList contents. + if (DecodedDocuments.size() != OriginalDocuments.size()) + LLVM_BUILTIN_TRAP; + for (size_t I = 0; I < DecodedDocuments.size(); ++I) + if (DecodedDocuments[I].first != OriginalDocuments[I]) + LLVM_BUILTIN_TRAP; + return 0; +} Index: clang-tools-extra/unittests/clangd/DexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexTests.cpp +++ clang-tools-extra/unittests/clangd/DexTests.cpp @@ -69,19 +69,6 @@ EXPECT_TRUE(DocIterator->reachedEnd()); } -TEST(DexIterators, AndWithEmpty) { - const PostingList L0({}); - const PostingList L1({0, 5, 7, 10, 42, 320, 9000}); - - auto AndEmpty = createAnd(L0.iterator()); - EXPECT_TRUE(AndEmpty->reachedEnd()); - - auto AndWithEmpty = createAnd(L0.iterator(), L1.iterator()); - EXPECT_TRUE(AndWithEmpty->reachedEnd()); - - EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre()); -} - TEST(DexIterators, AndTwoLists) { const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); @@ -120,20 +107,6 @@ EXPECT_TRUE(And->reachedEnd()); } -TEST(DexIterators, OrWithEmpty) { - const PostingList L0({}); - const PostingList L1({0, 5, 7, 10, 42, 320, 9000}); - - auto OrEmpty = createOr(L0.iterator()); - EXPECT_TRUE(OrEmpty->reachedEnd()); - - auto OrWithEmpty = createOr(L0.iterator(), L1.iterator()); - EXPECT_FALSE(OrWithEmpty->reachedEnd()); - - EXPECT_THAT(consumeIDs(*OrWithEmpty), - ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U)); -} - TEST(DexIterators, OrTwoLists) { const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); @@ -211,29 +184,27 @@ // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5| // +----------+----------+ +----------+------------+ // | | - // +------+-----+ +---------------------+ - // | | | | | - // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+ - // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4| - // +-------------+ +----+---+ +-----+ +---+----+ +----+---+ - // | | | - // +----v-----+ +-v--+ +---v---+ - // |1, 5, 7, 9| |1, 5| |0, 3, 5| - // +----------+ +----+ +-------+ + // +------+-----+ ------------+ + // | | | | + // +-------v-----+ +----+---+ +---v----+ +----v---+ + // |1, 3, 5, 8, 9| |Boost: 2| |Boost: 3| |Boost: 4| + // +-------------+ +----+---+ +---+----+ +----+---+ + // | | | + // +----v-----+ +-v--+ +---v---+ + // |1, 5, 7, 9| |1, 5| |0, 3, 5| + // +----------+ +----+ +-------+ // const PostingList L0({1, 3, 5, 8, 9}); const PostingList L1({1, 5, 7, 9}); - const PostingList L3({}); - const PostingList L4({1, 5}); - const PostingList L5({0, 3, 5}); + const PostingList L2({1, 5}); + const PostingList L3({0, 3, 5}); // Root of the query tree: [1, 5] auto Root = createAnd( // Lower And Iterator: [1, 5, 9] createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(L3.iterator(), createBoost(L4.iterator(), 3U), - createBoost(L5.iterator(), 4U))); + createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); @@ -260,15 +231,13 @@ const PostingList L2({1, 5, 7, 9}); const PostingList L3({0, 5}); const PostingList L4({0, 1, 5}); - const PostingList L5({}); - - EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]"); - - auto Nested = - createAnd(createAnd(L1.iterator(), L2.iterator()), - createOr(L3.iterator(), L4.iterator(), L5.iterator())); - EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))"); + EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4 ...]"); + auto It = L0.iterator(); + It->advanceTo(19); + EXPECT_EQ(llvm::to_string(*It), "[... 20 ...]"); + It->advanceTo(9000); + EXPECT_EQ(llvm::to_string(*It), "[... END]"); } TEST(DexIterators, Limit) {