Index: clang-tools-extra/clangd/index/dex/Dex.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Dex.cpp +++ clang-tools-extra/clangd/index/dex/Dex.cpp @@ -128,8 +128,8 @@ // Convert lists of items to posting lists. for (const auto &TokenToPostingList : TempInvertedIndex) - InvertedIndex.insert({TokenToPostingList.first, - PostingList(move(TokenToPostingList.second))}); + InvertedIndex.insert( + {TokenToPostingList.first, PostingList(TokenToPostingList.second)}); vlog("Built Dex with estimated memory usage {0} bytes.", estimateMemoryUsage()); Index: clang-tools-extra/clangd/index/dex/PostingList.h =================================================================== --- clang-tools-extra/clangd/index/dex/PostingList.h +++ clang-tools-extra/clangd/index/dex/PostingList.h @@ -6,13 +6,19 @@ // License. See LICENSE.TXT for details. // //===----------------------------------------------------------------------===// -// -// This defines posting list interface: a storage for identifiers of symbols -// which can be characterized by a specific feature (such as fuzzy-find trigram, -// scope, type or any other Search Token). Posting lists can be traversed in -// order using an iterator and are values for inverted index, which maps search -// tokens to corresponding posting lists. -// +/// +/// \file +/// This defines posting list interface: a storage for identifiers of symbols +/// which can be characterized by a specific feature (such as fuzzy-find +/// trigram, scope, type or any other Search Token). Posting lists can be +/// traversed in order using an iterator and are values for inverted index, +/// which maps search tokens to corresponding posting lists. +/// +/// In order to decrease size of Index in-memory representation, Variable Byte +/// Encoding (VByte) is used for PostingLists compression. An overview of VByte +/// algorithm can be found in "Introduction to Information Retrieval" book: +/// https://nlp.stanford.edu/IR-book/html/htmledition/variable-byte-codes-1.html +/// //===----------------------------------------------------------------------===// #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H @@ -20,6 +26,7 @@ #include "Iterator.h" #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" #include #include @@ -29,20 +36,43 @@ class Iterator; +/// NOTE: This is an implementation detail. +/// +/// Chunk is a fixed-width piece of PostingList which contains the first DocID +/// in uncompressed format (Head) and delta-encoded Payload. It can be +/// decompressed upon request. +struct Chunk { + /// Keep sizeof(Chunk) == 32. + static constexpr size_t PayloadSize = 32 - sizeof(DocID); + + llvm::SmallVector decompress() const; + + /// The first element of decompressed Chunk. + DocID Head; + /// VByte-encoded deltas. + std::array Payload = std::array(); +}; +static_assert(sizeof(Chunk) == 32, "Chunk should take 32 bytes of memory."); + /// PostingList is the storage of DocIDs which can be inserted to the Query -/// Tree as a leaf by constructing Iterator over the PostingList object. -// FIXME(kbobyrev): Use VByte algorithm to compress underlying data. +/// Tree as a leaf by constructing Iterator over the PostingList object. DocIDs +/// are stored in underlying chunks. Compression saves memory at a small cost +/// in access time, which is still fast enough in practice. class PostingList { public: - explicit PostingList(const std::vector &&Documents) - : Documents(std::move(Documents)) {} + explicit PostingList(llvm::ArrayRef Documents); + /// Constructs DocumentIterator over given posting list. DocumentIterator will + /// go through the chunks and decompress them on-the-fly when necessary. std::unique_ptr iterator() const; - size_t bytes() const { return Documents.size() * sizeof(DocID); } + /// Returns in-memory size. + size_t bytes() const { + return sizeof(Chunk) + Chunks.capacity() * sizeof(Chunk); + } private: - const std::vector Documents; + const std::vector Chunks; }; } // namespace dex Index: clang-tools-extra/clangd/index/dex/PostingList.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/PostingList.cpp +++ clang-tools-extra/clangd/index/dex/PostingList.cpp @@ -9,6 +9,9 @@ #include "PostingList.h" #include "Iterator.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MathExtras.h" +#include namespace clang { namespace clangd { @@ -16,21 +19,27 @@ namespace { -/// Implements Iterator over std::vector. This is the most basic -/// iterator and is simply a wrapper around -/// std::vector::const_iterator. -class PlainIterator : public Iterator { +/// Implements iterator of PostingList chunks. This requires iterating over two +/// levels: the first level iterator iterates over the chunks and decompresses +/// them on-the-fly when the contents of chunk are to be seen. +class ChunkIterator : public Iterator { public: - explicit PlainIterator(llvm::ArrayRef Documents) - : Documents(Documents), Index(std::begin(Documents)) {} + explicit ChunkIterator(llvm::ArrayRef Chunks) + : Chunks(Chunks), CurrentChunk(Chunks.begin()) { + if (!Chunks.empty()) { + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); + } + } - bool reachedEnd() const override { return Index == std::end(Documents); } + bool reachedEnd() const override { return CurrentChunk == Chunks.end(); } /// Advances cursor to the next item. void advance() override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); - ++Index; + ++CurrentID; + normalizeCursor(); } /// Applies binary search to advance cursor to the next item with DocID @@ -38,16 +47,17 @@ void advanceTo(DocID ID) override { assert(!reachedEnd() && "Posting List iterator can't advance() at the end."); - // If current ID is beyond requested one, iterator is already in the right - // state. - if (peek() < ID) - Index = std::lower_bound(Index, std::end(Documents), ID); + if (ID <= peek()) + return; + advanceToChunk(ID); + // Try to find ID within current chunk. + CurrentID = std::lower_bound(CurrentID, std::end(DecompressedChunk), ID); + normalizeCursor(); } DocID peek() const override { - assert(!reachedEnd() && - "Posting List iterator can't peek() at the end."); - return *Index; + assert(!reachedEnd() && "Posting List iterator can't peek() at the end."); + return *CurrentID; } float consume() override { @@ -56,27 +66,160 @@ return DEFAULT_BOOST_SCORE; } - size_t estimateSize() const override { return Documents.size(); } + size_t estimateSize() const override { + return Chunks.size() * ApproxEntriesPerChunk; + } private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << '['; - if (Index != std::end(Documents)) - OS << *Index; - else - OS << "END"; + if (CurrentChunk != Chunks.begin() || + (CurrentID != DecompressedChunk.begin() && !DecompressedChunk.empty())) + OS << "... "; + OS << (reachedEnd() ? "END" : std::to_string(*CurrentID)); + if (!reachedEnd() && CurrentID < DecompressedChunk.end() - 1) + OS << " ..."; OS << ']'; return OS; } - llvm::ArrayRef Documents; - llvm::ArrayRef::const_iterator Index; + /// If the cursor is at the end of a chunk, place it at the start of the next + /// chunk. + void normalizeCursor() { + // Invariant is already established if examined chunk is not exhausted. + if (CurrentID != std::end(DecompressedChunk)) + return; + // Advance to next chunk if current one is exhausted. + ++CurrentChunk; + if (CurrentChunk == Chunks.end()) // Reached the end of PostingList. + return; + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); + } + + /// Advances CurrentChunk to the chunk which might contain ID. + void advanceToChunk(DocID ID) { + if ((CurrentChunk != Chunks.end() - 1) && + ((CurrentChunk + 1)->Head <= ID)) { + // Find the next chunk with Head >= ID. + CurrentChunk = std::lower_bound( + CurrentChunk + 1, Chunks.end(), ID, + [](const Chunk &C, const DocID ID) { return C.Head <= ID; }); + --CurrentChunk; + DecompressedChunk = CurrentChunk->decompress(); + CurrentID = DecompressedChunk.begin(); + } + } + + llvm::ArrayRef Chunks; + /// Iterator over chunks. + /// If CurrentChunk is valid, then DecompressedChunk is + /// CurrentChunk->decompress() and CurrentID is a valid (non-end) iterator + /// into it. + decltype(Chunks)::const_iterator CurrentChunk; + llvm::SmallVector DecompressedChunk; + /// Iterator over DecompressedChunk. + decltype(DecompressedChunk)::iterator CurrentID; + + static constexpr size_t ApproxEntriesPerChunk = 15; }; +static constexpr size_t BitsPerEncodingByte = 7; + +/// Writes a variable length DocID into the buffer and updates the buffer size. +/// If it doesn't fit, returns false and doesn't write to the buffer. +bool encodeVByte(DocID Delta, llvm::MutableArrayRef &Payload) { + assert(Delta != 0 && "0 is not a valid PostingList delta."); + // Calculate number of bytes Delta encoding would take by examining the + // meaningful bits. + unsigned Width = 1 + llvm::findLastSet(Delta) / BitsPerEncodingByte; + if (Width > Payload.size()) + return false; + + do { + uint8_t Encoding = Delta & 0x7f; + Delta >>= 7; + Payload.front() = Delta ? Encoding | 0x80 : Encoding; + Payload = Payload.drop_front(); + } while (Delta != 0); + return true; +} + +/// Use Variable-length Byte (VByte) delta encoding to compress sorted list of +/// DocIDs. The compression stores deltas (differences) between subsequent +/// DocIDs and encodes these deltas utilizing the least possible number of +/// bytes. +/// +/// Each encoding byte consists of two parts: the first bit (continuation bit) +/// indicates whether this is the last byte (0 if this byte is the last) of +/// current encoding and seven bytes a piece of DocID (payload). DocID contains +/// 32 bits and therefore it takes up to 5 bytes to encode it (4 full 7-bit +/// payloads and one 4-bit payload), but in practice it is expected that gaps +/// (deltas) between subsequent DocIDs are not large enough to require 5 bytes. +/// In very dense posting lists (with average gaps less than 128) this +/// representation would be 4 times more efficient than raw DocID array. +/// +/// PostingList encoding example: +/// +/// DocIDs 42 47 7000 +/// gaps 5 6958 +/// Encoding (raw number) 00000101 10110110 00101110 +std::vector encodeStream(llvm::ArrayRef Documents) { + assert(!Documents.empty() && "Can't encode empty sequence."); + std::vector Result; + Result.emplace_back(); + DocID Last = Result.back().Head = Documents.front(); + llvm::MutableArrayRef RemainingPayload = Result.back().Payload; + for (DocID Doc : Documents.drop_front()) { + if (!encodeVByte(Doc - Last, RemainingPayload)) { // didn't fit, flush chunk + Result.emplace_back(); + Result.back().Head = Doc; + RemainingPayload = Result.back().Payload; + } + Last = Doc; + } + return std::vector(Result); // no move, shrink-to-fit +} + +/// Reads variable length DocID from the buffer and updates the buffer size. If +/// the stream is terminated, return None. +llvm::Optional readVByte(llvm::ArrayRef &Bytes) { + if (Bytes.front() == 0 || Bytes.empty()) + return llvm::None; + DocID Result = 0; + bool HasNextByte = true; + for (size_t Length = 0; HasNextByte && !Bytes.empty(); ++Length) { + assert(Length <= 5 && "Malformed VByte encoding sequence."); + // Write meaningful bits to the correct place in the document decoding. + Result |= (Bytes.front() & 0x7f) << (BitsPerEncodingByte * Length); + if ((Bytes.front() & 0x80) == 0) + HasNextByte = false; + Bytes = Bytes.drop_front(); + } + return Result; +} + } // namespace +llvm::SmallVector Chunk::decompress() const { + llvm::SmallVector Result{Head}; + llvm::ArrayRef Bytes(Payload); + DocID Delta; + for (DocID Current = Head; !Bytes.empty(); Current += Delta) { + auto MaybeDelta = readVByte(Bytes); + if (!MaybeDelta) + break; + Delta = *MaybeDelta; + Result.push_back(Current + Delta); + } + return llvm::SmallVector{Result}; +} + +PostingList::PostingList(llvm::ArrayRef Documents) + : Chunks(encodeStream(Documents)) {} + std::unique_ptr PostingList::iterator() const { - return llvm::make_unique(Documents); + return llvm::make_unique(Chunks); } } // namespace dex Index: clang-tools-extra/unittests/clangd/DexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexTests.cpp +++ clang-tools-extra/unittests/clangd/DexTests.cpp @@ -69,19 +69,6 @@ EXPECT_TRUE(DocIterator->reachedEnd()); } -TEST(DexIterators, AndWithEmpty) { - const PostingList L0({}); - const PostingList L1({0, 5, 7, 10, 42, 320, 9000}); - - auto AndEmpty = createAnd(L0.iterator()); - EXPECT_TRUE(AndEmpty->reachedEnd()); - - auto AndWithEmpty = createAnd(L0.iterator(), L1.iterator()); - EXPECT_TRUE(AndWithEmpty->reachedEnd()); - - EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre()); -} - TEST(DexIterators, AndTwoLists) { const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); @@ -120,20 +107,6 @@ EXPECT_TRUE(And->reachedEnd()); } -TEST(DexIterators, OrWithEmpty) { - const PostingList L0({}); - const PostingList L1({0, 5, 7, 10, 42, 320, 9000}); - - auto OrEmpty = createOr(L0.iterator()); - EXPECT_TRUE(OrEmpty->reachedEnd()); - - auto OrWithEmpty = createOr(L0.iterator(), L1.iterator()); - EXPECT_FALSE(OrWithEmpty->reachedEnd()); - - EXPECT_THAT(consumeIDs(*OrWithEmpty), - ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U)); -} - TEST(DexIterators, OrTwoLists) { const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); @@ -211,29 +184,27 @@ // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5| // +----------+----------+ +----------+------------+ // | | - // +------+-----+ +---------------------+ - // | | | | | - // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+ - // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4| - // +-------------+ +----+---+ +-----+ +---+----+ +----+---+ - // | | | - // +----v-----+ +-v--+ +---v---+ - // |1, 5, 7, 9| |1, 5| |0, 3, 5| - // +----------+ +----+ +-------+ + // +------+-----+ ------------+ + // | | | | + // +-------v-----+ +----+---+ +---v----+ +----v---+ + // |1, 3, 5, 8, 9| |Boost: 2| |Boost: 3| |Boost: 4| + // +-------------+ +----+---+ +---+----+ +----+---+ + // | | | + // +----v-----+ +-v--+ +---v---+ + // |1, 5, 7, 9| |1, 5| |0, 3, 5| + // +----------+ +----+ +-------+ // const PostingList L0({1, 3, 5, 8, 9}); const PostingList L1({1, 5, 7, 9}); - const PostingList L3({}); - const PostingList L4({1, 5}); - const PostingList L5({0, 3, 5}); + const PostingList L2({1, 5}); + const PostingList L3({0, 3, 5}); // Root of the query tree: [1, 5] auto Root = createAnd( // Lower And Iterator: [1, 5, 9] createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(L3.iterator(), createBoost(L4.iterator(), 3U), - createBoost(L5.iterator(), 4U))); + createOr(createBoost(L2.iterator(), 3U), createBoost(L3.iterator(), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); @@ -260,15 +231,13 @@ const PostingList L2({1, 5, 7, 9}); const PostingList L3({0, 5}); const PostingList L4({0, 1, 5}); - const PostingList L5({}); - - EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]"); - - auto Nested = - createAnd(createAnd(L1.iterator(), L2.iterator()), - createOr(L3.iterator(), L4.iterator(), L5.iterator())); - EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))"); + EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4 ...]"); + auto It = L0.iterator(); + It->advanceTo(19); + EXPECT_EQ(llvm::to_string(*It), "[... 20 ...]"); + It->advanceTo(9000); + EXPECT_EQ(llvm::to_string(*It), "[... END]"); } TEST(DexIterators, Limit) {