Index: clangd/index/dex/DexIndex.cpp =================================================================== --- clangd/index/dex/DexIndex.cpp +++ clangd/index/dex/DexIndex.cpp @@ -50,11 +50,15 @@ }); // Populate TempInvertedIndex with posting lists for index symbols. + llvm::DenseMap> TempInvertedIndex; for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) { const auto *Sym = Symbols[SymbolRank]; for (const auto &Token : generateSearchTokens(*Sym)) - InvertedIndex[Token].push_back(SymbolRank); + TempInvertedIndex[Token].push_back(SymbolRank); } + InvertedIndex.reserve(InvertedIndex.size()); + for (auto &Pair : TempInvertedIndex) + InvertedIndex.try_emplace(Pair.first, std::move(Pair.second)); vlog("Built DexIndex with estimated memory usage {0} bytes.", estimateMemoryUsage()); @@ -80,7 +84,7 @@ for (const auto &Trigram : TrigramTokens) { const auto It = InvertedIndex.find(Trigram); if (It != InvertedIndex.end()) - TrigramIterators.push_back(create(It->second)); + TrigramIterators.push_back(It->second.iterator()); } if (!TrigramIterators.empty()) TopLevelChildren.push_back(createAnd(move(TrigramIterators))); @@ -90,7 +94,7 @@ for (const auto &Scope : Req.Scopes) { const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope)); if (It != InvertedIndex.end()) - ScopeIterators.push_back(create(It->second)); + ScopeIterators.push_back(It->second.iterator()); } // Add OR iterator for scopes if there are any Scope Iterators. if (!ScopeIterators.empty()) @@ -154,10 +158,8 @@ LookupTable.size() * sizeof(std::pair); Bytes += SymbolQuality.size() * sizeof(std::pair); Bytes += InvertedIndex.size() * sizeof(Token); - - for (const auto &P : InvertedIndex) { - Bytes += P.second.size() * sizeof(DocID); - } + for (const auto &P : InvertedIndex) + Bytes += P.second.bytes(); return Bytes; } Index: clangd/index/dex/Iterator.h =================================================================== --- clangd/index/dex/Iterator.h +++ clangd/index/dex/Iterator.h @@ -32,6 +32,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/BitVector.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -44,20 +45,6 @@ /// Symbol position in the list of all index symbols sorted by a pre-computed /// symbol quality. using DocID = uint32_t; -/// Contains sorted sequence of DocIDs all of which belong to symbols matching -/// certain criteria, i.e. containing a Search Token. PostingLists are values -/// for the inverted index. -// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are -// likely to be very dense and hence require special attention so that the index -// doesn't use too much memory. Possible solution would be to construct -// compressed posting lists which consist of ranges of DocIDs instead of -// distinct DocIDs. A special case would be the empty query: for that case -// TrueIterator should be implemented - an iterator which doesn't actually store -// any PostingList within itself, but "contains" all DocIDs in range -// [0, IndexSize). -using PostingList = std::vector; -/// Immutable reference to PostingList object. -using PostingListRef = llvm::ArrayRef; /// Iterator is the interface for Query Tree node. The simplest type of Iterator /// is DocumentIterator which is simply a wrapper around PostingList iterator @@ -131,11 +118,6 @@ /// to acquire preliminary scores of requested items. std::vector> consume(Iterator &It); -/// Returns a document iterator over given PostingList. -/// -/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item. -std::unique_ptr create(PostingListRef Documents); - /// Returns AND Iterator which performs the intersection of the PostingLists of /// its children. /// @@ -200,6 +182,33 @@ Children.push_back(move(Head)); } +/// Contains sorted sequence of DocIDs all of which belong to symbols matching +/// certain criteria, i.e. containing a Search Token. PostingLists are values +/// for the inverted index. +class PostingList { +public: + PostingList() : Representation(Null) {} + PostingList(std::vector Docs); + /// Returns a document iterator over given PostingList. + std::unique_ptr iterator() const; + + PostingList(PostingList&&); + PostingList &operator=(PostingList&&); + ~PostingList(); + + size_t bytes() const; + +private: + enum Rep { Null, Dense, Sparse } Representation; + union { + struct { + llvm::BitVector Bitmap; + size_t Count; + } DenseRep; + std::vector SparseRep; + }; +}; + } // namespace dex } // namespace clangd } // namespace clang Index: clangd/index/dex/Iterator.cpp =================================================================== --- clangd/index/dex/Iterator.cpp +++ clangd/index/dex/Iterator.cpp @@ -18,12 +18,11 @@ namespace { -/// Implements Iterator over a PostingList. DocumentIterator is the most basic -/// iterator: it doesn't have any children (hence it is the leaf of iterator -/// tree) and is simply a wrapper around PostingList::const_iterator. -class DocumentIterator : public Iterator { +/// Implements Iterator over a sparse PostingList. +/// This is a leaf iterator which simply wraps a list of DocIDs. +class SparseIterator : public Iterator { public: - DocumentIterator(PostingListRef Documents) + SparseIterator(llvm::ArrayRef Documents) : Documents(Documents), Index(std::begin(Documents)) {} bool reachedEnd() const override { return Index == std::end(Documents); } @@ -74,8 +73,52 @@ return OS; } - PostingListRef Documents; - PostingListRef::const_iterator Index; + llvm::ArrayRef Documents; + llvm::ArrayRef::iterator Index; +}; + +/// Implements Iterator over a dense PostingList. +/// This is a leaf iterator over a BitVector with one bit per possible DocID. +class DenseIterator : public Iterator { +public: + DenseIterator(const llvm::BitVector &Bits, size_t Count) + : Bits(Bits), Index(Bits.find_first()), Count(Count) {} + + bool reachedEnd() const override { return Index == -1; } + + /// Advances cursor to the next item. + void advance() override { + assert(!reachedEnd() && "DENSE iterator can't advance() at the end."); + Index = Bits.find_next(Index); + } + + /// Applies binary search to advance cursor to the next item with DocID equal + /// or higher than the given one. + void advanceTo(DocID ID) override { + assert(!reachedEnd() && "DENSE iterator can't advance() at the end."); + Index = Bits.find_next(ID); + } + + DocID peek() const override { + assert(!reachedEnd() && "DENSE iterator can't peek() at the end."); + return Index; + } + + float consume() override { + assert(!reachedEnd() && "DENSE iterator can't consume() at the end."); + return DEFAULT_BOOST_SCORE; + } + + size_t estimateSize() const override { return Count; } + +private: + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + return OS << "(dense)\n"; + } + + const llvm::BitVector &Bits; + int Index; // Invariant: Index == -1 || Bits[Index] + size_t Count; }; /// Implements Iterator over the intersection of other iterators. @@ -396,10 +439,6 @@ return Result; } -std::unique_ptr create(PostingListRef Documents) { - return llvm::make_unique(Documents); -} - std::unique_ptr createAnd(std::vector> Children) { return llvm::make_unique(move(Children)); @@ -424,6 +463,77 @@ return llvm::make_unique(move(Child), Limit); } +std::unique_ptr PostingList::iterator() const { + switch(Representation) { + case Dense: + return llvm::make_unique(DenseRep.Bitmap, DenseRep.Count); + case Sparse: + return llvm::make_unique(SparseRep); + case Null: + assert(false && "iterator() on null posting list"); + return llvm::make_unique(llvm::ArrayRef{}); + } +} + +PostingList::PostingList(std::vector Docs) { + if (Docs.empty() || sizeof(DocID) * Docs.size() < Docs.back() / CHAR_BIT) { + Representation = Sparse; + new (&SparseRep) decltype(SparseRep)(std::move(Docs)); + } else { + Representation = Dense; + new (&DenseRep) decltype(DenseRep); + DenseRep.Count = Docs.size(); + DenseRep.Bitmap.resize(Docs.back() + 1); + for (DocID D : Docs) + DenseRep.Bitmap.set(D); + } +}; + +PostingList::~PostingList() { + switch (Representation) { + case Sparse: + delete &SparseRep; + break; + case Dense: + delete &DenseRep; + break; + case Null: + break; + } +} + +PostingList::PostingList(PostingList &&Other) { + Representation = Other.Representation; + switch (Representation) { + case Sparse: + new (&SparseRep) decltype(SparseRep)(std::move(Other.SparseRep)); + break; + case Dense: + new (&DenseRep) decltype(DenseRep)(std::move(Other.DenseRep)); + break; + case Null: + break; + } + Other.Representation = Null; +} + +PostingList &PostingList::operator=(PostingList &&Other) { + this->~PostingList(); + new(this) PostingList(std::move(Other)); + return *this; +} + +size_t PostingList::bytes() const { + switch(Representation) { + case Sparse: + return SparseRep.size() * sizeof(DocID); + case Dense: + return DenseRep.Bitmap.getMemorySize(); + case Null: + return 0; + } +} + } // namespace dex } // namespace clangd } // namespace clang