Index: clangd/CMakeLists.txt =================================================================== --- clangd/CMakeLists.txt +++ clangd/CMakeLists.txt @@ -48,6 +48,7 @@ index/dex/Dex.cpp index/dex/Iterator.cpp + index/dex/PostingList.cpp index/dex/Trigram.cpp LINK_LIBS Index: clangd/index/dex/Dex.h =================================================================== --- clangd/index/dex/Dex.h +++ clangd/index/dex/Dex.h @@ -21,6 +21,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEX_H #include "Iterator.h" +#include "PostingList.h" #include "Token.h" #include "Trigram.h" #include "index/Index.h" Index: clangd/index/dex/Dex.cpp =================================================================== --- clangd/index/dex/Dex.cpp +++ clangd/index/dex/Dex.cpp @@ -82,7 +82,7 @@ // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. PathProximitySignals.SymbolURI = ParentURI; BoostingIterators.push_back( - createBoost(create(It->second), PathProximitySignals.evaluate())); + createBoost(It->second.iterator(), PathProximitySignals.evaluate())); } } return BoostingIterators; @@ -112,13 +112,19 @@ Symbols[I] = ScoredSymbols[I].second; } - // Populate TempInvertedIndex with posting lists for index symbols. + // Populate TempInvertedIndex with lists for index symbols. + llvm::DenseMap> TempInvertedIndex; for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) { const auto *Sym = Symbols[SymbolRank]; for (const auto &Token : generateSearchTokens(*Sym)) - InvertedIndex[Token].push_back(SymbolRank); + TempInvertedIndex[Token].push_back(SymbolRank); } + // Convert lists of items to posting lists. + for (const auto &TokenToPostingList : TempInvertedIndex) + InvertedIndex.insert({TokenToPostingList.first, + PostingList(move(TokenToPostingList.second))}); + vlog("Built Dex with estimated memory usage {0} bytes.", estimateMemoryUsage()); } @@ -142,7 +148,7 @@ for (const auto &Trigram : TrigramTokens) { const auto It = InvertedIndex.find(Trigram); if (It != InvertedIndex.end()) - TrigramIterators.push_back(create(It->second)); + TrigramIterators.push_back(It->second.iterator()); } if (!TrigramIterators.empty()) TopLevelChildren.push_back(createAnd(move(TrigramIterators))); @@ -152,7 +158,7 @@ for (const auto &Scope : Req.Scopes) { const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope)); if (It != InvertedIndex.end()) - ScopeIterators.push_back(create(It->second)); + ScopeIterators.push_back(It->second.iterator()); } // Add OR iterator for scopes if there are any Scope Iterators. if (!ScopeIterators.empty()) @@ -233,7 +239,7 @@ Bytes += LookupTable.getMemorySize(); Bytes += InvertedIndex.getMemorySize(); for (const auto &P : InvertedIndex) - Bytes += P.second.size() * sizeof(DocID); + Bytes += P.second.bytes(); return Bytes + BackingDataSize; } Index: clangd/index/dex/Iterator.h =================================================================== --- clangd/index/dex/Iterator.h +++ clangd/index/dex/Iterator.h @@ -44,20 +44,6 @@ /// Symbol position in the list of all index symbols sorted by a pre-computed /// symbol quality. using DocID = uint32_t; -/// Contains sorted sequence of DocIDs all of which belong to symbols matching -/// certain criteria, i.e. containing a Search Token. PostingLists are values -/// for the inverted index. -// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are -// likely to be very dense and hence require special attention so that the index -// doesn't use too much memory. Possible solution would be to construct -// compressed posting lists which consist of ranges of DocIDs instead of -// distinct DocIDs. A special case would be the empty query: for that case -// TrueIterator should be implemented - an iterator which doesn't actually store -// any PostingList within itself, but "contains" all DocIDs in range -// [0, IndexSize). -using PostingList = std::vector; -/// Immutable reference to PostingList object. -using PostingListRef = llvm::ArrayRef; /// Iterator is the interface for Query Tree node. The simplest type of Iterator /// is DocumentIterator which is simply a wrapper around PostingList iterator @@ -131,11 +117,6 @@ /// to acquire preliminary scores of requested items. std::vector> consume(Iterator &It); -/// Returns a document iterator over given PostingList. -/// -/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item. -std::unique_ptr create(PostingListRef Documents); - /// Returns AND Iterator which performs the intersection of the PostingLists of /// its children. /// Index: clangd/index/dex/Iterator.cpp =================================================================== --- clangd/index/dex/Iterator.cpp +++ clangd/index/dex/Iterator.cpp @@ -18,69 +18,6 @@ namespace { -/// Implements Iterator over a PostingList. DocumentIterator is the most basic -/// iterator: it doesn't have any children (hence it is the leaf of iterator -/// tree) and is simply a wrapper around PostingList::const_iterator. -class DocumentIterator : public Iterator { -public: - explicit DocumentIterator(PostingListRef Documents) - : Documents(Documents), Index(std::begin(Documents)) {} - - bool reachedEnd() const override { return Index == std::end(Documents); } - - /// Advances cursor to the next item. - void advance() override { - assert(!reachedEnd() && "DOCUMENT iterator can't advance() at the end."); - ++Index; - } - - /// Applies binary search to advance cursor to the next item with DocID equal - /// or higher than the given one. - void advanceTo(DocID ID) override { - assert(!reachedEnd() && "DOCUMENT iterator can't advance() at the end."); - // If current ID is beyond requested one, iterator is already in the right - // state. - if (peek() < ID) - Index = std::lower_bound(Index, std::end(Documents), ID); - } - - DocID peek() const override { - assert(!reachedEnd() && "DOCUMENT iterator can't peek() at the end."); - return *Index; - } - - float consume() override { - assert(!reachedEnd() && "DOCUMENT iterator can't consume() at the end."); - return DEFAULT_BOOST_SCORE; - } - - size_t estimateSize() const override { return Documents.size(); } - -private: - llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { - OS << '['; - auto Separator = ""; - for (auto It = std::begin(Documents); It != std::end(Documents); ++It) { - OS << Separator; - if (It == Index) - OS << '{' << *It << '}'; - else - OS << *It; - Separator = ", "; - } - OS << Separator; - if (Index == std::end(Documents)) - OS << "{END}"; - else - OS << "END"; - OS << ']'; - return OS; - } - - PostingListRef Documents; - PostingListRef::const_iterator Index; -}; - /// Implements Iterator over the intersection of other iterators. /// /// AndIterator iterates through common items among all children. It becomes @@ -399,10 +336,6 @@ return Result; } -std::unique_ptr create(PostingListRef Documents) { - return llvm::make_unique(Documents); -} - std::unique_ptr createAnd(std::vector> Children) { // If there is exactly one child, pull it one level up: AND(Child) -> Child. Index: clangd/index/dex/PostingList.h =================================================================== --- clangd/index/dex/PostingList.h +++ clangd/index/dex/PostingList.h @@ -0,0 +1,52 @@ +//===--- PostingList.h - Symbol identifiers storage interface --*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines posting list interface: a storage for identifiers of symbols +// which can be characterized by a specific feature (such as fuzzy-find trigram, +// scope, type or any other Search Token). Posting lists can be traversed in +// order using an iterator and are values for inverted index, which maps search +// tokens to corresponding posting lists. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H + +#include "Iterator.h" +#include "llvm/ADT/ArrayRef.h" +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +class Iterator; + +/// PostingList is the storage of DocIDs which can be inserted to the Query +/// Tree as a leaf by constructing Iterator over the PostingList object. +// FIXME(kbobyrev): Use VByte algorithm to compress underlying data. +class PostingList { +public: + explicit PostingList(const std::vector &&Documents) + : Documents(std::move(Documents)) {} + + std::unique_ptr iterator() const; + + size_t bytes() const { return Documents.size() * sizeof(DocID); } + +private: + const std::vector Documents; +}; + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_POSTINGLIST_H Index: clangd/index/dex/PostingList.cpp =================================================================== --- clangd/index/dex/PostingList.cpp +++ clangd/index/dex/PostingList.cpp @@ -0,0 +1,84 @@ +//===--- PostingList.cpp - Symbol identifiers storage interface -----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "PostingList.h" +#include "Iterator.h" + +namespace clang { +namespace clangd { +namespace dex { + +namespace { + +/// Implements Iterator over std::vector. This is the most basic +/// iterator and is simply a wrapper around +/// std::vector::const_iterator. +class PlainIterator : public Iterator { +public: + explicit PlainIterator(llvm::ArrayRef Documents) + : Documents(Documents), Index(std::begin(Documents)) {} + + bool reachedEnd() const override { return Index == std::end(Documents); } + + /// Advances cursor to the next item. + void advance() override { + assert(!reachedEnd() && + "Posting List iterator can't advance() at the end."); + ++Index; + } + + /// Applies binary search to advance cursor to the next item with DocID + /// equal or higher than the given one. + void advanceTo(DocID ID) override { + assert(!reachedEnd() && + "Posting List iterator can't advance() at the end."); + // If current ID is beyond requested one, iterator is already in the right + // state. + if (peek() < ID) + Index = std::lower_bound(Index, std::end(Documents), ID); + } + + DocID peek() const override { + assert(!reachedEnd() && + "Posting List iterator can't peek() at the end."); + return *Index; + } + + float consume() override { + assert(!reachedEnd() && + "Posting List iterator can't consume() at the end."); + return DEFAULT_BOOST_SCORE; + } + + size_t estimateSize() const override { return Documents.size(); } + +private: + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + OS << '['; + if (Index != std::end(Documents)) + OS << *Index; + else + OS << "END"; + OS << ']'; + return OS; + } + + llvm::ArrayRef Documents; + llvm::ArrayRef::const_iterator Index; +}; + +} // namespace + +std::unique_ptr PostingList::iterator() const { + return llvm::make_unique(Documents); +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: unittests/clangd/DexTests.cpp =================================================================== --- unittests/clangd/DexTests.cpp +++ unittests/clangd/DexTests.cpp @@ -47,8 +47,8 @@ } TEST(DexIterators, DocumentIterator) { - const PostingList L = {4, 7, 8, 20, 42, 100}; - auto DocIterator = create(L); + const PostingList L({4, 7, 8, 20, 42, 100}); + auto DocIterator = L.iterator(); EXPECT_EQ(DocIterator->peek(), 4U); EXPECT_FALSE(DocIterator->reachedEnd()); @@ -70,28 +70,28 @@ } TEST(DexIterators, AndWithEmpty) { - const PostingList L0; - const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L0({}); + const PostingList L1({0, 5, 7, 10, 42, 320, 9000}); - auto AndEmpty = createAnd(create(L0)); + auto AndEmpty = createAnd(L0.iterator()); EXPECT_TRUE(AndEmpty->reachedEnd()); - auto AndWithEmpty = createAnd(create(L0), create(L1)); + auto AndWithEmpty = createAnd(L0.iterator(), L1.iterator()); EXPECT_TRUE(AndWithEmpty->reachedEnd()); EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre()); } TEST(DexIterators, AndTwoLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); + const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); - auto And = createAnd(create(L1), create(L0)); + auto And = createAnd(L1.iterator(), L0.iterator()); EXPECT_FALSE(And->reachedEnd()); EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); - And = createAnd(create(L0), create(L1)); + And = createAnd(L0.iterator(), L1.iterator()); And->advanceTo(0); EXPECT_EQ(And->peek(), 0U); @@ -107,11 +107,11 @@ } TEST(DexIterators, AndThreeLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; - const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; + const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); + const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); + const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000}); - auto And = createAnd(create(L0), create(L1), create(L2)); + auto And = createAnd(L0.iterator(), L1.iterator(), L2.iterator()); EXPECT_EQ(And->peek(), 7U); And->advanceTo(300); EXPECT_EQ(And->peek(), 320U); @@ -121,13 +121,13 @@ } TEST(DexIterators, OrWithEmpty) { - const PostingList L0; - const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L0({}); + const PostingList L1({0, 5, 7, 10, 42, 320, 9000}); - auto OrEmpty = createOr(create(L0)); + auto OrEmpty = createOr(L0.iterator()); EXPECT_TRUE(OrEmpty->reachedEnd()); - auto OrWithEmpty = createOr(create(L0), create(L1)); + auto OrWithEmpty = createOr(L0.iterator(), L1.iterator()); EXPECT_FALSE(OrWithEmpty->reachedEnd()); EXPECT_THAT(consumeIDs(*OrWithEmpty), @@ -135,10 +135,10 @@ } TEST(DexIterators, OrTwoLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); + const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); - auto Or = createOr(create(L0), create(L1)); + auto Or = createOr(L0.iterator(), L1.iterator()); EXPECT_FALSE(Or->reachedEnd()); EXPECT_EQ(Or->peek(), 0U); @@ -161,18 +161,18 @@ Or->advanceTo(9001); EXPECT_TRUE(Or->reachedEnd()); - Or = createOr(create(L0), create(L1)); + Or = createOr(L0.iterator(), L1.iterator()); EXPECT_THAT(consumeIDs(*Or), ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); } TEST(DexIterators, OrThreeLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; - const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; + const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); + const PostingList L1({0, 4, 7, 10, 30, 60, 320, 9000}); + const PostingList L2({1, 4, 7, 11, 30, 60, 320, 9000}); - auto Or = createOr(create(L0), create(L1), create(L2)); + auto Or = createOr(L0.iterator(), L1.iterator(), L2.iterator()); EXPECT_FALSE(Or->reachedEnd()); EXPECT_EQ(Or->peek(), 0U); @@ -221,19 +221,19 @@ // |1, 5, 7, 9| |1, 5| |0, 3, 5| // +----------+ +----+ +-------+ // - const PostingList L0 = {1, 3, 5, 8, 9}; - const PostingList L1 = {1, 5, 7, 9}; - const PostingList L3; - const PostingList L4 = {1, 5}; - const PostingList L5 = {0, 3, 5}; + const PostingList L0({1, 3, 5, 8, 9}); + const PostingList L1({1, 5, 7, 9}); + const PostingList L3({}); + const PostingList L4({1, 5}); + const PostingList L5({0, 3, 5}); // Root of the query tree: [1, 5] auto Root = createAnd( // Lower And Iterator: [1, 5, 9] - createAnd(create(L0), createBoost(create(L1), 2U)), + createAnd(L0.iterator(), createBoost(L1.iterator(), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(create(L3), createBoost(create(L4), 3U), - createBoost(create(L5), 4U))); + createOr(L3.iterator(), createBoost(L4.iterator(), 3U), + createBoost(L5.iterator(), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); @@ -255,40 +255,39 @@ } TEST(DexIterators, StringRepresentation) { - const PostingList L0 = {4, 7, 8, 20, 42, 100}; - const PostingList L1 = {1, 3, 5, 8, 9}; - const PostingList L2 = {1, 5, 7, 9}; - const PostingList L3 = {0, 5}; - const PostingList L4 = {0, 1, 5}; - const PostingList L5; - - EXPECT_EQ(llvm::to_string(*(create(L0))), "[{4}, 7, 8, 20, 42, 100, END]"); - - auto Nested = createAnd(createAnd(create(L1), create(L2)), - createOr(create(L3), create(L4), create(L5))); - - EXPECT_EQ(llvm::to_string(*Nested), - "(& (| [0, {5}, END] [0, {1}, 5, END] [{END}]) (& [{1}, 5, 7, 9, " - "END] [{1}, 3, 5, 8, 9, END]))"); + const PostingList L0({4, 7, 8, 20, 42, 100}); + const PostingList L1({1, 3, 5, 8, 9}); + const PostingList L2({1, 5, 7, 9}); + const PostingList L3({0, 5}); + const PostingList L4({0, 1, 5}); + const PostingList L5({}); + + EXPECT_EQ(llvm::to_string(*(L0.iterator())), "[4]"); + + auto Nested = + createAnd(createAnd(L1.iterator(), L2.iterator()), + createOr(L3.iterator(), L4.iterator(), L5.iterator())); + + EXPECT_EQ(llvm::to_string(*Nested), "(& (| [5] [1] [END]) (& [1] [1]))"); } TEST(DexIterators, Limit) { - const PostingList L0 = {3, 6, 7, 20, 42, 100}; - const PostingList L1 = {1, 3, 5, 6, 7, 30, 100}; - const PostingList L2 = {0, 3, 5, 7, 8, 100}; + const PostingList L0({3, 6, 7, 20, 42, 100}); + const PostingList L1({1, 3, 5, 6, 7, 30, 100}); + const PostingList L2({0, 3, 5, 7, 8, 100}); - auto DocIterator = createLimit(create(L0), 42); + auto DocIterator = createLimit(L0.iterator(), 42); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7, 20, 42, 100)); - DocIterator = createLimit(create(L0), 3); + DocIterator = createLimit(L0.iterator(), 3); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7)); - DocIterator = createLimit(create(L0), 0); + DocIterator = createLimit(L0.iterator(), 0); EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre()); - auto AndIterator = - createAnd(createLimit(createTrue(9000), 343), createLimit(create(L0), 2), - createLimit(create(L1), 3), createLimit(create(L2), 42)); + auto AndIterator = createAnd( + createLimit(createTrue(9000), 343), createLimit(L0.iterator(), 2), + createLimit(L1.iterator(), 3), createLimit(L2.iterator(), 42)); EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(3, 7)); } @@ -297,10 +296,10 @@ EXPECT_TRUE(TrueIterator->reachedEnd()); EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); - PostingList L0 = {1, 2, 5, 7}; + const PostingList L0({1, 2, 5, 7}); TrueIterator = createTrue(7U); EXPECT_THAT(TrueIterator->peek(), 0); - auto AndIterator = createAnd(create(L0), move(TrueIterator)); + auto AndIterator = createAnd(L0.iterator(), move(TrueIterator)); EXPECT_FALSE(AndIterator->reachedEnd()); EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); } @@ -311,10 +310,10 @@ auto ElementBoost = BoostIterator->consume(); EXPECT_THAT(ElementBoost, 42U); - const PostingList L0 = {2, 4}; - const PostingList L1 = {1, 4}; - auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U), - createBoost(create(L1), 3U)); + const PostingList L0({2, 4}); + const PostingList L1({1, 4}); + auto Root = createOr(createTrue(5U), createBoost(L0.iterator(), 2U), + createBoost(L1.iterator(), 3U)); ElementBoost = Root->consume(); EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); @@ -453,10 +452,10 @@ } TEST(Dex, FuzzyFind) { - auto Index = Dex::build( - generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", - "other::ABC", "other::A"}), - URISchemes); + auto Index = + Dex::build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", + "ns::nested::ABC", "other::ABC", "other::A"}), + URISchemes); FuzzyFindRequest Req; Req.Query = "ABC"; Req.Scopes = {"ns::"}; @@ -526,16 +525,14 @@ } TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) { - auto I = - Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); + auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); FuzzyFindRequest Req; Req.Query = "y"; EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); } TEST(DexTest, MatchQualifiedNamesWithGlobalScope) { - auto I = - Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); + auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {""};