Index: clangd/CMakeLists.txt =================================================================== --- clangd/CMakeLists.txt +++ clangd/CMakeLists.txt @@ -43,6 +43,7 @@ index/SymbolCollector.cpp index/SymbolYAML.cpp + index/dex/Iterator.cpp index/dex/Trigram.cpp LINK_LIBS Index: clangd/index/dex/Iterator.h =================================================================== --- clangd/index/dex/Iterator.h +++ clangd/index/dex/Iterator.h @@ -0,0 +1,143 @@ +//===--- Iterator.h - Query Symbol Retrieval --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Symbol index queries consist of specific requirements for the requested +// symbol, such as high fuzzy matching score, scope, type etc. The lists of all +// symbols matching some criteria (e.g. belonging to "clang::clangd::" scope) +// are expressed in a form of Search Tokens which are stored in the inverted +// index. Inverted index maps these tokens to the posting lists - sorted ( by +// symbol quality) sequences of symbol IDs matching the token, e.g. scope token +// "clangd::clangd::" is mapped to the list of IDs of all symbols which are +// declared in this namespace. Search queries are build from a set of +// requirements which can be combined with each other forming the query trees. +// The leafs of such trees are posting lists, and the nodes are operations on +// these posting lists, e.g. intersection or union. Efficient processing of +// these multi-level queries is handled by Iterators. Iterators advance through +// all leaf posting lists producing the result of search query, which preserves +// the sorted order of IDs. Having the resulting IDs sorted is important, +// because it allows receiving a certain number of the most valuable items (e.g. +// symbols with highest quality which was the sorting key in the first place) +// without processing all items with requested properties (this might not be +// computationally effective if search request is not very restrictive). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_ITERATOR_H + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +/// Symbol position in the list of all index symbols sorted by a pre-computed +/// symbol quality. +using DocID = uint32_t; +/// Contains sorted sequence of DocIDs all of which belong to symbols matching +/// certain criteria, i.e. containing a Search Token. PostingLists are values +/// for the inverted index. +using PostingList = std::vector; +/// Immutable reference to PostingList object. +using PostingListRef = llvm::ArrayRef; + +/// Iterator is the interface for Query Tree node. The simplest type of Iterator +/// is DocumentIterator which is simply a wrapper around PostingList iterator +/// and serves as the Query Tree leaf. More sophisticated examples of iterators +/// can manage intersection, union of the elements produced by other iterators +/// (their children) to form a multi-level Query Tree. The interface is designed +/// to be extensible in order to support multiple types of iterators. +class Iterator { + // FIXME(kbobyrev): Provide callback for matched documents. + // FIXME(kbobyrev): Implement new types of iterators: Label, Boost (with + // scoring), Limit. + // FIXME(kbobyrev): Implement iterator cost, an estimate of advance() calls + // before iterator exhaustion. +public: + /// Returns true if all valid DocIDs were processed and hence the iterator is + /// exhausted. + virtual bool reachedEnd() const = 0; + /// Moves to next valid DocID. If it doesn't exist, the iterator is exhausted + /// and proceeds to the END. + /// + /// Note: reachedEnd() must be false. + virtual void advance() = 0; + /// Moves to the first valid DocID which is equal or higher than given ID. If + /// it doesn't exist, the iterator is exhausted and proceeds to the END. + /// + /// Note: reachedEnd() must be false. + virtual void advanceTo(DocID ID) = 0; + /// Returns the current element this iterator points to. + /// + /// Note: reachedEnd() must be false. + virtual DocID peek() const = 0; + + virtual ~Iterator() {} + + /// Prints a convenient human-readable iterator representation by recursively + /// dumping iterators in the following format: + /// + /// (Type Child1 Child2 ...) + /// + /// Where Type is the iterator type representation: "&" for And, "|" for Or, + /// ChildN is N-th iterator child. Raw iterators over PostingList are + /// represented as "[ID1, ID2, ...]" where IDN is N-th PostingList entry. + friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, + const Iterator &Iterator) { + return Iterator.dump(OS); + } + +private: + virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; +}; + +/// Exhausts given iterator and returns all processed DocIDs. The result +/// contains sorted DocumentIDs. +std::vector consume(Iterator &It); + +/// Returns a document iterator over given PostingList. +std::unique_ptr create(PostingListRef Documents); + +/// Returns AND Iterator which performs the intersection of the PostingLists of +/// its children. +std::unique_ptr +createAnd(std::vector> Children); + +/// Returns OR Iterator which performs the union of the PostingLists of its +/// children. +std::unique_ptr +createOr(std::vector> Children); + +/// This allows createAnd({create(...), create(...)}) syntax. +template +std::unique_ptr +createAnd(std::unique_ptr(&&Children)[Size]) { + std::vector> Elements; + std::move(begin(Children), end(Children), std::back_inserter(Elements)); + return createAnd(move(Elements)); +} + +/// This allows createOr({create(...), create(...)}) syntax. +template +std::unique_ptr +createOr(std::unique_ptr(&&Children)[Size]) { + std::vector> Elements; + std::move(begin(Children), end(Children), std::back_inserter(Elements)); + return createOr(move(Elements)); +} + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif Index: clangd/index/dex/Iterator.cpp =================================================================== --- clangd/index/dex/Iterator.cpp +++ clangd/index/dex/Iterator.cpp @@ -0,0 +1,244 @@ +//===--- Iterator.cpp - Query Symbol Retrieval ------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Iterator.h" +#include +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +namespace { + +/// Implements Iterator over a PostingList. DocumentIterator is the most basic +/// iterator: it doesn't have any children (hence it is the leaf of iterator +/// tree) and is simply a wrapper around PostingList::const_iterator. +class DocumentIterator : public Iterator { +public: + DocumentIterator(PostingListRef Documents) + : Documents(Documents), Index(std::begin(Documents)) {} + + bool reachedEnd() const override { return Index == std::end(Documents); } + + /// Advances cursor to the next item. + void advance() override { + assert(!reachedEnd() && "DocumentIterator can't advance at the end."); + ++Index; + } + + /// Applies binary search to advance cursor to the next item with DocID equal + /// or higher than the given one. + void advanceTo(DocID ID) override { + assert(!reachedEnd() && "DocumentIterator can't advance at the end."); + Index = std::lower_bound(Index, std::end(Documents), ID); + } + + DocID peek() const override { + assert(!reachedEnd() && "DocumentIterator can't call peek() at the end."); + return *Index; + } + + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + OS << '['; + auto Separator = ""; + for (const auto &ID : Documents) { + OS << Separator << ID; + Separator = ", "; + } + OS << ']'; + return OS; + } + +private: + PostingListRef Documents; + PostingListRef::const_iterator Index; +}; + +/// Implements Iterator over the intersection of other iterators. +/// +/// AndIterator iterates through common items among all children. It becomes +/// exhausted as soon as any child becomes exhausted. After each mutation, the +/// iterator restores the invariant: all children must point to the same item. +class AndIterator : public Iterator { +public: + AndIterator(std::vector> AllChildren) + : Children(std::move(AllChildren)) { + assert(!Children.empty() && "AndIterator should have at least one child."); + // Establish invariants. + sync(); + } + + bool reachedEnd() const override { return ReachedEnd; } + + /// Advances all children to the next common item. + void advance() override { + assert(!reachedEnd() && "AndIterator can't call advance() at the end."); + Children.front()->advance(); + sync(); + } + + /// Advances all children to the next common item with DocumentID >= ID. + void advanceTo(DocID ID) override { + assert(!reachedEnd() && "AndIterator can't call advanceTo() at the end."); + Children.front()->advanceTo(ID); + sync(); + } + + DocID peek() const override { return Children.front()->peek(); } + + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + OS << "(& "; + auto Separator = ""; + for (const auto &Child : Children) { + OS << Separator << *Child; + Separator = " "; + } + OS << ')'; + return OS; + } + +private: + /// Restores class invariants: each child will point to the same element after + /// sync. + void sync() { + ReachedEnd |= Children.front()->reachedEnd(); + if (ReachedEnd) + return; + auto SyncID = Children.front()->peek(); + // Indicates whether any child needs to be advanced to new SyncID. + bool NeedsAdvance = false; + do { + NeedsAdvance = false; + for (auto &Child : Children) { + Child->advanceTo(SyncID); + ReachedEnd |= Child->reachedEnd(); + // If any child reaches end And iterator can not match any other items. + // In this case, just terminate the process. + if (ReachedEnd) + return; + // If any child goes beyond given ID (i.e. ID is not the common item), + // all children should be advanced to the next common item. + // FIXME(kbobyrev): This is not a very optimized version; after costs + // are introduced, cycle should break whenever ID exceeds current one + // and cheapest children should be advanced over again. + if (Child->peek() > SyncID) { + SyncID = Child->peek(); + NeedsAdvance = true; + } + } + } while (NeedsAdvance); + } + + /// AndIterator owns its children and ensures that all of them point to the + /// same element. As soon as one child gets exhausted, AndIterator can no + /// longer advance and has reached its end. + std::vector> Children; + /// Indicates whether any child is exhausted. It is cheaper to maintain and + /// update the field, rather than traversing the whole subtree in each + /// reachedEnd() call. + bool ReachedEnd = false; +}; + +/// Implements Iterator over the union of other iterators. +/// +/// OrIterator iterates through all items which can be pointed to by at least +/// one child. To preserve the sorted order, this iterator always advances the +/// child with smallest Child->peek() value. OrIterator becomes exhausted as +/// soon as all of its children are exhausted. +class OrIterator : public Iterator { +public: + OrIterator(std::vector> AllChildren) + : Children(std::move(AllChildren)) { + assert(Children.size() > 0 && "Or Iterator must have at least one child."); + } + + /// Returns true if all children are exhausted. + bool reachedEnd() const override { + return std::all_of(begin(Children), end(Children), + [](const std::unique_ptr &Child) { + return Child->reachedEnd(); + }); + } + + /// Moves each child pointing to the smallest DocID to the next item. + void advance() override { + assert(!reachedEnd() && + "OrIterator must have at least one child to advance()."); + const auto SmallestID = peek(); + for (const auto &Child : Children) + if (!Child->reachedEnd() && Child->peek() == SmallestID) + Child->advance(); + } + + /// Advances each child to the next existing element with DocumentID >= ID. + void advanceTo(DocID ID) override { + assert(!reachedEnd() && "Can't advance iterator after it reached the end."); + for (const auto &Child : Children) + if (!Child->reachedEnd()) + Child->advanceTo(ID); + } + + /// Returns the element under cursor of the child with smallest Child->peek() + /// value. + DocID peek() const override { + assert(!reachedEnd() && + "OrIterator must have at least one child to peek()."); + DocID Result = std::numeric_limits::max(); + + for (const auto &Child : Children) + if (!Child->reachedEnd()) + Result = std::min(Result, Child->peek()); + + return Result; + } + + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + OS << "(| "; + auto Separator = ""; + for (const auto &Child : Children) { + OS << Separator << *Child; + Separator = " "; + } + OS << ')'; + return OS; + } + +private: + // FIXME(kbobyrev): Would storing Children in min-heap be faster? + std::vector> Children; +}; + +} // end namespace + +std::vector consume(Iterator &It) { + std::vector Result; + for (; !It.reachedEnd(); It.advance()) + Result.push_back(It.peek()); + return Result; +} + +std::unique_ptr create(PostingListRef Documents) { + return llvm::make_unique(Documents); +} + +std::unique_ptr +createAnd(std::vector> Children) { + return llvm::make_unique(move(Children)); +} + +std::unique_ptr +createOr(std::vector> Children) { + return llvm::make_unique(move(Children)); +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: unittests/clangd/DexIndexTests.cpp =================================================================== --- unittests/clangd/DexIndexTests.cpp +++ unittests/clangd/DexIndexTests.cpp @@ -7,11 +7,13 @@ // //===----------------------------------------------------------------------===// +#include "index/dex/Iterator.h" #include "index/dex/Token.h" #include "index/dex/Trigram.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" #include "gmock/gmock.h" #include "gtest/gtest.h" - #include #include @@ -19,6 +21,215 @@ namespace clangd { namespace dex { +using ::testing::ElementsAre; + +TEST(DexIndexIterators, DocumentIterator) { + auto DocIterator = create({4, 7, 8, 20, 42, 100}); + + EXPECT_EQ(DocIterator->peek(), 4U); + EXPECT_EQ(DocIterator->reachedEnd(), false); + + DocIterator->advance(); + EXPECT_EQ(DocIterator->peek(), 7U); + EXPECT_EQ(DocIterator->reachedEnd(), false); + + DocIterator->advanceTo(20); + EXPECT_EQ(DocIterator->peek(), 20U); + EXPECT_EQ(DocIterator->reachedEnd(), false); + + DocIterator->advanceTo(65); + EXPECT_EQ(DocIterator->peek(), 100U); + EXPECT_EQ(DocIterator->reachedEnd(), false); + + DocIterator->advanceTo(420); + EXPECT_EQ(DocIterator->reachedEnd(), true); +} + +TEST(DexIndexIterators, AndWithEmpty) { + const PostingList L0; + const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; + + auto AndEmpty = createAnd({create(L0)}); + EXPECT_EQ(AndEmpty->reachedEnd(), true); + + auto AndWithEmpty = createAnd({create(L0), create(L1)}); + EXPECT_EQ(AndWithEmpty->reachedEnd(), true); + + EXPECT_THAT(consume(*AndWithEmpty), ElementsAre()); +} + +TEST(DexIndexIterators, AndTwoLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + + auto And = createAnd({create(L1), create(L0)}); + + EXPECT_EQ(And->reachedEnd(), false); + EXPECT_THAT(consume(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); + + And = createAnd({create(L0), create(L1)}); + + And->advanceTo(0); + EXPECT_EQ(And->peek(), 0U); + And->advanceTo(5); + EXPECT_EQ(And->peek(), 7U); + And->advanceTo(10); + EXPECT_EQ(And->peek(), 10U); + And->advanceTo(42); + EXPECT_EQ(And->peek(), 320U); + And->advanceTo(8999); + EXPECT_EQ(And->peek(), 9000U); + And->advanceTo(9001); +} + +TEST(DexIndexIterators, AndThreeLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; + + auto And = createAnd({create(L0), create(L1), create(L2)}); + EXPECT_EQ(And->peek(), 7U); + And->advanceTo(300); + EXPECT_EQ(And->peek(), 320U); + And->advanceTo(100000); + + EXPECT_EQ(And->reachedEnd(), true); +} + +TEST(DexIndexIterators, OrWithEmpty) { + const PostingList L0; + const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; + + auto OrEmpty = createOr({create(L0)}); + EXPECT_EQ(OrEmpty->reachedEnd(), true); + + auto OrWithEmpty = createOr({create(L0), create(L1)}); + EXPECT_EQ(OrWithEmpty->reachedEnd(), false); + + EXPECT_THAT(consume(*OrWithEmpty), + ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U)); +} + +TEST(DexIndexIterators, OrTwoLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + + auto Or = createOr({create(L0), create(L1)}); + + EXPECT_EQ(Or->reachedEnd(), false); + EXPECT_EQ(Or->peek(), 0U); + Or->advance(); + EXPECT_EQ(Or->peek(), 4U); + Or->advance(); + EXPECT_EQ(Or->peek(), 5U); + Or->advance(); + EXPECT_EQ(Or->peek(), 7U); + Or->advance(); + EXPECT_EQ(Or->peek(), 10U); + Or->advance(); + EXPECT_EQ(Or->peek(), 30U); + Or->advanceTo(42); + EXPECT_EQ(Or->peek(), 42U); + Or->advanceTo(300); + EXPECT_EQ(Or->peek(), 320U); + Or->advanceTo(9000); + EXPECT_EQ(Or->peek(), 9000U); + Or->advanceTo(9001); + EXPECT_EQ(Or->reachedEnd(), true); + + Or = createOr({create(L0), create(L1)}); + + EXPECT_THAT(consume(*Or), + ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); +} + +TEST(DexIndexIterators, OrThreeLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; + + auto Or = createOr({create(L0), create(L1), create(L2)}); + + EXPECT_EQ(Or->reachedEnd(), false); + EXPECT_EQ(Or->peek(), 0U); + + Or->advance(); + EXPECT_EQ(Or->peek(), 1U); + + Or->advance(); + EXPECT_EQ(Or->peek(), 4U); + + Or->advanceTo(7); + + Or->advanceTo(59); + EXPECT_EQ(Or->peek(), 60U); + + Or->advanceTo(9001); + EXPECT_EQ(Or->reachedEnd(), true); +} + +// FIXME(kbobyrev): The testcase below is similar to what is expected in real +// queries. It should be updated once new iterators (such as boosting, limiting, +// etc iterators) appear. However, it is not exhaustive and it would be +// beneficial to implement automatic generation of query trees for more +// comprehensive testing. +TEST(DexIndexIterators, QueryTree) { + // An example of more complicated query + // + // +-----------------+ + // |And Iterator:1, 5| + // +--------+--------+ + // | + // | + // +------------------------------------+ + // | | + // | | + // +----------v----------+ +----------v---------+ + // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 5| + // +----------+----------+ +----------+---------+ + // | | + // +------+-----+ +---------+-----------+ + // | | | | | + // +-------v-----+ +----v-----+ +--v--+ +-V--+ +---v---+ + // |1, 3, 5, 8, 9| |1, 5, 7, 9| |Empty| |0, 5| |0, 1, 5| + // +-------------+ +----------+ +-----+ +----+ +-------+ + + // Root of the query tree: [1, 5] + auto Root = createAnd({ + // Lower And Iterator: [1, 5, 9] + createAnd({create({1, 3, 5, 8, 9}), create({1, 5, 7, 9})}), + // Lower Or Iterator: [0, 1, 5] + createOr({create({0, 5}), create({0, 1, 5}), create({})}), + }); + + EXPECT_EQ(Root->reachedEnd(), false); + EXPECT_EQ(Root->peek(), 1U); + Root->advanceTo(0); + // Advance multiple times. Shouldn't do anything. + Root->advanceTo(1); + Root->advanceTo(0); + EXPECT_EQ(Root->peek(), 1U); + Root->advance(); + EXPECT_EQ(Root->peek(), 5U); + Root->advanceTo(5); + EXPECT_EQ(Root->peek(), 5U); + Root->advanceTo(9000); + EXPECT_EQ(Root->reachedEnd(), true); +} + +TEST(DexIndexIterators, StringRepresentation) { + EXPECT_EQ(llvm::to_string(*(create({4, 7, 8, 20, 42, 100}))), + "[4, 7, 8, 20, 42, 100]"); + + auto Nested = createAnd({ + createAnd({create({1, 3, 5, 8, 9}), create({1, 5, 7, 9})}), + createOr({create({0, 5}), create({0, 1, 5}), create({})}), + }); + + EXPECT_EQ(llvm::to_string(*Nested), + "(& (& [1, 3, 5, 8, 9] [1, 5, 7, 9]) (| [0, 5] [0, 1, 5] []))"); +} + testing::Matcher> trigramsAre(std::initializer_list Trigrams) { std::vector Tokens;