Index: clangd/index/dex/DexIndex.cpp =================================================================== --- clangd/index/dex/DexIndex.cpp +++ clangd/index/dex/DexIndex.cpp @@ -125,11 +125,15 @@ // using 100x of the requested number might not be good in practice, e.g. // when the requested number of items is small. const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount; - std::vector SymbolDocIDs = consume(*QueryIterator, ItemsToRetrieve); + // FIXME(kbobyrev): Add boosting to the query and utilize retrieved + // boosting scores. + std::vector> SymbolDocIDs = + consume(*QueryIterator, ItemsToRetrieve); // Retrieve top Req.MaxCandidateCount items. std::priority_queue> Top; - for (const auto &SymbolDocID : SymbolDocIDs) { + for (const auto &P : SymbolDocIDs) { + const DocID SymbolDocID = P.first; const auto *Sym = (*Symbols)[SymbolDocID]; const llvm::Optional Score = Filter.match(Sym->Name); if (!Score) @@ -161,7 +165,6 @@ } } - void DexIndex::findOccurrences( const OccurrencesRequest &Req, llvm::function_ref Callback) const { Index: clangd/index/dex/Iterator.h =================================================================== --- clangd/index/dex/Iterator.h +++ clangd/index/dex/Iterator.h @@ -66,11 +66,9 @@ /// (their children) to form a multi-level Query Tree. The interface is designed /// to be extensible in order to support multiple types of iterators. class Iterator { - // FIXME(kbobyrev): Provide callback for matched documents. - // FIXME(kbobyrev): Implement new types of iterators: Label, Boost (with - // scoring), Limit. // FIXME(kbobyrev): Implement iterator cost, an estimate of advance() calls // before iterator exhaustion. + // FIXME(kbobyrev): Implement Limit iterator. public: /// Returns true if all valid DocIDs were processed and hence the iterator is /// exhausted. @@ -89,6 +87,13 @@ /// /// Note: reachedEnd() must be false. virtual DocID peek() const = 0; + /// Retrieves boosting score. Query tree root should pass Root->peek() to this + /// function, the parameter is needed to propagate through the tree. Given ID + /// should be compared against BOOST iterator peek()s: some of the iterators + /// would not point to the item which was propagated to the top of the query + /// tree (e.g. if these iterators are branches of OR iterator) and hence + /// shouldn't apply any boosting to the consumed item. + virtual float consume(DocID ID) = 0; virtual ~Iterator() {} @@ -107,32 +112,59 @@ return Iterator.dump(OS); } + constexpr static float DEFAULT_BOOST_SCORE = 1; + private: virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; }; /// Advances the iterator until it is either exhausted or the number of -/// requested items is reached. The result contains sorted DocumentIDs. -std::vector consume(Iterator &It, - size_t Limit = std::numeric_limits::max()); +/// requested items is reached. Returns pairs of document IDs with the +/// corresponding boosting score. +/// +/// Boosting can be seen as a compromise between retrieving too many items and +/// calculating finals score for each of them (which might be very expensive) +/// and not retrieving enough items so that items with very high final score +/// would not be processed. Boosting score is a computationally efficient way +/// to acquire preliminary scores of requested items. +std::vector> +consume(Iterator &It, size_t Limit = std::numeric_limits::max()); /// Returns a document iterator over given PostingList. +/// +/// DocumentIterator returns DEFAULT_BOOST_SCORE for each processed item. std::unique_ptr create(PostingListRef Documents); /// Returns AND Iterator which performs the intersection of the PostingLists of /// its children. +/// +/// consume(): AND Iterator returns the product of Childrens' boosting scores +/// when not exhausted and DEFAULT_BOOST_SCORE otherwise. std::unique_ptr createAnd(std::vector> Children); /// Returns OR Iterator which performs the union of the PostingLists of its /// children. +/// +/// consume(): OR Iterator returns the highest boost value among children +/// pointing to requested item when not exhausted and DEFAULT_BOOST_SCORE +/// otherwise. std::unique_ptr createOr(std::vector> Children); /// Returns TRUE Iterator which iterates over "virtual" PostingList containing /// all items in range [0, Size) in an efficient manner. +/// +/// TRUE returns DEFAULT_BOOST_SCORE for each processed item. std::unique_ptr createTrue(DocID Size); +/// Returns BOOST iterator which multiplies the score of each item by given +/// factor. Boosting can be used as a computationally inexpensive filtering. +/// Users can return significantly more items using consumeAndBoost() and then +/// trim Top K using retrieval score. +std::unique_ptr createBoost(std::unique_ptr Child, + float Factor); + /// This allows createAnd(create(...), create(...)) syntax. template std::unique_ptr createAnd(Args... args) { std::vector> Children; Index: clangd/index/dex/Iterator.cpp =================================================================== --- clangd/index/dex/Iterator.cpp +++ clangd/index/dex/Iterator.cpp @@ -46,6 +46,8 @@ return *Index; } + float consume(DocID ID) override { return DEFAULT_BOOST_SCORE; } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << '['; @@ -103,6 +105,19 @@ DocID peek() const override { return Children.front()->peek(); } + // If not exhausted and points to the given item, consume() returns the + // product of Children->consume(ID). Otherwise, DEFAULT_BOOST_SCORE is + // returned. + float consume(DocID ID) override { + if (reachedEnd() || peek() != ID) + return DEFAULT_BOOST_SCORE; + return std::accumulate( + begin(Children), end(Children), DEFAULT_BOOST_SCORE, + [&](float Current, const std::unique_ptr &Child) { + return Current * Child->consume(ID); + }); + } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << "(& "; @@ -209,6 +224,20 @@ return Result; } + // Returns the maximum boosting score among all Children when iterator is not + // exhausted and points to the given ID, DEFAULT_BOOST_SCORE otherwise. + float consume(DocID ID) override { + if (reachedEnd() || peek() != ID) + return DEFAULT_BOOST_SCORE; + return std::accumulate( + begin(Children), end(Children), DEFAULT_BOOST_SCORE, + [&](float Current, const std::unique_ptr &Child) { + return (!Child->reachedEnd() && Child->peek() == ID) + ? std::max(Current, Child->consume(ID)) + : Current; + }); + } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << "(| "; @@ -249,6 +278,8 @@ return Index; } + float consume(DocID) override { return DEFAULT_BOOST_SCORE; } + private: llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { OS << "(TRUE {" << Index << "} out of " << Size << ")"; @@ -260,13 +291,42 @@ DocID Size; }; +/// Boost iterator is a wrapper around its child which multiplies scores of +/// each retrieved item by a given factor. +class BoostIterator : public Iterator { +public: + BoostIterator(std::unique_ptr Child, float Factor) + : Child(move(Child)), Factor(Factor) {} + + bool reachedEnd() const override { return Child->reachedEnd(); } + + void advance() override { Child->advance(); } + + void advanceTo(DocID ID) override { Child->advanceTo(ID); } + + DocID peek() const override { return Child->peek(); } + + float consume(DocID ID) override { return Child->consume(ID) * Factor; } + +private: + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + OS << "(BOOST " << Factor << ' ' << *Child << ')'; + return OS; + } + + std::unique_ptr Child; + float Factor; +}; + } // end namespace -std::vector consume(Iterator &It, size_t Limit) { - std::vector Result; +std::vector> consume(Iterator &It, size_t Limit) { + std::vector> Result; for (size_t Retrieved = 0; !It.reachedEnd() && Retrieved < Limit; - It.advance(), ++Retrieved) - Result.push_back(It.peek()); + It.advance(), ++Retrieved) { + DocID Document = It.peek(); + Result.push_back(std::make_pair(Document, It.consume(Document))); + } return Result; } @@ -288,6 +348,11 @@ return llvm::make_unique(Size); } +std::unique_ptr createBoost(std::unique_ptr Child, + float Factor) { + return llvm::make_unique(move(Child), Factor); +} + } // namespace dex } // namespace clangd } // namespace clang Index: unittests/clangd/DexIndexTests.cpp =================================================================== --- unittests/clangd/DexIndexTests.cpp +++ unittests/clangd/DexIndexTests.cpp @@ -29,6 +29,15 @@ namespace dex { namespace { +std::vector +consumeIDs(Iterator &It, size_t Limit = std::numeric_limits::max()) { + auto IDAndScore = consume(It, Limit); + std::vector IDs(IDAndScore.size()); + for (size_t I = 0; I < IDAndScore.size(); ++I) + IDs[I] = IDAndScore[I].first; + return IDs; +} + TEST(DexIndexIterators, DocumentIterator) { const PostingList L = {4, 7, 8, 20, 42, 100}; auto DocIterator = create(L); @@ -62,7 +71,7 @@ auto AndWithEmpty = createAnd(create(L0), create(L1)); EXPECT_TRUE(AndWithEmpty->reachedEnd()); - EXPECT_THAT(consume(*AndWithEmpty), ElementsAre()); + EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre()); } TEST(DexIndexIterators, AndTwoLists) { @@ -72,7 +81,7 @@ auto And = createAnd(create(L1), create(L0)); EXPECT_FALSE(And->reachedEnd()); - EXPECT_THAT(consume(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); + EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); And = createAnd(create(L0), create(L1)); @@ -113,7 +122,7 @@ auto OrWithEmpty = createOr(create(L0), create(L1)); EXPECT_FALSE(OrWithEmpty->reachedEnd()); - EXPECT_THAT(consume(*OrWithEmpty), + EXPECT_THAT(consumeIDs(*OrWithEmpty), ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U)); } @@ -146,7 +155,7 @@ Or = createOr(create(L0), create(L1)); - EXPECT_THAT(consume(*Or), + EXPECT_THAT(consumeIDs(*Or), ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); } @@ -178,41 +187,45 @@ // FIXME(kbobyrev): The testcase below is similar to what is expected in real // queries. It should be updated once new iterators (such as boosting, limiting, // etc iterators) appear. However, it is not exhaustive and it would be -// beneficial to implement automatic generation of query trees for more -// comprehensive testing. +// beneficial to implement automatic generation (e.g. fuzzing) of query trees +// for more comprehensive testing. TEST(DexIndexIterators, QueryTree) { - // An example of more complicated query // // +-----------------+ // |And Iterator:1, 5| // +--------+--------+ // | // | - // +------------------------------------+ + // +-------------+----------------------+ // | | // | | - // +----------v----------+ +----------v---------+ - // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 5| - // +----------+----------+ +----------+---------+ + // +----------v----------+ +----------v------------+ + // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5| + // +----------+----------+ +----------+------------+ // | | - // +------+-----+ +---------+-----------+ + // +------+-----+ +---------------------+ // | | | | | - // +-------v-----+ +----v-----+ +--v--+ +-V--+ +---v---+ - // |1, 3, 5, 8, 9| |1, 5, 7, 9| |Empty| |0, 5| |0, 1, 5| - // +-------------+ +----------+ +-----+ +----+ +-------+ - + // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+ + // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4| + // +-------------+ +----+---+ +-----+ +---+----+ +----+---+ + // | | | + // +----v-----+ +-v--+ +---v---+ + // |1, 5, 7, 9| |1, 5| |0, 3, 5| + // +----------+ +----+ +-------+ + // const PostingList L0 = {1, 3, 5, 8, 9}; const PostingList L1 = {1, 5, 7, 9}; - const PostingList L2 = {0, 5}; - const PostingList L3 = {0, 1, 5}; - const PostingList L4; + const PostingList L3; + const PostingList L4 = {1, 5}; + const PostingList L5 = {0, 3, 5}; // Root of the query tree: [1, 5] auto Root = createAnd( // Lower And Iterator: [1, 5, 9] - createAnd(create(L0), create(L1)), + createAnd(create(L0), createBoost(create(L1), 2U)), // Lower Or Iterator: [0, 1, 5] - createOr(create(L2), create(L3), create(L4))); + createOr(create(L3), createBoost(create(L4), 3U), + createBoost(create(L5), 4U))); EXPECT_FALSE(Root->reachedEnd()); EXPECT_EQ(Root->peek(), 1U); @@ -221,10 +234,14 @@ Root->advanceTo(1); Root->advanceTo(0); EXPECT_EQ(Root->peek(), 1U); + auto ElementBoost = Root->consume(Root->peek()); + EXPECT_THAT(ElementBoost, 6); Root->advance(); EXPECT_EQ(Root->peek(), 5U); Root->advanceTo(5); EXPECT_EQ(Root->peek(), 5U); + ElementBoost = Root->consume(Root->peek()); + EXPECT_THAT(ElementBoost, 8); Root->advanceTo(9000); EXPECT_TRUE(Root->reachedEnd()); } @@ -256,29 +273,57 @@ const PostingList L5; auto DocIterator = create(L0); - EXPECT_THAT(consume(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100)); + EXPECT_THAT(consumeIDs(*DocIterator, 42), ElementsAre(4, 7, 8, 20, 42, 100)); DocIterator = create(L0); - EXPECT_THAT(consume(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100)); + EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(4, 7, 8, 20, 42, 100)); DocIterator = create(L0); - EXPECT_THAT(consume(*DocIterator, 3), ElementsAre(4, 7, 8)); + EXPECT_THAT(consumeIDs(*DocIterator, 3), ElementsAre(4, 7, 8)); DocIterator = create(L0); - EXPECT_THAT(consume(*DocIterator, 0), ElementsAre()); + EXPECT_THAT(consumeIDs(*DocIterator, 0), ElementsAre()); } TEST(DexIndexIterators, True) { auto TrueIterator = createTrue(0U); EXPECT_TRUE(TrueIterator->reachedEnd()); - EXPECT_THAT(consume(*TrueIterator), ElementsAre()); + EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); PostingList L0 = {1, 2, 5, 7}; TrueIterator = createTrue(7U); EXPECT_THAT(TrueIterator->peek(), 0); auto AndIterator = createAnd(create(L0), move(TrueIterator)); EXPECT_FALSE(AndIterator->reachedEnd()); - EXPECT_THAT(consume(*AndIterator), ElementsAre(1, 2, 5)); + EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); +} + +TEST(DexIndexIterators, Boost) { + auto BoostIterator = createBoost(createTrue(5U), 42U); + EXPECT_FALSE(BoostIterator->reachedEnd()); + auto ElementBoost = BoostIterator->consume(BoostIterator->peek()); + EXPECT_THAT(ElementBoost, 42U); + + const PostingList L0 = {2, 4}; + const PostingList L1 = {1, 4}; + auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U), + createBoost(create(L1), 3U)); + + ElementBoost = Root->consume(Root->peek()); + EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); + Root->advance(); + EXPECT_THAT(Root->peek(), 1U); + ElementBoost = Root->consume(Root->peek()); + EXPECT_THAT(ElementBoost, 3); + + Root->advance(); + EXPECT_THAT(Root->peek(), 2U); + ElementBoost = Root->consume(Root->peek()); + EXPECT_THAT(ElementBoost, 2); + + Root->advanceTo(4); + ElementBoost = Root->consume(Root->peek()); + EXPECT_THAT(ElementBoost, 3); } testing::Matcher>