Index: clangd/index/dex/Iterator.h =================================================================== --- clangd/index/dex/Iterator.h +++ clangd/index/dex/Iterator.h @@ -98,8 +98,16 @@ return Iterator.dump(OS); } + /// Inspect iterator type, used internally for optimizing query trees. + enum class Kind { And, Or, True, False, Other }; + Kind kind() const { return MyKind; } + +protected: + Iterator(Kind MyKind = Kind::Other) : MyKind(MyKind) {} + private: virtual llvm::raw_ostream &dump(llvm::raw_ostream &OS) const = 0; + Kind MyKind; }; /// Advances the iterator until it is exhausted. Returns pairs of document IDs @@ -150,6 +158,9 @@ /// containing all items in range [0, Size) in an efficient manner. std::unique_ptr all() const; + /// Returns FALSE Iterator which iterates over no documents. + std::unique_ptr none() const; + /// Returns BOOST iterator which multiplies the score of each item by given /// factor. Boosting can be used as a computationally inexpensive filtering. /// Users can return significantly more items using consumeAndBoost() and Index: clangd/index/dex/Iterator.cpp =================================================================== --- clangd/index/dex/Iterator.cpp +++ clangd/index/dex/Iterator.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "Iterator.h" +#include "llvm/Support/Casting.h" #include #include #include @@ -26,9 +27,11 @@ class AndIterator : public Iterator { public: explicit AndIterator(std::vector> AllChildren) - : Children(std::move(AllChildren)) { + : Iterator(Kind::And), Children(std::move(AllChildren)) { assert(!Children.empty() && "AND iterator should have at least one child."); // Establish invariants. + for (const auto &Child : Children) + ReachedEnd |= Child->reachedEnd(); sync(); // When children are sorted by the estimateSize(), sync() calls are more // effective. Each sync() starts with the first child and makes sure all @@ -122,6 +125,7 @@ /// update the field, rather than traversing the whole subtree in each /// reachedEnd() call. bool ReachedEnd = false; + friend Corpus; // For optimizations. }; /// Implements Iterator over the union of other iterators. @@ -133,7 +137,7 @@ class OrIterator : public Iterator { public: explicit OrIterator(std::vector> AllChildren) - : Children(std::move(AllChildren)) { + : Iterator(Kind::Or), Children(std::move(AllChildren)) { assert(!Children.empty() && "OR iterator should have at least one child."); } @@ -208,6 +212,7 @@ // FIXME(kbobyrev): Would storing Children in min-heap be faster? std::vector> Children; + friend Corpus; // For optimizations. }; /// TrueIterator handles PostingLists which contain all items of the index. It @@ -215,7 +220,7 @@ /// in O(1). class TrueIterator : public Iterator { public: - explicit TrueIterator(DocID Size) : Size(Size) {} + explicit TrueIterator(DocID Size) : Iterator(Kind::True), Size(Size) {} bool reachedEnd() const override { return Index >= Size; } @@ -251,12 +256,35 @@ DocID Size; }; +/// FalseIterator yields no results. +class FalseIterator : public Iterator { +public: + FalseIterator() : Iterator(Kind::False) {} + bool reachedEnd() const override { return true; } + void advance() override { assert(false); } + void advanceTo(DocID ID) override { assert(false); } + DocID peek() const override { + assert(false); + return 0; + } + float consume() override { + assert(false); + return 1; + } + size_t estimateSize() const override { return 0; } + +private: + llvm::raw_ostream &dump(llvm::raw_ostream &OS) const override { + return OS << "false"; + } +}; + /// Boost iterator is a wrapper around its child which multiplies scores of /// each retrieved item by a given factor. class BoostIterator : public Iterator { public: BoostIterator(std::unique_ptr Child, float Factor) - : Child(move(Child)), Factor(Factor) {} + : Child(std::move(Child)), Factor(Factor) {} bool reachedEnd() const override { return Child->reachedEnd(); } @@ -286,7 +314,7 @@ class LimitIterator : public Iterator { public: LimitIterator(std::unique_ptr Child, size_t Limit) - : Child(move(Child)), Limit(Limit), ItemsLeft(Limit) {} + : Child(std::move(Child)), Limit(Limit), ItemsLeft(Limit) {} bool reachedEnd() const override { return ItemsLeft == 0 || Child->reachedEnd(); @@ -331,30 +359,86 @@ std::unique_ptr Corpus::intersect(std::vector> Children) const { - // If there is exactly one child, pull it one level up: AND(Child) -> Child. - return Children.size() == 1 ? std::move(Children.front()) - : llvm::make_unique(move(Children)); + std::vector> RealChildren; + for (auto &Child : Children) { + switch (Child->kind()) { + case Iterator::Kind::True: + break; // No effect, drop the iterator. + case Iterator::Kind::False: + return std::move(Child); // Intersection is empty. + case Iterator::Kind::And: { + // Inline nested AND into parent AND. + auto &NewChildren = static_cast(Child.get())->Children; + std::move(NewChildren.begin(), NewChildren.end(), + std::back_inserter(RealChildren)); + break; + } + default: + RealChildren.push_back(std::move(Child)); + } + } + switch (RealChildren.size()) { + case 0: + return all(); + case 1: + return std::move(RealChildren.front()); + default: + return llvm::make_unique(std::move(RealChildren)); + } } std::unique_ptr Corpus::unionOf(std::vector> Children) const { - // If there is exactly one child, pull it one level up: OR(Child) -> Child. - return Children.size() == 1 ? std::move(Children.front()) - : llvm::make_unique(move(Children)); + std::vector> RealChildren; + for (auto &Child : Children) { + switch (Child->kind()) { + case Iterator::Kind::False: + break; // No effect, drop the iterator. + case Iterator::Kind::Or: { + // Inline nested OR into parent OR. + auto &NewChildren = static_cast(Child.get())->Children; + std::move(NewChildren.begin(), NewChildren.end(), + std::back_inserter(RealChildren)); + break; + } + case Iterator::Kind::True: + // Don't return all(), which would discard sibling boosts. + default: + RealChildren.push_back(std::move(Child)); + } + } + switch (RealChildren.size()) { + case 0: + return none(); + case 1: + return std::move(RealChildren.front()); + default: + return llvm::make_unique(std::move(RealChildren)); + } } std::unique_ptr Corpus::all() const { return llvm::make_unique(Size); } +std::unique_ptr Corpus::none() const { + return llvm::make_unique(); +} + std::unique_ptr Corpus::boost(std::unique_ptr Child, float Factor) const { - return llvm::make_unique(move(Child), Factor); + if (Factor == 1) + return Child; + if (Child->kind() == Iterator::Kind::False) + return Child; + return llvm::make_unique(std::move(Child), Factor); } std::unique_ptr Corpus::limit(std::unique_ptr Child, size_t Limit) const { - return llvm::make_unique(move(Child), Limit); + if (Child->kind() == Iterator::Kind::False) + return Child; + return llvm::make_unique(std::move(Child), Limit); } } // namespace dex Index: unittests/clangd/DexTests.cpp =================================================================== --- unittests/clangd/DexTests.cpp +++ unittests/clangd/DexTests.cpp @@ -109,6 +109,18 @@ EXPECT_TRUE(And->reachedEnd()); } +TEST(DexIterators, AndEmpty) { + Corpus C{10000}; + const PostingList L1({1}); + const PostingList L2({2}); + // These iterators are empty, but the optimizer can't tell. + auto Empty1 = C.intersect(L1.iterator(), L2.iterator()); + auto Empty2 = C.intersect(L1.iterator(), L2.iterator()); + // And syncs iterators on construction, and used to fail on empty children. + auto And = C.intersect(std::move(Empty1), std::move(Empty2)); + EXPECT_TRUE(And->reachedEnd()); +} + TEST(DexIterators, OrTwoLists) { Corpus C{10000}; const PostingList L0({0, 5, 7, 10, 42, 320, 9000}); @@ -270,18 +282,8 @@ } TEST(DexIterators, True) { - Corpus C{0}; - auto TrueIterator = C.all(); - EXPECT_TRUE(TrueIterator->reachedEnd()); - EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); - - C = Corpus{7}; - const PostingList L0({1, 2, 5, 7}); - TrueIterator = C.all(); - EXPECT_THAT(TrueIterator->peek(), 0); - auto AndIterator = C.intersect(L0.iterator(), move(TrueIterator)); - EXPECT_FALSE(AndIterator->reachedEnd()); - EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); + EXPECT_TRUE(Corpus{0}.all()->reachedEnd()); + EXPECT_THAT(consumeIDs(*Corpus{4}.all()), ElementsAre(0, 1, 2, 3)); } TEST(DexIterators, Boost) { @@ -313,6 +315,38 @@ EXPECT_THAT(ElementBoost, 3); } +TEST(DexIterators, Optimizations) { + Corpus C{5}; + const PostingList L1({1}); + const PostingList L2({2}); + const PostingList L3({3}); + + // empty and/or yield true/false + EXPECT_EQ(llvm::to_string(*C.intersect()), "true"); + EXPECT_EQ(llvm::to_string(*C.unionOf()), "false"); + + // true/false inside and/or short-circuit + EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.all())), "[1]"); + EXPECT_EQ(llvm::to_string(*C.intersect(L1.iterator(), C.none())), "false"); + // Not optimized to avoid breaking boosts. + EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.all())), + "(| [1] true)"); + EXPECT_EQ(llvm::to_string(*C.unionOf(L1.iterator(), C.none())), "[1]"); + + // and/or nested inside and/or are flattened + EXPECT_EQ(llvm::to_string(*C.intersect( + L1.iterator(), C.intersect(L1.iterator(), L1.iterator()))), + "(& [1] [1] [1])"); + EXPECT_EQ(llvm::to_string(*C.unionOf( + L1.iterator(), C.unionOf(L2.iterator(), L3.iterator()))), + "(| [1] [2] [3])"); + + // optimizations combine over multiple levels + EXPECT_EQ(llvm::to_string(*C.intersect( + C.intersect(L1.iterator(), C.intersect()), C.unionOf(C.all()))), + "[1]"); +} + //===----------------------------------------------------------------------===// // Search token tests. //===----------------------------------------------------------------------===//