Index: clang-tools-extra/clangd/index/dex/Iterator.h =================================================================== --- clang-tools-extra/clangd/index/dex/Iterator.h +++ clang-tools-extra/clangd/index/dex/Iterator.h @@ -47,6 +47,14 @@ /// Contains sorted sequence of DocIDs all of which belong to symbols matching /// certain criteria, i.e. containing a Search Token. PostingLists are values /// for the inverted index. +// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are +// likely to be very dense and hence require special attention so that the index +// doesn't use too much memory. Possible solution would be to construct +// compressed posting lists which consist of ranges of DocIDs instead of +// distinct DocIDs. A special case would be the empty query: for that case +// TrueIterator should be implemented - an iterator which doesn't actually store +// any PostingList within itself, but "contains" all DocIDs in range +// [0, IndexSize). using PostingList = std::vector; /// Immutable reference to PostingList object. using PostingListRef = llvm::ArrayRef; Index: clang-tools-extra/clangd/index/dex/Trigram.h =================================================================== --- clang-tools-extra/clangd/index/dex/Trigram.h +++ clang-tools-extra/clangd/index/dex/Trigram.h @@ -44,6 +44,14 @@ /// to the next character, move to the start of the next segment, or skip over a /// segment. /// +/// Special kind of trigrams - incomplete trigrams is also present in the +/// result. Incomplete trigrams contain END_MARKER ('$') at the end. Result +/// contains unigrams which contain the first symbol of each fuzzy matching +/// segment start, i.e. symbols which are assigned Head role during fuzzy +/// matching segmentation stage. The result also contains all bigrams which are +/// generated in the same way sequences of length 3 are created. This means that +/// for short queries Dex index would do special kind of prefix matching. +/// /// Note: the returned list of trigrams does not have duplicates, if any trigram /// belongs to more than one class it is only inserted once. std::vector generateIdentifierTrigrams(llvm::StringRef Identifier); @@ -53,6 +61,11 @@ /// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then, /// the simplest trigrams - sequences of three consecutive letters and digits /// are extracted and returned after deduplication. +/// +/// For short queries (Query contains less than 3 letters and digits) this +/// returns a single trigram with all valid identifier symbols. +/// +/// Example: Query - "_u$_p", Trigram - "up". std::vector generateQueryTrigrams(llvm::StringRef Query); } // namespace dex Index: clang-tools-extra/clangd/index/dex/Trigram.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Trigram.cpp +++ clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -10,11 +10,9 @@ #include "Trigram.h" #include "../../FuzzyMatch.h" #include "Token.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringExtras.h" - #include #include #include @@ -25,12 +23,22 @@ namespace clangd { namespace dex { -// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would -// be generating unigrams and bigrams here, too. This would prevent symbol index -// from applying fuzzy matching on a tremendous number of symbols and allow -// supplementary retrieval for short queries. -// -// Short names (total segment length <3 characters) are currently ignored. +namespace { + +/// This is used to mark unigrams and bigrams and distinct them from complete +/// trigrams. Since '$' is not present in valid identifier names, it is safe to +/// use it as the special symbol. +const auto END_MARKER = '$'; + +void insertChars(DenseSet &UniqueTrigrams, std::array Chars) { + const auto Trigram = Token(Token::Kind::Trigram, Chars.data()); + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + } +} + +} // namespace + std::vector generateIdentifierTrigrams(llvm::StringRef Identifier) { // Apply fuzzy matching text segmentation. std::vector Roles(Identifier.size()); @@ -59,25 +67,37 @@ } } + // Iterate through valid seqneces of three characters Fuzzy Matcher can + // process. DenseSet UniqueTrigrams; - std::array Chars; for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; + + // Generate unigrams using the first symbol of each fuzzy matching segment. + if (Roles[I] == Head) { + insertChars(UniqueTrigrams, + {{LowercaseIdentifier[I], END_MARKER, END_MARKER, 0}}); + } + for (const unsigned J : Next[I]) { if (!J) continue; + + // Generate trigrams only if the first character is the segment start. + // Example: "StringStartsWith" would yield "st$", "ss$" but not "tr$". + if (Roles[I] == Head) { + insertChars(UniqueTrigrams, {{LowercaseIdentifier[I], + LowercaseIdentifier[J], END_MARKER, 0}}); + } + for (const unsigned K : Next[J]) { if (!K) continue; - Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], - LowercaseIdentifier[K], 0}}; - auto Trigram = Token(Token::Kind::Trigram, Chars.data()); - // Push unique trigrams to the result. - if (!UniqueTrigrams.count(Trigram)) { - UniqueTrigrams.insert(Trigram); - } + insertChars(UniqueTrigrams, + {{LowercaseIdentifier[I], LowercaseIdentifier[J], + LowercaseIdentifier[K], 0}}); } } } @@ -89,13 +109,19 @@ return Result; } -// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short -// inputs (total segment length <3 characters). +// FIXME(kbobyrev): Correctly handle empty trigrams "$$$". std::vector generateQueryTrigrams(llvm::StringRef Query) { // Apply fuzzy matching text segmentation. std::vector Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + // Additional pass is necessary to count valid identifier characters. + // Depending on that, this function might return incomplete trigram. + unsigned ValidSymbolsCount = 0; + for (size_t I = 0; I < Roles.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + ++ValidSymbolsCount; + std::string LowercaseQuery = Query.lower(); DenseSet UniqueTrigrams; @@ -110,9 +136,27 @@ if (Chars.size() > 3) Chars.pop_front(); + + if (ValidSymbolsCount < 3 && Chars.size() == ValidSymbolsCount) { + while (Chars.size() < 3) { + Chars.push_back(END_MARKER); + } + auto Trigram = + Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + UniqueTrigrams.insert(Trigram); + // The symbol does not have sufficient number of valid symbols for + // complete trigrams, all valid symbols were seen at this point. + break; + } + if (Chars.size() == 3) { auto Trigram = Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + + if (Chars.front() == END_MARKER) + Trigram = Token(Token::Kind::Trigram, + std::string(Chars.rbegin(), Chars.rend())); + // Push unique trigrams to the result. if (!UniqueTrigrams.count(Trigram)) { UniqueTrigrams.insert(Trigram); Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -250,45 +250,55 @@ } TEST(DexIndexTrigrams, IdentifierTrigrams) { - EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateIdentifierTrigrams("X86"), + trigramsAre({"x86", "x$$", "x8$"})); - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"})); EXPECT_THAT(generateIdentifierTrigrams("clangd"), - trigramsAre({"cla", "lan", "ang", "ngd"})); + trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"})); EXPECT_THAT(generateIdentifierTrigrams("abc_def"), - trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"})); + trigramsAre({"a$$", "d$$", "abc", "abd", "ade", "bcd", "bde", + "cde", "def", "ab$", "ad$", "de$"})); - EXPECT_THAT( - generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), + trigramsAre({"a$$", "b$$", "c$$", "d$$", "e$$", "abc", "abd", + "acd", "ace", "bcd", "bce", "bde", "cde", "ab$", + "ac$", "bc$", "bd$", "cd$", "ce$", "de$"})); - EXPECT_THAT( - generateIdentifierTrigrams("unique_ptr"), - trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", - "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); + EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"u$$", "p$$", "uni", "unp", "upt", "niq", "nip", + "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", + "uep", "ept", "ptr", "un$", "up$", "pt$"})); EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), - trigramsAre({"tud", "tde", "ude", "dec", "ecl"})); - - EXPECT_THAT(generateIdentifierTrigrams("IsOK"), - trigramsAre({"iso", "iok", "sok"})); - - EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"), - trigramsAre({ - "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd", - "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk", - "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg", - "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", - "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", - })); + trigramsAre({"t$$", "d$$", "tud", "tde", "ude", "dec", "ecl", + "tu$", "td$", "de$"})); + + EXPECT_THAT( + generateIdentifierTrigrams("IsOK"), + trigramsAre({"i$$", "o$$", "iso", "iok", "sok", "is$", "io$", "ok$"})); + + EXPECT_THAT( + generateIdentifierTrigrams("abc_defGhij__klm"), + trigramsAre( + {"a$$", "d$$", "g$$", "k$$", "abc", "abd", "abg", "ade", "adg", "adk", + "agh", "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", + "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", + "efg", "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", + "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$", + "ag$", "de$", "dg$", "dk$", "gh$", "gk$", "kl$"})); } TEST(DexIndexTrigrams, QueryTrigrams) { - EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); + EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); + EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); - EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); EXPECT_THAT(generateQueryTrigrams("clangd"), trigramsAre({"cla", "lan", "ang", "ngd"}));