Index: clang-tools-extra/trunk/clangd/index/dex/Iterator.h =================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Iterator.h +++ clang-tools-extra/trunk/clangd/index/dex/Iterator.h @@ -47,6 +47,14 @@ /// Contains sorted sequence of DocIDs all of which belong to symbols matching /// certain criteria, i.e. containing a Search Token. PostingLists are values /// for the inverted index. +// FIXME(kbobyrev): Posting lists for incomplete trigrams (one/two symbols) are +// likely to be very dense and hence require special attention so that the index +// doesn't use too much memory. Possible solution would be to construct +// compressed posting lists which consist of ranges of DocIDs instead of +// distinct DocIDs. A special case would be the empty query: for that case +// TrueIterator should be implemented - an iterator which doesn't actually store +// any PostingList within itself, but "contains" all DocIDs in range +// [0, IndexSize). using PostingList = std::vector; /// Immutable reference to PostingList object. using PostingListRef = llvm::ArrayRef; Index: clang-tools-extra/trunk/clangd/index/dex/Trigram.h =================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Trigram.h +++ clang-tools-extra/trunk/clangd/index/dex/Trigram.h @@ -36,14 +36,20 @@ /// First, given Identifier (unqualified symbol name) is segmented using /// FuzzyMatch API and lowercased. After segmentation, the following technique /// is applied for generating trigrams: for each letter or digit in the input -/// string the algorithms looks for the possible next and skip-1-next symbols +/// string the algorithms looks for the possible next and skip-1-next characters /// which can be jumped to during fuzzy matching. Each combination of such three -/// symbols is inserted into the result. +/// characters is inserted into the result. /// /// Trigrams can start at any character in the input. Then we can choose to move /// to the next character, move to the start of the next segment, or skip over a /// segment. /// +/// This also generates incomplete trigrams for short query scenarios: +/// * Empty trigram: "$$$". +/// * Unigram: the first character of the identifier. +/// * Bigrams: a 2-char prefix of the identifier and a bigram of the first two +/// HEAD characters (if they exist). +// /// Note: the returned list of trigrams does not have duplicates, if any trigram /// belongs to more than one class it is only inserted once. std::vector generateIdentifierTrigrams(llvm::StringRef Identifier); @@ -53,6 +59,10 @@ /// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then, /// the simplest trigrams - sequences of three consecutive letters and digits /// are extracted and returned after deduplication. +/// +/// For short queries (less than 3 characters with Head or Tail roles in Fuzzy +/// Matching segmentation) this returns a single trigram with the first +/// characters (up to 3) to perfrom prefix match. std::vector generateQueryTrigrams(llvm::StringRef Query); } // namespace dex Index: clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp =================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp +++ clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp @@ -10,11 +10,9 @@ #include "Trigram.h" #include "../../FuzzyMatch.h" #include "Token.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringExtras.h" - #include #include #include @@ -25,12 +23,11 @@ namespace clangd { namespace dex { -// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would -// be generating unigrams and bigrams here, too. This would prevent symbol index -// from applying fuzzy matching on a tremendous number of symbols and allow -// supplementary retrieval for short queries. -// -// Short names (total segment length <3 characters) are currently ignored. +/// This is used to mark unigrams and bigrams and distinct them from complete +/// trigrams. Since '$' is not present in valid identifier names, it is safe to +/// use it as the special symbol. +static const char END_MARKER = '$'; + std::vector generateIdentifierTrigrams(llvm::StringRef Identifier) { // Apply fuzzy matching text segmentation. std::vector Roles(Identifier.size()); @@ -50,17 +47,45 @@ // not available then 0 is stored. std::vector> Next(LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0, NextNextHead = 0; + // Store two first HEAD characters in the identifier (if present). + std::deque TwoHeads; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead, NextNextHead}}; NextTail = Roles[I] == Tail ? I : 0; if (Roles[I] == Head) { NextNextHead = NextHead; NextHead = I; + TwoHeads.push_front(LowercaseIdentifier[I]); + if (TwoHeads.size() > 2) + TwoHeads.pop_back(); } } DenseSet UniqueTrigrams; - std::array Chars; + + auto add = [&](std::string Chars) { + UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); + }; + + // FIXME(kbobyrev): Instead of producing empty trigram for each identifier, + // just use True Iterator on the query side when the query string is empty. + add({{END_MARKER, END_MARKER, END_MARKER}}); + + if (TwoHeads.size() == 2) + add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}}); + + if (!LowercaseIdentifier.empty()) + add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}}); + + if (LowercaseIdentifier.size() >= 2) + add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}}); + + if (LowercaseIdentifier.size() >= 3) + add({{LowercaseIdentifier[0], LowercaseIdentifier[1], + LowercaseIdentifier[2]}}); + + // Iterate through valid seqneces of three characters Fuzzy Matcher can + // process. for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) @@ -71,13 +96,8 @@ for (const unsigned K : Next[J]) { if (!K) continue; - Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], - LowercaseIdentifier[K], 0}}; - auto Trigram = Token(Token::Kind::Trigram, Chars.data()); - // Push unique trigrams to the result. - if (!UniqueTrigrams.count(Trigram)) { - UniqueTrigrams.insert(Trigram); - } + add({{LowercaseIdentifier[I], LowercaseIdentifier[J], + LowercaseIdentifier[K]}}); } } } @@ -89,33 +109,43 @@ return Result; } -// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short -// inputs (total segment length <3 characters). std::vector generateQueryTrigrams(llvm::StringRef Query) { // Apply fuzzy matching text segmentation. std::vector Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + // Additional pass is necessary to count valid identifier characters. + // Depending on that, this function might return incomplete trigram. + unsigned ValidSymbolsCount = 0; + for (size_t I = 0; I < Roles.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + ++ValidSymbolsCount; + std::string LowercaseQuery = Query.lower(); DenseSet UniqueTrigrams; - std::deque Chars; - for (size_t I = 0; I < LowercaseQuery.size(); ++I) { - // If current symbol is delimiter, just skip it. - if (Roles[I] != Head && Roles[I] != Tail) - continue; + // If the number of symbols which can form fuzzy matching trigram is not + // sufficient, generate a single incomplete trigram for query. + if (ValidSymbolsCount < 3) { + std::string Chars = LowercaseQuery.substr(0, std::min(3UL, Query.size())); + Chars.append(3 - Chars.size(), END_MARKER); + UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); + } else { + std::deque Chars; + for (size_t I = 0; I < LowercaseQuery.size(); ++I) { + // If current symbol is delimiter, just skip it. + if (Roles[I] != Head && Roles[I] != Tail) + continue; + + Chars.push_back(LowercaseQuery[I]); - Chars.push_back(LowercaseQuery[I]); + if (Chars.size() > 3) + Chars.pop_front(); - if (Chars.size() > 3) - Chars.pop_front(); - if (Chars.size() == 3) { - auto Trigram = - Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); - // Push unique trigrams to the result. - if (!UniqueTrigrams.count(Trigram)) { - UniqueTrigrams.insert(Trigram); + if (Chars.size() == 3) { + UniqueTrigrams.insert( + Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)))); } } } Index: clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp @@ -271,45 +271,57 @@ } TEST(DexIndexTrigrams, IdentifierTrigrams) { - EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateIdentifierTrigrams("X86"), + trigramsAre({"x86", "x$$", "x8$", "$$$"})); - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateIdentifierTrigrams("nl"), + trigramsAre({"nl$", "n$$", "$$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$", "$$$"})); EXPECT_THAT(generateIdentifierTrigrams("clangd"), - trigramsAre({"cla", "lan", "ang", "ngd"})); + trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd", "$$$"})); EXPECT_THAT(generateIdentifierTrigrams("abc_def"), - trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"})); - - EXPECT_THAT( - generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde", + "def", "ab$", "ad$", "$$$"})); - EXPECT_THAT( - generateIdentifierTrigrams("unique_ptr"), - trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", - "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); + EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), + trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace", + "bcd", "bce", "bde", "cde", "ab$", "$$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt", + "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep", + "ept", "ptr", "un$", "up$", "$$$"})); EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), - trigramsAre({"tud", "tde", "ude", "dec", "ecl"})); + trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$", + "td$", "$$$"})); EXPECT_THAT(generateIdentifierTrigrams("IsOK"), - trigramsAre({"iso", "iok", "sok"})); + trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$", "$$$"})); - EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"), - trigramsAre({ - "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd", - "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk", - "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg", - "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", - "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", - })); + EXPECT_THAT( + generateIdentifierTrigrams("abc_defGhij__klm"), + trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh", + "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", + "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", + "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl", + "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik", + "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$", "$$$"})); } TEST(DexIndexTrigrams, QueryTrigrams) { - EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); + EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); + EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); + + EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"})); - EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); EXPECT_THAT(generateQueryTrigrams("clangd"), trigramsAre({"cla", "lan", "ang", "ngd"}));