Index: clangd/index/dex/Trigram.h =================================================================== --- clangd/index/dex/Trigram.h +++ clangd/index/dex/Trigram.h @@ -33,26 +33,18 @@ namespace dex { /// Returns list of unique fuzzy-search trigrams from unqualified symbol. +/// The trigrams give the 3-character query substrings this symbol can match. /// -/// First, given Identifier (unqualified symbol name) is segmented using -/// FuzzyMatch API and lowercased. After segmentation, the following technique -/// is applied for generating trigrams: for each letter or digit in the input -/// string the algorithms looks for the possible next and skip-1-next characters -/// which can be jumped to during fuzzy matching. Each combination of such three -/// characters is inserted into the result. -/// +/// The symbol's name is broken into segments, e.g. "FooBar" has two segments. /// Trigrams can start at any character in the input. Then we can choose to move -/// to the next character, move to the start of the next segment, or skip over a -/// segment. +/// to the next character, move to the start of the next segment, or stop. +/// Short trigrams (length 1-2) are used for short queries. /// -/// This also generates incomplete trigrams for short query scenarios: -/// * Empty trigram: "$$$". -/// * Unigram: the first character of the identifier. -/// * Bigrams: a 2-char prefix of the identifier and a bigram of the first two -/// HEAD characters (if they exist). -// -/// Note: the returned list of trigrams does not have duplicates, if any trigram -/// belongs to more than one class it is only inserted once. +/// For "FooBar" we get the following trigrams: +/// {f, b, fo, fb, foo, fob, fba, oob, oba, bar}. +/// +/// Trigrams are lowercase, as trigram matching is case-insensitive. +/// Trigrams in the returned list are deduplicated. std::vector generateIdentifierTrigrams(llvm::StringRef Identifier); /// Returns list of unique fuzzy-search trigrams given a query. Index: clangd/index/dex/Trigram.cpp =================================================================== --- clangd/index/dex/Trigram.cpp +++ clangd/index/dex/Trigram.cpp @@ -23,11 +23,6 @@ namespace clangd { namespace dex { -/// This is used to mark unigrams and bigrams and distinct them from complete -/// trigrams. Since '$' is not present in valid identifier names, it is safe to -/// use it as the special symbol. -static const char END_MARKER = '$'; - std::vector generateIdentifierTrigrams(llvm::StringRef Identifier) { // Apply fuzzy matching text segmentation. std::vector Roles(Identifier.size()); @@ -47,62 +42,44 @@ // not available then 0 is stored. std::vector> Next(LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0, NextNextHead = 0; - // Store two first HEAD characters in the identifier (if present). - std::deque TwoHeads; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead, NextNextHead}}; NextTail = Roles[I] == Tail ? I : 0; if (Roles[I] == Head) { NextNextHead = NextHead; NextHead = I; - TwoHeads.push_front(LowercaseIdentifier[I]); - if (TwoHeads.size() > 2) - TwoHeads.pop_back(); } } DenseSet UniqueTrigrams; - auto add = [&](std::string Chars) { + auto Add = [&](std::string Chars) { UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); }; - if (TwoHeads.size() == 2) - add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}}); - - if (!LowercaseIdentifier.empty()) - add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}}); - - if (LowercaseIdentifier.size() >= 2) - add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}}); - - if (LowercaseIdentifier.size() >= 3) - add({{LowercaseIdentifier[0], LowercaseIdentifier[1], - LowercaseIdentifier[2]}}); - // Iterate through valid seqneces of three characters Fuzzy Matcher can // process. for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; + if (Roles[I] == Head) // Short query unigram. + Add({LowercaseIdentifier[I]}); for (const unsigned J : Next[I]) { if (J == 0) continue; + if (Roles[I] == Head) // Short query bigram. + Add({LowercaseIdentifier[I], LowercaseIdentifier[J]}); for (const unsigned K : Next[J]) { if (K == 0) continue; - add({{LowercaseIdentifier[I], LowercaseIdentifier[J], + Add({{LowercaseIdentifier[I], LowercaseIdentifier[J], LowercaseIdentifier[K]}}); } } } - std::vector Result; - for (const auto &Trigram : UniqueTrigrams) - Result.push_back(Trigram); - - return Result; + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } std::vector generateQueryTrigrams(llvm::StringRef Query) { @@ -110,48 +87,23 @@ std::vector Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); - // Additional pass is necessary to count valid identifier characters. - // Depending on that, this function might return incomplete trigram. - unsigned ValidSymbolsCount = 0; - for (const auto Role : Roles) - if (Role == Head || Role == Tail) - ++ValidSymbolsCount; - std::string LowercaseQuery = Query.lower(); - DenseSet UniqueTrigrams; - - // If the number of symbols which can form fuzzy matching trigram is not - // sufficient, generate a single incomplete trigram for query. - if (ValidSymbolsCount < 3) { - std::string Chars = - LowercaseQuery.substr(0, std::min(3UL, Query.size())); - Chars.append(3 - Chars.size(), END_MARKER); - UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); - } else { - std::deque Chars; - for (size_t I = 0; I < LowercaseQuery.size(); ++I) { - // If current symbol is delimiter, just skip it. - if (Roles[I] != Head && Roles[I] != Tail) - continue; - - Chars.push_back(LowercaseQuery[I]); - - if (Chars.size() > 3) - Chars.pop_front(); - - if (Chars.size() == 3) { - UniqueTrigrams.insert( - Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars)))); - } - } + std::string Chars; + for (unsigned I = 0; I < Query.size(); ++I) { + if (Roles[I] != Head && Roles[I] != Tail) + continue; // Skip delimiters. + Chars.push_back(LowercaseQuery[I]); + if (Chars.size() > 3) + Chars.erase(Chars.begin()); + if (Chars.size() == 3) + UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); } - std::vector Result; - for (const auto &Trigram : UniqueTrigrams) - Result.push_back(Trigram); - - return Result; + // If it's a short query, emit a single short trigram. + if (Chars.size() < 3 && !Chars.empty()) + return {Token(Token::Kind::Trigram, Chars)}; + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } } // namespace dex Index: unittests/clangd/DexTests.cpp =================================================================== --- unittests/clangd/DexTests.cpp +++ unittests/clangd/DexTests.cpp @@ -333,53 +333,60 @@ TEST(DexTrigrams, IdentifierTrigrams) { EXPECT_THAT(generateIdentifierTrigrams("X86"), - trigramsAre({"x86", "x$$", "x8$"})); + trigramsAre({"x86", "x", "x8"})); - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"})); + EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"})); - EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"})); + EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"})); EXPECT_THAT(generateIdentifierTrigrams("clangd"), - trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"})); + trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"})); EXPECT_THAT(generateIdentifierTrigrams("abc_def"), - trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde", - "def", "ab$", "ad$"})); + trigramsAre({"a", "d", "abc", "abd", "ade", "bcd", "bde", "cde", + "def", "ab", "ad", "de"})); EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace", - "bcd", "bce", "bde", "cde", "ab$"})); + trigramsAre({"a", "b", "c", "d", "e", "ab", "ac", + "bc", "bd", "cd", "ce", "de", "abc", "abd", + "acd", "ace", "bcd", "bce", "bde", "cde"})); EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), - trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt", - "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep", - "ept", "ptr", "un$", "up$"})); + trigramsAre({"u", "p", "uni", "unp", "upt", "niq", "nip", + "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", + "uep", "ept", "ptr", "un", "up", "pt"})); - EXPECT_THAT( - generateIdentifierTrigrams("TUDecl"), - trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$", "td$"})); + EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), + trigramsAre({"t", "d", "tud", "tde", "ude", "dec", "ecl", "tu", + "td", "de"})); EXPECT_THAT(generateIdentifierTrigrams("IsOK"), - trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$"})); + trigramsAre({"i", "o", "iso", "iok", "sok", "is", "io", "ok"})); + auto X = generateIdentifierTrigrams("abc_defGhij__klm"); + llvm::sort(X, + [&](const Token &X, const Token &Y) { return X.Data < Y.Data; }); + for (const auto &E : X) + llvm::errs() << E << "\n"; EXPECT_THAT( generateIdentifierTrigrams("abc_defGhij__klm"), - trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh", - "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", - "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", - "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl", - "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik", - "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"})); + trigramsAre( + {"a", "d", "g", "k", "abc", "abd", "abg", "ade", "adg", "adk", + "agh", "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", + "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", + "efg", "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", + "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", "ab", "ad", + "ag", "de", "dg", "dk", "gh", "gk", "kl"})); } TEST(DexTrigrams, QueryTrigrams) { - EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); - EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); + EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c"})); + EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl"})); EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); - EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"})); - EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"})); - EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"})); + EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({})); EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));