Index: clang-tools-extra/clangd/index/dex/Trigram.h =================================================================== --- clang-tools-extra/clangd/index/dex/Trigram.h +++ clang-tools-extra/clangd/index/dex/Trigram.h @@ -31,6 +31,11 @@ namespace clangd { namespace dex { +/// This is used to mark unigrams and bigrams and distinct them from complete +/// trigrams. Since '$' is not present in valid identifier names, it is safe to +/// use it as the special symbol. +const auto END_SYMBOL = '$'; + /// Returns list of unique fuzzy-search trigrams from unqualified symbol. /// /// First, given Identifier (unqualified symbol name) is segmented using Index: clang-tools-extra/clangd/index/dex/Trigram.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/Trigram.cpp +++ clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -10,11 +10,9 @@ #include "Trigram.h" #include "../../FuzzyMatch.h" #include "Token.h" - #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/StringExtras.h" - #include #include #include @@ -59,15 +57,31 @@ } } + // Iterate through valid seqneces of three characters Fuzzy Matcher can + // process. DenseSet UniqueTrigrams; std::array Chars; for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; + + Chars = {{LowercaseIdentifier[I], END_SYMBOL, END_SYMBOL, 0}}; + const auto Unigram = Token(Token::Kind::Trigram, Chars.data()); + if (!UniqueTrigrams.count(Unigram)) { + UniqueTrigrams.insert(Unigram); + } + for (const unsigned J : Next[I]) { if (!J) continue; + + Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], END_SYMBOL, 0}}; + const auto Bigram = Token(Token::Kind::Trigram, Chars.data()); + if (!UniqueTrigrams.count(Bigram)) { + UniqueTrigrams.insert(Bigram); + } + for (const unsigned K : Next[J]) { if (!K) continue; @@ -99,7 +113,7 @@ std::string LowercaseQuery = Query.lower(); DenseSet UniqueTrigrams; - std::deque Chars; + std::deque Chars = {END_SYMBOL, END_SYMBOL, END_SYMBOL}; for (size_t I = 0; I < LowercaseQuery.size(); ++I) { // If current symbol is delimiter, just skip it. @@ -110,13 +124,17 @@ if (Chars.size() > 3) Chars.pop_front(); - if (Chars.size() == 3) { - auto Trigram = - Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); - // Push unique trigrams to the result. - if (!UniqueTrigrams.count(Trigram)) { - UniqueTrigrams.insert(Trigram); - } + + auto Trigram = + Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + + if (Chars.front() == END_SYMBOL) + Trigram = Token(Token::Kind::Trigram, + std::string(Chars.rbegin(), Chars.rend())); + + // Push unique trigrams to the result. + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); } } Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -250,30 +250,44 @@ } TEST(DexIndexTrigrams, IdentifierTrigrams) { - EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"})); + EXPECT_THAT(generateIdentifierTrigrams("X86"), + trigramsAre({"x86", "x$$", "8$$", "6$$", "x8$", "86$"})); - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({})); + EXPECT_THAT(generateIdentifierTrigrams("nl"), + trigramsAre({"nl$", "n$$", "l$$"})); - EXPECT_THAT(generateIdentifierTrigrams("clangd"), - trigramsAre({"cla", "lan", "ang", "ngd"})); - - EXPECT_THAT(generateIdentifierTrigrams("abc_def"), - trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"})); + EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"})); EXPECT_THAT( - generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + generateIdentifierTrigrams("clangd"), + trigramsAre({"cla", "lan", "ang", "ngd", "an$", "n$$", "g$$", "cl$", + "ng$", "d$$", "l$$", "a$$", "c$$", "gd$", "la$"})); - EXPECT_THAT( - generateIdentifierTrigrams("unique_ptr"), - trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", - "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); + EXPECT_THAT(generateIdentifierTrigrams("abc_def"), + trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def", + "a$$", "b$$", "c$$", "d$$", "e$$", "f$$", "ab$", + "ad$", "bc$", "bd$", "cd$", "de$", "ef$"})); + + EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), + trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", + "cde", "a$$", "ab$", "ac$", "b$$", "bc$", "bd$", + "c$$", "cd$", "ce$", "d$$", "de$", "e$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", + "iqp", "ipt", "que", "qup", "qpt", "uep", "ept", + "ptr", "u$$", "un$", "up$", "n$$", "ni$", "np$", + "i$$", "iq$", "ip$", "q$$", "qu$", "qp$", "ue$", + "e$$", "ep$", "p$$", "pt$", "t$$", "tr$", "r$$"})); EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), - trigramsAre({"tud", "tde", "ude", "dec", "ecl"})); + trigramsAre({"tud", "tde", "ude", "dec", "ecl", "t$$", "tu$", + "td$", "u$$", "ud$", "d$$", "de$", "e$$", "ec$", + "c$$", "cl$", "l$$"})); EXPECT_THAT(generateIdentifierTrigrams("IsOK"), - trigramsAre({"iso", "iok", "sok"})); + trigramsAre({"iso", "iok", "sok", "i$$", "is$", "io$", "s$$", + "so$", "o$$", "ok$", "k$$"})); EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"), trigramsAre({