diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -12,10 +12,14 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include +#include #include #include +#include namespace clang { namespace clangd { @@ -25,21 +29,22 @@ template static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Apply fuzzy matching text segmentation. - std::vector Roles(Identifier.size()); + llvm::SmallVector Roles(Identifier.size()); calculateRoles(Identifier, llvm::makeMutableArrayRef(Roles.data(), Identifier.size())); std::string LowercaseIdentifier = Identifier.lower(); // For each character, store indices of the characters to which fuzzy matching - // algorithm can jump. There are 3 possible variants: + // algorithm can jump. There are 2 possible variants: // // * Next Tail - next character from the same segment // * Next Head - front character of the next segment // // Next stores tuples of three indices in the presented order, if a variant is // not available then 0 is stored. - std::vector> Next(LowercaseIdentifier.size()); + llvm::SmallVector, 12> Next( + LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead}}; @@ -51,14 +56,14 @@ // Iterate through valid sequences of three characters Fuzzy Matcher can // process. - for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { + for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) { // Skip delimiters. if (Roles[I] != Head && Roles[I] != Tail) continue; - for (const unsigned J : Next[I]) { + for (unsigned J : Next[I]) { if (J == 0) continue; - for (const unsigned K : Next[J]) { + for (unsigned K : Next[J]) { if (K == 0) continue; Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J], @@ -66,16 +71,29 @@ } } } - // Emit short-query trigrams: FooBar -> f, fo, fb. - if (!LowercaseIdentifier.empty()) - Out(Trigram(LowercaseIdentifier[0])); - if (LowercaseIdentifier.size() >= 2) - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1])); - for (size_t I = 1; I < LowercaseIdentifier.size(); ++I) - if (Roles[I] == Head) { - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I])); + // Short queries semantics are different. When the user dosn't type enough + // symbols to form trigrams, we still want to serve meaningful results. To + // achieve that, we form incomplete trigrams (bi- and unigrams) for the + // identifiers and also generate short trigrams on the query side from what + // is available. We allow a small number of short trigram types in order to + // prevent excessive memory usage and increase the quality of the search. + // Only the first few symbols are allowed to be used in incomplete trigrams. + // + // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams: + // "_", "_a", "a", "ab", "ad", "d", "de", "dg" + for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) { + // The first symbol might be a separator, so the loop condition should be + // stopping as soon as there is no next head or we have seen two heads. + if (Roles[Position] == Head) + ++HeadsSeen; + Out(Trigram(LowercaseIdentifier[Position])); + for (unsigned I : Next[Position]) + if (I != 0) + Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I])); + Position = Next[Position][1]; + if (Position == 0) break; - } + } } void generateIdentifierTrigrams(llvm::StringRef Identifier, @@ -101,17 +119,16 @@ std::vector generateQueryTrigrams(llvm::StringRef Query) { if (Query.empty()) return {}; - std::string LowercaseQuery = Query.lower(); - if (Query.size() < 3) // short-query trigrams only - return {Token(Token::Kind::Trigram, LowercaseQuery)}; // Apply fuzzy matching text segmentation. - std::vector Roles(Query.size()); + llvm::SmallVector Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + std::string LowercaseQuery = Query.lower(); + llvm::DenseSet UniqueTrigrams; std::string Chars; - for (unsigned I = 0; I < Query.size(); ++I) { + for (unsigned I = 0; I < LowercaseQuery.size(); ++I) { if (Roles[I] != Head && Roles[I] != Tail) continue; // Skip delimiters. Chars.push_back(LowercaseQuery[I]); @@ -121,6 +138,18 @@ UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); } + // For queries with very few letters, generateIdentifierTrigrams emulates + // outputs of this function to match the semantics. + if (UniqueTrigrams.empty()) { + // If bigram can't be formed out of letters/numbers, we prepend separator. + std::string Result(1, LowercaseQuery.front()); + for (unsigned I = 1; I < LowercaseQuery.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + Result += LowercaseQuery[I]; + UniqueTrigrams.insert( + Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2))); + } + return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; } diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp --- a/clang-tools-extra/clangd/unittests/DexTests.cpp +++ b/clang-tools-extra/clangd/unittests/DexTests.cpp @@ -386,30 +386,35 @@ trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"})); EXPECT_THAT(identifierTrigramTokens("abc_def"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde", - "cde", "def"})); + trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade", + "bcd", "bde", "cde", "def"})); EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"), - trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"})); + trigramsAre({"a", "b", "ab", "bc", "abc", "bcd", "cde"})); EXPECT_THAT(identifierTrigramTokens("unique_ptr"), - trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip", - "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", - "uep", "ept", "ptr"})); + trigramsAre({"u", "p", "un", "up", "pt", "uni", "unp", + "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt", + "que", "qup", "qpt", "uep", "ept", "ptr"})); - EXPECT_THAT( - identifierTrigramTokens("TUDecl"), - trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"})); + EXPECT_THAT(identifierTrigramTokens("TUDecl"), + trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude", + "dec", "ecl"})); EXPECT_THAT(identifierTrigramTokens("IsOK"), - trigramsAre({"i", "is", "io", "iso", "iok", "sok"})); + trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"})); - EXPECT_THAT( - identifierTrigramTokens("abc_defGhij__klm"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd", - "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk", - "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl", - "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); + EXPECT_THAT(identifierTrigramTokens("_pb"), + trigramsAre({"_", "_p", "p", "pb"})); + EXPECT_THAT(identifierTrigramTokens("__pb"), + trigramsAre({"_", "_p", "p", "pb"})); + + EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"), + trigramsAre({"a", "d", "ab", "ad", "dg", "de", "abc", + "abd", "ade", "adg", "bcd", "bde", "bdg", "cde", + "cdg", "def", "deg", "dgh", "dgk", "efg", "egh", + "egk", "fgh", "fgk", "ghi", "ghk", "gkl", "hij", + "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); } TEST(DexTrigrams, QueryTrigrams) { @@ -419,8 +424,16 @@ EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({})); EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"})); - EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"})); - EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"})); + + EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"})); + + EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"})); + EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"})); EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); @@ -525,25 +538,45 @@ } TEST(DexTest, ShortQuery) { - auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), + auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(), RelationSlab()); FuzzyFindRequest Req; Req.AnyScope = true; bool Incomplete; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "Empty string is not a short query"; - Req.Query = "t"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); - EXPECT_TRUE(Incomplete) << "Short queries have different semantics"; + Req.Query = "o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head as unigram"; + + Req.Query = "_o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram"; + + Req.Query = "on"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram"; + + Req.Query = "ot"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first two heads as bigram"; + + Req.Query = "tw"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using second head and tail as bigram"; + + Req.Query = "tf"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using second and third heads as bigram"; - Req.Query = "tt"; + Req.Query = "fo"; EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); EXPECT_TRUE(Incomplete) << "Short queries have different semantics"; - Req.Query = "ttf"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + Req.Query = "tfs"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "3-char string is not a short query"; }