diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -12,8 +12,10 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringExtras.h" #include +#include #include #include @@ -25,7 +27,7 @@ template static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Apply fuzzy matching text segmentation. - std::vector Roles(Identifier.size()); + llvm::SmallVector Roles(Identifier.size()); calculateRoles(Identifier, llvm::makeMutableArrayRef(Roles.data(), Identifier.size())); @@ -39,7 +41,7 @@ // // Next stores tuples of three indices in the presented order, if a variant is // not available then 0 is stored. - std::vector> Next(LowercaseIdentifier.size()); + llvm::SmallVector> Next(LowercaseIdentifier.size()); unsigned NextTail = 0, NextHead = 0; for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { Next[I] = {{NextTail, NextHead}}; @@ -66,16 +68,50 @@ } } } - // Emit short-query trigrams: FooBar -> f, fo, fb. - if (!LowercaseIdentifier.empty()) - Out(Trigram(LowercaseIdentifier[0])); - if (LowercaseIdentifier.size() >= 2) - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1])); - for (size_t I = 1; I < LowercaseIdentifier.size(); ++I) - if (Roles[I] == Head) { - Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I])); + // Short queries semantics are different: emit incomplete trigrams for them. + // + // First, the first separator if it is preceding the alphanumeric symbols. + size_t FirstSeparator = std::numeric_limits::max(); + for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { + if (Roles[I] == Head || Roles[I] == Tail) + break; + if (Roles[I] == Separator) { + FirstSeparator = I; + Out(Trigram(LowercaseIdentifier[I])); break; } + } + // Then, find the first (up to) two heads and emit the following incomplete + // trigrams (if they exist): + // + // - First head + // - Fist head + subsequent tail + // - First separator + first head + // - Second head + // - First head + second head + // - Second head + subsequent tail + llvm::SmallVector Heads; + for (size_t I = 0; I < LowercaseIdentifier.size() && Heads.size() < 2; ++I) + if (Roles[I] == Head) + Heads.push_back(I); + if (!Heads.empty()) { + Out(Trigram(LowercaseIdentifier[Heads[0]])); + if (FirstSeparator != std::numeric_limits::max()) + Out(Trigram(LowercaseIdentifier[FirstSeparator], + LowercaseIdentifier[Heads[0]])); + size_t NextIndex = Heads[0] + 1; + if (NextIndex < LowercaseIdentifier.size() && Roles[NextIndex] == Tail) + Out(Trigram(LowercaseIdentifier[Heads[0]], + LowercaseIdentifier[NextIndex])); + } + if (Heads.size() == 2) { + Out(Trigram(LowercaseIdentifier[Heads[1]])); + Out(Trigram(LowercaseIdentifier[Heads[0]], LowercaseIdentifier[Heads[1]])); + size_t NextIndex = Heads.back() + 1; + if (NextIndex < LowercaseIdentifier.size() && Roles[NextIndex] == Tail) + Out(Trigram(LowercaseIdentifier[Heads[1]], + LowercaseIdentifier[NextIndex])); + } } void generateIdentifierTrigrams(llvm::StringRef Identifier, @@ -101,17 +137,31 @@ std::vector generateQueryTrigrams(llvm::StringRef Query) { if (Query.empty()) return {}; - std::string LowercaseQuery = Query.lower(); - if (Query.size() < 3) // short-query trigrams only - return {Token(Token::Kind::Trigram, LowercaseQuery)}; // Apply fuzzy matching text segmentation. - std::vector Roles(Query.size()); + llvm::SmallVector Roles(Query.size()); calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + std::string LowercaseQuery = Query.lower(); + + unsigned ValidSymbols = + llvm::count_if(Roles, [](CharRole R) { return R == Head || R == Tail; }); + // For queries with very few letters, emulate what generateIdentifierTrigrams + // outputs for the beginning of the Identifier. + if (ValidSymbols < 3) { + // If a bigram can't be formed, we might prepend a separator. + std::string Letters = Roles.front() == Separator && ValidSymbols < 2 + ? std::string(1, Query.front()) + : ""; + for (unsigned I = 0; I < LowercaseQuery.size(); ++I) + if (Roles[I] == Head || Roles[I] == Tail) + Letters += LowercaseQuery[I]; + return {Token(Token::Kind::Trigram, Letters)}; + } + llvm::DenseSet UniqueTrigrams; std::string Chars; - for (unsigned I = 0; I < Query.size(); ++I) { + for (unsigned I = 0; I < LowercaseQuery.size(); ++I) { if (Roles[I] != Head && Roles[I] != Tail) continue; // Skip delimiters. Chars.push_back(LowercaseQuery[I]); diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp --- a/clang-tools-extra/clangd/unittests/DexTests.cpp +++ b/clang-tools-extra/clangd/unittests/DexTests.cpp @@ -386,30 +386,35 @@ trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"})); EXPECT_THAT(identifierTrigramTokens("abc_def"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde", - "cde", "def"})); + trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", "ade", + "bcd", "bde", "cde", "def"})); EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"), - trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"})); + trigramsAre({"a", "b", "ab", "abc", "bcd", "cde"})); EXPECT_THAT(identifierTrigramTokens("unique_ptr"), - trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip", - "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", - "uep", "ept", "ptr"})); + trigramsAre({"u", "p", "un", "up", "pt", "uni", "unp", + "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt", + "que", "qup", "qpt", "uep", "ept", "ptr"})); - EXPECT_THAT( - identifierTrigramTokens("TUDecl"), - trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"})); + EXPECT_THAT(identifierTrigramTokens("TUDecl"), + trigramsAre({"t", "d", "tu", "td", "de", "tud", "tde", "ude", + "dec", "ecl"})); EXPECT_THAT(identifierTrigramTokens("IsOK"), - trigramsAre({"i", "is", "io", "iso", "iok", "sok"})); + trigramsAre({"i", "o", "is", "ok", "io", "iso", "iok", "sok"})); - EXPECT_THAT( - identifierTrigramTokens("abc_defGhij__klm"), - trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd", - "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk", - "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl", - "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"})); + EXPECT_THAT(identifierTrigramTokens("_pb"), + trigramsAre({"_", "_p", "p", "pb"})); + EXPECT_THAT(identifierTrigramTokens("__pb"), + trigramsAre({"_", "_p", "p", "pb"})); + + EXPECT_THAT(identifierTrigramTokens("abc_defGhij__klm"), + trigramsAre({"a", "d", "ab", "ad", "de", "abc", "abd", + "ade", "adg", "bcd", "bde", "bdg", "cde", "cdg", + "def", "deg", "dgh", "dgk", "efg", "egh", "egk", + "fgh", "fgk", "ghi", "ghk", "gkl", "hij", "hik", + "hkl", "ijk", "ikl", "jkl", "klm"})); } TEST(DexTrigrams, QueryTrigrams) { @@ -419,8 +424,16 @@ EXPECT_THAT(generateQueryTrigrams(""), trigramsAre({})); EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"})); - EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"})); - EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"_"})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"_"})); + + EXPECT_THAT(generateQueryTrigrams("m_"), trigramsAre({"m"})); + + EXPECT_THAT(generateQueryTrigrams("p_b"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("_p"), trigramsAre({"_p"})); + EXPECT_THAT(generateQueryTrigrams("_pb_"), trigramsAre({"pb"})); + EXPECT_THAT(generateQueryTrigrams("__pb"), trigramsAre({"pb"})); EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); @@ -525,25 +538,45 @@ } TEST(DexTest, ShortQuery) { - auto I = Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), + auto I = Dex::build(generateSymbols({"_OneTwoFourSix"}), RefSlab(), RelationSlab()); FuzzyFindRequest Req; Req.AnyScope = true; bool Incomplete; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "Empty string is not a short query"; - Req.Query = "t"; + Req.Query = "o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head as unigram"; + + Req.Query = "_o"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using delimiter and first head as bigram"; + + Req.Query = "on"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first head and tail as bigram"; + + Req.Query = "ot"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first two heads as bigram"; + + Req.Query = "tw"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); + EXPECT_TRUE(Incomplete) << "Using first two second head and tail as bigram"; + + Req.Query = "tf"; EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); EXPECT_TRUE(Incomplete) << "Short queries have different semantics"; - Req.Query = "tt"; + Req.Query = "fo"; EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre()); EXPECT_TRUE(Incomplete) << "Short queries have different semantics"; - Req.Query = "ttf"; - EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour")); + Req.Query = "tfs"; + EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("_OneTwoFourSix")); EXPECT_FALSE(Incomplete) << "3-char string is not a short query"; }