Skip to content

Commit

Permalink
[cland] Dex: fix/simplify short-trigram generation
Browse files Browse the repository at this point in the history
Summary:
1) Instead of x$$ for a short-query trigram, just use x
2) Make rules more coherent: prefixes of length 1-2, and first char + next head
3) Fix Dex::fuzzyFind to mark results as incomplete, because
   short-trigram rules only yield a subset of results.

Reviewers: ioeric

Subscribers: ilya-biryukov, jkorous, mgrang, arphaman, kadircet, cfe-commits

Differential Revision: https://reviews.llvm.org/D52808

llvm-svn: 343775
  • Loading branch information
sam-mccall committed Oct 4, 2018
1 parent 87f69ea commit b5bbfef
Showing 4 changed files with 93 additions and 112 deletions.
4 changes: 3 additions & 1 deletion clang-tools-extra/clangd/index/dex/Dex.cpp
Original file line number Diff line number Diff line change
@@ -156,7 +156,9 @@ bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
"There must be no :: in query.");
trace::Span Tracer("Dex fuzzyFind");
FuzzyMatcher Filter(Req.Query);
bool More = false;
// For short queries we use specialized trigrams that don't yield all results.
// Prevent clients from postfiltering them for longer queries.
bool More = !Req.Query.empty() && Req.Query.size() < 3;

std::vector<std::unique_ptr<Iterator>> TopLevelChildren;
const auto TrigramTokens = generateQueryTrigrams(Req.Query);
98 changes: 28 additions & 70 deletions clang-tools-extra/clangd/index/dex/Trigram.cpp
Original file line number Diff line number Diff line change
@@ -23,11 +23,6 @@ namespace clang {
namespace clangd {
namespace dex {

/// This is used to mark unigrams and bigrams and distinct them from complete
/// trigrams. Since '$' is not present in valid identifier names, it is safe to
/// use it as the special symbol.
static const char END_MARKER = '$';

std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Identifier.size());
@@ -47,40 +42,22 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
// not available then 0 is stored.
std::vector<std::array<unsigned, 3>> Next(LowercaseIdentifier.size());
unsigned NextTail = 0, NextHead = 0, NextNextHead = 0;
// Store two first HEAD characters in the identifier (if present).
std::deque<char> TwoHeads;
for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
Next[I] = {{NextTail, NextHead, NextNextHead}};
NextTail = Roles[I] == Tail ? I : 0;
if (Roles[I] == Head) {
NextNextHead = NextHead;
NextHead = I;
TwoHeads.push_front(LowercaseIdentifier[I]);
if (TwoHeads.size() > 2)
TwoHeads.pop_back();
}
}

DenseSet<Token> UniqueTrigrams;

auto add = [&](std::string Chars) {
auto Add = [&](std::string Chars) {
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
};

if (TwoHeads.size() == 2)
add({{TwoHeads.front(), TwoHeads.back(), END_MARKER}});

if (!LowercaseIdentifier.empty())
add({{LowercaseIdentifier.front(), END_MARKER, END_MARKER}});

if (LowercaseIdentifier.size() >= 2)
add({{LowercaseIdentifier[0], LowercaseIdentifier[1], END_MARKER}});

if (LowercaseIdentifier.size() >= 3)
add({{LowercaseIdentifier[0], LowercaseIdentifier[1],
LowercaseIdentifier[2]}});

// Iterate through valid seqneces of three characters Fuzzy Matcher can
// Iterate through valid sequneces of three characters Fuzzy Matcher can
// process.
for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) {
// Skip delimiters.
@@ -92,66 +69,47 @@ std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier) {
for (const unsigned K : Next[J]) {
if (K == 0)
continue;
add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
Add({{LowercaseIdentifier[I], LowercaseIdentifier[J],
LowercaseIdentifier[K]}});
}
}
}
// Emit short-query trigrams: FooBar -> f, fo, fb.
if (!LowercaseIdentifier.empty())
Add({LowercaseIdentifier[0]});
if (LowercaseIdentifier.size() >= 2)
Add({LowercaseIdentifier[0], LowercaseIdentifier[1]});
for (size_t I = 1; I < LowercaseIdentifier.size(); ++I)
if (Roles[I] == Head) {
Add({LowercaseIdentifier[0], LowercaseIdentifier[I]});
break;
}

std::vector<Token> Result;
for (const auto &Trigram : UniqueTrigrams)
Result.push_back(Trigram);

return Result;
return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}

std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
std::string LowercaseQuery = Query.lower();
if (Query.size() < 3) // short-query trigrams only
return {Token(Token::Kind::Trigram, LowercaseQuery)};

// Apply fuzzy matching text segmentation.
std::vector<CharRole> Roles(Query.size());
calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));

// Additional pass is necessary to count valid identifier characters.
// Depending on that, this function might return incomplete trigram.
unsigned ValidSymbolsCount = 0;
for (const auto Role : Roles)
if (Role == Head || Role == Tail)
++ValidSymbolsCount;

std::string LowercaseQuery = Query.lower();

DenseSet<Token> UniqueTrigrams;

// If the number of symbols which can form fuzzy matching trigram is not
// sufficient, generate a single incomplete trigram for query.
if (ValidSymbolsCount < 3) {
std::string Chars =
LowercaseQuery.substr(0, std::min<size_t>(3UL, Query.size()));
Chars.append(3 - Chars.size(), END_MARKER);
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
} else {
std::deque<char> Chars;
for (size_t I = 0; I < LowercaseQuery.size(); ++I) {
// If current symbol is delimiter, just skip it.
if (Roles[I] != Head && Roles[I] != Tail)
continue;

Chars.push_back(LowercaseQuery[I]);

if (Chars.size() > 3)
Chars.pop_front();

if (Chars.size() == 3) {
UniqueTrigrams.insert(
Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))));
}
}
std::string Chars;
for (unsigned I = 0; I < Query.size(); ++I) {
if (Roles[I] != Head && Roles[I] != Tail)
continue; // Skip delimiters.
Chars.push_back(LowercaseQuery[I]);
if (Chars.size() > 3)
Chars.erase(Chars.begin());
if (Chars.size() == 3)
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
}

std::vector<Token> Result;
for (const auto &Trigram : UniqueTrigrams)
Result.push_back(Trigram);

return Result;
return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}

} // namespace dex
29 changes: 12 additions & 17 deletions clang-tools-extra/clangd/index/dex/Trigram.h
Original file line number Diff line number Diff line change
@@ -33,26 +33,21 @@ namespace clangd {
namespace dex {

/// Returns list of unique fuzzy-search trigrams from unqualified symbol.
/// The trigrams give the 3-character query substrings this symbol can match.
///
/// First, given Identifier (unqualified symbol name) is segmented using
/// FuzzyMatch API and lowercased. After segmentation, the following technique
/// is applied for generating trigrams: for each letter or digit in the input
/// string the algorithms looks for the possible next and skip-1-next characters
/// which can be jumped to during fuzzy matching. Each combination of such three
/// characters is inserted into the result.
///
/// The symbol's name is broken into segments, e.g. "FooBar" has two segments.
/// Trigrams can start at any character in the input. Then we can choose to move
/// to the next character, move to the start of the next segment, or skip over a
/// segment.
/// to the next character, move to the start of the next segment, or stop.
///
/// This also generates incomplete trigrams for short query scenarios:
/// * Empty trigram: "$$$".
/// * Unigram: the first character of the identifier.
/// * Bigrams: a 2-char prefix of the identifier and a bigram of the first two
/// HEAD characters (if they exist).
//
/// Note: the returned list of trigrams does not have duplicates, if any trigram
/// belongs to more than one class it is only inserted once.
/// Short trigrams (length 1-2) are used for short queries. These are:
/// - prefixes of the identifier, of length 1 and 2
/// - the first character + next head character
///
/// For "FooBar" we get the following trigrams:
/// {f, fo, fb, foo, fob, fba, oob, oba, bar}.
///
/// Trigrams are lowercase, as trigram matching is case-insensitive.
/// Trigrams in the returned list are deduplicated.
std::vector<Token> generateIdentifierTrigrams(llvm::StringRef Identifier);

/// Returns list of unique fuzzy-search trigrams given a query.
74 changes: 50 additions & 24 deletions clang-tools-extra/unittests/clangd/DexTests.cpp
Original file line number Diff line number Diff line change
@@ -367,53 +367,54 @@ trigramsAre(std::initializer_list<std::string> Trigrams) {

TEST(DexTrigrams, IdentifierTrigrams) {
EXPECT_THAT(generateIdentifierTrigrams("X86"),
trigramsAre({"x86", "x$$", "x8$"}));
trigramsAre({"x86", "x", "x8"}));

EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"}));
EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"}));

EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"}));
EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"}));

EXPECT_THAT(generateIdentifierTrigrams("clangd"),
trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"}));
trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"}));

EXPECT_THAT(generateIdentifierTrigrams("abc_def"),
trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde",
"def", "ab$", "ad$"}));
trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde",
"cde", "def"}));

EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"),
trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace",
"bcd", "bce", "bde", "cde", "ab$"}));
trigramsAre({"a", "a_", "ab", "abc", "abd", "acd", "ace", "bcd",
"bce", "bde", "cde"}));

EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"),
trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt",
"iqu", "iqp", "ipt", "que", "qup", "qpt", "uep",
"ept", "ptr", "un$", "up$"}));
trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip",
"npt", "iqu", "iqp", "ipt", "que", "qup", "qpt",
"uep", "ept", "ptr"}));

EXPECT_THAT(
generateIdentifierTrigrams("TUDecl"),
trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$", "td$"}));
trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"}));

EXPECT_THAT(generateIdentifierTrigrams("IsOK"),
trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$"}));
trigramsAre({"i", "is", "io", "iso", "iok", "sok"}));

auto X = generateIdentifierTrigrams("abc_defGhij__klm");
EXPECT_THAT(
generateIdentifierTrigrams("abc_defGhij__klm"),
trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh",
"agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk",
"cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek",
"dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl",
"fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik",
"hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"}));
trigramsAre({"a", "ab", "ad", "abc", "abd", "abg", "ade", "adg",
"adk", "agh", "agk", "bcd", "bcg", "bde", "bdg", "bdk",
"bgh", "bgk", "cde", "cdg", "cdk", "cgh", "cgk", "def",
"deg", "dek", "dgh", "dgk", "dkl", "efg", "efk", "egh",
"egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", "gkl",
"hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm"}));
}

TEST(DexTrigrams, QueryTrigrams) {
EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"}));
EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"}));
EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c"}));
EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl"}));
EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"}));

EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"}));
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"}));
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"}));
EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_"}));
EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__"}));
EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({}));

EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"}));

@@ -529,6 +530,31 @@ TEST(DexTest, FuzzyMatch) {
UnorderedElementsAre("LaughingOutLoud", "LittleOldLady"));
}

// TODO(sammccall): enable after D52796 bugfix.
#if 0
TEST(DexTest, ShortQuery) {
auto I =
Dex::build(generateSymbols({"OneTwoThreeFour"}), RefSlab(), URISchemes);
FuzzyFindRequest Req;
bool Incomplete;

EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
EXPECT_FALSE(Incomplete) << "Empty string is not a short query";

Req.Query = "t";
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
EXPECT_TRUE(Incomplete) << "Short queries have different semantics";

Req.Query = "tt";
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre());
EXPECT_TRUE(Incomplete) << "Short queries have different semantics";

Req.Query = "ttf";
EXPECT_THAT(match(*I, Req, &Incomplete), ElementsAre("OneTwoThreeFour"));
EXPECT_FALSE(Incomplete) << "3-char string is not a short query";
}
#endif

TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) {
auto I = Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), RefSlab(),
URISchemes);

0 comments on commit b5bbfef

Please sign in to comment.