Index: clangd/CMakeLists.txt =================================================================== --- clangd/CMakeLists.txt +++ clangd/CMakeLists.txt @@ -8,6 +8,7 @@ ClangdUnit.cpp ClangdUnitStore.cpp DraftStore.cpp + FuzzyMatch.cpp GlobalCompilationDatabase.cpp JSONRPCDispatcher.cpp JSONExpr.cpp Index: clangd/FuzzyMatch.h =================================================================== --- /dev/null +++ clangd/FuzzyMatch.h @@ -0,0 +1,66 @@ +//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements fuzzy-matching of strings against identifiers. +// It indicates both the existence and quality of a match: +// 'eb' matches both 'emplace_back' and 'embed', the former has a better score. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_FUZZYMATCH_H + +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/raw_ostream.h" + +namespace clang { +namespace clangd { +namespace detail { using ScoreT = int_least16_t; } + +// A matcher capable of matching and scoring strings against a single pattern. +// It's optimized for matching against many strings - match() does not allocate. +class FuzzyMatcher { +public: + FuzzyMatcher(llvm::StringRef Pattern); + + // If Word matches the pattern, return a score in [0,1] (higher is better). + llvm::Optional match(llvm::StringRef Word); + + // Dump internal state from the last match() to the stream, for debugging. + void dumpLast(llvm::raw_ostream &) const; + +private: + constexpr static int MaxPat = 63; + constexpr static int MaxWord = 127; + enum CharRole { Unknown, Tail, Head, Separator }; + + bool init(llvm::StringRef Word); + void buildGraph(); + void calculateRoles(const char *Text, CharRole *Out, int N); + detail::ScoreT skipPenalty(int W); + detail::ScoreT matchBonus(int P, int W); + + int NPat, NWord; + char Pat[MaxPat]; + char LPat[MaxPat]; + char Word[MaxWord]; + char LWord[MaxWord]; + CharRole PatRole[MaxPat]; + CharRole WordRole[MaxWord]; + detail::ScoreT Score[MaxPat + 1][MaxWord + 1]; + bool Matched[MaxPat + 1][MaxWord + 1]; // Oversized for alignment. + float ScoreScale; + bool IsSubstring; +}; + +} // namespace clangd +} // namespace clang + +#endif Index: clangd/FuzzyMatch.cpp =================================================================== --- /dev/null +++ clangd/FuzzyMatch.cpp @@ -0,0 +1,292 @@ +//===--- FuzzyMatch.h - Approximate identifier matching ---------*- C++-*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// To check for a match between a Pattern ('u_p') and a Word ('unique_ptr'), +// we consider the possible partial match states: +// +// u n i q u e _ p t r +// +--------------------- +// |A . . . . . . . . . . +// u| +// |. . . . . . . . . . . +// _| +// |. . . . . . . O . . . +// p| +// |. . . . . . . . . . B +// +// Each dot represents some prefix of the pattern being matched against some +// prefix of the word. +// - A is the initial state: '' matched against '' +// - O is an intermediate state: 'u_' matched against 'unique_' +// - B is the target state: 'u_p' matched against 'unique_ptr' +// +// We aim to find the best path from A->B. +// - Moving right (consuming a word character) +// Always legal: not all word characters must match. +// - Moving diagonally (consuming both a word and pattern character) +// Legal if the characters match. +// - Moving down (consuming a pattern character) is never legal. +// Never legal: all pattern characters must match something. +// +// The scoring is based on heuristics: +// - when matching a character, apply a bonus or penalty depending on the +// match quality (does case match, do word segments align, etc) +// - when skipping a character, apply a penalty if it starts a word segment +// - if the first pattern character matches the middle of a word segment, then +// apply such a huge penalty that this match is never used. +// +// We treat strings as byte-sequences, so only ASCII has first-class support. +// +// This algorithm is inspired by VS code's client-side filtering. +// +//===----------------------------------------------------------------------===// + +#include "FuzzyMatch.h" +#include "llvm/ADT/Optional.h" +#include "llvm/Support/Format.h" + +using namespace llvm; +using namespace clang::clangd; +using clang::clangd::detail::ScoreT; + +const int FuzzyMatcher::MaxPat; +const int FuzzyMatcher::MaxWord; + +static char lower(char C) { return C >= 'A' && C <= 'Z' ? C + ('a' - 'A') : C; } +// A "negative infinity" score that won't overflow. +// We use this to mark unreachable states and forbidden solutions. +static constexpr ScoreT AwfulScore = std::numeric_limits::min() / 2; +static bool isAwful(ScoreT S) { return S < AwfulScore / 2; } + +FuzzyMatcher::FuzzyMatcher(StringRef Pattern) + : NPat(std::min(MaxPat, Pattern.size())), NWord(0), + ScoreScale(0.5f / NPat) { + memcpy(Pat, Pattern.data(), NPat); + for (int I = 0; I < NPat; ++I) + LPat[I] = lower(Pat[I]); + Score[0][0] = 0; + for (int P = 0; P <= NPat; ++P) + for (int W = 0; W < P; ++W) + Score[P][W] = AwfulScore; + calculateRoles(Pat, PatRole, NPat); +} + +Optional FuzzyMatcher::match(StringRef Word) { + if (!NPat) + return 1; + if (!(IsSubstring = init(Word))) + return None; + buildGraph(); + if (isAwful(Score[NPat][NWord])) + return None; + return ScoreScale * std::min(2 * NPat, std::max(0, Score[NPat][NWord])); +} + +// Segmentation of words and patterns +// We mark each character as the Head or Tail of a segment, or a Separator. +// e.g. XMLHttpRequest_Async +// +--+---+------ +---- +// +// This happens in two stages: +// 1. We classify chars into Types: +// Upper is uppercase letters +// Lower is lowercase letters, and also numbers +// Punctuation is most other things +// 2. A characters segment role can be determined by the Types of itself and +// its neighbors. e.g. the middle entry in (Upper, Upper, Lower) is a Head. +// The Empty type is used to represent the start/end of string. +// +// Both of these stages are performed using lookup tables. +// Type and Role can each be represented into 2 bits, so we pack 4 into a byte. +namespace { +enum CharType { Empty, Lower, Upper, Punctuation }; +// 4 packed CharTypes per entry, indexed by char. +constexpr uint8_t CharTypes[] = { + 0x00, 0x00, 0x00, 0x00, // Control characters + 0x00, 0x00, 0x00, 0x00, // Control characters + 0xff, 0xff, 0xff, 0xff, // Punctuation + 0x55, 0x55, 0xf5, 0xff, // Numbers->Lower, more Punctuation. + 0xab, 0xaa, 0xaa, 0xaa, // @ and A-O + 0xaa, 0xaa, 0xea, 0xff, // P-Z, more Punctuation. + 0x57, 0x55, 0x55, 0x55, // ` and a-o + 0x55, 0x55, 0xd5, 0x3f, // p-z, Punctuation, DEL. + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // Bytes over 127 -> Lower. + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, // This includes UTF-8. + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, + 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, 0x55, +}; +// 4 packed CharRoles per entry, indexed by a packed triple of CharTypes: +// (previous, current, next). +constexpr uint8_t CharRoles[] = { + 0x00, 0xaa, 0xaa, 0xff, // At start of word, Lower|Upper -> Head + 0x00, 0x55, 0xaa, 0xff, // After Lower, Upper->Head, Lower->Tail + 0x00, 0x55, 0x59, 0xff, // After Upper, Upper->Tail, Lower->Tail + // Exception: [Upper] Upper [Lower]->Head + 0x00, 0xaa, 0xaa, 0xff, // After Separator, Lower|Upper -> Head +}; +template T PackedLookup(const uint8_t *Data, int I) { + return static_cast((Data[I >> 2] >> ((I & 3) * 2)) & 3); +} +} // namespace +void FuzzyMatcher::calculateRoles(const char *Text, CharRole *Out, int N) { + int Types = PackedLookup(CharTypes, Text[0]); + auto Rotate = [&](CharType T) { Types = ((Types << 2) | T) & 0x3f; }; + for (int I = 0; I < N - 1; ++I) { + Rotate(PackedLookup(CharTypes, Text[I + 1])); + *Out++ = PackedLookup(CharRoles, Types); + } + Rotate(Empty); + *Out++ = PackedLookup(CharRoles, Types); +} + +// Sets up the data structures matching Word. +// Returns false if we can cheaply determine that no match is possible. +bool FuzzyMatcher::init(StringRef NewWord) { + NWord = std::min(MaxWord, NewWord.size()); + if (NPat > NWord) + return false; + memcpy(Word, NewWord.data(), NWord); + for (int I = 0; I < NWord; ++I) + LWord[I] = lower(Word[I]); + + // Cheap subsequence check. + for (int W = 0, P = 0; P != NPat; ++W) { + if (W == NWord) + return false; + if (LWord[W] == LPat[P]) + ++P; + } + + calculateRoles(Word, WordRole, NWord); + return true; +} + +// The forwards pass finds the mappings of Pattern onto Word. +// Score = best score achieved matching Word[..W] against Pat[..P]. +// Unlike other tables, indices range from 0 to N *inclusive* +// Matched = whether we chose to match Word[W] with Pat[P] or not. +// +// Points are mostly assigned to matched characters, with 1 being a good score +// and 2 being a great one. So we treat the score range as [0, 2 * NPat]. +// This range is not strict: we can apply larger bonuses/penalties, or penalize +// non-matched characters. +void FuzzyMatcher::buildGraph() { + for (int W = 0; W < NWord; ++W) + Score[0][W + 1] = Score[0][W] - skipPenalty(W); + for (int P = 0; P < NPat; ++P) { + for (int W = P; W < NWord; ++W) { + ScoreT Left = Score[P + 1][W]; // Score if not matching char W. + if (P < NPat - 1) // Skipping trailing characters is free. + Left -= skipPenalty(W); + if (LPat[P] == LWord[W]) { // Is a match possible? + ScoreT Diag = Score[P][W] + matchBonus(P, W); // Score if matching. + if (Diag >= Left) { // Is matching better than skipping? + Left = Score[P + 1][W + 1] = Diag; + Matched[P][W] = true; + continue; + } + } + Score[P + 1][W + 1] = Left; + Matched[P][W] = false; + } + } +} + +ScoreT FuzzyMatcher::skipPenalty(int W) { + if (WordRole[W] == Head) // Skipping a segment. + return 2; + return 0; +} + +ScoreT FuzzyMatcher::matchBonus(int P, int W) { + // Forbidden: matching the first pattern character in the middle of a segment. + if (!P && WordRole[W] == Tail) + return AwfulScore; + ScoreT S = 0; + // Bonus: pattern is part of a word prefix. + if (P == W) + ++S; + // Bonus: case matches, or an asserted word break matches an actual. + if (Pat[P] == Word[W] || (PatRole[P] == Head && WordRole[W] == Head)) + ++S; + // Penalty: matching inside a word where the previous didn't match. + if (WordRole[W] == Tail && P && !Matched[P - 1][W - 1]) + S -= 3; + // Penalty: asserted word break but no actual. + if (PatRole[P] == Head && WordRole[W] == Tail) + --S; + return S; +} + +void FuzzyMatcher::dumpLast(llvm::raw_ostream &OS) const { + OS << "=== Match \"" << StringRef(Word, NWord) << "\" against [" + << StringRef(Pat, NPat) << "] ===\n"; + if (!IsSubstring) { + OS << "Substring check failed.\n"; + return; + } + + // Traverse Matched table backwards to reconstruct the Pattern/Word mapping. + // The Score table has cumulative scores, subtracting along this path gives + // us the per-letter scores. + ScoreT S[MaxWord]; + bool M[MaxWord] = {}; + for (int W = NWord - 1, P = NPat - 1; W >= 0; --W) { + if (P >= 0 && Matched[P][W]) { + S[W] = Score[P + 1][W + 1] - Score[P][W]; + --P; + M[W] = true; + } else + S[W] = Score[P + 1][W + 1] - Score[P + 1][W]; + } + for (char C : StringRef(Word, NWord)) + OS << " " << C << " "; + OS << "\n"; + for (int I = 0, J = 0; I < NWord; I++) + OS << " " << (M[I] ? Pat[J++] : ' ') << " "; + OS << "\n"; + for (int I = 0; I < NWord; I++) + OS << format("%2d ", S[I]); + OS << "\n"; + + OS << "\nSegmentation:"; + OS << "\n'" << StringRef(Word, NWord) << "'\n "; + for (int I = 0; I < NWord; ++I) + OS << "?-+ "[static_cast(WordRole[I])]; + OS << "\n[" << StringRef(Pat, NPat) << "]\n "; + for (int I = 0; I < NPat; ++I) + OS << "?-+ "[static_cast(PatRole[I])]; + OS << "\n"; + + OS << "\nScoring table:\n"; + OS << " | "; + for (char C : StringRef(Word, NWord)) + OS << " " << C << " "; + OS << "\n"; + OS << "-+----" << std::string(NWord * 4, '-') << "\n"; + for (int I = 0; I <= NPat; ++I) { + OS << (I ? Pat[I - 1] : ' ') << "|"; + for (int J = 0; J <= NWord; ++J) + if (isAwful(Score[I][J])) + OS << format("%3d ", Score[I][J]); + else + OS << " "; + OS << "\n"; + } + + OS << "\nMatch graph:\n"; + OS << " |" << StringRef(Word, NWord) << "\n"; + OS << "-+" << std::string(NWord, '-') << "\n"; + for (int I = 0; I < NPat; ++I) { + OS << Pat[I] << "|"; + for (int J = 0; J < NWord; ++J) + OS << " 1"[static_cast(Matched[I][J])]; + OS << "\n"; + } +} Index: unittests/clangd/CMakeLists.txt =================================================================== --- unittests/clangd/CMakeLists.txt +++ unittests/clangd/CMakeLists.txt @@ -10,6 +10,7 @@ add_extra_unittest(ClangdTests ClangdTests.cpp + FuzzyMatchTests.cpp JSONExprTests.cpp TraceTests.cpp ) Index: unittests/clangd/FuzzyMatchTests.cpp =================================================================== --- /dev/null +++ unittests/clangd/FuzzyMatchTests.cpp @@ -0,0 +1,130 @@ +//===-- FuzzyMatchTests.cpp - String fuzzy matcher tests --------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "FuzzyMatch.h" + +#include "llvm/ADT/StringExtras.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +namespace clang { +namespace clangd { +namespace { +using namespace llvm; +using testing::Not; + +struct MatchesMatcher : public testing::MatcherInterface { + StringRef Candidate; + MatchesMatcher(StringRef Candidate) : Candidate(Candidate) {} + + void DescribeTo(::std::ostream *OS) const override { + *OS << "Matches '" << Candidate.str() << "'"; + } + + bool MatchAndExplain(StringRef Pattern, + testing::MatchResultListener *L) const override { + std::unique_ptr OS( + L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream())) + : new raw_null_ostream()); + FuzzyMatcher Matcher(Pattern); + auto Result = Matcher.match(Candidate); + Matcher.dumpLast(*OS << "\n"); + return !!Result; + } +}; + +// Accepts patterns that match a given word. +// Dumps the debug tables on match failure. +testing::Matcher matches(StringRef Word) { + return testing::MakeMatcher(new MatchesMatcher(Word)); +} + +TEST(FuzzyMatch, Matches) { + EXPECT_THAT("u_p", matches("unique_ptr")); + EXPECT_THAT("up", matches("unique_ptr")); + EXPECT_THAT("uq", matches("unique_ptr")); + EXPECT_THAT("qp", Not(matches("unique_ptr"))); + EXPECT_THAT("log", Not(matches("SVGFEMorphologyElement"))); +} + +struct RankMatcher : public testing::MatcherInterface { + std::vector RankedStrings; + RankMatcher(std::initializer_list RankedStrings) + : RankedStrings(RankedStrings) {} + + void DescribeTo(::std::ostream *OS) const override { + *OS << "Ranks strings in order: [" + << join(RankedStrings.begin(), RankedStrings.end(), ", ") << "]"; + } + + bool MatchAndExplain(StringRef Pattern, + testing::MatchResultListener *L) const override { + std::unique_ptr OS( + L->stream() ? (raw_ostream *)(new raw_os_ostream(*L->stream())) + : new raw_null_ostream()); + FuzzyMatcher Matcher(Pattern); + StringRef LastString; + Optional LastScore; + bool Ok = true; + for (StringRef Str : RankedStrings) { + auto Score = Matcher.match(Str); + if (!Score) { + *OS << "\nDoesn't match '" << Str.str() << "'"; + Matcher.dumpLast(*OS << "\n"); + Ok = false; + } else if (LastScore && *LastScore < *Score) { + *OS << "\nRanks '" << Str.str() << "'=" << *Score << " above '" + << LastString.str() << "'=" << *LastScore; + Matcher.dumpLast(*OS << "\n"); + Matcher.match(LastString); + Matcher.dumpLast(*OS << "\n"); + Ok = false; + } + LastString = Str; + LastScore = Score; + } + return Ok; + } +}; + +// Accepts patterns that match all the strings and rank them in the given order. +// Dumps the debug tables on match failure. +template testing::Matcher ranks(T... RankedStrings) { + return testing::MakeMatcher(new RankMatcher{RankedStrings...}); +} + +TEST(FuzzyMatch, Ranking) { + EXPECT_THAT("eb", ranks("emplace_back", "embed")); + EXPECT_THAT("cons", ranks("console", "Console", "ArrayBufferConstructor")); + EXPECT_THAT("foo", ranks("foo", "Foo")); + EXPECT_THAT("onMess", ranks("onMessage", "onmessage", "onThisMegaEscapes")); + EXPECT_THAT("CC", ranks("CamelCase", "camelCase")); + EXPECT_THAT("cC", ranks("camelCase", "CamelCase")); + EXPECT_THAT("p", ranks("parse", "posix", "pafdsa", "path", "p")); + EXPECT_THAT("pa", ranks("parse", "path", "pafdsa")); + EXPECT_THAT("log", ranks("log", "ScrollLogicalPosition")); + EXPECT_THAT("e", ranks("else", "AbstractElement")); + EXPECT_THAT("workbench.sideb", + ranks("workbench.sideBar.location", + "workbench.editor.defaultSideBySideLayout")); + EXPECT_THAT("editor.r", ranks("editor.renderControlCharacter", + "editor.overviewRulerlanes", + "diffEditor.renderSideBySide")); + EXPECT_THAT("-mo", ranks("-moz-columns", "-ms-ime-mode")); + EXPECT_THAT("convertModelPosition", + ranks("convertModelPositionToViewPosition", + "convertViewToModelPosition")); + EXPECT_THAT("is", ranks("isValidViewletId", "import statement")); + EXPECT_THAT("title", ranks("window.title", "files.trimTrailingWhitespace")); + EXPECT_THAT("strcpy", ranks("strcpy", "strcpy_s", "strncpy")); +} + +} // namespace +} // namespace clangd +} // namespace clang