Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -34,6 +34,7 @@ TUScheduler.cpp URI.cpp XRefs.cpp + index/CanonicalIncludes.cpp index/FileIndex.cpp index/Index.cpp @@ -42,6 +43,9 @@ index/SymbolCollector.cpp index/SymbolYAML.cpp + index/dex/Token.cpp + index/dex/Trigram.cpp + LINK_LIBS clangAST clangASTMatchers Index: clang-tools-extra/clangd/index/dex/Token.h =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Token.h @@ -0,0 +1,115 @@ +//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Tokens are keys for inverted index which are mapped to the +// corresponding posting lists. Token objects represent a characteristic +// of a symbol, which can be used to perform efficient search. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H + +#include "llvm/ADT/DenseMap.h" + +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +/// Hashable Token, which represents a search token primitive, such as +/// trigram for fuzzy search on unqualified symbol names. +/// +/// Tokens can be used to perform more sophisticated search queries by +/// constructing complex iterator trees. +class Token { +public: + /// Kind specifies Token type which defines semantics for the internal + /// representation (Data field), examples of such types are: + /// + /// * Trigram for fuzzy search on unqualified symbol names. + /// * Scope primitives, e.g. "symbol belongs to namespace foo::bar". + /// * If the symbol represents a variable, token can be its type such as int, + /// clang::Decl, … + /// * For a symbol representing a function, this can be the + /// return type. + /// + /// Each Kind has different representation (i.e. Data field contents): + /// + /// * Trigram: 3 bytes containing trigram characters + /// * Scope: full scope name, such as "foo::bar::baz::" or "" (global scope) + /// * Path: full or relative path to the directory + /// * Type: full type name or the USR associated with this type + /// + /// More Kinds can be added in the future. + enum class Kind { + Trigram, + Scope, + }; + + Token(llvm::StringRef Data, Kind TokenKind); + + // Returns precomputed hash. + size_t hash(const Token &T) const { return Hash; } + + bool operator==(const Token &Other) const { + return Hash == Other.Hash && TokenKind == Other.TokenKind && + Data == Other.Data; + } + + llvm::StringRef data() const { return Data; } + + const Kind &kind() const { return TokenKind; } + +private: + friend llvm::hash_code hash_value(const Token &Token) { return Token.Hash; } + + /// Representation which is unique among Token with the same Kind. + // FIXME(kbobyrev): Put this into another structure + std::string Data; + /// Precomputed hash which is used as a key for inverted index. + size_t Hash; + Kind TokenKind; +}; + +} // namespace dex +} // namespace clangd +} // namespace clang + +namespace llvm { + +// Support Tokens as DenseMap keys. +template <> struct DenseMapInfo { + static inline clang::clangd::dex::Token getEmptyKey() { + static clang::clangd::dex::Token EmptyKey( + "EMPTYKEY", clang::clangd::dex::Token::Kind::Scope); + return EmptyKey; + } + + static inline clang::clangd::dex::Token getTombstoneKey() { + static clang::clangd::dex::Token TombstoneKey( + "TOMBSTONE_KEY", clang::clangd::dex::Token::Kind::Scope); + return TombstoneKey; + } + + static unsigned getHashValue(const clang::clangd::dex::Token &Tag) { + return hash_value(Tag); + } + + static bool isEqual(const clang::clangd::dex::Token &LHS, + const clang::clangd::dex::Token &RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif Index: clang-tools-extra/clangd/index/dex/Token.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Token.cpp @@ -0,0 +1,37 @@ +//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Token.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Twine.h" + +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +Token::Token(llvm::StringRef Data, Kind TokenKind) + : Data(Data), TokenKind(TokenKind) { + assert(TokenKind != Kind::Trigram || + Data.size() == 3 && "Trigram should contain three characters."); + switch (TokenKind) { + case Kind::Trigram: + Hash = ((Data[0] << 16) & (Data[1] << 8) & Data[2]); + break; + default: + Hash = std::hash{}(Data); + break; + } +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/clangd/index/dex/Trigram.h =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Trigram.h @@ -0,0 +1,101 @@ +//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// T +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H + +#include "Token.h" + +#include + +namespace clang { +namespace clangd { +namespace dex { + +/// Splits unqualified symbol name into segments and casts letters to lowercase +/// for trigram generation. +/// +/// First stage of trigram generation algorithm. Given an unqualified symbol +/// name, this outputs a sequence of string segments using the following rules: +/// +/// * '_' is a separator. Multiple consecutive underscores are treated as a +/// single separator. Underscores at the beginning and the end of the symbol +/// name are skipped. +/// +/// Examples: "unique_ptr" -> ["unique", "ptr"], +/// "__builtin_popcount" -> ["builtin", "popcount"] +/// "snake____case___" -> ["snake", "case"] +/// +/// * Lowercase letter followed by an uppercase letter is a separator. +/// +/// Examples: "kItemsCount" -> ["k", "Items", "Count"] +/// +/// * Sequences of consecutive uppercase letters followed by a lowercase letter: +/// the last uppercase letter is treated as the beginning of a next segment. +/// +/// Examples: "TUDecl" -> ["TU", "Decl"] +/// "kDaysInAWeek" -> ["k", "Days", "In", "A", "Week"] +/// +/// Note: digits are treated as lowercase letters. Example: "X86" -> ["X86"] +/// +/// FIXME(kbobyrev): Use the same segmentation algorithm as in fuzzy matching. +/// FIXME(kbobyrev): Return StringRefs or Offsets. +std::vector +segmentIdentifier(llvm::StringRef SymbolName); + +/// Returns list of unique fuzzy-search trigrams from unqualified symbol. +/// +/// Runs trigram generation for fuzzy-search index on segments produced by +/// segmentIdentifier(SymbolName); +/// +/// +/// The motivation for trigram generation algorithm is that extracted trigrams +/// are 3-char suffixes of paths through the fuzzy matching automaton. There are +/// four classes of extracted trigrams: +/// +/// * The simplest one consists of consecutive 3-char sequences of each segment. +/// +/// Example: "trigram" -> ["tri", "rig", "igr", "gra", "ram" +/// +/// * Next class consists of front character of subsequent segments. +/// +/// Example: ["translation", "unit", "decl"] -> ["tud"] +/// +/// Note: skipping segments is allowed, but not more than one. For example, +/// given ["a", "b", "c", "d", "e"] -> "ace" is allowed, but "ade" is not. +/// +/// * Another class of trigrams consists of those with 2 charactersin one +/// segment and the front character of subsequent segment (just as before, +/// skipping up to one segment is allowed). +/// +/// Example: ["ab", "c", "d", "e"] -> ["abc", "abd", "abe"] +/// Note: similarly to the previous case, "abe" would not be allowed. +/// +/// * The last class of trigrams is similar to the previous one: it takes one +/// character from one segment and two front characters from the next or +/// skip-1-next segments. +/// +/// Example: ["a", "bc", "de", "fg"] -> ["abc", "ade"] +/// But not "afg". +/// +/// Note: the returned list of trigrams does not have duplicates, if any +/// trigram belongs to more than one class it is only inserted once. +std::vector +generateTrigrams(const std::vector &Segments); + + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif Index: clang-tools-extra/clangd/index/dex/Trigram.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -0,0 +1,178 @@ +//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// +// +//===----------------------------------------------------------------------===// + +#include "Trigram.h" +#include "Token.h" + +#include "llvm/ADT/DenseSet.h" + +#include +#include + +using namespace llvm; + +namespace clang { +namespace clangd { +namespace dex { + +// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would +// be generating unigrams and bigrams here, too. This would prevent symbol index +// from applying fuzzy matching on a tremendous number of symbols and allow +// supplementary retrieval for short queries. +// Short names (< 3 characters) are currently ignored. +std::vector generateTrigrams(const std::vector &Segments) { + llvm::DenseSet UniqueTrigrams; + std::vector Trigrams; + + // Extract trigrams consisting of first characters of tokens sorted bytoken + // positions. Trigram generator is allowed to skip 1 word between each token. + // + // Example: ["a", "b", "c", "d", "e"] + // + // would produce -> ["abc", "acd", "ace", ...] (among the others) + // + // but not -> ["ade"] because two tokens ("b" and "c") would be skipped in + // this case. + for (auto FirstSegment = Segments.begin(); FirstSegment != Segments.end(); + ++FirstSegment) { + for (auto SecondSegment = FirstSegment + 1; + (SecondSegment <= FirstSegment + 2) && + (SecondSegment != Segments.end()); + ++SecondSegment) { + for (auto ThirdSegment = SecondSegment + 1; + (ThirdSegment <= SecondSegment + 2) && + (ThirdSegment != Segments.end()); + ++ThirdSegment) { + // FIXME(kbobryev): This is wrong. Should be *FirstSegment[0] + ... + Token Trigram((*FirstSegment + *SecondSegment + *ThirdSegment), + Token::Kind::Trigram); + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + Trigrams.push_back(Trigram); + } + } + } + } + + // Iterate through each token with a sliding window and extract trigrams + // consisting of 3 consecutive characters. + // + // Example: "delete" -> ["del", "ele", "let", "ete"] + for (const auto &Segment : Segments) { + // Token should have at least three characters to have trigram substrings. + if (Segment.size() < 3) + continue; + + for (size_t Position = 0; Position + 2 < Segment.size(); ++Position) + Trigrams.push_back( + Token(Segment.substr(Position, 3), Token::Kind::Trigram)); + } + + // This loop generates both trigrams of the third and fourth classes. It + // iterates through each two "subsequent" (consecutive or skip-1-next) tokens + // and extracts trigrams out of each pair. + for (auto FirstSegment = Segments.begin(); FirstSegment != Segments.end(); + ++FirstSegment) { + for (auto SecondSegment = FirstSegment + 1; + (SecondSegment <= FirstSegment + 2) && + (SecondSegment != Segments.end()); + ++SecondSegment) { + for (size_t FirstSegmentIndex = 0; + FirstSegmentIndex < FirstSegment->size(); ++FirstSegmentIndex) { + // Extract trigrams of the third class: one character of the first token + // and two characters from the next or skip-1-next token. + if (FirstSegmentIndex + 1 < FirstSegment->size()) { + Token Trigram((FirstSegment->substr(FirstSegmentIndex, 2) + + SecondSegment->substr(0, 1)), + Token::Kind::Trigram); + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + Trigrams.push_back(Trigram); + } + } + // Extract trigrams of the last class: two character from the first + // token and front character from the next or skip-1-next token. + if (SecondSegment->size() > 1) { + Token Trigram((FirstSegment->substr(FirstSegmentIndex, 1) + + SecondSegment->substr(0, 2)), + Token::Kind::Trigram); + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + Trigrams.push_back(Trigram); + } + } + } + } + } + + return Trigrams; +} + +std::vector segmentIdentifier(StringRef SymbolName) { + std::vector Segments; + size_t SegmentStart = 0; + // Skip underscores at the beginning, e.g. "__builtin_popcount". + while (SymbolName[SegmentStart] == '_') + ++SegmentStart; + + for (size_t Index = SegmentStart; Index + 1 < SymbolName.size(); ++Index) { + const char CurrentSymbol = SymbolName[Index]; + const char NextSymbol = SymbolName[Index + 1]; + // Skip sequences of underscores, e.g. "my__type". + if (CurrentSymbol == '_' && NextSymbol == '_') { + ++SegmentStart; + continue; + } + + // Splits if the next symbol is underscore or if processed characters are + // [lowercase, Uppercase] which indicates beginning of next token. Digits + // are equivalent to lowercase symbols. + if ((NextSymbol == '_') || + ((islower(CurrentSymbol) || isdigit(CurrentSymbol)) && + isupper(NextSymbol))) { + Segments.push_back( + SymbolName.substr(SegmentStart, Index - SegmentStart + 1)); + SegmentStart = Index + 1; + if (NextSymbol == '_') + ++SegmentStart; + } + + // If there were N (> 1) consecutive uppercase letter the split should + // generate two tokens, one of which would consist of N - 1 first uppercase + // letters, the next token begins with the last uppercase letter. + // + // Example: "TUDecl" -> ["TU", "Decl"] + if (isupper(CurrentSymbol) && + (islower(NextSymbol) || (isdigit(NextSymbol)))) { + // Don't perform split if Index points to the beginning of new token, + // otherwise "NamedDecl" would be split into ["N", "amed", "D", "ecl"] + if (Index == SegmentStart) + continue; + Segments.push_back(SymbolName.substr(SegmentStart, Index - SegmentStart)); + SegmentStart = Index; + } + } + + if (SegmentStart < SymbolName.size()) + Segments.push_back(SymbolName.substr(SegmentStart)); + + // Apply lowercase text normalization. + for (auto &Segment : Segments) + std::for_each(Segment.begin(), Segment.end(), ::tolower); + + return Segments; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/unittests/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/unittests/clangd/CMakeLists.txt +++ clang-tools-extra/unittests/clangd/CMakeLists.txt @@ -15,9 +15,10 @@ CodeCompleteTests.cpp CodeCompletionStringsTests.cpp ContextTests.cpp + DexIndexTests.cpp DraftStoreTests.cpp - FileIndexTests.cpp FileDistanceTests.cpp + FileIndexTests.cpp FindSymbolsTests.cpp FuzzyMatchTests.cpp GlobalCompilationDatabaseTests.cpp @@ -27,11 +28,11 @@ SourceCodeTests.cpp SymbolCollectorTests.cpp SyncAPI.cpp + TUSchedulerTests.cpp TestFS.cpp TestTU.cpp ThreadingTests.cpp TraceTests.cpp - TUSchedulerTests.cpp URITests.cpp XRefsTests.cpp ) Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- /dev/null +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -0,0 +1,92 @@ +//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "index/dex/Token.h" +#include "index/dex/Trigram.h" +#include "gtest/gtest.h" + +#include +#include + +using std::string; +using std::vector; + +namespace clang { +namespace clangd { +namespace dex { + +vector getTrigrams(std::initializer_list Trigrams) { + vector Result; + for (const auto &Symbols : Trigrams) { + Result.push_back(Token(Symbols, Token::Kind::Trigram)); + } + return Result; +} + +TEST(DexIndexTokens, TrigramSymbolNameTokenization) { + EXPECT_EQ(segmentIdentifier("unique_ptr"), vector({"unique", "ptr"})); + + EXPECT_EQ(segmentIdentifier("TUDecl"), vector({"TU", "Decl"})); + + EXPECT_EQ(segmentIdentifier("table_name_"), + vector({"table", "name"})); + + EXPECT_EQ(segmentIdentifier("kDaysInAWeek"), + vector({"k", "Days", "In", "A", "Week"})); + + EXPECT_EQ(segmentIdentifier("AlternateUrlTableErrors"), + vector({"Alternate", "Url", "Table", "Errors"})); + + EXPECT_EQ(segmentIdentifier("IsOK"), vector({"Is", "OK"})); + + EXPECT_EQ(segmentIdentifier("ABSL_FALLTHROUGH_INTENDED"), + vector({"ABSL", "FALLTHROUGH", "INTENDED"})); + + EXPECT_EQ(segmentIdentifier("SystemZ"), vector({"System", "Z"})); + + EXPECT_EQ(segmentIdentifier("X86"), vector({"X86"})); + + EXPECT_EQ(segmentIdentifier("ASTNodeKind"), + vector({"AST", "Node", "Kind"})); + + EXPECT_EQ(segmentIdentifier("ObjCDictionaryElement"), + vector({"Obj", "C", "Dictionary", "Element"})); + + EXPECT_EQ(segmentIdentifier("multiple__underscores___everywhere____"), + vector({"multiple", "underscores", "everywhere"})); + + EXPECT_EQ(segmentIdentifier("__cuda_builtin_threadIdx_t"), + vector({"cuda", "builtin", "thread", "Idx", "t"})); + + EXPECT_EQ(segmentIdentifier("longUPPERCASESequence"), + vector({"long", "UPPERCASE", "Sequence"})); +} + +// FIXME(kbobyrev): Add a test for "ab_cd_ef_gh". +TEST(DexIndexTrigrams, TrigramGeneration) { + EXPECT_EQ( + generateTrigrams(segmentIdentifier("a_b_c_d_e_")), + getTrigrams({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + + EXPECT_EQ(generateTrigrams(segmentIdentifier("clangd")), + getTrigrams({"cla", "lan", "ang", "ngd"})); + + EXPECT_EQ(generateTrigrams(segmentIdentifier("abc_def")), + getTrigrams({"abc", "def", "abd", "ade", "bcd", "bde", "cde"})); + + EXPECT_EQ(generateTrigrams(segmentIdentifier("unique_ptr")), + getTrigrams({"uni", "niq", "iqu", "que", "ptr", "unp", "upt", "nip", + "npt", "iqp", "ipt", "qup", "qpt", "uep", "ept"})); + + EXPECT_EQ(generateTrigrams(segmentIdentifier("nl")), getTrigrams({})); +} + +} // namespace dex +} // namespace clangd +} // namespace clang