Index: clang-tools-extra/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/clangd/CMakeLists.txt
+++ clang-tools-extra/clangd/CMakeLists.txt
@@ -34,6 +34,7 @@
   TUScheduler.cpp
   URI.cpp
   XRefs.cpp
+
   index/CanonicalIncludes.cpp
   index/FileIndex.cpp
   index/Index.cpp
@@ -42,6 +43,8 @@
   index/SymbolCollector.cpp
   index/SymbolYAML.cpp
 
+  index/noctem/SearchAtom.cpp
+
   LINK_LIBS
   clangAST
   clangASTMatchers
Index: clang-tools-extra/clangd/index/noctem/SearchAtom.h
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/noctem/SearchAtom.h
@@ -0,0 +1,198 @@
+//===--- SearchAtom.h- Symbol Search primitive ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// SearchAtoms are keys for inverted index which are mapped to the corresponding
+// posting lists. SearchAtom objects represent a characteristic of a symbol,
+// which can be used to perform efficient search.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_NOCTEM_SEARCHATOM_H
+#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_NOCTEM_SEARCHATOM_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include <cstddef>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace noctem {
+
+/// \brief Hashable SearchAtom, which represents a search token primitive.
+///
+/// The following items are examples of tokens:
+///
+/// * Symbol name for trigram-based search.
+/// * Proximity path primitives, e.g. "symbol is defined in directory
+///   $HOME/dev/llvm or its prefix".
+/// * Scope primitives, e.g. "symbol belongs to namespace foo::bar or its
+///   prefix".
+/// * If the symbol represents a variable, token can be its type such as int,
+///   clang::Decl, ...
+/// * For a symbol representing a function, this can be the return type.
+///
+/// Tokens can be used to perform more sophisticated search queries by
+/// constructing complex iterator trees.
+class SearchAtom {
+public:
+  /// Kind of symbol characteristic the atom describes. Atoms holding the same
+  /// data in different namespaces compare unequal.
+  enum class Namespace : short {
+    Trigram,
+    Scope,
+    Path,
+  };
+
+  /// Creates an empty Trigram atom; equal to (and hashed like) SearchAtom("").
+  SearchAtom() : SearchAtom(llvm::StringRef()) {}
+
+  /// Stores a copy of \p Data and precomputes the hash over both \p Type and
+  /// \p Data, so that the hash covers exactly what operator== compares.
+  SearchAtom(llvm::StringRef Data, Namespace Type = Namespace::Trigram)
+      : Data(Data), Hash(llvm::hash_combine(static_cast<short>(Type), Data)),
+        Type(Type) {}
+
+  /// Hasher interface: returns the hash precomputed for \p T (not for *this),
+  /// which lets a default-constructed SearchAtom serve as a hash functor.
+  size_t operator()(const SearchAtom &T) const { return T.Hash; }
+
+  bool operator==(const SearchAtom &Other) const {
+    return Type == Other.Type && Data == Other.Data;
+  }
+
+  /// Returns a view of the characters stored inside this atom.
+  llvm::StringRef getData() const { return Data; }
+
+  const Namespace &getType() const { return Type; }
+
+private:
+  friend llvm::hash_code hash_value(const SearchAtom &Atom) {
+    return Atom.Hash;
+  }
+
+  // FIXME(kbobyrev): Instead of storing the plain string (e.g. trigram
+  // characters or namespace name) it might make sense to store a unique ID of
+  // the tag. Otherwise the storage will be populated with partial strings, the
+  // concern is for long namespaces and file paths: e.g. if there is path
+  // a/b/c/d/e all prefixes will be stored. However, it's not clear how and when
+  // exactly these prefix paths (scopes, etc) should be mapped to IDs. It's
+  // probably worth introducing some kind of SearchAtomManager or something
+  // similar so that each path/scope part is stored exactly once and the Manager
+  // generates tags for each prefix.
+  // TODO(kbobyrev): Do a better job at documenting this one.
+  llvm::SmallString<3> Data;
+  size_t Hash;
+  Namespace Type;
+};
+
+/// \brief Splits unqualified symbol name into tokens for trigram generation.
+///
+/// First stage of trigram generation algorithm. Given an unqualified symbol
+/// name, this outputs a sequence of string tokens using the following rules:
+///
+/// * '_' is a separator. Multiple consecutive underscores are treated as a
+///   single separator. Underscores at the beginning and the end of the symbol
+///   name are skipped.
+///
+///   Examples: "unique_ptr" -> ["unique", "ptr"],
+///             "__builtin_popcount" -> ["builtin", "popcount"]
+///             "snake____case___" -> ["snake", "case"]
+///
+/// * Lowercase letter followed by an uppercase letter is a separator.
+///
+///   Examples: "kItemsCount" -> ["k", "Items", "Count"]
+///
+/// * Sequences of consecutive uppercase letters followed by a lowercase letter:
+///   the last uppercase letter is treated as the beginning of a next token.
+///
+///   Examples: "TUDecl" -> ["TU", "Decl"]
+///             "kDaysInAWeek" -> ["k", "Days", "In", "A", "Week"]
+///
+/// Note: digits are treated as lowercase letters. Example: "X86" -> ["X86"]
+std::vector<llvm::SmallString<10>> tokenize(llvm::StringRef SymbolName);
+
+// TODO(kbobyrev): Do a better job at documenting this one.
+/// \brief Returns list of unique fuzzy-search trigrams from unqualified symbol.
+///
+/// Combines all stages of trigram generation for fuzzy-search index.
+///
+/// 0. Splits SymbolName into tokens by applying tokenize()
+/// 1. Casts all letters to lowercase.
+/// 2. Generates trigrams.
+///
+/// The motivation for trigram generation algorithm is that extracted trigrams
+/// are 3-char suffixes of paths through the fuzzy matching automaton. There are
+/// four classes of extracted trigrams:
+///
+/// * The simplest one consists of consecutive 3-char sequences of each token.
+///
+///   Example: "trigram" -> ["tri", "rig", "igr", "gra", "ram"]
+///
+/// * Next class consists of front character of subsequent tokens.
+///
+///   Example: ["translation", "unit", "decl"] -> ["tud"]
+///
+///   Note: skipping tokens is allowed, but not more than one. For example,
+///   given ["a", "b", "c", "d", "e"] -> "ace" is allowed, but "ade" is not.
+///
+/// * Another class of trigrams consists of those with 2 characters in one
+///   token and the front character of subsequent token (just as before,
+///   skipping up to one token is allowed).
+///
+///   Example: ["ab", "c", "d", "e"] -> ["abc", "abd"]
+///   Note: similarly to the previous case, "abe" would not be allowed.
+///
+/// * The last class of trigrams is similar to the previous one: it takes one
+///   character from one token and two front characters from the next or
+///   skip-1-next tokens.
+///
+///   Example: ["a", "bc", "de", "fg"] -> ["abc", "ade"]
+///   But not "afg".
+///
+/// Note: the returned list of trigrams does not have duplicates, if any
+/// trigram belongs to more than one class it is only inserted once.
+std::vector<SearchAtom> generateSearchAtoms(llvm::StringRef SymbolName);
+
+} // namespace noctem
+} // namespace clangd
+} // namespace clang
+
+namespace llvm {
+
+// Support SearchAtoms as DenseMap keys. The empty and tombstone keys are
+// sentinels reserved for DenseMap's bookkeeping and must not be inserted as
+// real keys.
+template <> struct DenseMapInfo<clang::clangd::noctem::SearchAtom> {
+
+  static inline clang::clangd::noctem::SearchAtom getEmptyKey() {
+    static clang::clangd::noctem::SearchAtom EmptyKey("EMPTYKEY");
+    return EmptyKey;
+  }
+
+  static inline clang::clangd::noctem::SearchAtom getTombstoneKey() {
+    static clang::clangd::noctem::SearchAtom TombstoneKey("TOMBSTONE_KEY");
+    return TombstoneKey;
+  }
+
+  static unsigned getHashValue(const clang::clangd::noctem::SearchAtom &Tag) {
+    return static_cast<unsigned>(static_cast<size_t>(hash_value(Tag)));
+  }
+
+  static bool isEqual(const clang::clangd::noctem::SearchAtom &LHS,
+                      const clang::clangd::noctem::SearchAtom &RHS) {
+    return LHS == RHS;
+  }
+};
+
+} // namespace llvm
+
+#endif
Index: clang-tools-extra/clangd/index/noctem/SearchAtom.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/clangd/index/noctem/SearchAtom.cpp
@@ -0,0 +1,162 @@
+//===--- SearchAtom.cpp- Symbol Search primitive ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of
Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "SearchAtom.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Twine.h"
+
+#include <algorithm>
+#include <cctype>
+
+using namespace llvm;
+
+namespace clang {
+namespace clangd {
+namespace noctem {
+
+// <cctype> classifiers are undefined for negative char values, so characters
+// are passed as unsigned char.
+static bool isLowerOrDigit(unsigned char C) {
+  return std::islower(C) || std::isdigit(C);
+}
+
+static bool isUpper(unsigned char C) { return std::isupper(C) != 0; }
+
+// FIXME(kbobyrev): Deal with short symbol names.
+std::vector<SearchAtom> generateSearchAtoms(StringRef SymbolName) {
+  auto Tokens = tokenize(SymbolName);
+
+  // Apply lowercase text normalization. std::transform writes the converted
+  // characters back into the token (std::for_each would discard them).
+  for (auto &Token : Tokens)
+    std::transform(Token.begin(), Token.end(), Token.begin(),
+                   [](unsigned char C) {
+                     return static_cast<char>(std::tolower(C));
+                   });
+
+  DenseSet<SearchAtom> UniqueTrigrams;
+  std::vector<SearchAtom> Trigrams;
+  // Appends a trigram unless an equal one was already emitted by any class, so
+  // the result is duplicate-free and keeps first-seen order.
+  auto AddTrigram = [&](StringRef Chars) {
+    SearchAtom Trigram(Chars);
+    if (UniqueTrigrams.insert(Trigram).second)
+      Trigrams.push_back(Trigram);
+  };
+
+  const size_t NumTokens = Tokens.size();
+
+  // Extract trigrams consisting of first characters of tokens sorted by token
+  // positions. Trigram generator is allowed to skip 1 word between each
+  // token.
+  //
+  // Example: ["a", "b", "c", "d", "e"]
+  //
+  // would produce -> ["abc", "acd", "ace", ...] (among the others)
+  //
+  // but not -> ["ade"] because two tokens ("b" and "c") would be skipped in
+  // this case.
+  for (size_t First = 0; First < NumTokens; ++First) {
+    for (size_t Second = First + 1;
+         Second < std::min(First + 3, NumTokens); ++Second) {
+      for (size_t Third = Second + 1;
+           Third < std::min(Second + 3, NumTokens); ++Third) {
+        const char Fronts[] = {Tokens[First].front(), Tokens[Second].front(),
+                               Tokens[Third].front()};
+        AddTrigram(StringRef(Fronts, 3));
+      }
+    }
+  }
+
+  // Iterate through each token with a sliding window and extract trigrams
+  // consisting of 3 consecutive characters. Tokens shorter than three
+  // characters yield nothing.
+  //
+  // Example: "delete" -> ["del", "ele", "let", "ete"]
+  for (const auto &Token : Tokens)
+    for (size_t Position = 0; Position + 2 < Token.size(); ++Position)
+      AddTrigram(Token.substr(Position, 3));
+
+  // Combine characters of a token with the front of the next or skip-1-next
+  // token.
+  for (size_t First = 0; First < NumTokens; ++First) {
+    const StringRef Head = Tokens[First];
+    for (size_t Second = First + 1;
+         Second < std::min(First + 3, NumTokens); ++Second) {
+      const StringRef Tail = Tokens[Second];
+      for (size_t Position = 0; Position < Head.size(); ++Position) {
+        // Third class: two consecutive characters of the first token and the
+        // front character of the second one.
+        if (Position + 1 < Head.size())
+          AddTrigram((Head.substr(Position, 2) + Tail.substr(0, 1)).str());
+        // Last class: one character of the first token and the two front
+        // characters of the second one.
+        if (Tail.size() > 1)
+          AddTrigram((Head.substr(Position, 1) + Tail.substr(0, 2)).str());
+      }
+    }
+  }
+
+  return Trigrams;
+}
+
+std::vector<SmallString<10>> tokenize(StringRef SymbolName) {
+  std::vector<SmallString<10>> Tokens;
+  size_t TokenStart = 0;
+  // Skip underscores at the beginning, e.g. "__builtin_popcount". The bound
+  // check keeps empty and underscore-only names from being read out of range.
+  while (TokenStart < SymbolName.size() && SymbolName[TokenStart] == '_')
+    ++TokenStart;
+
+  for (size_t Index = TokenStart; Index + 1 < SymbolName.size(); ++Index) {
+    const char CurrentSymbol = SymbolName[Index];
+    const char NextSymbol = SymbolName[Index + 1];
+    // Skip sequences of underscores, e.g. "my__type".
+    if (CurrentSymbol == '_' && NextSymbol == '_') {
+      ++TokenStart;
+      continue;
+    }
+
+    // Splits if the next symbol is underscore or if processed characters are
+    // [lowercase, Uppercase] which indicates beginning of next token. Digits
+    // are equivalent to lowercase symbols.
+    if (NextSymbol == '_' ||
+        (isLowerOrDigit(CurrentSymbol) && isUpper(NextSymbol))) {
+      Tokens.emplace_back(
+          SymbolName.substr(TokenStart, Index - TokenStart + 1));
+      // The next token starts after the split; a separating underscore is
+      // not part of any token.
+      TokenStart = Index + (NextSymbol == '_' ? 2 : 1);
+    }
+
+    // If there were N (> 1) consecutive uppercase letter the split should
+    // generate two tokens, one of which would consist of N - 1 first uppercase
+    // letters, the next token begins with the last uppercase letter.
+    //
+    // Example: "TUDecl" -> ["TU", "Decl"]
+    if (isUpper(CurrentSymbol) && isLowerOrDigit(NextSymbol)) {
+      // Don't perform split if Index points to the beginning of new token,
+      // otherwise "NamedDecl" would be split into ["N", "amed", "D", "ecl"]
+      if (Index == TokenStart)
+        continue;
+      Tokens.emplace_back(SymbolName.substr(TokenStart, Index - TokenStart));
+      TokenStart = Index;
+    }
+  }
+
+  // Whatever follows the last split point forms the final token.
+  if (TokenStart < SymbolName.size())
+    Tokens.emplace_back(SymbolName.substr(TokenStart));
+
+  return Tokens;
+}
+
+} // namespace noctem
+} // namespace clangd
+} // namespace clang
Index: clang-tools-extra/unittests/clangd/CMakeLists.txt
===================================================================
--- clang-tools-extra/unittests/clangd/CMakeLists.txt
+++ clang-tools-extra/unittests/clangd/CMakeLists.txt
@@ -23,6 +23,7 @@
   GlobalCompilationDatabaseTests.cpp
   HeadersTests.cpp
   IndexTests.cpp
+  NoctemIndexTests.cpp
   QualityTests.cpp
   SourceCodeTests.cpp
   SymbolCollectorTests.cpp
Index: clang-tools-extra/unittests/clangd/NoctemIndexTests.cpp
===================================================================
--- /dev/null
+++ clang-tools-extra/unittests/clangd/NoctemIndexTests.cpp
@@ -0,0 +1,93 @@
+//===-- NoctemIndexTests.cpp -------------------------*- C++ -*-----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "index/noctem/SearchAtom.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "gtest/gtest.h"
+#include <initializer_list>
+#include <vector>
+
+namespace clang {
+namespace clangd {
+namespace noctem {
+
+// Converts string literals into a vector of SmallString<10> so that expected
+// token lists can be compared against the result of tokenize() with
+// EXPECT_EQ.
+std::vector<llvm::SmallString<10>>
+toSmallStrings(const std::vector<llvm::StringRef> &Strings) {
+  return std::vector<llvm::SmallString<10>>(Strings.begin(), Strings.end());
+}
+
+// Wraps each string into a SearchAtom of the default (Trigram) namespace,
+// preserving the order of the list.
+std::vector<SearchAtom>
+getTrigrams(std::initializer_list<llvm::StringRef> Trigrams) {
+  return std::vector<SearchAtom>(Trigrams.begin(), Trigrams.end());
+}
+
+TEST(NoctemIndexTokens, TrigramSymbolNameTokenization) {
+  EXPECT_EQ(tokenize("unique_ptr"), toSmallStrings({"unique", "ptr"}));
+
+  EXPECT_EQ(tokenize("TUDecl"), toSmallStrings({"TU", "Decl"}));
+
+  EXPECT_EQ(tokenize("table_name_"), toSmallStrings({"table", "name"}));
+
+  EXPECT_EQ(tokenize("kDaysInAWeek"),
+            toSmallStrings({"k", "Days", "In", "A", "Week"}));
+
+  EXPECT_EQ(tokenize("AlternateUrlTableErrors"),
+            toSmallStrings({"Alternate", "Url", "Table", "Errors"}));
+
+  EXPECT_EQ(tokenize("IsOK"), toSmallStrings({"Is", "OK"}));
+
+  EXPECT_EQ(tokenize("ABSL_FALLTHROUGH_INTENDED"),
+            toSmallStrings({"ABSL", "FALLTHROUGH", "INTENDED"}));
+
+  EXPECT_EQ(tokenize("SystemZ"), toSmallStrings({"System", "Z"}));
+
+  EXPECT_EQ(tokenize("X86"), toSmallStrings({"X86"}));
+
+  EXPECT_EQ(tokenize("ASTNodeKind"), toSmallStrings({"AST", "Node", "Kind"}));
+
+  EXPECT_EQ(tokenize("ObjCDictionaryElement"),
+            toSmallStrings({"Obj", "C", "Dictionary", "Element"}));
+
+  EXPECT_EQ(tokenize("multiple__underscores___everywhere____"),
+            toSmallStrings({"multiple", "underscores", "everywhere"}));
+
+  EXPECT_EQ(tokenize("__cuda_builtin_threadIdx_t"),
+            toSmallStrings({"cuda", "builtin", "thread", "Idx", "t"}));
+
+  EXPECT_EQ(tokenize("longUPPERCASESequence"),
+            toSmallStrings({"long", "UPPERCASE", "Sequence"}));
+}
+
+TEST(NoctemIndexTrigrams, TrigramGeneration) {
+  EXPECT_EQ(
+      generateSearchAtoms("a_b_c_d_e_"),
+      getTrigrams({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"}));
+
+  EXPECT_EQ(generateSearchAtoms("clangd"),
+            getTrigrams({"cla", "lan", "ang", "ngd"}));
+
+  EXPECT_EQ(generateSearchAtoms("abc_def"),
+            getTrigrams({"abc", "def", "abd", "ade", "bcd", "bde", "cde"}));
+
+  EXPECT_EQ(generateSearchAtoms("unique_ptr"),
+            getTrigrams({"uni", "niq", "iqu", "que", "ptr", "unp", "upt", "nip",
+                         "npt", "iqp", "ipt", "qup", "qpt", "uep", "ept"}));
+
+  EXPECT_EQ(generateSearchAtoms("nl"), getTrigrams({}));
+}
+
+} // namespace noctem
+} // namespace clangd
+} // namespace clang