Index: clang-tools-extra/trunk/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/trunk/clangd/CMakeLists.txt +++ clang-tools-extra/trunk/clangd/CMakeLists.txt @@ -34,6 +34,7 @@ TUScheduler.cpp URI.cpp XRefs.cpp + index/CanonicalIncludes.cpp index/FileIndex.cpp index/Index.cpp @@ -42,6 +43,8 @@ index/SymbolCollector.cpp index/SymbolYAML.cpp + index/dex/Trigram.cpp + LINK_LIBS clangAST clangASTMatchers Index: clang-tools-extra/trunk/clangd/index/dex/Token.h =================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Token.h +++ clang-tools-extra/trunk/clangd/index/dex/Token.h @@ -0,0 +1,112 @@ +//===--- Token.h - Symbol Search primitive ----------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Token objects represent a characteristic of a symbol, which can be used to +// perform efficient search. Tokens are keys for inverted index which are mapped +// to the corresponding posting lists. +// +// The symbol std::cout might have the tokens: +// * Scope "std::" +// * Trigram "cou" +// * Trigram "out" +// * Type "std::ostream" +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +/// A Token represents an attribute of a symbol, such as a particular trigram +/// present in the name (used for fuzzy search). +/// +/// Tokens can be used to perform more sophisticated search queries by +/// constructing complex iterator trees. +struct Token { + /// Kind specifies Token type which defines semantics for the internal + /// representation. Each Kind has different representation stored in Data + /// field. + enum class Kind { + /// Represents trigram used for fuzzy search of unqualified symbol names. + /// + /// Data contains 3 bytes with trigram contents. + Trigram, + /// Scope primitives, e.g. "symbol belongs to namespace foo::bar". + /// + /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global + /// scope). + Scope, + /// Internal Token type for invalid/special tokens, e.g. empty tokens for + /// llvm::DenseMap. + Sentinel, + /// FIXME(kbobyrev): Add other Token Kinds + /// * Path with full or relative path to the directory in which symbol is + /// defined + /// * Type with qualified type name or its USR + }; + + Token(Kind TokenKind, llvm::StringRef Data) + : Data(Data), TokenKind(TokenKind) {} + + bool operator==(const Token &Other) const { + return TokenKind == Other.TokenKind && Data == Other.Data; + } + + /// Representation which is unique among Token with the same Kind. + std::string Data; + Kind TokenKind; + + friend llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const Token &T) { + return OS << T.Data; + } + +private: + friend llvm::hash_code hash_value(const Token &Token) { + return llvm::hash_combine(static_cast(Token.TokenKind), Token.Data); + } +}; + +} // namespace dex +} // namespace clangd +} // namespace clang + +namespace llvm { + +// Support Tokens as DenseMap keys. +template <> struct DenseMapInfo { + static inline clang::clangd::dex::Token getEmptyKey() { + return {clang::clangd::dex::Token::Kind::Sentinel, "EmptyKey"}; + } + + static inline clang::clangd::dex::Token getTombstoneKey() { + return {clang::clangd::dex::Token::Kind::Sentinel, "TombstoneKey"}; + } + + static unsigned getHashValue(const clang::clangd::dex::Token &Tag) { + return hash_value(Tag); + } + + static bool isEqual(const clang::clangd::dex::Token &LHS, + const clang::clangd::dex::Token &RHS) { + return LHS == RHS; + } +}; + +} // namespace llvm + +#endif Index: clang-tools-extra/trunk/clangd/index/dex/Trigram.h =================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Trigram.h +++ clang-tools-extra/trunk/clangd/index/dex/Trigram.h @@ -0,0 +1,62 @@ +//===--- Trigram.h - Trigram generation for Fuzzy Matching ------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Trigrams are attributes of the symbol unqualified name used to effectively +// extract symbols which can be fuzzy-matched given user query from the inverted +// index. To match query with the extracted set of trigrams Q, the set of +// generated trigrams T for identifier (unqualified symbol name) should contain +// all items of Q, i.e. Q ⊆ T. +// +// Trigram sets extracted from unqualified name and from query are different: +// the set of query trigrams only contains consecutive sequences of three +// characters (which is only a subset of all trigrams generated for an +// identifier). +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H + +#include "Token.h" + +#include + +namespace clang { +namespace clangd { +namespace dex { + +/// Returns list of unique fuzzy-search trigrams from unqualified symbol. +/// +/// First, given Identifier (unqualified symbol name) is segmented using +/// FuzzyMatch API and lowercased. After segmentation, the following technique +/// is applied for generating trigrams: for each letter or digit in the input +/// string the algorithms looks for the possible next and skip-1-next symbols +/// which can be jumped to during fuzzy matching. Each combination of such three +/// symbols is inserted into the result. +/// +/// Trigrams can start at any character in the input. Then we can choose to move +/// to the next character, move to the start of the next segment, or skip over a +/// segment. +/// +/// Note: the returned list of trigrams does not have duplicates, if any trigram +/// belongs to more than one class it is only inserted once. +std::vector generateIdentifierTrigrams(llvm::StringRef Identifier); + +/// Returns list of unique fuzzy-search trigrams given a query. +/// +/// Query is segmented using FuzzyMatch API and downcasted to lowercase. Then, +/// the simplest trigrams - sequences of three consecutive letters and digits +/// are extracted and returned after deduplication. +std::vector generateQueryTrigrams(llvm::StringRef Query); + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif Index: clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp =================================================================== --- clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp +++ clang-tools-extra/trunk/clangd/index/dex/Trigram.cpp @@ -0,0 +1,132 @@ +//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Trigram.h" +#include "../../FuzzyMatch.h" +#include "Token.h" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/StringExtras.h" + +#include +#include +#include + +using namespace llvm; + +namespace clang { +namespace clangd { +namespace dex { + +// FIXME(kbobyrev): Deal with short symbol symbol names. A viable approach would +// be generating unigrams and bigrams here, too. This would prevent symbol index +// from applying fuzzy matching on a tremendous number of symbols and allow +// supplementary retrieval for short queries. +// +// Short names (total segment length <3 characters) are currently ignored. +std::vector generateIdentifierTrigrams(llvm::StringRef Identifier) { + // Apply fuzzy matching text segmentation. + std::vector Roles(Identifier.size()); + calculateRoles(Identifier, + llvm::makeMutableArrayRef(Roles.data(), Identifier.size())); + + std::string LowercaseIdentifier = Identifier.lower(); + + // For each character, store indices of the characters to which fuzzy matching + // algorithm can jump. There are 3 possible variants: + // + // * Next Tail - next character from the same segment + // * Next Head - front character of the next segment + // * Skip-1-Next Head - front character of the skip-1-next segment + // + // Next stores tuples of three indices in the presented order, if a variant is + // not available then 0 is stored. + std::vector> Next(LowercaseIdentifier.size()); + unsigned NextTail = 0, NextHead = 0, NextNextHead = 0; + for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) { + Next[I] = {{NextTail, NextHead, NextNextHead}}; + NextTail = Roles[I] == Tail ? I : 0; + if (Roles[I] == Head) { + NextNextHead = NextHead; + NextHead = I; + } + } + + DenseSet UniqueTrigrams; + std::array Chars; + for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { + // Skip delimiters. + if (Roles[I] != Head && Roles[I] != Tail) + continue; + for (const unsigned J : Next[I]) { + if (!J) + continue; + for (const unsigned K : Next[J]) { + if (!K) + continue; + Chars = {{LowercaseIdentifier[I], LowercaseIdentifier[J], + LowercaseIdentifier[K], 0}}; + auto Trigram = Token(Token::Kind::Trigram, Chars.data()); + // Push unique trigrams to the result. + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + } + } + } + } + + std::vector Result; + for (const auto &Trigram : UniqueTrigrams) + Result.push_back(Trigram); + + return Result; +} + +// FIXME(kbobyrev): Similarly, to generateIdentifierTrigrams, this ignores short +// inputs (total segment length <3 characters). +std::vector generateQueryTrigrams(llvm::StringRef Query) { + // Apply fuzzy matching text segmentation. + std::vector Roles(Query.size()); + calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size())); + + std::string LowercaseQuery = Query.lower(); + + DenseSet UniqueTrigrams; + std::deque Chars; + + for (size_t I = 0; I < LowercaseQuery.size(); ++I) { + // If current symbol is delimiter, just skip it. + if (Roles[I] != Head && Roles[I] != Tail) + continue; + + Chars.push_back(LowercaseQuery[I]); + + if (Chars.size() > 3) + Chars.pop_front(); + if (Chars.size() == 3) { + auto Trigram = + Token(Token::Kind::Trigram, std::string(begin(Chars), end(Chars))); + // Push unique trigrams to the result. + if (!UniqueTrigrams.count(Trigram)) { + UniqueTrigrams.insert(Trigram); + } + } + } + + std::vector Result; + for (const auto &Trigram : UniqueTrigrams) + Result.push_back(Trigram); + + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt +++ clang-tools-extra/trunk/unittests/clangd/CMakeLists.txt @@ -15,9 +15,10 @@ CodeCompleteTests.cpp CodeCompletionStringsTests.cpp ContextTests.cpp + DexIndexTests.cpp DraftStoreTests.cpp - FileIndexTests.cpp FileDistanceTests.cpp + FileIndexTests.cpp FindSymbolsTests.cpp FuzzyMatchTests.cpp GlobalCompilationDatabaseTests.cpp @@ -27,11 +28,11 @@ SourceCodeTests.cpp SymbolCollectorTests.cpp SyncAPI.cpp + TUSchedulerTests.cpp TestFS.cpp TestTU.cpp ThreadingTests.cpp TraceTests.cpp - TUSchedulerTests.cpp URITests.cpp XRefsTests.cpp ) Index: clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/trunk/unittests/clangd/DexIndexTests.cpp @@ -0,0 +1,96 @@ +//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "index/dex/Token.h" +#include "index/dex/Trigram.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" + +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +testing::Matcher> +trigramsAre(std::initializer_list Trigrams) { + std::vector Tokens; + for (const auto &Symbols : Trigrams) { + Tokens.push_back(Token(Token::Kind::Trigram, Symbols)); + } + return testing::UnorderedElementsAreArray(Tokens); +} + +TEST(DexIndexTrigrams, IdentifierTrigrams) { + EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86"})); + + EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({})); + + EXPECT_THAT(generateIdentifierTrigrams("clangd"), + trigramsAre({"cla", "lan", "ang", "ngd"})); + + EXPECT_THAT(generateIdentifierTrigrams("abc_def"), + trigramsAre({"abc", "abd", "ade", "bcd", "bde", "cde", "def"})); + + EXPECT_THAT( + generateIdentifierTrigrams("a_b_c_d_e_"), + trigramsAre({"abc", "abd", "acd", "ace", "bcd", "bce", "bde", "cde"})); + + EXPECT_THAT( + generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", + "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); + + EXPECT_THAT(generateIdentifierTrigrams("TUDecl"), + trigramsAre({"tud", "tde", "ude", "dec", "ecl"})); + + EXPECT_THAT(generateIdentifierTrigrams("IsOK"), + trigramsAre({"iso", "iok", "sok"})); + + EXPECT_THAT(generateIdentifierTrigrams("abc_defGhij__klm"), + trigramsAre({ + "abc", "abd", "abg", "ade", "adg", "adk", "agh", "agk", "bcd", + "bcg", "bde", "bdg", "bdk", "bgh", "bgk", "cde", "cdg", "cdk", + "cgh", "cgk", "def", "deg", "dek", "dgh", "dgk", "dkl", "efg", + "efk", "egh", "egk", "ekl", "fgh", "fgk", "fkl", "ghi", "ghk", + "gkl", "hij", "hik", "hkl", "ijk", "ikl", "jkl", "klm", + })); +} + +TEST(DexIndexTrigrams, QueryTrigrams) { + EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); + + EXPECT_THAT(generateQueryTrigrams("nl"), trigramsAre({})); + + EXPECT_THAT(generateQueryTrigrams("clangd"), + trigramsAre({"cla", "lan", "ang", "ngd"})); + + EXPECT_THAT(generateQueryTrigrams("abc_def"), + trigramsAre({"abc", "bcd", "cde", "def"})); + + EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"), + trigramsAre({"abc", "bcd", "cde"})); + + EXPECT_THAT(generateQueryTrigrams("unique_ptr"), + trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"})); + + EXPECT_THAT(generateQueryTrigrams("TUDecl"), + trigramsAre({"tud", "ude", "dec", "ecl"})); + + EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"})); + + EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"), + trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi", + "hij", "ijk", "jkl", "klm"})); +} + +} // namespace dex +} // namespace clangd +} // namespace clang