diff --git a/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp b/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp --- a/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp +++ b/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp @@ -83,6 +83,12 @@ } BENCHMARK(DexQueries); +static void DexBuild(benchmark::State &State) { + for (auto _ : State) + buildDex(); +} +BENCHMARK(DexBuild); + } // namespace } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/index/dex/Dex.cpp b/clang-tools-extra/clangd/index/dex/Dex.cpp --- a/clang-tools-extra/clangd/index/dex/Dex.cpp +++ b/clang-tools-extra/clangd/index/dex/Dex.cpp @@ -39,28 +39,60 @@ const Token RestrictedForCodeCompletion = Token(Token::Kind::Sentinel, "Restricted For Code Completion"); -// Returns the tokens which are given symbol's characteristics. Currently, the -// generated tokens only contain fuzzy matching trigrams and symbol's scope, -// but in the future this will also return path proximity tokens and other -// types of tokens such as symbol type (if applicable). -// Returns the tokens which are given symbols's characteristics. For example, -// trigrams and scopes. -// FIXME(kbobyrev): Support more token types: -// * Namespace proximity -std::vector generateSearchTokens(const Symbol &Sym) { - std::vector Result = generateIdentifierTrigrams(Sym.Name); - Result.emplace_back(Token::Kind::Scope, Sym.Scope); - // Skip token generation for symbols with unknown declaration location. - if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty()) - for (const auto &ProximityURI : - generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) - Result.emplace_back(Token::Kind::ProximityURI, ProximityURI); - if (Sym.Flags & Symbol::IndexedForCodeCompletion) - Result.emplace_back(RestrictedForCodeCompletion); - if (!Sym.Type.empty()) - Result.emplace_back(Token::Kind::Type, Sym.Type); - return Result; -} +// Helper to efficiently assemble the inverse index (token -> matching docs). +// The output is a nice uniform structure keyed on Token, but constructing +// the Token object every time we want to insert into the map is wasteful. +// Instead we have various maps keyed on things that are cheap to compute, +// and produce the Token keys once at the end. +class IndexBuilder { + llvm::DenseMap> TrigramDocs; + std::vector RestrictedCCDocs; + llvm::StringMap> TypeDocs; + llvm::StringMap> ScopeDocs; + llvm::StringMap> ProximityDocs; + std::vector TrigramScratch; + +public: + // Add the tokens which are given symbol's characteristics. + // This includes fuzzy matching trigrams, symbol's scope, etc. + // FIXME(kbobyrev): Support more token types: + // * Namespace proximity + void add(const Symbol &Sym, DocID D) { + generateIdentifierTrigrams(Sym.Name, TrigramScratch); + for (Trigram T : TrigramScratch) + TrigramDocs[T].push_back(D); + ScopeDocs[Sym.Scope].push_back(D); + if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty()) + for (const auto &ProximityURI : + generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) + ProximityDocs[ProximityURI].push_back(D); + if (Sym.Flags & Symbol::IndexedForCodeCompletion) + RestrictedCCDocs.push_back(D); + if (!Sym.Type.empty()) + TypeDocs[Sym.Type].push_back(D); + } + + // Assemble the final compressed posting lists for the added symbols. + llvm::DenseMap build() { + llvm::DenseMap Result(/*InitialReserve=*/ + TrigramDocs.size() + + RestrictedCCDocs.size() + + TypeDocs.size() + + ScopeDocs.size() + + ProximityDocs.size()); + for (const auto &E : TrigramDocs) + Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second); + for (const auto &E : TypeDocs) + Result.try_emplace(Token(Token::Kind::Type, E.first()), E.second); + for (const auto &E : ScopeDocs) + Result.try_emplace(Token(Token::Kind::Scope, E.first()), E.second); + for (const auto &E : ProximityDocs) + Result.try_emplace(Token(Token::Kind::ProximityURI, E.first()), E.second); + if (!RestrictedCCDocs.empty()) + Result.try_emplace(RestrictedForCodeCompletion, RestrictedCCDocs); + return Result; + } +}; } // namespace @@ -86,18 +118,11 @@ Symbols[I] = ScoredSymbols[I].second; } - // Populate TempInvertedIndex with lists for index symbols. - llvm::DenseMap> TempInvertedIndex; - for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) { - const auto *Sym = Symbols[SymbolRank]; - for (const auto &Token : generateSearchTokens(*Sym)) - TempInvertedIndex[Token].push_back(SymbolRank); - } - - // Convert lists of items to posting lists. - for (const auto &TokenToPostingList : TempInvertedIndex) - InvertedIndex.insert( - {TokenToPostingList.first, PostingList(TokenToPostingList.second)}); + // Build posting lists for symbols. + IndexBuilder Builder; + for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) + Builder.add(*Symbols[SymbolRank], SymbolRank); + InvertedIndex = Builder.build(); } std::unique_ptr Dex::iterator(const Token &Tok) const { diff --git a/clang-tools-extra/clangd/index/dex/Trigram.h b/clang-tools-extra/clangd/index/dex/Trigram.h --- a/clang-tools-extra/clangd/index/dex/Trigram.h +++ b/clang-tools-extra/clangd/index/dex/Trigram.h @@ -24,6 +24,7 @@ #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H #include "Token.h" +#include "llvm/ADT/bit.h" #include @@ -31,7 +32,27 @@ namespace clangd { namespace dex { -/// Returns list of unique fuzzy-search trigrams from unqualified symbol. +// Compact representation of a trigram (string with up to 3 characters). +// Trigram generation is the hot path of indexing, so Token is too wasteful. +class Trigram { + std::array Data; // Last element is length. + // Steal some invalid bit patterns for DenseMap sentinels. + enum class Sentinel { Tombstone = 4, Empty = 5 }; + Trigram(Sentinel S) : Data{0, 0, 0, static_cast(S)} {} + uint32_t id() const { return llvm::bit_cast(Data); } + +public: + Trigram() : Data{0, 0, 0, 0} {} + Trigram(char A) : Data{A, 0, 0, 1} {} + Trigram(char A, char B) : Data{A, B, 0, 2} {} + Trigram(char A, char B, char C) : Data{A, B, C, 3} {} + std::string str() const { return std::string(Data.data(), Data[3]); } + friend struct ::llvm::DenseMapInfo; + friend bool operator==(Trigram L, Trigram R) { return L.id() == R.id(); } + friend bool operator<(Trigram L, Trigram R) { return L.id() < R.id(); } +}; + +/// Produces list of unique fuzzy-search trigrams from unqualified symbol. /// The trigrams give the 3-character query substrings this symbol can match. /// /// The symbol's name is broken into segments, e.g. "FooBar" has two segments. @@ -46,8 +67,9 @@ /// {f, fo, fb, foo, fob, fba, oob, oba, bar}. /// /// Trigrams are lowercase, as trigram matching is case-insensitive. -/// Trigrams in the returned list are deduplicated. -std::vector generateIdentifierTrigrams(llvm::StringRef Identifier); +/// Trigrams in the list are deduplicated. +void generateIdentifierTrigrams(llvm::StringRef Identifier, + std::vector &Out); /// Returns list of unique fuzzy-search trigrams given a query. /// @@ -64,4 +86,29 @@ } // namespace clangd } // namespace clang +namespace llvm { +template <> struct llvm::DenseMapInfo { + using Trigram = clang::clangd::dex::Trigram; + static inline Trigram getEmptyKey() { + return Trigram(Trigram::Sentinel::Empty); + } + static inline Trigram getTombstoneKey() { + return Trigram(Trigram::Sentinel::Tombstone); + } + static unsigned getHashValue(Trigram V) { + // Finalize step from MurmurHash3. + uint32_t X = V.id(); + X ^= X >> 16; + X *= uint32_t{0x85ebca6b}; + X ^= X >> 13; + X *= uint32_t{0xc2b2ae35}; + X ^= X >> 16; + return X; + } + static bool isEqual(const Trigram &LHS, const Trigram &RHS) { + return LHS == RHS; + } +}; +} // namespace llvm + #endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TRIGRAM_H diff --git a/clang-tools-extra/clangd/index/dex/Trigram.cpp b/clang-tools-extra/clangd/index/dex/Trigram.cpp --- a/clang-tools-extra/clangd/index/dex/Trigram.cpp +++ b/clang-tools-extra/clangd/index/dex/Trigram.cpp @@ -11,6 +11,7 @@ #include "Token.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include #include @@ -20,7 +21,9 @@ namespace clangd { namespace dex { -std::vector generateIdentifierTrigrams(llvm::StringRef Identifier) { +// Produce trigrams (including duplicates) and pass them to Out(). +template +static void identifierTrigrams(llvm::StringRef Identifier, Func Out) { // Apply fuzzy matching text segmentation. std::vector Roles(Identifier.size()); calculateRoles(Identifier, @@ -46,12 +49,6 @@ } } - llvm::DenseSet UniqueTrigrams; - - auto Add = [&](std::string Chars) { - UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars)); - }; - // Iterate through valid sequences of three characters Fuzzy Matcher can // process. for (size_t I = 0; I < LowercaseIdentifier.size(); ++I) { @@ -64,23 +61,41 @@ for (const unsigned K : Next[J]) { if (K == 0) continue; - Add({{LowercaseIdentifier[I], LowercaseIdentifier[J], - LowercaseIdentifier[K]}}); + Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J], + LowercaseIdentifier[K])); } } } // Emit short-query trigrams: FooBar -> f, fo, fb. if (!LowercaseIdentifier.empty()) - Add({LowercaseIdentifier[0]}); + Out(Trigram(LowercaseIdentifier[0])); if (LowercaseIdentifier.size() >= 2) - Add({LowercaseIdentifier[0], LowercaseIdentifier[1]}); + Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[1])); for (size_t I = 1; I < LowercaseIdentifier.size(); ++I) if (Roles[I] == Head) { - Add({LowercaseIdentifier[0], LowercaseIdentifier[I]}); + Out(Trigram(LowercaseIdentifier[0], LowercaseIdentifier[I])); break; } +} - return {UniqueTrigrams.begin(), UniqueTrigrams.end()}; +void generateIdentifierTrigrams(llvm::StringRef Identifier, + std::vector &Result) { + // Empirically, scanning for duplicates is faster with fewer trigrams, and + // a hashtable is faster with more. This is a hot path, so dispatch based on + // expected number of trigrams. Longer identifiers produce more trigrams. + // The magic number was tuned by running IndexBenchmark.DexBuild. + constexpr unsigned ManyTrigramsIdentifierThreshold = 14; + Result.clear(); + if (Identifier.size() < ManyTrigramsIdentifierThreshold) { + identifierTrigrams(Identifier, [&](Trigram T) { + if (!llvm::is_contained(Result, T)) + Result.push_back(T); + }); + } else { + identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); }); + llvm::sort(Result); + Result.erase(std::unique(Result.begin(), Result.end()), Result.end()); + } } std::vector generateQueryTrigrams(llvm::StringRef Query) { diff --git a/clang-tools-extra/clangd/test/Inputs/requests.json b/clang-tools-extra/clangd/test/Inputs/requests.json --- a/clang-tools-extra/clangd/test/Inputs/requests.json +++ b/clang-tools-extra/clangd/test/Inputs/requests.json @@ -1,7 +1,7 @@ -[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false}, -{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false}, -{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false}, -{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false}, -{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false}, -{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false}, -{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false}] +[{"Limit":100,"ProximityPaths":["/usr/home/user/clang-tools-extra/clangd/benchmarks/IndexBenchmark.cpp"],"Query":"OMP","RestrictForCodeCompletion":true,"Scopes":["clang::"], "AnyScope":false, "PreferredTypes":[]}, +{"Limit":100,"ProximityPaths":[],"Query":"s","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]}, +{"Limit":100,"ProximityPaths":[],"Query":"sy","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]}, +{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]}, +{"Limit":100,"ProximityPaths":[],"Query":"sys","RestrictForCodeCompletion":true,"Scopes":["llvm::", ""], "AnyScope":false, "PreferredTypes":[]}, +{"Limit":100,"ProximityPaths":[],"Query":"Dex","RestrictForCodeCompletion":true,"Scopes":["clang::clangd::", "clang::", "clang::clangd::dex::"],"AnyScope":false, "PreferredTypes":[]}, +{"Limit":100,"ProximityPaths":[],"Query":"Variable","RestrictForCodeCompletion":true,"Scopes":[""], "AnyScope":false, "PreferredTypes":[]}] diff --git a/clang-tools-extra/clangd/unittests/DexTests.cpp b/clang-tools-extra/clangd/unittests/DexTests.cpp --- a/clang-tools-extra/clangd/unittests/DexTests.cpp +++ b/clang-tools-extra/clangd/unittests/DexTests.cpp @@ -366,38 +366,46 @@ return tokensAre(Trigrams, Token::Kind::Trigram); } +std::vector identifierTrigramTokens(llvm::StringRef S) { + std::vector Trigrams; + generateIdentifierTrigrams(S, Trigrams); + std::vector Tokens; + for (Trigram T : Trigrams) + Tokens.emplace_back(Token::Kind::Trigram, T.str()); + return Tokens; +} + TEST(DexTrigrams, IdentifierTrigrams) { - EXPECT_THAT(generateIdentifierTrigrams("X86"), - trigramsAre({"x86", "x", "x8"})); + EXPECT_THAT(identifierTrigramTokens("X86"), trigramsAre({"x86", "x", "x8"})); - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl", "n"})); + EXPECT_THAT(identifierTrigramTokens("nl"), trigramsAre({"nl", "n"})); - EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n"})); + EXPECT_THAT(identifierTrigramTokens("n"), trigramsAre({"n"})); - EXPECT_THAT(generateIdentifierTrigrams("clangd"), + EXPECT_THAT(identifierTrigramTokens("clangd"), trigramsAre({"c", "cl", "cla", "lan", "ang", "ngd"})); - EXPECT_THAT(generateIdentifierTrigrams("abc_def"), + EXPECT_THAT(identifierTrigramTokens("abc_def"), trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "bcd", "bde", "cde", "def"})); - EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), + EXPECT_THAT(identifierTrigramTokens("a_b_c_d_e_"), trigramsAre({"a", "a_", "ab", "abc", "bcd", "cde"})); - EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), + EXPECT_THAT(identifierTrigramTokens("unique_ptr"), trigramsAre({"u", "un", "up", "uni", "unp", "upt", "niq", "nip", "npt", "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep", "ept", "ptr"})); EXPECT_THAT( - generateIdentifierTrigrams("TUDecl"), + identifierTrigramTokens("TUDecl"), trigramsAre({"t", "tu", "td", "tud", "tde", "ude", "dec", "ecl"})); - EXPECT_THAT(generateIdentifierTrigrams("IsOK"), + EXPECT_THAT(identifierTrigramTokens("IsOK"), trigramsAre({"i", "is", "io", "iso", "iok", "sok"})); EXPECT_THAT( - generateIdentifierTrigrams("abc_defGhij__klm"), + identifierTrigramTokens("abc_defGhij__klm"), trigramsAre({"a", "ab", "ad", "abc", "abd", "ade", "adg", "bcd", "bde", "bdg", "cde", "cdg", "def", "deg", "dgh", "dgk", "efg", "egh", "egk", "fgh", "fgk", "ghi", "ghk", "gkl",