Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -46,6 +46,7 @@ index/dex/DexIndex.cpp index/dex/Iterator.cpp + index/dex/Token.cpp index/dex/Trigram.cpp LINK_LIBS Index: clang-tools-extra/clangd/index/dex/DexIndex.h =================================================================== --- clang-tools-extra/clangd/index/dex/DexIndex.h +++ clang-tools-extra/clangd/index/dex/DexIndex.h @@ -41,10 +41,12 @@ public: /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain /// accessible as long as `Symbols` is kept alive. - void build(std::shared_ptr> Symbols); + void build(std::shared_ptr> Symbols, + llvm::ArrayRef URISchemes); /// \brief Build index from a symbol slab. - static std::unique_ptr build(SymbolSlab Slab); + static std::unique_ptr + build(SymbolSlab Slab, llvm::ArrayRef URISchemes); bool fuzzyFind(const FuzzyFindRequest &Req, @@ -59,8 +61,15 @@ size_t estimateMemoryUsage() const override; -private: + /// For fuzzyFind() Dex retrieves getRetrievalItemsMultiplier() more items + /// than requested via Req.MaxCandidateCount in the first stage of filtering. + size_t getRetrievalItemsMultiplier() const; + + /// For fuzzyFind() Dex scores getScoringItemsMultiplier() more items + /// than requested via Req.MaxCandidateCount in the second stage of filtering. + size_t getScoringItemsMultiplier() const; +private: mutable std::mutex Mutex; std::shared_ptr> Symbols /*GUARDED_BY(Mutex)*/; @@ -73,6 +82,10 @@ // Inverted index is used to retrieve posting lists which are processed during // the fuzzyFind process. llvm::DenseMap InvertedIndex /*GUARDED_BY(Mutex)*/; + std::vector URISchemes /*GUARDED BY(Mutex)*/; + + const static size_t RETRIEVAL_ITEMS_MULTIPLIER = 100; + const static size_t SCORING_ITEMS_MULTIPLIER = 10; }; } // namespace dex Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/DexIndex.cpp +++ clang-tools-extra/clangd/index/dex/DexIndex.cpp @@ -26,17 +26,21 @@ // Returns the tokens which are given symbols's characteristics. For example, // trigrams and scopes. // FIXME(kbobyrev): Support more token types: -// * Path proximity // * Types std::vector generateSearchTokens(const Symbol &Sym) { std::vector Result = generateIdentifierTrigrams(Sym.Name); Result.push_back(Token(Token::Kind::Scope, Sym.Scope)); + for (const auto &PathToken : + generateProximityPaths(Sym.CanonicalDeclaration.FileURI)) { + Result.push_back(PathToken); + } return Result; } } // namespace -void DexIndex::build(std::shared_ptr> Syms) { +void DexIndex::build(std::shared_ptr> Syms, + llvm::ArrayRef ServerURISchemes) { llvm::DenseMap TempLookupTable; llvm::DenseMap TempSymbolQuality; for (const Symbol *Sym : *Syms) { @@ -66,15 +70,17 @@ Symbols = std::move(Syms); InvertedIndex = std::move(TempInvertedIndex); SymbolQuality = std::move(TempSymbolQuality); + URISchemes = ServerURISchemes; } vlog("Built DexIndex with estimated memory usage {0} bytes.", estimateMemoryUsage()); } -std::unique_ptr DexIndex::build(SymbolSlab Slab) { +std::unique_ptr +DexIndex::build(SymbolSlab Slab, llvm::ArrayRef URISchemes) { auto Idx = llvm::make_unique(); - Idx->build(getSymbolsFromSlab(std::move(Slab))); + Idx->build(getSymbolsFromSlab(std::move(Slab)), URISchemes); return std::move(Idx); } @@ -117,6 +123,26 @@ if (!ScopeIterators.empty()) TopLevelChildren.push_back(createOr(move(ScopeIterators))); + // Add proximity paths boosting. + std::vector> BoostingIterators; + // This should generate proximity path boosting iterators for each different + // URI Scheme the Index is aware of. + for (const auto &ProximityPath : Req.ProximityPaths) { + for (const auto &P : + generateQueryProximityPaths(ProximityPath, URISchemes)) { + const auto It = InvertedIndex.find(P.first); + if (It != InvertedIndex.end()) + // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. + BoostingIterators.push_back( + createBoost(create(It->second), P.second)); + } + } + // Boosting iterators do not actually filter symbols. In order to preserve + // the validity of resulting query, TRUE iterator should be added along + // BOOSTs. + BoostingIterators.push_back(createTrue(Symbols->size())); + TopLevelChildren.push_back(createOr(move(BoostingIterators))); + // Use TRUE iterator if both trigrams and scopes from the query are not // present in the symbol index. auto QueryIterator = TopLevelChildren.empty() @@ -127,16 +153,24 @@ // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as // using 100x of the requested number might not be good in practice, e.g. // when the requested number of items is small. - const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount; + const size_t ItemsToRetrieve = + getRetrievalItemsMultiplier() * Req.MaxCandidateCount; auto Root = createLimit(move(QueryIterator), ItemsToRetrieve); - // FIXME(kbobyrev): Add boosting to the query and utilize retrieved - // boosting scores. std::vector> SymbolDocIDs = consume(*Root); + // Sort items using boosting score as the key. + std::sort(begin(SymbolDocIDs), end(SymbolDocIDs), + [](const std::pair &LHS, + const std::pair &RHS) { + return LHS.second > RHS.second; + }); + // Retrieve top Req.MaxCandidateCount items. + const size_t ItemsToScore = + getScoringItemsMultiplier() * Req.MaxCandidateCount; std::priority_queue> Top; - for (const auto &P : SymbolDocIDs) { - const DocID SymbolDocID = P.first; + for (size_t I = 0; I < std::min(SymbolDocIDs.size(), ItemsToScore); ++I) { + const DocID SymbolDocID = SymbolDocIDs[I].first; const auto *Sym = (*Symbols)[SymbolDocID]; const llvm::Optional Score = Filter.match(Sym->Name); if (!Score) @@ -188,6 +222,14 @@ return Bytes; } +size_t DexIndex::getRetrievalItemsMultiplier() const { + return RETRIEVAL_ITEMS_MULTIPLIER; +} + +size_t DexIndex::getScoringItemsMultiplier() const { + return SCORING_ITEMS_MULTIPLIER; +} + } // namespace dex } // namespace clangd } // namespace clang Index: clang-tools-extra/clangd/index/dex/Token.h =================================================================== --- clang-tools-extra/clangd/index/dex/Token.h +++ clang-tools-extra/clangd/index/dex/Token.h @@ -48,15 +48,18 @@ Trigram, /// Scope primitives, e.g. "symbol belongs to namespace foo::bar". /// - /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global + /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global /// scope). Scope, + /// Path to symbol declaration. + /// + /// Data stores URI path to the directory with the trailing '/', e.g. + /// "file:///path/to/clang-tools-extra/clangd/index/dex/". + Path, /// Internal Token type for invalid/special tokens, e.g. empty tokens for /// llvm::DenseMap. Sentinel, /// FIXME(kbobyrev): Add other Token Kinds - /// * Path with full or relative path to the directory in which symbol is - /// defined /// * Type with qualified type name or its USR }; @@ -81,6 +84,23 @@ } }; +/// Returns Search Token for each parent directory of given Path. Should be used +/// within the index build process. +/// +/// When Limit is set, returns no more than Limit tokens. +std::vector +generateProximityPaths(llvm::StringRef Path, + size_t Limit = std::numeric_limits::max()); + +/// Returns Search Token and its boost for each path created via converting +/// given absolute path by using provided URI scheme. +/// +/// Boost starts with Count and decreases by 1 for each parent directory token. +std::vector> +generateQueryProximityPaths(llvm::StringRef AbsolutePath, + llvm::ArrayRef URISchemes, + size_t Count = 5); + } // namespace dex } // namespace clangd } // namespace clang @@ -109,4 +129,4 @@ } // namespace llvm -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_DEX_TOKEN_H +#endif Index: clang-tools-extra/clangd/index/dex/Token.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Token.cpp @@ -0,0 +1,58 @@ +//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Token.h" +#include "URI.h" +#include + +namespace clang { +namespace clangd { +namespace dex { + +std::vector generateProximityPaths(llvm::StringRef URIPath, + size_t Limit) { + std::vector Result; + const static char Separator = '/'; + for (size_t SeparatorPosition = URIPath.find_last_of(Separator); + SeparatorPosition != StringRef::npos && !URIPath.endswith("://") && + Limit > 0; + SeparatorPosition = URIPath.find_last_of(Separator), --Limit) { + // Drop everything after last '/'. + URIPath = URIPath.drop_back(URIPath.size() - SeparatorPosition - 1); + Result.push_back(Token(Token::Kind::Path, URIPath)); + // Drop trailing '/' so that the next one could be found. + URIPath = URIPath.drop_back(); + } + return Result; +} + +std::vector> +generateQueryProximityPaths(llvm::StringRef AbsolutePath, + llvm::ArrayRef URISchemes, + size_t Count) { + std::vector> Result; + for (const auto Scheme : URISchemes) { + float Boost = Count + 1; + auto URI = URI::create(AbsolutePath, Scheme); + // For some paths, conversion to different URI schemes is impossible. These + // should be just skipped. + if (!URI) + continue; + for (const auto ProximityPath : + generateProximityPaths(URI.get().toString(), Count)) + Result.push_back({ProximityPath, Boost--}); + // As soon as one valid URI scheme for given absolute path is found, the + // token generation process should be stopped. + break; + } + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/clangd/tool/ClangdMain.cpp =================================================================== --- clang-tools-extra/clangd/tool/ClangdMain.cpp +++ clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -42,7 +42,9 @@ // Build an in-memory static index for global symbols from a YAML-format file. // The size of global symbols should be relatively small, so that all symbols // can be managed in memory. -std::unique_ptr buildStaticIndex(llvm::StringRef YamlSymbolFile) { +std::unique_ptr +buildStaticIndex(llvm::StringRef YamlSymbolFile, + std::vector &URISchemes) { auto Buffer = llvm::MemoryBuffer::getFile(YamlSymbolFile); if (!Buffer) { llvm::errs() << "Can't open " << YamlSymbolFile << "\n"; @@ -53,8 +55,9 @@ for (auto Sym : Slab) SymsBuilder.insert(Sym); - return UseDex ? dex::DexIndex::build(std::move(SymsBuilder).build()) - : MemIndex::build(std::move(SymsBuilder).build()); + return UseDex + ? dex::DexIndex::build(std::move(SymsBuilder).build(), URISchemes) + : MemIndex::build(std::move(SymsBuilder).build()); } } // namespace @@ -298,7 +301,7 @@ Opts.BuildDynamicSymbolIndex = EnableIndex; std::unique_ptr StaticIdx; if (EnableIndex && !YamlSymbolFile.empty()) { - StaticIdx = buildStaticIndex(YamlSymbolFile); + StaticIdx = buildStaticIndex(YamlSymbolFile, Opts.URISchemes); Opts.StaticIndex = StaticIdx.get(); } Opts.AsyncThreadsCount = WorkerThreadsCount; Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "FuzzyMatch.h" #include "TestIndex.h" #include "index/Index.h" #include "index/Merge.h" @@ -29,6 +30,12 @@ namespace dex { namespace { +std::vector URISchemes = {"file"}; + +//===----------------------------------------------------------------------===// +// Query Iterators tests. +//===----------------------------------------------------------------------===// + std::vector consumeIDs(Iterator &It) { auto IDAndScore = consume(It); std::vector IDs(IDAndScore.size()); @@ -324,16 +331,39 @@ EXPECT_THAT(ElementBoost, 3); } +//===----------------------------------------------------------------------===// +// Search Token tests. +//===----------------------------------------------------------------------===// + testing::Matcher> -trigramsAre(std::initializer_list Trigrams) { +tokensAre(std::initializer_list Strings, Token::Kind Kind) { std::vector Tokens; - for (const auto &Symbols : Trigrams) { - Tokens.push_back(Token(Token::Kind::Trigram, Symbols)); + for (const auto &TokenData : Strings) { + Tokens.push_back(Token(Kind, TokenData)); } return testing::UnorderedElementsAreArray(Tokens); } -TEST(DexIndexTrigrams, IdentifierTrigrams) { +testing::Matcher> +trigramsAre(std::initializer_list Trigrams) { + return tokensAre(Trigrams, Token::Kind::Trigram); +} + +testing::Matcher> +pathsAre(std::initializer_list Paths) { + return tokensAre(Paths, Token::Kind::Path); +} + +testing::Matcher>> proximityPathsAre( + std::initializer_list> ProximityPaths) { + std::vector> Result; + for (const auto &P : ProximityPaths) { + Result.push_back({Token(Token::Kind::Path, P.first), P.second}); + } + return testing::UnorderedElementsAreArray(Result); +} + +TEST(DexSearchTokens, IdentifierTrigrams) { EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86", "x$$", "x8$"})); @@ -374,7 +404,7 @@ "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"})); } -TEST(DexIndexTrigrams, QueryTrigrams) { +TEST(DexSearchTokens, QueryTrigrams) { EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); @@ -407,9 +437,31 @@ "hij", "ijk", "jkl", "klm"})); } +TEST(DexSearchTokens, SymbolPath) { + EXPECT_THAT(generateProximityPaths( + "file:///clang-tools-extra/clangd/index/dex/Token.h"), + pathsAre({"file:///clang-tools-extra/clangd/index/dex/", + "file:///clang-tools-extra/clangd/index/", + "file:///clang-tools-extra/clangd/", + "file:///clang-tools-extra/", "file:///"})); +} + +TEST(DexSearchTokens, QueryProximityPaths) { + EXPECT_THAT(generateQueryProximityPaths("/a/b/c/d/e/f/g.h", URISchemes), + proximityPathsAre({{"file:///a/b/c/d/e/f/", 6.}, + {"file:///a/b/c/d/e/", 5.}, + {"file:///a/b/c/d/", 4.}, + {"file:///a/b/c/", 3.}, + {"file:///a/b/", 2.}})); +} + +//===----------------------------------------------------------------------===// +// Index tests. +//===----------------------------------------------------------------------===// + TEST(DexIndex, Lookup) { DexIndex I; - I.build(generateSymbols({"ns::abc", "ns::xyz"})); + I.build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), UnorderedElementsAre("ns::abc", "ns::xyz")); @@ -421,7 +473,8 @@ TEST(DexIndex, FuzzyFind) { DexIndex Index; Index.build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", - "other::ABC", "other::A"})); + "other::ABC", "other::A"}), + URISchemes); FuzzyFindRequest Req; Req.Query = "ABC"; Req.Scopes = {"ns::"}; @@ -444,7 +497,8 @@ TEST(DexIndexTest, FuzzyMatchQ) { DexIndex I; I.build( - generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); + generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), + URISchemes); FuzzyFindRequest Req; Req.Query = "lol"; Req.MaxCandidateCount = 2; @@ -455,14 +509,14 @@ TEST(DexIndexTest, DexIndexSymbolsRecycled) { DexIndex I; std::weak_ptr Symbols; - I.build(generateNumSymbols(0, 10, &Symbols)); + I.build(generateNumSymbols(0, 10, &Symbols), URISchemes); FuzzyFindRequest Req; Req.Query = "7"; EXPECT_THAT(match(I, Req), UnorderedElementsAre("7")); EXPECT_FALSE(Symbols.expired()); // Release old symbols. - I.build(generateNumSymbols(0, 0)); + I.build(generateNumSymbols(0, 0), URISchemes); EXPECT_TRUE(Symbols.expired()); } @@ -483,14 +537,14 @@ FuzzyFindRequest Req; Req.Query = "7"; DexIndex I; - I.build(std::move(Symbols)); + I.build(std::move(Symbols), URISchemes); auto Matches = match(I, Req); EXPECT_EQ(Matches.size(), 4u); } TEST(DexIndexTest, DexIndexLimitedNumMatches) { DexIndex I; - I.build(generateNumSymbols(0, 100)); + I.build(generateNumSymbols(0, 100), URISchemes); FuzzyFindRequest Req; Req.Query = "5"; Req.MaxCandidateCount = 3; @@ -503,7 +557,8 @@ TEST(DexIndexTest, FuzzyMatch) { DexIndex I; I.build( - generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); + generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), + URISchemes); FuzzyFindRequest Req; Req.Query = "lol"; Req.MaxCandidateCount = 2; @@ -513,7 +568,7 @@ TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) { DexIndex I; - I.build(generateSymbols({"a::y1", "b::y2", "y3"})); + I.build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); FuzzyFindRequest Req; Req.Query = "y"; EXPECT_THAT(match(I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); @@ -521,7 +576,7 @@ TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) { DexIndex I; - I.build(generateSymbols({"a::y1", "b::y2", "y3"})); + I.build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {""}; @@ -530,7 +585,8 @@ TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) { DexIndex I; - I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"})); + I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), + URISchemes); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::"}; @@ -539,7 +595,8 @@ TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) { DexIndex I; - I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"})); + I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), + URISchemes); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::", "b::"}; @@ -548,7 +605,7 @@ TEST(DexIndexTest, NoMatchNestedScopes) { DexIndex I; - I.build(generateSymbols({"a::y1", "a::b::y2"})); + I.build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes); FuzzyFindRequest Req; Req.Query = "y"; Req.Scopes = {"a::"}; @@ -557,7 +614,7 @@ TEST(DexIndexTest, IgnoreCases) { DexIndex I; - I.build(generateSymbols({"ns::ABC", "ns::abc"})); + I.build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes); FuzzyFindRequest Req; Req.Query = "AB"; Req.Scopes = {"ns::"}; @@ -566,7 +623,7 @@ TEST(DexIndexTest, Lookup) { DexIndex I; - I.build(generateSymbols({"ns::abc", "ns::xyz"})); + I.build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), UnorderedElementsAre("ns::abc", "ns::xyz")); @@ -575,6 +632,52 @@ EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre()); } +TEST(DexIndexTest, ProximityPathsBoosting) { + DexIndex I; + + // All strings should be stored in local variables to ensure their lifetime is + // the same as Slab's. + std::string RootFileURI = "file::///file.h"; + std::string CloseFileURI = "file:///a/b/c/d/e/file.h"; + + // Populate Symbol Slab + std::vector Slab; + // Symbol in the root will perfectly match the request, but its canonical + // declaration is in the root directory. + Symbol RootSymbol; + std::string RootSymbolName = "abc"; + RootSymbol.Name = RootSymbolName; + RootSymbol.CanonicalDeclaration.FileURI = RootFileURI; + Slab.push_back(&RootSymbol); + + // Another symbol has much lower Fuzzy Matching Score, but it is very close to + // the directory the request would come from. + Symbol CloseSymbol; + std::string CloseSymbolName = "AbsolutelyBizzareCoincidenceMatch"; + CloseSymbol.Name = CloseSymbolName; + CloseSymbol.CanonicalDeclaration.FileURI = CloseFileURI; + + // Insert duplicates so that only the boosted symbols end up in the scoring + // stage. + for (size_t Index = 0; Index < I.getScoringItemsMultiplier(); ++Index) + Slab.push_back(&CloseSymbol); + + I.build(std::make_shared>(Slab), URISchemes); + + + FuzzyFindRequest Req; + Req.Query = "abc"; + Req.MaxCandidateCount = 1; + // FuzzyFind request comes from the file which is far from the root. + Req.ProximityPaths.push_back("/a/b/c/d/e/f/file.h"); + + FuzzyMatcher Matcher(Req.Query); + EXPECT_TRUE(Matcher.match(RootSymbolName) > Matcher.match(CloseSymbolName)); + + EXPECT_THAT(match(I, Req), + UnorderedElementsAre("AbsolutelyBizzareCoincidenceMatch")); +} + } // namespace } // namespace dex } // namespace clangd