Index: clang-tools-extra/clangd/CMakeLists.txt =================================================================== --- clang-tools-extra/clangd/CMakeLists.txt +++ clang-tools-extra/clangd/CMakeLists.txt @@ -46,6 +46,7 @@ index/dex/DexIndex.cpp index/dex/Iterator.cpp + index/dex/Token.cpp index/dex/Trigram.cpp LINK_LIBS Index: clang-tools-extra/clangd/index/dex/DexIndex.h =================================================================== --- clang-tools-extra/clangd/index/dex/DexIndex.h +++ clang-tools-extra/clangd/index/dex/DexIndex.h @@ -39,12 +39,15 @@ // index on disk and then load it if available. class DexIndex : public SymbolIndex { public: + DexIndex(llvm::ArrayRef URISchemes) : URISchemes(URISchemes) {} + /// \brief (Re-)Build index for `Symbols`. All symbol pointers must remain /// accessible as long as `Symbols` is kept alive. - void build(std::shared_ptr> Syms); + void build(std::shared_ptr> Symbols); /// \brief Build index from a symbol slab. - static std::unique_ptr build(SymbolSlab Slab); + static std::unique_ptr + build(SymbolSlab Slab, llvm::ArrayRef URISchemes); bool fuzzyFind(const FuzzyFindRequest &Req, @@ -60,7 +63,6 @@ size_t estimateMemoryUsage() const override; private: - mutable std::mutex Mutex; std::shared_ptr> Symbols /*GUARDED_BY(Mutex)*/; @@ -73,10 +75,12 @@ // Inverted index is used to retrieve posting lists which are processed during // the fuzzyFind process. llvm::DenseMap InvertedIndex /*GUARDED_BY(Mutex)*/; + + const std::vector URISchemes; }; } // namespace dex } // namespace clangd } // namespace clang -#endif +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H Index: clang-tools-extra/clangd/index/dex/DexIndex.cpp =================================================================== --- clang-tools-extra/clangd/index/dex/DexIndex.cpp +++ clang-tools-extra/clangd/index/dex/DexIndex.cpp @@ -26,11 +26,14 @@ // Returns the tokens which are given symbols's characteristics. For example, // trigrams and scopes. // FIXME(kbobyrev): Support more token types: -// * Path proximity // * Types std::vector generateSearchTokens(const Symbol &Sym) { std::vector Result = generateIdentifierTrigrams(Sym.Name); Result.emplace_back(Token::Kind::Scope, Sym.Scope); + for (const auto &PathToken : + generateProximityPaths(Sym.CanonicalDeclaration.FileURI)) { + Result.push_back(PathToken); + } return Result; } @@ -72,8 +75,9 @@ estimateMemoryUsage()); } -std::unique_ptr DexIndex::build(SymbolSlab Slab) { - auto Idx = llvm::make_unique(); +std::unique_ptr +DexIndex::build(SymbolSlab Slab, llvm::ArrayRef URISchemes) { + auto Idx = llvm::make_unique(URISchemes); Idx->build(getSymbolsFromSlab(std::move(Slab))); return std::move(Idx); } @@ -117,6 +121,28 @@ if (!ScopeIterators.empty()) TopLevelChildren.push_back(createOr(move(ScopeIterators))); + // Add proximity paths boosting. + std::vector> BoostingIterators; + // This should generate proximity path boosting iterators for each different + // URI Scheme the Index is aware of. + const size_t PARENTS_TO_BOOST = 5; + for (const auto &ProximityPath : Req.ProximityPaths) { + for (const auto &P : generateQueryProximityPaths( + ProximityPath, URISchemes, PARENTS_TO_BOOST)) { + const auto It = InvertedIndex.find(P.first); + if (It != InvertedIndex.end()) + // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. + // Each parent directory is boosted by PARENTS_TO_BOOST + 1 - Level. + BoostingIterators.push_back( + createBoost(create(It->second), PARENTS_TO_BOOST + 1 - P.second)); + } + } + // Boosting iterators do not actually filter symbols. In order to preserve + // the validity of resulting query, TRUE iterator should be added along + // BOOSTs. + BoostingIterators.push_back(createTrue(Symbols->size())); + TopLevelChildren.push_back(createOr(move(BoostingIterators))); + // Use TRUE iterator if both trigrams and scopes from the query are not // present in the symbol index. auto QueryIterator = TopLevelChildren.empty() @@ -127,16 +153,30 @@ // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as // using 100x of the requested number might not be good in practice, e.g. // when the requested number of items is small. - const unsigned ItemsToRetrieve = 100 * Req.MaxCandidateCount; + const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount; auto Root = createLimit(move(QueryIterator), ItemsToRetrieve); - // FIXME(kbobyrev): Add boosting to the query and utilize retrieved - // boosting scores. + std::vector> SymbolDocIDs = consume(*Root); + // Sort items using boosting score as the key. + // FIXME(kbobyrev): Consider merging this stage with the next one? This is + // a performance/quality trade-off and if the performance is not an issue + // there's plenty of quality to be bought by sacrificing part of + // performance. + std::sort(begin(SymbolDocIDs), end(SymbolDocIDs), + [&](const std::pair &LHS, + const std::pair &RHS) { + return SymbolQuality.find((*Symbols)[LHS.first])->second * + LHS.second > + SymbolQuality.find((*Symbols)[RHS.first])->second * + RHS.second; + }); + // Retrieve top Req.MaxCandidateCount items. + const size_t ItemsToScore = 10 * Req.MaxCandidateCount; std::priority_queue> Top; - for (const auto &P : SymbolDocIDs) { - const DocID SymbolDocID = P.first; + for (size_t I = 0; I < std::min(SymbolDocIDs.size(), ItemsToScore); ++I) { + const DocID SymbolDocID = SymbolDocIDs[I].first; const auto *Sym = (*Symbols)[SymbolDocID]; const llvm::Optional Score = Filter.match(Sym->Name); if (!Score) Index: clang-tools-extra/clangd/index/dex/Token.h =================================================================== --- clang-tools-extra/clangd/index/dex/Token.h +++ clang-tools-extra/clangd/index/dex/Token.h @@ -41,6 +41,8 @@ /// Kind specifies Token type which defines semantics for the internal /// representation. Each Kind has different representation stored in Data /// field. + // FIXME(kbobyrev): Storing Data hash would be more efficient than storing raw + // strings. enum class Kind { /// Represents trigram used for fuzzy search of unqualified symbol names. /// @@ -48,15 +50,20 @@ Trigram, /// Scope primitives, e.g. "symbol belongs to namespace foo::bar". /// - /// Data stroes full scope name , e.g. "foo::bar::baz::" or "" (for global + /// Data stroes full scope name, e.g. "foo::bar::baz::" or "" (for global /// scope). Scope, + /// Path to symbol declaration. + /// + /// Data stores URI the parent directory of symbol declaration location. + /// + /// Example: + /// "file:path/to/clang-tools-extra/clangd/index/dex/". + Path, /// Internal Token type for invalid/special tokens, e.g. empty tokens for /// llvm::DenseMap. Sentinel, /// FIXME(kbobyrev): Add other Token Kinds - /// * Path with full or relative path to the directory in which symbol is - /// defined /// * Type with qualified type name or its USR }; @@ -81,6 +88,21 @@ } }; +/// Returns Search Token for each parent directory of given Path. Should be used +/// within the index build process. +/// +/// When Limit is set, returns no more than Limit tokens. +std::vector +generateProximityPaths(llvm::StringRef Path, + size_t Limit = std::numeric_limits::max()); + +/// Returns Search Token and its distance from the absolute path for the first +/// URI scheme allows valid conversions from AbsolutePath. +std::vector> +generateQueryProximityPaths(llvm::StringRef AbsolutePath, + llvm::ArrayRef URISchemes, + size_t Count = 5); + } // namespace dex } // namespace clangd } // namespace clang Index: clang-tools-extra/clangd/index/dex/Token.cpp =================================================================== --- /dev/null +++ clang-tools-extra/clangd/index/dex/Token.cpp @@ -0,0 +1,66 @@ +//===--- Token.cpp - Symbol Search primitive --------------------*- C++ -*-===// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Token.h" +#include "URI.h" +#include "llvm/Support/Path.h" +#include + +namespace clang { +namespace clangd { +namespace dex { + +std::vector generateProximityPaths(llvm::StringRef URIPath, + size_t Limit) { + std::vector Result; + // Don't generate any tokens for empty paths. + if (URIPath.empty()) + return Result; + auto ParsedURI = URI::parse(URIPath); + assert( + static_cast(ParsedURI) && + "Non-empty argument of generateProximityPaths() should be a valid URI."); + const auto Scheme = ParsedURI.get().scheme(); + const auto Authority = ParsedURI.get().authority(); + StringRef Path = ParsedURI.get().body(); + while (!Path.empty() && Limit--) { + URI TokenURI(Scheme, Authority, Path); + Result.emplace_back(Token::Kind::Path, TokenURI.toString()); + Path = llvm::sys::path::parent_path(Path); + } + return Result; +} + +std::vector> +generateQueryProximityPaths(llvm::StringRef AbsolutePath, + llvm::ArrayRef URISchemes, + size_t Count) { + std::vector> Result; + for (const auto Scheme : URISchemes) { + unsigned Distance = 0; + auto URI = URI::create(AbsolutePath, Scheme); + // For some paths, conversion to different URI schemes is impossible. These + // should be just skipped. + if (!static_cast(URI)) { + // Ignore the error. + handleAllErrors(URI.takeError(), [](const llvm::ErrorInfoBase &) {}); + continue; + } + for (const auto ProximityPath : + generateProximityPaths(URI.get().toString(), Count)) + Result.emplace_back(ProximityPath, Distance++); + // As soon as one valid URI scheme for given absolute path is found, the + // token generation process should be stopped. + break; + } + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clang-tools-extra/clangd/tool/ClangdMain.cpp =================================================================== --- clang-tools-extra/clangd/tool/ClangdMain.cpp +++ clang-tools-extra/clangd/tool/ClangdMain.cpp @@ -42,7 +42,9 @@ // Build an in-memory static index for global symbols from a YAML-format file. // The size of global symbols should be relatively small, so that all symbols // can be managed in memory. -std::unique_ptr buildStaticIndex(llvm::StringRef YamlSymbolFile) { +std::unique_ptr +buildStaticIndex(llvm::StringRef YamlSymbolFile, + std::vector &URISchemes) { auto Buffer = llvm::MemoryBuffer::getFile(YamlSymbolFile); if (!Buffer) { llvm::errs() << "Can't open " << YamlSymbolFile << "\n"; @@ -53,8 +55,9 @@ for (auto Sym : Slab) SymsBuilder.insert(Sym); - return UseDex ? dex::DexIndex::build(std::move(SymsBuilder).build()) - : MemIndex::build(std::move(SymsBuilder).build()); + return UseDex + ? dex::DexIndex::build(std::move(SymsBuilder).build(), URISchemes) + : MemIndex::build(std::move(SymsBuilder).build()); } } // namespace @@ -298,7 +301,7 @@ Opts.BuildDynamicSymbolIndex = EnableIndex; std::unique_ptr StaticIdx; if (EnableIndex && !YamlSymbolFile.empty()) { - StaticIdx = buildStaticIndex(YamlSymbolFile); + StaticIdx = buildStaticIndex(YamlSymbolFile, Opts.URISchemes); Opts.StaticIndex = StaticIdx.get(); } Opts.AsyncThreadsCount = WorkerThreadsCount; Index: clang-tools-extra/unittests/clangd/DexIndexTests.cpp =================================================================== --- clang-tools-extra/unittests/clangd/DexIndexTests.cpp +++ clang-tools-extra/unittests/clangd/DexIndexTests.cpp @@ -7,6 +7,8 @@ // //===----------------------------------------------------------------------===// +#include "FuzzyMatch.h" +#include "TestFS.h" #include "TestIndex.h" #include "index/Index.h" #include "index/Merge.h" @@ -29,6 +31,8 @@ namespace dex { namespace { +std::vector URISchemes = {"unittest"}; + std::vector consumeIDs(Iterator &It) { auto IDAndScore = consume(It); std::vector IDs(IDAndScore.size()); @@ -325,14 +329,33 @@ } testing::Matcher> -trigramsAre(std::initializer_list Trigrams) { +tokensAre(std::initializer_list Strings, Token::Kind Kind) { std::vector Tokens; - for (const auto &Symbols : Trigrams) { - Tokens.push_back(Token(Token::Kind::Trigram, Symbols)); + for (const auto &TokenData : Strings) { + Tokens.push_back(Token(Kind, TokenData)); } return testing::UnorderedElementsAreArray(Tokens); } +testing::Matcher> +trigramsAre(std::initializer_list Trigrams) { + return tokensAre(Trigrams, Token::Kind::Trigram); +} + +testing::Matcher> +pathsAre(std::initializer_list Paths) { + return tokensAre(Paths, Token::Kind::Path); +} + +testing::Matcher>> proximityPathsAre( + std::initializer_list> ProximityPaths) { + std::vector> Result; + for (const auto &P : ProximityPaths) { + Result.push_back({Token(Token::Kind::Path, P.first), P.second}); + } + return testing::UnorderedElementsAreArray(Result); +} + TEST(DexIndexTrigrams, IdentifierTrigrams) { EXPECT_THAT(generateIdentifierTrigrams("X86"), trigramsAre({"x86", "x$$", "x8$"})); @@ -407,8 +430,28 @@ "hij", "ijk", "jkl", "klm"})); } +TEST(DexSearchTokens, SymbolPath) { + EXPECT_THAT(generateProximityPaths( + "unittest:///clang-tools-extra/clangd/index/dex/Token.h"), + pathsAre({"unittest:///clang-tools-extra/clangd/index/dex/", + "unittest:///clang-tools-extra/clangd/index/", + "unittest:///clang-tools-extra/clangd/", + "unittest:///clang-tools-extra/", + "unittest:///"})); +} + +TEST(DexSearchTokens, QueryProximityDistances) { + EXPECT_THAT( + generateQueryProximityPaths(testRoot() + "/a/b/c/d/e/f/g.h", URISchemes), + proximityPathsAre({{"unittest:///a/b/c/d/e/f/", 0}, + {"unittest:///a/b/c/d/e/", 1}, + {"unittest:///a/b/c/d/", 2}, + {"unittest:///a/b/c/", 3}, + {"unittest:///a/b/", 4}})); +} + TEST(DexIndex, Lookup) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"ns::abc", "ns::xyz"})); EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), @@ -419,9 +462,10 @@ } TEST(DexIndex, FuzzyFind) { - DexIndex Index; + DexIndex Index(URISchemes); Index.build(generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", - "other::ABC", "other::A"})); + "other::ABC", "other::A"}) + ); FuzzyFindRequest Req; Req.Query = "ABC"; Req.Scopes = {"ns::"}; @@ -442,7 +486,7 @@ } TEST(DexIndexTest, FuzzyMatchQ) { - DexIndex I; + DexIndex I(URISchemes); I.build( generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); FuzzyFindRequest Req; @@ -453,7 +497,7 @@ } TEST(DexIndexTest, DexIndexSymbolsRecycled) { - DexIndex I; + DexIndex I(URISchemes); std::weak_ptr Symbols; I.build(generateNumSymbols(0, 10, &Symbols)); FuzzyFindRequest Req; @@ -482,14 +526,14 @@ FuzzyFindRequest Req; Req.Query = "7"; - DexIndex I; + DexIndex I(URISchemes); I.build(std::move(Symbols)); auto Matches = match(I, Req); EXPECT_EQ(Matches.size(), 4u); } TEST(DexIndexTest, DexIndexLimitedNumMatches) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateNumSymbols(0, 100)); FuzzyFindRequest Req; Req.Query = "5"; @@ -501,7 +545,7 @@ } TEST(DexIndexTest, FuzzyMatch) { - DexIndex I; + DexIndex I(URISchemes); I.build( generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"})); FuzzyFindRequest Req; @@ -512,7 +556,7 @@ } TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; @@ -520,7 +564,7 @@ } TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; @@ -529,7 +573,7 @@ } TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; @@ -538,7 +582,7 @@ } TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"})); FuzzyFindRequest Req; Req.Query = "y"; @@ -547,7 +591,7 @@ } TEST(DexIndexTest, NoMatchNestedScopes) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"a::y1", "a::b::y2"})); FuzzyFindRequest Req; Req.Query = "y"; @@ -556,7 +600,7 @@ } TEST(DexIndexTest, IgnoreCases) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"ns::ABC", "ns::abc"})); FuzzyFindRequest Req; Req.Query = "AB"; @@ -565,7 +609,7 @@ } TEST(DexIndexTest, Lookup) { - DexIndex I; + DexIndex I(URISchemes); I.build(generateSymbols({"ns::abc", "ns::xyz"})); EXPECT_THAT(lookup(I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); EXPECT_THAT(lookup(I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), @@ -575,6 +619,58 @@ EXPECT_THAT(lookup(I, SymbolID("ns::nonono")), UnorderedElementsAre()); } +TEST(DexIndexTest, ProximityPathsBoosting) { + DexIndex I(URISchemes); + + // All strings should be stored in local variables to ensure their lifetime is + // the same as Slab's. + auto RootURI = URI::create(testRoot() + "/file.h", "unittest"); + assert(static_cast(RootURI)); + auto CloseFileURI = URI::create(testRoot() + "/a/b/c/d/e/file.h", "unittest"); + assert(static_cast(CloseFileURI)); + + // Populate Symbol Slab + std::vector Slab; + // Symbol in the root will perfectly match the request, but its canonical + // declaration is in the root directory. + Symbol RootSymbol; + std::string RootSymbolName = "abc"; + RootSymbol.Name = RootSymbolName; + std::string RootFilename = RootURI->toString(); + RootSymbol.CanonicalDeclaration.FileURI = RootFilename; + Slab.push_back(&RootSymbol); + + // Another symbol has much lower Fuzzy Matching Score, but it is very close to + // the directory the request would come from. + Symbol CloseSymbol; + std::string CloseSymbolName = "AbsolutelyBizzareCoincidenceMatch"; + CloseSymbol.Name = CloseSymbolName; + std::string CloseFileName = CloseFileURI->toString(); + CloseSymbol.CanonicalDeclaration.FileURI = CloseFileName; + + FuzzyFindRequest Req; + Req.Query = "abc"; + Req.MaxCandidateCount = 1; + // FuzzyFind request comes from the file which is far from the root. + Req.ProximityPaths.push_back(testRoot() + "/a/b/c/d/e/f/file.h"); + + FuzzyMatcher Matcher(Req.Query); + EXPECT_TRUE(Matcher.match(RootSymbolName) > Matcher.match(CloseSymbolName)); + + // Insert duplicates so that only the boosted symbols end up in the scoring + // stage. + // FIXME(kbobyrev): 10 is the number of items passed to the last stage which + // involves fuzzy scoring. Would having access to that information be better + // here? + for (size_t Index = 0; Index < 10; ++Index) + Slab.push_back(&CloseSymbol); + + I.build(std::make_shared>(Slab)); + + EXPECT_THAT(match(I, Req), + UnorderedElementsAre("AbsolutelyBizzareCoincidenceMatch")); +} + } // namespace } // namespace dex } // namespace clangd Index: clang-tools-extra/unittests/clangd/TestFS.h =================================================================== --- clang-tools-extra/unittests/clangd/TestFS.h +++ clang-tools-extra/unittests/clangd/TestFS.h @@ -59,7 +59,7 @@ }; // Returns an absolute (fake) test directory for this OS. -const char *testRoot(); +std::string testRoot(); // Returns a suitable absolute path for this OS. std::string testPath(PathRef File); Index: clang-tools-extra/unittests/clangd/TestFS.cpp =================================================================== --- clang-tools-extra/unittests/clangd/TestFS.cpp +++ clang-tools-extra/unittests/clangd/TestFS.cpp @@ -64,7 +64,7 @@ FileName, std::move(CommandLine), "")}; } -const char *testRoot() { +std::string testRoot() { #ifdef _WIN32 return "C:\\clangd-test"; #else