Index: clangd/CMakeLists.txt =================================================================== --- clangd/CMakeLists.txt +++ clangd/CMakeLists.txt @@ -46,7 +46,7 @@ index/SymbolCollector.cpp index/SymbolYAML.cpp - index/dex/DexIndex.cpp + index/dex/Dex.cpp index/dex/Iterator.cpp index/dex/Trigram.cpp Index: clangd/index/SymbolYAML.cpp =================================================================== --- clangd/index/SymbolYAML.cpp +++ clangd/index/SymbolYAML.cpp @@ -11,7 +11,7 @@ #include "Index.h" #include "Serialization.h" #include "Trace.h" -#include "dex/DexIndex.h" +#include "dex/Dex.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/Errc.h" @@ -225,7 +225,7 @@ if (!Slab) return nullptr; trace::Span Tracer("BuildIndex"); - return UseDex ? dex::DexIndex::build(std::move(*Slab), URISchemes) + return UseDex ? dex::Dex::build(std::move(*Slab), URISchemes) : MemIndex::build(std::move(*Slab), RefSlab()); } Index: clangd/index/dex/Dex.h =================================================================== --- clangd/index/dex/Dex.h +++ clangd/index/dex/Dex.h @@ -0,0 +1,113 @@ +//===--- Dex.h - Dex Symbol Index Implementation ----------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This defines Dex - a symbol index implementation based on query iterators +// over symbol tokens, such as fuzzy matching trigrams, scopes, types, etc. +// While consuming more memory and having longer build stage due to +// preprocessing, Dex will have substantially lower latency. It will also allow +// efficient symbol searching which is crucial for operations like code +// completion, and can be very important for a number of different code +// transformations which will be eventually supported by Clangd. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEX_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEX_H + +#include "Iterator.h" +#include "Token.h" +#include "Trigram.h" +#include "index/Index.h" +#include "index/MemIndex.h" +#include "index/SymbolCollector.h" + +namespace clang { +namespace clangd { +namespace dex { + +/// In-memory Dex trigram-based index implementation. +// FIXME(kbobyrev): Introduce serialization and deserialization of the symbol +// index so that it can be loaded from the disk. Since static index is not +// changed frequently, it's safe to assume that it has to be built only once +// (when the clangd process starts). Therefore, it can be easier to store built +// index on disk and then load it if available. +class Dex : public SymbolIndex { +public: + // All symbols must outlive this index. + template + Dex(Range &&Symbols, llvm::ArrayRef Schemes) + : URISchemes(Schemes) { + // If Schemes don't contain any items, fall back to SymbolCollector's + // default URI schemes. + if (URISchemes.empty()) { + SymbolCollector::Options Opts; + URISchemes = Opts.URISchemes; + } + for (auto &&Sym : Symbols) + this->Symbols.push_back(&Sym); + buildIndex(); + } + // Symbols are owned by BackingData, Index takes ownership. + template + Dex(Range &&Symbols, Payload &&BackingData, + llvm::ArrayRef URISchemes) + : Dex(std::forward(Symbols), URISchemes) { + KeepAlive = std::shared_ptr( + std::make_shared(std::move(BackingData)), nullptr); + } + + /// Builds an index from a slab. The index takes ownership of the slab. + static std::unique_ptr + build(SymbolSlab Slab, llvm::ArrayRef URISchemes) { + return llvm::make_unique(Slab, std::move(Slab), URISchemes); + } + + bool + fuzzyFind(const FuzzyFindRequest &Req, + llvm::function_ref Callback) const override; + + void lookup(const LookupRequest &Req, + llvm::function_ref Callback) const override; + + void refs(const RefsRequest &Req, + llvm::function_ref Callback) const override; + + size_t estimateMemoryUsage() const override; + +private: + void buildIndex(); + + /// Stores symbols sorted in the descending order of symbol quality.. + std::vector Symbols; + /// SymbolQuality[I] is the quality of Symbols[I]. + std::vector SymbolQuality; + llvm::DenseMap LookupTable; + /// Inverted index is a mapping from the search token to the posting list, + /// which contains all items which can be characterized by such search token. + /// For example, if the search token is scope "std::", the corresponding + /// posting list would contain all indices of symbols defined in namespace + /// std. Inverted index is used to retrieve posting lists which are processed + /// during the fuzzyFind process. + llvm::DenseMap InvertedIndex; + std::shared_ptr KeepAlive; // poor man's move-only std::any + + std::vector URISchemes; +}; + +/// Returns Search Token for a number of parent directories of given Path. +/// Should be used within the index build process. +/// +/// This function is exposed for testing only. +std::vector generateProximityURIs(llvm::StringRef URIPath); + +} // namespace dex +} // namespace clangd +} // namespace clang + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEX_H Index: clangd/index/dex/Dex.cpp =================================================================== --- clangd/index/dex/Dex.cpp +++ clangd/index/dex/Dex.cpp @@ -0,0 +1,270 @@ +//===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Dex.h" +#include "FileDistance.h" +#include "FuzzyMatch.h" +#include "Logger.h" +#include "Quality.h" +#include "llvm/ADT/StringSet.h" +#include +#include + +namespace clang { +namespace clangd { +namespace dex { + +namespace { + +// Returns the tokens which are given symbol's characteristics. Currently, the +// generated tokens only contain fuzzy matching trigrams and symbol's scope, +// but in the future this will also return path proximity tokens and other +// types of tokens such as symbol type (if applicable). +// Returns the tokens which are given symbols's characteristics. For example, +// trigrams and scopes. +// FIXME(kbobyrev): Support more token types: +// * Types +// * Namespace proximity +std::vector generateSearchTokens(const Symbol &Sym) { + std::vector Result = generateIdentifierTrigrams(Sym.Name); + Result.emplace_back(Token::Kind::Scope, Sym.Scope); + // Skip token generation for symbols with unknown declaration location. + if (!Sym.CanonicalDeclaration.FileURI.empty()) + for (const auto &ProximityURI : + generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) + Result.emplace_back(Token::Kind::ProximityURI, ProximityURI); + return Result; +} + +// Constructs BOOST iterators for Path Proximities. +std::vector> createFileProximityIterators( + llvm::ArrayRef ProximityPaths, + llvm::ArrayRef URISchemes, + const llvm::DenseMap &InvertedIndex) { + std::vector> BoostingIterators; + // Deduplicate parent URIs extracted from the ProximityPaths. + llvm::StringSet<> ParentURIs; + llvm::StringMap Sources; + for (const auto &Path : ProximityPaths) { + Sources[Path] = SourceParams(); + auto PathURI = URI::create(Path, URISchemes); + if (!PathURI) { + elog("Given ProximityPath {0} is can not be converted to any known URI " + "scheme. fuzzyFind request will ignore it.", + Path); + llvm::consumeError(PathURI.takeError()); + continue; + } + const auto PathProximityURIs = generateProximityURIs(PathURI->toString()); + for (const auto &ProximityURI : PathProximityURIs) + ParentURIs.insert(ProximityURI); + } + // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults + // for all parameters except for Proximity Path distance signal. + SymbolRelevanceSignals PathProximitySignals; + // DistanceCalculator will find the shortest distance from ProximityPaths to + // any URI extracted from the ProximityPaths. + URIDistance DistanceCalculator(Sources); + PathProximitySignals.FileProximityMatch = &DistanceCalculator; + // Try to build BOOST iterator for each Proximity Path provided by + // ProximityPaths. Boosting factor should depend on the distance to the + // Proximity Path: the closer processed path is, the higher boosting factor. + for (const auto &ParentURI : ParentURIs.keys()) { + const auto It = + InvertedIndex.find(Token(Token::Kind::ProximityURI, ParentURI)); + if (It != InvertedIndex.end()) { + // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. + PathProximitySignals.SymbolURI = ParentURI; + BoostingIterators.push_back( + createBoost(create(It->second), PathProximitySignals.evaluate())); + } + } + return BoostingIterators; +} + +} // namespace + +void Dex::buildIndex() { + std::vector> ScoredSymbols(Symbols.size()); + + for (size_t I = 0; I < Symbols.size(); ++I) { + const Symbol *Sym = Symbols[I]; + LookupTable[Sym->ID] = Sym; + ScoredSymbols[I] = {quality(*Sym), Sym}; + } + + // Symbols are sorted by symbol qualities so that items in the posting lists + // are stored in the descending order of symbol quality. + std::sort(begin(ScoredSymbols), end(ScoredSymbols), + std::greater>()); + + // SymbolQuality was empty up until now. + SymbolQuality.resize(Symbols.size()); + // Populate internal storage using Symbol + Score pairs. + for (size_t I = 0; I < ScoredSymbols.size(); ++I) { + SymbolQuality[I] = ScoredSymbols[I].first; + Symbols[I] = ScoredSymbols[I].second; + } + + // Populate TempInvertedIndex with posting lists for index symbols. + for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) { + const auto *Sym = Symbols[SymbolRank]; + for (const auto &Token : generateSearchTokens(*Sym)) + InvertedIndex[Token].push_back(SymbolRank); + } + + vlog("Built Dex with estimated memory usage {0} bytes.", + estimateMemoryUsage()); +} + +/// Constructs iterators over tokens extracted from the query and exhausts it +/// while applying Callback to each symbol in the order of decreasing quality +/// of the matched symbols. +bool Dex::fuzzyFind(const FuzzyFindRequest &Req, + llvm::function_ref Callback) const { + assert(!StringRef(Req.Query).contains("::") && + "There must be no :: in query."); + FuzzyMatcher Filter(Req.Query); + bool More = false; + + std::vector> TopLevelChildren; + const auto TrigramTokens = generateIdentifierTrigrams(Req.Query); + + // Generate query trigrams and construct AND iterator over all query + // trigrams. + std::vector> TrigramIterators; + for (const auto &Trigram : TrigramTokens) { + const auto It = InvertedIndex.find(Trigram); + if (It != InvertedIndex.end()) + TrigramIterators.push_back(create(It->second)); + } + if (!TrigramIterators.empty()) + TopLevelChildren.push_back(createAnd(move(TrigramIterators))); + + // Generate scope tokens for search query. + std::vector> ScopeIterators; + for (const auto &Scope : Req.Scopes) { + const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope)); + if (It != InvertedIndex.end()) + ScopeIterators.push_back(create(It->second)); + } + // Add OR iterator for scopes if there are any Scope Iterators. + if (!ScopeIterators.empty()) + TopLevelChildren.push_back(createOr(move(ScopeIterators))); + + // Add proximity paths boosting. + auto BoostingIterators = createFileProximityIterators( + Req.ProximityPaths, URISchemes, InvertedIndex); + // Boosting iterators do not actually filter symbols. In order to preserve + // the validity of resulting query, TRUE iterator should be added along + // BOOSTs. + if (!BoostingIterators.empty()) { + BoostingIterators.push_back(createTrue(Symbols.size())); + TopLevelChildren.push_back(createOr(move(BoostingIterators))); + } + + // Use TRUE iterator if both trigrams and scopes from the query are not + // present in the symbol index. + auto QueryIterator = TopLevelChildren.empty() + ? createTrue(Symbols.size()) + : createAnd(move(TopLevelChildren)); + // Retrieve more items than it was requested: some of the items with high + // final score might not be retrieved otherwise. + // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as + // using 100x of the requested number might not be good in practice, e.g. + // when the requested number of items is small. + const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount; + auto Root = createLimit(move(QueryIterator), ItemsToRetrieve); + + using IDAndScore = std::pair; + std::vector IDAndScores = consume(*Root); + + auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) { + return LHS.second > RHS.second; + }; + TopN Top(Req.MaxCandidateCount, Compare); + for (const auto &IDAndScore : IDAndScores) { + const DocID SymbolDocID = IDAndScore.first; + const auto *Sym = Symbols[SymbolDocID]; + const llvm::Optional Score = Filter.match(Sym->Name); + if (!Score) + continue; + // Combine Fuzzy Matching score, precomputed symbol quality and boosting + // score for a cumulative final symbol score. + const float FinalScore = + (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second; + // If Top.push(...) returns true, it means that it had to pop an item. In + // this case, it is possible to retrieve more symbols. + if (Top.push({SymbolDocID, FinalScore})) + More = true; + } + + // Apply callback to the top Req.MaxCandidateCount items in the descending + // order of cumulative score. + for (const auto &Item : std::move(Top).items()) + Callback(*Symbols[Item.first]); + return More; +} + +void Dex::lookup(const LookupRequest &Req, + llvm::function_ref Callback) const { + for (const auto &ID : Req.IDs) { + auto I = LookupTable.find(ID); + if (I != LookupTable.end()) + Callback(*I->second); + } +} + +void Dex::refs(const RefsRequest &Req, + llvm::function_ref Callback) const { + log("refs is not implemented."); +} + +size_t Dex::estimateMemoryUsage() const { + size_t Bytes = + LookupTable.size() * sizeof(std::pair); + Bytes += SymbolQuality.size() * sizeof(std::pair); + Bytes += InvertedIndex.size() * sizeof(Token); + + for (const auto &P : InvertedIndex) { + Bytes += P.second.size() * sizeof(DocID); + } + return Bytes; +} + +std::vector generateProximityURIs(llvm::StringRef URIPath) { + std::vector Result; + auto ParsedURI = URI::parse(URIPath); + assert(ParsedURI && + "Non-empty argument of generateProximityURIs() should be a valid " + "URI."); + StringRef Body = ParsedURI->body(); + // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum + // size of resulting vector. Some projects might want to have higher limit if + // the file hierarchy is deeper. For the generic case, it would be useful to + // calculate Limit in the index build stage by calculating the maximum depth + // of the project source tree at runtime. + size_t Limit = 5; + // Insert original URI before the loop: this would save a redundant iteration + // with a URI parse. + Result.emplace_back(ParsedURI->toString()); + while (!Body.empty() && --Limit > 0) { + // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and + // could be optimized. + Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix); + URI TokenURI(ParsedURI->scheme(), ParsedURI->authority(), Body); + if (!Body.empty()) + Result.emplace_back(TokenURI.toString()); + } + return Result; +} + +} // namespace dex +} // namespace clangd +} // namespace clang Index: clangd/index/dex/DexIndex.h =================================================================== --- clangd/index/dex/DexIndex.h +++ clangd/index/dex/DexIndex.h @@ -1,113 +0,0 @@ -//===--- DexIndex.h - Dex Symbol Index Implementation -----------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// This defines Dex - a symbol index implementation based on query iterators -// over symbol tokens, such as fuzzy matching trigrams, scopes, types, etc. -// While consuming more memory and having longer build stage due to -// preprocessing, Dex will have substantially lower latency. It will also allow -// efficient symbol searching which is crucial for operations like code -// completion, and can be very important for a number of different code -// transformations which will be eventually supported by Clangd. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H -#define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H - -#include "Iterator.h" -#include "Token.h" -#include "Trigram.h" -#include "index/Index.h" -#include "index/MemIndex.h" -#include "index/SymbolCollector.h" - -namespace clang { -namespace clangd { -namespace dex { - -/// In-memory Dex trigram-based index implementation. -// FIXME(kbobyrev): Introduce serialization and deserialization of the symbol -// index so that it can be loaded from the disk. Since static index is not -// changed frequently, it's safe to assume that it has to be built only once -// (when the clangd process starts). Therefore, it can be easier to store built -// index on disk and then load it if available. -class DexIndex : public SymbolIndex { -public: - // All symbols must outlive this index. - template - DexIndex(Range &&Symbols, llvm::ArrayRef Schemes) - : URISchemes(Schemes) { - // If Schemes don't contain any items, fall back to SymbolCollector's - // default URI schemes. - if (URISchemes.empty()) { - SymbolCollector::Options Opts; - URISchemes = Opts.URISchemes; - } - for (auto &&Sym : Symbols) - this->Symbols.push_back(&Sym); - buildIndex(); - } - // Symbols are owned by BackingData, Index takes ownership. - template - DexIndex(Range &&Symbols, Payload &&BackingData, - llvm::ArrayRef URISchemes) - : DexIndex(std::forward(Symbols), URISchemes) { - KeepAlive = std::shared_ptr( - std::make_shared(std::move(BackingData)), nullptr); - } - - /// Builds an index from a slab. The index takes ownership of the slab. - static std::unique_ptr - build(SymbolSlab Slab, llvm::ArrayRef URISchemes) { - return llvm::make_unique(Slab, std::move(Slab), URISchemes); - } - - bool - fuzzyFind(const FuzzyFindRequest &Req, - llvm::function_ref Callback) const override; - - void lookup(const LookupRequest &Req, - llvm::function_ref Callback) const override; - - void refs(const RefsRequest &Req, - llvm::function_ref Callback) const override; - - size_t estimateMemoryUsage() const override; - -private: - void buildIndex(); - - /// Stores symbols sorted in the descending order of symbol quality.. - std::vector Symbols; - /// SymbolQuality[I] is the quality of Symbols[I]. - std::vector SymbolQuality; - llvm::DenseMap LookupTable; - /// Inverted index is a mapping from the search token to the posting list, - /// which contains all items which can be characterized by such search token. - /// For example, if the search token is scope "std::", the corresponding - /// posting list would contain all indices of symbols defined in namespace - /// std. Inverted index is used to retrieve posting lists which are processed - /// during the fuzzyFind process. - llvm::DenseMap InvertedIndex; - std::shared_ptr KeepAlive; // poor man's move-only std::any - - std::vector URISchemes; -}; - -/// Returns Search Token for a number of parent directories of given Path. -/// Should be used within the index build process. -/// -/// This function is exposed for testing only. -std::vector generateProximityURIs(llvm::StringRef URIPath); - -} // namespace dex -} // namespace clangd -} // namespace clang - -#endif // LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_DEX_DEXINDEX_H Index: clangd/index/dex/DexIndex.cpp =================================================================== --- clangd/index/dex/DexIndex.cpp +++ clangd/index/dex/DexIndex.cpp @@ -1,271 +0,0 @@ -//===--- DexIndex.cpp - Dex Symbol Index Implementation ---------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "DexIndex.h" -#include "FileDistance.h" -#include "FuzzyMatch.h" -#include "Logger.h" -#include "Quality.h" -#include "llvm/ADT/StringSet.h" -#include -#include - -namespace clang { -namespace clangd { -namespace dex { - -namespace { - -// Returns the tokens which are given symbol's characteristics. Currently, the -// generated tokens only contain fuzzy matching trigrams and symbol's scope, -// but in the future this will also return path proximity tokens and other -// types of tokens such as symbol type (if applicable). -// Returns the tokens which are given symbols's characteristics. For example, -// trigrams and scopes. -// FIXME(kbobyrev): Support more token types: -// * Types -// * Namespace proximity -std::vector generateSearchTokens(const Symbol &Sym) { - std::vector Result = generateIdentifierTrigrams(Sym.Name); - Result.emplace_back(Token::Kind::Scope, Sym.Scope); - // Skip token generation for symbols with unknown declaration location. - if (!Sym.CanonicalDeclaration.FileURI.empty()) - for (const auto &ProximityURI : - generateProximityURIs(Sym.CanonicalDeclaration.FileURI)) - Result.emplace_back(Token::Kind::ProximityURI, ProximityURI); - return Result; -} - -// Constructs BOOST iterators for Path Proximities. -std::vector> createFileProximityIterators( - llvm::ArrayRef ProximityPaths, - llvm::ArrayRef URISchemes, - const llvm::DenseMap &InvertedIndex) { - std::vector> BoostingIterators; - // Deduplicate parent URIs extracted from the ProximityPaths. - llvm::StringSet<> ParentURIs; - llvm::StringMap Sources; - for (const auto &Path : ProximityPaths) { - Sources[Path] = SourceParams(); - auto PathURI = URI::create(Path, URISchemes); - if (!PathURI) { - elog("Given ProximityPath {0} is can not be converted to any known URI " - "scheme. fuzzyFind request will ignore it.", - Path); - llvm::consumeError(PathURI.takeError()); - continue; - } - const auto PathProximityURIs = generateProximityURIs(PathURI->toString()); - for (const auto &ProximityURI : PathProximityURIs) - ParentURIs.insert(ProximityURI); - } - // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults - // for all parameters except for Proximity Path distance signal. - SymbolRelevanceSignals PathProximitySignals; - // DistanceCalculator will find the shortest distance from ProximityPaths to - // any URI extracted from the ProximityPaths. - URIDistance DistanceCalculator(Sources); - PathProximitySignals.FileProximityMatch = &DistanceCalculator; - // Try to build BOOST iterator for each Proximity Path provided by - // ProximityPaths. Boosting factor should depend on the distance to the - // Proximity Path: the closer processed path is, the higher boosting factor. - for (const auto &ParentURI : ParentURIs.keys()) { - const auto It = - InvertedIndex.find(Token(Token::Kind::ProximityURI, ParentURI)); - if (It != InvertedIndex.end()) { - // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator. - PathProximitySignals.SymbolURI = ParentURI; - BoostingIterators.push_back( - createBoost(create(It->second), PathProximitySignals.evaluate())); - } - } - return BoostingIterators; -} - -} // namespace - -void DexIndex::buildIndex() { - std::vector> ScoredSymbols(Symbols.size()); - - for (size_t I = 0; I < Symbols.size(); ++I) { - const Symbol *Sym = Symbols[I]; - LookupTable[Sym->ID] = Sym; - ScoredSymbols[I] = {quality(*Sym), Sym}; - } - - // Symbols are sorted by symbol qualities so that items in the posting lists - // are stored in the descending order of symbol quality. - std::sort(begin(ScoredSymbols), end(ScoredSymbols), - std::greater>()); - - // SymbolQuality was empty up until now. - SymbolQuality.resize(Symbols.size()); - // Populate internal storage using Symbol + Score pairs. - for (size_t I = 0; I < ScoredSymbols.size(); ++I) { - SymbolQuality[I] = ScoredSymbols[I].first; - Symbols[I] = ScoredSymbols[I].second; - } - - // Populate TempInvertedIndex with posting lists for index symbols. - for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank) { - const auto *Sym = Symbols[SymbolRank]; - for (const auto &Token : generateSearchTokens(*Sym)) - InvertedIndex[Token].push_back(SymbolRank); - } - - vlog("Built DexIndex with estimated memory usage {0} bytes.", - estimateMemoryUsage()); -} - -/// Constructs iterators over tokens extracted from the query and exhausts it -/// while applying Callback to each symbol in the order of decreasing quality -/// of the matched symbols. -bool DexIndex::fuzzyFind( - const FuzzyFindRequest &Req, - llvm::function_ref Callback) const { - assert(!StringRef(Req.Query).contains("::") && - "There must be no :: in query."); - FuzzyMatcher Filter(Req.Query); - bool More = false; - - std::vector> TopLevelChildren; - const auto TrigramTokens = generateIdentifierTrigrams(Req.Query); - - // Generate query trigrams and construct AND iterator over all query - // trigrams. - std::vector> TrigramIterators; - for (const auto &Trigram : TrigramTokens) { - const auto It = InvertedIndex.find(Trigram); - if (It != InvertedIndex.end()) - TrigramIterators.push_back(create(It->second)); - } - if (!TrigramIterators.empty()) - TopLevelChildren.push_back(createAnd(move(TrigramIterators))); - - // Generate scope tokens for search query. - std::vector> ScopeIterators; - for (const auto &Scope : Req.Scopes) { - const auto It = InvertedIndex.find(Token(Token::Kind::Scope, Scope)); - if (It != InvertedIndex.end()) - ScopeIterators.push_back(create(It->second)); - } - // Add OR iterator for scopes if there are any Scope Iterators. - if (!ScopeIterators.empty()) - TopLevelChildren.push_back(createOr(move(ScopeIterators))); - - // Add proximity paths boosting. - auto BoostingIterators = createFileProximityIterators( - Req.ProximityPaths, URISchemes, InvertedIndex); - // Boosting iterators do not actually filter symbols. In order to preserve - // the validity of resulting query, TRUE iterator should be added along - // BOOSTs. - if (!BoostingIterators.empty()) { - BoostingIterators.push_back(createTrue(Symbols.size())); - TopLevelChildren.push_back(createOr(move(BoostingIterators))); - } - - // Use TRUE iterator if both trigrams and scopes from the query are not - // present in the symbol index. - auto QueryIterator = TopLevelChildren.empty() - ? createTrue(Symbols.size()) - : createAnd(move(TopLevelChildren)); - // Retrieve more items than it was requested: some of the items with high - // final score might not be retrieved otherwise. - // FIXME(kbobyrev): Pre-scoring retrieval threshold should be adjusted as - // using 100x of the requested number might not be good in practice, e.g. - // when the requested number of items is small. - const size_t ItemsToRetrieve = 100 * Req.MaxCandidateCount; - auto Root = createLimit(move(QueryIterator), ItemsToRetrieve); - - using IDAndScore = std::pair; - std::vector IDAndScores = consume(*Root); - - auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) { - return LHS.second > RHS.second; - }; - TopN Top(Req.MaxCandidateCount, Compare); - for (const auto &IDAndScore : IDAndScores) { - const DocID SymbolDocID = IDAndScore.first; - const auto *Sym = Symbols[SymbolDocID]; - const llvm::Optional Score = Filter.match(Sym->Name); - if (!Score) - continue; - // Combine Fuzzy Matching score, precomputed symbol quality and boosting - // score for a cumulative final symbol score. - const float FinalScore = - (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second; - // If Top.push(...) returns true, it means that it had to pop an item. In - // this case, it is possible to retrieve more symbols. - if (Top.push({SymbolDocID, FinalScore})) - More = true; - } - - // Apply callback to the top Req.MaxCandidateCount items in the descending - // order of cumulative score. - for (const auto &Item : std::move(Top).items()) - Callback(*Symbols[Item.first]); - return More; -} - -void DexIndex::lookup(const LookupRequest &Req, - llvm::function_ref Callback) const { - for (const auto &ID : Req.IDs) { - auto I = LookupTable.find(ID); - if (I != LookupTable.end()) - Callback(*I->second); - } -} - -void DexIndex::refs(const RefsRequest &Req, - llvm::function_ref Callback) const { - log("refs is not implemented."); -} - -size_t DexIndex::estimateMemoryUsage() const { - size_t Bytes = - LookupTable.size() * sizeof(std::pair); - Bytes += SymbolQuality.size() * sizeof(std::pair); - Bytes += InvertedIndex.size() * sizeof(Token); - - for (const auto &P : InvertedIndex) { - Bytes += P.second.size() * sizeof(DocID); - } - return Bytes; -} - -std::vector generateProximityURIs(llvm::StringRef URIPath) { - std::vector Result; - auto ParsedURI = URI::parse(URIPath); - assert(ParsedURI && - "Non-empty argument of generateProximityURIs() should be a valid " - "URI."); - StringRef Body = ParsedURI->body(); - // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum - // size of resulting vector. Some projects might want to have higher limit if - // the file hierarchy is deeper. For the generic case, it would be useful to - // calculate Limit in the index build stage by calculating the maximum depth - // of the project source tree at runtime. - size_t Limit = 5; - // Insert original URI before the loop: this would save a redundant iteration - // with a URI parse. - Result.emplace_back(ParsedURI->toString()); - while (!Body.empty() && --Limit > 0) { - // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and - // could be optimized. - Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix); - URI TokenURI(ParsedURI->scheme(), ParsedURI->authority(), Body); - if (!Body.empty()) - Result.emplace_back(TokenURI.toString()); - } - return Result; -} - -} // namespace dex -} // namespace clangd -} // namespace clang Index: clangd/tool/ClangdMain.cpp =================================================================== --- clangd/tool/ClangdMain.cpp +++ clangd/tool/ClangdMain.cpp @@ -13,7 +13,6 @@ #include "RIFF.h" #include "Trace.h" #include "index/SymbolYAML.h" -#include "index/dex/DexIndex.h" #include "clang/Basic/Version.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/FileSystem.h" Index: unittests/clangd/CMakeLists.txt =================================================================== --- unittests/clangd/CMakeLists.txt +++ unittests/clangd/CMakeLists.txt @@ -16,7 +16,7 @@ CodeCompleteTests.cpp CodeCompletionStringsTests.cpp ContextTests.cpp - DexIndexTests.cpp + DexTests.cpp DraftStoreTests.cpp FileDistanceTests.cpp FileIndexTests.cpp Index: unittests/clangd/DexIndexTests.cpp =================================================================== --- unittests/clangd/DexIndexTests.cpp +++ unittests/clangd/DexIndexTests.cpp @@ -1,615 +0,0 @@ -//===-- DexIndexTests.cpp ----------------------------*- C++ -*-----------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "FuzzyMatch.h" -#include "TestFS.h" -#include "TestIndex.h" -#include "index/Index.h" -#include "index/Merge.h" -#include "index/dex/DexIndex.h" -#include "index/dex/Iterator.h" -#include "index/dex/Token.h" -#include "index/dex/Trigram.h" -#include "llvm/Support/ScopedPrinter.h" -#include "llvm/Support/raw_ostream.h" -#include "gmock/gmock.h" -#include "gtest/gtest.h" -#include -#include - -using ::testing::ElementsAre; -using ::testing::UnorderedElementsAre; -using namespace llvm; - -namespace clang { -namespace clangd { -namespace dex { -namespace { - -std::vector URISchemes = {"unittest"}; - -//===----------------------------------------------------------------------===// -// Query iterator tests. -//===----------------------------------------------------------------------===// - -std::vector consumeIDs(Iterator &It) { - auto IDAndScore = consume(It); - std::vector IDs(IDAndScore.size()); - for (size_t I = 0; I < IDAndScore.size(); ++I) - IDs[I] = IDAndScore[I].first; - return IDs; -} - -TEST(DexIndexIterators, DocumentIterator) { - const PostingList L = {4, 7, 8, 20, 42, 100}; - auto DocIterator = create(L); - - EXPECT_EQ(DocIterator->peek(), 4U); - EXPECT_FALSE(DocIterator->reachedEnd()); - - DocIterator->advance(); - EXPECT_EQ(DocIterator->peek(), 7U); - EXPECT_FALSE(DocIterator->reachedEnd()); - - DocIterator->advanceTo(20); - EXPECT_EQ(DocIterator->peek(), 20U); - EXPECT_FALSE(DocIterator->reachedEnd()); - - DocIterator->advanceTo(65); - EXPECT_EQ(DocIterator->peek(), 100U); - EXPECT_FALSE(DocIterator->reachedEnd()); - - DocIterator->advanceTo(420); - EXPECT_TRUE(DocIterator->reachedEnd()); -} - -TEST(DexIndexIterators, AndWithEmpty) { - const PostingList L0; - const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; - - auto AndEmpty = createAnd(create(L0)); - EXPECT_TRUE(AndEmpty->reachedEnd()); - - auto AndWithEmpty = createAnd(create(L0), create(L1)); - EXPECT_TRUE(AndWithEmpty->reachedEnd()); - - EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre()); -} - -TEST(DexIndexIterators, AndTwoLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; - - auto And = createAnd(create(L1), create(L0)); - - EXPECT_FALSE(And->reachedEnd()); - EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); - - And = createAnd(create(L0), create(L1)); - - And->advanceTo(0); - EXPECT_EQ(And->peek(), 0U); - And->advanceTo(5); - EXPECT_EQ(And->peek(), 7U); - And->advanceTo(10); - EXPECT_EQ(And->peek(), 10U); - And->advanceTo(42); - EXPECT_EQ(And->peek(), 320U); - And->advanceTo(8999); - EXPECT_EQ(And->peek(), 9000U); - And->advanceTo(9001); -} - -TEST(DexIndexIterators, AndThreeLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; - const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; - - auto And = createAnd(create(L0), create(L1), create(L2)); - EXPECT_EQ(And->peek(), 7U); - And->advanceTo(300); - EXPECT_EQ(And->peek(), 320U); - And->advanceTo(100000); - - EXPECT_TRUE(And->reachedEnd()); -} - -TEST(DexIndexIterators, OrWithEmpty) { - const PostingList L0; - const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; - - auto OrEmpty = createOr(create(L0)); - EXPECT_TRUE(OrEmpty->reachedEnd()); - - auto OrWithEmpty = createOr(create(L0), create(L1)); - EXPECT_FALSE(OrWithEmpty->reachedEnd()); - - EXPECT_THAT(consumeIDs(*OrWithEmpty), - ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U)); -} - -TEST(DexIndexIterators, OrTwoLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; - - auto Or = createOr(create(L0), create(L1)); - - EXPECT_FALSE(Or->reachedEnd()); - EXPECT_EQ(Or->peek(), 0U); - Or->advance(); - EXPECT_EQ(Or->peek(), 4U); - Or->advance(); - EXPECT_EQ(Or->peek(), 5U); - Or->advance(); - EXPECT_EQ(Or->peek(), 7U); - Or->advance(); - EXPECT_EQ(Or->peek(), 10U); - Or->advance(); - EXPECT_EQ(Or->peek(), 30U); - Or->advanceTo(42); - EXPECT_EQ(Or->peek(), 42U); - Or->advanceTo(300); - EXPECT_EQ(Or->peek(), 320U); - Or->advanceTo(9000); - EXPECT_EQ(Or->peek(), 9000U); - Or->advanceTo(9001); - EXPECT_TRUE(Or->reachedEnd()); - - Or = createOr(create(L0), create(L1)); - - EXPECT_THAT(consumeIDs(*Or), - ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); -} - -TEST(DexIndexIterators, OrThreeLists) { - const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; - const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; - const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; - - auto Or = createOr(create(L0), create(L1), create(L2)); - - EXPECT_FALSE(Or->reachedEnd()); - EXPECT_EQ(Or->peek(), 0U); - - Or->advance(); - EXPECT_EQ(Or->peek(), 1U); - - Or->advance(); - EXPECT_EQ(Or->peek(), 4U); - - Or->advanceTo(7); - - Or->advanceTo(59); - EXPECT_EQ(Or->peek(), 60U); - - Or->advanceTo(9001); - EXPECT_TRUE(Or->reachedEnd()); -} - -// FIXME(kbobyrev): The testcase below is similar to what is expected in real -// queries. It should be updated once new iterators (such as boosting, limiting, -// etc iterators) appear. However, it is not exhaustive and it would be -// beneficial to implement automatic generation (e.g. fuzzing) of query trees -// for more comprehensive testing. -TEST(DexIndexIterators, QueryTree) { - // - // +-----------------+ - // |And Iterator:1, 5| - // +--------+--------+ - // | - // | - // +-------------+----------------------+ - // | | - // | | - // +----------v----------+ +----------v------------+ - // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5| - // +----------+----------+ +----------+------------+ - // | | - // +------+-----+ +---------------------+ - // | | | | | - // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+ - // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4| - // +-------------+ +----+---+ +-----+ +---+----+ +----+---+ - // | | | - // +----v-----+ +-v--+ +---v---+ - // |1, 5, 7, 9| |1, 5| |0, 3, 5| - // +----------+ +----+ +-------+ - // - const PostingList L0 = {1, 3, 5, 8, 9}; - const PostingList L1 = {1, 5, 7, 9}; - const PostingList L3; - const PostingList L4 = {1, 5}; - const PostingList L5 = {0, 3, 5}; - - // Root of the query tree: [1, 5] - auto Root = createAnd( - // Lower And Iterator: [1, 5, 9] - createAnd(create(L0), createBoost(create(L1), 2U)), - // Lower Or Iterator: [0, 1, 5] - createOr(create(L3), createBoost(create(L4), 3U), - createBoost(create(L5), 4U))); - - EXPECT_FALSE(Root->reachedEnd()); - EXPECT_EQ(Root->peek(), 1U); - Root->advanceTo(0); - // Advance multiple times. Shouldn't do anything. - Root->advanceTo(1); - Root->advanceTo(0); - EXPECT_EQ(Root->peek(), 1U); - auto ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, 6); - Root->advance(); - EXPECT_EQ(Root->peek(), 5U); - Root->advanceTo(5); - EXPECT_EQ(Root->peek(), 5U); - ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, 8); - Root->advanceTo(9000); - EXPECT_TRUE(Root->reachedEnd()); -} - -TEST(DexIndexIterators, StringRepresentation) { - const PostingList L0 = {4, 7, 8, 20, 42, 100}; - const PostingList L1 = {1, 3, 5, 8, 9}; - const PostingList L2 = {1, 5, 7, 9}; - const PostingList L3 = {0, 5}; - const PostingList L4 = {0, 1, 5}; - const PostingList L5; - - EXPECT_EQ(llvm::to_string(*(create(L0))), "[{4}, 7, 8, 20, 42, 100, END]"); - - auto Nested = createAnd(createAnd(create(L1), create(L2)), - createOr(create(L3), create(L4), create(L5))); - - EXPECT_EQ(llvm::to_string(*Nested), - "(& (| [0, {5}, END] [0, {1}, 5, END] [{END}]) (& [{1}, 5, 7, 9, " - "END] [{1}, 3, 5, 8, 9, END]))"); -} - -TEST(DexIndexIterators, Limit) { - const PostingList L0 = {3, 6, 7, 20, 42, 100}; - const PostingList L1 = {1, 3, 5, 6, 7, 30, 100}; - const PostingList L2 = {0, 3, 5, 7, 8, 100}; - - auto DocIterator = createLimit(create(L0), 42); - EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7, 20, 42, 100)); - - DocIterator = createLimit(create(L0), 3); - EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7)); - - DocIterator = createLimit(create(L0), 0); - EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre()); - - auto AndIterator = - createAnd(createLimit(createTrue(9000), 343), createLimit(create(L0), 2), - createLimit(create(L1), 3), createLimit(create(L2), 42)); - EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(3, 7)); -} - -TEST(DexIndexIterators, True) { - auto TrueIterator = createTrue(0U); - EXPECT_TRUE(TrueIterator->reachedEnd()); - EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); - - PostingList L0 = {1, 2, 5, 7}; - TrueIterator = createTrue(7U); - EXPECT_THAT(TrueIterator->peek(), 0); - auto AndIterator = createAnd(create(L0), move(TrueIterator)); - EXPECT_FALSE(AndIterator->reachedEnd()); - EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); -} - -TEST(DexIndexIterators, Boost) { - auto BoostIterator = createBoost(createTrue(5U), 42U); - EXPECT_FALSE(BoostIterator->reachedEnd()); - auto ElementBoost = BoostIterator->consume(); - EXPECT_THAT(ElementBoost, 42U); - - const PostingList L0 = {2, 4}; - const PostingList L1 = {1, 4}; - auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U), - createBoost(create(L1), 3U)); - - ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); - Root->advance(); - EXPECT_THAT(Root->peek(), 1U); - ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, 3); - - Root->advance(); - EXPECT_THAT(Root->peek(), 2U); - ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, 2); - - Root->advanceTo(4); - ElementBoost = Root->consume(); - EXPECT_THAT(ElementBoost, 3); -} - -//===----------------------------------------------------------------------===// -// Search token tests. -//===----------------------------------------------------------------------===// - -testing::Matcher> -tokensAre(std::initializer_list Strings, Token::Kind Kind) { - std::vector Tokens; - for (const auto &TokenData : Strings) { - Tokens.push_back(Token(Kind, TokenData)); - } - return testing::UnorderedElementsAreArray(Tokens); -} - -testing::Matcher> -trigramsAre(std::initializer_list Trigrams) { - return tokensAre(Trigrams, Token::Kind::Trigram); -} - -TEST(DexIndexTrigrams, IdentifierTrigrams) { - EXPECT_THAT(generateIdentifierTrigrams("X86"), - trigramsAre({"x86", "x$$", "x8$"})); - - EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"})); - - EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"})); - - EXPECT_THAT(generateIdentifierTrigrams("clangd"), - trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"})); - - EXPECT_THAT(generateIdentifierTrigrams("abc_def"), - trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde", - "def", "ab$", "ad$"})); - - EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), - trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace", - "bcd", "bce", "bde", "cde", "ab$"})); - - EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), - trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt", - "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep", - "ept", "ptr", "un$", "up$"})); - - EXPECT_THAT( - generateIdentifierTrigrams("TUDecl"), - trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$", "td$"})); - - EXPECT_THAT(generateIdentifierTrigrams("IsOK"), - trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$"})); - - EXPECT_THAT( - generateIdentifierTrigrams("abc_defGhij__klm"), - trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh", - "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", - "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", - "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl", - "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik", - "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"})); -} - -TEST(DexIndexTrigrams, QueryTrigrams) { - EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); - EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); - EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); - - EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"})); - EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"})); - EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"})); - - EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); - - EXPECT_THAT(generateQueryTrigrams("clangd"), - trigramsAre({"cla", "lan", "ang", "ngd"})); - - EXPECT_THAT(generateQueryTrigrams("abc_def"), - trigramsAre({"abc", "bcd", "cde", "def"})); - - EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"), - trigramsAre({"abc", "bcd", "cde"})); - - EXPECT_THAT(generateQueryTrigrams("unique_ptr"), - trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"})); - - EXPECT_THAT(generateQueryTrigrams("TUDecl"), - trigramsAre({"tud", "ude", "dec", "ecl"})); - - EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"})); - - EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"), - trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi", - "hij", "ijk", "jkl", "klm"})); -} - -TEST(DexSearchTokens, SymbolPath) { - EXPECT_THAT(generateProximityURIs( - "unittest:///clang-tools-extra/clangd/index/Token.h"), - ElementsAre("unittest:///clang-tools-extra/clangd/index/Token.h", - "unittest:///clang-tools-extra/clangd/index", - "unittest:///clang-tools-extra/clangd", - "unittest:///clang-tools-extra", "unittest:///")); - - EXPECT_THAT(generateProximityURIs("unittest:///a/b/c.h"), - ElementsAre("unittest:///a/b/c.h", "unittest:///a/b", - "unittest:///a", "unittest:///")); -} - -//===----------------------------------------------------------------------===// -// Index tests. -//===----------------------------------------------------------------------===// - -TEST(DexIndex, Lookup) { - auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); - EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); - EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), - UnorderedElementsAre("ns::abc", "ns::xyz")); - EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}), - UnorderedElementsAre("ns::xyz")); - EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre()); -} - -TEST(DexIndex, FuzzyFind) { - auto Index = DexIndex::build( - generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", - "other::ABC", "other::A"}), - URISchemes); - FuzzyFindRequest Req; - Req.Query = "ABC"; - Req.Scopes = {"ns::"}; - EXPECT_THAT(match(*Index, Req), UnorderedElementsAre("ns::ABC")); - Req.Scopes = {"ns::", "ns::nested::"}; - EXPECT_THAT(match(*Index, Req), - UnorderedElementsAre("ns::ABC", "ns::nested::ABC")); - Req.Query = "A"; - Req.Scopes = {"other::"}; - EXPECT_THAT(match(*Index, Req), - UnorderedElementsAre("other::A", "other::ABC")); - Req.Query = ""; - Req.Scopes = {}; - EXPECT_THAT(match(*Index, Req), - UnorderedElementsAre("ns::ABC", "ns::BCD", "::ABC", - "ns::nested::ABC", "other::ABC", - "other::A")); -} - -TEST(DexIndexTest, FuzzyMatchQ) { - auto I = DexIndex::build( - generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), - URISchemes); - FuzzyFindRequest Req; - Req.Query = "lol"; - Req.MaxCandidateCount = 2; - EXPECT_THAT(match(*I, Req), - UnorderedElementsAre("LaughingOutLoud", "LittleOldLady")); -} - -// FIXME(kbobyrev): This test is different for DexIndex and MemIndex: while -// MemIndex manages response deduplication, DexIndex simply returns all matched -// symbols which means there might be equivalent symbols in the response. -// Before drop-in replacement of MemIndex with DexIndex happens, FileIndex -// should handle deduplication instead. -TEST(DexIndexTest, DexIndexDeduplicate) { - std::vector Symbols = {symbol("1"), symbol("2"), symbol("3"), - symbol("2") /* duplicate */}; - FuzzyFindRequest Req; - Req.Query = "2"; - DexIndex I(Symbols, URISchemes); - EXPECT_THAT(match(I, Req), ElementsAre("2", "2")); -} - -TEST(DexIndexTest, DexIndexLimitedNumMatches) { - auto I = DexIndex::build(generateNumSymbols(0, 100), URISchemes); - FuzzyFindRequest Req; - Req.Query = "5"; - Req.MaxCandidateCount = 3; - bool Incomplete; - auto Matches = match(*I, Req, &Incomplete); - EXPECT_EQ(Matches.size(), Req.MaxCandidateCount); - EXPECT_TRUE(Incomplete); -} - -TEST(DexIndexTest, FuzzyMatch) { - auto I = DexIndex::build( - generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), - URISchemes); - FuzzyFindRequest Req; - Req.Query = "lol"; - Req.MaxCandidateCount = 2; - EXPECT_THAT(match(*I, Req), - UnorderedElementsAre("LaughingOutLoud", "LittleOldLady")); -} - -TEST(DexIndexTest, MatchQualifiedNamesWithoutSpecificScope) { - auto I = - DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); - FuzzyFindRequest Req; - Req.Query = "y"; - EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); -} - -TEST(DexIndexTest, MatchQualifiedNamesWithGlobalScope) { - auto I = - DexIndex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); - FuzzyFindRequest Req; - Req.Query = "y"; - Req.Scopes = {""}; - EXPECT_THAT(match(*I, Req), UnorderedElementsAre("y3")); -} - -TEST(DexIndexTest, MatchQualifiedNamesWithOneScope) { - auto I = DexIndex::build( - generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), URISchemes); - FuzzyFindRequest Req; - Req.Query = "y"; - Req.Scopes = {"a::"}; - EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2")); -} - -TEST(DexIndexTest, MatchQualifiedNamesWithMultipleScopes) { - auto I = DexIndex::build( - generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), URISchemes); - FuzzyFindRequest Req; - Req.Query = "y"; - Req.Scopes = {"a::", "b::"}; - EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3")); -} - -TEST(DexIndexTest, NoMatchNestedScopes) { - auto I = DexIndex::build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes); - FuzzyFindRequest Req; - Req.Query = "y"; - Req.Scopes = {"a::"}; - EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1")); -} - -TEST(DexIndexTest, IgnoreCases) { - auto I = DexIndex::build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes); - FuzzyFindRequest Req; - Req.Query = "AB"; - Req.Scopes = {"ns::"}; - EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc")); -} - -TEST(DexIndexTest, Lookup) { - auto I = DexIndex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); - EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); - EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), - UnorderedElementsAre("ns::abc", "ns::xyz")); - EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}), - UnorderedElementsAre("ns::xyz")); - EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre()); -} - -TEST(DexIndexTest, ProximityPathsBoosting) { - auto RootSymbol = symbol("root::abc"); - RootSymbol.CanonicalDeclaration.FileURI = "unittest:///file.h"; - auto CloseSymbol = symbol("close::abc"); - CloseSymbol.CanonicalDeclaration.FileURI = "unittest:///a/b/c/d/e/f/file.h"; - - std::vector Symbols{CloseSymbol, RootSymbol}; - DexIndex I(Symbols, URISchemes); - - FuzzyFindRequest Req; - Req.Query = "abc"; - // The best candidate can change depending on the proximity paths. - Req.MaxCandidateCount = 1; - - // FuzzyFind request comes from the file which is far from the root: expect - // CloseSymbol to come out. - Req.ProximityPaths = {testPath("a/b/c/d/e/f/file.h")}; - EXPECT_THAT(match(I, Req), ElementsAre("close::abc")); - - // FuzzyFind request comes from the file which is close to the root: expect - // RootSymbol to come out. - Req.ProximityPaths = {testPath("file.h")}; - EXPECT_THAT(match(I, Req), ElementsAre("root::abc")); -} - -} // namespace -} // namespace dex -} // namespace clangd -} // namespace clang Index: unittests/clangd/DexTests.cpp =================================================================== --- unittests/clangd/DexTests.cpp +++ unittests/clangd/DexTests.cpp @@ -0,0 +1,615 @@ +//===-- DexTests.cpp ---------------------------------*- C++ -*-----------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "FuzzyMatch.h" +#include "TestFS.h" +#include "TestIndex.h" +#include "index/Index.h" +#include "index/Merge.h" +#include "index/dex/Dex.h" +#include "index/dex/Iterator.h" +#include "index/dex/Token.h" +#include "index/dex/Trigram.h" +#include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/raw_ostream.h" +#include "gmock/gmock.h" +#include "gtest/gtest.h" +#include +#include + +using ::testing::ElementsAre; +using ::testing::UnorderedElementsAre; +using namespace llvm; + +namespace clang { +namespace clangd { +namespace dex { +namespace { + +std::vector URISchemes = {"unittest"}; + +//===----------------------------------------------------------------------===// +// Query iterator tests. +//===----------------------------------------------------------------------===// + +std::vector consumeIDs(Iterator &It) { + auto IDAndScore = consume(It); + std::vector IDs(IDAndScore.size()); + for (size_t I = 0; I < IDAndScore.size(); ++I) + IDs[I] = IDAndScore[I].first; + return IDs; +} + +TEST(DexIterators, DocumentIterator) { + const PostingList L = {4, 7, 8, 20, 42, 100}; + auto DocIterator = create(L); + + EXPECT_EQ(DocIterator->peek(), 4U); + EXPECT_FALSE(DocIterator->reachedEnd()); + + DocIterator->advance(); + EXPECT_EQ(DocIterator->peek(), 7U); + EXPECT_FALSE(DocIterator->reachedEnd()); + + DocIterator->advanceTo(20); + EXPECT_EQ(DocIterator->peek(), 20U); + EXPECT_FALSE(DocIterator->reachedEnd()); + + DocIterator->advanceTo(65); + EXPECT_EQ(DocIterator->peek(), 100U); + EXPECT_FALSE(DocIterator->reachedEnd()); + + DocIterator->advanceTo(420); + EXPECT_TRUE(DocIterator->reachedEnd()); +} + +TEST(DexIterators, AndWithEmpty) { + const PostingList L0; + const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; + + auto AndEmpty = createAnd(create(L0)); + EXPECT_TRUE(AndEmpty->reachedEnd()); + + auto AndWithEmpty = createAnd(create(L0), create(L1)); + EXPECT_TRUE(AndWithEmpty->reachedEnd()); + + EXPECT_THAT(consumeIDs(*AndWithEmpty), ElementsAre()); +} + +TEST(DexIterators, AndTwoLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + + auto And = createAnd(create(L1), create(L0)); + + EXPECT_FALSE(And->reachedEnd()); + EXPECT_THAT(consumeIDs(*And), ElementsAre(0U, 7U, 10U, 320U, 9000U)); + + And = createAnd(create(L0), create(L1)); + + And->advanceTo(0); + EXPECT_EQ(And->peek(), 0U); + And->advanceTo(5); + EXPECT_EQ(And->peek(), 7U); + And->advanceTo(10); + EXPECT_EQ(And->peek(), 10U); + And->advanceTo(42); + EXPECT_EQ(And->peek(), 320U); + And->advanceTo(8999); + EXPECT_EQ(And->peek(), 9000U); + And->advanceTo(9001); +} + +TEST(DexIterators, AndThreeLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; + + auto And = createAnd(create(L0), create(L1), create(L2)); + EXPECT_EQ(And->peek(), 7U); + And->advanceTo(300); + EXPECT_EQ(And->peek(), 320U); + And->advanceTo(100000); + + EXPECT_TRUE(And->reachedEnd()); +} + +TEST(DexIterators, OrWithEmpty) { + const PostingList L0; + const PostingList L1 = {0, 5, 7, 10, 42, 320, 9000}; + + auto OrEmpty = createOr(create(L0)); + EXPECT_TRUE(OrEmpty->reachedEnd()); + + auto OrWithEmpty = createOr(create(L0), create(L1)); + EXPECT_FALSE(OrWithEmpty->reachedEnd()); + + EXPECT_THAT(consumeIDs(*OrWithEmpty), + ElementsAre(0U, 5U, 7U, 10U, 42U, 320U, 9000U)); +} + +TEST(DexIterators, OrTwoLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + + auto Or = createOr(create(L0), create(L1)); + + EXPECT_FALSE(Or->reachedEnd()); + EXPECT_EQ(Or->peek(), 0U); + Or->advance(); + EXPECT_EQ(Or->peek(), 4U); + Or->advance(); + EXPECT_EQ(Or->peek(), 5U); + Or->advance(); + EXPECT_EQ(Or->peek(), 7U); + Or->advance(); + EXPECT_EQ(Or->peek(), 10U); + Or->advance(); + EXPECT_EQ(Or->peek(), 30U); + Or->advanceTo(42); + EXPECT_EQ(Or->peek(), 42U); + Or->advanceTo(300); + EXPECT_EQ(Or->peek(), 320U); + Or->advanceTo(9000); + EXPECT_EQ(Or->peek(), 9000U); + Or->advanceTo(9001); + EXPECT_TRUE(Or->reachedEnd()); + + Or = createOr(create(L0), create(L1)); + + EXPECT_THAT(consumeIDs(*Or), + ElementsAre(0U, 4U, 5U, 7U, 10U, 30U, 42U, 60U, 320U, 9000U)); +} + +TEST(DexIterators, OrThreeLists) { + const PostingList L0 = {0, 5, 7, 10, 42, 320, 9000}; + const PostingList L1 = {0, 4, 7, 10, 30, 60, 320, 9000}; + const PostingList L2 = {1, 4, 7, 11, 30, 60, 320, 9000}; + + auto Or = createOr(create(L0), create(L1), create(L2)); + + EXPECT_FALSE(Or->reachedEnd()); + EXPECT_EQ(Or->peek(), 0U); + + Or->advance(); + EXPECT_EQ(Or->peek(), 1U); + + Or->advance(); + EXPECT_EQ(Or->peek(), 4U); + + Or->advanceTo(7); + + Or->advanceTo(59); + EXPECT_EQ(Or->peek(), 60U); + + Or->advanceTo(9001); + EXPECT_TRUE(Or->reachedEnd()); +} + +// FIXME(kbobyrev): The testcase below is similar to what is expected in real +// queries. It should be updated once new iterators (such as boosting, limiting, +// etc iterators) appear. However, it is not exhaustive and it would be +// beneficial to implement automatic generation (e.g. fuzzing) of query trees +// for more comprehensive testing. +TEST(DexIterators, QueryTree) { + // + // +-----------------+ + // |And Iterator:1, 5| + // +--------+--------+ + // | + // | + // +-------------+----------------------+ + // | | + // | | + // +----------v----------+ +----------v------------+ + // |And Iterator: 1, 5, 9| |Or Iterator: 0, 1, 3, 5| + // +----------+----------+ +----------+------------+ + // | | + // +------+-----+ +---------------------+ + // | | | | | + // +-------v-----+ +----+---+ +--v--+ +---v----+ +----v---+ + // |1, 3, 5, 8, 9| |Boost: 2| |Empty| |Boost: 3| |Boost: 4| + // +-------------+ +----+---+ +-----+ +---+----+ +----+---+ + // | | | + // +----v-----+ +-v--+ +---v---+ + // |1, 5, 7, 9| |1, 5| |0, 3, 5| + // +----------+ +----+ +-------+ + // + const PostingList L0 = {1, 3, 5, 8, 9}; + const PostingList L1 = {1, 5, 7, 9}; + const PostingList L3; + const PostingList L4 = {1, 5}; + const PostingList L5 = {0, 3, 5}; + + // Root of the query tree: [1, 5] + auto Root = createAnd( + // Lower And Iterator: [1, 5, 9] + createAnd(create(L0), createBoost(create(L1), 2U)), + // Lower Or Iterator: [0, 1, 5] + createOr(create(L3), createBoost(create(L4), 3U), + createBoost(create(L5), 4U))); + + EXPECT_FALSE(Root->reachedEnd()); + EXPECT_EQ(Root->peek(), 1U); + Root->advanceTo(0); + // Advance multiple times. Shouldn't do anything. + Root->advanceTo(1); + Root->advanceTo(0); + EXPECT_EQ(Root->peek(), 1U); + auto ElementBoost = Root->consume(); + EXPECT_THAT(ElementBoost, 6); + Root->advance(); + EXPECT_EQ(Root->peek(), 5U); + Root->advanceTo(5); + EXPECT_EQ(Root->peek(), 5U); + ElementBoost = Root->consume(); + EXPECT_THAT(ElementBoost, 8); + Root->advanceTo(9000); + EXPECT_TRUE(Root->reachedEnd()); +} + +TEST(DexIterators, StringRepresentation) { + const PostingList L0 = {4, 7, 8, 20, 42, 100}; + const PostingList L1 = {1, 3, 5, 8, 9}; + const PostingList L2 = {1, 5, 7, 9}; + const PostingList L3 = {0, 5}; + const PostingList L4 = {0, 1, 5}; + const PostingList L5; + + EXPECT_EQ(llvm::to_string(*(create(L0))), "[{4}, 7, 8, 20, 42, 100, END]"); + + auto Nested = createAnd(createAnd(create(L1), create(L2)), + createOr(create(L3), create(L4), create(L5))); + + EXPECT_EQ(llvm::to_string(*Nested), + "(& (| [0, {5}, END] [0, {1}, 5, END] [{END}]) (& [{1}, 5, 7, 9, " + "END] [{1}, 3, 5, 8, 9, END]))"); +} + +TEST(DexIterators, Limit) { + const PostingList L0 = {3, 6, 7, 20, 42, 100}; + const PostingList L1 = {1, 3, 5, 6, 7, 30, 100}; + const PostingList L2 = {0, 3, 5, 7, 8, 100}; + + auto DocIterator = createLimit(create(L0), 42); + EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7, 20, 42, 100)); + + DocIterator = createLimit(create(L0), 3); + EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre(3, 6, 7)); + + DocIterator = createLimit(create(L0), 0); + EXPECT_THAT(consumeIDs(*DocIterator), ElementsAre()); + + auto AndIterator = + createAnd(createLimit(createTrue(9000), 343), createLimit(create(L0), 2), + createLimit(create(L1), 3), createLimit(create(L2), 42)); + EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(3, 7)); +} + +TEST(DexIterators, True) { + auto TrueIterator = createTrue(0U); + EXPECT_TRUE(TrueIterator->reachedEnd()); + EXPECT_THAT(consumeIDs(*TrueIterator), ElementsAre()); + + PostingList L0 = {1, 2, 5, 7}; + TrueIterator = createTrue(7U); + EXPECT_THAT(TrueIterator->peek(), 0); + auto AndIterator = createAnd(create(L0), move(TrueIterator)); + EXPECT_FALSE(AndIterator->reachedEnd()); + EXPECT_THAT(consumeIDs(*AndIterator), ElementsAre(1, 2, 5)); +} + +TEST(DexIterators, Boost) { + auto BoostIterator = createBoost(createTrue(5U), 42U); + EXPECT_FALSE(BoostIterator->reachedEnd()); + auto ElementBoost = BoostIterator->consume(); + EXPECT_THAT(ElementBoost, 42U); + + const PostingList L0 = {2, 4}; + const PostingList L1 = {1, 4}; + auto Root = createOr(createTrue(5U), createBoost(create(L0), 2U), + createBoost(create(L1), 3U)); + + ElementBoost = Root->consume(); + EXPECT_THAT(ElementBoost, Iterator::DEFAULT_BOOST_SCORE); + Root->advance(); + EXPECT_THAT(Root->peek(), 1U); + ElementBoost = Root->consume(); + EXPECT_THAT(ElementBoost, 3); + + Root->advance(); + EXPECT_THAT(Root->peek(), 2U); + ElementBoost = Root->consume(); + EXPECT_THAT(ElementBoost, 2); + + Root->advanceTo(4); + ElementBoost = Root->consume(); + EXPECT_THAT(ElementBoost, 3); +} + +//===----------------------------------------------------------------------===// +// Search token tests. +//===----------------------------------------------------------------------===// + +testing::Matcher> +tokensAre(std::initializer_list Strings, Token::Kind Kind) { + std::vector Tokens; + for (const auto &TokenData : Strings) { + Tokens.push_back(Token(Kind, TokenData)); + } + return testing::UnorderedElementsAreArray(Tokens); +} + +testing::Matcher> +trigramsAre(std::initializer_list Trigrams) { + return tokensAre(Trigrams, Token::Kind::Trigram); +} + +TEST(DexTrigrams, IdentifierTrigrams) { + EXPECT_THAT(generateIdentifierTrigrams("X86"), + trigramsAre({"x86", "x$$", "x8$"})); + + EXPECT_THAT(generateIdentifierTrigrams("nl"), trigramsAre({"nl$", "n$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("n"), trigramsAre({"n$$"})); + + EXPECT_THAT(generateIdentifierTrigrams("clangd"), + trigramsAre({"c$$", "cl$", "cla", "lan", "ang", "ngd"})); + + EXPECT_THAT(generateIdentifierTrigrams("abc_def"), + trigramsAre({"a$$", "abc", "abd", "ade", "bcd", "bde", "cde", + "def", "ab$", "ad$"})); + + EXPECT_THAT(generateIdentifierTrigrams("a_b_c_d_e_"), + trigramsAre({"a$$", "a_$", "a_b", "abc", "abd", "acd", "ace", + "bcd", "bce", "bde", "cde", "ab$"})); + + EXPECT_THAT(generateIdentifierTrigrams("unique_ptr"), + trigramsAre({"u$$", "uni", "unp", "upt", "niq", "nip", "npt", + "iqu", "iqp", "ipt", "que", "qup", "qpt", "uep", + "ept", "ptr", "un$", "up$"})); + + EXPECT_THAT( + generateIdentifierTrigrams("TUDecl"), + trigramsAre({"t$$", "tud", "tde", "ude", "dec", "ecl", "tu$", "td$"})); + + EXPECT_THAT(generateIdentifierTrigrams("IsOK"), + trigramsAre({"i$$", "iso", "iok", "sok", "is$", "io$"})); + + EXPECT_THAT( + generateIdentifierTrigrams("abc_defGhij__klm"), + trigramsAre({"a$$", "abc", "abd", "abg", "ade", "adg", "adk", "agh", + "agk", "bcd", "bcg", "bde", "bdg", "bdk", "bgh", "bgk", + "cde", "cdg", "cdk", "cgh", "cgk", "def", "deg", "dek", + "dgh", "dgk", "dkl", "efg", "efk", "egh", "egk", "ekl", + "fgh", "fgk", "fkl", "ghi", "ghk", "gkl", "hij", "hik", + "hkl", "ijk", "ikl", "jkl", "klm", "ab$", "ad$"})); +} + +TEST(DexTrigrams, QueryTrigrams) { + EXPECT_THAT(generateQueryTrigrams("c"), trigramsAre({"c$$"})); + EXPECT_THAT(generateQueryTrigrams("cl"), trigramsAre({"cl$"})); + EXPECT_THAT(generateQueryTrigrams("cla"), trigramsAre({"cla"})); + + EXPECT_THAT(generateQueryTrigrams("_"), trigramsAre({"_$$"})); + EXPECT_THAT(generateQueryTrigrams("__"), trigramsAre({"__$"})); + EXPECT_THAT(generateQueryTrigrams("___"), trigramsAre({"___"})); + + EXPECT_THAT(generateQueryTrigrams("X86"), trigramsAre({"x86"})); + + EXPECT_THAT(generateQueryTrigrams("clangd"), + trigramsAre({"cla", "lan", "ang", "ngd"})); + + EXPECT_THAT(generateQueryTrigrams("abc_def"), + trigramsAre({"abc", "bcd", "cde", "def"})); + + EXPECT_THAT(generateQueryTrigrams("a_b_c_d_e_"), + trigramsAre({"abc", "bcd", "cde"})); + + EXPECT_THAT(generateQueryTrigrams("unique_ptr"), + trigramsAre({"uni", "niq", "iqu", "que", "uep", "ept", "ptr"})); + + EXPECT_THAT(generateQueryTrigrams("TUDecl"), + trigramsAre({"tud", "ude", "dec", "ecl"})); + + EXPECT_THAT(generateQueryTrigrams("IsOK"), trigramsAre({"iso", "sok"})); + + EXPECT_THAT(generateQueryTrigrams("abc_defGhij__klm"), + trigramsAre({"abc", "bcd", "cde", "def", "efg", "fgh", "ghi", + "hij", "ijk", "jkl", "klm"})); +} + +TEST(DexSearchTokens, SymbolPath) { + EXPECT_THAT(generateProximityURIs( + "unittest:///clang-tools-extra/clangd/index/Token.h"), + ElementsAre("unittest:///clang-tools-extra/clangd/index/Token.h", + "unittest:///clang-tools-extra/clangd/index", + "unittest:///clang-tools-extra/clangd", + "unittest:///clang-tools-extra", "unittest:///")); + + EXPECT_THAT(generateProximityURIs("unittest:///a/b/c.h"), + ElementsAre("unittest:///a/b/c.h", "unittest:///a/b", + "unittest:///a", "unittest:///")); +} + +//===----------------------------------------------------------------------===// +// Index tests. +//===----------------------------------------------------------------------===// + +TEST(Dex, Lookup) { + auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); + EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); + EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::abc", "ns::xyz")); + EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::xyz")); + EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre()); +} + +TEST(Dex, FuzzyFind) { + auto Index = Dex::build( + generateSymbols({"ns::ABC", "ns::BCD", "::ABC", "ns::nested::ABC", + "other::ABC", "other::A"}), + URISchemes); + FuzzyFindRequest Req; + Req.Query = "ABC"; + Req.Scopes = {"ns::"}; + EXPECT_THAT(match(*Index, Req), UnorderedElementsAre("ns::ABC")); + Req.Scopes = {"ns::", "ns::nested::"}; + EXPECT_THAT(match(*Index, Req), + UnorderedElementsAre("ns::ABC", "ns::nested::ABC")); + Req.Query = "A"; + Req.Scopes = {"other::"}; + EXPECT_THAT(match(*Index, Req), + UnorderedElementsAre("other::A", "other::ABC")); + Req.Query = ""; + Req.Scopes = {}; + EXPECT_THAT(match(*Index, Req), + UnorderedElementsAre("ns::ABC", "ns::BCD", "::ABC", + "ns::nested::ABC", "other::ABC", + "other::A")); +} + +TEST(DexTest, FuzzyMatchQ) { + auto I = Dex::build( + generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), + URISchemes); + FuzzyFindRequest Req; + Req.Query = "lol"; + Req.MaxCandidateCount = 2; + EXPECT_THAT(match(*I, Req), + UnorderedElementsAre("LaughingOutLoud", "LittleOldLady")); +} + +// FIXME(kbobyrev): This test is different for Dex and MemIndex: while +// MemIndex manages response deduplication, Dex simply returns all matched +// symbols which means there might be equivalent symbols in the response. +// Before drop-in replacement of MemIndex with Dex happens, FileIndex +// should handle deduplication instead. +TEST(DexTest, DexDeduplicate) { + std::vector Symbols = {symbol("1"), symbol("2"), symbol("3"), + symbol("2") /* duplicate */}; + FuzzyFindRequest Req; + Req.Query = "2"; + Dex I(Symbols, URISchemes); + EXPECT_THAT(match(I, Req), ElementsAre("2", "2")); +} + +TEST(DexTest, DexLimitedNumMatches) { + auto I = Dex::build(generateNumSymbols(0, 100), URISchemes); + FuzzyFindRequest Req; + Req.Query = "5"; + Req.MaxCandidateCount = 3; + bool Incomplete; + auto Matches = match(*I, Req, &Incomplete); + EXPECT_EQ(Matches.size(), Req.MaxCandidateCount); + EXPECT_TRUE(Incomplete); +} + +TEST(DexTest, FuzzyMatch) { + auto I = Dex::build( + generateSymbols({"LaughingOutLoud", "LionPopulation", "LittleOldLady"}), + URISchemes); + FuzzyFindRequest Req; + Req.Query = "lol"; + Req.MaxCandidateCount = 2; + EXPECT_THAT(match(*I, Req), + UnorderedElementsAre("LaughingOutLoud", "LittleOldLady")); +} + +TEST(DexTest, MatchQualifiedNamesWithoutSpecificScope) { + auto I = + Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); + FuzzyFindRequest Req; + Req.Query = "y"; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "b::y2", "y3")); +} + +TEST(DexTest, MatchQualifiedNamesWithGlobalScope) { + auto I = + Dex::build(generateSymbols({"a::y1", "b::y2", "y3"}), URISchemes); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {""}; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre("y3")); +} + +TEST(DexTest, MatchQualifiedNamesWithOneScope) { + auto I = Dex::build( + generateSymbols({"a::y1", "a::y2", "a::x", "b::y2", "y3"}), URISchemes); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {"a::"}; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2")); +} + +TEST(DexTest, MatchQualifiedNamesWithMultipleScopes) { + auto I = Dex::build( + generateSymbols({"a::y1", "a::y2", "a::x", "b::y3", "y3"}), URISchemes); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {"a::", "b::"}; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1", "a::y2", "b::y3")); +} + +TEST(DexTest, NoMatchNestedScopes) { + auto I = Dex::build(generateSymbols({"a::y1", "a::b::y2"}), URISchemes); + FuzzyFindRequest Req; + Req.Query = "y"; + Req.Scopes = {"a::"}; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre("a::y1")); +} + +TEST(DexTest, IgnoreCases) { + auto I = Dex::build(generateSymbols({"ns::ABC", "ns::abc"}), URISchemes); + FuzzyFindRequest Req; + Req.Query = "AB"; + Req.Scopes = {"ns::"}; + EXPECT_THAT(match(*I, Req), UnorderedElementsAre("ns::ABC", "ns::abc")); +} + +TEST(DexTest, Lookup) { + auto I = Dex::build(generateSymbols({"ns::abc", "ns::xyz"}), URISchemes); + EXPECT_THAT(lookup(*I, SymbolID("ns::abc")), UnorderedElementsAre("ns::abc")); + EXPECT_THAT(lookup(*I, {SymbolID("ns::abc"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::abc", "ns::xyz")); + EXPECT_THAT(lookup(*I, {SymbolID("ns::nonono"), SymbolID("ns::xyz")}), + UnorderedElementsAre("ns::xyz")); + EXPECT_THAT(lookup(*I, SymbolID("ns::nonono")), UnorderedElementsAre()); +} + +TEST(DexTest, ProximityPathsBoosting) { + auto RootSymbol = symbol("root::abc"); + RootSymbol.CanonicalDeclaration.FileURI = "unittest:///file.h"; + auto CloseSymbol = symbol("close::abc"); + CloseSymbol.CanonicalDeclaration.FileURI = "unittest:///a/b/c/d/e/f/file.h"; + + std::vector Symbols{CloseSymbol, RootSymbol}; + Dex I(Symbols, URISchemes); + + FuzzyFindRequest Req; + Req.Query = "abc"; + // The best candidate can change depending on the proximity paths. + Req.MaxCandidateCount = 1; + + // FuzzyFind request comes from the file which is far from the root: expect + // CloseSymbol to come out. + Req.ProximityPaths = {testPath("a/b/c/d/e/f/file.h")}; + EXPECT_THAT(match(I, Req), ElementsAre("close::abc")); + + // FuzzyFind request comes from the file which is close to the root: expect + // RootSymbol to come out. + Req.ProximityPaths = {testPath("file.h")}; + EXPECT_THAT(match(I, Req), ElementsAre("root::abc")); +} + +} // namespace +} // namespace dex +} // namespace clangd +} // namespace clang Index: unittests/clangd/TestIndex.h =================================================================== --- unittests/clangd/TestIndex.h +++ unittests/clangd/TestIndex.h @@ -12,10 +12,6 @@ #include "index/Index.h" #include "index/Merge.h" -#include "index/dex/DexIndex.h" -#include "index/dex/Iterator.h" -#include "index/dex/Token.h" -#include "index/dex/Trigram.h" namespace clang { namespace clangd {