diff --git a/clang-tools-extra/clangd/index/Background.cpp b/clang-tools-extra/clangd/index/Background.cpp --- a/clang-tools-extra/clangd/index/Background.cpp +++ b/clang-tools-extra/clangd/index/Background.cpp @@ -61,51 +61,6 @@ namespace clangd { namespace { -// Resolves URI to file paths with cache. -class URIToFileCache { -public: - URIToFileCache(llvm::StringRef HintPath) : HintPath(HintPath) {} - - llvm::StringRef resolve(llvm::StringRef FileURI) { - auto I = URIToPathCache.try_emplace(FileURI); - if (I.second) { - auto Path = URI::resolve(FileURI, HintPath); - if (!Path) { - elog("Failed to resolve URI {0}: {1}", FileURI, Path.takeError()); - assert(false && "Failed to resolve URI"); - return ""; - } - I.first->second = *Path; - } - return I.first->second; - } - -private: - std::string HintPath; - llvm::StringMap URIToPathCache; -}; - -// We keep only the node "U" and its edges. Any node other than "U" will be -// empty in the resultant graph. -IncludeGraph getSubGraph(const URI &U, const IncludeGraph &FullGraph) { - IncludeGraph IG; - - std::string FileURI = U.toString(); - auto Entry = IG.try_emplace(FileURI).first; - auto &Node = Entry->getValue(); - Node = FullGraph.lookup(Entry->getKey()); - Node.URI = Entry->getKey(); - - // URIs inside nodes must point into the keys of the same IncludeGraph. - for (auto &Include : Node.DirectIncludes) { - auto I = IG.try_emplace(Include).first; - I->getValue().URI = I->getKey(); - Include = I->getKey(); - } - - return IG; -} - // We cannot use vfs->makeAbsolute because Cmd.FileName is either absolute or // relative to Cmd.Directory, which might not be the same as current working // directory. @@ -219,85 +174,33 @@ llvm::StringRef MainFile, IndexFileIn Index, const llvm::StringMap &ShardVersionsSnapshot, bool HadErrors) { - // Partition symbols/references into files. - struct File { - llvm::DenseSet Symbols; - llvm::DenseSet Refs; - llvm::DenseSet Relations; - FileDigest Digest; - }; - llvm::StringMap Files; - URIToFileCache URICache(MainFile); + llvm::StringMap FilesToUpdate; for (const auto &IndexIt : *Index.Sources) { const auto &IGN = IndexIt.getValue(); // Note that sources do not contain any information regarding missing // headers, since we don't even know what absolute path they should fall in. - const auto AbsPath = URICache.resolve(IGN.URI); + auto AbsPath = llvm::cantFail(URI::resolve(IGN.URI, MainFile), + "Failed to resovle URI"); const auto DigestIt = ShardVersionsSnapshot.find(AbsPath); // File has different contents, or indexing was successful this time. if (DigestIt == ShardVersionsSnapshot.end() || DigestIt->getValue().Digest != IGN.Digest || (DigestIt->getValue().HadErrors && !HadErrors)) - Files.try_emplace(AbsPath).first->getValue().Digest = IGN.Digest; - } - // This map is used to figure out where to store relations. - llvm::DenseMap SymbolIDToFile; - for (const auto &Sym : *Index.Symbols) { - if (Sym.CanonicalDeclaration) { - auto DeclPath = URICache.resolve(Sym.CanonicalDeclaration.FileURI); - const auto FileIt = Files.find(DeclPath); - if (FileIt != Files.end()) { - FileIt->second.Symbols.insert(&Sym); - SymbolIDToFile[Sym.ID] = &FileIt->second; - } - } - // For symbols with different declaration and definition locations, we store - // the full symbol in both the header file and the implementation file, so - // that merging can tell the preferred symbols (from canonical headers) from - // other symbols (e.g. forward declarations). - if (Sym.Definition && - Sym.Definition.FileURI != Sym.CanonicalDeclaration.FileURI) { - auto DefPath = URICache.resolve(Sym.Definition.FileURI); - const auto FileIt = Files.find(DefPath); - if (FileIt != Files.end()) - FileIt->second.Symbols.insert(&Sym); - } - } - llvm::DenseMap RefToIDs; - for (const auto &SymRefs : *Index.Refs) { - for (const auto &R : SymRefs.second) { - auto Path = URICache.resolve(R.Location.FileURI); - const auto FileIt = Files.find(Path); - if (FileIt != Files.end()) { - auto &F = FileIt->getValue(); - RefToIDs[&R] = SymRefs.first; - F.Refs.insert(&R); - } - } - } - for (const auto &Rel : *Index.Relations) { - const auto FileIt = SymbolIDToFile.find(Rel.Subject); - if (FileIt != SymbolIDToFile.end()) - FileIt->second->Relations.insert(&Rel); + FilesToUpdate[AbsPath] = IGN.Digest; } + // Shard slabs into files. + ShardIndexToFiles ShardedIndex(std::move(Index), MainFile); + // Build and store new slabs for each updated file. - for (const auto &FileIt : Files) { - llvm::StringRef Path = FileIt.getKey(); - SymbolSlab::Builder Syms; - RefSlab::Builder Refs; - RelationSlab::Builder Relations; - for (const auto *S : FileIt.second.Symbols) - Syms.insert(*S); - for (const auto *R : FileIt.second.Refs) - Refs.insert(RefToIDs[R], *R); - for (const auto *Rel : FileIt.second.Relations) - Relations.insert(*Rel); - auto SS = std::make_unique(std::move(Syms).build()); - auto RS = std::make_unique(std::move(Refs).build()); - auto RelS = std::make_unique(std::move(Relations).build()); - auto IG = std::make_unique( - getSubGraph(URI::create(Path), Index.Sources.getValue())); + for (const auto &FileIt : FilesToUpdate) { + PathRef Path = FileIt.first(); + auto SS = std::make_unique(ShardedIndex.buildSymbolSlab(Path)); + auto RS = std::make_unique(ShardedIndex.buildRefSlab(Path)); + auto RelS = + std::make_unique(ShardedIndex.buildRelationSlab(Path)); + auto IG = + std::make_unique(ShardedIndex.getIncludeGraph(Path)); // We need to store shards before updating the index, since the latter // consumes slabs. @@ -312,7 +215,7 @@ // Only store command line hash for main files of the TU, since our // current model keeps only one version of a header file. if (Path == MainFile) - Shard.Cmd = Index.Cmd.getPointer(); + Shard.Cmd = ShardedIndex.getMainFileCmd(); if (auto Error = IndexStorage->storeShard(Path, Shard)) elog("Failed to write background-index shard for file {0}: {1}", Path, @@ -320,7 +223,7 @@ { std::lock_guard Lock(ShardVersionsMu); - auto Hash = FileIt.second.Digest; + const auto &Hash = FileIt.getValue(); auto DigestIt = ShardVersions.try_emplace(Path); ShardVersion &SV = DigestIt.first->second; // Skip if file is already up to date, unless previous index was broken diff --git a/clang-tools-extra/clangd/index/FileIndex.h b/clang-tools-extra/clangd/index/FileIndex.h --- a/clang-tools-extra/clangd/index/FileIndex.h +++ b/clang-tools-extra/clangd/index/FileIndex.h @@ -15,14 +15,24 @@ #ifndef LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_FILEINDEX_H #define LLVM_CLANG_TOOLS_EXTRA_CLANGD_INDEX_FILEINDEX_H +#include "Headers.h" #include "Index.h" #include "MemIndex.h" #include "Merge.h" #include "Path.h" #include "index/CanonicalIncludes.h" +#include "index/Ref.h" +#include "index/Relation.h" +#include "index/Serialization.h" #include "index/Symbol.h" #include "clang/Lex/Preprocessor.h" +#include "clang/Tooling/CompilationDatabase.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/Optional.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include +#include namespace clang { class ASTContext; @@ -109,17 +119,15 @@ private: bool UseDex; // FIXME: this should be always on. - // Contains information from each file's preamble only. - // These are large, but update fairly infrequently (preambles are stable). + // Contains information from each file's preamble only. Symbols and relations + // are sharded per declaration file to deduplicate multiple symbols and reduce + // memory usage. // Missing information: // - symbol refs (these are always "from the main file") // - definition locations in the main file // - // FIXME: Because the preambles for different TUs have large overlap and - // FileIndex doesn't deduplicate, this uses lots of extra RAM. - // The biggest obstacle in fixing this: the obvious approach of partitioning - // by declaring file (rather than main file) fails if headers provide - // different symbols based on preprocessor state. + // Note that we store only one version of a heder, hence symbols appearing in + // different PP states will be missing. FileSymbols PreambleSymbols; SwapIndex PreambleIndex; @@ -146,6 +154,48 @@ std::shared_ptr PP, const CanonicalIncludes &Includes); +/// Takes slabs coming from a TU(multiple files) and shards them per declaration +/// location. +struct ShardIndexToFiles { + /// \p HintPath is used to convert file URIs stored in symbols into absolute + /// paths. + explicit ShardIndexToFiles(IndexFileIn Input, PathRef HintPath); + + /// Returns absolute paths for all files that has a shard. + std::vector getAllFiles() const; + + const tooling::CompileCommand *getMainFileCmd() const { + return Index.Cmd.getPointer(); + } + + /// Generates slabs for the \p File using the sharded information. Note that + /// these functions results in a copy of all the relevant data. + SymbolSlab buildSymbolSlab(PathRef File) const; + RefSlab buildRefSlab(PathRef File) const; + RelationSlab buildRelationSlab(PathRef File) const; + const IncludeGraph &getIncludeGraph(PathRef File) const; + +private: + // Contains all the information that belongs to a single file. + struct FileShard { + // Either declared or defined in the file. + llvm::DenseSet Symbols; + // Reference occurs in the file. + llvm::DenseSet Refs; + // Subject is declared in the file. + llvm::DenseSet Relations; + // Contains edges for only the direct includes. + IncludeGraph IG; + }; + + // Keeps all the information alive. + const IndexFileIn Index; + // Mapping from absolute paths to slab information. + llvm::StringMap Shards; + // Used to build RefSlabs. + llvm::DenseMap RefToSymID; +}; + } // namespace clangd } // namespace clang diff --git a/clang-tools-extra/clangd/index/FileIndex.cpp b/clang-tools-extra/clangd/index/FileIndex.cpp --- a/clang-tools-extra/clangd/index/FileIndex.cpp +++ b/clang-tools-extra/clangd/index/FileIndex.cpp @@ -10,11 +10,17 @@ #include "CollectMacros.h" #include "Logger.h" #include "ParsedAST.h" +#include "Path.h" #include "SymbolCollector.h" #include "index/CanonicalIncludes.h" #include "index/Index.h" #include "index/MemIndex.h" #include "index/Merge.h" +#include "index/Ref.h" +#include "index/Relation.h" +#include "index/Serialization.h" +#include "index/Symbol.h" +#include "index/SymbolID.h" #include "index/SymbolOrigin.h" #include "index/dex/Dex.h" #include "clang/AST/ASTContext.h" @@ -23,19 +29,23 @@ #include "clang/Lex/MacroInfo.h" #include "clang/Lex/Preprocessor.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" #include +#include +#include namespace clang { namespace clangd { +namespace { -static SlabTuple indexSymbols(ASTContext &AST, std::shared_ptr PP, - llvm::ArrayRef DeclsToIndex, - const MainFileMacros *MacroRefsToIndex, - const CanonicalIncludes &Includes, - bool IsIndexMainAST, llvm::StringRef Version) { +SlabTuple indexSymbols(ASTContext &AST, std::shared_ptr PP, + llvm::ArrayRef DeclsToIndex, + const MainFileMacros *MacroRefsToIndex, + const CanonicalIncludes &Includes, bool IsIndexMainAST, + llvm::StringRef Version) { SymbolCollector::Options CollectorOpts; CollectorOpts.CollectIncludePath = true; CollectorOpts.Includes = &Includes; @@ -85,6 +95,142 @@ std::move(Relations)); } +// Resolves URI to file paths with cache. +class URIToFileCache { +public: + URIToFileCache(PathRef HintPath) : HintPath(HintPath) {} + + llvm::StringRef operator[](llvm::StringRef FileURI) { + if (FileURI.empty()) + return ""; + auto I = URIToPathCache.try_emplace(FileURI); + if (I.second) { + I.first->second = llvm::cantFail(URI::resolve(FileURI, HintPath), + "Failed to resolve URI"); + } + return I.first->second; + } + +private: + PathRef HintPath; + llvm::StringMap URIToPathCache; +}; + +// We keep only the node "U" and its edges. Any node other than "U" will be +// empty in the resultant graph. +IncludeGraph getSubGraph(llvm::StringRef URI, const IncludeGraph &FullGraph) { + IncludeGraph IG; + + auto Entry = IG.try_emplace(URI).first; + auto &Node = Entry->getValue(); + Node = FullGraph.lookup(Entry->getKey()); + Node.URI = Entry->getKey(); + + // URIs inside nodes must point into the keys of the same IncludeGraph. + for (auto &Include : Node.DirectIncludes) { + auto I = IG.try_emplace(Include).first; + I->getValue().URI = I->getKey(); + Include = I->getKey(); + } + return IG; +} +} // namespace + +ShardIndexToFiles::ShardIndexToFiles(IndexFileIn Input, PathRef HintPath) + : Index(std::move(Input)) { + URIToFileCache UriToFile(HintPath); + // Used to build RelationSlabs. + llvm::DenseMap SymbolIDToFile; + + // Attribute each Symbol to both their declaration and definition locations. + if (Index.Symbols) { + for (const auto &S : *Index.Symbols) { + auto File = UriToFile[S.CanonicalDeclaration.FileURI]; + auto It = Shards.try_emplace(File); + It.first->getValue().Symbols.insert(&S); + SymbolIDToFile[S.ID] = &It.first->getValue(); + // Only bother if definition file is different than declaration file. + if (S.Definition && + S.Definition.FileURI != S.CanonicalDeclaration.FileURI) { + auto File = UriToFile[S.Definition.FileURI]; + auto It = Shards.try_emplace(File); + It.first->getValue().Symbols.insert(&S); + } + } + } + // Attribute references into each file they occured in. + if (Index.Refs) { + for (const auto &SymRefs : *Index.Refs) { + for (const auto &R : SymRefs.second) { + auto File = UriToFile[R.Location.FileURI]; + const auto It = Shards.try_emplace(File); + It.first->getValue().Refs.insert(&R); + RefToSymID[&R] = SymRefs.first; + } + } + } + // Attribute relations to the file declaraing their Subject as Object might + // not have been indexed, see SymbolCollector::processRelations for details. + if (Index.Relations) { + for (const auto &R : *Index.Relations) { + auto *File = SymbolIDToFile.lookup(R.Subject); + assert(File && "unknown subject in relation"); + File->Relations.insert(&R); + } + } + // Store only the direct includes of a file in a shard. + if (Index.Sources) { + const auto &FullGraph = *Index.Sources; + for (const auto &It : FullGraph) { + auto File = UriToFile[It.first()]; + auto ShardIt = Shards.try_emplace(File); + ShardIt.first->getValue().IG = getSubGraph(It.first(), FullGraph); + } + } +} +std::vector ShardIndexToFiles::getAllFiles() const { + return {Shards.keys().begin(), Shards.keys().end()}; +} + +SymbolSlab ShardIndexToFiles::buildSymbolSlab(PathRef File) const { + auto It = Shards.find(File); + assert(It != Shards.end() && "received unknown file"); + + SymbolSlab::Builder B; + for (const auto *S : It->getValue().Symbols) + B.insert(*S); + return std::move(B).build(); +} + +RefSlab ShardIndexToFiles::buildRefSlab(PathRef File) const { + auto It = Shards.find(File); + assert(It != Shards.end() && "received unknown file"); + + RefSlab::Builder B; + for (const auto *Ref : It->getValue().Refs) { + auto SID = RefToSymID.lookup(Ref); + B.insert(SID, *Ref); + } + return std::move(B).build(); +} + +RelationSlab ShardIndexToFiles::buildRelationSlab(PathRef File) const { + auto It = Shards.find(File); + assert(It != Shards.end() && "received unknown file"); + + RelationSlab::Builder B; + for (const auto *Rel : It->getValue().Relations) { + B.insert(*Rel); + } + return std::move(B).build(); +} + +const IncludeGraph &ShardIndexToFiles::getIncludeGraph(PathRef File) const { + auto It = Shards.find(File); + assert(It != Shards.end() && "received unknown file"); + return It->getValue().IG; +} + SlabTuple indexMainDecls(ParsedAST &AST) { return indexSymbols(AST.getASTContext(), AST.getPreprocessorPtr(), AST.getLocalTopLevelDecls(), &AST.getMacros(), @@ -254,15 +400,23 @@ ASTContext &AST, std::shared_ptr PP, const CanonicalIncludes &Includes) { - auto Slabs = indexHeaderSymbols(Version, AST, std::move(PP), Includes); - PreambleSymbols.update( - Path, std::make_unique(std::move(std::get<0>(Slabs))), - std::make_unique(), - std::make_unique(std::move(std::get<2>(Slabs))), - /*CountReferences=*/false); + IndexFileIn IF; + std::tie(IF.Symbols, std::ignore, IF.Relations) = + indexHeaderSymbols(Version, AST, std::move(PP), Includes); + ShardIndexToFiles ShardedIndex(std::move(IF), Path); + for (PathRef File : ShardedIndex.getAllFiles()) { + PreambleSymbols.update( + File, std::make_unique(ShardedIndex.buildSymbolSlab(File)), + std::make_unique(), + std::make_unique(ShardedIndex.buildRelationSlab(File)), + /*CountReferences=*/false); + } PreambleIndex.reset( PreambleSymbols.buildIndex(UseDex ? IndexType::Heavy : IndexType::Light, DuplicateHandling::PickOne)); + vlog("Build dynamic index for header symbols with estimated memory usage of " + "{0} bytes", + PreambleIndex.estimateMemoryUsage()); } void FileIndex::updateMain(PathRef Path, ParsedAST &AST) { @@ -274,6 +428,9 @@ /*CountReferences=*/true); MainFileIndex.reset( MainFileSymbols.buildIndex(IndexType::Light, DuplicateHandling::Merge)); + vlog("Build dynamic index for main-file symbols with estimated memory usage " + "of {0} bytes", + MainFileIndex.estimateMemoryUsage()); } } // namespace clangd diff --git a/clang-tools-extra/clangd/index/SymbolCollector.cpp b/clang-tools-extra/clangd/index/SymbolCollector.cpp --- a/clang-tools-extra/clangd/index/SymbolCollector.cpp +++ b/clang-tools-extra/clangd/index/SymbolCollector.cpp @@ -283,6 +283,18 @@ if (!ID) return true; + // ND is the canonical (i.e. first) declaration. If it's in the main file + // (which is not a header), then no public declaration was visible, so assume + // it's main-file only. + bool IsMainFileOnly = + SM.isWrittenInMainFile(SM.getExpansionLoc(ND->getBeginLoc())) && + !isHeaderFile(SM.getFileEntryForID(SM.getMainFileID())->getName(), + ASTCtx->getLangOpts()); + // In C, printf is a redecl of an implicit builtin! So check OrigD instead. + if (ASTNode.OrigD->isImplicit() || + !shouldCollectSymbol(*ND, *ASTCtx, Opts, IsMainFileOnly)) + return true; + // Note: we need to process relations for all decl occurrences, including // refs, because the indexing code only populates relations for specific // occurrences. For example, RelationBaseOf is only populated for the @@ -297,17 +309,6 @@ if (IsOnlyRef && !CollectRef) return true; - // ND is the canonical (i.e. first) declaration. If it's in the main file - // (which is not a header), then no public declaration was visible, so assume - // it's main-file only. - bool IsMainFileOnly = - SM.isWrittenInMainFile(SM.getExpansionLoc(ND->getBeginLoc())) && - !isHeaderFile(SM.getFileEntryForID(SM.getMainFileID())->getName(), - ASTCtx->getLangOpts()); - // In C, printf is a redecl of an implicit builtin! So check OrigD instead. - if (ASTNode.OrigD->isImplicit() || - !shouldCollectSymbol(*ND, *ASTCtx, Opts, IsMainFileOnly)) - return true; // Do not store references to main-file symbols. // Unlike other fields, e.g. Symbols (which use spelling locations), we use // file locations for references (as it aligns the behavior of clangd's diff --git a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp --- a/clang-tools-extra/clangd/unittests/FileIndexTests.cpp +++ b/clang-tools-extra/clangd/unittests/FileIndexTests.cpp @@ -151,8 +151,9 @@ File.HeaderFilename = (Basename + ".h").str(); File.HeaderCode = std::string(Code); auto AST = File.build(); - M.updatePreamble(File.Filename, /*Version=*/"null", AST.getASTContext(), - AST.getPreprocessorPtr(), AST.getCanonicalIncludes()); + M.updatePreamble(testPath(File.Filename), /*Version=*/"null", + AST.getASTContext(), AST.getPreprocessorPtr(), + AST.getCanonicalIncludes()); } TEST(FileIndexTest, CustomizedURIScheme) { @@ -393,8 +394,9 @@ TU.HeaderCode = "class A {}; class B : public A {};"; auto AST = TU.build(); FileIndex Index; - Index.updatePreamble(TU.Filename, /*Version=*/"null", AST.getASTContext(), - AST.getPreprocessorPtr(), AST.getCanonicalIncludes()); + Index.updatePreamble(testPath(TU.Filename), /*Version=*/"null", + AST.getASTContext(), AST.getPreprocessorPtr(), + AST.getCanonicalIncludes()); SymbolID A = findSymbol(TU.headerSymbols(), "A").ID; uint32_t Results = 0; RelationsRequest Req; diff --git a/clang-tools-extra/clangd/unittests/TestTU.cpp b/clang-tools-extra/clangd/unittests/TestTU.cpp --- a/clang-tools-extra/clangd/unittests/TestTU.cpp +++ b/clang-tools-extra/clangd/unittests/TestTU.cpp @@ -116,9 +116,10 @@ std::unique_ptr TestTU::index() const { auto AST = build(); auto Idx = std::make_unique(/*UseDex=*/true); - Idx->updatePreamble(Filename, /*Version=*/"null", AST.getASTContext(), - AST.getPreprocessorPtr(), AST.getCanonicalIncludes()); - Idx->updateMain(Filename, AST); + Idx->updatePreamble(testPath(Filename), /*Version=*/"null", + AST.getASTContext(), AST.getPreprocessorPtr(), + AST.getCanonicalIncludes()); + Idx->updateMain(testPath(Filename), AST); return std::move(Idx); }