Index: include/clang/Basic/VirtualFileSystem.h =================================================================== --- include/clang/Basic/VirtualFileSystem.h +++ include/clang/Basic/VirtualFileSystem.h @@ -461,6 +461,9 @@ std::error_code setCurrentWorkingDirectory(const Twine &Path) override; }; +// Wrap a filesystem in a cache to avoid stats on missing files. +std::unique_ptr avoidStats(llvm::IntrusiveRefCntPtr); + /// Get a globally unique ID for a virtual file or directory. llvm::sys::fs::UniqueID getNextVirtualUniqueID(); Index: lib/Basic/AvoidStatsVFS.cpp =================================================================== --- /dev/null +++ lib/Basic/AvoidStatsVFS.cpp @@ -0,0 +1,296 @@ +//===- AvoidStatsVFS.cpp - Implements a stat-reducing VFS wrapper ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// We implement a VFS wrapper that uses directory listings to prune status() and +// open() operations for files that *do not exist*. +// +// These operations are common in clang because of include paths. +// With an include path of {A, B, ...}, an #include directive results +// in attempts to open or stat {A/foo/bar, B/foo/bar, ...}. +// +// This is expensive because this typically takes one syscall per path, so we +// have O(NumIncludes * IncludePathLen) syscalls. +// +// To optimize this, we can list parent directories such as A/. +// If A only contains {config.h}, attempts to open A/foo/bar, A/foo/baz, A/qux +// can all fail instantly. +// Listing a directory tends to be a single syscall, and in practice most +// missing files can be recognized by looking at the same few directories. +// In practice the number of syscalls is O(NumIncludes + IncludePathLen). +// +// Our constant factor is higher, but this improves performance for large TUs +// with many include paths. +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/VirtualFileSystem.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/Path.h" +#include + +namespace clang { +namespace vfs { +namespace { +using namespace llvm; +namespace path = llvm::sys::path; +using llvm::sys::fs::file_type; + +class StatLessFS : public ProxyFileSystem { +public: + // In fact we only read a directory once we've wanted its contents several + // times. This avoids a common pattern: + // - we read files under /some/long/path/... + // - we don't read anything else under /some/long/... + // - we readdir(/some/long), which tells us that /some/long/path is a dir + // - we still need to readdir(/some/long/path) to get its contents + // - that also tells us it's a directory, so readdir(/some/long) was useless + constexpr static unsigned ReadDirThreshold = 3; + + StatLessFS(IntrusiveRefCntPtr Base) + : ProxyFileSystem(std::move(Base)) {} + + llvm::ErrorOr> + openFileForRead(const Twine &Path) override { + auto NormPath = normalizePath(Path); + if (!mayBeFile(NormPath)) + return std::make_error_code(std::errc::no_such_file_or_directory); + auto Result = ProxyFileSystem::openFileForRead(Path); + recordState(NormPath, Result ? File : MissingOrDir); + return Result; + } + + llvm::ErrorOr status(const Twine &Path) override { + auto NormPath = normalizePath(Path); + if (!mayBeFile(NormPath)) + return std::make_error_code(std::errc::no_such_file_or_directory); + auto Result = ProxyFileSystem::status(Path); + recordState(NormPath, Result + ? (Result->getType() == file_type::directory_file + ? DirUnknownChildren + : File) + : Missing); + return Result; + } + +private: + // What we know about a path, which may be partial information. + // Values are ordered: higher values have more info and "upgrade" lower ones. + enum PathState { + // Ambiguous states. + Unknown, + MissingOrDir, + MissingOrFile, + // Type known, but not all relevant details. + DirUnknownChildren, + DirUnenumerable, // unknown children, but don't try to enumerate again! + // Complete information. + DirKnownChildren, + File, + Missing, + }; + + template + bool mayBeFile(StringRef NormPath) { + // First, just see if we can answer from the cache. + { + std::lock_guard Lock(CacheMu); + switch (Cache.lookup(NormPath)) { + case Unknown: + case MissingOrFile: + break; + case MissingOrDir: + if (!OrDir) { + llvm::errs() << "hitB1: " << NormPath << "\n"; + return false; + } + break; + case DirUnknownChildren: + case DirUnenumerable: + case DirKnownChildren: + if (!OrDir) llvm::errs() << "hitB1: " << NormPath << "\n"; + return OrDir; + case File: + return true; + case Missing: + llvm::errs() << "hitB1: " << NormPath << "\n"; + return false; + } + } + + // Next, maybe we can get an answer based on the parent directory. + auto Parent = path::parent_path(NormPath); + if (Parent.empty()) + return OrDir; // Root is a directory. + // We may need to populate its cache entry. + if (!populateCacheForDir(Parent)) + return true; // No more information. + + std::lock_guard Lock(CacheMu); + switch (Cache.lookup(Parent)) { + case Unknown: + case MissingOrDir: + case DirUnknownChildren: + llvm_unreachable("populateCacheForDir didn't provide enough info"); + case MissingOrFile: + case File: + case Missing: + llvm::errs() << "hitB2: " << Parent << "\n"; + return false; + case DirUnenumerable: + return true; + case DirKnownChildren: + // The point: we listed the parent, all children are now in cache. + switch (Cache.lookup(NormPath)) { + case MissingOrDir: + case MissingOrFile: + llvm_unreachable("populateCacheForDir didn't provide child info"); + case Unknown: + case Missing: + llvm::errs() << "hitB3: " << Parent << "\n"; + return false; + case DirUnknownChildren: + case DirUnenumerable: + case DirKnownChildren: + return OrDir; + case File: + return true; + }; + } + } + + void recordState(StringRef NormPath, PathState State) { + std::lock_guard Lock(CacheMu); + auto& Current = Cache[NormPath]; + // Sherlock Holmes would be proud of this special case. + if ((Current == MissingOrDir && State == MissingOrFile) || + (Current == MissingOrFile && State == MissingOrDir)) { + Current = Missing; + return; + } + Current = std::max(Current, State); + } + + // Roughly: if the directory children are not known, readdir() to fill it. + // But details are important. If we know the directory doesn't exist, we + // shouldn't read from it. And we must populate parent caches to know that. + // Postcondition: if returning true, cache will indicate either that: + // - NormPath is not a directory + // - NormPath is a directory, and all its children are cached + // - NormPath is a directory whose children can't be listed + bool populateCacheForDir(StringRef NormPath) { + // First, just see if we have any work to do. + { + std::lock_guard Lock(CacheMu); + switch (Cache.lookup(NormPath)) { + case Unknown: + case MissingOrDir: + case DirUnknownChildren: + break; // Need to populate cache with more info. + case MissingOrFile: + case DirUnenumerable: + case DirKnownChildren: + case File: + case Missing: + llvm::errs() << "hitA1: " << NormPath << "\n"; + return true; // Cache already satisfies postcondition. + } + } + // Next, populate parent info and see if that determines our state. + auto Parent = path::parent_path(NormPath); + if (!Parent.empty() && populateCacheForDir(Parent)) { + std::lock_guard Lock(CacheMu); + switch (Cache.lookup(Parent)) { + case Unknown: + case MissingOrDir: + case DirUnknownChildren: + llvm_unreachable("populateCacheForDir didn't provide enough info"); + case MissingOrFile: + case File: + case Missing: + // Child can't exist if parent is not a directory. + Cache[NormPath] = Missing; + return true; + case DirUnenumerable: + break; // No info about child, need to read it. + case DirKnownChildren: + // Parent is a directory, and we can tell whether child is. + switch (Cache.lookup(NormPath)) { + case MissingOrDir: + case MissingOrFile: + llvm_unreachable("populateCacheForDir didn't provide child info"); + case Unknown: + Cache[NormPath] = Missing; + llvm::errs() << "hitA2: " << Parent << "\n"; + return true; + case DirUnknownChildren: + break; // Need to list children. + case DirUnenumerable: + case DirKnownChildren: + // populateCacheForDir shouldn't do this, but we might be racing. + return true; + case File: + case Missing: + llvm::errs() << "hitA2: " << Parent << "\n"; + return true; // Cache now satisfies postcondition. + } + break; + } + } + // Finally, we need to read the directory. + { + std::lock_guard Lock(CacheMu); + if (++ReadDirAttempts[NormPath] < ReadDirThreshold) // not hot enough + return false; + } + std::error_code EC; + llvm::errs() << "Induced readdir " << NormPath << "\n"; + auto It = dir_begin(NormPath, EC); + if (EC) { + recordState(NormPath, MissingOrFile); + return true; + } + for (unsigned I = 0; I < 100; ++I) { + if (It == clang::vfs::directory_iterator()) { // at end of list + recordState(NormPath, DirKnownChildren); + return true; + } + recordState(It->path(), It->type() == file_type::directory_file + ? DirUnknownChildren + : File); + It.increment(EC); + if (EC) { + recordState(NormPath, DirUnenumerable); + return true; + } + } + recordState(NormPath, DirUnenumerable); + return true; + } + + SmallString<256> normalizePath(const Twine &Path) { + SmallString<256> Result; + Path.toVector(Result); + makeAbsolute(Result); + path::remove_dots(Result, /*remove_dot_dot=*/true); + return Result; + } + + std::mutex CacheMu; + StringMap Cache; + StringMap ReadDirAttempts; +}; +} // namespace + +std::unique_ptr +avoidStats(llvm::IntrusiveRefCntPtr FS) { + return llvm::make_unique(std::move(FS)); +} + +} // namespace vfs +} // namespace clang Index: lib/Basic/CMakeLists.txt =================================================================== --- lib/Basic/CMakeLists.txt +++ lib/Basic/CMakeLists.txt @@ -46,6 +46,7 @@ add_clang_library(clangBasic Attributes.cpp + AvoidStatsVFS.cpp Builtins.cpp CharInfo.cpp Cuda.cpp Index: lib/Basic/VirtualFileSystem.cpp =================================================================== --- lib/Basic/VirtualFileSystem.cpp +++ lib/Basic/VirtualFileSystem.cpp @@ -303,11 +303,6 @@ return llvm::sys::fs::real_path(Path, Output); } -IntrusiveRefCntPtr vfs::getRealFileSystem() { - static IntrusiveRefCntPtr FS = new RealFileSystem(); - return FS; -} - namespace { class RealFSDirIter : public clang::vfs::detail::DirIterImpl { @@ -2141,3 +2136,289 @@ return *this; } + +namespace { +class StatFS : public FileSystem { +public: + StatFS(const char *Name, llvm::IntrusiveRefCntPtr FS) + : Name(Name), FS(std::move(FS)) { + llvm::errs() << "created statfs " << Name << "\n"; + } + + llvm::ErrorOr status(const Twine &Path) override { + auto Ret = FS->status(Path); + bump(Ret ? StatusOK : StatusErr); + return Ret; + } + + llvm::ErrorOr> + openFileForRead(const Twine &Path) override { + auto Ret = FS->openFileForRead(Path); + bump(Ret ? OpenOK : OpenErr); + return Ret; + } + + directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override { + bump(DirBegin); + return FS->dir_begin(Dir, EC); + } + + virtual std::error_code setCurrentWorkingDirectory(const Twine &Path) override { + bump(SetCWD); + return FS->setCurrentWorkingDirectory(Path); + } + + virtual llvm::ErrorOr getCurrentWorkingDirectory() const override { + bump(GetCWD); + return FS->getCurrentWorkingDirectory(); + } + + virtual std::error_code + getRealPath(const Twine &Path, SmallVectorImpl &Output) const override { + bump(GetRealPath); + return FS->getRealPath(Path, Output); + } + +private: + void bump(std::atomic &I) const { + ++I; + if (++All % 1 == 0) { + llvm::errs() << "== FILESYSTEM " << Name << " ==\n" + << "Status: " << StatusOK << "+" << StatusErr << "\n" + << "Open: " << OpenOK << "+" << OpenErr << "\n" + << "Dir: " << DirBegin << "\n" + << "GetRealPath: " << GetRealPath << "\n" + << "===========================\n"; + } + } + + const char* Name; + llvm::IntrusiveRefCntPtr FS; + mutable std::atomic StatusOK = {0}, StatusErr = {0}, OpenOK = {0}, + OpenErr = {0}, DirBegin = {0}, GetRealPath = {0}, + SetCWD = {0}, GetCWD = {0}; + mutable std::atomic All= {0}; +}; + +#if 0 +class StatLessFS : public FileSystem { + enum State : uint8_t { + NoInfo, // Initial state. + DirKnownChildren, // Directory, all children's existence is in the cache. + DirTooBig, // Directory, known to be too big to enumerate. + DirUnknownChildren, // Directory, but children are not listed. + File, // Known to be a file. + Missing, // Known to not exist. + MissingOrDir, // May be a directory, or may not exist. + MissingOrFile // May be a file, or may not exist. + }; + mutable llvm::StringMap Cache; + mutable std::mutex Mu; +public: + StatLessFS(llvm::IntrusiveRefCntPtr FS) : FS(std::move(FS)) { + llvm::errs() << "created statlessfs \n"; + } + + bool mayBeFile(StringRef Path, bool AllowDir) const { + llvm::errs() << "Could " << Path << " be a file" + << (AllowDir ? " or dir" : "") << "?\n"; + auto Parent = llvm::sys::path::parent_path(Path); + if (!Parent.empty()) + fillCache(Parent); + std::lock_guard Lock(Mu); + switch (Cache.lookup(Path)) { + case NoInfo: + break; + case File: + case MissingOrFile: + llvm::errs() << "Cache says maybe\n"; + return true; + case Missing: + llvm::errs() << "Cache says no\n"; + return false; + case DirKnownChildren: + case DirUnknownChildren: + case DirTooBig: + case MissingOrDir: + return AllowDir; + llvm::errs() << "Cache says " << (AllowDir ? "maybedir" : "no,dir") + << "\n"; + return false; + } + if (Parent.empty()) + return true; + // We don't know anything about the actual file. Consider the parent... + llvm::errs() << "Cache says nothing. parent is " << int(Cache.lookup(Parent)) << "\n"; + return Cache.lookup(Parent) == DirTooBig; + } + + // Populate cache enough to answer questions about Dir's children. + void fillCache(StringRef Dir) const { + llvm::errs() << "fillcache "< Lock(Mu); + switch (Cache.lookup(Dir)) { + case NoInfo: + case MissingOrDir: + case DirUnknownChildren: + // Still needs to be filled in. + break; + case File: + case Missing: + case MissingOrFile: + case DirKnownChildren: + case DirTooBig: + // all complete here. + return; + } + if (!Parent.empty()) { + switch (Cache.lookup(Parent)) { + case NoInfo: + case File: + case Missing: + case MissingOrDir: + case MissingOrFile: + // Couldn't resolve parent as a dir, so child doesn't exist. + Cache[Dir] = Missing; + return; + case DirKnownChildren: + case DirUnknownChildren: + case DirTooBig: + // Parent is a directory, go ahead and read this dir. + break; + } + } + } + // read directory + std::error_code EC; + llvm::errs() << "Readdir " << Dir << "\n"; + auto It = FS->dir_begin(Dir, EC); + if (EC) { + std::lock_guard Lock(Mu); + Cache[Dir] = MissingOrFile; + return; + } + for (unsigned I = 0; I < 100; ++I) { + if (It == clang::vfs::directory_iterator()) { // at end + std::lock_guard Lock(Mu); + Cache[Dir] = DirKnownChildren; + return; + } + { + std::lock_guard Lock(Mu); + Cache[It->path()] = + It->type() == llvm::sys::fs::file_type::directory_file + ? DirUnknownChildren + : File; + } + It.increment(EC); + if (EC) { + std::lock_guard Lock(Mu); + Cache[Dir] = Missing; + return; + } + } + std::lock_guard Lock(Mu); + Cache[Dir] = DirTooBig; + } + + llvm::ErrorOr status(const Twine &Path) override { + SmallString<128> AbsPath; + Path.toVector(AbsPath); + makeAbsolute(AbsPath); + llvm::sys::path::remove_dots(AbsPath, /*remove_dot_dot=*/true); + if (!mayBeFile(AbsPath, true)) { + bump(StatusCachedMissing); + return std::make_error_code(std::errc::no_such_file_or_directory); + } + auto Ret = FS->status(Path); + { + std::lock_guard Lock(Mu); + auto &State = Cache[AbsPath]; + if (!Ret) { + State = State::Missing; + bump(StatusMissing); + } else if (Ret->isDirectory()) { + State = DirUnknownChildren; + bump(StatusDir); + } else { + State = File; + bump(StatusFile); + } + } + return Ret; + } + + llvm::ErrorOr> + openFileForRead(const Twine &Path) override { + SmallString<128> AbsPath; + Path.toVector(AbsPath); + makeAbsolute(AbsPath); + llvm::sys::path::remove_dots(AbsPath, /*remove_dot_dot=*/true); + if (!mayBeFile(AbsPath, false)) { + bump(OpenCachedMissing); + return std::make_error_code(std::errc::no_such_file_or_directory); + } + auto Ret = FS->openFileForRead(Path); + { + std::lock_guard Lock(Mu); + auto &State = Cache[AbsPath]; + if (!Ret) { + if (!State) + State = MissingOrDir; + bump(OpenErr); + } else { + State = File; + bump(OpenOK); + } + } + return Ret; + } + + directory_iterator dir_begin(const Twine &Dir, std::error_code &EC) override { + return FS->dir_begin(Dir, EC); + } + + virtual std::error_code setCurrentWorkingDirectory(const Twine &Path) override { + return FS->setCurrentWorkingDirectory(Path); + } + + virtual llvm::ErrorOr getCurrentWorkingDirectory() const override { + return FS->getCurrentWorkingDirectory(); + } + + virtual std::error_code + getRealPath(const Twine &Path, SmallVectorImpl &Output) const override { + return FS->getRealPath(Path, Output); + } + +private: + void bump(std::atomic &I) const { + ++I; + if (++All % 1 == 0) { + llvm::errs() << "== StatLessFS ==\n" + << "Status: " << StatusFile << "+" << StatusDir << "+" << StatusMissing << " (" << StatusCachedMissing << ")\n" + << "Open: " << OpenOK << "+" << OpenErr << " (" << OpenCachedMissing << ")\n" + << "===========================\n"; + } + } + + llvm::IntrusiveRefCntPtr FS; + mutable std::atomic StatusCachedMissing = {0}, StatusMissing = {0}, + StatusDir = {0}, StatusFile = {0}, OpenOK = {0}, + OpenErr = {0}, OpenCachedMissing = {0}; + mutable std::atomic All= {0}; +}; +#endif + +} // namespace + +IntrusiveRefCntPtr vfs::getRealFileSystem() { + static IntrusiveRefCntPtr FS = new StatFS( + "outer", avoidStats(new StatFS("Real", new RealFileSystem())).release()); + return FS; +}