Index: include/clang/Basic/VirtualFileSystem.h =================================================================== --- include/clang/Basic/VirtualFileSystem.h +++ include/clang/Basic/VirtualFileSystem.h @@ -461,6 +461,20 @@ std::error_code setCurrentWorkingDirectory(const Twine &Path) override; }; +/// Wrap a filesystem to avoid calling status() and open() on missing files. +/// +/// When files are accessed, the FS lists ancestor directories and caches their +/// contents. This allows subsequent status() and open() calls for sibling +/// nonexistent paths to fail without making any syscalls. +/// +/// This is likely to improve performance when a large fraction of open()ed or +/// status()ed files do not exist, and their paths are related. +/// Notably: resolving #included files with a long include path. +/// +/// This wrapper never invalidates cached information, so is only suitable when +/// the underlying FS is assumed to be static. +std::unique_ptr avoidStats(llvm::IntrusiveRefCntPtr); + /// Get a globally unique ID for a virtual file or directory. llvm::sys::fs::UniqueID getNextVirtualUniqueID(); Index: lib/Basic/AvoidStatsVFS.cpp =================================================================== --- /dev/null +++ lib/Basic/AvoidStatsVFS.cpp @@ -0,0 +1,368 @@ +//===- AvoidStatsVFS.cpp - Implements a stat-reducing VFS wrapper ---------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// We implement a VFS wrapper that uses directory listings to prune status() and +// open() operations for files that *do not exist*. +// +// These operations are common in clang because of include paths. +// With an include path of {A, B, ...}, an #include directive results +// in attempts to open or stat {A/foo/bar, B/foo/bar, ...}. +// +// This is expensive because this typically takes one syscall per path, so we +// have O(NumIncludes * IncludePathLen) syscalls. +// +// To optimize this, we can list parent directories such as A/. +// If A only contains {config.h}, attempts to open A/foo/bar, A/foo/baz, A/qux +// can all fail instantly. +// Listing a directory tends to be a single syscall, and in practice most +// missing files can be recognized by looking at the same few directories. +// In practice the number of syscalls is O(NumIncludes + IncludePathLen). +// +// Our constant factor is higher, but this improves performance for large TUs +// with many include paths. +// +//===----------------------------------------------------------------------===// + +#include "clang/Basic/VirtualFileSystem.h" +#include "llvm/ADT/ScopeExit.h" +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Path.h" +#include +#define DEBUG_TYPE "AvoidStatsVFS" + +namespace clang { +namespace vfs { +namespace { +using namespace llvm; +namespace path = llvm::sys::path; +using llvm::sys::fs::file_type; + +class AvoidStatVFS : public ProxyFileSystem { +public: + // In fact we only read a directory once we've wanted its contents several + // times. This avoids a common pattern: + // - we read files under /some/long/path/... + // - we don't read anything else under /some/long/... + // - we readdir(/some/long), which tells us that /some/long/path is a dir + // - we still need to readdir(/some/long/path) to get its contents + // - that also tells us it's a directory, so readdir(/some/long) was useless + // We refuse to do IO to understand each dir until we reach this threshold. + constexpr static unsigned ReadDirThreshold = 2; + + AvoidStatVFS(IntrusiveRefCntPtr Base) + : ProxyFileSystem(std::move(Base)) {} + + llvm::ErrorOr> + openFileForRead(const Twine &Path) override { + LLVM_DEBUG(dump("before open " + Path)); + auto G = make_scope_exit([&] { LLVM_DEBUG(dump("after open " + Path)); }); + + auto NormPath = normalizePath(Path); + { + std::lock_guard Lock(CacheMu); + if (!mayBeFile(NormPath)) + return std::make_error_code(std::errc::no_such_file_or_directory); + } + LLVM_DEBUG(llvm::dbgs() << "passthru open " << Path << "\n"); + auto Result = ProxyFileSystem::openFileForRead(Path); + + std::lock_guard Lock(CacheMu); + recordState(NormPath, Result ? File : MissingOrDir); + return Result; + } + + llvm::ErrorOr status(const Twine &Path) override { + LLVM_DEBUG(dump("before status " + Path)); + auto G = make_scope_exit([&] { LLVM_DEBUG(dump("after status " + Path)); }); + + auto NormPath = normalizePath(Path); + { + std::lock_guard Lock(CacheMu); + if (!mayBeFile(NormPath)) + return std::make_error_code(std::errc::no_such_file_or_directory); + } + LLVM_DEBUG(llvm::dbgs() << "passthru status " << Path << "\n"); + auto Result = ProxyFileSystem::status(Path); + + std::lock_guard Lock(CacheMu); + recordState(NormPath, Result + ? (Result->getType() == file_type::directory_file + ? DirUnknownChildren + : File) + : Missing); + return Result; + } + +private: + // What we know about a path, which may be partial information. + // Values are ordered: higher values have more info and "upgrade" lower ones. + enum PathState { + // Ambiguous states. + Unknown, + MissingOrDir, + MissingOrFile, + // Type known, but not all relevant details. + DirUnknownChildren, + DirUnenumerable, // unknown children, but don't try to enumerate again! + // Complete information. + DirKnownChildren, + File, + Missing, + }; + + // The cache always uses absolute paths with ./ and ../ segments removed. + SmallString<256> normalizePath(const Twine &Path) { + SmallString<256> Result; + Path.toVector(Result); + makeAbsolute(Result); + path::remove_dots(Result, /*remove_dot_dot=*/true); + return Result; + } + + // Helpers used in LLVM_DEBUG() logging. +#ifndef NDEBUG + void dump(const Twine &When) { + llvm::dbgs() << "AvoidStatsVFS " << When << ":\n"; + std::vector> V; + for (const auto& E : Cache) + V.emplace_back(E.getKey(), E.getValue()); + llvm::sort(V.begin(), V.end()); + for (const auto &E : V) + llvm::dbgs() << " - " << E.first << " = " << str(E.second) << "\n"; + } + + const char* str(PathState S) { + switch (S) { + case Unknown: return "Unknown"; + case MissingOrDir: return "MissingOrDir"; + case MissingOrFile: return "MissingOrFile"; + case DirUnknownChildren: return "DirUnknownChildren"; + case DirUnenumerable: return "DirUnenumerable"; + case DirKnownChildren: return "DirKnownChildren"; + case File: return "File"; + case Missing: return "Missing"; + } + } +#endif + + // Returns false if we know NormPath points to a file that, based on cached + // information about the path and its parents. + // May retrieve extra information about parent directories to better answer. + // If the function returns true, NormPath may or may not be a file. + // If OrDir is true, both files and directories are acceptable "files". + // REQUIRES: CacheMu is held. + template + bool mayBeFile(StringRef NormPath) { + // First, just see if we can answer from the cache. + switch (Cache.lookup(NormPath)) { + case Unknown: + case MissingOrFile: + break; + case MissingOrDir: + if (!OrDir) + return false; + break; + case DirUnknownChildren: + case DirUnenumerable: + case DirKnownChildren: + return OrDir; + case File: + return true; + case Missing: + return false; + } + + // Next, maybe we can get an answer based on the parent directory. + auto Parent = path::parent_path(NormPath); + if (Parent.empty()) + return OrDir; // Root is a directory. + // We may need to populate its cache entry. + if (!populateCacheForDir(Parent)) + return true; // No more information. + + switch (Cache.lookup(Parent)) { + case Unknown: + case MissingOrDir: + case DirUnknownChildren: + llvm_unreachable("populateCacheForDir didn't provide enough info"); + case MissingOrFile: + case File: + case Missing: + return false; + case DirUnenumerable: + return true; + case DirKnownChildren: + // The point: we listed the parent, all children are now in cache. + switch (Cache.lookup(NormPath)) { + case MissingOrDir: + case MissingOrFile: + llvm_unreachable("populateCacheForDir didn't provide child info"); + case Unknown: + case Missing: + return false; + case DirUnknownChildren: + case DirUnenumerable: + case DirKnownChildren: + return OrDir; + case File: + return true; + }; + } + } + + // Record new information about a path's state. + // Combines appropriately with existing cached information (won't overwrite + // a more specific state with less specific state). + // REQUIRES: CacheMu is held. + void recordState(StringRef NormPath, PathState State) { + auto& Current = Cache[NormPath]; + // Sherlock Holmes would be proud of this special case. + if ((Current == MissingOrDir && State == MissingOrFile) || + (Current == MissingOrFile && State == MissingOrDir)) { + Current = Missing; + return; + } + Current = std::max(Current, State); + switch (State) { + case Unknown: + case MissingOrDir: + case MissingOrFile: + case Missing: + break; + case DirUnenumerable: + case DirUnknownChildren: + case DirKnownChildren: + case File: + auto Parent = path::parent_path(NormPath); + if (!Parent.empty()) + recordState(Parent, DirUnknownChildren); + break; + } + } + + // Attempts to populate the cache with information about a directory. + // Roughly: if the directory children are not known, readdir() to fill it. + // + // Details are messy: + // - If we know the directory doesn't exist, don't read from it. + // - We may need to populate parent caches in order to know that. + // - We may decline to populate the cache because the directory wasn't + // accessed enough yet. + // + // After returning, exactly one of the following is true: + // - we returned false, and declined to populate the cache + // - Cache[NormPath] indicates NormPath is not a directory + // - Cache[NormPath] == DirKnownChildren, all children are in cache + // - Cache[NormPath] == DirUnenumerable + // + // If AllowIO is false, we *only* attempt to propagate information already in + // cache. Only AllowIO=true calls count towards the ReadDirThreshold. + // + // REQUIRES: CacheMu is held. + bool populateCacheForDir(StringRef NormPath, bool AllowIO = true) { + // First, just see if we have any work to do. + bool ParentMightHelp = true; + switch (Cache.lookup(NormPath)) { + case Unknown: + case MissingOrDir: + break; // Need to populate cache with more info. + case DirUnknownChildren: + ParentMightHelp = false; // Listing parent won't detail our children. + break; + case MissingOrFile: + case DirUnenumerable: + case DirKnownChildren: + case File: + case Missing: + return true; // Cache already satisfies postcondition. + } + if (AllowIO && ++ReadDirAttempts[NormPath] < ReadDirThreshold) + AllowIO = false; + // Next, populate parent info and see if that determines our state. + auto Parent = path::parent_path(NormPath); + if (ParentMightHelp && !Parent.empty() && + populateCacheForDir(Parent, AllowIO)) { + switch (Cache.lookup(Parent)) { + case Unknown: + case MissingOrDir: + case DirUnknownChildren: + llvm_unreachable("populateCacheForDir didn't provide enough info"); + case MissingOrFile: + case File: + case Missing: + // Child can't exist if parent is not a directory. + recordState(NormPath, Missing); + return true; + case DirUnenumerable: + break; // No info about child, need to read it. + case DirKnownChildren: + // Parent is a directory, and we can tell whether child is. + switch (Cache.lookup(NormPath)) { + case MissingOrDir: + case MissingOrFile: + llvm_unreachable("populateCacheForDir didn't provide child info"); + case Unknown: + recordState(NormPath, Missing); + return true; + case DirUnknownChildren: + break; // Need to list children. + case DirUnenumerable: + case DirKnownChildren: + // populateCacheForDir shouldn't do this, but we might be racing. + return true; + case File: + case Missing: + return true; // Cache now satisfies postcondition. + } + break; + } + } + + // No other sources of info, we have to list the directory. + if (!AllowIO) + return false; + // Unlock around IO - no accessing the cache! + CacheMu.unlock(); + LLVM_DEBUG(llvm::dbgs() << "synthetic readdir " << NormPath << "\n"); + std::vector> DirListing; + std::error_code EC; + for (auto It = dir_begin(NormPath, EC); + !EC && It != directory_iterator();) { + DirListing.emplace_back(It->path(), It->type()); + It.increment(EC); + if (DirListing.size() == 100) + EC = std::make_error_code(std::errc::result_out_of_range); + } + CacheMu.lock(); + + // Record the results of the directory listing. + for (const auto& Entry : DirListing) + recordState(Entry.first, Entry.second == file_type::directory_file + ? DirUnknownChildren + : File); + recordState(NormPath, + EC ? DirListing.empty() ? MissingOrFile : DirUnenumerable + : DirKnownChildren); + return true; + } + + std::mutex CacheMu; + StringMap Cache; + StringMap ReadDirAttempts; +}; +} // namespace + +std::unique_ptr +avoidStats(llvm::IntrusiveRefCntPtr FS) { + return llvm::make_unique(std::move(FS)); +} + +} // namespace vfs +} // namespace clang Index: lib/Basic/CMakeLists.txt =================================================================== --- lib/Basic/CMakeLists.txt +++ lib/Basic/CMakeLists.txt @@ -46,6 +46,7 @@ add_clang_library(clangBasic Attributes.cpp + AvoidStatsVFS.cpp Builtins.cpp CharInfo.cpp Cuda.cpp Index: lib/Basic/VirtualFileSystem.cpp =================================================================== --- lib/Basic/VirtualFileSystem.cpp +++ lib/Basic/VirtualFileSystem.cpp @@ -303,11 +303,6 @@ return llvm::sys::fs::real_path(Path, Output); } -IntrusiveRefCntPtr vfs::getRealFileSystem() { - static IntrusiveRefCntPtr FS = new RealFileSystem(); - return FS; -} - namespace { class RealFSDirIter : public clang::vfs::detail::DirIterImpl { @@ -2141,3 +2136,9 @@ return *this; } + +IntrusiveRefCntPtr vfs::getRealFileSystem() { + static IntrusiveRefCntPtr FS( + avoidStats(new RealFileSystem()).release()); + return FS; +} Index: unittests/Basic/VirtualFileSystemTest.cpp =================================================================== --- unittests/Basic/VirtualFileSystemTest.cpp +++ unittests/Basic/VirtualFileSystemTest.cpp @@ -1616,3 +1616,139 @@ EXPECT_EQ(3, NumDiagnostics); } + +class AvoidStatsVFSTest : public ::testing::Test { + protected: + struct Counts { + unsigned Status = 0; + unsigned OpenFile = 0; + unsigned ListDir = 0; + }; + Counts Outer, Inner; + IntrusiveRefCntPtr MemFS; + + IntrusiveRefCntPtr reset() { + Outer = {}; + Inner = {}; + MemFS = new vfs::InMemoryFileSystem(); + return new CountingFS( + Outer, vfs::avoidStats(new CountingFS(Inner, MemFS)).release()); + } + + void touch(const Twine &Path) { + MemFS->addFile(Path, 0, MemoryBuffer::getMemBuffer("")); + } + + private: + class CountingFS : public vfs::ProxyFileSystem { + Counts &C; + + public: + CountingFS(Counts &C, IntrusiveRefCntPtr Base) + : ProxyFileSystem(Base), C(C) {} + + llvm::ErrorOr> + openFileForRead(const Twine &Path) override { + ++C.OpenFile; + return ProxyFileSystem::openFileForRead(Path); + } + + vfs::directory_iterator dir_begin(const Twine &Dir, + std::error_code &EC) override { + ++C.ListDir; + return ProxyFileSystem::dir_begin(Dir, EC); + } + + llvm::ErrorOr status(const Twine &Path) override { + ++C.Status; + return ProxyFileSystem::status(Path); + } + }; +}; + +TEST_F(AvoidStatsVFSTest, MissingFileNoStat) { + auto FS = reset(); + touch("/foo/a"); + touch("/foo/c"); + touch("/foo/f"); + + EXPECT_FALSE(FS->status("/foo/a").getError()); // Just stat, /foo wanted once. + EXPECT_TRUE(FS->status("/foo/b").getError()); // List /foo, b known missing + EXPECT_FALSE(FS->status("/foo/c").getError()); // c exists, must stat + EXPECT_TRUE(FS->status("/foo/d").getError()); // d known missing + EXPECT_TRUE(FS->openFileForRead("/foo/e").getError()); // e known missing + EXPECT_FALSE(FS->openFileForRead("/foo/f").getError()); // f exists, must open + + EXPECT_EQ(Outer.Status, 4u); + EXPECT_EQ(Outer.OpenFile, 2u); + EXPECT_EQ(Outer.ListDir, 0u); + EXPECT_EQ(Inner.Status, 2u); + EXPECT_EQ(Inner.OpenFile, 1u); + EXPECT_EQ(Inner.ListDir, 1u); +} + +TEST_F(AvoidStatsVFSTest, ParentDirsMissing) { + auto FS = reset(); + touch("/a/b/z"); + + EXPECT_TRUE(FS->status("/a/b/1").getError()); // Just stat. + EXPECT_TRUE(FS->status("/a/b/2").getError()); // List /a/b + EXPECT_TRUE(FS->status("/a/b/3").getError()); // Known missing + EXPECT_TRUE(FS->status("/a/b/c/d").getError()); // Known missing + EXPECT_TRUE(FS->status("/a/b/z/d").getError()); // Parent is a file + EXPECT_EQ(Outer.Status, 5u); + EXPECT_EQ(Outer.ListDir, 0u); + EXPECT_EQ(Inner.Status, 1u); + EXPECT_EQ(Inner.ListDir, 1u); +} + +TEST_F(AvoidStatsVFSTest, HugeDir) { + auto FS = reset(); + for (int I = 0; I < 10000; ++I) + touch("/big/" + Twine(I)); + + EXPECT_FALSE(FS->status("/big/1").getError()); // Just stat. + EXPECT_FALSE(FS->status("/big/9998").getError()); // Try to list, fail, stat. + EXPECT_FALSE(FS->status("/big/9999").getError()); // stat, don't list + EXPECT_TRUE(FS->status("/big/10001").getError()); + EXPECT_TRUE(FS->status("/big/x/10001").getError()); + EXPECT_TRUE(FS->status("/big/1/10001").getError()); // missing parent in cache + + EXPECT_EQ(Outer.Status, 6u); + EXPECT_EQ(Outer.ListDir, 0u); + EXPECT_EQ(Inner.Status, 5u); + EXPECT_EQ(Inner.ListDir, 1u); +} + +TEST_F(AvoidStatsVFSTest, Duplicate) { + // Stat + stat on a missing file is one operation. + auto FS = reset(); + EXPECT_TRUE(FS->status("/x").getError()); // Just stat. + EXPECT_TRUE(FS->status("/x").getError()); // No need to stat or list again. + EXPECT_EQ(Inner.Status, 1u); + EXPECT_EQ(Inner.ListDir, 0u); + + // Same with open + open. + FS = reset(); + EXPECT_TRUE(FS->openFileForRead("/x").getError()); + EXPECT_TRUE(FS->openFileForRead("/x").getError()); + EXPECT_EQ(Inner.OpenFile, 1u); + EXPECT_EQ(Inner.ListDir, 0u); + + // And stat + open. + FS = reset(); + EXPECT_TRUE(FS->status("/x").getError()); + EXPECT_TRUE(FS->openFileForRead("/x").getError()); + EXPECT_EQ(Inner.Status, 1u); + EXPECT_EQ(Inner.OpenFile, 0u); + EXPECT_EQ(Inner.ListDir, 0u); + + // But open + stat is two: as far as open() knows, it might be a dir. + // We list the directory (second attempt) and see that it's not. + FS = reset(); + EXPECT_TRUE(FS->openFileForRead("/x").getError()); + EXPECT_TRUE(FS->status("/x").getError()); + EXPECT_EQ(Inner.OpenFile, 1u); + EXPECT_EQ(Inner.Status, 0u); + EXPECT_EQ(Inner.ListDir, 1u); +}