Index: lib/Tooling/InterpolatingCompilationDatabase.cpp =================================================================== --- lib/Tooling/InterpolatingCompilationDatabase.cpp +++ lib/Tooling/InterpolatingCompilationDatabase.cpp @@ -217,46 +217,49 @@ } }; -// CommandIndex does the real work: given a filename, it produces the best -// matching TransferableCommand by matching filenames. Basic strategy: +// Given a filename, FileIndex picks the best matching file from the underlying +// DB. This is the proxy file whose CompileCommand will be reused. The +// heuristics incorporate file name, extension, and directory structure. +// Strategy: // - Build indexes of each of the substrings we want to look up by. // These indexes are just sorted lists of the substrings. -// - Forward requests to the inner CDB. If it fails, we must pick a proxy. // - Each criterion corresponds to a range lookup into the index, so we only // need O(log N) string comparisons to determine scores. -// - We then break ties among the candidates with the highest score. -class CommandIndex { +// +// Apart from path proximity signals, also takes file extensions into account +// when scoring the candidates. +class FileIndex { public: - CommandIndex(std::vector AllCommands) - : Commands(std::move(AllCommands)), Strings(Arena) { + FileIndex(std::vector Files) + : OriginalPaths(std::move(Files)), Strings(Arena) { // Sort commands by filename for determinism (index is a tiebreaker later). - llvm::sort( - Commands.begin(), Commands.end(), - [](const TransferableCommand &Left, const TransferableCommand &Right) { - return Left.Cmd.Filename < Right.Cmd.Filename; - }); - for (size_t I = 0; I < Commands.size(); ++I) { - StringRef Path = - Strings.save(StringRef(Commands[I].Cmd.Filename).lower()); - Paths.push_back({Path, I}); + llvm::sort(OriginalPaths.begin(), OriginalPaths.end()); + Paths.reserve(OriginalPaths.size()); + Types.reserve(OriginalPaths.size()); + Stems.reserve(OriginalPaths.size()); + for (size_t I = 0; I < OriginalPaths.size(); ++I) { + StringRef Path = Strings.save(StringRef(OriginalPaths[I]).lower()); + + Paths.emplace_back(Path, I); + Types.push_back(foldType(guessType(Path))); Stems.emplace_back(sys::path::stem(Path), I); auto Dir = ++sys::path::rbegin(Path), DirEnd = sys::path::rend(Path); for (int J = 0; J < DirectorySegmentsIndexed && Dir != DirEnd; ++J, ++Dir) if (Dir->size() > ShortDirectorySegment) // not trivial ones Components.emplace_back(*Dir, I); } - llvm::sort(Paths.begin(), Paths.end()); + // Paths are already sorted at this point. llvm::sort(Stems.begin(), Stems.end()); llvm::sort(Components.begin(), Components.end()); } - bool empty() const { return Commands.empty(); } + bool empty() const { return Paths.empty(); } - // Returns the command that best fits OriginalFilename. - // Candidates with PreferLanguage will be chosen over others (unless it's - // TY_INVALID, or all candidates are bad). - const TransferableCommand &chooseProxy(StringRef OriginalFilename, - types::ID PreferLanguage) const { + // Returns the path for the file that best fits OriginalFilename. + // Candidates with extensions matching PreferLanguage will be chosen over + // others (unless it's TY_INVALID, or all candidates are bad). + StringRef chooseProxy(StringRef OriginalFilename, + types::ID PreferLanguage) const { assert(!empty() && "need at least one candidate!"); std::string Filename = OriginalFilename.lower(); auto Candidates = scoreCandidates(Filename); @@ -264,15 +267,14 @@ pickWinner(Candidates, Filename, PreferLanguage); DEBUG_WITH_TYPE("interpolate", - llvm::dbgs() - << "interpolate: chose " - << Commands[Best.first].Cmd.Filename << " as proxy for " - << OriginalFilename << " preferring " - << (PreferLanguage == types::TY_INVALID - ? "none" - : types::getTypeName(PreferLanguage)) - << " score=" << Best.second << "\n"); - return Commands[Best.first]; + llvm::dbgs() << "interpolate: chose " + << OriginalPaths[Best.first] << " as proxy for " + << OriginalFilename << " preferring " + << (PreferLanguage == types::TY_INVALID + ? "none" + : types::getTypeName(PreferLanguage)) + << " score=" << Best.second << "\n"); + return OriginalPaths[Best.first]; } private: @@ -338,7 +340,7 @@ ScoredCandidate S; S.Index = Candidate.first; S.Preferred = PreferredLanguage == types::TY_INVALID || - PreferredLanguage == Commands[S.Index].Type; + PreferredLanguage == Types[S.Index]; S.Points = Candidate.second; if (!S.Preferred && Best.Preferred) continue; @@ -371,7 +373,7 @@ // If Prefix is true, it's instead the range starting with Key. template ArrayRef - indexLookup(StringRef Key, const std::vector &Idx) const { + indexLookup(StringRef Key, ArrayRef Idx) const { // Use pointers as iteratiors to ease conversion of result to ArrayRef. auto Range = std::equal_range(Idx.data(), Idx.data() + Idx.size(), Key, Less()); @@ -379,8 +381,8 @@ } // Performs a point lookup into a nonempty index, returning a longest match. - SubstringAndIndex - longestMatch(StringRef Key, const std::vector &Idx) const { + SubstringAndIndex longestMatch(StringRef Key, + ArrayRef Idx) const { assert(!Idx.empty()); // Longest substring match will be adjacent to a direct lookup. auto It = @@ -395,22 +397,27 @@ return Prefix > PrevPrefix ? *It : *--It; } - std::vector Commands; // Indexes point into this. + // Original paths, everything else is in lowercase. + std::vector OriginalPaths; BumpPtrAllocator Arena; - StringSaver Strings; + llvm::StringSaver Strings; // Indexes of candidates by certain substrings. // String is lowercase and sorted, index points into OriginalPaths. std::vector Paths; // Full path. + // Lang types obtained by guessing on the corresponding path. I-th element is + // a type for the I-th path. + std::vector Types; std::vector Stems; // Basename, without extension. std::vector Components; // Last path components. }; // The actual CompilationDatabase wrapper delegates to its inner database. -// If no match, looks up a command in CommandIndex and transfers it to the file. +// If no match, looks up a proxy file in FileIndex and transfers its +// command to the requested file. class InterpolatingCompilationDatabase : public CompilationDatabase { public: InterpolatingCompilationDatabase(std::unique_ptr Inner) - : Inner(std::move(Inner)), Index(allCommands()) {} + : Inner(std::move(Inner)), Index(this->Inner->getAllFiles()) {} std::vector getCompileCommands(StringRef Filename) const override { @@ -421,7 +428,11 @@ auto Lang = guessType(Filename, &TypeCertain); if (!TypeCertain) Lang = types::TY_INVALID; - return {Index.chooseProxy(Filename, foldType(Lang)).transferTo(Filename)}; + auto ProxyCommands = + Inner->getCompileCommands(Index.chooseProxy(Filename, foldType(Lang))); + if (ProxyCommands.empty()) + return {}; + return {TransferableCommand(ProxyCommands[0]).transferTo(Filename)}; } std::vector getAllFiles() const override { @@ -433,18 +444,8 @@ } private: - std::vector allCommands() { - std::vector Result; - for (auto Command : Inner->getAllCompileCommands()) { - Result.emplace_back(std::move(Command)); - if (Result.back().Type == types::TY_INVALID) - Result.pop_back(); - } - return Result; - } - std::unique_ptr Inner; - CommandIndex Index; + FileIndex Index; }; } // namespace