diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -91,6 +91,10 @@
 Expected<std::unique_ptr<ToolOutputFile>>
 setupStatsFile(StringRef StatsFilename);
 
+/// Produces a container ordering for optimal multi-threaded processing. Returns
+/// ordered indices to elements in the input array.
+std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1094,10 +1094,17 @@
   const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries;
 
 public:
-  ThinBackendProc(const Config &Conf, ModuleSummaryIndex &CombinedIndex,
+  enum Kind {
+    InProcessKind,
+    WriteIndexesKind
+  };
+  Kind kind() const { return ThinLtoKind; }
+
+  ThinBackendProc(Kind K, const Config &Conf, ModuleSummaryIndex &CombinedIndex,
                   const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries)
       : Conf(Conf), CombinedIndex(CombinedIndex),
-        ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries) {}
+        ModuleToDefinedGVSummaries(ModuleToDefinedGVSummaries), ThinLtoKind(K) {
+  }
 
   virtual ~ThinBackendProc() {}
   virtual Error start(
@@ -1107,6 +1114,9 @@
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
   virtual Error wait() = 0;
+
+private:
+  const Kind ThinLtoKind;
 };
 
 namespace {
@@ -1126,7 +1136,8 @@
       ThreadPoolStrategy ThinLTOParallelism,
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       AddStreamFn AddStream, NativeObjectCache Cache)
-      : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
+      : ThinBackendProc(InProcessKind, Conf, CombinedIndex,
+                        ModuleToDefinedGVSummaries),
        BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
        Cache(std::move(Cache)) {
     for (auto &Name : CombinedIndex.cfiFunctionDefs())
@@ -1267,7 +1278,8 @@
       const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
       std::string OldPrefix, std::string NewPrefix, bool ShouldEmitImportsFiles,
       raw_fd_ostream *LinkedObjectsFile, lto::IndexWriteCallback OnWrite)
-      : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
+      : ThinBackendProc(WriteIndexesKind, Conf, CombinedIndex,
+                        ModuleToDefinedGVSummaries),
        OldPrefix(OldPrefix), NewPrefix(NewPrefix),
        ShouldEmitImportsFiles(ShouldEmitImportsFiles),
        LinkedObjectsFile(LinkedObjectsFile), OnWrite(OnWrite) {}
@@ -1443,17 +1455,38 @@
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
-  // module and parallel code generation partitions.
-  unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
-  for (auto &Mod : ModuleMap) {
-    if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
-                                     ExportLists[Mod.first],
-                                     ResolvedODR[Mod.first], ThinLTO.ModuleMap))
-      return E;
-    ++Task;
-  }
+  auto ProcessOneModule = [&](int I) -> Error {
+    auto &Mod = *(ModuleMap.begin() + I);
+    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+    // combined module and parallel code generation partitions.
+    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
+                              Mod.second, ImportLists[Mod.first],
+                              ExportLists[Mod.first], ResolvedODR[Mod.first],
+                              ThinLTO.ModuleMap);
+  };
+  if (RegularLTO.ParallelCodeGenParallelismLevel == 1 ||
+      BackendProc->kind() == ThinBackendProc::WriteIndexesKind) {
+    // Process the modules in the order they were provided.
+    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+      if (Error E = ProcessOneModule(I))
+        return E;
+  } else {
+    assert(BackendProc->kind() == ThinBackendProc::InProcessKind);
+    // When executing in parallel, process largest bitsize modules first to
+    // improve parallelism, and avoid starving the thread pool near the end.
+    // This saves about 15 sec on a 36-core machine while linking `clang.exe`
+    // (out of 100 sec).
+    std::vector<BitcodeModule *> ModulesVec;
+    ModulesVec.reserve(ModuleMap.size());
+    for (auto &Mod : ModuleMap)
+      ModulesVec.push_back(&Mod.second);
+    std::vector<int> ModulesOrdering;
+    ModulesOrdering = generateModulesOrdering(ModulesVec);
+    for (int I : ModulesOrdering)
+      if (Error E = ProcessOneModule(I))
+        return E;
+  }
 
   return BackendProc->wait();
 }
@@ -1495,3 +1528,18 @@
   StatsFile->keep();
   return std::move(StatsFile);
 }
+
+// Compute the ordering in which we will process the inputs: the rough
+// heuristic here is to sort them by size so that the largest modules get
+// scheduled as soon as possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+  std::vector<int> ModulesOrdering;
+  ModulesOrdering.resize(R.size());
+  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+    auto LSize = R[LeftIndex]->getBuffer().size();
+    auto RSize = R[RightIndex]->getBuffer().size();
+    return LSize > RSize;
+  });
+  return ModulesOrdering;
+}
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -1054,19 +1054,11 @@
         ModuleToDefinedGVSummaries[ModuleIdentifier];
   }
 
-  // Compute the ordering we will process the inputs: the rough heuristic here
-  // is to sort them per size so that the largest module get schedule as soon as
-  // possible. This is purely a compile-time optimization.
-  std::vector<int> ModulesOrdering;
-  ModulesOrdering.resize(Modules.size());
-  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
-  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
-    auto LSize =
-        Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
-    auto RSize =
-        Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
-    return LSize > RSize;
-  });
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(Modules.size());
+  for (auto &Mod : Modules)
+    ModulesVec.push_back(&Mod->getSingleBitcodeModule());
+  std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
 
   // Parallel optimizer + codegen
   {
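For illustration, here is a minimal standalone sketch of the size-descending ordering heuristic that generateModulesOrdering implements, written against plain buffer sizes instead of BitcodeModule pointers; it is not part of the patch, and the helper name orderBySizeDescending and the sample sizes are invented for this example.

// Standalone sketch of the ordering heuristic: sort indices so the largest
// inputs come first and can be handed to the thread pool before smaller ones.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Returns indices into Sizes, largest element first (hypothetical helper,
// mirroring the shape of lto::generateModulesOrdering).
static std::vector<int>
orderBySizeDescending(const std::vector<std::size_t> &Sizes) {
  std::vector<int> Order(Sizes.size());
  std::iota(Order.begin(), Order.end(), 0);
  std::sort(Order.begin(), Order.end(),
            [&](int L, int R) { return Sizes[L] > Sizes[R]; });
  return Order;
}

int main() {
  // Pretend these are the bitcode buffer sizes of four input modules.
  std::vector<std::size_t> Sizes = {120, 5000, 64, 900};
  for (int I : orderBySizeDescending(Sizes))
    std::cout << "module " << I << " (size " << Sizes[I] << ")\n";
  // Prints modules 1, 3, 0, 2: largest first.
  return 0;
}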