Index: llvm/lib/LTO/LTO.cpp =================================================================== --- llvm/lib/LTO/LTO.cpp +++ llvm/lib/LTO/LTO.cpp @@ -36,6 +36,7 @@ #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/SplitModule.h" +#include #include using namespace llvm; @@ -743,15 +744,32 @@ // ParallelCodeGenParallelismLevel if an LTO module is present, as tasks 0 // through ParallelCodeGenParallelismLevel-1 are reserved for parallel code // generation partitions. - unsigned Task = RegularLTO.CombinedModule - ? RegularLTO.ParallelCodeGenParallelismLevel - : 0; + unsigned FirstTask = RegularLTO.CombinedModule + ? RegularLTO.ParallelCodeGenParallelismLevel + : 0; unsigned Partition = 1; - for (auto &Mod : ThinLTO.ModuleMap) { - if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first], - ExportLists[Mod.first], - ResolvedODR[Mod.first], ThinLTO.ModuleMap)) + // Compute the order in which we will process the inputs: the rough heuristic + // here is to sort them by size so that the largest modules get scheduled as + // soon as possible. This is purely a compile-time optimization. + std::vector ModulesOrdering; + ModulesOrdering.resize(ThinLTO.ModuleMap.size()); + std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), FirstTask); + std::sort( + ModulesOrdering.begin(), ModulesOrdering.end(), + [&](int LeftIndex, int RightIndex) { + auto LSize = + (ThinLTO.ModuleMap.begin() + LeftIndex)->second.getBufferSize(); + auto RSize = + (ThinLTO.ModuleMap.begin() + RightIndex)->second.getBufferSize(); + return LSize > RSize; + }); + + for (auto &Task : ModulesOrdering) { + auto Mod = ThinLTO.ModuleMap.begin() + Task; + if (Error E = BackendProc->start( + Task, Mod->second, ImportLists[Mod->first], ExportLists[Mod->first], + ResolvedODR[Mod->first], ThinLTO.ModuleMap)) return E; ++Task;