diff --git a/lld/test/COFF/thinlto-module-order.ll b/lld/test/COFF/thinlto-module-order.ll
new file mode 100644
--- /dev/null
+++ b/lld/test/COFF/thinlto-module-order.ll
@@ -0,0 +1,26 @@
+; REQUIRES: x86
+
+; RUN: opt -thinlto-bc %s -o %t1.obj
+; RUN: opt -thinlto-bc %p/Inputs/thinlto.ll -o %t2.obj
+
+; Ensure module re-ordering in LTO::runThinLTO does not affect the processing order.
+
+; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t1.obj %t2.obj
+; RUN: cat %t3 | FileCheck %s --check-prefix=NORMAL
+; NORMAL: thinlto-module-order.ll.tmp1.o
+; NORMAL: thinlto-module-order.ll.tmp2.o
+
+; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t2.obj %t1.obj
+; RUN: cat %t3 | FileCheck %s --check-prefix=REVERSED
+; REVERSED: thinlto-module-order.ll.tmp2.o
+; REVERSED: thinlto-module-order.ll.tmp1.o
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+declare void @g(...)
+
+define void @main() {
+  call void (...) @g()
+  ret void
+}
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -91,6 +91,10 @@
 Expected<std::unique_ptr<ToolOutputFile>>
 setupStatsFile(StringRef StatsFilename);
 
+/// Produces a container ordering for optimal multi-threaded processing. Returns
+/// ordered indices to elements in the input array.
+std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1107,6 +1107,7 @@
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
   virtual Error wait() = 0;
+  virtual unsigned getThreadCount() = 0;
 };
 
 namespace {
@@ -1221,6 +1222,10 @@
     else
      return Error::success();
  }
+
+  unsigned getThreadCount() override {
+    return BackendThreadPool.getThreadCount();
+  }
 };
 } // end anonymous namespace
@@ -1309,6 +1314,10 @@
   }
 
   Error wait() override { return Error::success(); }
+
+  // WriteIndexesThinBackend should always return 1 to prevent module
+  // re-ordering and avoid non-determinism in the final link.
+  unsigned getThreadCount() override { return 1; }
 };
 } // end anonymous namespace
@@ -1443,17 +1452,37 @@
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
-  // module and parallel code generation partitions.
-  unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
-  for (auto &Mod : ModuleMap) {
-    if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
-                                     ExportLists[Mod.first],
-                                     ResolvedODR[Mod.first], ThinLTO.ModuleMap))
-      return E;
-    ++Task;
-  }
+  auto ProcessOneModule = [&](int I) -> Error {
+    auto &Mod = *(ModuleMap.begin() + I);
+    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+    // combined module and parallel code generation partitions.
+    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
+                              Mod.second, ImportLists[Mod.first],
+                              ExportLists[Mod.first], ResolvedODR[Mod.first],
+                              ThinLTO.ModuleMap);
+  };
+  if (BackendProc->getThreadCount() == 1) {
+    // Process the modules in the order they were provided on the command-line.
+    // It is important for this codepath to be used for WriteIndexesThinBackend,
+    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
+    // order as the inputs, which otherwise would affect the final link order.
+    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+      if (Error E = ProcessOneModule(I))
+        return E;
+  } else {
+    // When executing in parallel, process largest bitsize modules first to
+    // improve parallelism, and avoid starving the thread pool near the end.
+    // This saves about 15 sec on a 36-core machine while linking `clang.exe`
+    // (out of 100 sec).
+    std::vector<BitcodeModule *> ModulesVec;
+    ModulesVec.reserve(ModuleMap.size());
+    for (auto &Mod : ModuleMap)
+      ModulesVec.push_back(&Mod.second);
+    for (int I : generateModulesOrdering(ModulesVec))
+      if (Error E = ProcessOneModule(I))
+        return E;
+  }
   return BackendProc->wait();
 }
@@ -1495,3 +1524,18 @@
   StatsFile->keep();
   return std::move(StatsFile);
 }
+
+// Compute the ordering in which we will process the inputs: the rough
+// heuristic here is to sort them by size so that the largest module gets
+// scheduled as soon as possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+  std::vector<int> ModulesOrdering;
+  ModulesOrdering.resize(R.size());
+  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+    auto LSize = R[LeftIndex]->getBuffer().size();
+    auto RSize = R[RightIndex]->getBuffer().size();
+    return LSize > RSize;
+  });
+  return ModulesOrdering;
+}
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -1054,19 +1054,11 @@
     ModuleToDefinedGVSummaries[ModuleIdentifier];
   }
 
-  // Compute the ordering we will process the inputs: the rough heuristic here
-  // is to sort them per size so that the largest module get schedule as soon as
-  // possible. This is purely a compile-time optimization.
-  std::vector<int> ModulesOrdering;
-  ModulesOrdering.resize(Modules.size());
-  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
-  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
-    auto LSize =
-        Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
-    auto RSize =
-        Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
-    return LSize > RSize;
-  });
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(Modules.size());
+  for (auto &Mod : Modules)
+    ModulesVec.push_back(&Mod->getSingleBitcodeModule());
+  std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
 
   // Parallel optimizer + codegen
   {
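As a minimal standalone sketch of the heuristic this patch centralizes in lto::generateModulesOrdering (not part of the patch: the Buf struct, file names, and sizes below are hypothetical, for illustration only), the largest-first ordering amounts to sorting indices by descending buffer size:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Hypothetical stand-in for a bitcode module; only its buffer size matters
// for the scheduling heuristic.
struct Buf {
  const char *Name;
  std::size_t Size;
};

// Returns indices into Bufs ordered largest-first, mirroring the descending
// size sort used by lto::generateModulesOrdering.
static std::vector<int> orderBySizeDescending(const std::vector<Buf> &Bufs) {
  std::vector<int> Order(Bufs.size());
  std::iota(Order.begin(), Order.end(), 0);
  std::sort(Order.begin(), Order.end(),
            [&](int L, int R) { return Bufs[L].Size > Bufs[R].Size; });
  return Order;
}

int main() {
  // Inputs in command-line order; sizes are made up for illustration.
  std::vector<Buf> Bufs = {{"a.obj", 1200}, {"b.obj", 98000}, {"c.obj", 4500}};
  for (int I : orderBySizeDescending(Bufs))
    std::cout << Bufs[I].Name << " (" << Bufs[I].Size << " bytes)\n";
  // Prints b.obj, c.obj, a.obj: the largest module is scheduled first so the
  // thread pool is not starved by a long-running module near the end.
}

The index-only path (WriteIndexesThinBackend, getThreadCount() == 1) deliberately skips this ordering so the emitted LinkedObjectsFile keeps the command-line order and the final link stays deterministic.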