Index: include/llvm/Transforms/IPO.h =================================================================== --- include/llvm/Transforms/IPO.h +++ include/llvm/Transforms/IPO.h @@ -88,7 +88,7 @@ //===----------------------------------------------------------------------===// /// This pass performs iterative function importing from other modules. -Pass *createFunctionImportPass(FunctionInfoIndex *Index = nullptr); +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr); //===----------------------------------------------------------------------===// /// createFunctionInliningPass - Return a new pass object that uses a heuristic Index: include/llvm/Transforms/IPO/PassManagerBuilder.h =================================================================== --- include/llvm/Transforms/IPO/PassManagerBuilder.h +++ include/llvm/Transforms/IPO/PassManagerBuilder.h @@ -117,7 +117,7 @@ Pass *Inliner; /// The function summary index to use for function importing. - FunctionInfoIndex *FunctionIndex; + const FunctionInfoIndex *FunctionIndex; bool DisableTailCalls; bool DisableUnitAtATime; Index: lib/Transforms/IPO/FunctionImport.cpp =================================================================== --- lib/Transforms/IPO/FunctionImport.cpp +++ lib/Transforms/IPO/FunctionImport.cpp @@ -258,7 +258,7 @@ class FunctionImportPass : public ModulePass { /// Optional function summary index to use for importing, otherwise /// the summary-file option must be specified. 
- FunctionInfoIndex *Index; + const FunctionInfoIndex *Index; public: /// Pass identification, replacement for typeid @@ -269,7 +269,7 @@ return "Function Importing"; } - explicit FunctionImportPass(FunctionInfoIndex *Index = nullptr) + explicit FunctionImportPass(const FunctionInfoIndex *Index = nullptr) : ModulePass(ID), Index(Index) {} bool runOnModule(Module &M) override { @@ -308,7 +308,7 @@ "Summary Based Function Import", false, false) namespace llvm { -Pass *createFunctionImportPass(FunctionInfoIndex *Index = nullptr) { +Pass *createFunctionImportPass(const FunctionInfoIndex *Index = nullptr) { return new FunctionImportPass(Index); } } Index: tools/gold/gold-plugin.cpp =================================================================== --- tools/gold/gold-plugin.cpp +++ tools/gold/gold-plugin.cpp @@ -31,14 +31,15 @@ #include "llvm/IR/Verifier.h" #include "llvm/Linker/Linker.h" #include "llvm/MC/SubtargetFeature.h" -#include "llvm/Object/IRObjectFile.h" #include "llvm/Object/FunctionIndexObjectFile.h" -#include "llvm/Support/raw_ostream.h" +#include "llvm/Object/IRObjectFile.h" #include "llvm/Support/Host.h" #include "llvm/Support/ManagedStatic.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/TargetRegistry.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/thread.h" #include "llvm/Transforms/IPO.h" #include "llvm/Transforms/IPO/PassManagerBuilder.h" #include "llvm/Transforms/Utils/GlobalStatus.h" @@ -94,7 +95,7 @@ static bool generate_api_file = false; static OutputType TheOutputType = OT_NORMAL; static unsigned OptLevel = 2; - static unsigned Parallelism = 1; + static unsigned Parallelism = 0; #ifdef NDEBUG static bool DisableVerify = true; #else @@ -108,6 +109,11 @@ // the information from intermediate files and write a combined // global index for the ThinLTO backends. 
static bool thinlto = false; + // If false, all ThinLTO backend compilations through code gen are performed + // using multiple threads in the gold-plugin, before handing control back to + // gold. If true, exit after creating the combined index, the assumption is + // that the build system will launch the backend processes. + static bool thinlto_index_only = false; // Additional options to pass into the code generator. // Note: This array will contain all plugin options which are not claimed // as plugin exclusive to pass to the code generator. @@ -139,6 +145,8 @@ TheOutputType = OT_DISABLE; } else if (opt == "thinlto") { thinlto = true; + } else if (opt == "thinlto-index-only") { + thinlto_index_only = true; } else if (opt.size() == 2 && opt[0] == 'O') { if (opt[1] < '0' || opt[1] > '3') message(LDPL_FATAL, "Optimization level must be between 0 and 3"); @@ -390,7 +398,7 @@ // If we are doing ThinLTO compilation, don't need to process the symbols. // Later we simply build a combined index file after all files are claimed. 
- if (options::thinlto) + if (options::thinlto && options::thinlto_index_only) return LDPS_OK; for (auto &Sym : Obj->symbols()) { @@ -774,7 +782,8 @@ return Obj.takeModule(); } -static void runLTOPasses(Module &M, TargetMachine &TM) { +static void runLTOPasses(Module &M, TargetMachine &TM, + const FunctionInfoIndex *Index) { M.setDataLayout(TM.createDataLayout()); legacy::PassManager passes; @@ -790,6 +799,7 @@ PMB.LoopVectorize = true; PMB.SLPVectorize = true; PMB.OptLevel = options::OptLevel; + PMB.FunctionIndex = Index; PMB.populateLTOPassManager(passes); passes.run(M); } @@ -802,7 +812,8 @@ WriteBitcodeToFile(&M, OS, /* ShouldPreserveUseListOrder */ true); } -static void codegen(std::unique_ptr<Module> M) { +static void codegen(std::unique_ptr<Module> M, unsigned int MaxThreads, + const FunctionInfoIndex *CombinedIndex = nullptr) { const std::string &TripleStr = M->getTargetTriple(); Triple TheTriple(TripleStr); @@ -839,7 +850,7 @@ TripleStr, options::mcpu, Features.getString(), Options, RelocationModel, CodeModel::Default, CGOptLevel)); - runLTOPasses(*M, *TM); + runLTOPasses(*M, *TM, CombinedIndex); if (options::TheOutputType == options::OT_SAVE_TEMPS) saveBCFile(output_name + ".opt.bc", *M); @@ -850,14 +861,14 @@ else if (options::TheOutputType == options::OT_SAVE_TEMPS) Filename = output_name + ".o"; - std::vector<SmallString<128>> Filenames(options::Parallelism); + std::vector<SmallString<128>> Filenames(MaxThreads); bool TempOutFile = Filename.empty(); { // Open a file descriptor for each backend thread. This is done in a block // so that the output file descriptors are closed before gold opens them. 
std::list<llvm::raw_fd_ostream> OSs; - std::vector<llvm::raw_pwrite_stream *> OSPtrs(options::Parallelism); - for (unsigned I = 0; I != options::Parallelism; ++I) { + std::vector<llvm::raw_pwrite_stream *> OSPtrs(MaxThreads); + for (unsigned I = 0; I != MaxThreads; ++I) { int FD; if (TempOutFile) { std::error_code EC = @@ -867,7 +878,7 @@ EC.message().c_str()); } else { Filenames[I] = Filename; - if (options::Parallelism != 1) + if (MaxThreads != 1) Filenames[I] += utostr(I); std::error_code EC = sys::fs::openFileForWrite(Filenames[I], FD, sys::fs::F_None); @@ -893,6 +904,80 @@ } } +/// Perform the backend on a single module, invoking the LTO and codegen +/// pipelines. +static void ThinLTOBackendThread(claimed_file *F, raw_fd_ostream *ApiFile, + const FunctionInfoIndex &CombinedIndex, + const SmallString<128> &Filename) { + ld_plugin_input_file File; + if (get_input_file(F->handle, &File) != LDPS_OK) + message(LDPL_FATAL, "Failed to get file information"); + + LLVMContext Context; + Context.setDiagnosticHandler(diagnosticHandlerForContext, nullptr, true); + + StringSet<> Dummy; + std::unique_ptr<Module> M = + getModuleForFile(Context, *F, File, ApiFile, Dummy, Dummy); + if (!options::triple.empty()) + M->setTargetTriple(options::triple.c_str()); + else if (M->getTargetTriple().empty()) { + M->setTargetTriple(sys::getDefaultTargetTriple()); + } + + std::unique_ptr<Module> RenamedModule = + renameModuleForThinLTO(M, &CombinedIndex, diagnosticHandler); + if (!RenamedModule) + message(LDPL_FATAL, "Failed to rename module for ThinLTO"); + + if (release_input_file(F->handle) != LDPS_OK) + message(LDPL_FATAL, "Failed to release file information"); + + // We are already running in a thread, don't use split code gen. + codegen(std::move(RenamedModule), 1 /* MaxThreads */, &CombinedIndex); +} + +/// Launch each module's backend pipeline in a separate thread. 
+static void ThinLTOBackends(raw_fd_ostream *ApiFile, + const FunctionInfoIndex &CombinedIndex) { + SmallString<128> Filename; + if (!options::obj_path.empty()) + Filename = options::obj_path; + else if (options::TheOutputType == options::OT_SAVE_TEMPS) + Filename = output_name + ".o"; + + std::vector<claimed_file *> Worklist; + for (claimed_file &F : Modules) + Worklist.push_back(&F); + + unsigned ThreadCount = 0; + unsigned ThreadIndex = 0; + std::vector<std::thread> Threads; + unsigned int MaxThreads = options::Parallelism + ? options::Parallelism + : std::thread::hardware_concurrency(); + + // TODO: Use a thread pool for better parallelism. Otherwise we will wait on + // slow backend threads to finish before launching more threads. + while (!Worklist.empty()) { + claimed_file *F = Worklist.back(); + Worklist.pop_back(); + + Threads.emplace_back(ThinLTOBackendThread, F, ApiFile, + std::ref(CombinedIndex), Filename); + // If we hit the max number of threads, wait for the oldest thread to + // complete before launching another. + if (++ThreadCount >= MaxThreads) + Threads[ThreadIndex++].join(); + } + + // Wait for the remaining threads to complete. + while (ThreadIndex < ThreadCount) { + Threads[ThreadIndex].join(); + ++ThreadIndex; + } +} + /// gold informs us that all symbols have been read. At this point, we use /// get_symbols to see if any of our definitions have been overridden by a /// native object file. Then, perform optimization and codegen. @@ -900,9 +985,6 @@ if (Modules.empty()) return LDPS_OK; - LLVMContext Context; - Context.setDiagnosticHandler(diagnosticHandlerForContext, nullptr, true); - // If we are doing ThinLTO compilation, simply build the combined // function index/summary and emit it. We don't need to parse the modules // and link them in this case. 
@@ -922,6 +1004,9 @@ continue; CombinedIndex.mergeFrom(std::move(Index), ++NextModuleId); + + if (release_input_file(F.handle) != LDPS_OK) + message(LDPL_FATAL, "Failed to release file information"); } std::error_code EC; @@ -933,10 +1018,17 @@ WriteFunctionSummaryToFile(CombinedIndex, OS); OS.close(); - cleanup_hook(); - exit(0); + if (options::thinlto_index_only) { + cleanup_hook(); + exit(0); + } + ThinLTOBackends(ApiFile, CombinedIndex); + return LDPS_OK; } + LLVMContext Context; + Context.setDiagnosticHandler(diagnosticHandlerForContext, nullptr, true); + std::unique_ptr<Module> Combined(new Module("ld-temp.o", Context)); Linker L(*Combined, diagnosticHandler); @@ -991,7 +1083,11 @@ return LDPS_OK; } - codegen(std::move(Combined)); + // TODO: Should this use std::thread::hardware_concurrency() if + // -jobs option was not specified? Currently preserve behavior of + // default parallelism being 1. + unsigned int MaxThreads = options::Parallelism ? options::Parallelism : 1; + codegen(std::move(Combined), MaxThreads); if (!options::extra_library_path.empty() && set_extra_library_path(options::extra_library_path.c_str()) != LDPS_OK)