Index: tools/gold/gold-plugin.cpp
--- tools/gold/gold-plugin.cpp
+++ tools/gold/gold-plugin.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/thread.h"
 #include "llvm/Transforms/IPO.h"
 #include "llvm/Transforms/IPO/PassManagerBuilder.h"
 #include "llvm/Transforms/Utils/GlobalStatus.h"
@@ -94,7 +95,7 @@
   static bool generate_api_file = false;
   static OutputType TheOutputType = OT_NORMAL;
   static unsigned OptLevel = 2;
-  static unsigned Parallelism = 1;
+  static unsigned Parallelism = 0;
 #ifdef NDEBUG
   static bool DisableVerify = true;
@@ -108,6 +109,11 @@
   // the information from intermediate files and write a combined
   // global index for the ThinLTO backends.
   static bool thinlto = false;
+  // If false, all ThinLTO backend compilations through code gen are performed
+  // using multiple threads in the gold-plugin, before handing control back to
+  // gold. If true, exit after creating the combined index; the assumption is
+  // that the build system will launch the backend processes.
+  static bool thinlto_index_only = false;
   // Additional options to pass into the code generator.
   // Note: This array will contain all plugin options which are not claimed
   // as plugin exclusive to pass to the code generator.
@@ -139,6 +145,8 @@
       TheOutputType = OT_DISABLE;
     } else if (opt == "thinlto") {
       thinlto = true;
+    } else if (opt == "thinlto-index-only") {
+      thinlto_index_only = true;
     } else if (opt.size() == 2 && opt[0] == 'O') {
       if (opt[1] < '0' || opt[1] > '3')
         message(LDPL_FATAL, "Optimization level must be between 0 and 3");
@@ -390,7 +398,7 @@
   // If we are doing ThinLTO compilation, don't need to process the symbols.
   // Later we simply build a combined index file after all files are claimed.
-  if (options::thinlto)
+  if (options::thinlto && options::thinlto_index_only)
     return LDPS_OK;
   for (auto &Sym : Obj->symbols()) {
@@ -774,7 +782,8 @@
   return Obj.takeModule();
-static void runLTOPasses(Module &M, TargetMachine &TM) {
+static void runLTOPasses(Module &M, TargetMachine &TM,
+                         const FunctionInfoIndex *Index) {
   legacy::PassManager passes;
@@ -790,6 +799,7 @@
   PMB.LoopVectorize = true;
   PMB.SLPVectorize = true;
   PMB.OptLevel = options::OptLevel;
+  PMB.FunctionIndex = Index;
@@ -802,7 +812,8 @@
   WriteBitcodeToFile(&M, OS, /* ShouldPreserveUseListOrder */ true);
-static void codegen(std::unique_ptr<Module> M) {
+static void codegen(std::unique_ptr<Module> M, unsigned int MaxThreads,
+                    const FunctionInfoIndex *CombinedIndex = nullptr) {
   const std::string &TripleStr = M->getTargetTriple();
   Triple TheTriple(TripleStr);
@@ -839,7 +850,7 @@
       TripleStr, options::mcpu, Features.getString(), Options, RelocationModel,
       CodeModel::Default, CGOptLevel));
-  runLTOPasses(*M, *TM);
+  runLTOPasses(*M, *TM, CombinedIndex);
   if (options::TheOutputType == options::OT_SAVE_TEMPS)
     saveBCFile(output_name + ".opt.bc", *M);
@@ -850,14 +861,14 @@
   else if (options::TheOutputType == options::OT_SAVE_TEMPS)
     Filename = output_name + ".o";
-  std::vector<SmallString<128>> Filenames(options::Parallelism);
+  std::vector<SmallString<128>> Filenames(MaxThreads);
   bool TempOutFile = Filename.empty();
     // Open a file descriptor for each backend thread. This is done in a block
     // so that the output file descriptors are closed before gold opens them.
     std::list<llvm::raw_fd_ostream> OSs;
-    std::vector<llvm::raw_pwrite_stream *> OSPtrs(options::Parallelism);
-    for (unsigned I = 0; I != options::Parallelism; ++I) {
+    std::vector<llvm::raw_pwrite_stream *> OSPtrs(MaxThreads);
+    for (unsigned I = 0; I != MaxThreads; ++I) {
       int FD;
       if (TempOutFile) {
         std::error_code EC =
@@ -867,7 +878,7 @@
       } else {
         Filenames[I] = Filename;
-        if (options::Parallelism != 1)
+        if (MaxThreads != 1)
           Filenames[I] += utostr(I);
         std::error_code EC =
             sys::fs::openFileForWrite(Filenames[I], FD, sys::fs::F_None);
@@ -893,6 +904,80 @@
+/// Run the ThinLTO backend for one claimed file \p F: load its module, rename
+/// it against \p CombinedIndex, then run the LTO and codegen pipelines on it.
+static void ThinLTOBackendThread(claimed_file *F, raw_fd_ostream *ApiFile,
+                                 const FunctionInfoIndex &CombinedIndex,
+                                 const SmallString<128> &Filename) { // unused
+  ld_plugin_input_file File;
+  if (get_input_file(F->handle, &File) != LDPS_OK)
+    message(LDPL_FATAL, "Failed to get file information");
+  LLVMContext Context;
+  Context.setDiagnosticHandler(diagnosticHandlerForContext, nullptr, true);
+  StringSet<> Dummy; // never read after getModuleForFile()
+  std::unique_ptr<Module> M =
+      getModuleForFile(Context, *F, File, ApiFile, Dummy, Dummy);
+  if (!options::triple.empty())
+    M->setTargetTriple(options::triple.c_str());
+  else if (M->getTargetTriple().empty()) {
+    M->setTargetTriple(sys::getDefaultTargetTriple());
+  }
+  std::unique_ptr<llvm::Module> RenamedModule =
+      renameModuleForThinLTO(M, &CombinedIndex, diagnosticHandler);
+  if (!RenamedModule)
+    message(LDPL_FATAL, "Failed to rename module for ThinLTO");
+  if (release_input_file(F->handle) != LDPS_OK)
+    message(LDPL_FATAL, "Failed to release file information");
+  // We are already running in a backend thread, so don't use split code gen.
+  codegen(std::move(RenamedModule), 1 /* MaxThreads */, &CombinedIndex);
+/// Run ThinLTOBackendThread for each file in Modules, one thread per file.
+static void ThinLTOBackends(raw_fd_ostream *ApiFile,
+                            const FunctionInfoIndex &CombinedIndex) {
+  SmallString<128> Filename;
+  if (!options::obj_path.empty())
+    Filename = options::obj_path;
+  else if (options::TheOutputType == options::OT_SAVE_TEMPS)
+    Filename = output_name + ".o";
+  std::vector<claimed_file *> Worklist;
+  for (claimed_file &F : Modules)
+    Worklist.push_back(&F);
+  unsigned ThreadCount = 0; // threads launched so far
+  unsigned ThreadIndex = 0; // index of the oldest thread not yet joined
+  std::vector<thread> Threads;
+  unsigned int MaxThreads = options::Parallelism
+                                ? options::Parallelism
+                                : std::thread::hardware_concurrency();
+  // TODO: Use a thread pool for better parallelism. Otherwise we will wait on
+  // slow backend threads to finish before launching more threads.
+  while (!Worklist.empty()) {
+    claimed_file *F = Worklist.back(); // taken in reverse Modules order
+    Worklist.pop_back();
+    Threads.emplace_back(ThinLTOBackendThread, F, ApiFile,
+                         std::ref(CombinedIndex), Filename);
+    // Once MaxThreads threads have been launched, join the oldest unjoined
+    // thread before launching another.
+    if (++ThreadCount >= MaxThreads)
+      Threads[ThreadIndex++].join();
+  }
+  // Join the threads still outstanding so all backends finish before return.
+  while (ThreadIndex < ThreadCount) {
+    Threads[ThreadIndex].join();
+    ++ThreadIndex;
+  }
 /// gold informs us that all symbols have been read. At this point, we use
 /// get_symbols to see if any of our definitions have been overridden by a
 /// native object file. Then, perform optimization and codegen.
@@ -933,8 +1018,12 @@
     WriteFunctionSummaryToFile(CombinedIndex, OS);
-    cleanup_hook();
-    exit(0);
+    if (options::thinlto_index_only) {
+      cleanup_hook();
+      exit(0);
+    }
+    ThinLTOBackends(ApiFile, CombinedIndex);
+    return LDPS_OK;
   LLVMContext Context;
@@ -994,7 +1083,11 @@
       return LDPS_OK;
-  codegen(std::move(Combined));
+  // TODO: Should this use std::thread::hardware_concurrency() if
+  // -jobs option was not specified? Currently preserve behavior of
+  // default parallelism being 1.
+  unsigned int MaxThreads = options::Parallelism ? options::Parallelism : 1;
+  codegen(std::move(Combined), MaxThreads);
   if (!options::extra_library_path.empty() &&
       set_extra_library_path(options::extra_library_path.c_str()) != LDPS_OK)