Index: llvm/trunk/include/llvm/CodeGen/ParallelCG.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/ParallelCG.h
+++ llvm/trunk/include/llvm/CodeGen/ParallelCG.h
@@ -0,0 +1,41 @@
+//===-- llvm/CodeGen/ParallelCG.h - Parallel code generation ----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This header declares functions that can be used for parallel code generation.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_PARALLELCG_H
+#define LLVM_CODEGEN_PARALLELCG_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/CodeGen.h"
+
+namespace llvm {
+
+class Module;
+class TargetOptions;
+class raw_pwrite_stream;
+
+/// Split M into OSs.size() partitions, and generate code for each. Writes
+/// OSs.size() object files to the output streams in OSs. The resulting object
+/// files if linked together are intended to be equivalent to the single object
+/// file that would have been code generated from M.
+///
+/// \returns M if OSs.size() == 1, otherwise returns std::unique_ptr<Module>().
+std::unique_ptr<Module>
+splitCodeGen(std::unique_ptr<Module> M, ArrayRef<raw_pwrite_stream *> OSs,
+             StringRef CPU, StringRef Features, const TargetOptions &Options,
+             Reloc::Model RM = Reloc::Default,
+             CodeModel::Model CM = CodeModel::Default,
+             CodeGenOpt::Level OL = CodeGenOpt::Default);
+
+} // namespace llvm
+
+#endif
Index: llvm/trunk/include/llvm/LTO/LTOCodeGenerator.h
===================================================================
--- llvm/trunk/include/llvm/LTO/LTOCodeGenerator.h
+++ llvm/trunk/include/llvm/LTO/LTOCodeGenerator.h
@@ -133,6 +133,12 @@
   // if the compilation was not successful.
   std::unique_ptr<MemoryBuffer> compileOptimized(std::string &errMsg);
 
+  // Compile the merged optimized module into out.size() object files each
+  // representing a linkable partition of the module. If out contains more than
+  // one element, code generation is done in parallel with out.size() threads.
+  // Object files will be written to members of out. Returns true on success.
+  bool compileOptimized(ArrayRef<raw_pwrite_stream *> out, std::string &errMsg);
+
   void setDiagnosticHandler(lto_diagnostic_handler_t, void *);
 
   LLVMContext &getContext() { return Context; }
@@ -140,7 +146,6 @@
 private:
   void initializeLTOPasses();
 
-  bool compileOptimized(raw_pwrite_stream &out, std::string &errMsg);
   bool compileOptimizedToFile(const char **name, std::string &errMsg);
   void applyScopeRestrictions();
   void applyRestriction(GlobalValue &GV, ArrayRef<StringRef> Libcalls,
Index: llvm/trunk/lib/CodeGen/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/CodeGen/CMakeLists.txt
+++ llvm/trunk/lib/CodeGen/CMakeLists.txt
@@ -80,6 +80,7 @@
   OptimizePHIs.cpp
   PHIElimination.cpp
   PHIEliminationUtils.cpp
+  ParallelCG.cpp
   Passes.cpp
   PeepholeOptimizer.cpp
   PostRASchedulerList.cpp
Index: llvm/trunk/lib/CodeGen/ParallelCG.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/ParallelCG.cpp
+++ llvm/trunk/lib/CodeGen/ParallelCG.cpp
@@ -0,0 +1,95 @@
+//===-- ParallelCG.cpp ----------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines functions that can be used for parallel code generation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ParallelCG.h"
+#include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/thread.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SplitModule.h"
+
+using namespace llvm;
+
+static void codegen(Module *M, llvm::raw_pwrite_stream &OS,
+                    const Target *TheTarget, StringRef CPU, StringRef Features,
+                    const TargetOptions &Options, Reloc::Model RM,
+                    CodeModel::Model CM, CodeGenOpt::Level OL) {
+  std::unique_ptr<TargetMachine> TM(TheTarget->createTargetMachine(
+      M->getTargetTriple(), CPU, Features, Options, RM, CM, OL));
+
+  legacy::PassManager CodeGenPasses;
+  if (TM->addPassesToEmitFile(CodeGenPasses, OS,
+                              TargetMachine::CGFT_ObjectFile))
+    report_fatal_error("Failed to setup codegen");
+  CodeGenPasses.run(*M);
+}
+
+std::unique_ptr<Module>
+llvm::splitCodeGen(std::unique_ptr<Module> M,
+                   ArrayRef<llvm::raw_pwrite_stream *> OSs, StringRef CPU,
+                   StringRef Features, const TargetOptions &Options,
+                   Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL) {
+  StringRef TripleStr = M->getTargetTriple();
+  std::string ErrMsg;
+  const Target *TheTarget = TargetRegistry::lookupTarget(TripleStr, ErrMsg);
+  if (!TheTarget)
+    report_fatal_error(Twine("Target not found: ") + ErrMsg);
+
+  if (OSs.size() == 1) {
+    codegen(M.get(), *OSs[0], TheTarget, CPU, Features, Options, RM, CM,
+            OL);
+    return M;
+  }
+
+  std::vector<std::thread> Threads;
+  SplitModule(std::move(M), OSs.size(), [&](std::unique_ptr<Module> MPart) {
+    // We want to clone the module in a new context to multi-thread the codegen.
+    // We do it by serializing partition modules to bitcode (while still on the
+    // main thread, in order to avoid data races) and spinning up new threads
+    // which deserialize the partitions into separate contexts.
+    // FIXME: Provide a more direct way to do this in LLVM.
+    SmallVector<char, 0> BC;
+    raw_svector_ostream BCOS(BC);
+    WriteBitcodeToFile(MPart.get(), BCOS);
+
+    llvm::raw_pwrite_stream *ThreadOS = OSs[Threads.size()];
+    Threads.emplace_back(
+        [TheTarget, CPU, Features, Options, RM, CM, OL,
+         ThreadOS](const SmallVector<char, 0> &BC) {
+          LLVMContext Ctx;
+          ErrorOr<std::unique_ptr<Module>> MOrErr =
+              parseBitcodeFile(MemoryBufferRef(StringRef(BC.data(), BC.size()),
+                                               "<split-module>"),
+                               Ctx);
+          if (!MOrErr)
+            report_fatal_error("Failed to read bitcode");
+          std::unique_ptr<Module> MPartInCtx = std::move(MOrErr.get());
+
+          codegen(MPartInCtx.get(), *ThreadOS, TheTarget, CPU, Features,
+                  Options, RM, CM, OL);
+        },
+        // Pass BC using std::move to ensure that it get moved rather than
+        // copied into the thread's context.
+        std::move(BC));
+  });
+
+  for (std::thread &T : Threads)
+    T.join();
+
+  return {};
+}
Index: llvm/trunk/lib/LTO/LTOCodeGenerator.cpp
===================================================================
--- llvm/trunk/lib/LTO/LTOCodeGenerator.cpp
+++ llvm/trunk/lib/LTO/LTOCodeGenerator.cpp
@@ -18,6 +18,7 @@
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Bitcode/ReaderWriter.h"
+#include "llvm/CodeGen/ParallelCG.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
 #include "llvm/Config/config.h"
 #include "llvm/IR/Constants.h"
@@ -218,7 +219,7 @@
   // generate object file
   tool_output_file objFile(Filename.c_str(), FD);
 
-  bool genResult = compileOptimized(objFile.os(), errMsg);
+  bool genResult = compileOptimized(&objFile.os(), errMsg);
   objFile.os().close();
   if (objFile.os().has_error()) {
     objFile.os().clear_error();
@@ -495,25 +496,26 @@
   return true;
 }
 
-bool LTOCodeGenerator::compileOptimized(raw_pwrite_stream &out,
+bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> out,
                                         std::string &errMsg) {
   if (!this->determineTarget(errMsg))
     return false;
 
-  legacy::PassManager codeGenPasses;
+  legacy::PassManager preCodeGenPasses;
 
   // If the bitcode files contain ARC code and were compiled with optimization,
   // the ObjCARCContractPass must be run, so do it unconditionally here.
-  codeGenPasses.add(createObjCARCContractPass());
+  preCodeGenPasses.add(createObjCARCContractPass());
+  preCodeGenPasses.run(*MergedModule);
 
-  if (TargetMach->addPassesToEmitFile(codeGenPasses, out,
-                                      TargetMachine::CGFT_ObjectFile)) {
-    errMsg = "target file type not supported";
-    return false;
-  }
-
-  // Run the code generator, and write object file
-  codeGenPasses.run(*MergedModule);
+  // Do code generation. We need to preserve the module in case the client calls
+  // writeMergedModules() after compilation, but we only need to allow this at
+  // parallelism level 1. This is achieved by having splitCodeGen return the
+  // original module at parallelism level 1 which we then assign back to
+  // MergedModule.
+  MergedModule =
+      splitCodeGen(std::move(MergedModule), out, MCpu, FeatureStr, Options,
+                   RelocModel, CodeModel::Default, CGOptLevel);
 
   return true;
 }
Index: llvm/trunk/test/LTO/X86/parallel.ll
===================================================================
--- llvm/trunk/test/LTO/X86/parallel.ll
+++ llvm/trunk/test/LTO/X86/parallel.ll
@@ -0,0 +1,22 @@
+; RUN: llvm-as -o %t.bc %s
+; RUN: llvm-lto -exported-symbol=foo -exported-symbol=bar -j2 -o %t.o %t.bc
+; RUN: llvm-nm %t.o.0 | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-nm %t.o.1 | FileCheck --check-prefix=CHECK1 %s
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK0-NOT: bar
+; CHECK0: T foo
+; CHECK0-NOT: bar
+define void @foo() {
+  call void @bar()
+  ret void
+}
+
+; CHECK1-NOT: foo
+; CHECK1: T bar
+; CHECK1-NOT: foo
+define void @bar() {
+  call void @foo()
+  ret void
+}
Index: llvm/trunk/tools/llvm-lto/llvm-lto.cpp
===================================================================
--- llvm/trunk/tools/llvm-lto/llvm-lto.cpp
+++ llvm/trunk/tools/llvm-lto/llvm-lto.cpp
@@ -22,7 +22,9 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/raw_ostream.h"
+#include <list>
 
 using namespace llvm;
 
@@ -77,6 +79,9 @@
     "set-merged-module", cl::init(false),
     cl::desc("Use the first input module as the merged module"));
 
+static cl::opt<unsigned> Parallelism("j", cl::Prefix, cl::init(1),
+                                     cl::desc("Number of backend threads"));
+
 namespace {
 struct ModuleInfo {
   std::vector<bool> CanBeHidden;
@@ -240,24 +245,41 @@
 
   if (!OutputFilename.empty()) {
     std::string ErrorInfo;
-    std::unique_ptr<MemoryBuffer> Code = CodeGen.compile(
-        DisableInline, DisableGVNLoadPRE, DisableLTOVectorization, ErrorInfo);
-    if (!Code) {
-      errs() << argv[0]
-             << ": error compiling the code: " << ErrorInfo << "\n";
+    if (!CodeGen.optimize(DisableInline, DisableGVNLoadPRE,
+                          DisableLTOVectorization, ErrorInfo)) {
+      errs() << argv[0] << ": error optimizing the code: " << ErrorInfo << "\n";
       return 1;
     }
 
-    std::error_code EC;
-    raw_fd_ostream FileStream(OutputFilename, EC, sys::fs::F_None);
-    if (EC) {
-      errs() << argv[0] << ": error opening the file '" << OutputFilename
-             << "': " << EC.message() << "\n";
+    std::list<tool_output_file> OSs;
+    std::vector<raw_pwrite_stream *> OSPtrs;
+    for (unsigned I = 0; I != Parallelism; ++I) {
+      std::string PartFilename = OutputFilename;
+      if (Parallelism != 1)
+        PartFilename += "." + utostr(I);
+      std::error_code EC;
+      OSs.emplace_back(PartFilename, EC, sys::fs::F_None);
+      if (EC) {
+        errs() << argv[0] << ": error opening the file '" << PartFilename
+               << "': " << EC.message() << "\n";
+        return 1;
+      }
+      OSPtrs.push_back(&OSs.back().os());
+    }
+
+    if (!CodeGen.compileOptimized(OSPtrs, ErrorInfo)) {
+      errs() << argv[0] << ": error compiling the code: " << ErrorInfo << "\n";
       return 1;
     }
 
-    FileStream.write(Code->getBufferStart(), Code->getBufferSize());
+    for (tool_output_file &OS : OSs)
+      OS.keep();
   } else {
+    if (Parallelism != 1) {
+      errs() << argv[0] << ": -j must be specified together with -o\n";
+      return 1;
+    }
+
     std::string ErrorInfo;
     const char *OutputName = nullptr;
     if (!CodeGen.compile_to_file(&OutputName, DisableInline,