diff --git a/lld/test/COFF/thinlto-module-order.ll b/lld/test/COFF/thinlto-module-order.ll
new file mode 100644
--- /dev/null
+++ b/lld/test/COFF/thinlto-module-order.ll
@@ -0,0 +1,26 @@
+; REQUIRES: x86
+
+; RUN: opt -thinlto-bc %s -o %t1.obj
+; RUN: opt -thinlto-bc %p/Inputs/thinlto.ll -o %t2.obj
+
+; Ensure module re-ordering in LTO::runThinLTO does not affect the processing order.
+
+; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t1.obj %t2.obj
+; RUN: cat %t3 | FileCheck %s --check-prefix=NORMAL
+; NORMAL: thinlto-module-order.ll.tmp1.o
+; NORMAL: thinlto-module-order.ll.tmp2.o
+
+; RUN: lld-link -thinlto-index-only:%t3 /entry:main %t2.obj %t1.obj
+; RUN: cat %t3 | FileCheck %s --check-prefix=REVERSED
+; REVERSED: thinlto-module-order.ll.tmp2.o
+; REVERSED: thinlto-module-order.ll.tmp1.o
+
+target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+declare void @g(...)
+
+define void @main() {
+  call void (...) @g()
+  ret void
+}
diff --git a/llvm/include/llvm/LTO/LTO.h b/llvm/include/llvm/LTO/LTO.h
--- a/llvm/include/llvm/LTO/LTO.h
+++ b/llvm/include/llvm/LTO/LTO.h
@@ -91,6 +91,10 @@
 Expected<std::unique_ptr<ToolOutputFile>>
 setupStatsFile(StringRef StatsFilename);
 
+/// Produces a container ordering for optimal multi-threaded processing. Returns
+/// ordered indices to elements in the input array.
+std::vector<int> generateModulesOrdering(ArrayRef<BitcodeModule *> R);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1107,6 +1107,7 @@
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       MapVector<StringRef, BitcodeModule> &ModuleMap) = 0;
   virtual Error wait() = 0;
+  virtual unsigned getThreadCount() = 0;
 };
 
 namespace {
@@ -1221,6 +1222,10 @@
     else
      return Error::success();
  }
+
+  unsigned getThreadCount() override {
+    return BackendThreadPool.getThreadCount();
+  }
 };
 } // end anonymous namespace
@@ -1309,6 +1314,10 @@
   }
 
   Error wait() override { return Error::success(); }
+
+  // WriteIndexesThinBackend should always return 1 to prevent module
+  // re-ordering and avoid non-determinism in the final link.
+  unsigned getThreadCount() override { return 1; }
 };
 } // end anonymous namespace
@@ -1443,17 +1452,37 @@
   auto &ModuleMap =
       ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
 
-  // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
-  // module and parallel code generation partitions.
-  unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
-  for (auto &Mod : ModuleMap) {
-    if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
-                                     ExportLists[Mod.first],
-                                     ResolvedODR[Mod.first], ThinLTO.ModuleMap))
-      return E;
-    ++Task;
-  }
+  auto ProcessOneModule = [&](int I) -> Error {
+    auto &Mod = *(ModuleMap.begin() + I);
+    // Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for
+    // combined module and parallel code generation partitions.
+    return BackendProc->start(RegularLTO.ParallelCodeGenParallelismLevel + I,
+                              Mod.second, ImportLists[Mod.first],
+                              ExportLists[Mod.first], ResolvedODR[Mod.first],
+                              ThinLTO.ModuleMap);
+  };
+  if (BackendProc->getThreadCount() == 1) {
+    // Process the modules in the order they were provided on the command-line.
+    // It is important for this codepath to be used for WriteIndexesThinBackend,
+    // to ensure the emitted LinkedObjectsFile lists ThinLTO objects in the same
+    // order as the inputs, which otherwise would affect the final link order.
+    for (int I = 0, E = ModuleMap.size(); I != E; ++I)
+      if (Error E = ProcessOneModule(I))
+        return E;
+  } else {
+    // When executing in parallel, process largest bitsize modules first to
+    // improve parallelism, and avoid starving the thread pool near the end.
+    // This saves about 15 sec on a 36-core machine while linking `clang.exe`
+    // (out of 100 sec).
+    std::vector<BitcodeModule *> ModulesVec;
+    ModulesVec.reserve(ModuleMap.size());
+    for (auto &Mod : ModuleMap)
+      ModulesVec.push_back(&Mod.second);
+    for (int I : generateModulesOrdering(ModulesVec))
+      if (Error E = ProcessOneModule(I))
+        return E;
+  }
   return BackendProc->wait();
 }
@@ -1495,3 +1524,18 @@
   StatsFile->keep();
   return std::move(StatsFile);
 }
+
+// Compute the ordering in which we will process the inputs: the rough
+// heuristic here is to sort them by size so that the largest module gets
+// scheduled as soon as possible. This is purely a compile-time optimization.
+std::vector<int> lto::generateModulesOrdering(ArrayRef<BitcodeModule *> R) {
+  std::vector<int> ModulesOrdering;
+  ModulesOrdering.resize(R.size());
+  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
+  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
+    auto LSize = R[LeftIndex]->getBuffer().size();
+    auto RSize = R[RightIndex]->getBuffer().size();
+    return LSize > RSize;
+  });
+  return ModulesOrdering;
+}
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -1054,19 +1054,11 @@
     ModuleToDefinedGVSummaries[ModuleIdentifier];
   }
 
-  // Compute the ordering we will process the inputs: the rough heuristic here
-  // is to sort them per size so that the largest module get schedule as soon as
-  // possible. This is purely a compile-time optimization.
-  std::vector<int> ModulesOrdering;
-  ModulesOrdering.resize(Modules.size());
-  std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
-  llvm::sort(ModulesOrdering, [&](int LeftIndex, int RightIndex) {
-    auto LSize =
-        Modules[LeftIndex]->getSingleBitcodeModule().getBuffer().size();
-    auto RSize =
-        Modules[RightIndex]->getSingleBitcodeModule().getBuffer().size();
-    return LSize > RSize;
-  });
+  std::vector<BitcodeModule *> ModulesVec;
+  ModulesVec.reserve(Modules.size());
+  for (auto &Mod : Modules)
+    ModulesVec.push_back(&Mod->getSingleBitcodeModule());
+  std::vector<int> ModulesOrdering = lto::generateModulesOrdering(ModulesVec);
 
   // Parallel optimizer + codegen
   {
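As a minimal standalone sketch of the heuristic this patch centralizes in lto::generateModulesOrdering (not part of the patch: the Buf struct, file names, and sizes below are hypothetical, for illustration only), the largest-first ordering amounts to sorting indices by descending buffer size:

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <numeric>
#include <vector>

// Hypothetical stand-in for a bitcode module; only its buffer size matters
// for the scheduling heuristic.
struct Buf {
  const char *Name;
  std::size_t Size;
};

// Returns indices into Bufs ordered largest-first, mirroring the descending
// size sort used by lto::generateModulesOrdering.
static std::vector<int> orderBySizeDescending(const std::vector<Buf> &Bufs) {
  std::vector<int> Order(Bufs.size());
  std::iota(Order.begin(), Order.end(), 0);
  std::sort(Order.begin(), Order.end(),
            [&](int L, int R) { return Bufs[L].Size > Bufs[R].Size; });
  return Order;
}

int main() {
  // Inputs in command-line order; sizes are made up for illustration.
  std::vector<Buf> Bufs = {{"a.obj", 1200}, {"b.obj", 98000}, {"c.obj", 4500}};
  for (int I : orderBySizeDescending(Bufs))
    std::cout << Bufs[I].Name << " (" << Bufs[I].Size << " bytes)\n";
  // Prints b.obj, c.obj, a.obj: the largest module is scheduled first so the
  // thread pool is not starved by a long-running module near the end.
}

The index-only path (WriteIndexesThinBackend, getThreadCount() == 1) deliberately skips this ordering so the emitted LinkedObjectsFile keeps the command-line order and the final link stays deterministic.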