diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -12,6 +12,7 @@
 #include "InputSection.h"
 #include "LinkerScript.h"
 #include "lld/Common/LLVM.h"
+#include "llvm/Support/Parallel.h"
 #include <array>
@@ -105,7 +106,8 @@
   bool relro = false;
 
   void finalize();
-  template <class ELFT> void writeTo(uint8_t *buf);
+  template <class ELFT>
+  void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
   // Check that the addends for dynamic relocations were written correctly.
   void checkDynRelAddends(const uint8_t *bufStart);
   template <class ELFT> void maybeCompress();
@@ -115,6 +117,8 @@
   void sortCtorsDtors();
 
 private:
+  SmallVector<InputSection *, 0> storage;
+
   // Used for implementation of --compress-debug-sections option.
   CompressedData compressed;
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -332,7 +332,10 @@
   // Write uncompressed data to a temporary zero-initialized buffer.
   auto buf = std::make_unique<uint8_t[]>(size);
-  writeTo<ELFT>(buf.get());
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
 
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
@@ -386,7 +389,8 @@
     llvm_unreachable("unsupported Size argument");
 }
 
-template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
+template <class ELFT>
+void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   llvm::TimeTraceScope timeScope("Write sections", name);
   if (type == SHT_NOBITS)
     return;
@@ -419,14 +423,13 @@
   }
 
   // Write leading padding.
-  SmallVector<InputSection *, 0> storage;
   ArrayRef<InputSection *> sections = getInputSections(*this, storage);
   std::array<uint8_t, 4> filler = getFiller();
   bool nonZeroFiller = read32(filler.data()) != 0;
   if (nonZeroFiller)
     fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
 
-  parallelFor(0, sections.size(), [&](size_t i) {
+  std::function<void(size_t)> fn = [=](size_t i) {
     InputSection *isec = sections[i];
     if (auto *s = dyn_cast<SyntheticSection>(isec))
       s->writeTo(buf + isec->outSecOff);
@@ -447,13 +450,25 @@
       } else
         fill(start, end - start, filler);
     }
-  });
+  };
 
   // Linker scripts may have BYTE()-family commands with which you
   // can write arbitrary bytes to the output. Process them if any.
+  bool written = false;
   for (SectionCommand *cmd : commands)
-    if (auto *data = dyn_cast<ByteCommand>(cmd))
+    if (auto *data = dyn_cast<ByteCommand>(cmd)) {
+      if (!std::exchange(written, true))
+        parallelFor(0, sections.size(), fn);
       writeInt(buf + data->offset, data->expression().getValue(), data->size);
+    }
+  if (written)
+    return;
+
+  // There is no data command. Write content asynchronously to overlap the write
+  // time with other output sections. Note, if a linker script specifies
+  // overlapping output sections (usually broken), the output may be
+  // non-deterministic.
+  asyncParallelFor(tg, 128, 0, sections.size(), fn);
 }
 
 static void finalizeShtGroup(OutputSection *os, InputSection *section) {
@@ -673,10 +688,14 @@
 template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
 
-template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
+template void OutputSection::writeTo<ELF32LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF32BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
 
 template void OutputSection::maybeCompress<ELF32LE>();
 template void OutputSection::maybeCompress<ELF32BE>();
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2834,9 +2834,10 @@
 }
 
 template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
+  parallel::TaskGroup tg;
   for (OutputSection *sec : outputSections)
     if (sec->flags & SHF_ALLOC)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+      sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
 }
 
 static void fillTrap(uint8_t *i, uint8_t *end) {
@@ -2879,16 +2880,21 @@
 template <class ELFT> void Writer<ELFT>::writeSections() {
   llvm::TimeTraceScope timeScope("Write sections");
 
-  // In -r or --emit-relocs mode, write the relocation sections first as in
-  // ELf_Rel targets we might find out that we need to modify the relocated
-  // section while doing it.
-  for (OutputSection *sec : outputSections)
-    if (sec->type == SHT_REL || sec->type == SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
-
-  for (OutputSection *sec : outputSections)
-    if (sec->type != SHT_REL && sec->type != SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+  {
+    // In -r or --emit-relocs mode, write the relocation sections first as in
+    // ELf_Rel targets we might find out that we need to modify the relocated
+    // section while doing it.
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type == SHT_REL || sec->type == SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
+  {
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type != SHT_REL && sec->type != SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
 
   // Finally, check that all dynamic relocation addends were written correctly.
   if (config->checkDynamicRelocs && config->writeAddends) {
diff --git a/lld/test/ELF/arm-thumb-interwork-notfunc.s b/lld/test/ELF/arm-thumb-interwork-notfunc.s
--- a/lld/test/ELF/arm-thumb-interwork-notfunc.s
+++ b/lld/test/ELF/arm-thumb-interwork-notfunc.s
@@ -1,6 +1,7 @@
 // REQUIRES: arm
 // RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
+/// Use --threads=1 to make the order of the warnings emitted across sections deterministic.
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
 // RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
 
 .syntax unified
diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s
--- a/lld/test/ELF/hexagon-jump-error.s
+++ b/lld/test/ELF/hexagon-jump-error.s
@@ -1,6 +1,7 @@
 # REQUIRES: hexagon
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
+## Use --threads=1 to make the order of the errors emitted across sections deterministic.
+# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
 
 .globl _start
 .type _start, @function
diff --git a/lld/test/ELF/linkerscript/overlapping-sections.s b/lld/test/ELF/linkerscript/overlapping-sections.s
--- a/lld/test/ELF/linkerscript/overlapping-sections.s
+++ b/lld/test/ELF/linkerscript/overlapping-sections.s
@@ -88,8 +88,8 @@
 # BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
-# Starting here the contents of .sec2 overwrites .sec1:
-# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
+## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
+# BROKEN-OUTPUT-FILE-NEXT: 8040
 
 # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
 # BAD-BOTH-LABEL: Section Headers:
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -30,9 +30,6 @@
 extern ThreadPoolStrategy strategy;
 
 namespace detail {
-
-#if LLVM_ENABLE_THREADS
-
 class Latch {
   uint32_t Count;
   mutable std::mutex Mutex;
@@ -61,9 +58,10 @@
     Cond.wait(lock, [&] { return Count == 0; });
   }
 };
+} // namespace detail
 
 class TaskGroup {
-  Latch L;
+  detail::Latch L;
   bool Parallel;
 
 public:
@@ -75,6 +73,9 @@
   void sync() const { L.sync(); }
 };
 
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
 const ptrdiff_t MinParallelSize = 1024;
 
 /// Inclusive median.
@@ -253,6 +254,11 @@
       [&Fn](auto &&V) { return wrap(Fn(V)); }));
 }
 
+// Spawn iteration tasks to TG, but do not wait for them to finish. When
+// ThreadsRequested == 1, the loop is executed eagerly.
+void asyncParallelFor(parallel::TaskGroup &TG, size_t TaskSize, size_t Begin,
+                      size_t End, std::function<void(size_t)> Fn);
+
 } // namespace llvm
 
 #endif // LLVM_SUPPORT_PARALLEL_H
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -143,6 +143,7 @@
   return Exec.get();
 }
 } // namespace
+} // namespace detail
 
 static std::atomic<int> TaskGroupInstances;
@@ -161,7 +162,7 @@
 void TaskGroup::spawn(std::function<void()> F) {
   if (Parallel) {
     L.inc();
-    Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
+    detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
       F();
       L.dec();
     });
@@ -169,8 +170,6 @@
     F();
   }
 }
-
-} // namespace detail
 } // namespace parallel
 } // namespace llvm
 #endif // LLVM_ENABLE_THREADS
@@ -190,7 +189,7 @@
   if (TaskSize == 0)
     TaskSize = 1;
 
-  parallel::detail::TaskGroup TG;
+  parallel::TaskGroup TG;
   for (; Begin + TaskSize < End; Begin += TaskSize) {
     TG.spawn([=, &Fn] {
       for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
@@ -206,3 +205,23 @@
   for (; Begin != End; ++Begin)
     Fn(Begin);
 }
+
+void llvm::asyncParallelFor(parallel::TaskGroup &TG, size_t TaskSize,
+                            size_t Begin, size_t End,
+                            std::function<void(size_t)> Fn) {
+  // With one thread, run the loop eagerly.
+  if (parallel::strategy.ThreadsRequested == 1) {
+    for (size_t I = Begin; I != End; ++I)
+      Fn(I);
+    return;
+  }
+
+  while (Begin < End) {
+    size_t Next = std::min(Begin + TaskSize, End);
+    TG.spawn([=] {
+      for (size_t I = Begin; I != Next; ++I)
+        Fn(I);
+    });
+    Begin = Next;
+  }
+}
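
Not part of the patch: a minimal standalone sketch of how the asyncParallelFor interface above is meant to be used, assuming a tree with this change applied and a program linked against LLVMSupport. Chunks of [Begin, End) are spawned into a caller-owned TaskGroup, so several loops can be in flight at once, and only TaskGroup::sync() or its destructor waits for all of them (the callers in this patch rely on the destructor at the end of the enclosing scope). The buffer names, sizes, and fill values below are made up for illustration.

#include "llvm/Support/Parallel.h"
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  // Stand-ins for two independent output buffers ("sections").
  std::vector<uint8_t> bufA(1 << 20), bufB(1 << 20);
  {
    llvm::parallel::TaskGroup tg;
    // Each call only spawns tasks into tg; neither loop blocks the caller, so
    // the two buffers are filled concurrently.
    llvm::asyncParallelFor(tg, /*TaskSize=*/4096, 0, bufA.size(),
                           [&](size_t i) { bufA[i] = 0xAA; });
    llvm::asyncParallelFor(tg, /*TaskSize=*/4096, 0, bufB.size(),
                           [&](size_t i) { bufB[i] = 0xBB; });
  } // The TaskGroup destructor syncs: all spawned tasks have finished here.
  return (bufA.front() == 0xAA && bufB.back() == 0xBB) ? 0 : 1;
}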