Index: ELF/InputSection.h
===================================================================
--- ELF/InputSection.h
+++ ELF/InputSection.h
@@ -160,14 +160,16 @@
 // be found by looking at the next one) and put the hash in a side table.
 struct SectionPiece {
   SectionPiece(size_t Off, bool Live = false)
-      : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {}
-
-  size_t InputOff;
-  ssize_t OutputOff : 8 * sizeof(ssize_t) - 1;
-  size_t Live : 1;
+      : InputOff(Off), Live(Live || !Config->GcSections), OutputOff(-1) {}
+
+  uint64_t InputOff : 40;
+  uint64_t Live : 1;
+  union {
+    int64_t OutputOff;
+    SectionPiece *Next;
+  };
 };
-static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t),
-              "SectionPiece is too big");
+static_assert(sizeof(SectionPiece) == 16, "SectionPiece is too big");
 
 // This corresponds to a SHF_MERGE section of an input file.
 template <class ELFT> class MergeInputSection : public InputSectionBase<ELFT> {
Index: ELF/OutputSections.h
===================================================================
--- ELF/OutputSections.h
+++ ELF/OutputSections.h
@@ -21,6 +21,7 @@
 namespace elf {
 
 class SymbolBody;
+class ParallelStringTableBuilder;
 struct EhSectionPiece;
 template <class ELFT> class EhInputSection;
 template <class ELFT> class InputSection;
@@ -143,8 +144,11 @@
   void finalizeTailMerge();
   void finalizeNoTailMerge();
 
+  ParallelStringTableBuilder *ParallelBuilder = nullptr;
+
   llvm::StringTableBuilder Builder;
   std::vector<MergeInputSection<ELFT> *> Sections;
+  size_t StringAlignment;
 };
 
 struct CieRecord {
Index: ELF/OutputSections.cpp
===================================================================
--- ELF/OutputSections.cpp
+++ ELF/OutputSections.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SHA1.h"
+#include <atomic>
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -466,14 +467,226 @@
   }
 }
 
+// This is a thread-safe string table builder. Internally it uses
+// atomic variables to keep the inserted strings and associated
+// SectionPieces.
+//
+// The internal hash table is an open-addressing one. It doesn't
+// support resizing. Once it becomes full, the behavior is undefined.
+//
+// Just like DenseMap, keys and values are directory stored to buckets
+// as pairs. Originally, keys and values are all null. Keys are
+// pointers to strings.
+//
+// SectionPieces are managed using singly linked list. If a bucket
+// have a value, it has a pointer to a SectionPiece. Other
+// SectionPieces having the same string key can be followed by
+// `Next` pointers of SectionPieces.
+//
+// Once you added all strings, you need to call finalize() to fix
+// string table contents. When finalize() is called, hash table
+// contents is nondeterministic because no one knows which thread
+// have claimed earlier buckets in the hash table.
+//
+// Thus, the next step is to make it deterministic. For a streak of
+// buckets having some values, we sort them. No matter what order
+// multiple threads claim buckets, claimed buckets are claimed, and
+// unclaimed buckets remain unclaimed. Therefore, by sortint buckets
+// for each streak, we can make the entire hash table deterministic.
+//
+// Finally, we assign offsets to all SectionPieces associated with
+// this hash table.
+class elf::ParallelStringTableBuilder {
+  // Bucket entry type.
+  struct EntryTy {
+    // String key information. String key is guaranteed to be unique
+    // in a hash table.
+    std::atomic<const char *> Str;
+    std::atomic<uint32_t> Size;
+
+    // Offset in the string table. Filled by finalize().
+    uint32_t Offset;
+
+    // SectionPieces having the same string contents are chained
+    // using pointers. This pointer points to the first node.
+    std::atomic<SectionPiece *> Piece;
+
+    EntryTy(const EntryTy &Other) { memcpy(this, &Other, sizeof(EntryTy)); }
+
+    EntryTy &operator=(const EntryTy &Other) {
+      memcpy(this, &Other, sizeof(EntryTy));
+      return *this;
+    }
+
+    // Define a total order for string keys. The detail is not important,
+    // but it needs to be deterministic, so that we can get deterministic
+    // output from the hash table.
+    bool operator<(const EntryTy &Other) const {
+      size_t SizeA = Size.load();
+      size_t SizeB = Other.Size.load();
+      if (SizeA != SizeB)
+        return SizeA < SizeB;
+      return memcmp(Str.load(), Other.Str.load(), SizeA) < 0;
+    };
+  };
+
+  static_assert(sizeof(EntryTy) == 24, "EntryTy is too big");
+
+  size_t align2(size_t Val) { return (Val + Alignment - 1) & ~(Alignment - 1); }
+
+public:
+  ParallelStringTableBuilder(size_t Size, size_t Alignment)
+      : Alignment(Alignment),
+        NumBuckets(std::max<size_t>(PowerOf2Ceil(Size), 16)) {
+    Buckets = (EntryTy *)calloc(NumBuckets, sizeof(EntryTy));
+  }
+
+  ~ParallelStringTableBuilder() { free(Buckets); }
+
+  // Inserts a given section piece to the string table.
+  void insert(SectionPiece &Piece, CachedHashStringRef Str) {
+    assert(Str.size() != 0);
+
+    size_t Start = Str.hash() & (NumBuckets - 1);
+
+    for (size_t I = Start; I != Start - 1; I = (I + 1) & (NumBuckets - 1)) {
+      EntryTy &Bucket = Buckets[I];
+
+      // If the current bucket is empty, claim it.
+      const char *Null = nullptr;
+      if (Bucket.Str.compare_exchange_strong(Null, Str.val().data())) {
+        Bucket.Size.store(Str.size());
+        append(Bucket, Piece);
+        return;
+      }
+
+      // The current bucket contains some string. Its size might not be
+      // written by other thread yet, so try loading until it becomes
+      // observable.
+      const char *OldStr = Bucket.Str.load();
+      uint64_t OldSize = Bucket.Size.load();
+      while (OldSize == 0)
+        OldSize = Bucket.Size.load();
+
+      // If there is an existing key, append Piece to the bucket.
+      // Otherwise, go to next bucket.
+      if (Str.val() == StringRef(OldStr, OldSize)) {
+        append(Bucket, Piece);
+        return;
+      }
+    }
+
+    error("table is full");
+  }
+
+  void finalize() {
+    // Sort buckets to make the hash table contents deterministic.
+    sortBuckets();
+    sortWrapAround();
+
+    // Set an offset in the string table for each bucket.
+    size_t Off = 0;
+    for (size_t I = 0; I < NumBuckets; ++I) {
+      if (size_t Size = Buckets[I].Size.load()) {
+        Off = align2(Off);
+        Buckets[I].Offset = Off;
+        Off += Buckets[I].Size.load();
+      }
+    }
+    StringTableSize = Off;
+
+    // Update SectionPieces' offsets.
+    parallel_for(size_t(0), NumBuckets, [&](size_t I) {
+      if (SectionPiece *Cur = Buckets[I].Piece.load()) {
+        for (SectionPiece *Next = Cur->Next; Next;
+             Cur = Next, Next = Next->Next)
+          Cur->OutputOff = Buckets[I].Offset;
+        Cur->OutputOff = Buckets[I].Offset;
+      }
+    });
+  }
+
+  void writeTo(uint8_t *Buf) {
+    for (size_t I = 0; I < NumBuckets; ++I) {
+      if (const char *Str = Buckets[I].Str.load()) {
+        size_t Size = Buckets[I].Size.load();
+        memcpy(Buf + Buckets[I].Offset, Str, Size);
+      }
+    }
+  }
+
+  size_t size() const { return StringTableSize; }
+
+private:
+  // Atomically append Piece to Bucket.
+  void append(EntryTy &Bucket, SectionPiece &Piece) {
+    for (;;) {
+      Piece.Next = Bucket.Piece.load();
+      if (Bucket.Piece.compare_exchange_strong(Piece.Next, &Piece))
+        return;
+    }
+  }
+
+  // Find runs of buckets having some values and sort them.
+  void sortBuckets() {
+    for (size_t I = 0; I < NumBuckets;) {
+      if (Buckets[I].Size.load() == 0) {
+        ++I;
+        continue;
+      }
+      size_t Begin = I;
+      size_t End = I + 1;
+
+      while (End < NumBuckets && Buckets[End].Size.load())
+        ++End;
+      std::sort(Buckets + Begin, Buckets + End);
+      I = End;
+    }
+  }
+
+  // After the end of the buckets, it wraps around to the beginning,
+  // so we sort them as one unit. sortBuckets() didn't handle this
+  // corner case.
+  void sortWrapAround() {
+    if (Buckets[0].Size.load() == 0 || Buckets[NumBuckets - 1].Size.load() == 0)
+      return;
+
+    size_t First = 0;
+    while (Buckets[First].Size.load())
+      ++First;
+
+    size_t Last = NumBuckets - 1;
+    while (Buckets[Last - 1].Size.load())
+      --Last;
+
+    std::vector<EntryTy> Vec;
+    Vec.insert(Vec.end(), Buckets, Buckets + First);
+    Vec.insert(Vec.end(), Buckets + Last, Buckets + NumBuckets);
+
+    std::sort(Vec.begin(), Vec.end());
+
+    size_t LastSize = NumBuckets - Last;
+    std::copy(Vec.begin(), Vec.begin() + LastSize, Buckets + Last);
+    std::copy(Vec.begin() + LastSize, Vec.end(), Buckets);
+  }
+
+  size_t StringTableSize = 0;
+  size_t Alignment;
+  size_t NumBuckets;
+  EntryTy *Buckets;
+};
+
 template <class ELFT>
 MergeOutputSection<ELFT>::MergeOutputSection(StringRef Name, uint32_t Type,
                                              uintX_t Flags, uintX_t Alignment)
     : OutputSectionBase(Name, Type, Flags),
-      Builder(StringTableBuilder::RAW, Alignment) {}
+      Builder(StringTableBuilder::RAW, Alignment), StringAlignment(Alignment) {}
 
 template <class ELFT> void MergeOutputSection<ELFT>::writeTo(uint8_t *Buf) {
-  Builder.write(Buf);
+  if (ParallelBuilder)
+    ParallelBuilder->writeTo(Buf);
+  else
+    Builder.write(Buf);
 }
 
 template <class ELFT>
@@ -525,10 +738,27 @@
 }
 
 template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
-  if (shouldTailMerge())
+  if (shouldTailMerge()) {
     finalizeTailMerge();
-  else
-    finalizeNoTailMerge();
+    return;
+  }
+
+  size_t NumPieces = 0;
+  for (MergeInputSection<ELFT> *Sec : Sections)
+    NumPieces += Sec->Pieces.size();
+  ParallelBuilder =
+      new ParallelStringTableBuilder(NumPieces / 2, StringAlignment);
+
+  parallel_for_each(
+      Sections.begin(), Sections.end(), [&](MergeInputSection<ELFT> *Sec) {
+        for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+          if (Sec->Pieces[I].Live)
+            ParallelBuilder->insert(Sec->Pieces[I], Sec->getData(I));
+      });
+
+  ParallelBuilder->finalize();
+
+  this->Size = ParallelBuilder->size();
 }
 
 template <class ELFT>