Index: ELF/CMakeLists.txt
===================================================================
--- ELF/CMakeLists.txt
+++ ELF/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_public_tablegen_target(ELFOptionsTableGen)
 
 add_lld_library(lldELF
+  Concurrent.cpp
   Driver.cpp
   DriverUtils.cpp
   EhFrame.cpp
Index: ELF/Concurrent.h
===================================================================
--- /dev/null
+++ ELF/Concurrent.h
@@ -0,0 +1,142 @@
+//===- Concurrent.h -------------------------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_CONCURRENT_H
+#define LLD_ELF_CONCURRENT_H
+
+#include "lld/Core/LLVM.h"
+
+// The header must be self-contained: it uses std::atomic, std::vector
+// and memcmp/memcpy below.
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+namespace llvm {
+class CachedHashStringRef;
+}
+
+namespace lld {
+namespace elf {
+struct SectionPiece;
+
+// This is a concurrent lock-free string table builder. Internally
+// it uses atomic variables to keep inserted strings and associated
+// SectionPieces.
+//
+// The reason why we have a special-purpose concurrent hash table in
+// LLD is because we need to process a very large number of mergeable
+// strings. For example, in the final build of clang with debug info,
+// thousands of input sections containing 30 million mergeable strings
+// in total are fed to the linker. It takes a few seconds to merge
+// them in a single thread. This concurrent hash table can uniquify them
+// in a few hundred milliseconds using 40 cores.
+//
+// The internal hash table is an open-addressing one. It doesn't
+// support resizing. Once it becomes full, you need to redo with
+// a larger fresh string table builder.
+//
+// Just like DenseMap, keys and values are directly stored to buckets
+// as pairs. Initially, keys and values are all null. Keys are
+// pointers to strings.
+//
+// SectionPieces are managed using a singly linked list.
If a bucket +// have a value, it has a pointer to a SectionPiece. Other +// SectionPieces having the same string key are chained using `Next` +// pointer of SectionPiece. +// +// Once you added all strings, you need to call finalize() to fix +// string table contents. When finalize() is called, hash table +// contents is nondeterministic because no one knows which thread +// have claimed earlier buckets in the hash table. +// +// Thus, the next step is to make it deterministic. For each streak of +// occupied buckets, we sort that. Recall that this hash table +// resolves conflicts with linear probing. No matter what order +// multiple threads claim buckets, claimed buckets are claimed, and +// unclaimed buckets remain unclaimed. Therefore, by sorting buckets +// for each streak, we can make the entire hash table deterministic. +// +// Finally, we assign offsets to all SectionPieces associated with +// this hash table. +class ConcurrentStringTableBuilder { +public: + ConcurrentStringTableBuilder(size_t EstimatedNumEntries, size_t Alignment); + + void insert(SectionPiece &Piece, llvm::CachedHashStringRef Str); + void finalize(); + bool isFull(); + void writeTo(uint8_t *Buf); + size_t size() const { return StringTableSize; } + +private: + // Bucket entry type. + struct EntryTy { + // std::sort needs these functions. + EntryTy(const EntryTy &Other) { *this = Other; } + + EntryTy &operator=(const EntryTy &Other) { + memcpy(this, &Other, sizeof(EntryTy)); + return *this; + } + + EntryTy() = default; + + // Define a total order for string keys. The detail is not important, + // but it needs to be defined that we can get deterministic output + // from the hash table. + bool operator<(const EntryTy &Other) const { + size_t SizeA = Size.load(); + size_t SizeB = Other.Size.load(); + if (SizeA != SizeB) + return SizeA < SizeB; + return memcmp(Str.load(), Other.Str.load(), SizeA) < 0; + }; + + // String key information. 
String key is guaranteed to be unique + // in a hash table. + std::atomic Str = {nullptr}; + std::atomic Size = {0}; + + // Offset in the string table. Filled by finalize(). + uint32_t Offset = 0; + + // SectionPieces having the same string contents are chained + // using pointers. This pointer points to the first node. + std::atomic Piece = {nullptr}; + }; + + // We could allocate tens of millions of objects of this type, + // so we want to keep it small. + static_assert(sizeof(EntryTy) <= 24, "EntryTy is too big"); + + size_t align2(size_t Val) { return (Val + Alignment - 1) & ~(Alignment - 1); } + void append(EntryTy &Bucket, SectionPiece &Piece); + void sortBuckets(); + void sortWrapAround(); + + // All strings are aligned to this boundary. + size_t Alignment; + + // Filled by finalize(). + size_t NumEntries = 0; + size_t StringTableSize = 0; + + // The number of allocated buckets and buckets. + size_t NumBuckets; + std::vector Buckets; + + // The counter to track number of inserted strings. + // Note that the number is approximate. + std::atomic Counter = {0}; + + // We do not support table resizing. If it becomes almost full, + // we should bail out and redo with a larger table. This bool + // value keeps track of that. + std::atomic IsTableFull = {false}; +}; +} +} + +#endif Index: ELF/Concurrent.cpp =================================================================== --- /dev/null +++ ELF/Concurrent.cpp @@ -0,0 +1,185 @@ +//===- Concurrent.cpp -----------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Concurrent.h" +#include "InputSection.h" +#include "Threads.h" + +#include "llvm/ADT/CachedHashString.h" + +using namespace lld; +using namespace lld::elf; +using namespace llvm; + +// Our target load factor is 0.5. 
+ConcurrentStringTableBuilder::ConcurrentStringTableBuilder(
+    size_t EstimatedNumEntries, size_t Alignment)
+    : Alignment(Alignment), NumBuckets(PowerOf2Ceil(EstimatedNumEntries * 2)),
+      Buckets(NumBuckets) {}
+
+// Inserts a given section piece to the string table.
+void ConcurrentStringTableBuilder::insert(SectionPiece &Piece,
+                                          CachedHashStringRef Str) {
+  if (IsTableFull)
+    return;
+  assert(Str.size() != 0);
+
+  size_t Start = Str.hash() & (NumBuckets - 1);
+
+  // The end sentinel must be masked just like the probe index.
+  // A bare `Start - 1` underflows to SIZE_MAX when Start is 0, and the
+  // masked index could never compare equal to it, so the probe loop
+  // would never terminate.
+  size_t End = (Start + NumBuckets - 1) & (NumBuckets - 1);
+
+  for (size_t I = Start; I != End; I = (I + 1) & (NumBuckets - 1)) {
+    EntryTy &Bucket = Buckets[I];
+
+    // If the current bucket is empty, claim it.
+    const char *Null = nullptr;
+    if (Bucket.Str.compare_exchange_strong(Null, Str.val().data())) {
+      Bucket.Size.store(Str.size());
+      append(Bucket, Piece);
+
+      // Update the counter for inserted items. We do this only in
+      // 1/256 chance to avoid contention, so the counter is an
+      // estimate, but it is deterministic because it does not depend
+      // on any randomness nor have a race condition.
+      if (Str.hash() % 256 != 0)
+        return;
+      size_t Cnt = Counter.fetch_add(256);
+
+      // If 3/4 of the buckets are occupied, we mark this table as full.
+      // An open-addressing table gets exponentially slower when it
+      // approaches load factor 1.0, and 3/4 is a reasonable cutoff.
+      // If we have more items than that, we should bail out early
+      // and redo with a larger table.
+      if (Cnt * 4 > NumBuckets * 3)
+        IsTableFull = true;
+      return;
+    }
+
+    // The current bucket contains some string. Its size might not be
+    // written by the other thread yet, so keep loading until it becomes
+    // observable.
+    const char *OldStr = Bucket.Str.load();
+    uint64_t OldSize = 0;
+    while (OldSize == 0)
+      OldSize = Bucket.Size.load();
+
+    // If the current bucket contains the key that we are looking for,
+    // append Piece to the bucket. Otherwise, go to the next bucket.
+    if (Str.val() == StringRef(OldStr, OldSize)) {
+      append(Bucket, Piece);
+      return;
+    }
+  }
+  IsTableFull = true;
+}
+
+void ConcurrentStringTableBuilder::finalize() {
+  if (IsTableFull)
+    return;
+
+  // Sort buckets to make the hash table contents deterministic.
+  sortBuckets();
+  sortWrapAround();
+
+  // Set an offset in the string table for each bucket.
+  size_t Off = 0;
+  for (size_t I = 0; I < NumBuckets; ++I) {
+    if (size_t Size = Buckets[I].Size.load()) {
+      Off = align2(Off);
+      Buckets[I].Offset = Off;
+      Off += Size;
+      ++NumEntries;
+    }
+  }
+  StringTableSize = Off;
+
+  // Update SectionPieces' offsets. (size_t(0) so that both forLoop
+  // arguments deduce to the same type, matching the call in writeTo.)
+  forLoop(size_t(0), NumBuckets, [&](size_t I) {
+    SectionPiece *Cur = Buckets[I].Piece.load();
+    while (Cur) {
+      // OutputOff and Next are a union, so we need to save Next before
+      // writing to OutputOff.
+      SectionPiece *Next = Cur->Next;
+      Cur->OutputOff = Buckets[I].Offset;
+      Cur = Next;
+    }
+  });
+}
+
+bool ConcurrentStringTableBuilder::isFull() { return IsTableFull; }
+
+void ConcurrentStringTableBuilder::writeTo(uint8_t *Buf) {
+  assert(!isFull());
+
+  forLoop(size_t(0), NumBuckets, [&](size_t I) {
+    if (const char *Str = Buckets[I].Str.load()) {
+      size_t Size = Buckets[I].Size.load();
+      memcpy(Buf + Buckets[I].Offset, Str, Size);
+    }
+  });
+}
+
+// Atomically append Piece to Bucket.
+void ConcurrentStringTableBuilder::append(EntryTy &Bucket,
+                                          SectionPiece &Piece) {
+  for (;;) {
+    Piece.Next = Bucket.Piece.load();
+    if (Bucket.Piece.compare_exchange_weak(Piece.Next, &Piece))
+      return;
+  }
+}
+
+// Find runs of occupied buckets and sort them. This assumes we are
+// using linear probing to find an unused bucket.
+void ConcurrentStringTableBuilder::sortBuckets() { + for (size_t I = 0; I < NumBuckets;) { + if (Buckets[I].Size.load() == 0) { + ++I; + continue; + } + size_t Begin = I; + size_t End = I + 1; + + while (End < NumBuckets && Buckets[End].Size.load()) + ++End; + std::sort(Buckets.begin() + Begin, Buckets.begin() + End); + I = End; + } +} + +// After the end of the buckets, it wraps around to the beginning, +// so we need to sort them as one unit. sortBuckets() didn't handle +// this corner case. +void ConcurrentStringTableBuilder::sortWrapAround() { + if (Buckets[0].Size.load() == 0 || Buckets[NumBuckets - 1].Size.load() == 0) + return; + + // Copies a wrapped-around streak to a vector, sort the vector, + // and then write them back. + size_t First = 1; + while (Buckets[First].Size.load() && First < NumBuckets) + ++First; + if (First == NumBuckets) { + IsTableFull = true; + return; + } + + size_t Last = NumBuckets - 1; + while (Buckets[Last - 1].Size.load()) + --Last; + + std::vector V; + auto Begin = Buckets.begin(); + V.insert(V.end(), Begin, Begin + First); + V.insert(V.end(), Begin + Last, Begin + NumBuckets); + + std::sort(V.begin(), V.end()); + + size_t LastSize = NumBuckets - Last; + std::copy(V.begin(), V.begin() + LastSize, Begin + Last); + std::copy(V.begin() + LastSize, V.end(), Begin); +} Index: ELF/InputSection.h =================================================================== --- ELF/InputSection.h +++ ELF/InputSection.h @@ -160,11 +160,14 @@ // be found by looking at the next one) and put the hash in a side table. 
struct SectionPiece { SectionPiece(size_t Off, bool Live = false) - : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {} + : InputOff(Off), Live(Live || !Config->GcSections), OutputOff(-1) {} - size_t InputOff; - ssize_t OutputOff : 8 * sizeof(ssize_t) - 1; + size_t InputOff : 8 * sizeof(ssize_t) - 1; size_t Live : 1; + union { + ssize_t OutputOff; + SectionPiece *Next; // used by ConcurrentStringTableBuilder + }; }; static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t), "SectionPiece is too big"); Index: ELF/OutputSections.h =================================================================== --- ELF/OutputSections.h +++ ELF/OutputSections.h @@ -21,6 +21,7 @@ namespace elf { class SymbolBody; +class ConcurrentStringTableBuilder; struct EhSectionPiece; template class EhInputSection; template class InputSection; @@ -143,11 +144,15 @@ } private: + void finalizeDefault(); void finalizeTailMerge(); - void finalizeNoTailMerge(); + void finalizeConcurrent(size_t NumPieces); llvm::StringTableBuilder Builder; + std::unique_ptr ConcurrentBuilder; + std::vector *> Sections; + size_t StringAlignment; }; struct CieRecord { Index: ELF/OutputSections.cpp =================================================================== --- ELF/OutputSections.cpp +++ ELF/OutputSections.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "OutputSections.h" +#include "Concurrent.h" #include "Config.h" #include "EhFrame.h" #include "LinkerScript.h" @@ -468,10 +469,13 @@ MergeOutputSection::MergeOutputSection(StringRef Name, uint32_t Type, uintX_t Flags, uintX_t Alignment) : OutputSectionBase(Name, Type, Flags), - Builder(StringTableBuilder::RAW, Alignment) {} + Builder(StringTableBuilder::RAW, Alignment), StringAlignment(Alignment) {} template void MergeOutputSection::writeTo(uint8_t *Buf) { - Builder.write(Buf); + if (ConcurrentBuilder) + ConcurrentBuilder->writeTo(Buf); + else + Builder.write(Buf); } template @@ -508,7 
+512,7 @@
       Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
 }
 
-template <class ELFT> void MergeOutputSection<ELFT>::finalizeNoTailMerge() {
+template <class ELFT> void MergeOutputSection<ELFT>::finalizeDefault() {
   // Add all string pieces to the string table builder to create section
   // contents. Because we are not tail-optimizing, offsets of strings are
   // fixed when they are added to the builder (string table builder contains
@@ -522,11 +526,78 @@
   this->Size = Builder.getSize();
 }
 
+template <class ELFT>
+void MergeOutputSection<ELFT>::finalizeConcurrent(size_t NumPieces) {
+  // The concurrent hash table does not support resizing,
+  // so if it becomes full, redo with a larger table.
+  // Our initial estimation is that the table contains 14 duplicate
+  // entries for an item on average.
+  //
+  // The explicit template argument is needed: NumPieces / 15 is size_t
+  // while a bare 1024 is int, and std::max requires both arguments to
+  // have the same type.
+  size_t Estimate = std::max<size_t>(NumPieces / 15, 1024);
+
+  for (;;) {
+    ConcurrentBuilder.reset(
+        new ConcurrentStringTableBuilder(Estimate, StringAlignment));
+
+    // Approximate number of inserted items.
+    std::atomic<size_t> NumInserted = {0};
+
+    // Insert strings to the table.
+    parallel_for_each(
+        Sections.begin(), Sections.end(), [&](MergeInputSection<ELFT> *Sec) {
+          if (ConcurrentBuilder->isFull())
+            return;
+          NumInserted.fetch_add(Sec->Pieces.size());
+          for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+            if (Sec->Pieces[I].Live)
+              ConcurrentBuilder->insert(Sec->Pieces[I], Sec->getData(I));
+        });
+
+    ConcurrentBuilder->finalize();
+    if (!ConcurrentBuilder->isFull())
+      break;
+
+    // If the table became full, we need to redo. We know how many
+    // strings were inserted before the table got full, so we can
+    // make an estimate based on that number.
+    double Factor = (double)NumInserted.load() / NumPieces;
+    Estimate = Estimate * (1 / Factor);
+  }
+  this->Size = ConcurrentBuilder->size();
+}
+
 template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
-  if (shouldTailMerge())
+  // If -O2 is specified, we do tail merging.
+  if (shouldTailMerge()) {
     finalizeTailMerge();
+    return;
+  }
+
+  // If -no-threads is specified, we can't use the concurrent map.
+  if (!Config->Threads) {
+    finalizeDefault();
+    return;
+  }
+
+  // If we have a very large number of mergeable strings, use the
+  // concurrent string table builder. Otherwise, use the single-
+  // threaded one. There's an overhead of using the concurrent one,
+  // so we don't want to use that unconditionally. The threshold
+  // is currently set to 100,000.
+
+  size_t NumPieces = 0;
+  for (MergeInputSection<ELFT> *Sec : Sections)
+    NumPieces += Sec->Pieces.size();
+
+  // This is for the unit test. getenv returns a null pointer when the
+  // variable is unset, and constructing a StringRef from a null
+  // pointer is not allowed, so check the pointer first.
+  if (const char *Env = getenv("LLD_USE_CONCURRENT_STRING_TABLE_BUILDER"))
+    if (StringRef(Env) == "1") {
+      finalizeConcurrent(NumPieces);
+      return;
+    }
+
+  if (NumPieces < 100000)
+    finalizeDefault();
   else
-    finalizeNoTailMerge();
+    finalizeConcurrent(NumPieces);
 }
 
 template <class ELFT>
Index: test/ELF/merge-strings-concurrent.s
===================================================================
--- /dev/null
+++ test/ELF/merge-strings-concurrent.s
@@ -0,0 +1,25 @@
+// REQUIRES: x86
+// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+// RUN: env LLD_USE_CONCURRENT_STRING_TABLE_BUILDER=1 ld.lld %t.o -o %t.so -shared
+// RUN: llvm-readobj -s -section-data -t %t.so | FileCheck %s
+
+.section .rodata.str1.1, "aMS", @progbits,1
+.asciz "abc"
+.asciz "def"
+.asciz "ghijklmn"
+.asciz "o"
+.asciz "pqrstuvwxyz"
+
+.section .rodata.str2.2, "aMS", @progbits,1
+.asciz "ABC"
+.asciz "def"
+.asciz "ghijklmn"
+.asciz "O"
+.asciz "pqrstuvwxyz"
+
+// CHECK: Name: .rodata (1)
+// CHECK: SectionData (
+// CHECK-NEXT: 0000: 41424300 64656600 61626300 6768696A |ABC.def.abc.ghij|
+// CHECK-NEXT: 0010: 6B6C6D6E 006F004F 00707172 73747576 |klmn.o.O.pqrstuv|
+// CHECK-NEXT: 0020: 7778797A 00 |wxyz.|
+// CHECK-NEXT: )