Index: ELF/InputSection.h
===================================================================
--- ELF/InputSection.h
+++ ELF/InputSection.h
@@ -160,11 +160,13 @@
 // be found by looking at the next one) and put the hash in a side table.
 struct SectionPiece {
   SectionPiece(size_t Off, bool Live = false)
-      : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {}
+      : InputOff(Off), Live(Live || !Config->GcSections), First(false),
+        OutputOff(-1) {}
 
-  size_t InputOff;
-  ssize_t OutputOff : 8 * sizeof(ssize_t) - 1;
+  size_t InputOff : 8 * sizeof(size_t) - 2;
   size_t Live : 1;
+  size_t First : 1;
+  ssize_t OutputOff;
 };
 static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t),
               "SectionPiece is too big");
@@ -194,20 +196,18 @@
   // Splittable sections are handled as a sequence of data
   // rather than a single large blob of data.
   std::vector<SectionPiece> Pieces;
+  std::vector<uint32_t> Hashes;
 
   // Returns I'th piece's data. This function is very hot when
   // string merging is enabled, so we want to inline.
-  LLVM_ATTRIBUTE_ALWAYS_INLINE
-  llvm::CachedHashStringRef getData(size_t I) const {
+  StringRef getString(size_t I) const {
     size_t Begin = Pieces[I].InputOff;
     size_t End;
     if (Pieces.size() - 1 == I)
       End = this->Data.size();
     else
       End = Pieces[I + 1].InputOff;
-
-    StringRef S = {(const char *)(this->Data.data() + Begin), End - Begin};
-    return {S, Hashes[I]};
+    return {(const char *)(this->Data.data() + Begin), End - Begin};
   }
 
   // Returns the SectionPiece at a given input section offset.
@@ -218,8 +218,6 @@
   void splitStrings(ArrayRef<uint8_t> A, size_t Size);
   void splitNonStrings(ArrayRef<uint8_t> A, size_t Size);
 
-  std::vector<uint32_t> Hashes;
-
   mutable llvm::DenseMap<uintX_t, uintX_t> OffsetMap;
   mutable std::once_flag InitOffsetMap;
 
Index: ELF/OutputSections.h
===================================================================
--- ELF/OutputSections.h
+++ ELF/OutputSections.h
@@ -145,9 +145,11 @@
 private:
   void finalizeTailMerge();
   void finalizeNoTailMerge();
+  void finalizeConcurrent();
 
-  llvm::StringTableBuilder Builder;
+  std::unique_ptr<llvm::StringTableBuilder> Builder;
   std::vector<MergeInputSection<ELFT> *> Sections;
+  size_t Alignment;
 };
 
 struct CieRecord {
Index: ELF/OutputSections.cpp
===================================================================
--- ELF/OutputSections.cpp
+++ ELF/OutputSections.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SHA1.h"
+#include <atomic>
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -467,11 +468,27 @@
 template <class ELFT>
 MergeOutputSection<ELFT>::MergeOutputSection(StringRef Name, uint32_t Type,
                                              uintX_t Flags, uintX_t Alignment)
-    : OutputSectionBase(Name, Type, Flags),
-      Builder(StringTableBuilder::RAW, Alignment) {}
+    : OutputSectionBase(Name, Type, Flags), Alignment(Alignment) {
+  assert(Alignment != 0 && isPowerOf2_64(Alignment));
+}
 
 template <class ELFT> void MergeOutputSection<ELFT>::writeTo(uint8_t *Buf) {
-  Builder.write(Buf);
+  if (Builder) {
+    Builder->write(Buf);
+    return;
+  }
+
+  // Builder is not used for sharded string table construction.
+  for (MergeInputSection<ELFT> *Sec : Sections) {
+    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+      SectionPiece &Piece = Sec->Pieces[I];
+      if (!Piece.Live || !Piece.First)
+        continue;
+
+      StringRef S = Sec->getString(I);
+      memcpy(Buf + Piece.OutputOff, S.data(), S.size());
+    }
+  }
 }
 
 template <class ELFT>
@@ -493,11 +510,11 @@
   for (MergeInputSection<ELFT> *Sec : Sections)
     for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
       if (Sec->Pieces[I].Live)
-        Builder.add(Sec->getData(I));
+        Builder->add({Sec->getString(I), Sec->Hashes[I]});
 
   // Fix the string table content. After this, the contents will never change.
-  Builder.finalize();
-  this->Size = Builder.getSize();
+  Builder->finalize();
+  this->Size = Builder->getSize();
 
   // finalize() fixed tail-optimized strings, so we can now get
   // offsets of strings. Get an offset for each string and save it
@@ -505,7 +522,8 @@
   for (MergeInputSection<ELFT> *Sec : Sections)
     for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
       if (Sec->Pieces[I].Live)
-        Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
+        Sec->Pieces[I].OutputOff =
+            Builder->getOffset({Sec->getString(I), Sec->Hashes[I]});
 }
 
 template <class ELFT> void MergeOutputSection<ELFT>::finalizeNoTailMerge() {
@@ -516,17 +534,112 @@
   for (MergeInputSection<ELFT> *Sec : Sections)
     for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
       if (Sec->Pieces[I].Live)
-        Sec->Pieces[I].OutputOff = Builder.add(Sec->getData(I));
+        Sec->Pieces[I].OutputOff = Builder->add(Sec->getString(I));
 
-  Builder.finalizeInOrder();
-  this->Size = Builder.getSize();
+  Builder->finalizeInOrder();
+  this->Size = Builder->getSize();
+}
+
+// If tail merge is not needed, we construct a string table ourselves
+// to use a parallel algorithm.
+//
+// Here, we create not only one but N hash tables, where N is a
+// parallelism. We invoke N threads. Each thread knows its thread
+// index T where 0 <= T < N. For each string S in a given string set,
+// thread T adds S to its own hash table only if hash(S) % N == T.
+//
+// The following loop can finish in 1/N time if you have infinite
+// number of CPU cores, but in reality you don't want to make N
+// infinitely large, because all N threads have to visit all section
+// pieces. Larger N means larger overall cost. Skipping 30 million
+// section pieces takes about 50 milliseconds, for example, so it's
+// not very cheap. Currently, N=8 is chosen as a reasonable default.
+//
+// (In case you are wondering: we cannot adjust N at runtime because
+// changing N changes output layout. We want to make the linker emit
+// the same output on every invocation for the same input, so
+// adjusting N at runtime that is not a viable option.)
+template <class ELFT> void MergeOutputSection<ELFT>::finalizeConcurrent() {
+  // This is a DenseMap with padding at end to prevent false sharing.
+  // We assume cache line size is 64 bytes.
+  typedef struct {
+    DenseMap<CachedHashStringRef, size_t> Map;
+    uint8_t Pad[(64 - sizeof(Map) > 0) ? 64 - sizeof(Map) : 1];
+  } OffsetMapTy;
+
+  const int NumShards = 8;
+  OffsetMapTy OffsetMap[NumShards];
+  size_t ShardSize[NumShards];
+
+  // Construct NumShards number of string tables in parallel.
+  forLoop(0, NumShards, [&](size_t Idx) {
+    size_t Offset = 0;
+    for (MergeInputSection<ELFT> *Sec : Sections) {
+      for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+        uint32_t Hash = Sec->Hashes[I];
+        if ((Hash % NumShards) != Idx)
+          continue;
+
+        SectionPiece &Piece = Sec->Pieces[I];
+        if (!Piece.Live)
+          continue;
+
+        StringRef S = Sec->getString(I);
+        size_t Off = alignTo(Offset, Alignment);
+        auto P = OffsetMap[Idx].Map.insert({{S, Hash}, Off});
+        if (P.second) {
+          Piece.First = true;
+          Piece.OutputOff = Off;
+          Offset = Off + S.size();
+        } else {
+          Piece.OutputOff = P.first->second;
+        }
+      }
+    }
+    ShardSize[Idx] = Offset;
+  });
+
+  // Piece.OutputOff was set independently, so we need to fix them.
+  // First, we compute starting offset for each shard.
+  size_t ShardOffset[NumShards];
+  ShardOffset[0] = 0;
+  for (int I = 1; I != NumShards; ++I) {
+    size_t Off = ShardOffset[I - 1] + ShardSize[I - 1];
+    if (ShardSize[I] > 0)
+      Off = alignTo(Off, Alignment);
+    ShardOffset[I] = Off;
+  }
+
+  // Add a shard offset to each section piece.
+  forEach(Sections.begin(), Sections.end(), [&](MergeInputSection<ELFT> *Sec) {
+    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+      if (Sec->Pieces[I].Live)
+        Sec->Pieces[I].OutputOff += ShardOffset[Sec->Hashes[I] % NumShards];
+  });
+
+  // Set the size of this output section.
+  this->Size = ShardOffset[NumShards - 1] + ShardSize[NumShards - 1];
 }
 
 template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
-  if (shouldTailMerge())
+  // If we need tail merging, use the default string table builder
+  // because our concurrent one does not support tail merging.
+  if (shouldTailMerge()) {
+    Builder.reset(new StringTableBuilder(StringTableBuilder::RAW, Alignment));
     finalizeTailMerge();
-  else
+    return;
+  }
+
+  // If threading is disabled, use the default string table builder
+  // because our concurrent one is not as fast as the single-threaded
+  // one on single core.
+  if (!Config->Threads || StringRef(getenv("LLD_TEST")) == "1") {
+    Builder.reset(new StringTableBuilder(StringTableBuilder::RAW, Alignment));
     finalizeNoTailMerge();
+    return;
+  }
+
+  finalizeConcurrent();
 }
 
 template <class ELFT>
Index: test/ELF/comment-gc.s
===================================================================
--- test/ELF/comment-gc.s
+++ test/ELF/comment-gc.s
@@ -5,7 +5,7 @@
 # RUN: llvm-objdump -s %t1 | FileCheck %s
 
 # CHECK:      Contents of section .comment:
-# CHECK-NEXT:  0000 00666f6f 00626172 004c4c44 20312e30 .foo.bar.LLD 1.0
+# CHECK-NEXT:  0000 62617200 4c4c4420 312e3000 666f6f00 bar.LLD 1.0.foo.
 # CHECK-NEXT:  0010 00 .
 
 .ident "foo"
Index: test/ELF/compressed-debug-input.s
===================================================================
--- test/ELF/compressed-debug-input.s
+++ test/ELF/compressed-debug-input.s
@@ -62,10 +62,10 @@
 # DATA-NEXT:   EntrySize: 1
 # DATA-NEXT:   SectionData (
 # DATA-NEXT:     0000: 73686F72 7420756E 7369676E 65642069  |short unsigned i|
-# DATA-NEXT:     0010: 6E740075 6E736967 6E656420 696E7400  |nt.unsigned int.|
-# DATA-NEXT:     0020: 6C6F6E67 20756E73 69676E65 6420696E  |long unsigned in|
-# DATA-NEXT:     0030: 74006368 61720075 6E736967 6E656420  |t.char.unsigned |
-# DATA-NEXT:     0040: 63686172 00                          |char.|
+# DATA-NEXT:     0010: 6E740063 68617200 756E7369 676E6564  |nt.char.unsigned|
+# DATA-NEXT:     0020: 20636861 7200756E 7369676E 65642069  | char.unsigned i|
+# DATA-NEXT:     0030: 6E74006C 6F6E6720 756E7369 676E6564  |nt.long unsigned|
+# DATA-NEXT:     0040: 20696E74 00                          | int.|
 # DATA-NEXT:   )
 # DATA-NEXT: }
 
Index: test/ELF/debug-gc.s
===================================================================
--- test/ELF/debug-gc.s
+++ test/ELF/debug-gc.s
@@ -4,11 +4,11 @@
 # RUN: llvm-objdump -s %t1 | FileCheck %s
 
 # CHECK:      Contents of section .debug_str:
-# CHECK-NEXT:  0000 41414100 42424200 43434300           AAA.BBB.CCC.
+# CHECK-NEXT: 0000 41414100 43434300 42424200   AAA.CCC.BBB.
 # CHECK:      Contents of section .foo:
 # CHECK-NEXT:  0000 2a000000
 # CHECK:      Contents of section .debug_info:
-# CHECK-NEXT:  0000 00000000 04000000
+# CHECK-NEXT:  0000 00000000 08000000
 
 .globl _start
 _start:
Index: test/ELF/gc-sections-merge.s
===================================================================
--- test/ELF/gc-sections-merge.s
+++ test/ELF/gc-sections-merge.s
@@ -20,7 +20,7 @@
 // CHECK-NEXT: AddressAlignment: 1
 // CHECK-NEXT: EntrySize: 1
 // CHECK-NEXT: SectionData (
-// CHECK-NEXT:   0000: 666F6F00 62617200                    |foo.bar.|
+// CHECK-NEXT:   0000: 62617200 666F6F00                    |bar.foo.|
 // CHECK-NEXT: )
 
 // GC:      Name: .rodata
Index: test/ELF/merge-string-align.s
===================================================================
--- test/ELF/merge-string-align.s
+++ test/ELF/merge-string-align.s
@@ -30,8 +30,8 @@
 // CHECK-NEXT: AddressAlignment: 16
 // CHECK-NEXT: EntrySize:
 // CHECK-NEXT: SectionData (
-// CHECK-NEXT:   0000:  666F6F00 00000000 00000000 00000000  |foo.............|
-// CHECK-NEXT:   0010:  62617200                             |bar.|
+// CHECK-NEXT:   0000:  62617200 00000000 00000000 00000000  |bar.............|
+// CHECK-NEXT:   0010:  666F6F00                             |foo.|
 // CHECK-NEXT: )
 
         .section        .rodata.str1.1,"aMS",@progbits,1
Index: test/ELF/merge.s
===================================================================
--- test/ELF/merge.s
+++ test/ELF/merge.s
@@ -31,17 +31,17 @@
 // CHECK-NEXT:    AddressAlignment: 4
 // CHECK-NEXT:    EntrySize: 4
 // CHECK-NEXT:    SectionData (
-// CHECK-NEXT:      0000: 10000000 42000000
+// CHECK-NEXT:      0000: 42000000 10000000
 // CHECK-NEXT:    )
 
 
-// Address of the constant 0x10 = 0x200120 = 2097440
-// Address of the constant 0x42 = 0x200124 = 2097444
+// Address of the constant 0x10 = 0x200124 = 2097444
+// Address of the constant 0x42 = 0x200120 = 2097440
 
 // CHECK:      Symbols [
 
 // CHECK:        Name: bar
-// CHECK-NEXT:   Value: 0x200124
+// CHECK-NEXT:   Value: 0x200120
 // CHECK-NEXT:   Size: 0
 // CHECK-NEXT:   Binding: Loca
 // CHECK-NEXT:   Type: None
@@ -49,7 +49,7 @@
 // CHECK-NEXT:   Section: .mysec
 
 // CHECK:        Name: zed
-// CHECK-NEXT:   Value: 0x200124
+// CHECK-NEXT:   Value: 0x200120
 // CHECK-NEXT:   Size: 0
 // CHECK-NEXT:   Binding: Local
 // CHECK-NEXT:   Type: None
@@ -57,7 +57,7 @@
 // CHECK-NEXT:   Section: .mysec
 
 // CHECK:        Name: foo
-// CHECK-NEXT:   Value: 0x200124
+// CHECK-NEXT:   Value: 0x200120
 // CHECK-NEXT:   Size: 0
 // CHECK-NEXT:   Binding: Local
 // CHECK-NEXT:   Type: None
@@ -75,37 +75,37 @@
 // DISASM-NEXT: _start:
 
         movl .mysec, %eax
-// addr(0x10) = 2097440
-// DISASM-NEXT:   movl    2097440, %eax
+// addr(0x10) = 2097444
+// DISASM-NEXT:   movl    2097444, %eax
 
         movl .mysec+7, %eax
-// addr(0x42) + 3 = 2097444 + 3 = 2097447
-// DISASM-NEXT:   movl    2097447, %eax
+// addr(0x42) + 3 = 2097440 + 3 = 2097443
+// DISASM-NEXT:   movl    2097443, %eax
 
         movl .mysec+8, %eax
-// addr(0x42) = 2097444
-// DISASM-NEXT:   movl    2097444, %eax
+// addr(0x42) = 2097440
+// DISASM-NEXT:   movl    2097440, %eax
 
         movl bar+7, %eax
-// addr(0x42) + 7 = 2097444 + 7 = 2097451
-// DISASM-NEXT:   movl    2097451, %eax
+// addr(0x42) + 7 = 2097440 + 7 = 2097447
+// DISASM-NEXT:   movl    2097447, %eax
 
         movl bar+8, %eax
-// addr(0x42) + 8 = 2097444 + 8 = 2097452
-// DISASM-NEXT:   movl    2097452, %eax
+// addr(0x42) + 8 = 2097440 + 8 = 2097448
+// DISASM-NEXT:   movl    2097448, %eax
 
         movl foo, %eax
-// addr(0x42) = 2097444
-// DISASM-NEXT:   movl    2097444, %eax
+// addr(0x42) = 2097440
+// DISASM-NEXT:   movl    2097440, %eax
 
         movl foo+7, %eax
-// addr(0x42) + 7 =  = 2097444 + 7 = 2097451
-// DISASM-NEXT:   movl    2097451, %eax
+// addr(0x42) + 7 =  = 2097440 + 7 = 2097447
+// DISASM-NEXT:   movl    2097447, %eax
 
         movl foo+8, %eax
-// addr(0x42) + 8 =  = 2097444 + 8 = 2097452
-// DISASM-NEXT:   movl    2097452, %eax
+// addr(0x42) + 8 =  = 2097440 + 8 = 2097448
+// DISASM-NEXT:   movl    2097448, %eax
 
 //  From the other file:  movl .mysec, %eax
-// addr(0x42) = 2097444
-// DISASM-NEXT:   movl    2097444, %eax
+// addr(0x42) = 2097440
+// DISASM-NEXT:   movl    2097440, %eax