Index: lld/ELF/InputSection.h
===================================================================
--- lld/ELF/InputSection.h
+++ lld/ELF/InputSection.h
@@ -228,6 +228,9 @@
   // rather than a single large blob of data.
   std::vector<SectionPiece> Pieces;
 
+  // Used by MergeTailSection.
+  std::vector<uint16_t> TailHashes;
+
   // Returns I'th piece's data. This function is very hot when
   // string merging is enabled, so we want to inline.
   LLVM_ATTRIBUTE_ALWAYS_INLINE
@@ -254,7 +257,6 @@
   void splitNonStrings(ArrayRef<uint8_t> A, size_t Size);
 
   std::vector<uint32_t> Hashes;
-
   mutable llvm::DenseMap<uint64_t, uint64_t> OffsetMap;
   mutable llvm::once_flag InitOffsetMap;
 
Index: lld/ELF/SyntheticSections.h
===================================================================
--- lld/ELF/SyntheticSections.h
+++ lld/ELF/SyntheticSections.h
@@ -667,26 +667,33 @@
 class MergeSyntheticSection : public SyntheticSection {
 public:
   void addSection(MergeInputSection *MS);
+  size_t getSize() const override { return Size; }
+  void writeTo(uint8_t *Buf) override;
 
 protected:
   MergeSyntheticSection(StringRef Name, uint32_t Type, uint64_t Flags,
-                        uint32_t Alignment)
-      : SyntheticSection(Flags, Type, Alignment, Name) {}
+                        uint32_t Alignment);
 
   std::vector<MergeInputSection *> Sections;
+
+  // Section size
+  size_t Size;
+
+  // Concurrency level.
+  size_t Concurrency;
+
+  // String table contents
+  constexpr static size_t NumShards = 32;
+  std::vector<llvm::StringTableBuilder> Shards;
+  size_t ShardOffsets[NumShards];
 };
 
 class MergeTailSection final : public MergeSyntheticSection {
 public:
   MergeTailSection(StringRef Name, uint32_t Type, uint64_t Flags,
-                   uint32_t Alignment);
-
-  size_t getSize() const override;
-  void writeTo(uint8_t *Buf) override;
+                   uint32_t Alignment)
+      : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
   void finalizeContents() override;
-
-private:
-  llvm::StringTableBuilder Builder;
 };
 
 class MergeNoTailSection final : public MergeSyntheticSection {
@@ -694,9 +701,6 @@
   MergeNoTailSection(StringRef Name, uint32_t Type, uint64_t Flags,
                      uint32_t Alignment)
       : MergeSyntheticSection(Name, Type, Flags, Alignment) {}
-
-  size_t getSize() const override { return Size; }
-  void writeTo(uint8_t *Buf) override;
   void finalizeContents() override;
 
 private:
@@ -708,14 +712,6 @@
   size_t getShardId(uint32_t Hash) {
     return Hash >> (32 - llvm::countTrailingZeros(NumShards));
   }
-
-  // Section size
-  size_t Size;
-
-  // String table contents
-  constexpr static size_t NumShards = 32;
-  std::vector<llvm::StringTableBuilder> Shards;
-  size_t ShardOffsets[NumShards];
 };
 
 // .MIPS.abiflags section.
Index: lld/ELF/SyntheticSections.cpp
===================================================================
--- lld/ELF/SyntheticSections.cpp
+++ lld/ELF/SyntheticSections.cpp
@@ -49,7 +49,7 @@
 using namespace lld;
 using namespace lld::elf;
 
-constexpr size_t MergeNoTailSection::NumShards;
+constexpr size_t MergeSyntheticSection::NumShards;
 
 uint64_t SyntheticSection::getVA() const {
   if (OutputSection *Sec = getParent())
@@ -2179,41 +2179,28 @@
   return getNeedNum() == 0;
 }
 
+MergeSyntheticSection::MergeSyntheticSection(StringRef Name, uint32_t Type,
+                                             uint64_t Flags, uint32_t Alignment)
+    : SyntheticSection(Flags, Type, Alignment, Name) {
+  // Initialize string builders.
+  for (size_t I = 0; I < NumShards; ++I)
+    Shards.emplace_back(StringTableBuilder::RAW, Alignment);
+
+  // Initializes concurrency level. Must be a power of 2 to avoid
+  // the expensive modulo operation when dividing with this value.
+  if (Config->Threads)
+    Concurrency =
+        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), NumShards);
+  else
+    Concurrency = 1;
+}
+
 void MergeSyntheticSection::addSection(MergeInputSection *MS) {
   MS->Parent = this;
   Sections.push_back(MS);
 }
 
-MergeTailSection::MergeTailSection(StringRef Name, uint32_t Type,
-                                   uint64_t Flags, uint32_t Alignment)
-    : MergeSyntheticSection(Name, Type, Flags, Alignment),
-      Builder(StringTableBuilder::RAW, Alignment) {}
-
-size_t MergeTailSection::getSize() const { return Builder.getSize(); }
-
-void MergeTailSection::writeTo(uint8_t *Buf) { Builder.write(Buf); }
-
-void MergeTailSection::finalizeContents() {
-  // Add all string pieces to the string table builder to create section
-  // contents.
-  for (MergeInputSection *Sec : Sections)
-    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
-      if (Sec->Pieces[I].Live)
-        Builder.add(Sec->getData(I));
-
-  // Fix the string table content. After this, the contents will never change.
-  Builder.finalize();
-
-  // finalize() fixed tail-optimized strings, so we can now get
-  // offsets of strings. Get an offset for each string and save it
-  // to a corresponding StringPiece for easy access.
-  for (MergeInputSection *Sec : Sections)
-    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
-      if (Sec->Pieces[I].Live)
-        Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
-}
-
-void MergeNoTailSection::writeTo(uint8_t *Buf) {
+void MergeSyntheticSection::writeTo(uint8_t *Buf) {
   for (size_t I = 0; I < NumShards; ++I)
     Shards[I].write(Buf + ShardOffsets[I]);
 }
@@ -2227,17 +2214,6 @@
 // T into different string builders without worrying about merge misses.
 // We do it in parallel.
 void MergeNoTailSection::finalizeContents() {
-  // Initializes string table builders.
-  for (size_t I = 0; I < NumShards; ++I)
-    Shards.emplace_back(StringTableBuilder::RAW, Alignment);
-
-  // Concurrency level. Must be a power of 2 to avoid expensive modulo
-  // operations in the following tight loop.
-  size_t Concurrency = 1;
-  if (Config->Threads)
-    Concurrency =
-        std::min<size_t>(PowerOf2Floor(hardware_concurrency()), NumShards);
-
   // Add section pieces to the builders.
   parallelForEachN(0, Concurrency, [&](size_t ThreadId) {
     for (MergeInputSection *Sec : Sections) {
@@ -2273,6 +2249,70 @@
   });
 }
 
+static uint32_t tailHash(StringRef S) {
+  if (S.size() < 4)
+    return 0;
+  return xxHash64(S.substr(S.size() - 4));
+}
+
+// This function basically does the same thing as MergeNoTailSection
+// does but creates a tail-merged string table. E.g. "def\0" is merged
+// with "abcdef\0". This is enabled only when -O2 is given.
+//
+// Input strings are sharded based on their very last few characters.
+// This works because, if strings S and T can be tail-merged, S and T
+// must share the same tail substring. Currently we use the last 4
+// characters for sharding.
+//
+// Note that strings shorter than 4 characters may not be tail-merged.
+// That is okay because the loss seems negligible. My observation is
+// that a loss is 8 KiB when we create a 1.7 GiB clang executable.
+void MergeTailSection::finalizeContents() {
+  // Initializes TailHashes by hash values of last few characters.
+  parallelForEach(Sections, [&](MergeInputSection *Sec) {
+    Sec->TailHashes.resize(Sec->Pieces.size());
+    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+      if (Sec->Pieces[I].Live)
+        Sec->TailHashes[I] = tailHash(Sec->getData(I).val());
+  });
+
+  // Add section pieces to the builders.
+  parallelForEachN(0, Concurrency, [&](size_t ThreadId) {
+    for (MergeInputSection *Sec : Sections) {
+      for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+        if (!Sec->Pieces[I].Live)
+          continue;
+        size_t ShardId = Sec->TailHashes[I] % NumShards;
+        if ((ShardId & (Concurrency - 1)) == ThreadId)
+          Shards[ShardId].add(Sec->getData(I));
+      }
+    }
+  });
+
+  parallelForEach(Shards, [](StringTableBuilder &S) { S.finalize(); });
+
+  // Compute an in-section offset for each shard.
+  size_t Off = 0;
+  for (size_t I = 0; I < NumShards; ++I) {
+    if (Shards[I].getSize() > 0)
+      Off = alignTo(Off, Alignment);
+    ShardOffsets[I] = Off;
+    Off += Shards[I].getSize();
+  }
+  Size = Off;
+
+  // Set to section piece offsets.
+  parallelForEach(Sections, [&](MergeInputSection *Sec) {
+    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+      if (!Sec->Pieces[I].Live)
+        continue;
+      size_t ShardId = Sec->TailHashes[I] % NumShards;
+      Sec->Pieces[I].OutputOff =
+          ShardOffsets[ShardId] + Shards[ShardId].getOffset(Sec->getData(I));
+    }
+  });
+}
+
 static MergeSyntheticSection *createMergeSynthetic(StringRef Name,
                                                    uint32_t Type,
                                                    uint64_t Flags,
Index: lld/test/ELF/merge-shared-str.s
===================================================================
--- lld/test/ELF/merge-shared-str.s
+++ lld/test/ELF/merge-shared-str.s
@@ -1,16 +1,14 @@
 // REQUIRES: x86
 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-// RUN: ld.lld %t.o -o %t.so -shared -O3
+// RUN: ld.lld %t.o -o %t.so -shared -O2
 // RUN: llvm-readobj -r -s %t.so | FileCheck %s
 
+.section foo,"aMS",@progbits,1
+.asciz "foobarbaz"
+.asciz "barbaz"
 
-        .section        foo,"aMS",@progbits,1
-        .asciz "bar"
-        .asciz "ar"
-
-        .data
-        .quad foo + 4
-
+.data
+.quad foo + 11
 
 // CHECK:      Name: foo
 // CHECK-NEXT: Type: SHT_PROGBITS
@@ -23,6 +21,6 @@
 
 // CHECK:      Relocations [
 // CHECK-NEXT:   Section ({{.*}}) .rela.dyn {
-// CHECK-NEXT:     0x{{.*}} R_X86_64_RELATIVE - 0x1C9
+// CHECK-NEXT:     0x{{.*}} R_X86_64_RELATIVE - 0x1CC
 // CHECK-NEXT:   }
 // CHECK-NEXT: ]
Index: lld/test/ELF/merge-string.s
===================================================================
--- lld/test/ELF/merge-string.s
+++ lld/test/ELF/merge-string.s
@@ -8,12 +8,12 @@
 // RUN: llvm-readobj -s -section-data -t %t.so | FileCheck --check-prefix=NOMERGE %s
 
         .section	.rodata1,"aMS",@progbits,1
-	.asciz	"abc"
+	.asciz	"abcdef"
 foo:
-	.ascii	"a"
+	.ascii	"abcd"
 bar:
-        .asciz  "bc"
-        .asciz  "bc"
+        .asciz  "ef"
+        .asciz  "cdef"
 
         .section        .rodata2,"aMS",@progbits,2
         .align  2
@@ -30,13 +30,13 @@
 // CHECK-NEXT: ]
 // CHECK-NEXT: Address:         0x1C8
 // CHECK-NEXT: Offset:  0x1C8
-// CHECK-NEXT: Size:    4
+// CHECK-NEXT: Size:
 // CHECK-NEXT: Link: 0
 // CHECK-NEXT: Info: 0
 // CHECK-NEXT: AddressAlignment: 1
 // CHECK-NEXT: EntrySize: 0
 // CHECK-NEXT: SectionData (
-// CHECK-NEXT:   0000: 61626300                             |abc.|
+// CHECK-NEXT:   |abcdef.|
 // CHECK-NEXT: )
 
 // NOTAIL:      Name:    .rodata1
@@ -48,13 +48,13 @@
 // NOTAIL-NEXT: ]
 // NOTAIL-NEXT: Address:         0x1C8
 // NOTAIL-NEXT: Offset:  0x1C8
-// NOTAIL-NEXT: Size:    7
+// NOTAIL-NEXT: Size:
 // NOTAIL-NEXT: Link: 0
 // NOTAIL-NEXT: Info: 0
 // NOTAIL-NEXT: AddressAlignment: 1
 // NOTAIL-NEXT: EntrySize: 0
 // NOTAIL-NEXT: SectionData (
-// NOTAIL-NEXT:   0000: 62630061 626300                     |bc.abc.|
+// NOTAIL-NEXT:   |cdef.abcdef.|
 // NOTAIL-NEXT: )
 
 // NOMERGE:      Name:    .rodata1
@@ -66,13 +66,14 @@
 // NOMERGE-NEXT: ]
 // NOMERGE-NEXT: Address:         0x1C8
 // NOMERGE-NEXT: Offset:  0x1C8
-// NOMERGE-NEXT: Size:    11
+// NOMERGE-NEXT: Size:
 // NOMERGE-NEXT: Link: 0
 // NOMERGE-NEXT: Info: 0
 // NOMERGE-NEXT: AddressAlignment: 1
 // NOMERGE-NEXT: EntrySize: 1
 // NOMERGE-NEXT: SectionData (
-// NOMERGE-NEXT:   0000: 61626300 61626300 626300 |abc.abc.bc.|
+// NOMERGE-NEXT:   |abcdef.abcdef.cd|
+// NOMERGE-NEXT:   |ef.|
 // NOMERGE-NEXT: )
 
 // CHECK:      Name: .rodata2
@@ -82,24 +83,24 @@
 // CHECK-NEXT:   SHF_MERGE
 // CHECK-NEXT:   SHF_STRINGS
 // CHECK-NEXT: ]
-// CHECK-NEXT: Address: 0x1CC
-// CHECK-NEXT: Offset: 0x1CC
-// CHECK-NEXT: Size: 4
+// CHECK-NEXT: Address: 0x1D0
+// CHECK-NEXT: Offset: 0x1D0
+// CHECK-NEXT: Size:
 // CHECK-NEXT: Link: 0
 // CHECK-NEXT: Info: 0
 // CHECK-NEXT: AddressAlignment: 2
 // CHECK-NEXT: EntrySize: 0
 // CHECK-NEXT: SectionData (
-// CHECK-NEXT:   0000: 14000000                             |....|
+// CHECK-NEXT:   |....|
 // CHECK-NEXT: )
 
 
 // CHECK:      Name:    bar
-// CHECK-NEXT: Value:   0x1C9
+// CHECK-NEXT: Value:   0x1CC
 
 // CHECK:      Name:    foo
 // CHECK-NEXT: Value:   0x1C8
 
 // CHECK:      Name: zed
-// CHECK-NEXT: Value: 0x1CC
+// CHECK-NEXT: Value: 0x1D0
 // CHECK-NEXT: Size: 0
Index: lld/test/ELF/tail-merge-string-align.s
===================================================================
--- lld/test/ELF/tail-merge-string-align.s
+++ lld/test/ELF/tail-merge-string-align.s
@@ -1,20 +1,19 @@
 // REQUIRES: x86
 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
-// RUN: ld.lld %t.o -o %t.so -shared -O3
+// RUN: ld.lld %t.o -o %t.so -shared -O2
 // RUN: llvm-readobj -s -section-data %t.so | FileCheck %s
 
-        .section        .rodata.4a,"aMS",@progbits,1
-        .align 4
-        .asciz "abcdef"
+.section        .rodata.4a,"aMS",@progbits,1
+.align 4
+.asciz "abcdefg"
 
-        .section        .rodata.4b,"aMS",@progbits,1
-        .align 4
-        .asciz "ef"
-
-        .section        .rodata.4c,"aMS",@progbits,1
-        .align 4
-        .asciz "f"
+.section        .rodata.4b,"aMS",@progbits,1
+.align 4
+.asciz "efg"
 
+.section        .rodata.4c,"aMS",@progbits,1
+.align 4
+.asciz "defgh"
 
 // CHECK:      Name: .rodata
 // CHECK-NEXT: Type: SHT_PROGBITS
@@ -25,11 +24,11 @@
 // CHECK-NEXT: ]
 // CHECK-NEXT: Address:
 // CHECK-NEXT: Offset:
-// CHECK-NEXT: Size: 1
+// CHECK-NEXT: Size:
 // CHECK-NEXT: Link: 0
 // CHECK-NEXT: Info: 0
 // CHECK-NEXT: AddressAlignment: 4
 // CHECK-NEXT: EntrySize:
 // CHECK-NEXT: SectionData (
-// CHECK-NEXT:   0000:    61626364 65660000 6600               |abcdef..f.|
+// CHECK-NEXT:   |abcdefg.defgh.|
 // CHECK-NEXT: )