Index: lld/ELF/InputSection.h =================================================================== --- lld/ELF/InputSection.h +++ lld/ELF/InputSection.h @@ -228,6 +228,9 @@ // rather than a single large blob of data. std::vector Pieces; + // Used by MergeTailSection. + std::vector TailHashes; + // Returns I'th piece's data. This function is very hot when // string merging is enabled, so we want to inline. LLVM_ATTRIBUTE_ALWAYS_INLINE @@ -254,7 +257,6 @@ void splitNonStrings(ArrayRef A, size_t Size); std::vector Hashes; - mutable llvm::DenseMap OffsetMap; mutable llvm::once_flag InitOffsetMap; Index: lld/ELF/SyntheticSections.h =================================================================== --- lld/ELF/SyntheticSections.h +++ lld/ELF/SyntheticSections.h @@ -667,26 +667,33 @@ class MergeSyntheticSection : public SyntheticSection { public: void addSection(MergeInputSection *MS); + size_t getSize() const override { return Size; } + void writeTo(uint8_t *Buf) override; protected: MergeSyntheticSection(StringRef Name, uint32_t Type, uint64_t Flags, - uint32_t Alignment) - : SyntheticSection(Flags, Type, Alignment, Name) {} + uint32_t Alignment); std::vector Sections; + + // Section size + size_t Size; + + // Concurrency level. + size_t Concurrency; + + // String table contents + constexpr static size_t NumShards = 32; + std::vector Shards; + size_t ShardOffsets[NumShards]; }; class MergeTailSection final : public MergeSyntheticSection { public: MergeTailSection(StringRef Name, uint32_t Type, uint64_t Flags, - uint32_t Alignment); - - size_t getSize() const override; - void writeTo(uint8_t *Buf) override; + uint32_t Alignment) + : MergeSyntheticSection(Name, Type, Flags, Alignment) {} void finalizeContents() override; - -private: - llvm::StringTableBuilder Builder; }; class MergeNoTailSection final : public MergeSyntheticSection { @@ -694,9 +701,6 @@ MergeNoTailSection(StringRef Name, uint32_t Type, uint64_t Flags, uint32_t Alignment) : MergeSyntheticSection(Name, Type, Flags, Alignment) {} - - size_t getSize() const override { return Size; } - void writeTo(uint8_t *Buf) override; void finalizeContents() override; private: @@ -708,14 +712,6 @@ size_t getShardId(uint32_t Hash) { return Hash >> (32 - llvm::countTrailingZeros(NumShards)); } - - // Section size - size_t Size; - - // String table contents - constexpr static size_t NumShards = 32; - std::vector Shards; - size_t ShardOffsets[NumShards]; }; // .MIPS.abiflags section. Index: lld/ELF/SyntheticSections.cpp =================================================================== --- lld/ELF/SyntheticSections.cpp +++ lld/ELF/SyntheticSections.cpp @@ -49,7 +49,7 @@ using namespace lld; using namespace lld::elf; -constexpr size_t MergeNoTailSection::NumShards; +constexpr size_t MergeSyntheticSection::NumShards; uint64_t SyntheticSection::getVA() const { if (OutputSection *Sec = getParent()) @@ -2179,41 +2179,28 @@ return getNeedNum() == 0; } +MergeSyntheticSection::MergeSyntheticSection(StringRef Name, uint32_t Type, + uint64_t Flags, uint32_t Alignment) + : SyntheticSection(Flags, Type, Alignment, Name) { + // Initialize string builders. + for (size_t I = 0; I < NumShards; ++I) + Shards.emplace_back(StringTableBuilder::RAW, Alignment); + + // Initializes concurrency level. Must be a power of 2 to avoid + // the expensive modulo operation when dividing with this value. + if (Config->Threads) + Concurrency = + std::min(PowerOf2Floor(hardware_concurrency()), NumShards); + else + Concurrency = 1; +} + void MergeSyntheticSection::addSection(MergeInputSection *MS) { MS->Parent = this; Sections.push_back(MS); } -MergeTailSection::MergeTailSection(StringRef Name, uint32_t Type, - uint64_t Flags, uint32_t Alignment) - : MergeSyntheticSection(Name, Type, Flags, Alignment), - Builder(StringTableBuilder::RAW, Alignment) {} - -size_t MergeTailSection::getSize() const { return Builder.getSize(); } - -void MergeTailSection::writeTo(uint8_t *Buf) { Builder.write(Buf); } - -void MergeTailSection::finalizeContents() { - // Add all string pieces to the string table builder to create section - // contents. - for (MergeInputSection *Sec : Sections) - for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) - if (Sec->Pieces[I].Live) - Builder.add(Sec->getData(I)); - - // Fix the string table content. After this, the contents will never change. - Builder.finalize(); - - // finalize() fixed tail-optimized strings, so we can now get - // offsets of strings. Get an offset for each string and save it - // to a corresponding StringPiece for easy access. - for (MergeInputSection *Sec : Sections) - for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) - if (Sec->Pieces[I].Live) - Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I)); -} - -void MergeNoTailSection::writeTo(uint8_t *Buf) { +void MergeSyntheticSection::writeTo(uint8_t *Buf) { for (size_t I = 0; I < NumShards; ++I) Shards[I].write(Buf + ShardOffsets[I]); } @@ -2227,17 +2214,6 @@ // T into different string builders without worrying about merge misses. // We do it in parallel. void MergeNoTailSection::finalizeContents() { - // Initializes string table builders. - for (size_t I = 0; I < NumShards; ++I) - Shards.emplace_back(StringTableBuilder::RAW, Alignment); - - // Concurrency level. Must be a power of 2 to avoid expensive modulo - // operations in the following tight loop. - size_t Concurrency = 1; - if (Config->Threads) - Concurrency = - std::min(PowerOf2Floor(hardware_concurrency()), NumShards); - // Add section pieces to the builders. parallelForEachN(0, Concurrency, [&](size_t ThreadId) { for (MergeInputSection *Sec : Sections) { @@ -2273,6 +2249,70 @@ }); } +static uint32_t tailHash(StringRef S) { + if (S.size() < 4) + return 0; + return xxHash64(S.substr(S.size() - 4)); +} + +// This function basically does the same thing as MergeNoTailSection +// does but creates a tail-merged string table. E.g. "def\0" is merged +// with "abcdef\0". This is enabled only when -O2 is given. +// +// Input strings are sharded based on their very last few characters. +// This works because, if strings S and T can be tail-merged, S and T +// must share the same tail substring. Currently we use the last 4 +// characters for sharding. +// +// Note that strings shorter than 4 characters may not be tail-merged. +// That is okay because the loss seems negligible. My observation is +// that a loss is 8 KiB when we create a 1.7 GiB clang executable. +void MergeTailSection::finalizeContents() { + // Initializes TailHashes by hash values of last few characters. + parallelForEach(Sections, [&](MergeInputSection *Sec) { + Sec->TailHashes.resize(Sec->Pieces.size()); + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) + if (Sec->Pieces[I].Live) + Sec->TailHashes[I] = tailHash(Sec->getData(I).val()); + }); + + // Add section pieces to the builders. + parallelForEachN(0, Concurrency, [&](size_t ThreadId) { + for (MergeInputSection *Sec : Sections) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) { + if (!Sec->Pieces[I].Live) + continue; + size_t ShardId = Sec->TailHashes[I] % NumShards; + if ((ShardId & (Concurrency - 1)) == ThreadId) + Shards[ShardId].add(Sec->getData(I)); + } + } + }); + + parallelForEach(Shards, [](StringTableBuilder &S) { S.finalize(); }); + + // Compute an in-section offset for each shard. + size_t Off = 0; + for (size_t I = 0; I < NumShards; ++I) { + if (Shards[I].getSize() > 0) + Off = alignTo(Off, Alignment); + ShardOffsets[I] = Off; + Off += Shards[I].getSize(); + } + Size = Off; + + // Set to section piece offsets. + parallelForEach(Sections, [&](MergeInputSection *Sec) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) { + if (!Sec->Pieces[I].Live) + continue; + size_t ShardId = Sec->TailHashes[I] % NumShards; + Sec->Pieces[I].OutputOff = + ShardOffsets[ShardId] + Shards[ShardId].getOffset(Sec->getData(I)); + } + }); +} + static MergeSyntheticSection *createMergeSynthetic(StringRef Name, uint32_t Type, uint64_t Flags, Index: lld/test/ELF/merge-shared-str.s =================================================================== --- lld/test/ELF/merge-shared-str.s +++ lld/test/ELF/merge-shared-str.s @@ -1,16 +1,14 @@ // REQUIRES: x86 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: ld.lld %t.o -o %t.so -shared -O3 +// RUN: ld.lld %t.o -o %t.so -shared -O2 // RUN: llvm-readobj -r -s %t.so | FileCheck %s +.section foo,"aMS",@progbits,1 +.asciz "foobarbaz" +.asciz "barbaz" - .section foo,"aMS",@progbits,1 - .asciz "bar" - .asciz "ar" - - .data - .quad foo + 4 - +.data +.quad foo + 11 // CHECK: Name: foo // CHECK-NEXT: Type: SHT_PROGBITS @@ -23,6 +21,6 @@ // CHECK: Relocations [ // CHECK-NEXT: Section ({{.*}}) .rela.dyn { -// CHECK-NEXT: 0x{{.*}} R_X86_64_RELATIVE - 0x1C9 +// CHECK-NEXT: 0x{{.*}} R_X86_64_RELATIVE - 0x1CC // CHECK-NEXT: } // CHECK-NEXT: ] Index: lld/test/ELF/merge-string.s =================================================================== --- lld/test/ELF/merge-string.s +++ lld/test/ELF/merge-string.s @@ -8,12 +8,12 @@ // RUN: llvm-readobj -s -section-data -t %t.so | FileCheck --check-prefix=NOMERGE %s .section .rodata1,"aMS",@progbits,1 - .asciz "abc" + .asciz "abcdef" foo: - .ascii "a" + .ascii "abcd" bar: - .asciz "bc" - .asciz "bc" + .asciz "ef" + .asciz "cdef" .section .rodata2,"aMS",@progbits,2 .align 2 @@ -30,13 +30,13 @@ // CHECK-NEXT: ] // CHECK-NEXT: Address: 0x1C8 // CHECK-NEXT: Offset: 0x1C8 -// CHECK-NEXT: Size: 4 +// CHECK-NEXT: Size: // CHECK-NEXT: Link: 0 // CHECK-NEXT: Info: 0 // CHECK-NEXT: AddressAlignment: 1 // CHECK-NEXT: EntrySize: 0 // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 61626300 |abc.| +// CHECK-NEXT: |abcdef.| // CHECK-NEXT: ) // NOTAIL: Name: .rodata1 @@ -48,13 +48,13 @@ // NOTAIL-NEXT: ] // NOTAIL-NEXT: Address: 0x1C8 // NOTAIL-NEXT: Offset: 0x1C8 -// NOTAIL-NEXT: Size: 7 +// NOTAIL-NEXT: Size: // NOTAIL-NEXT: Link: 0 // NOTAIL-NEXT: Info: 0 // NOTAIL-NEXT: AddressAlignment: 1 // NOTAIL-NEXT: EntrySize: 0 // NOTAIL-NEXT: SectionData ( -// NOTAIL-NEXT: 0000: 62630061 626300 |bc.abc.| +// NOTAIL-NEXT: |cdef.abcdef.| // NOTAIL-NEXT: ) // NOMERGE: Name: .rodata1 @@ -66,13 +66,14 @@ // NOMERGE-NEXT: ] // NOMERGE-NEXT: Address: 0x1C8 // NOMERGE-NEXT: Offset: 0x1C8 -// NOMERGE-NEXT: Size: 11 +// NOMERGE-NEXT: Size: // NOMERGE-NEXT: Link: 0 // NOMERGE-NEXT: Info: 0 // NOMERGE-NEXT: AddressAlignment: 1 // NOMERGE-NEXT: EntrySize: 1 // NOMERGE-NEXT: SectionData ( -// NOMERGE-NEXT: 0000: 61626300 61626300 626300 |abc.abc.bc.| +// NOMERGE-NEXT: |abcdef.abcdef.cd| +// NOMERGE-NEXT: |ef.| // NOMERGE-NEXT: ) // CHECK: Name: .rodata2 @@ -82,24 +83,24 @@ // CHECK-NEXT: SHF_MERGE // CHECK-NEXT: SHF_STRINGS // CHECK-NEXT: ] -// CHECK-NEXT: Address: 0x1CC -// CHECK-NEXT: Offset: 0x1CC -// CHECK-NEXT: Size: 4 +// CHECK-NEXT: Address: 0x1D0 +// CHECK-NEXT: Offset: 0x1D0 +// CHECK-NEXT: Size: // CHECK-NEXT: Link: 0 // CHECK-NEXT: Info: 0 // CHECK-NEXT: AddressAlignment: 2 // CHECK-NEXT: EntrySize: 0 // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 14000000 |....| +// CHECK-NEXT: |....| // CHECK-NEXT: ) // CHECK: Name: bar -// CHECK-NEXT: Value: 0x1C9 +// CHECK-NEXT: Value: 0x1CC // CHECK: Name: foo // CHECK-NEXT: Value: 0x1C8 // CHECK: Name: zed -// CHECK-NEXT: Value: 0x1CC +// CHECK-NEXT: Value: 0x1D0 // CHECK-NEXT: Size: 0 Index: lld/test/ELF/tail-merge-string-align.s =================================================================== --- lld/test/ELF/tail-merge-string-align.s +++ lld/test/ELF/tail-merge-string-align.s @@ -1,20 +1,19 @@ // REQUIRES: x86 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: ld.lld %t.o -o %t.so -shared -O3 +// RUN: ld.lld %t.o -o %t.so -shared -O2 // RUN: llvm-readobj -s -section-data %t.so | FileCheck %s - .section .rodata.4a,"aMS",@progbits,1 - .align 4 - .asciz "abcdef" +.section .rodata.4a,"aMS",@progbits,1 +.align 4 +.asciz "abcdefg" - .section .rodata.4b,"aMS",@progbits,1 - .align 4 - .asciz "ef" - - .section .rodata.4c,"aMS",@progbits,1 - .align 4 - .asciz "f" +.section .rodata.4b,"aMS",@progbits,1 +.align 4 +.asciz "efg" +.section .rodata.4c,"aMS",@progbits,1 +.align 4 +.asciz "defgh" // CHECK: Name: .rodata // CHECK-NEXT: Type: SHT_PROGBITS @@ -25,11 +24,11 @@ // CHECK-NEXT: ] // CHECK-NEXT: Address: // CHECK-NEXT: Offset: -// CHECK-NEXT: Size: 1 +// CHECK-NEXT: Size: // CHECK-NEXT: Link: 0 // CHECK-NEXT: Info: 0 // CHECK-NEXT: AddressAlignment: 4 // CHECK-NEXT: EntrySize: // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 61626364 65660000 6600 |abcdef..f.| +// CHECK-NEXT: |abcdefg.defgh.| // CHECK-NEXT: )