Index: lld/ELF/InputSection.h =================================================================== --- lld/ELF/InputSection.h +++ lld/ELF/InputSection.h @@ -215,13 +215,14 @@ // be found by looking at the next one) and put the hash in a side table. struct SectionPiece { SectionPiece(size_t Off, uint32_t Hash, bool Live) - : InputOff(Off), Hash(Hash), OutputOff(-1), - Live(Live || !Config->GcSections) {} + : InputOff(Off), Hash(Hash), Live(Live || !Config->GcSections), + TailHash(0), OutputOff(-1) {} uint32_t InputOff; uint32_t Hash; - uint64_t OutputOff : 63; uint64_t Live : 1; + uint64_t TailHash : 10; + uint64_t OutputOff : 53; }; static_assert(sizeof(SectionPiece) == 16, "SectionPiece is too big"); Index: lld/ELF/SyntheticSections.h =================================================================== --- lld/ELF/SyntheticSections.h +++ lld/ELF/SyntheticSections.h @@ -662,26 +662,33 @@ class MergeSyntheticSection : public SyntheticSection { public: void addSection(MergeInputSection *MS); + size_t getSize() const override { return Size; } + void writeTo(uint8_t *Buf) override; protected: MergeSyntheticSection(StringRef Name, uint32_t Type, uint64_t Flags, - uint32_t Alignment) - : SyntheticSection(Flags, Type, Alignment, Name) {} + uint32_t Alignment); std::vector Sections; + + // Section size + size_t Size; + + // Concurrency level. + size_t Concurrency; + + // String table contents + constexpr static size_t NumShards = 32; + std::vector Shards; + size_t ShardOffsets[NumShards]; }; class MergeTailSection final : public MergeSyntheticSection { public: MergeTailSection(StringRef Name, uint32_t Type, uint64_t Flags, - uint32_t Alignment); - - size_t getSize() const override; - void writeTo(uint8_t *Buf) override; + uint32_t Alignment) + : MergeSyntheticSection(Name, Type, Flags, Alignment) {} void finalizeContents() override; - -private: - llvm::StringTableBuilder Builder; }; class MergeNoTailSection final : public MergeSyntheticSection { @@ -689,9 +696,6 @@ MergeNoTailSection(StringRef Name, uint32_t Type, uint64_t Flags, uint32_t Alignment) : MergeSyntheticSection(Name, Type, Flags, Alignment) {} - - size_t getSize() const override { return Size; } - void writeTo(uint8_t *Buf) override; void finalizeContents() override; private: @@ -703,14 +707,6 @@ size_t getShardId(uint32_t Hash) { return Hash >> (32 - llvm::countTrailingZeros(NumShards)); } - - // Section size - size_t Size; - - // String table contents - constexpr static size_t NumShards = 32; - std::vector Shards; - size_t ShardOffsets[NumShards]; }; // .MIPS.abiflags section. Index: lld/ELF/SyntheticSections.cpp =================================================================== --- lld/ELF/SyntheticSections.cpp +++ lld/ELF/SyntheticSections.cpp @@ -49,7 +49,7 @@ using namespace lld; using namespace lld::elf; -constexpr size_t MergeNoTailSection::NumShards; +constexpr size_t MergeSyntheticSection::NumShards; uint64_t SyntheticSection::getVA() const { if (OutputSection *Sec = getParent()) @@ -2204,41 +2204,28 @@ return getNeedNum() == 0; } +MergeSyntheticSection::MergeSyntheticSection(StringRef Name, uint32_t Type, + uint64_t Flags, uint32_t Alignment) + : SyntheticSection(Flags, Type, Alignment, Name) { + // Initialize string builders. + for (size_t I = 0; I < NumShards; ++I) + Shards.emplace_back(StringTableBuilder::RAW, Alignment); + + // Initializes concurrency level. Must be a power of 2 to avoid + // the expensive modulo operation when dividing with this value. + if (ThreadsEnabled) + Concurrency = + std::min(PowerOf2Floor(hardware_concurrency()), NumShards); + else + Concurrency = 1; +} + void MergeSyntheticSection::addSection(MergeInputSection *MS) { MS->Parent = this; Sections.push_back(MS); } -MergeTailSection::MergeTailSection(StringRef Name, uint32_t Type, - uint64_t Flags, uint32_t Alignment) - : MergeSyntheticSection(Name, Type, Flags, Alignment), - Builder(StringTableBuilder::RAW, Alignment) {} - -size_t MergeTailSection::getSize() const { return Builder.getSize(); } - -void MergeTailSection::writeTo(uint8_t *Buf) { Builder.write(Buf); } - -void MergeTailSection::finalizeContents() { - // Add all string pieces to the string table builder to create section - // contents. - for (MergeInputSection *Sec : Sections) - for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) - if (Sec->Pieces[I].Live) - Builder.add(Sec->getData(I)); - - // Fix the string table content. After this, the contents will never change. - Builder.finalize(); - - // finalize() fixed tail-optimized strings, so we can now get - // offsets of strings. Get an offset for each string and save it - // to a corresponding StringPiece for easy access. - for (MergeInputSection *Sec : Sections) - for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) - if (Sec->Pieces[I].Live) - Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I)); -} - -void MergeNoTailSection::writeTo(uint8_t *Buf) { +void MergeSyntheticSection::writeTo(uint8_t *Buf) { for (size_t I = 0; I < NumShards; ++I) Shards[I].write(Buf + ShardOffsets[I]); } @@ -2252,17 +2239,6 @@ // T into different string builders without worrying about merge misses. // We do it in parallel. void MergeNoTailSection::finalizeContents() { - // Initializes string table builders. - for (size_t I = 0; I < NumShards; ++I) - Shards.emplace_back(StringTableBuilder::RAW, Alignment); - - // Concurrency level. Must be a power of 2 to avoid expensive modulo - // operations in the following tight loop. - size_t Concurrency = 1; - if (ThreadsEnabled) - Concurrency = - std::min(PowerOf2Floor(hardware_concurrency()), NumShards); - // Add section pieces to the builders. parallelForEachN(0, Concurrency, [&](size_t ThreadId) { for (MergeInputSection *Sec : Sections) { @@ -2297,6 +2273,70 @@ }); } +static uint32_t tailHash(StringRef S) { + if (S.size() < 4) + return 0; + const uint8_t *P = (const uint8_t *)S.data() + S.size(); + return hash_value((P[-4] << 24) | (P[-3] << 16) | (P[-2] << 8) | P[-1]); +} + +// This function basically does the same thing as MergeNoTailSection +// does but creates a tail-merged string table. E.g. "def\0" is merged +// with "abcdef\0". This is enabled only when -O2 is given. +// +// Input strings are sharded based on their very last few characters. +// This works because, if strings S and T can be tail-merged, S and T +// must have the same tail substring. Currently we use the last 4 +// characters for sharding. +// +// Note that strings shorter than 4 characters may not be tail-merged. +// That is okay because the loss seems negligible. My observation is +// that a loss is 8 KiB when we create a 1.7 GiB clang executable. +void MergeTailSection::finalizeContents() { + // Initializes TailHashes by hash values of last few characters. + parallelForEach(Sections, [&](MergeInputSection *Sec) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) + if (Sec->Pieces[I].Live) + Sec->Pieces[I].TailHash = tailHash(Sec->getData(I).val()); + }); + + // Add section pieces to the builders. + parallelForEachN(0, Concurrency, [&](size_t ThreadId) { + for (MergeInputSection *Sec : Sections) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) { + if (!Sec->Pieces[I].Live) + continue; + size_t ShardId = Sec->Pieces[I].TailHash % NumShards; + if ((ShardId & (Concurrency - 1)) == ThreadId) + Shards[ShardId].add(Sec->getData(I)); + } + } + }); + + parallelForEach(Shards, [](StringTableBuilder &S) { S.finalize(); }); + + // Compute an in-section offset for each shard. + size_t Off = 0; + for (size_t I = 0; I < NumShards; ++I) { + if (Shards[I].getSize() > 0) + Off = alignTo(Off, Alignment); + ShardOffsets[I] = Off; + Off += Shards[I].getSize(); + } + Size = Off; + + // Set to section piece offsets. + parallelForEach(Sections, [&](MergeInputSection *Sec) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) { + if (!Sec->Pieces[I].Live) + continue; + size_t ShardId = Sec->Pieces[I].TailHash % NumShards; + Sec->Pieces[I].OutputOff = + ShardOffsets[ShardId] + Shards[ShardId].getOffset(Sec->getData(I)); + } + }); +} + static MergeSyntheticSection *createMergeSynthetic(StringRef Name, uint32_t Type, uint64_t Flags, Index: lld/test/ELF/merge-shared-str.s =================================================================== --- lld/test/ELF/merge-shared-str.s +++ lld/test/ELF/merge-shared-str.s @@ -1,16 +1,14 @@ // REQUIRES: x86 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: ld.lld %t.o -o %t.so -shared -O3 +// RUN: ld.lld %t.o -o %t.so -shared -O2 // RUN: llvm-readobj -r -s %t.so | FileCheck %s +.section foo,"aMS",@progbits,1 +.asciz "foobarbaz" +.asciz "barbaz" - .section foo,"aMS",@progbits,1 - .asciz "bar" - .asciz "ar" - - .data - .quad foo + 4 - +.data +.quad foo + 11 // CHECK: Name: foo // CHECK-NEXT: Type: SHT_PROGBITS @@ -23,6 +21,6 @@ // CHECK: Relocations [ // CHECK-NEXT: Section ({{.*}}) .rela.dyn { -// CHECK-NEXT: 0x{{.*}} R_X86_64_RELATIVE - 0x1C9 +// CHECK-NEXT: 0x{{.*}} R_X86_64_RELATIVE - 0x1CC // CHECK-NEXT: } // CHECK-NEXT: ] Index: lld/test/ELF/merge-string.s =================================================================== --- lld/test/ELF/merge-string.s +++ lld/test/ELF/merge-string.s @@ -8,12 +8,12 @@ // RUN: llvm-readobj -s -section-data -t %t.so | FileCheck --check-prefix=NOMERGE %s .section .rodata1,"aMS",@progbits,1 - .asciz "abc" + .asciz "abcdef" foo: - .ascii "a" + .ascii "abcd" bar: - .asciz "bc" - .asciz "bc" + .asciz "ef" + .asciz "cdef" .section .rodata2,"aMS",@progbits,2 .align 2 @@ -30,13 +30,13 @@ // CHECK-NEXT: ] // CHECK-NEXT: Address: 0x1C8 // CHECK-NEXT: Offset: 0x1C8 -// CHECK-NEXT: Size: 4 +// CHECK-NEXT: Size: // CHECK-NEXT: Link: 0 // CHECK-NEXT: Info: 0 // CHECK-NEXT: AddressAlignment: 1 // CHECK-NEXT: EntrySize: 0 // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 61626300 |abc.| +// CHECK-NEXT: |abcdef.| // CHECK-NEXT: ) // NOTAIL: Name: .rodata1 @@ -48,13 +48,13 @@ // NOTAIL-NEXT: ] // NOTAIL-NEXT: Address: 0x1C8 // NOTAIL-NEXT: Offset: 0x1C8 -// NOTAIL-NEXT: Size: 7 +// NOTAIL-NEXT: Size: // NOTAIL-NEXT: Link: 0 // NOTAIL-NEXT: Info: 0 // NOTAIL-NEXT: AddressAlignment: 1 // NOTAIL-NEXT: EntrySize: 0 // NOTAIL-NEXT: SectionData ( -// NOTAIL-NEXT: 0000: 62630061 626300 |bc.abc.| +// NOTAIL-NEXT: |cdef.abcdef.| // NOTAIL-NEXT: ) // NOMERGE: Name: .rodata1 @@ -66,13 +66,14 @@ // NOMERGE-NEXT: ] // NOMERGE-NEXT: Address: 0x1C8 // NOMERGE-NEXT: Offset: 0x1C8 -// NOMERGE-NEXT: Size: 11 +// NOMERGE-NEXT: Size: // NOMERGE-NEXT: Link: 0 // NOMERGE-NEXT: Info: 0 // NOMERGE-NEXT: AddressAlignment: 1 // NOMERGE-NEXT: EntrySize: 1 // NOMERGE-NEXT: SectionData ( -// NOMERGE-NEXT: 0000: 61626300 61626300 626300 |abc.abc.bc.| +// NOMERGE-NEXT: |abcdef.abcdef.cd| +// NOMERGE-NEXT: |ef.| // NOMERGE-NEXT: ) // CHECK: Name: .rodata2 @@ -82,24 +83,24 @@ // CHECK-NEXT: SHF_MERGE // CHECK-NEXT: SHF_STRINGS // CHECK-NEXT: ] -// CHECK-NEXT: Address: 0x1CC -// CHECK-NEXT: Offset: 0x1CC -// CHECK-NEXT: Size: 4 +// CHECK-NEXT: Address: 0x1D0 +// CHECK-NEXT: Offset: 0x1D0 +// CHECK-NEXT: Size: // CHECK-NEXT: Link: 0 // CHECK-NEXT: Info: 0 // CHECK-NEXT: AddressAlignment: 2 // CHECK-NEXT: EntrySize: 0 // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 14000000 |....| +// CHECK-NEXT: |....| // CHECK-NEXT: ) // CHECK: Name: bar -// CHECK-NEXT: Value: 0x1C9 +// CHECK-NEXT: Value: 0x1CC // CHECK: Name: foo // CHECK-NEXT: Value: 0x1C8 // CHECK: Name: zed -// CHECK-NEXT: Value: 0x1CC +// CHECK-NEXT: Value: 0x1D0 // CHECK-NEXT: Size: 0 Index: lld/test/ELF/tail-merge-string-align.s =================================================================== --- lld/test/ELF/tail-merge-string-align.s +++ lld/test/ELF/tail-merge-string-align.s @@ -1,20 +1,19 @@ // REQUIRES: x86 // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o -// RUN: ld.lld %t.o -o %t.so -shared -O3 +// RUN: ld.lld %t.o -o %t.so -shared -O2 // RUN: llvm-readobj -s -section-data %t.so | FileCheck %s - .section .rodata.4a,"aMS",@progbits,1 - .align 4 - .asciz "abcdef" +.section .rodata.4a,"aMS",@progbits,1 +.align 4 +.asciz "abcdefg" - .section .rodata.4b,"aMS",@progbits,1 - .align 4 - .asciz "ef" - - .section .rodata.4c,"aMS",@progbits,1 - .align 4 - .asciz "f" +.section .rodata.4b,"aMS",@progbits,1 +.align 4 +.asciz "efg" +.section .rodata.4c,"aMS",@progbits,1 +.align 4 +.asciz "defgh" // CHECK: Name: .rodata // CHECK-NEXT: Type: SHT_PROGBITS @@ -25,11 +24,11 @@ // CHECK-NEXT: ] // CHECK-NEXT: Address: // CHECK-NEXT: Offset: -// CHECK-NEXT: Size: 1 +// CHECK-NEXT: Size: // CHECK-NEXT: Link: 0 // CHECK-NEXT: Info: 0 // CHECK-NEXT: AddressAlignment: 4 // CHECK-NEXT: EntrySize: // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 61626364 65660000 6600 |abcdef..f.| +// CHECK-NEXT: |defgh...abcdefg.| // CHECK-NEXT: )