Index: ELF/InputSection.h =================================================================== --- ELF/InputSection.h +++ ELF/InputSection.h @@ -160,11 +160,13 @@ // be found by looking at the next one) and put the hash in a side table. struct SectionPiece { SectionPiece(size_t Off, bool Live = false) - : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {} + : InputOff(Off), Live(Live || !Config->GcSections), First(false), + OutputOff(-1) {} - size_t InputOff; - ssize_t OutputOff : 8 * sizeof(ssize_t) - 1; + size_t InputOff : 8 * sizeof(size_t) - 2; size_t Live : 1; + size_t First : 1; + ssize_t OutputOff; }; static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t), "SectionPiece is too big"); @@ -194,20 +196,18 @@ // Splittable sections are handled as a sequence of data // rather than a single large blob of data. std::vector Pieces; + std::vector Hashes; // Returns I'th piece's data. This function is very hot when // string merging is enabled, so we want to inline. - LLVM_ATTRIBUTE_ALWAYS_INLINE - llvm::CachedHashStringRef getData(size_t I) const { + StringRef getString(size_t I) const { size_t Begin = Pieces[I].InputOff; size_t End; if (Pieces.size() - 1 == I) End = this->Data.size(); else End = Pieces[I + 1].InputOff; - - StringRef S = {(const char *)(this->Data.data() + Begin), End - Begin}; - return {S, Hashes[I]}; + return {(const char *)(this->Data.data() + Begin), End - Begin}; } // Returns the SectionPiece at a given input section offset. @@ -218,8 +218,6 @@ void splitStrings(ArrayRef A, size_t Size); void splitNonStrings(ArrayRef A, size_t Size); - std::vector Hashes; - mutable llvm::DenseMap OffsetMap; mutable std::once_flag InitOffsetMap; Index: ELF/OutputSections.h =================================================================== --- ELF/OutputSections.h +++ ELF/OutputSections.h @@ -145,9 +145,11 @@ private: void finalizeTailMerge(); void finalizeNoTailMerge(); + void finalizeConcurrent(); - llvm::StringTableBuilder Builder; + std::unique_ptr Builder; std::vector *> Sections; + size_t Alignment; }; struct CieRecord { Index: ELF/OutputSections.cpp =================================================================== --- ELF/OutputSections.cpp +++ ELF/OutputSections.cpp @@ -21,6 +21,7 @@ #include "llvm/Support/MD5.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/SHA1.h" +#include using namespace llvm; using namespace llvm::dwarf; @@ -467,11 +468,27 @@ template MergeOutputSection::MergeOutputSection(StringRef Name, uint32_t Type, uintX_t Flags, uintX_t Alignment) - : OutputSectionBase(Name, Type, Flags), - Builder(StringTableBuilder::RAW, Alignment) {} + : OutputSectionBase(Name, Type, Flags), Alignment(Alignment) { + assert(Alignment != 0 && isPowerOf2_64(Alignment)); +} template void MergeOutputSection::writeTo(uint8_t *Buf) { - Builder.write(Buf); + if (Builder) { + Builder->write(Buf); + return; + } + + // Builder is not used for sharded string table construction. + for (MergeInputSection *Sec : Sections) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) { + SectionPiece &Piece = Sec->Pieces[I]; + if (!Piece.Live || !Piece.First) + continue; + + StringRef S = Sec->getString(I); + memcpy(Buf + Piece.OutputOff, S.data(), S.size()); + } + } } template @@ -493,11 +510,11 @@ for (MergeInputSection *Sec : Sections) for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) if (Sec->Pieces[I].Live) - Builder.add(Sec->getData(I)); + Builder->add({Sec->getString(I), Sec->Hashes[I]}); // Fix the string table content. After this, the contents will never change. - Builder.finalize(); - this->Size = Builder.getSize(); + Builder->finalize(); + this->Size = Builder->getSize(); // finalize() fixed tail-optimized strings, so we can now get // offsets of strings. Get an offset for each string and save it @@ -505,7 +522,8 @@ for (MergeInputSection *Sec : Sections) for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) if (Sec->Pieces[I].Live) - Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I)); + Sec->Pieces[I].OutputOff = + Builder->getOffset({Sec->getString(I), Sec->Hashes[I]}); } template void MergeOutputSection::finalizeNoTailMerge() { @@ -516,17 +534,112 @@ for (MergeInputSection *Sec : Sections) for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) if (Sec->Pieces[I].Live) - Sec->Pieces[I].OutputOff = Builder.add(Sec->getData(I)); + Sec->Pieces[I].OutputOff = Builder->add(Sec->getString(I)); - Builder.finalizeInOrder(); - this->Size = Builder.getSize(); + Builder->finalizeInOrder(); + this->Size = Builder->getSize(); +} + +// If tail merge is not needed, we construct a string table ourselves +// to use a parallel algorithm. +// +// Here, we create not only one but N hash tables, where N is a +// parallelism. We invoke N threads. Each thread knows its thread +// index T where 0 <= T < N. For each string S in a given string set, +// thread T adds S to its own hash table only if hash(S) % N == T. +// +// The following loop can finish in 1/N time if you have infinite +// number of CPU cores, but in reality you don't want to make N +// infinitely large, because all N threads have to visit all section +// pieces. Larger N means larger overall cost. Skipping 30 million +// section pieces takes about 50 milliseconds, for example, so it's +// not very cheap. Currently, N=8 is chosen as a reasonable default. +// +// (In case you are wondering: we cannot adjust N at runtime because +// changing N changes output layout. We want to make the linker emit +// the same output on every invocation for the same input, so +// adjusting N at runtime that is not a viable option.) +template void MergeOutputSection::finalizeConcurrent() { + // This is a DenseMap with padding at end to prevent false sharing. + // We assume cache line size is 64 bytes. + typedef struct { + DenseMap Map; + uint8_t Pad[(64 - sizeof(Map) > 0) ? 64 - sizeof(Map) : 1]; + } OffsetMapTy; + + const int NumShards = 8; + OffsetMapTy OffsetMap[NumShards]; + size_t ShardSize[NumShards]; + + // Construct NumShards number of string tables in parallel. + forLoop(0, NumShards, [&](size_t Idx) { + size_t Offset = 0; + for (MergeInputSection *Sec : Sections) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) { + uint32_t Hash = Sec->Hashes[I]; + if ((Hash % NumShards) != Idx) + continue; + + SectionPiece &Piece = Sec->Pieces[I]; + if (!Piece.Live) + continue; + + StringRef S = Sec->getString(I); + size_t Off = alignTo(Offset, Alignment); + auto P = OffsetMap[Idx].Map.insert({{S, Hash}, Off}); + if (P.second) { + Piece.First = true; + Piece.OutputOff = Off; + Offset = Off + S.size(); + } else { + Piece.OutputOff = P.first->second; + } + } + } + ShardSize[Idx] = Offset; + }); + + // Piece.OutputOff was set independently, so we need to fix them. + // First, we compute starting offset for each shard. + size_t ShardOffset[NumShards]; + ShardOffset[0] = 0; + for (int I = 1; I != NumShards; ++I) { + size_t Off = ShardOffset[I - 1] + ShardSize[I - 1]; + if (ShardSize[I] > 0) + Off = alignTo(Off, Alignment); + ShardOffset[I] = Off; + } + + // Add a shard offset to each section piece. + forEach(Sections.begin(), Sections.end(), [&](MergeInputSection *Sec) { + for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) + if (Sec->Pieces[I].Live) + Sec->Pieces[I].OutputOff += ShardOffset[Sec->Hashes[I] % NumShards]; + }); + + // Set the size of this output section. + this->Size = ShardOffset[NumShards - 1] + ShardSize[NumShards - 1]; } template void MergeOutputSection::finalize() { - if (shouldTailMerge()) + // If we need tail merging, use the default string table builder + // because our concurrent one does not support tail merging. + if (shouldTailMerge()) { + Builder.reset(new StringTableBuilder(StringTableBuilder::RAW, Alignment)); finalizeTailMerge(); - else + return; + } + + // If threading is disabled, use the default string table builder + // because our concurrent one is not as fast as the single-threaded + // one on single core. + if (!Config->Threads || StringRef(getenv("LLD_TEST")) == "1") { + Builder.reset(new StringTableBuilder(StringTableBuilder::RAW, Alignment)); finalizeNoTailMerge(); + return; + } + + finalizeConcurrent(); } template Index: test/ELF/comment-gc.s =================================================================== --- test/ELF/comment-gc.s +++ test/ELF/comment-gc.s @@ -5,7 +5,7 @@ # RUN: llvm-objdump -s %t1 | FileCheck %s # CHECK: Contents of section .comment: -# CHECK-NEXT: 0000 00666f6f 00626172 004c4c44 20312e30 .foo.bar.LLD 1.0 +# CHECK-NEXT: 0000 62617200 4c4c4420 312e3000 666f6f00 bar.LLD 1.0.foo. # CHECK-NEXT: 0010 00 . .ident "foo" Index: test/ELF/compressed-debug-input.s =================================================================== --- test/ELF/compressed-debug-input.s +++ test/ELF/compressed-debug-input.s @@ -62,10 +62,10 @@ # DATA-NEXT: EntrySize: 1 # DATA-NEXT: SectionData ( # DATA-NEXT: 0000: 73686F72 7420756E 7369676E 65642069 |short unsigned i| -# DATA-NEXT: 0010: 6E740075 6E736967 6E656420 696E7400 |nt.unsigned int.| -# DATA-NEXT: 0020: 6C6F6E67 20756E73 69676E65 6420696E |long unsigned in| -# DATA-NEXT: 0030: 74006368 61720075 6E736967 6E656420 |t.char.unsigned | -# DATA-NEXT: 0040: 63686172 00 |char.| +# DATA-NEXT: 0010: 6E740063 68617200 756E7369 676E6564 |nt.char.unsigned| +# DATA-NEXT: 0020: 20636861 7200756E 7369676E 65642069 | char.unsigned i| +# DATA-NEXT: 0030: 6E74006C 6F6E6720 756E7369 676E6564 |nt.long unsigned| +# DATA-NEXT: 0040: 20696E74 00 | int.| # DATA-NEXT: ) # DATA-NEXT: } Index: test/ELF/debug-gc.s =================================================================== --- test/ELF/debug-gc.s +++ test/ELF/debug-gc.s @@ -4,11 +4,11 @@ # RUN: llvm-objdump -s %t1 | FileCheck %s # CHECK: Contents of section .debug_str: -# CHECK-NEXT: 0000 41414100 42424200 43434300 AAA.BBB.CCC. +# CHECK-NEXT: 0000 41414100 43434300 42424200 AAA.CCC.BBB. # CHECK: Contents of section .foo: # CHECK-NEXT: 0000 2a000000 # CHECK: Contents of section .debug_info: -# CHECK-NEXT: 0000 00000000 04000000 +# CHECK-NEXT: 0000 00000000 08000000 .globl _start _start: Index: test/ELF/gc-sections-merge.s =================================================================== --- test/ELF/gc-sections-merge.s +++ test/ELF/gc-sections-merge.s @@ -20,7 +20,7 @@ // CHECK-NEXT: AddressAlignment: 1 // CHECK-NEXT: EntrySize: 1 // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 666F6F00 62617200 |foo.bar.| +// CHECK-NEXT: 0000: 62617200 666F6F00 |bar.foo.| // CHECK-NEXT: ) // GC: Name: .rodata Index: test/ELF/merge-string-align.s =================================================================== --- test/ELF/merge-string-align.s +++ test/ELF/merge-string-align.s @@ -30,8 +30,8 @@ // CHECK-NEXT: AddressAlignment: 16 // CHECK-NEXT: EntrySize: // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 666F6F00 00000000 00000000 00000000 |foo.............| -// CHECK-NEXT: 0010: 62617200 |bar.| +// CHECK-NEXT: 0000: 62617200 00000000 00000000 00000000 |bar.............| +// CHECK-NEXT: 0010: 666F6F00 |foo.| // CHECK-NEXT: ) .section .rodata.str1.1,"aMS",@progbits,1 Index: test/ELF/merge.s =================================================================== --- test/ELF/merge.s +++ test/ELF/merge.s @@ -31,17 +31,17 @@ // CHECK-NEXT: AddressAlignment: 4 // CHECK-NEXT: EntrySize: 4 // CHECK-NEXT: SectionData ( -// CHECK-NEXT: 0000: 10000000 42000000 +// CHECK-NEXT: 0000: 42000000 10000000 // CHECK-NEXT: ) -// Address of the constant 0x10 = 0x200120 = 2097440 -// Address of the constant 0x42 = 0x200124 = 2097444 +// Address of the constant 0x10 = 0x200124 = 2097444 +// Address of the constant 0x42 = 0x200120 = 2097440 // CHECK: Symbols [ // CHECK: Name: bar -// CHECK-NEXT: Value: 0x200124 +// CHECK-NEXT: Value: 0x200120 // CHECK-NEXT: Size: 0 // CHECK-NEXT: Binding: Loca // CHECK-NEXT: Type: None @@ -49,7 +49,7 @@ // CHECK-NEXT: Section: .mysec // CHECK: Name: zed -// CHECK-NEXT: Value: 0x200124 +// CHECK-NEXT: Value: 0x200120 // CHECK-NEXT: Size: 0 // CHECK-NEXT: Binding: Local // CHECK-NEXT: Type: None @@ -57,7 +57,7 @@ // CHECK-NEXT: Section: .mysec // CHECK: Name: foo -// CHECK-NEXT: Value: 0x200124 +// CHECK-NEXT: Value: 0x200120 // CHECK-NEXT: Size: 0 // CHECK-NEXT: Binding: Local // CHECK-NEXT: Type: None @@ -75,37 +75,37 @@ // DISASM-NEXT: _start: movl .mysec, %eax -// addr(0x10) = 2097440 -// DISASM-NEXT: movl 2097440, %eax +// addr(0x10) = 2097444 +// DISASM-NEXT: movl 2097444, %eax movl .mysec+7, %eax -// addr(0x42) + 3 = 2097444 + 3 = 2097447 -// DISASM-NEXT: movl 2097447, %eax +// addr(0x42) + 3 = 2097440 + 3 = 2097443 +// DISASM-NEXT: movl 2097443, %eax movl .mysec+8, %eax -// addr(0x42) = 2097444 -// DISASM-NEXT: movl 2097444, %eax +// addr(0x42) = 2097440 +// DISASM-NEXT: movl 2097440, %eax movl bar+7, %eax -// addr(0x42) + 7 = 2097444 + 7 = 2097451 -// DISASM-NEXT: movl 2097451, %eax +// addr(0x42) + 7 = 2097440 + 7 = 2097447 +// DISASM-NEXT: movl 2097447, %eax movl bar+8, %eax -// addr(0x42) + 8 = 2097444 + 8 = 2097452 -// DISASM-NEXT: movl 2097452, %eax +// addr(0x42) + 8 = 2097440 + 8 = 2097448 +// DISASM-NEXT: movl 2097448, %eax movl foo, %eax -// addr(0x42) = 2097444 -// DISASM-NEXT: movl 2097444, %eax +// addr(0x42) = 2097440 +// DISASM-NEXT: movl 2097440, %eax movl foo+7, %eax -// addr(0x42) + 7 = = 2097444 + 7 = 2097451 -// DISASM-NEXT: movl 2097451, %eax +// addr(0x42) + 7 = = 2097440 + 7 = 2097447 +// DISASM-NEXT: movl 2097447, %eax movl foo+8, %eax -// addr(0x42) + 8 = = 2097444 + 8 = 2097452 -// DISASM-NEXT: movl 2097452, %eax +// addr(0x42) + 8 = = 2097440 + 8 = 2097448 +// DISASM-NEXT: movl 2097448, %eax // From the other file: movl .mysec, %eax -// addr(0x42) = 2097444 -// DISASM-NEXT: movl 2097444, %eax +// addr(0x42) = 2097440 +// DISASM-NEXT: movl 2097440, %eax