Index: ELF/InputSection.h
===================================================================
--- ELF/InputSection.h
+++ ELF/InputSection.h
@@ -160,11 +160,13 @@
 // be found by looking at the next one) and put the hash in a side table.
 struct SectionPiece {
   SectionPiece(size_t Off, bool Live = false)
-      : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {}
+      : InputOff(Off), Live(Live || !Config->GcSections), OutputOff(-1),
+        First(false) {}
 
-  size_t InputOff;
-  ssize_t OutputOff : 8 * sizeof(ssize_t) - 1;
+  size_t InputOff : 8 * sizeof(size_t) - 1;
   size_t Live : 1;
+  ssize_t OutputOff : 8 * sizeof(ssize_t) - 1;
+  size_t First : 1;
 };
 static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t),
               "SectionPiece is too big");
Index: ELF/OutputSections.h
===================================================================
--- ELF/OutputSections.h
+++ ELF/OutputSections.h
@@ -16,12 +16,14 @@
 #include "lld/Core/LLVM.h"
 #include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Object/ELF.h"
+#include <functional>
 
 namespace lld {
 namespace elf {
 
 class SymbolBody;
 struct EhSectionPiece;
+struct SectionPiece;
 template <class ELFT> class EhInputSection;
 template <class ELFT> class InputSection;
 template <class ELFT> class InputSectionBase;
@@ -142,9 +144,13 @@
 private:
   void finalizeTailMerge();
   void finalizeNoTailMerge();
+  void forEachPiece(
+      ArrayRef<MergeInputSection<ELFT> *> Sections,
+      std::function<void(SectionPiece &Piece, llvm::CachedHashStringRef S)> Fn);
 
   llvm::StringTableBuilder Builder;
   std::vector<MergeInputSection<ELFT> *> Sections;
+  size_t StringAlignment;
 };
 
 struct CieRecord {
Index: ELF/OutputSections.cpp
===================================================================
--- ELF/OutputSections.cpp
+++ ELF/OutputSections.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SHA1.h"
+#include <atomic>
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -470,10 +471,26 @@
 MergeOutputSection<ELFT>::MergeOutputSection(StringRef Name, uint32_t Type,
                                             uintX_t Flags, uintX_t Alignment)
     : OutputSectionBase<ELFT>(Name, Type, Flags),
-      Builder(StringTableBuilder::RAW, Alignment) {}
+      Builder(StringTableBuilder::RAW, Alignment), StringAlignment(Alignment) {
+  assert(Alignment != 0 && isPowerOf2_64(Alignment));
+}
 
 template <class ELFT> void MergeOutputSection<ELFT>::writeTo(uint8_t *Buf) {
-  Builder.write(Buf);
+  if (shouldTailMerge()) {
+    Builder.write(Buf);
+    return;
+  }
+
+  // Builder is not used for probabilistic string merging.
+  for (MergeInputSection<ELFT> *Sec : Sections) {
+    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I) {
+      SectionPiece &Piece = Sec->Pieces[I];
+      if (!Piece.Live || !Piece.First)
+        continue;
+      StringRef S = Sec->getData(I).val();
+      memcpy(Buf + Piece.OutputOff, S.data(), S.size());
+    }
+  }
 }
 
 template <class ELFT>
@@ -524,11 +541,103 @@
   this->Size = Builder.getSize();
 }
 
+static size_t align2(size_t Val, size_t Alignment) {
+  return (Val + Alignment - 1) & ~(Alignment - 1);
+}
+
+// Call Fn for each section piece.
+template <class ELFT>
+void MergeOutputSection<ELFT>::forEachPiece(
+    ArrayRef<MergeInputSection<ELFT> *> Sections,
+    std::function<void(SectionPiece &Piece, CachedHashStringRef S)> Fn) {
+  for (MergeInputSection<ELFT> *Sec : Sections)
+    for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+      if (Sec->Pieces[I].Live)
+        Fn(Sec->Pieces[I], Sec->getData(I));
+}
+
+// Split a vector Vec into smaller vectors.
+template <class T>
+static std::vector<std::vector<T>> split(std::vector<T> Vec, size_t NumShards) {
+  std::vector<std::vector<T>> Ret(NumShards);
+  size_t I = 0;
+  for (T &Elem : Vec)
+    Ret[I++ % NumShards].push_back(Elem);
+  return Ret;
+}
+
 template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
-  if (shouldTailMerge())
+  if (shouldTailMerge()) {
     finalizeTailMerge();
-  else
-    finalizeNoTailMerge();
+    return;
+  }
+
+  // This implements a probabilistic string merging algorithm.
+  //
+  // In this function, we merge identical strings and assign contiguous
+  // offsets to unique strings to create a string table. This is one of the
+  // most time-consuming passes in LLD because of the sheer number of strings.
+  // When we link large programs such as Clang with debug info, we need to
+  // merge thousands of sections containing millions of string pieces.
+  //
+  // On my Xeon 2.8 GHz machine, merging 30 million strings (average length
+  // is about 50 bytes) using a single-threaded hash table takes about 2
+  // seconds.
+  //
+  // The probabilistic algorithm improves the latency to 300 milliseconds by
+  // allowing a loss in output size. In other words, this algorithm is faster
+  // but produces a larger string table. If you need to optimize for size,
+  // you should pass -O2 to LLD.
+  //
+  // Here's how it works. In the first step, we take 10% of input string set
+  // to create a small string table. The resulting string table is very
+  // unlikely to contain all strings of the entire set, but it is likely to
+  // contain most of duplicated strings, because duplicated strings are
+  // repeated many times.
+  //
+  // The second step processes the remaining 90% in parallel. In this step,
+  // we do not merge strings. So, if a string is not in the small string
+  // table we created in the first step, that will be just appended to end
+  // of the string table. This step completes the string table.
+
+  DenseMap<CachedHashStringRef, size_t> OffsetMap;
+  std::atomic<size_t> Offset;
+  Offset.store(0);
+
+  size_t NumShards = 100;
+  std::vector<std::vector<MergeInputSection<ELFT> *>> Shards =
+      split(Sections, NumShards);
+
+  // Step 1: construct a small string table
+  for (size_t I = 0; I < NumShards / 10; ++I) {
+    forEachPiece(Shards[I], [&](SectionPiece &Piece, CachedHashStringRef S) {
+      auto P = OffsetMap.insert({S, Offset.load()});
+      if (P.second) {
+        Piece.First = true;
+        Piece.OutputOff = Offset.load();
+        Offset.store(align2(Offset.load() + S.size(), StringAlignment));
+      } else {
+        Piece.OutputOff = P.first->second;
+      }
+    });
+  }
+
+  // Step 2: append remaining strings
+  parallel_for_each(
+      Shards.begin() + NumShards / 10, Shards.end(),
+      [&](ArrayRef<MergeInputSection<ELFT> *> Sections) {
+        forEachPiece(Sections, [&](SectionPiece &Piece, CachedHashStringRef S) {
+          auto It = OffsetMap.find(S);
+          if (It == OffsetMap.end()) {
+            Piece.First = true;
+            size_t Size = align2(S.size(), StringAlignment);
+            Piece.OutputOff = Offset.fetch_add(Size);
+          } else {
+            Piece.OutputOff = It->second;
+          }
+        });
+      });
+
+  this->Size = Offset.load();
 }
 
 template <class ELFT>