Index: ELF/CMakeLists.txt
===================================================================
--- ELF/CMakeLists.txt
+++ ELF/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_public_tablegen_target(ELFOptionsTableGen)
 
 add_lld_library(lldELF
+  Concurrent.cpp
   Driver.cpp
   DriverUtils.cpp
   EhFrame.cpp
Index: ELF/Concurrent.h
===================================================================
--- /dev/null
+++ ELF/Concurrent.h
@@ -0,0 +1,139 @@
+//===- Concurrent.h -------------------------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_CONCURRENT_H
+#define LLD_ELF_CONCURRENT_H
+
+#include "lld/Core/LLVM.h"
+#include <atomic>
+
+namespace llvm {
+class CachedHashStringRef;
+}
+
+namespace lld {
+namespace elf {
+struct SectionPiece;
+
+// This is a concurrent lock-free string table builder. Internally
+// it uses atomic variables to keep inserted strings and associated
+// SectionPieces.
+//
+// The reason why we have a special-purpose concurrent hash table in
+// LLD is because we need to process a very large number of mergeable
+// strings. For example, in the final build of clang with debug info,
+// thousands of input sections containing 30 million mergeable strings
+// in total are fed to the linker. It takes a few seconds to merge
+// them in a single thread. This concurrent hash table can uniquify
+// them in a few hundred milliseconds.
+//
+// The internal hash table is an open-addressing one. It doesn't
+// support resizing. Once it becomes full, you need to redo with
+// a larger fresh string table builder.
+//
+// Just like DenseMap, keys and values are directly stored to buckets
+// as pairs. Originally, keys and values are all null. Keys are
+// pointers to strings.
+//
+// SectionPieces are managed using a singly linked list.
+// If a bucket has a value, it has a pointer to a SectionPiece. Other
+// SectionPieces having the same string key are chained using the `Next`
+// pointer of SectionPiece.
+//
+// Once you have added all strings, you need to call finalize() to fix
+// string table contents. When finalize() is called, the hash table
+// contents are nondeterministic because no one knows which thread
+// has claimed earlier buckets in the hash table.
+//
+// Thus, the next step is to make it deterministic. For a streak of
+// buckets having some values, we sort them. No matter in what order
+// multiple threads claim buckets, claimed buckets are claimed, and
+// unclaimed buckets remain unclaimed. Therefore, by sorting buckets
+// for each streak, we can make the entire hash table deterministic.
+//
+// Finally, we assign offsets to all SectionPieces associated with
+// this hash table.
+class ConcurrentStringTableBuilder {
+public:
+  ConcurrentStringTableBuilder(size_t EstimatedNumEntries, size_t Alignment);
+  ~ConcurrentStringTableBuilder();
+
+  void insert(SectionPiece &Piece, llvm::CachedHashStringRef Str);
+  void finalize();
+  bool isFull();
+  void writeTo(uint8_t *Buf);
+  size_t size() const { return StringTableSize; }
+
+private:
+  // Bucket entry type.
+  struct EntryTy {
+    // String key information. String key is guaranteed to be unique
+    // in a hash table.
+    std::atomic<const char *> Str;
+    std::atomic<uint32_t> Size;
+
+    // Offset in the string table. Filled by finalize().
+    uint32_t Offset;
+
+    // SectionPieces having the same string contents are chained
+    // using pointers. This pointer points to the first node.
+    std::atomic<SectionPiece *> Piece;
+
+    EntryTy(const EntryTy &Other) { memcpy(this, &Other, sizeof(EntryTy)); }
+
+    EntryTy &operator=(const EntryTy &Other) {
+      memcpy(this, &Other, sizeof(EntryTy));
+      return *this;
+    }
+
+    // Define a total order for string keys. The detail is not important,
+    // but it needs to be deterministic, so that we can get deterministic
+    // output from the hash table.
+    bool operator<(const EntryTy &Other) const {
+      size_t SizeA = Size.load();
+      size_t SizeB = Other.Size.load();
+      if (SizeA != SizeB)
+        return SizeA < SizeB;
+      return memcmp(Str.load(), Other.Str.load(), SizeA) < 0;
+    }
+  };
+
+  // We could allocate tens of millions of objects of this type,
+  // so we want to make sure that it's small.
+  static_assert(sizeof(EntryTy) <= 24, "EntryTy is too big");
+
+  size_t align2(size_t Val) { return (Val + Alignment - 1) & ~(Alignment - 1); }
+  void append(EntryTy &Bucket, SectionPiece &Piece);
+  void sortBuckets();
+  void sortWrapAround();
+
+  // All strings are aligned to this boundary.
+  size_t Alignment;
+
+  // Filled by finalize().
+  size_t NumEntries = 0;
+  size_t StringTableSize = 0;
+
+  // The buckets and the number of allocated buckets.
+  EntryTy *Buckets;
+  size_t NumBuckets;
+
+  // The counter to track the number of inserted strings.
+  // Note that the number is approximate.
+  std::atomic<size_t> Counter;
+
+  // We do not support table resizing. If it becomes almost full,
+  // we should bail out and redo with a larger table. This bool
+  // value keeps track of that.
+  bool IsTableFull = false;
+};
+}
+}
+
+#endif
Index: ELF/Concurrent.cpp
===================================================================
--- /dev/null
+++ ELF/Concurrent.cpp
@@ -0,0 +1,173 @@
+//===- Concurrent.cpp -----------------------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Concurrent.h"
+#include "InputSection.h"
+#include "lld/Core/Parallel.h"
+
+#include "llvm/ADT/CachedHashString.h"
+
+using namespace lld;
+using namespace lld::elf;
+using namespace llvm;
+
+// Our target load factor is 0.5.
+ConcurrentStringTableBuilder::ConcurrentStringTableBuilder(
+    size_t EstimatedNumEntries, size_t Alignment)
+    : Alignment(Alignment),
+      NumBuckets(std::max<size_t>(PowerOf2Ceil(EstimatedNumEntries * 2), 1024)),
+      Counter(0) {
+  Buckets = (EntryTy *)calloc(NumBuckets, sizeof(EntryTy));
+}
+
+ConcurrentStringTableBuilder::~ConcurrentStringTableBuilder() { free(Buckets); }
+
+// Inserts a given section piece to the string table.
+void ConcurrentStringTableBuilder::insert(SectionPiece &Piece,
+                                          CachedHashStringRef Str) {
+  assert(Str.size() != 0);
+
+  size_t Start = Str.hash() & (NumBuckets - 1);
+
+  // The end condition is masked so that the probe sequence terminates
+  // even when Start is 0 (an unmasked Start - 1 would wrap around to
+  // SIZE_MAX, which the masked index I can never reach).
+  for (size_t I = Start; I != ((Start - 1) & (NumBuckets - 1));
+       I = (I + 1) & (NumBuckets - 1)) {
+    EntryTy &Bucket = Buckets[I];
+
+    // If the current bucket is empty, claim it.
+    const char *Null = nullptr;
+    if (Bucket.Str.compare_exchange_strong(Null, Str.val().data())) {
+      Bucket.Size.store(Str.size());
+      append(Bucket, Piece);
+
+      // Update the counter for inserted items. If 3/4 of the buckets
+      // are occupied, it is considered full. We do this only with
+      // 1/256 probability to avoid contention, so the counter is
+      // approximate.
+      if (Str.hash() % 256 != 0)
+        return;
+      size_t Cnt = Counter.fetch_add(256);
+      if (Cnt * 4 > NumBuckets * 3)
+        IsTableFull = true;
+      return;
+    }
+
+    // The current bucket contains some string. Its size might not be
+    // written by another thread yet, so keep loading until it becomes
+    // observable.
+    const char *OldStr = Bucket.Str.load();
+    uint64_t OldSize = 0;
+    while (OldSize == 0)
+      OldSize = Bucket.Size.load();
+
+    // If the current bucket contains the key that we are looking for,
+    // append Piece to the bucket. Otherwise, go to the next bucket.
+    if (Str.val() == StringRef(OldStr, OldSize)) {
+      append(Bucket, Piece);
+      return;
+    }
+  }
+  IsTableFull = true;
+}
+
+void ConcurrentStringTableBuilder::finalize() {
+  // Sort buckets to make the hash table contents deterministic.
+  sortBuckets();
+  sortWrapAround();
+
+  // Set an offset in the string table for each bucket.
+  size_t Off = 0;
+  for (size_t I = 0; I < NumBuckets; ++I) {
+    if (size_t Size = Buckets[I].Size.load()) {
+      Off = align2(Off);
+      Buckets[I].Offset = Off;
+      Off += Buckets[I].Size.load();
+      ++NumEntries;
+    }
+  }
+  StringTableSize = Off;
+
+  // Update SectionPieces' offsets.
+  parallel_for(size_t(0), NumBuckets, [&](size_t I) {
+    if (SectionPiece *Cur = Buckets[I].Piece.load()) {
+      for (SectionPiece *Next = Cur->Next; Next; Cur = Next, Next = Next->Next)
+        Cur->OutputOff = Buckets[I].Offset;
+      Cur->OutputOff = Buckets[I].Offset;
+    }
+  });
+}
+
+bool ConcurrentStringTableBuilder::isFull() {
+  return IsTableFull || (NumEntries * 4 > NumBuckets * 3);
+}
+
+void ConcurrentStringTableBuilder::writeTo(uint8_t *Buf) {
+  assert(!isFull());
+
+  for (size_t I = 0; I < NumBuckets; ++I) {
+    if (const char *Str = Buckets[I].Str.load()) {
+      size_t Size = Buckets[I].Size.load();
+      memcpy(Buf + Buckets[I].Offset, Str, Size);
+    }
+  }
+}
+
+// Atomically append Piece to Bucket.
+void ConcurrentStringTableBuilder::append(EntryTy &Bucket,
+                                          SectionPiece &Piece) {
+  for (;;) {
+    Piece.Next = Bucket.Piece.load();
+    if (Bucket.Piece.compare_exchange_weak(Piece.Next, &Piece))
+      return;
+  }
+}
+
+// Find runs of occupied buckets and sort them.
+void ConcurrentStringTableBuilder::sortBuckets() {
+  for (size_t I = 0; I < NumBuckets;) {
+    if (Buckets[I].Size.load() == 0) {
+      ++I;
+      continue;
+    }
+    size_t Begin = I;
+    size_t End = I + 1;
+
+    while (End < NumBuckets && Buckets[End].Size.load())
+      ++End;
+    std::sort(Buckets + Begin, Buckets + End);
+    I = End;
+  }
+}
+
+// After the end of the buckets, it wraps around to the beginning,
+// so we sort them as one unit. sortBuckets() didn't handle this
+// corner case.
+void ConcurrentStringTableBuilder::sortWrapAround() {
+  if (Buckets[0].Size.load() == 0 || Buckets[NumBuckets - 1].Size.load() == 0)
+    return;
+
+  // Copies a wrapped-around streak to a vector, sorts the vector,
+  // and then writes them back.
+  size_t First = 1;
+  while (First < NumBuckets && Buckets[First].Size.load())
+    ++First;
+  if (First == NumBuckets)
+    return;
+
+  size_t Last = NumBuckets - 1;
+  while (Buckets[Last - 1].Size.load())
+    --Last;
+
+  std::vector<EntryTy> V;
+  V.insert(V.end(), Buckets, Buckets + First);
+  V.insert(V.end(), Buckets + Last, Buckets + NumBuckets);
+
+  std::sort(V.begin(), V.end());
+
+  size_t LastSize = NumBuckets - Last;
+  std::copy(V.begin(), V.begin() + LastSize, Buckets + Last);
+  std::copy(V.begin() + LastSize, V.end(), Buckets);
+}
Index: ELF/InputSection.h
===================================================================
--- ELF/InputSection.h
+++ ELF/InputSection.h
@@ -160,11 +160,14 @@
 // be found by looking at the next one) and put the hash in a side table.
 struct SectionPiece {
   SectionPiece(size_t Off, bool Live = false)
-      : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {}
+      : InputOff(Off), Live(Live || !Config->GcSections), OutputOff(-1) {}
 
-  size_t InputOff;
-  ssize_t OutputOff : 8 * sizeof(ssize_t) - 1;
+  size_t InputOff : 8 * sizeof(ssize_t) - 1;
   size_t Live : 1;
+  union {
+    ssize_t OutputOff;
+    SectionPiece *Next; // used by ConcurrentStringTableBuilder
+  };
 };
 static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t),
               "SectionPiece is too big");
Index: ELF/OutputSections.h
===================================================================
--- ELF/OutputSections.h
+++ ELF/OutputSections.h
@@ -21,6 +21,7 @@
 namespace elf {
 class SymbolBody;
+class ConcurrentStringTableBuilder;
 struct EhSectionPiece;
 template <class ELFT> class EhInputSection;
 template <class ELFT> class InputSection;
@@ -144,7 +145,10 @@
   void finalizeNoTailMerge();
 
   llvm::StringTableBuilder Builder;
+  std::unique_ptr<ConcurrentStringTableBuilder> ConcurrentBuilder;
+  std::vector<MergeInputSection<ELFT> *> Sections;
+  size_t StringAlignment;
 };
 
 struct CieRecord {
Index: ELF/OutputSections.cpp
===================================================================
--- ELF/OutputSections.cpp
+++ ELF/OutputSections.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "OutputSections.h"
+#include "Concurrent.h"
 #include "Config.h"
 #include "EhFrame.h"
 #include "LinkerScript.h"
@@ -21,6 +22,7 @@
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/SHA1.h"
+#include <atomic>
 
 using namespace llvm;
 using namespace llvm::dwarf;
@@ -470,10 +472,13 @@
 MergeOutputSection<ELFT>::MergeOutputSection(StringRef Name, uint32_t Type,
                                             uintX_t Flags, uintX_t Alignment)
     : OutputSectionBase<ELFT>(Name, Type, Flags),
-      Builder(StringTableBuilder::RAW, Alignment) {}
+      Builder(StringTableBuilder::RAW, Alignment), StringAlignment(Alignment) {}
 
 template <class ELFT> void MergeOutputSection<ELFT>::writeTo(uint8_t *Buf) {
-  Builder.write(Buf);
+  if (ConcurrentBuilder)
+    ConcurrentBuilder->writeTo(Buf);
+  else
+    Builder.write(Buf);
 }
 
 template <class ELFT>
@@ -525,10 +530,41 @@
 }
 
 template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
-  if (shouldTailMerge())
+  if (shouldTailMerge()) {
     finalizeTailMerge();
-  else
-    finalizeNoTailMerge();
+    return;
+  }
+
+  // Uniquify strings using the concurrent hash table.
+  size_t NumPieces = 0;
+  for (MergeInputSection<ELFT> *Sec : Sections)
+    NumPieces += Sec->Pieces.size();
+
+  // The concurrent hash table does not support resizing,
+  // so if it becomes full, redo with a larger table.
+  // Our initial estimation is that the table contains 9 duplicate
+  // entries for an item on average.
+  size_t Estimated = std::max<size_t>(NumPieces / 10, 1024);
+  for (;;) {
+    ConcurrentBuilder.reset(
+        new ConcurrentStringTableBuilder(Estimated, StringAlignment));
+
+    parallel_for_each(
+        Sections.begin(), Sections.end(), [&](MergeInputSection<ELFT> *Sec) {
+          if (ConcurrentBuilder->isFull())
+            return;
+          for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+            if (Sec->Pieces[I].Live)
+              ConcurrentBuilder->insert(Sec->Pieces[I], Sec->getData(I));
+        });
+
+    ConcurrentBuilder->finalize();
+    if (!ConcurrentBuilder->isFull())
+      break;
+    Estimated *= 2;
+  }
+
+  this->Size = ConcurrentBuilder->size();
 }
 
 template <class ELFT>