Index: ELF/CMakeLists.txt
===================================================================
--- ELF/CMakeLists.txt
+++ ELF/CMakeLists.txt
@@ -3,6 +3,7 @@
 add_public_tablegen_target(ELFOptionsTableGen)
 
 add_lld_library(lldELF
+  Concurrent.cpp
   Driver.cpp
   DriverUtils.cpp
   EhFrame.cpp
Index: ELF/Concurrent.h
===================================================================
--- /dev/null
+++ ELF/Concurrent.h
@@ -0,0 +1,142 @@
+//===- Concurrent.h -------------------------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_CONCURRENT_H
+#define LLD_ELF_CONCURRENT_H
+
+#include "lld/Core/LLVM.h"
+
+// The header must be self-contained: it uses std::atomic, std::vector
+// and memcmp/memcpy below.
+#include <atomic>
+#include <cstring>
+#include <vector>
+
+namespace llvm {
+class CachedHashStringRef;
+}
+
+namespace lld {
+namespace elf {
+struct SectionPiece;
+
+// This is a concurrent lock-free string table builder. Internally
+// it uses atomic variables to keep inserted strings and associated
+// SectionPieces.
+//
+// The reason why we have a special-purpose concurrent hash table in
+// LLD is because we need to process a very large number of mergeable
+// strings. For example, in the final build of clang with debug info,
+// thousands of input sections containing 30 million mergeable strings
+// in total are fed to the linker. It takes a few seconds to merge
+// them in a single thread. This concurrent hash table can uniquify them
+// in a few hundred milliseconds using 40 cores.
+//
+// The internal hash table is an open-addressing one. It doesn't
+// support resizing. Once it becomes full, you need to redo with
+// a larger fresh string table builder.
+//
+// Just like DenseMap, keys and values are directly stored to buckets
+// as pairs. Initially, keys and values are all null. Keys are
+// pointers to strings.
+//
+// SectionPieces are managed using a singly linked list.
If a bucket +// have a value, it has a pointer to a SectionPiece. Other +// SectionPieces having the same string key are chained using `Next` +// pointer of SectionPiece. +// +// Once you added all strings, you need to call finalize() to fix +// string table contents. When finalize() is called, hash table +// contents is nondeterministic because no one knows which thread +// have claimed earlier buckets in the hash table. +// +// Thus, the next step is to make it deterministic. For each streak of +// occupied buckets, we sort that. Recall that this hash table +// resolves conflicts with linear probing. No matter what order +// multiple threads claim buckets, claimed buckets are claimed, and +// unclaimed buckets remain unclaimed. Therefore, by sorting buckets +// for each streak, we can make the entire hash table deterministic. +// +// Finally, we assign offsets to all SectionPieces associated with +// this hash table. +class ConcurrentStringTableBuilder { +public: + ConcurrentStringTableBuilder(size_t EstimatedNumEntries, size_t Alignment); + + void insert(SectionPiece &Piece, llvm::CachedHashStringRef Str); + void finalize(); + bool isFull(); + void writeTo(uint8_t *Buf); + size_t size() const { return StringTableSize; } + +private: + // Bucket entry type. + struct EntryTy { + // std::sort needs these functions. + EntryTy(const EntryTy &Other) { *this = Other; } + + EntryTy &operator=(const EntryTy &Other) { + memcpy(this, &Other, sizeof(EntryTy)); + return *this; + } + + EntryTy() = default; + + // Define a total order for string keys. The detail is not important, + // but it needs to be defined that we can get deterministic output + // from the hash table. + bool operator<(const EntryTy &Other) const { + size_t SizeA = Size.load(); + size_t SizeB = Other.Size.load(); + if (SizeA != SizeB) + return SizeA < SizeB; + return memcmp(Str.load(), Other.Str.load(), SizeA) < 0; + }; + + // String key information. 
String key is guaranteed to be unique + // in a hash table. + std::atomic Str = {nullptr}; + std::atomic Size = {0}; + + // Offset in the string table. Filled by finalize(). + uint32_t Offset = 0; + + // SectionPieces having the same string contents are chained + // using pointers. This pointer points to the first node. + std::atomic Piece = {nullptr}; + }; + + // We could allocate tens of millions of objects of this type, + // so we want to keep it small. + static_assert(sizeof(EntryTy) <= 24, "EntryTy is too big"); + + size_t align2(size_t Val) { return (Val + Alignment - 1) & ~(Alignment - 1); } + void append(EntryTy &Bucket, SectionPiece &Piece); + void sortBuckets(); + void sortWrapAround(); + + // All strings are aligned to this boundary. + size_t Alignment; + + // Filled by finalize(). + size_t NumEntries = 0; + size_t StringTableSize = 0; + + // The number of allocated buckets and buckets. + size_t NumBuckets; + std::vector Buckets; + + // The counter to track number of inserted strings. + // Note that the number is approximate. + std::atomic Counter = {0}; + + // We do not support table resizing. If it becomes almost full, + // we should bail out and redo with a larger table. This bool + // value keeps track of that. + std::atomic IsTableFull = {false}; +}; +} +} + +#endif Index: ELF/Concurrent.cpp =================================================================== --- /dev/null +++ ELF/Concurrent.cpp @@ -0,0 +1,185 @@ +//===- Concurrent.cpp -----------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "Concurrent.h" +#include "InputSection.h" +#include "Threads.h" + +#include "llvm/ADT/CachedHashString.h" + +using namespace lld; +using namespace lld::elf; +using namespace llvm; + +// Our target load factor is 0.5. 
+ConcurrentStringTableBuilder::ConcurrentStringTableBuilder(
+    size_t EstimatedNumEntries, size_t Alignment)
+    : Alignment(Alignment), NumBuckets(PowerOf2Ceil(EstimatedNumEntries * 2)),
+      Buckets(NumBuckets) {}
+
+// Inserts a given section piece to the string table.
+void ConcurrentStringTableBuilder::insert(SectionPiece &Piece,
+                                          CachedHashStringRef Str) {
+  if (IsTableFull)
+    return;
+  assert(Str.size() != 0);
+
+  size_t Start = Str.hash() & (NumBuckets - 1);
+
+  // The end sentinel must be masked just like the probe index.
+  // A bare `Start - 1` underflows to SIZE_MAX when Start is 0, and the
+  // masked index could never compare equal to it, so the probe loop
+  // would never terminate.
+  size_t End = (Start + NumBuckets - 1) & (NumBuckets - 1);
+
+  for (size_t I = Start; I != End; I = (I + 1) & (NumBuckets - 1)) {
+    EntryTy &Bucket = Buckets[I];
+
+    // If the current bucket is empty, claim it.
+    const char *Null = nullptr;
+    if (Bucket.Str.compare_exchange_strong(Null, Str.val().data())) {
+      Bucket.Size.store(Str.size());
+      append(Bucket, Piece);
+
+      // Update the counter for inserted items. We do this only in
+      // 1/256 chance to avoid contention, so the counter is an
+      // estimate, but it is deterministic because it does not depend
+      // on any randomness nor have a race condition.
+      if (Str.hash() % 256 != 0)
+        return;
+      size_t Cnt = Counter.fetch_add(256);
+
+      // If 3/4 of the buckets are occupied, we mark this table as full.
+      // An open-addressing table gets exponentially slower when it
+      // approaches load factor 1.0, and 3/4 is a reasonable cutoff.
+      // If we have more items than that, we should bail out early
+      // and redo with a larger table.
+      if (Cnt * 4 > NumBuckets * 3)
+        IsTableFull = true;
+      return;
+    }
+
+    // The current bucket contains some string. Its size might not be
+    // written by the other thread yet, so keep loading until it becomes
+    // observable.
+    const char *OldStr = Bucket.Str.load();
+    uint64_t OldSize = 0;
+    while (OldSize == 0)
+      OldSize = Bucket.Size.load();
+
+    // If the current bucket contains the key that we are looking for,
+    // append Piece to the bucket. Otherwise, go to the next bucket.
+    if (Str.val() == StringRef(OldStr, OldSize)) {
+      append(Bucket, Piece);
+      return;
+    }
+  }
+  IsTableFull = true;
+}
+
+void ConcurrentStringTableBuilder::finalize() {
+  if (IsTableFull)
+    return;
+
+  // Sort buckets to make the hash table contents deterministic.
+  sortBuckets();
+  sortWrapAround();
+
+  // Set an offset in the string table for each bucket.
+  size_t Off = 0;
+  for (size_t I = 0; I < NumBuckets; ++I) {
+    if (size_t Size = Buckets[I].Size.load()) {
+      Off = align2(Off);
+      Buckets[I].Offset = Off;
+      Off += Size;
+      ++NumEntries;
+    }
+  }
+  StringTableSize = Off;
+
+  // Update SectionPieces' offsets. (size_t(0) so that both forLoop
+  // arguments deduce to the same type, matching the call in writeTo.)
+  forLoop(size_t(0), NumBuckets, [&](size_t I) {
+    SectionPiece *Cur = Buckets[I].Piece.load();
+    while (Cur) {
+      // OutputOff and Next are a union, so we need to save Next before
+      // writing to OutputOff.
+      SectionPiece *Next = Cur->Next;
+      Cur->OutputOff = Buckets[I].Offset;
+      Cur = Next;
+    }
+  });
+}
+
+bool ConcurrentStringTableBuilder::isFull() { return IsTableFull; }
+
+void ConcurrentStringTableBuilder::writeTo(uint8_t *Buf) {
+  assert(!isFull());
+
+  forLoop(size_t(0), NumBuckets, [&](size_t I) {
+    if (const char *Str = Buckets[I].Str.load()) {
+      size_t Size = Buckets[I].Size.load();
+      memcpy(Buf + Buckets[I].Offset, Str, Size);
+    }
+  });
+}
+
+// Atomically append Piece to Bucket.
+void ConcurrentStringTableBuilder::append(EntryTy &Bucket,
+                                          SectionPiece &Piece) {
+  for (;;) {
+    Piece.Next = Bucket.Piece.load();
+    if (Bucket.Piece.compare_exchange_weak(Piece.Next, &Piece))
+      return;
+  }
+}
+
+// Find runs of occupied buckets and sort them. This assumes we are
+// using linear probing to find an unused bucket.
+void ConcurrentStringTableBuilder::sortBuckets() { + for (size_t I = 0; I < NumBuckets;) { + if (Buckets[I].Size.load() == 0) { + ++I; + continue; + } + size_t Begin = I; + size_t End = I + 1; + + while (End < NumBuckets && Buckets[End].Size.load()) + ++End; + std::sort(Buckets.begin() + Begin, Buckets.begin() + End); + I = End; + } +} + +// After the end of the buckets, it wraps around to the beginning, +// so we need to sort them as one unit. sortBuckets() didn't handle +// this corner case. +void ConcurrentStringTableBuilder::sortWrapAround() { + if (Buckets[0].Size.load() == 0 || Buckets[NumBuckets - 1].Size.load() == 0) + return; + + // Copies a wrapped-around streak to a vector, sort the vector, + // and then write them back. + size_t First = 1; + while (Buckets[First].Size.load() && First < NumBuckets) + ++First; + if (First == NumBuckets) { + IsTableFull = true; + return; + } + + size_t Last = NumBuckets - 1; + while (Buckets[Last - 1].Size.load()) + --Last; + + std::vector V; + auto Begin = Buckets.begin(); + V.insert(V.end(), Begin, Begin + First); + V.insert(V.end(), Begin + Last, Begin + NumBuckets); + + std::sort(V.begin(), V.end()); + + size_t LastSize = NumBuckets - Last; + std::copy(V.begin(), V.begin() + LastSize, Begin + Last); + std::copy(V.begin() + LastSize, V.end(), Begin); +} Index: ELF/InputSection.h =================================================================== --- ELF/InputSection.h +++ ELF/InputSection.h @@ -160,11 +160,14 @@ // be found by looking at the next one) and put the hash in a side table. 
struct SectionPiece { SectionPiece(size_t Off, bool Live = false) - : InputOff(Off), OutputOff(-1), Live(Live || !Config->GcSections) {} + : InputOff(Off), Live(Live || !Config->GcSections), OutputOff(-1) {} - size_t InputOff; - ssize_t OutputOff : 8 * sizeof(ssize_t) - 1; + size_t InputOff : 8 * sizeof(ssize_t) - 1; size_t Live : 1; + union { + ssize_t OutputOff; + SectionPiece *Next; // used by ConcurrentStringTableBuilder + }; }; static_assert(sizeof(SectionPiece) == 2 * sizeof(size_t), "SectionPiece is too big"); Index: ELF/OutputSections.h =================================================================== --- ELF/OutputSections.h +++ ELF/OutputSections.h @@ -21,6 +21,7 @@ namespace elf { class SymbolBody; +class ConcurrentStringTableBuilder; struct EhSectionPiece; template class EhInputSection; template class InputSection; @@ -143,11 +144,15 @@ } private: + void finalizeDefault(); void finalizeTailMerge(); - void finalizeNoTailMerge(); + void finalizeConcurrent(size_t NumPieces); llvm::StringTableBuilder Builder; + std::unique_ptr ConcurrentBuilder; + std::vector *> Sections; + size_t StringAlignment; }; struct CieRecord { Index: ELF/OutputSections.cpp =================================================================== --- ELF/OutputSections.cpp +++ ELF/OutputSections.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "OutputSections.h" +#include "Concurrent.h" #include "Config.h" #include "EhFrame.h" #include "LinkerScript.h" @@ -468,10 +469,13 @@ MergeOutputSection::MergeOutputSection(StringRef Name, uint32_t Type, uintX_t Flags, uintX_t Alignment) : OutputSectionBase(Name, Type, Flags), - Builder(StringTableBuilder::RAW, Alignment) {} + Builder(StringTableBuilder::RAW, Alignment), StringAlignment(Alignment) {} template void MergeOutputSection::writeTo(uint8_t *Buf) { - Builder.write(Buf); + if (ConcurrentBuilder) + ConcurrentBuilder->writeTo(Buf); + else + Builder.write(Buf); } template @@ -508,7 
+512,7 @@
       Sec->Pieces[I].OutputOff = Builder.getOffset(Sec->getData(I));
 }
 
-template <class ELFT> void MergeOutputSection<ELFT>::finalizeNoTailMerge() {
+template <class ELFT> void MergeOutputSection<ELFT>::finalizeDefault() {
   // Add all string pieces to the string table builder to create section
   // contents. Because we are not tail-optimizing, offsets of strings are
   // fixed when they are added to the builder (string table builder contains
@@ -522,11 +526,78 @@
   this->Size = Builder.getSize();
 }
 
+template <class ELFT>
+void MergeOutputSection<ELFT>::finalizeConcurrent(size_t NumPieces) {
+  // The concurrent hash table does not support resizing,
+  // so if it becomes full, redo with a larger table.
+  // Our initial estimation is that the table contains 14 duplicate
+  // entries for an item on average.
+  //
+  // The explicit template argument is needed: NumPieces / 15 is size_t
+  // while a bare 1024 is int, and std::max requires both arguments to
+  // have the same type.
+  size_t Estimate = std::max<size_t>(NumPieces / 15, 1024);
+
+  for (;;) {
+    ConcurrentBuilder.reset(
+        new ConcurrentStringTableBuilder(Estimate, StringAlignment));
+
+    // Approximate number of inserted items.
+    std::atomic<size_t> NumInserted = {0};
+
+    // Insert strings to the table.
+    parallel_for_each(
+        Sections.begin(), Sections.end(), [&](MergeInputSection<ELFT> *Sec) {
+          if (ConcurrentBuilder->isFull())
+            return;
+          NumInserted.fetch_add(Sec->Pieces.size());
+          for (size_t I = 0, E = Sec->Pieces.size(); I != E; ++I)
+            if (Sec->Pieces[I].Live)
+              ConcurrentBuilder->insert(Sec->Pieces[I], Sec->getData(I));
+        });
+
+    ConcurrentBuilder->finalize();
+    if (!ConcurrentBuilder->isFull())
+      break;
+
+    // If the table became full, we need to redo. We know how many
+    // strings were inserted before the table got full, so we can
+    // make an estimate based on that number.
+    double Factor = (double)NumInserted.load() / NumPieces;
+    Estimate = Estimate * (1 / Factor);
+  }
+  this->Size = ConcurrentBuilder->size();
+}
+
 template <class ELFT> void MergeOutputSection<ELFT>::finalize() {
-  if (shouldTailMerge())
+  // If -O2 is specified, we do tail merging.
+  if (shouldTailMerge()) {
     finalizeTailMerge();
+    return;
+  }
+
+  // If -no-threads is specified, we can't use the concurrent map.
+  if (!Config->Threads) {
+    finalizeDefault();
+    return;
+  }
+
+  // If we have a very large number of mergeable strings, use the
+  // concurrent string table builder. Otherwise, use the single-
+  // threaded one. There's an overhead of using the concurrent one,
+  // so we don't want to use that unconditionally. The threshold
+  // is currently set to 100,000.
+
+  size_t NumPieces = 0;
+  for (MergeInputSection<ELFT> *Sec : Sections)
+    NumPieces += Sec->Pieces.size();
+
+  // This is for the unit test. getenv returns a null pointer when the
+  // variable is unset, and constructing a StringRef from a null
+  // pointer is not allowed, so check the pointer first.
+  if (const char *Env = getenv("LLD_USE_CONCURRENT_STRING_TABLE_BUILDER"))
+    if (StringRef(Env) == "1") {
+      finalizeConcurrent(NumPieces);
+      return;
+    }
+
+  if (NumPieces < 100000)
+    finalizeDefault();
   else
-    finalizeNoTailMerge();
+    finalizeConcurrent(NumPieces);
 }
 
 template <class ELFT>
Index: test/ELF/merge-strings-concurrent.s
===================================================================
--- /dev/null
+++ test/ELF/merge-strings-concurrent.s
@@ -0,0 +1,25 @@
+// REQUIRES: x86
+// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o
+// RUN: env LLD_USE_CONCURRENT_STRING_TABLE_BUILDER=1 ld.lld %t.o -o %t.so -shared
+// RUN: llvm-readobj -s -section-data -t %t.so | FileCheck %s
+
+.section .rodata.str1.1, "aMS", @progbits,1
+.asciz "abc"
+.asciz "def"
+.asciz "ghijklmn"
+.asciz "o"
+.asciz "pqrstuvwxyz"
+
+.section .rodata.str2.2, "aMS", @progbits,1
+.asciz "ABC"
+.asciz "def"
+.asciz "ghijklmn"
+.asciz "O"
+.asciz "pqrstuvwxyz"
+
+// CHECK: Name: .rodata (1)
+// CHECK: SectionData (
+// CHECK-NEXT: 0000: 41424300 64656600 61626300 6768696A |ABC.def.abc.ghij|
+// CHECK-NEXT: 0010: 6B6C6D6E 006F004F 00707172 73747576 |klmn.o.O.pqrstuv|
+// CHECK-NEXT: 0020: 7778797A 00 |wxyz.|
+// CHECK-NEXT: )