diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h --- a/lld/ELF/OutputSections.h +++ b/lld/ELF/OutputSections.h @@ -12,6 +12,7 @@ #include "InputSection.h" #include "LinkerScript.h" #include "lld/Common/LLVM.h" +#include "llvm/Support/Parallel.h" #include @@ -104,7 +105,8 @@ bool relro = false; void finalize(); - template void writeTo(uint8_t *buf); + template + void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg); // Check that the addends for dynamic relocations were written correctly. void checkDynRelAddends(const uint8_t *bufStart); template void maybeCompress(); @@ -114,6 +116,8 @@ void sortCtorsDtors(); private: + SmallVector storage; + // Used for implementation of --compress-debug-sections option. CompressedData compressed; diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp --- a/lld/ELF/OutputSections.cpp +++ b/lld/ELF/OutputSections.cpp @@ -332,7 +332,10 @@ // Write uncompressed data to a temporary zero-initialized buffer. auto buf = std::make_unique(size); - writeTo(buf.get()); + { + parallel::TaskGroup tg; + writeTo(buf.get(), tg); + } // We chose 1 (Z_BEST_SPEED) as the default compression level because it is // the fastest. If -O2 is given, we use level 6 to compress debug info more by // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more @@ -386,7 +389,8 @@ llvm_unreachable("unsupported Size argument"); } -template void OutputSection::writeTo(uint8_t *buf) { +template +void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) { llvm::TimeTraceScope timeScope("Write sections", name); if (type == SHT_NOBITS) return; @@ -419,41 +423,68 @@ } // Write leading padding. - SmallVector storage; ArrayRef sections = getInputSections(*this, storage); std::array filler = getFiller(); bool nonZeroFiller = read32(filler.data()) != 0; if (nonZeroFiller) fill(buf, sections.empty() ? 
size : sections[0]->outSecOff, filler); - parallelFor(0, sections.size(), [&](size_t i) { - InputSection *isec = sections[i]; - if (auto *s = dyn_cast(isec)) - s->writeTo(buf + isec->outSecOff); - else - isec->writeTo(buf + isec->outSecOff); - - // Fill gaps between sections. - if (nonZeroFiller) { - uint8_t *start = buf + isec->outSecOff + isec->getSize(); - uint8_t *end; - if (i + 1 == sections.size()) - end = buf + size; + auto fn = [=](size_t begin, size_t end) { + size_t numSections = sections.size(); + for (size_t i = begin; i != end; ++i) { + InputSection *isec = sections[i]; + if (auto *s = dyn_cast(isec)) + s->writeTo(buf + isec->outSecOff); else - end = buf + sections[i + 1]->outSecOff; - if (isec->nopFiller) { - assert(target->nopInstrs); - nopInstrFill(start, end - start); - } else - fill(start, end - start, filler); + isec->writeTo(buf + isec->outSecOff); + + // Fill gaps between sections. + if (nonZeroFiller) { + uint8_t *start = buf + isec->outSecOff + isec->getSize(); + uint8_t *end; + if (i + 1 == numSections) + end = buf + size; + else + end = buf + sections[i + 1]->outSecOff; + if (isec->nopFiller) { + assert(target->nopInstrs); + nopInstrFill(start, end - start); + } else + fill(start, end - start, filler); + } } - }); + }; - // Linker scripts may have BYTE()-family commands with which you - // can write arbitrary bytes to the output. Process them if any. + // If there is any BYTE()-family command (rare), write the section content + // first then process BYTE to overwrite the filler content. The write is + // serial due to the limitation of llvm/Support/Parallel.h. + bool written = false; + size_t numSections = sections.size(); for (SectionCommand *cmd : commands) - if (auto *data = dyn_cast(cmd)) + if (auto *data = dyn_cast(cmd)) { + if (!std::exchange(written, true)) + fn(0, numSections); writeInt(buf + data->offset, data->expression().getValue(), data->size); + } + if (written || !numSections) + return; + + // There is no data command. 
Write content asynchronously to overlap the write + // time with other output sections. Note, if a linker script specifies + // overlapping output sections (needs --noinhibit-exec or --no-check-sections + // to suppress the error), the output may be non-deterministic. + const size_t taskSizeLimit = 4 << 20; + for (size_t begin = 0, i = 0, taskSize = 0;;) { + taskSize += sections[i]->getSize(); + bool done = ++i == numSections; + if (done || taskSize >= taskSizeLimit) { + tg.execute([=] { fn(begin, i); }); + if (done) + break; + begin = i; + taskSize = 0; + } + } } static void finalizeShtGroup(OutputSection *os, InputSection *section) { @@ -673,10 +704,14 @@ template void OutputSection::writeHeaderTo(ELF64LE::Shdr *Shdr); template void OutputSection::writeHeaderTo(ELF64BE::Shdr *Shdr); -template void OutputSection::writeTo(uint8_t *Buf); -template void OutputSection::writeTo(uint8_t *Buf); -template void OutputSection::writeTo(uint8_t *Buf); -template void OutputSection::writeTo(uint8_t *Buf); +template void OutputSection::writeTo(uint8_t *, + llvm::parallel::TaskGroup &); +template void OutputSection::writeTo(uint8_t *, + llvm::parallel::TaskGroup &); +template void OutputSection::writeTo(uint8_t *, + llvm::parallel::TaskGroup &); +template void OutputSection::writeTo(uint8_t *, + llvm::parallel::TaskGroup &); template void OutputSection::maybeCompress(); template void OutputSection::maybeCompress(); diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2839,9 +2839,10 @@ } template void Writer::writeSectionsBinary() { + parallel::TaskGroup tg; for (OutputSection *sec : outputSections) if (sec->flags & SHF_ALLOC) - sec->writeTo(Out::bufferStart + sec->offset); + sec->writeTo(Out::bufferStart + sec->offset, tg); } static void fillTrap(uint8_t *i, uint8_t *end) { @@ -2884,16 +2885,21 @@ template void Writer::writeSections() { llvm::TimeTraceScope timeScope("Write sections"); - // In -r or --emit-relocs mode, write 
the relocation sections first as in - // ELf_Rel targets we might find out that we need to modify the relocated - // section while doing it. - for (OutputSection *sec : outputSections) - if (sec->type == SHT_REL || sec->type == SHT_RELA) - sec->writeTo(Out::bufferStart + sec->offset); - - for (OutputSection *sec : outputSections) - if (sec->type != SHT_REL && sec->type != SHT_RELA) - sec->writeTo(Out::bufferStart + sec->offset); + { + // In -r or --emit-relocs mode, write the relocation sections first as in + // Elf_Rel targets we might find out that we need to modify the relocated + // section while doing it. + parallel::TaskGroup tg; + for (OutputSection *sec : outputSections) + if (sec->type == SHT_REL || sec->type == SHT_RELA) + sec->writeTo(Out::bufferStart + sec->offset, tg); + } + { + parallel::TaskGroup tg; + for (OutputSection *sec : outputSections) + if (sec->type != SHT_REL && sec->type != SHT_RELA) + sec->writeTo(Out::bufferStart + sec->offset, tg); + } // Finally, check that all dynamic relocation addends were written correctly. if (config->checkDynamicRelocs && config->writeAddends) { diff --git a/lld/test/ELF/arm-thumb-interwork-notfunc.s b/lld/test/ELF/arm-thumb-interwork-notfunc.s --- a/lld/test/ELF/arm-thumb-interwork-notfunc.s +++ b/lld/test/ELF/arm-thumb-interwork-notfunc.s @@ -1,6 +1,7 @@ // REQUIRES: arm // RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s -// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN +/// Use --threads=1 to keep emitted warnings across sections sequential. 
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN // RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s .syntax unified diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s --- a/lld/test/ELF/hexagon-jump-error.s +++ b/lld/test/ELF/hexagon-jump-error.s @@ -1,6 +1,7 @@ # REQUIRES: hexagon # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o -# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s +## Use --threads=1 to keep emitted warnings across sections sequential. +# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s .globl _start .type _start, @function diff --git a/lld/test/ELF/linkerscript/overlapping-sections.s b/lld/test/ELF/linkerscript/overlapping-sections.s --- a/lld/test/ELF/linkerscript/overlapping-sections.s +++ b/lld/test/ELF/linkerscript/overlapping-sections.s @@ -88,8 +88,8 @@ # BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101 # BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101 # BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101 -# Starting here the contents of .sec2 overwrites .sec1: -# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202 +## Starting here the content may be from either .sec1 or .sec2, depending on the write order. 
+# BROKEN-OUTPUT-FILE-NEXT: 8040 # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH # BAD-BOTH-LABEL: Section Headers: diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h --- a/llvm/include/llvm/Support/Parallel.h +++ b/llvm/include/llvm/Support/Parallel.h @@ -30,9 +30,6 @@ extern ThreadPoolStrategy strategy; namespace detail { - -#if LLVM_ENABLE_THREADS - class Latch { uint32_t Count; mutable std::mutex Mutex; @@ -61,20 +58,42 @@ Cond.wait(lock, [&] { return Count == 0; }); } }; +} // namespace detail class TaskGroup { - Latch L; + detail::Latch L; bool Parallel; public: TaskGroup(); ~TaskGroup(); + // Spawns a task, but does not wait for it to finish. void spawn(std::function f); + // Similar to spawn, but execute the task immediately when ThreadsRequested == + // 1. The difference is to give the following pattern a more intuitive order + // when single threading is requested. + // + // for (size_t begin = 0, i = 0, taskSize = 0;;) { + // taskSize += ... + // bool done = ++i == end; + // if (done || taskSize >= taskSizeLimit) { + // tg.execute([=] { fn(begin, i); }); + // if (done) + // break; + // begin = i; + // taskSize = 0; + // } + // } + void execute(std::function f); + void sync() const { L.sync(); } }; +namespace detail { + +#if LLVM_ENABLE_THREADS const ptrdiff_t MinParallelSize = 1024; /// Inclusive median. 
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp --- a/llvm/lib/Support/Parallel.cpp +++ b/llvm/lib/Support/Parallel.cpp @@ -19,10 +19,9 @@ llvm::ThreadPoolStrategy llvm::parallel::strategy; -#if LLVM_ENABLE_THREADS - namespace llvm { namespace parallel { +#if LLVM_ENABLE_THREADS namespace detail { namespace { @@ -143,6 +142,8 @@ return Exec.get(); } } // namespace +} // namespace detail +#endif static std::atomic TaskGroupInstances; @@ -159,21 +160,27 @@ } void TaskGroup::spawn(std::function F) { +#if LLVM_ENABLE_THREADS if (Parallel) { L.inc(); - Executor::getDefaultExecutor()->add([&, F = std::move(F)] { + detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] { F(); L.dec(); }); - } else { - F(); + return; } +#endif + F(); } -} // namespace detail +void TaskGroup::execute(std::function F) { + if (parallel::strategy.ThreadsRequested == 1) + F(); + else + spawn(std::move(F)); // F is our by-value copy: move, don't re-copy. +} } // namespace parallel } // namespace llvm -#endif // LLVM_ENABLE_THREADS void llvm::parallelFor(size_t Begin, size_t End, llvm::function_ref Fn) { @@ -190,7 +197,7 @@ if (TaskSize == 0) TaskSize = 1; - parallel::detail::TaskGroup TG; + parallel::TaskGroup TG; for (; Begin + TaskSize < End; Begin += TaskSize) { TG.spawn([=, &Fn] { for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)