diff --git a/lld/ELF/OutputSections.h b/lld/ELF/OutputSections.h
--- a/lld/ELF/OutputSections.h
+++ b/lld/ELF/OutputSections.h
@@ -12,6 +12,7 @@
 #include "InputSection.h"
 #include "LinkerScript.h"
 #include "lld/Common/LLVM.h"
+#include "llvm/Support/Parallel.h"
 #include <array>
@@ -105,7 +106,8 @@
   bool relro = false;
 
   void finalize();
-  template <class ELFT> void writeTo(uint8_t *buf);
+  template <class ELFT>
+  void writeTo(uint8_t *buf, llvm::parallel::TaskGroup &tg);
   // Check that the addends for dynamic relocations were written correctly.
   void checkDynRelAddends(const uint8_t *bufStart);
   template <class ELFT> void maybeCompress();
@@ -115,6 +117,8 @@
   void sortCtorsDtors();
 
 private:
+  SmallVector<InputSection *, 0> storage;
+
   // Used for implementation of --compress-debug-sections option.
   CompressedData compressed;
diff --git a/lld/ELF/OutputSections.cpp b/lld/ELF/OutputSections.cpp
--- a/lld/ELF/OutputSections.cpp
+++ b/lld/ELF/OutputSections.cpp
@@ -332,7 +332,10 @@
   // Write uncompressed data to a temporary zero-initialized buffer.
   auto buf = std::make_unique<uint8_t[]>(size);
-  writeTo<ELFT>(buf.get());
+  {
+    parallel::TaskGroup tg;
+    writeTo<ELFT>(buf.get(), tg);
+  }
 
   // We chose 1 (Z_BEST_SPEED) as the default compression level because it is
   // the fastest. If -O2 is given, we use level 6 to compress debug info more by
   // ~15%. We found that level 7 to 9 doesn't make much difference (~1% more
@@ -386,7 +389,8 @@
     llvm_unreachable("unsupported Size argument");
 }
 
-template <class ELFT> void OutputSection::writeTo(uint8_t *buf) {
+template <class ELFT>
+void OutputSection::writeTo(uint8_t *buf, parallel::TaskGroup &tg) {
   llvm::TimeTraceScope timeScope("Write sections", name);
   if (type == SHT_NOBITS)
     return;
@@ -419,14 +423,13 @@
   }
 
   // Write leading padding.
-  SmallVector<InputSection *, 0> storage;
   ArrayRef<InputSection *> sections = getInputSections(*this, storage);
   std::array<uint8_t, 4> filler = getFiller();
   bool nonZeroFiller = read32(filler.data()) != 0;
   if (nonZeroFiller)
     fill(buf, sections.empty() ? size : sections[0]->outSecOff, filler);
 
-  parallelFor(0, sections.size(), [&](size_t i) {
+  std::function<void(size_t)> fn = [=](size_t i) {
     InputSection *isec = sections[i];
     if (auto *s = dyn_cast<SyntheticSection>(isec))
       s->writeTo(buf + isec->outSecOff);
@@ -447,13 +450,25 @@
       } else
         fill(start, end - start, filler);
     }
-  });
+  };
 
   // Linker scripts may have BYTE()-family commands with which you
   // can write arbitrary bytes to the output. Process them if any.
+  bool written = false;
   for (SectionCommand *cmd : commands)
-    if (auto *data = dyn_cast<ByteCommand>(cmd))
+    if (auto *data = dyn_cast<ByteCommand>(cmd)) {
+      if (!std::exchange(written, true))
+        parallelFor(0, sections.size(), fn);
       writeInt(buf + data->offset, data->expression().getValue(), data->size);
+    }
+  if (written)
+    return;
+
+  // There is no data command. Write content asynchronously to overlap the write
+  // time with other output sections. Note, if a linker script specifies
+  // overlapping output sections (usually broken), the output may be
+  // non-deterministic.
+  asyncParallelFor(tg, 128, 0, sections.size(), fn);
 }
 
 static void finalizeShtGroup(OutputSection *os, InputSection *section) {
@@ -673,10 +688,14 @@
 template void OutputSection::writeHeaderTo<ELF64LE>(ELF64LE::Shdr *Shdr);
 template void OutputSection::writeHeaderTo<ELF64BE>(ELF64BE::Shdr *Shdr);
 
-template void OutputSection::writeTo<ELF32LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF32BE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64LE>(uint8_t *Buf);
-template void OutputSection::writeTo<ELF64BE>(uint8_t *Buf);
+template void OutputSection::writeTo<ELF32LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF32BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64LE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
+template void OutputSection::writeTo<ELF64BE>(uint8_t *,
+                                              llvm::parallel::TaskGroup &);
 
 template void OutputSection::maybeCompress<ELF32LE>();
 template void OutputSection::maybeCompress<ELF32BE>();
diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp
--- a/lld/ELF/Writer.cpp
+++ b/lld/ELF/Writer.cpp
@@ -2834,9 +2834,10 @@
 }
 
 template <class ELFT> void Writer<ELFT>::writeSectionsBinary() {
+  parallel::TaskGroup tg;
   for (OutputSection *sec : outputSections)
     if (sec->flags & SHF_ALLOC)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+      sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
 }
 
 static void fillTrap(uint8_t *i, uint8_t *end) {
@@ -2879,16 +2880,21 @@
 template <class ELFT> void Writer<ELFT>::writeSections() {
   llvm::TimeTraceScope timeScope("Write sections");
 
-  // In -r or --emit-relocs mode, write the relocation sections first as in
-  // ELf_Rel targets we might find out that we need to modify the relocated
-  // section while doing it.
-  for (OutputSection *sec : outputSections)
-    if (sec->type == SHT_REL || sec->type == SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
-
-  for (OutputSection *sec : outputSections)
-    if (sec->type != SHT_REL && sec->type != SHT_RELA)
-      sec->writeTo<ELFT>(Out::bufferStart + sec->offset);
+  {
+    // In -r or --emit-relocs mode, write the relocation sections first as in
+    // ELf_Rel targets we might find out that we need to modify the relocated
+    // section while doing it.
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type == SHT_REL || sec->type == SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
+  {
+    parallel::TaskGroup tg;
+    for (OutputSection *sec : outputSections)
+      if (sec->type != SHT_REL && sec->type != SHT_RELA)
+        sec->writeTo<ELFT>(Out::bufferStart + sec->offset, tg);
+  }
 
   // Finally, check that all dynamic relocation addends were written correctly.
   if (config->checkDynamicRelocs && config->writeAddends) {
diff --git a/lld/test/ELF/arm-thumb-interwork-notfunc.s b/lld/test/ELF/arm-thumb-interwork-notfunc.s
--- a/lld/test/ELF/arm-thumb-interwork-notfunc.s
+++ b/lld/test/ELF/arm-thumb-interwork-notfunc.s
@@ -1,6 +1,7 @@
 // REQUIRES: arm
 // RUN: llvm-mc -g --triple=armv7a-linux-gnueabihf -arm-add-build-attributes -filetype=obj -o %t.o %s
-// RUN: ld.lld %t.o -o %t 2>&1 | FileCheck %s --check-prefix=WARN
+/// Use --threads=1 to make the order of the warnings emitted across sections deterministic.
+// RUN: ld.lld %t.o -o %t --threads=1 2>&1 | FileCheck %s --check-prefix=WARN
 // RUN: llvm-objdump --no-show-raw-insn -d %t | FileCheck %s
 
 .syntax unified
diff --git a/lld/test/ELF/hexagon-jump-error.s b/lld/test/ELF/hexagon-jump-error.s
--- a/lld/test/ELF/hexagon-jump-error.s
+++ b/lld/test/ELF/hexagon-jump-error.s
@@ -1,6 +1,7 @@
 # REQUIRES: hexagon
 # RUN: llvm-mc -filetype=obj -triple=hexagon-unknown-elf %s -o %t.o
-# RUN: not ld.lld %t.o -o /dev/null 2>&1 | FileCheck --implicit-check-not "out of range" %s
+## Use --threads=1 to make the order of the errors emitted across sections deterministic.
+# RUN: not ld.lld %t.o -o /dev/null --threads=1 2>&1 | FileCheck --implicit-check-not "out of range" %s
 
 .globl _start
 .type _start, @function
diff --git a/lld/test/ELF/linkerscript/overlapping-sections.s b/lld/test/ELF/linkerscript/overlapping-sections.s
--- a/lld/test/ELF/linkerscript/overlapping-sections.s
+++ b/lld/test/ELF/linkerscript/overlapping-sections.s
@@ -88,8 +88,8 @@
 # BROKEN-OUTPUT-FILE-NEXT: 8010 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8020 01010101 01010101 01010101 01010101
 # BROKEN-OUTPUT-FILE-NEXT: 8030 01010101 01010101 01010101 01010101
-# Starting here the contents of .sec2 overwrites .sec1:
-# BROKEN-OUTPUT-FILE-NEXT: 8040 02020202 02020202 02020202 02020202
+## Starting here the content may be from either .sec1 or .sec2, depending on the write order.
+# BROKEN-OUTPUT-FILE-NEXT: 8040
 
 # RUN: llvm-readelf --sections -l %t.so | FileCheck %s -check-prefix BAD-BOTH
 # BAD-BOTH-LABEL: Section Headers:
diff --git a/llvm/include/llvm/Support/Parallel.h b/llvm/include/llvm/Support/Parallel.h
--- a/llvm/include/llvm/Support/Parallel.h
+++ b/llvm/include/llvm/Support/Parallel.h
@@ -30,9 +30,6 @@
 extern ThreadPoolStrategy strategy;
 
 namespace detail {
-
-#if LLVM_ENABLE_THREADS
-
 class Latch {
   uint32_t Count;
   mutable std::mutex Mutex;
@@ -61,9 +58,10 @@
     Cond.wait(lock, [&] { return Count == 0; });
   }
 };
+} // namespace detail
 
 class TaskGroup {
-  Latch L;
+  detail::Latch L;
   bool Parallel;
 
 public:
@@ -75,6 +73,9 @@
   void sync() const { L.sync(); }
 };
 
+namespace detail {
+
+#if LLVM_ENABLE_THREADS
 const ptrdiff_t MinParallelSize = 1024;
 
 /// Inclusive median.
@@ -253,6 +254,11 @@
       [&Fn](auto &&V) { return wrap(Fn(V)); }));
 }
 
+// Spawn iteration tasks to TG, but do not wait for them to finish. When
+// ThreadsRequested == 1, the loop is executed eagerly.
+void asyncParallelFor(parallel::TaskGroup &TG, size_t TaskSize, size_t Begin,
+                      size_t End, std::function<void(size_t)> Fn);
+
 } // namespace llvm
 
 #endif // LLVM_SUPPORT_PARALLEL_H
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -143,6 +143,7 @@
   return Exec.get();
 }
 } // namespace
+} // namespace detail
 
 static std::atomic<int> TaskGroupInstances;
@@ -161,7 +162,7 @@
 void TaskGroup::spawn(std::function<void()> F) {
   if (Parallel) {
     L.inc();
-    Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
+    detail::Executor::getDefaultExecutor()->add([&, F = std::move(F)] {
       F();
       L.dec();
     });
@@ -169,8 +170,6 @@
     F();
   }
 }
-
-} // namespace detail
 } // namespace parallel
 } // namespace llvm
 #endif // LLVM_ENABLE_THREADS
@@ -190,7 +189,7 @@
   if (TaskSize == 0)
     TaskSize = 1;
 
-  parallel::detail::TaskGroup TG;
+  parallel::TaskGroup TG;
   for (; Begin + TaskSize < End; Begin += TaskSize) {
     TG.spawn([=, &Fn] {
       for (size_t I = Begin, E = Begin + TaskSize; I != E; ++I)
@@ -206,3 +205,23 @@
   for (; Begin != End; ++Begin)
     Fn(Begin);
 }
+
+void llvm::asyncParallelFor(parallel::TaskGroup &TG, size_t TaskSize,
+                            size_t Begin, size_t End,
+                            std::function<void(size_t)> Fn) {
+  // With one thread, run the loop eagerly.
+  if (parallel::strategy.ThreadsRequested == 1) {
+    for (size_t I = Begin; I != End; ++I)
+      Fn(I);
+    return;
+  }
+
+  while (Begin < End) {
+    size_t Next = std::min(Begin + TaskSize, End);
+    TG.spawn([=] {
+      for (size_t I = Begin; I != Next; ++I)
+        Fn(I);
+    });
+    Begin = Next;
+  }
+}
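
Not part of the patch: a minimal standalone sketch of how the asyncParallelFor interface above is meant to be used, assuming a tree with this change applied and a program linked against LLVMSupport. Chunks of [Begin, End) are spawned into a caller-owned TaskGroup, so several loops can be in flight at once, and only TaskGroup::sync() or its destructor waits for all of them (the callers in this patch rely on the destructor at the end of the enclosing scope). The buffer names, sizes, and fill values below are made up for illustration.

#include "llvm/Support/Parallel.h"
#include <cstddef>
#include <cstdint>
#include <vector>

int main() {
  // Stand-ins for two independent output buffers ("sections").
  std::vector<uint8_t> bufA(1 << 20), bufB(1 << 20);
  {
    llvm::parallel::TaskGroup tg;
    // Each call only spawns tasks into tg; neither loop blocks the caller, so
    // the two buffers are filled concurrently.
    llvm::asyncParallelFor(tg, /*TaskSize=*/4096, 0, bufA.size(),
                           [&](size_t i) { bufA[i] = 0xAA; });
    llvm::asyncParallelFor(tg, /*TaskSize=*/4096, 0, bufB.size(),
                           [&](size_t i) { bufB[i] = 0xBB; });
  } // The TaskGroup destructor syncs: all spawned tasks have finished here.
  return (bufA.front() == 0xAA && bufB.back() == 0xBB) ? 0 : 1;
}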