diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -30,6 +30,8 @@
   SyntheticSections.cpp
   Target.cpp
   UnwindInfoSection.cpp
+  WorkStealingQueue.h
+  WorkStealingQueue.inc
   Writer.cpp
 
   LINK_COMPONENTS
diff --git a/lld/MachO/MarkLive.cpp b/lld/MachO/MarkLive.cpp
--- a/lld/MachO/MarkLive.cpp
+++ b/lld/MachO/MarkLive.cpp
@@ -12,49 +12,147 @@
 #include "SymbolTable.h"
 #include "Symbols.h"
 #include "UnwindInfoSection.h"
+#include "WorkStealingQueue.h"
 #include "mach-o/compact_unwind_encoding.h"
 #include "llvm/Support/TimeProfiler.h"
 
+#include <thread>
+
 namespace lld {
 namespace macho {
 
 using namespace llvm;
 using namespace llvm::MachO;
 
-// Set live bit on for each reachable chunk. Unmarked (unreachable)
-// InputSections will be ignored by Writer, so they will be excluded
-// from the final output.
-void markLive() {
-  TimeTraceScope timeScope("markLive");
+static const int poolSize = 16; // Arbitrary; could be tuned.
+
+struct Task {
+  Task() : idx(0), pool(nullptr) {}
 
-  // We build up a worklist of sections which have been marked as live. We only
-  // push into the worklist when we discover an unmarked section, and we mark
-  // as we push, so sections never appear twice in the list.
-  // Literal sections cannot contain references to other sections, so we only
-  // store ConcatInputSections in our worklist.
-  SmallVector<ConcatInputSection *, 256> worklist;
+  Task(int idx, WorkStealingQueue<ConcatInputSection *> *queues)
+      : idx(idx), pool(queues) {
+    // Calculate the portion of inputSections this task owns: inputSections is
+    // divided into poolSize groups of d = inputSections.size() / poolSize
+    // elements each.
+    start = idx * d;
+    end = std::min(start + d - 1, (int)inputSections.size() - 1);
+    skip = start >= (int)inputSections.size();
+  }
 
-  auto enqueue = [&](InputSection *isec, uint64_t off) {
-    if (isec->isLive(off))
+  void addSect(InputSection *s, uint64_t off) {
+    if (s->isLive(off))
       return;
-    isec->markLive(off);
-    if (auto s = dyn_cast<ConcatInputSection>(isec)) {
-      assert(!s->isCoalescedWeak());
-      worklist.push_back(s);
+    s->markLive(off);
+
+    if (auto *cs = dyn_cast<ConcatInputSection>(s)) {
+      assert(!cs->isCoalescedWeak());
+      pool[idx].push(cs);
     }
-  };
+  }
 
-  auto addSym = [&](Symbol *s) {
+  void addSym(Symbol *s) {
     s->used = true;
     if (auto *d = dyn_cast<Defined>(s))
       if (d->isec)
-        enqueue(d->isec, d->value);
-  };
+        addSect(d->isec, d->value);
+  }
+
+  void help(int target_idx) {
+    do {
+      auto item =
+          target_idx == idx ? pool[idx].pop() : pool[target_idx].steal();
+      while (item) {
+        InputSection *s = *item;
+        // Mark all symbols listed in the relocation table for this section.
+        for (const Reloc &r : s->relocs) {
+          if (auto *sym = r.referent.dyn_cast<Symbol *>())
+            addSym(sym);
+          else
+            addSect(r.referent.get<InputSection *>(), r.addend);
+        }
+        item = target_idx == idx ? pool[idx].pop() : pool[target_idx].steal();
+      }
 
-  // Add GC roots.
-  if (config->entry)
-    addSym(config->entry);
-  for (Symbol *sym : symtab->getSymbols()) {
+      // TODO: we can avoid iterating inputSections multiple times.
+      //
+      // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live
+      // section.
+      // Process them in a second pass.
+      if (!skip) {
+        for (int i = start; i <= end; ++i) {
+          ConcatInputSection *isec = inputSections[i];
+          // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
+          // separate vector and only walking that here is faster.
+          if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
+            continue;
+
+          for (const Reloc &r : isec->relocs) {
+            bool referentLive;
+            if (auto *s = r.referent.dyn_cast<Symbol *>())
+              referentLive = s->isLive();
+            else
+              referentLive =
+                  r.referent.get<InputSection *>()->isLive(r.addend);
+            if (referentLive)
+              addSect(isec, 0);
+          }
+        }
+      }
+
+      if (pool[idx].isEmpty()) {
+        // A compare-and-reset is faster than computing idx % poolSize.
+        size_t target = idx + 1;
+        if (target >= poolSize)
+          target = 0;
+        help(target);
+      }
+    } while (!pool[idx].isEmpty());
+  }
+
+  void work() { help(idx); }
+
+  const int d = inputSections.size() / poolSize;
+  const int idx; // Index of this task's queue in the pool.
+  int start;     // First index into inputSections owned by this task.
+  int end;       // Last index into inputSections owned by this task.
+  bool skip;     // True if this task owns no slice of inputSections.
+  WorkStealingQueue<ConcatInputSection *> *pool;
+};
+
+// Func: void(Task &task, int j)
+//   task: the task (and its queue) to add to.
+//   j: the index of the item to be added.
+template <typename Func>
+static void splitTasks(WorkStealingQueue<ConcatInputSection *> *queues,
+                       std::vector<Task> &tasks, size_t size, Func func) {
+  assert(size >= poolSize);
+  const size_t d = size / poolSize;
+  std::vector<std::thread> threads;
+  threads.reserve(poolSize);
+
+  for (int idx = 0; idx < poolSize; ++idx) {
+    size_t start = idx * d;
+    size_t end = std::min(start + d - 1, size - 1);
+    // Capture idx/start/end by value: they change on the next loop iteration.
+    threads.push_back(std::thread([&, idx, start, end]() {
+      Task &task = tasks[idx];
+      for (size_t j = start; j <= end; ++j)
+        func(task, j);
+    }));
+  }
+
+  for (auto &thread : threads)
+    thread.join();
+}
+
+static void populateFromSymtab(WorkStealingQueue<ConcatInputSection *> *queues,
+                               std::vector<Task> &tasks) {
+  TimeTraceScope timeScope("populateFromSymtab");
+
+  const auto &entries = symtab->getSymbols();
+  auto do_add = [&](Task &task, int i) {
+    Symbol *sym = entries[i];
     if (auto *defined = dyn_cast<Defined>(sym)) {
       // -exported_symbol(s_list)
       if (!config->exportedSymbols.empty() &&
@@ -64,17 +162,18 @@
         // explicitUndefineds code below would handle this automatically.
         assert(!defined->privateExtern &&
                "should have been rejected by driver");
-        addSym(defined);
-        continue;
+        task.addSym(defined);
+        return;
       }
 
       // public symbols explicitly marked .no_dead_strip
       if (defined->referencedDynamically || defined->noDeadStrip) {
-        addSym(defined);
-        continue;
+        task.addSym(defined);
+        return;
       }
 
-      // FIXME: When we implement these flags, make symbols from them GC roots:
+      // FIXME: When we implement these flags, make symbols from them GC
+      // roots:
       // * -reexported_symbol(s_list)
       // * -alias(-list)
       // * -init
@@ -84,40 +183,87 @@
       bool externsAreRoots =
           config->outputType != MH_EXECUTE || config->exportDynamic;
       if (externsAreRoots && !defined->privateExtern) {
-        addSym(defined);
-        continue;
+        task.addSym(defined);
+        return;
       }
     }
+  };
+
+  const size_t size = entries.size();
+  if (size <= poolSize) {
+    for (size_t i = 0; i < size; ++i)
+      do_add(tasks[i], i);
+    return;
+  }
+
+  splitTasks(queues, tasks, size, do_add);
+}
+
+static void
+populateFromSpecialSymbols(WorkStealingQueue<ConcatInputSection *> *queues,
+                           std::vector<Task> &tasks) {
+  TimeTraceScope timeScope("populateFromSpecialSymbols");
+
+  // For simplicity, add the -u symbols to the task whose queue currently has
+  // the fewest items.
+  int min_idx = 0;
+  size_t min = queues[0].size();
+  for (int i = 1; i < poolSize; ++i) {
+    if (queues[i].size() < min) {
+      min = queues[i].size();
+      min_idx = i;
+    }
   }
 
-  // -u symbols
+  Task &task = tasks[min_idx];
   for (Symbol *sym : config->explicitUndefineds)
     if (auto *defined = dyn_cast<Defined>(sym))
-      addSym(defined);
+      task.addSym(defined);
+
+  if (auto *stubBinder =
+          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
+    task.addSym(stubBinder);
+}
+
+static void populateDeadStrips(WorkStealingQueue<ConcatInputSection *> *queues,
+                               std::vector<Task> &tasks) {
+  std::vector<Defined *> syms;
   // local symbols explicitly marked .no_dead_strip
   for (const InputFile *file : inputFiles)
     if (auto *objFile = dyn_cast<ObjFile>(file))
       for (Symbol *sym : objFile->symbols)
         if (auto *defined = dyn_cast_or_null<Defined>(sym))
           if (!defined->isExternal() && defined->noDeadStrip)
-            addSym(defined);
-  if (auto *stubBinder =
-          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
-    addSym(stubBinder);
-  for (ConcatInputSection *isec : inputSections) {
+            syms.push_back(defined);
+
+  if (syms.size() < poolSize) {
+    for (size_t i = 0; i < syms.size(); ++i)
+      tasks[i].addSym(syms[i]);
+    return;
+  }
+
+  splitTasks(queues, tasks, syms.size(),
+             [&](Task &task, int i) { task.addSym(syms[i]); });
+}
+
+static void
+populateFromInputSections(WorkStealingQueue<ConcatInputSection *> *queues,
+                          std::vector<Task> &tasks) {
+  auto do_add = [&](Task &task, int i) {
+    ConcatInputSection *isec = inputSections[i];
     // Sections marked no_dead_strip
     if (isec->getFlags() & S_ATTR_NO_DEAD_STRIP) {
-      enqueue(isec, 0);
-      continue;
+      task.addSect(isec, 0);
+      return;
     }
 
     // mod_init_funcs, mod_term_funcs sections
     if (sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS ||
         sectionType(isec->getFlags()) == S_MOD_TERM_FUNC_POINTERS) {
-      enqueue(isec, 0);
-      continue;
+      task.addSect(isec, 0);
+      return;
     }
-  }
+  };
+  splitTasks(queues, tasks, inputSections.size(), do_add);
+}
+
+static void
+populateFromUnwindInfo(WorkStealingQueue<ConcatInputSection *> *queues,
+                       std::vector<Task> &tasks) {
   // Dead strip runs before UnwindInfoSection handling so we need to keep
   // __LD,__compact_unwind alive here.
   // But that section contains absolute references to __TEXT,__text and
@@ -125,7 +271,9 @@
   // section: We must skip the relocations for the functionAddress
   // in each CompactUnwindEntry.
   // See also scanEhFrameSection() in lld/ELF/MarkLive.cpp.
-  for (ConcatInputSection *isec : in.unwindInfo->getInputs()) {
+  const std::vector<ConcatInputSection *> &inputs = in.unwindInfo->getInputs();
+  auto do_add = [&](Task &task, int i) {
+    ConcatInputSection *isec = inputs[i];
     isec->live = true;
     const int compactUnwindEntrySize =
         target->wordSize == 8 ? sizeof(CompactUnwindEntry<uint64_t>)
@@ -137,51 +285,49 @@
         continue;
       if (auto *s = r.referent.dyn_cast<Symbol *>())
-        addSym(s);
+        task.addSym(s);
       else
-        enqueue(r.referent.get<InputSection *>(), r.addend);
+        task.addSect(r.referent.get<InputSection *>(), r.addend);
     }
-  }
+  };
+
+  if (inputs.size() < poolSize) {
+    for (size_t i = 0; i < inputs.size(); ++i)
+      do_add(tasks[i], i);
+    return;
+  }
+  splitTasks(queues, tasks, inputs.size(), do_add);
+}
+
+// Set live bit on for each reachable chunk. Unmarked (unreachable)
+// InputSections will be ignored by Writer, so they will be excluded
+// from the final output.
+void markLive() {
+  TimeTraceScope timeScope("markLive");
 
-  do {
-    // Mark things reachable from GC roots as live.
-    while (!worklist.empty()) {
-      ConcatInputSection *s = worklist.pop_back_val();
-      assert(s->live && "We mark as live when pushing onto the worklist!");
-
-      // Mark all symbols listed in the relocation table for this section.
-      for (const Reloc &r : s->relocs) {
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          addSym(s);
-        else
-          enqueue(r.referent.get<InputSection *>(), r.addend);
-      }
-    }
+  WorkStealingQueue<ConcatInputSection *> queues[poolSize];
+  std::vector<Task> tasks;
+  std::vector<std::thread> threads;
+  tasks.reserve(poolSize);
+  threads.reserve(poolSize);
+  for (int i = 0; i < poolSize; ++i) {
+    tasks.push_back(Task(i, queues));
+  }
 
-    // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live section.
-    // Process them in a second pass.
-    for (ConcatInputSection *isec : inputSections) {
-      // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
-      // separate vector and only walking that here is faster.
-      if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
-        continue;
+  // Add GC roots. Give the entry symbol to the last task, since its queue
+  // likely has the least work.
+  if (config->entry)
+    tasks[poolSize - 1].addSym(config->entry);
 
-      for (const Reloc &r : isec->relocs) {
-        bool referentLive;
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          referentLive = s->isLive();
-        else
-          referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
-        if (referentLive)
-          enqueue(isec, 0);
-      }
-    }
+  populateFromSymtab(queues, tasks);
+  populateFromSpecialSymbols(queues, tasks);
+  populateDeadStrips(queues, tasks);
+  populateFromInputSections(queues, tasks);
+  populateFromUnwindInfo(queues, tasks);
 
-    // S_ATTR_LIVE_SUPPORT could have marked additional sections live,
-    // which in turn could mark additional S_ATTR_LIVE_SUPPORT sections live.
-    // Iterate. In practice, the second iteration won't mark additional
-    // S_ATTR_LIVE_SUPPORT sections live.
-  } while (!worklist.empty());
+  for (int i = 0; i < poolSize; ++i) {
+    threads.push_back(std::thread([&tasks, i]() { tasks[i].work(); }));
+  }
+  for (auto &thread : threads) {
+    thread.join();
+  }
 }
 
 } // namespace macho
diff --git a/lld/MachO/WorkStealingQueue.h b/lld/MachO/WorkStealingQueue.h
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.h
@@ -0,0 +1,83 @@
+#ifndef LLD_MACHO_WORKSTEALINGQUEUE_H
+#define LLD_MACHO_WORKSTEALINGQUEUE_H
+
+#include "llvm/ADT/Optional.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Actual data holder for the queue.
+// This is a convenience wrapper around an array of atomics to simplify
+// operations in the WorkStealingQueue class.
+template <typename T> class Node {
+public:
+  explicit Node(size_t capacity)
+      : cap_{capacity}, mask_{capacity - 1}, arr_{new std::atomic<T>[cap_]} {}
+
+  ~Node() { delete[] arr_; }
+
+  size_t capacity() const { return cap_; }
+
+  // Resize to cap * 2 and copy all items over in the range [head, tail).
+  Node *resize(size_t head, size_t tail);
+
+  // Store item to index i.
+  template <typename E> void store(size_t i, E &&e);
+
+  // Remove item at index i and return it.
+  T remove(size_t i);
+
+private:
+  size_t cap_;
+  const size_t mask_;
+  std::atomic<T> *arr_;
+};
+
+} // namespace internal
+
+template <typename T> class WorkStealingQueue {
+public:
+  explicit WorkStealingQueue(size_t capacity = 1024);
+
+  ~WorkStealingQueue();
+
+  size_t capacity() const;
+  size_t size() const;
+  bool isEmpty() const;
+
+  // Standard queue operations.
+  // Note: only one thread may push()/pop(), but any number of threads may
+  // steal().
+
+  // Push an item onto the queue.
+  template <typename E> void push(E &&e);
+  // Pop an item off the queue.
+  llvm::Optional<T> pop();
+  // Steal an item from the queue.
+  llvm::Optional<T> steal();
+
+private:
+  std::atomic<size_t> head_; // Index of the first item.
+  std::atomic<size_t> tail_; // Index one past the last item.
+
+  // Pointer to the actual data array.
+  std::atomic<internal::Node<T> *> node_;
+
+  // Holds the nodes that became too small and were replaced by a resize.
+  // We don't want to pause and delete in the middle of a run, so we collect
+  // them here and delete them when the queue is destroyed.
+  // FIXME: may not need this.
+  std::vector<internal::Node<T> *> retired_; // Retired nodes.
+};
+
+} // namespace macho
+} // namespace lld
+
+#include "WorkStealingQueue.inc"
+
+#endif // LLD_MACHO_WORKSTEALINGQUEUE_H
diff --git a/lld/MachO/WorkStealingQueue.inc b/lld/MachO/WorkStealingQueue.inc
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.inc
@@ -0,0 +1,123 @@
+#ifndef LLD_MACHO_WORKSTEALINGQUEUE_H
+#error "WorkStealingQueue.inc must be included via WorkStealingQueue.h"
+#else
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Resize to cap * 2 and copy all items over in the range [head, tail).
+template <typename T>
+Node<T> *Node<T>::resize(size_t head, size_t tail) {
+  Node<T> *ptr = new Node<T>{2 * cap_};
+  for (size_t i = head; i < tail; ++i) {
+    ptr->store(i, remove(i));
+  }
+  return ptr;
+}
+
+template <typename T>
+template <typename E>
+void Node<T>::store(size_t i, E &&e) {
+  arr_[i & mask_].store(std::forward<E>(e), std::memory_order_relaxed);
+}
+
+template <typename T> T Node<T>::remove(size_t i) {
+  return arr_[i & mask_].load(std::memory_order_relaxed);
+}
+
+} // namespace internal
+
+template <typename T>
+WorkStealingQueue<T>::WorkStealingQueue(size_t cap) {
+  // The capacity must be a power of 2 because Node indexes with a bit mask.
+  assert(cap > 0 && (cap & (cap - 1)) == 0);
+  head_.store(0, std::memory_order_relaxed);
+  tail_.store(0, std::memory_order_relaxed);
+  node_.store(new internal::Node<T>{cap}, std::memory_order_relaxed);
+  retired_.reserve(64);
+}
+
+template <typename T> WorkStealingQueue<T>::~WorkStealingQueue() {
+  for (auto *r : retired_)
+    delete r;
+  retired_.clear();
+  delete node_.load();
+}
+
+template <typename T> bool WorkStealingQueue<T>::isEmpty() const {
+  return head_.load(std::memory_order_relaxed) >=
+         tail_.load(std::memory_order_relaxed);
+}
+
+template <typename T> size_t WorkStealingQueue<T>::size() const {
+  size_t t = tail_.load(std::memory_order_relaxed);
+  size_t h = head_.load(std::memory_order_relaxed);
+  return t >= h ? t - h : 0;
+}
+
+template <typename T>
+template <typename E>
+void WorkStealingQueue<T>::push(E &&e) {
+  size_t tailIdx = tail_.load(std::memory_order_relaxed);
+  size_t headIdx = head_.load(std::memory_order_acquire);
+  internal::Node<T> *a = node_.load(std::memory_order_relaxed);
+
+  if (a->capacity() < tailIdx - headIdx + 1) {
+    internal::Node<T> *tmp = a->resize(headIdx, tailIdx);
+    retired_.push_back(a);
+    std::swap(a, tmp);
+    // a now points at the resized node; publish it.
+    node_.store(a, std::memory_order_relaxed);
+  }
+
+  a->store(tailIdx, std::forward<E>(e));
+  std::atomic_thread_fence(std::memory_order_release);
+  tail_.store(tailIdx + 1, std::memory_order_relaxed);
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::pop() {
+  // Bail out early on an empty queue; this also keeps the tail index from
+  // wrapping below zero before anything has been pushed.
+  if (isEmpty())
+    return llvm::None;
+
+  size_t tailIdx = tail_.load(std::memory_order_relaxed) - 1;
+  internal::Node<T> *curNode = node_.load(std::memory_order_relaxed);
+  tail_.store(tailIdx, std::memory_order_relaxed);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t headIdx = head_.load(std::memory_order_relaxed);
+
+  if (headIdx > tailIdx) {
+    tail_.store(tailIdx + 1, std::memory_order_relaxed);
+    return llvm::None;
+  }
+
+  llvm::Optional<T> ret = curNode->remove(tailIdx);
+  if (headIdx == tailIdx) {
+    // This was the last item; race against concurrent steal()s for it.
+    if (!head_.compare_exchange_strong(headIdx, headIdx + 1,
+                                       std::memory_order_seq_cst,
+                                       std::memory_order_relaxed))
+      ret = llvm::None;
+    tail_.store(tailIdx + 1, std::memory_order_relaxed);
+  }
+
+  return ret;
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::steal() {
+  size_t headIdx = head_.load(std::memory_order_acquire);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t tailIdx = tail_.load(std::memory_order_acquire);
+
+  if (headIdx >= tailIdx)
+    return llvm::None;
+
+  internal::Node<T> *curNode = node_.load(std::memory_order_consume);
+  llvm::Optional<T> ret = curNode->remove(headIdx);
+  if (!head_.compare_exchange_strong(headIdx, headIdx + 1,
+                                     std::memory_order_seq_cst,
+                                     std::memory_order_relaxed))
+    return llvm::None;
+  return ret;
+}
+
+template <typename T> size_t WorkStealingQueue<T>::capacity() const {
+  return node_.load(std::memory_order_relaxed)->capacity();
+}
+
+} // namespace macho
+} // namespace lld
+
+#endif
diff --git a/lld/unittests/CMakeLists.txt b/lld/unittests/CMakeLists.txt
--- a/lld/unittests/CMakeLists.txt
+++ b/lld/unittests/CMakeLists.txt
@@ -14,3 +14,4 @@
 add_subdirectory(DriverTests)
 add_subdirectory(MachOTests)
+add_subdirectory(MachOUnitTests)
diff --git a/lld/unittests/MachOUnitTests/CMakeLists.txt b/lld/unittests/MachOUnitTests/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_lld_unittest(lldMachOUnitTests
+  WorkStealingQueueTest.cpp
+  )
+
+target_link_libraries(lldMachOUnitTests
+  PRIVATE
+  lldMachO
+  lldYAML
+  )
diff --git a/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
@@ -0,0 +1,56 @@
+#include <atomic>
+#include <thread>
+
+#include "../../MachO/WorkStealingQueue.h"
+#include "gtest/gtest.h"
+
+TEST(SmokeTest, Test) {
+  lld::macho::WorkStealingQueue<int> queue;
+  EXPECT_TRUE(queue.isEmpty());
+  queue.push(42);
+  EXPECT_EQ(42, *queue.pop());
+  EXPECT_TRUE(queue.isEmpty());
+}
+
+TEST(FunctionalTest, Test) {
+  const int max = 999;
+  lld::macho::WorkStealingQueue<int> queue;
+
+  // Count how many times each item is seen.
+  std::atomic<int> counts[max];
+  for (int i = 0; i < max; ++i) {
+    counts[i].store(0, std::memory_order_relaxed);
+  }
+
+  // Multiple thieves.
+  auto do_steal = [&]() {
+    while (!queue.isEmpty()) {
+      llvm::Optional<int> item = queue.steal();
+      if (item)
+        counts[*item].fetch_add(1, std::memory_order_relaxed);
+    }
+  };
+
+  // Only one producer.
+  std::thread producer([&]() {
+    for (int i = 0; i < max; ++i) {
+      queue.push(i);
+    }
+
+    // Help drain whatever the thieves have not taken.
+    do_steal();
+    do_steal();
+  });
+
+  const int thiefCount = 10;
+  std::thread thieves[thiefCount];
+  for (int i = 0; i < thiefCount; ++i) {
+    thieves[i] = std::thread(do_steal);
+  }
+
+  producer.join();
+  for (auto &thief : thieves)
+    thief.join();
+
+  // Check that every item was consumed exactly once.
+  for (int i = 0; i < max; ++i) {
+    const int value = counts[i].load(std::memory_order_relaxed);
+    EXPECT_EQ(1, value);
+  }
+}
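
Reviewer note: the pattern markLive() builds on is easier to see outside the linker. Below is a minimal, self-contained sketch (not part of this patch) of the single-owner / multi-thief discipline: each worker drains its own queue with pop() and falls back to steal() on its neighbours' queues. The worker count, item count, and include path are illustrative assumptions only.

// work_stealing_sketch.cpp -- illustrative only, not part of the patch.
#include <atomic>
#include <thread>
#include <vector>

#include "lld/MachO/WorkStealingQueue.h" // assumed include path

using lld::macho::WorkStealingQueue;

int main() {
  constexpr int numWorkers = 4;  // stand-in for poolSize
  constexpr int numItems = 1000; // stand-in for the GC roots
  WorkStealingQueue<int> queues[numWorkers];
  std::atomic<int> processed{0};

  // Distribute the work up front, before any worker starts. Only the owning
  // worker may push()/pop() a queue once the threads are running.
  for (int i = 0; i < numItems; ++i)
    queues[i % numWorkers].push(i);

  std::vector<std::thread> workers;
  for (int w = 0; w < numWorkers; ++w) {
    workers.emplace_back([&queues, &processed, w] {
      // Visit our own queue first, then each neighbour in turn.
      for (int tried = 0; tried < numWorkers; ++tried) {
        int victim = (w + tried) % numWorkers;
        while (true) {
          // pop() on our own queue, steal() on everyone else's.
          llvm::Optional<int> item =
              victim == w ? queues[victim].pop() : queues[victim].steal();
          if (!item)
            break;
          processed.fetch_add(1, std::memory_order_relaxed);
        }
      }
    });
  }
  for (std::thread &t : workers)
    t.join();

  // With a fixed workload, every item is processed exactly once.
  return processed.load() == numItems ? 0 : 1;
}

Unlike Task::help(), this sketch handles a fixed workload, so one pass over the queues is enough; in markLive() new sections keep being discovered during the traversal, which is why help() keeps looping until its own queue stays empty.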