diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt
--- a/lld/MachO/CMakeLists.txt
+++ b/lld/MachO/CMakeLists.txt
@@ -30,6 +30,8 @@
   SyntheticSections.cpp
   Target.cpp
   UnwindInfoSection.cpp
+  WorkStealingQueue.h
+  WorkStealingQueue.inc
   Writer.cpp
 
   LINK_COMPONENTS
diff --git a/lld/MachO/MarkLive.cpp b/lld/MachO/MarkLive.cpp
--- a/lld/MachO/MarkLive.cpp
+++ b/lld/MachO/MarkLive.cpp
@@ -12,54 +12,252 @@
 #include "SymbolTable.h"
 #include "Symbols.h"
 #include "UnwindInfoSection.h"
+#include "WorkStealingQueue.h"
 #include "mach-o/compact_unwind_encoding.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/ScopeExit.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Parallel.h"
+#include "llvm/Support/Threading.h"
 #include "llvm/Support/TimeProfiler.h"
+#include <atomic>
+#include <thread>
+#include <vector>
+
 namespace lld {
 namespace macho {
 
 using namespace llvm;
 using namespace llvm::MachO;
 
-// Set live bit on for each reachable chunk. Unmarked (unreachable)
-// InputSections will be ignored by Writer, so they will be excluded
-// from the final output.
-void markLive() {
-  TimeTraceScope timeScope("markLive");
+static const size_t poolSize = PowerOf2Floor(std::min<size_t>(
+    llvm::hardware_concurrency(llvm::parallel::strategy.ThreadsRequested)
+        .compute_thread_count(),
+    32));
+static std::atomic<size_t> done(0);
+
+struct Worker {
+
+  Worker() : idx(0), skip(true), pool(nullptr) {}
 
-  // We build up a worklist of sections which have been marked as live. We only
-  // push into the worklist when we discover an unmarked section, and we mark
-  // as we push, so sections never appear twice in the list.
-  // Literal sections cannot contain references to other sections, so we only
-  // store ConcatInputSections in our worklist.
-  SmallVector<ConcatInputSection *, 256> worklist;
+  Worker(size_t idx, WorkStealingQueue<InputSection *> *queues)
+      : idx(idx), pool(queues) {
+    // Calculate the portion of inputSections this worker owns.
+    // We divide inputSections into poolSize contiguous groups.
+    // If there are fewer inputSections than workers, only the first
+    // inputSections.size() workers get one item each.
+    start = idx * groupSize;
+    end = start + groupSize - 1;
+    if (start < idx || end >= inputSections.size()) {
+      // Not all workers will have inputSections.
+      if (idx < inputSections.size()) {
+        start = end = idx;
+        skip = false;
+      } else {
+        skip = true;
+      }
+    } else {
+      skip = false;
+      // The last worker also takes the remainder that the integer division
+      // above would otherwise drop.
+      if (idx == poolSize - 1)
+        end = inputSections.size() - 1;
+    }
+  }
 
-  auto enqueue = [&](InputSection *isec, uint64_t off) {
-    if (isec->isLive(off))
+  size_t queueSize() const { return pool[idx].size(); }
+
+  void addSect(InputSection *s, uint64_t off) {
+    if (!s)
+      return;
+    if (s->isLive(off))
       return;
-    isec->markLive(off);
-    if (auto s = dyn_cast<ConcatInputSection>(isec)) {
-      assert(!s->isCoalescedWeak());
-      worklist.push_back(s);
+    s->markLive(off);
+
+    if (auto *cs = dyn_cast<ConcatInputSection>(s)) {
+      assert(!cs->isCoalescedWeak());
+      pool[idx].push(s);
     }
-  };
-  auto addSym = [&](Symbol *s) {
-    if (s->used)
+    checkLiveSupport();
+  }
+
+  void addSym(Symbol *s) {
+    if (!s || s->used)
       return;
+    s->used = true;
     if (auto *d = dyn_cast<Defined>(s)) {
       if (d->isec)
-        enqueue(d->isec, d->value);
+        addSect(d->isec, d->value);
      if (d->compactUnwind)
-        enqueue(d->compactUnwind, 0);
+        addSect(d->compactUnwind, 0);
     }
-  };
+    checkSymbolLiveSupport();
+  }
+
+  void runOnQueue(size_t targetIdx) {
+    do {
+      while (!pool[targetIdx].isEmpty()) {
+        auto item =
+            targetIdx == idx ? pool[targetIdx].pop() : pool[targetIdx].steal();
+
+        if (!item)
+          continue;
+        InputSection *sect = *item;
+        if (sect) {
+          // Mark all symbols listed in the relocation table for this section.
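+          // Each referent is a PointerUnion: either a Symbol or a raw
+          // InputSection. Symbols go through addSym() so that their
+          // compact-unwind entries are marked as well; section referents
+          // go straight to addSect().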
+          for (const Reloc &r : sect->relocs) {
+            if (auto *ref = r.referent.dyn_cast<Symbol *>())
+              addSym(ref);
+            else
+              addSect(r.referent.get<InputSection *>(), r.addend);
+          }
+        }
+      }
+
+      // TODO: we can avoid iterating inputSections multiple times.
+      //
+      // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live
+      // section. Process them in a second pass.
+      if (!skip) {
+        for (size_t i = start; i <= end; ++i) {
+          ConcatInputSection *isec = inputSections[i];
+          // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
+          // separate vector and only walking that here is faster.
+          if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
+            continue;
+
+          for (const Reloc &r : isec->relocs) {
+            if (auto *s = r.referent.dyn_cast<Symbol *>()) {
+              if (s->isLive())
+                addSect(isec, 0);
+              else
+                symbolLiveSupport[s] = isec;
+            } else {
+              InputSection *referentIsec = r.referent.get<InputSection *>();
+              if (referentIsec->isLive(r.addend))
+                addSect(isec, 0);
+              else
+                liveSupport[std::pair<InputSection *, uint64_t>(
+                    referentIsec, r.addend)] = isec;
+            }
+          }
+        }
+      }
+    } while (!pool[idx].isEmpty());
+
+    while (done.load(std::memory_order_relaxed) < poolSize) {
+      size_t target = targetIdx;
+      // Keep looking for a non-empty queue until all workers are done.
+      do {
+        ++target;
+        // A comparison is cheaper than a modulo.
+        if (target >= poolSize)
+          target = 0;
+        if (target == targetIdx)
+          break;
+      } while (pool[target].isEmpty());
+
+      if (!pool[target].isEmpty()) {
+        runOnQueue(target);
+      } else {
+        if (!markDone)
+          done.fetch_add(1, std::memory_order_acquire);
+        markDone = true;
+        break;
+      }
+    }
+
+    assert(liveSupport.empty() && symbolLiveSupport.empty());
+  }
+
+  void checkLiveSupport() {
+    for (auto entry = liveSupport.begin(); entry != liveSupport.end();) {
+      if (entry->first.first->isLive(entry->first.second)) {
+        InputSection *supported = entry->second;
+        liveSupport.erase(entry++);
+        addSect(supported, 0);
+      } else {
+        ++entry;
+      }
+    }
+  }
+
+  void checkSymbolLiveSupport() {
+    for (auto entry = symbolLiveSupport.begin();
+         entry != symbolLiveSupport.end();) {
+      if (entry->first->isLive()) {
+        InputSection *supported = entry->second;
+        symbolLiveSupport.erase(entry++);
+        addSect(supported, 0);
+      } else {
+        ++entry;
+      }
+    }
+  }
+
+  void run() {
+    // Start by looking at our own queue first.
+    runOnQueue(idx);
+  }
+
+  // The number of InputSections each Worker is initially assigned.
+  // Each Worker owns a contiguous block of InputSections [start, end].
+  const size_t groupSize = inputSections.size() / poolSize;
+
+  const size_t idx;      // Index of this worker's queue in the pool.
+  bool markDone = false; // If true, this thread has signaled that it's done.
+  size_t start;          // Start index into inputSections.
+  size_t end;            // End index into inputSections (inclusive).
+  bool skip;             // If true, this worker owns no inputSections.
+  WorkStealingQueue<InputSection *> *pool;
+
+  // Mapping of referents to their referrers.
+  DenseMap<std::pair<InputSection *, uint64_t>, InputSection *> liveSupport;
+  DenseMap<Symbol *, InputSection *> symbolLiveSupport;
+};
 
-  // Add GC roots.
-  if (config->entry)
-    addSym(config->entry);
-  for (Symbol *sym : symtab->getSymbols()) {
+// Split `size` items into roughly equal contiguous ranges and run `func` over
+// each range on its own thread.
+// Func: void(Worker &worker, size_t j)
+//   worker: the worker whose queue the item should be added to.
+//   j: the index of the item to be added.
+template <typename Func>
+static void splitTasks(std::vector<Worker> &workers, size_t size, Func func) {
+  const size_t d = size / poolSize;
+  std::vector<std::thread> threads;
+  threads.reserve(poolSize);
+
+  for (size_t idx = 0; idx < poolSize; ++idx) {
+    size_t start = idx * d;
+    size_t end = start + d - 1;
+    bool skip = false;
+
+    if (start < idx || end >= size) {
+      if (idx < size) {
+        start = end = idx;
+        skip = false;
+      } else {
+        skip = true;
+      }
+    } else if (idx == poolSize - 1) {
+      // The last worker also takes the remainder that the integer division
+      // above would otherwise drop.
+      end = size - 1;
+    }
+    if (!skip)
+      threads.push_back(std::thread([&func, idx, start, end, &workers]() {
+        Worker &worker = workers[idx];
+        for (size_t j = start; j <= end; ++j) {
+          func(worker, j);
+        }
+      }));
+  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
+}
+
+void populateFromSymtab(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateFromSymtab");
+
+  const auto &entries = symtab->getSymbols();
+  auto doAdd = [&](Worker &worker, int i) {
+    Symbol *sym = entries[i];
     if (auto *defined = dyn_cast<Defined>(sym)) {
       // -exported_symbol(s_list)
       if (!config->exportedSymbols.empty() &&
@@ -69,17 +267,18 @@
         // explicitUndefineds code below would handle this automatically.
         assert(!defined->privateExtern &&
                "should have been rejected by driver");
-        addSym(defined);
-        continue;
+        worker.addSym(defined);
+        return;
       }
 
       // public symbols explicitly marked .no_dead_strip
       if (defined->referencedDynamically || defined->noDeadStrip) {
-        addSym(defined);
-        continue;
+        worker.addSym(defined);
+        return;
       }
 
-      // FIXME: When we implement these flags, make symbols from them GC roots:
+      // FIXME: When we implement these flags, make symbols from them GC
+      // roots:
       // * -reexported_symbol(s_list)
       // * -alias(-list)
       // * -init
@@ -89,81 +288,107 @@
       bool externsAreRoots =
           config->outputType != MH_EXECUTE || config->exportDynamic;
       if (externsAreRoots && !defined->privateExtern) {
-        addSym(defined);
-        continue;
+        worker.addSym(defined);
+        return;
       }
     }
+  };
+
+  const size_t size = entries.size();
+  splitTasks(workers, size, doAdd);
+}
+
+void populateFromSpecialSymbols(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateFromSpecialSymbols");
+
+  // For simplicity, add the -u symbols to the queue with the fewest tasks.
+  int minIdx = 0;
+  size_t min = workers[minIdx].queueSize();
+  for (size_t i = 1; i < poolSize; ++i) {
+    if (workers[i].queueSize() < min) {
+      min = workers[i].queueSize();
+      minIdx = i;
+    }
   }
-  // -u symbols
+  Worker &worker = workers[minIdx];
   for (Symbol *sym : config->explicitUndefineds)
     if (auto *defined = dyn_cast<Defined>(sym))
-      addSym(defined);
+      worker.addSym(defined);
+  if (auto *stubBinder =
+          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
+    worker.addSym(stubBinder);
+}
+
+void populateNoDeadStrips(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateNoDeadStrips");
+  std::vector<Defined *> syms;
   // local symbols explicitly marked .no_dead_strip
   for (const InputFile *file : inputFiles)
     if (auto *objFile = dyn_cast<ObjFile>(file))
      for (Symbol *sym : objFile->symbols)
         if (auto *defined = dyn_cast_or_null<Defined>(sym))
           if (!defined->isExternal() && defined->noDeadStrip)
-            addSym(defined);
-  if (auto *stubBinder =
-          dyn_cast_or_null<Defined>(symtab->find("dyld_stub_binder")))
-    addSym(stubBinder);
-  for (ConcatInputSection *isec : inputSections) {
+            syms.push_back(defined);
+
+  splitTasks(workers, syms.size(),
+             [&](Worker &worker, int i) { worker.addSym(syms[i]); });
+}
+
+void populateFromInputSections(std::vector<Worker> &workers) {
+  TimeTraceScope timeScope("populateFromInputSections");
+  auto doAdd = [&](Worker &worker, int i) {
+    ConcatInputSection *isec = inputSections[i];
     // Sections marked no_dead_strip
     if (isec->getFlags() & S_ATTR_NO_DEAD_STRIP) {
-      enqueue(isec, 0);
-      continue;
+      worker.addSect(isec, 0);
+      return;
    }
 
     // mod_init_funcs, mod_term_funcs sections
     if (sectionType(isec->getFlags()) == S_MOD_INIT_FUNC_POINTERS ||
         sectionType(isec->getFlags()) == S_MOD_TERM_FUNC_POINTERS) {
-      enqueue(isec, 0);
-      continue;
+      worker.addSect(isec, 0);
+      return;
     }
-  }
+  };
 
-  do {
-    // Mark things reachable from GC roots as live.
-    while (!worklist.empty()) {
-      ConcatInputSection *s = worklist.pop_back_val();
-      assert(s->live && "We mark as live when pushing onto the worklist!");
-
-      // Mark all symbols listed in the relocation table for this section.
-      for (const Reloc &r : s->relocs) {
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          addSym(s);
-        else
-          enqueue(r.referent.get<InputSection *>(), r.addend);
-      }
-      for (Defined *d : s->symbols)
-        addSym(d);
-    }
+  splitTasks(workers, inputSections.size(), doAdd);
+}
 
-    // S_ATTR_LIVE_SUPPORT sections are live if they point _to_ a live section.
-    // Process them in a second pass.
-    for (ConcatInputSection *isec : inputSections) {
-      // FIXME: Check if copying all S_ATTR_LIVE_SUPPORT sections into a
-      // separate vector and only walking that here is faster.
-      if (!(isec->getFlags() & S_ATTR_LIVE_SUPPORT) || isec->live)
-        continue;
-
-      for (const Reloc &r : isec->relocs) {
-        bool referentLive;
-        if (auto *s = r.referent.dyn_cast<Symbol *>())
-          referentLive = s->isLive();
-        else
-          referentLive = r.referent.get<InputSection *>()->isLive(r.addend);
-        if (referentLive)
-          enqueue(isec, 0);
-      }
-    }
+// Set live bit on for each reachable chunk. Unmarked (unreachable)
+// InputSections will be ignored by Writer, so they will be excluded
+// from the final output.
+void markLive() {
+  TimeTraceScope timeScope("markLive");
+
+  WorkStealingQueue<InputSection *> *queues =
+      new WorkStealingQueue<InputSection *>[poolSize];
+  std::vector<Worker> workers;
+  auto cleanup = make_scope_exit([&] {
+    workers.clear();
+    delete[] queues;
+  });
+
+  workers.reserve(poolSize);
+  for (size_t i = 0; i < poolSize; ++i)
+    workers.push_back(Worker(i, queues));
 
-    // S_ATTR_LIVE_SUPPORT could have marked additional sections live,
-    // which in turn could mark additional S_ATTR_LIVE_SUPPORT sections live.
-    // Iterate. In practice, the second iteration won't mark additional
-    // S_ATTR_LIVE_SUPPORT sections live.
-  } while (!worklist.empty());
+  // Give the entry point to the last queue; chances are it has the least work.
+  workers[poolSize - 1].addSym(config->entry);
+
+  // Populate the queues with the initial GC roots.
+  populateFromSymtab(workers);
+  populateFromSpecialSymbols(workers);
+  populateNoDeadStrips(workers);
+  populateFromInputSections(workers);
+
+  std::vector<std::thread> threads;
+  threads.reserve(poolSize);
+  for (size_t i = 0; i < poolSize; ++i) {
+    threads.emplace_back([&workers, i]() { workers[i].run(); });
  }
+
+  for (auto &thread : threads) {
+    thread.join();
+  }
 }
 
 } // namespace macho
diff --git a/lld/MachO/WorkStealingQueue.h b/lld/MachO/WorkStealingQueue.h
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.h
@@ -0,0 +1,84 @@
+#ifndef LLD_MACHO_WORK_STEALING_QUEUE_H
+#define LLD_MACHO_WORK_STEALING_QUEUE_H
+
+#include "llvm/ADT/Optional.h"
+
+#include <atomic>
+#include <cassert>
+#include <cstddef>
+#include <vector>
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Actual data holder for the queue.
+// This is a convenience wrapper around an array of atomics to simplify
+// operations in the WorkStealingQueue class.
+template <typename T> class Node {
+public:
+  // capacity must be a power of two: indexing is done with `& mask`.
+  explicit Node(size_t capacity)
+      : capacity{capacity}, mask{capacity - 1},
+        arr{new std::atomic<T>[capacity]} {}
+
+  ~Node() { delete[] arr; }
+
+  size_t getCapacity() const { return capacity; }
+
+  // Resize to capacity * 2 and copy over all items in the range [head, tail).
+  Node *resize(size_t head, size_t tail);
+
+  // Store item at index i.
+  template <typename E> void store(size_t i, E &&e);
+
+  // Remove the item at index i and return it.
+  T remove(size_t i);
+
+private:
+  size_t capacity;
+  const size_t mask;
+  std::atomic<T> *arr;
+};
+
+} // namespace internal
+
+template <typename T> class WorkStealingQueue {
+public:
+  explicit WorkStealingQueue(size_t capacity = 1024);
+
+  ~WorkStealingQueue();
+
+  size_t getCapacity() const;
+  size_t size() const;
+  bool isEmpty() const;
+
+  // Standard queue operations.
+  // Note: only one thread may push()/pop(), but many threads may steal().
+
+  // Push an item onto the queue.
+  template <typename E> void push(E &&e);
+  // Pop an item from the queue.
+  llvm::Optional<T> pop();
+  // Steal an item from the queue.
+  llvm::Optional<T> steal();
+
+private:
+  std::atomic<size_t> head; // Index of the first item.
+  std::atomic<size_t> tail; // Index one past the last item.
+
+  // Pointer to the actual data array.
+  std::atomic<internal::Node<T> *> node;
+
+  // This holds all the used nodes that were too small (i.e., prior to
+  // resizing). We don't want to pause and delete in the middle of the runs,
+  // so we collect them in this vector and delete them later.
+  // FIXME: may not need this.
+  std::vector<internal::Node<T> *> retired; // Retired nodes.
+};
+
+} // namespace macho
+} // namespace lld
+
+#include "WorkStealingQueue.inc"
+
+#endif // LLD_MACHO_WORK_STEALING_QUEUE_H
diff --git a/lld/MachO/WorkStealingQueue.inc b/lld/MachO/WorkStealingQueue.inc
new file mode 100644
--- /dev/null
+++ b/lld/MachO/WorkStealingQueue.inc
@@ -0,0 +1,123 @@
+#ifndef LLD_MACHO_WORK_STEALING_QUEUE_H
+#error Must be included with header
+#else
+
+namespace lld {
+namespace macho {
+namespace internal {
+
+// Resize to capacity * 2 and copy over all items in the range [head, tail).
+template <typename T> Node<T> *Node<T>::resize(size_t head, size_t tail) {
+  Node<T> *ptr = new Node<T>{2 * capacity};
+  for (size_t i = head; i < tail; ++i) {
+    ptr->store(i, remove(i));
+  }
+  return ptr;
+}
+
+template <typename T>
+template <typename E>
+void Node<T>::store(size_t i, E &&e) {
+  arr[i & mask].store(std::forward<E>(e), std::memory_order_relaxed);
+}
+
+template <typename T> T Node<T>::remove(size_t i) {
+  return arr[i & mask].load(std::memory_order_relaxed);
+}
+
+} // namespace internal
+
+template <typename T> WorkStealingQueue<T>::WorkStealingQueue(size_t cap) {
+  assert(cap > 0);
+  head.store(0, std::memory_order_relaxed);
+  tail.store(0, std::memory_order_relaxed);
+  node.store(new internal::Node<T>{cap}, std::memory_order_relaxed);
+  retired.reserve(64);
+}
+
+template <typename T> WorkStealingQueue<T>::~WorkStealingQueue() {
+  for (auto *r : retired)
+    delete r;
+  retired.clear();
+  delete node.load();
+}
+
+template <typename T> bool WorkStealingQueue<T>::isEmpty() const {
+  return head.load(std::memory_order_relaxed) >=
+         tail.load(std::memory_order_relaxed);
+}
+
+template <typename T> size_t WorkStealingQueue<T>::size() const {
+  size_t t = tail.load(std::memory_order_relaxed);
+  size_t h = head.load(std::memory_order_relaxed);
+  return t >= h ? t - h : 0;
+}
+
+template <typename T>
+template <typename E>
+void WorkStealingQueue<T>::push(E &&e) {
+  size_t tailIdx = tail.load(std::memory_order_relaxed);
+  size_t headIdx = head.load(std::memory_order_acquire);
+  internal::Node<T> *a = node.load(std::memory_order_relaxed);
+
+  if (a->getCapacity() < tailIdx - headIdx + 1) {
+    // Grow: publish the new node and retire the old one.
+    internal::Node<T> *tmp = a->resize(headIdx, tailIdx);
+    retired.push_back(a);
+    node.store(tmp, std::memory_order_relaxed);
+    a = tmp;
+  }
+
+  a->store(tailIdx, std::forward<E>(e));
+  std::atomic_thread_fence(std::memory_order_release);
+  tail.store(tailIdx + 1, std::memory_order_relaxed);
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::pop() {
+  size_t tailIdx = tail.load(std::memory_order_relaxed) - 1;
+  internal::Node<T> *curNode = node.load(std::memory_order_relaxed);
+  tail.store(tailIdx, std::memory_order_relaxed);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t headIdx = head.load(std::memory_order_relaxed);
+
+  if (headIdx > tailIdx) {
+    tail.store(tailIdx + 1, std::memory_order_relaxed);
+    return llvm::None;
+  }
+
+  llvm::Optional<T> ret = curNode->remove(tailIdx);
+  if (headIdx == tailIdx) {
+    // This was the last item; race against concurrent steal()s for it.
+    if (!head.compare_exchange_strong(headIdx, headIdx + 1,
+                                      std::memory_order_seq_cst,
+                                      std::memory_order_relaxed))
+      ret = llvm::None;
+    tail.store(tailIdx + 1, std::memory_order_relaxed);
+  }
+
+  return ret;
+}
+
+template <typename T> llvm::Optional<T> WorkStealingQueue<T>::steal() {
+  size_t headIdx = head.load(std::memory_order_acquire);
+  std::atomic_thread_fence(std::memory_order_seq_cst);
+  size_t tailIdx = tail.load(std::memory_order_acquire);
+
+  if (headIdx >= tailIdx)
+    return llvm::None;
+
+  internal::Node<T> *curNode = node.load(std::memory_order_consume);
+  llvm::Optional<T> ret = curNode->remove(headIdx);
+  if (!head.compare_exchange_strong(headIdx, headIdx + 1,
+                                    std::memory_order_seq_cst,
+                                    std::memory_order_relaxed))
+    return llvm::None;
+  return ret;
+}
+
+template <typename T> size_t WorkStealingQueue<T>::getCapacity() const {
+  return node.load(std::memory_order_relaxed)->getCapacity();
+}
+
+} // namespace macho
+} // namespace lld
+
+#endif // LLD_MACHO_WORK_STEALING_QUEUE_H
diff --git a/lld/unittests/CMakeLists.txt b/lld/unittests/CMakeLists.txt
--- a/lld/unittests/CMakeLists.txt
+++ b/lld/unittests/CMakeLists.txt
@@ -14,3 +14,4 @@
 add_subdirectory(DriverTests)
 add_subdirectory(MachOTests)
+add_subdirectory(MachOUnitTests)
diff --git a/lld/unittests/MachOUnitTests/CMakeLists.txt b/lld/unittests/MachOUnitTests/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_lld_unittest(lldMachOUnitTests
+  WorkStealingQueueTest.cpp
+  )
+
+target_link_libraries(lldMachOUnitTests
+  PRIVATE
+  lldMachO
+  lldYAML
+  )
diff --git a/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
new file mode 100644
--- /dev/null
+++ b/lld/unittests/MachOUnitTests/WorkStealingQueueTest.cpp
@@ -0,0 +1,58 @@
+#include <atomic>
+#include <thread>
+
+#include "../../lld/MachO/WorkStealingQueue.h"
+#include "gtest/gtest.h"
+
+TEST(SmokeTest, Test) {
+  // lld::macho::WorkStealingQueue<int> queue(-1);
+  // expect death???
+}
+
+TEST(FunctionalTest, Test) {
+  const int cap = 999;
+  // Count the number of accesses per item.
+  std::atomic<int> counts[cap];
+  for (int i = 0; i < cap; ++i) {
+    counts[i].store(0, std::memory_order_relaxed);
+  }
+
+  lld::macho::WorkStealingQueue<int> queue;
+
+  // Multiple thieves.
+  auto do_steal = [&]() {
+    while (!queue.isEmpty()) {
+      llvm::Optional<int> item = queue.steal();
+      if (item)
+        counts[*item].fetch_add(1, std::memory_order_relaxed);
+    }
+  };
+
+  // Only one producer.
+  std::thread producer([&]() {
+    for (int i = 0; i < cap; ++i) {
+      queue.push(i);
+    }
+
+    do_steal();
+    do_steal();
+  });
+
+  const int thiefCount = 10;
+  std::thread thieves[thiefCount];
+  for (int i = 0; i < thiefCount; ++i) {
+    thieves[i] = std::thread(do_steal);
+  }
+
+  producer.join();
+  for (auto &thief : thieves)
+    thief.join();
+
+  // Check that every item was counted exactly once.
+  for (int i = 0; i < cap; ++i) {
+    const int value = counts[i].load(std::memory_order_relaxed);
+    EXPECT_EQ(1, value);
+  }
+}
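
Usage sketch (reviewer note, not part of the patch): a minimal single-producer / multi-consumer example of the WorkStealingQueue<T> API declared in WorkStealingQueue.h above. push() and pop() are owner-thread-only; steal() may be called from any thread. The include path, main() harness, and item values are illustrative assumptions; only the queue API itself comes from the diff.

// Minimal sketch, assuming the WorkStealingQueue<T> API from the patch above.
// The include path is hypothetical; adjust to wherever the header lives.
#include "WorkStealingQueue.h"

#include <cstdio>
#include <thread>

int main() {
  lld::macho::WorkStealingQueue<int> queue;

  // Owner thread: push work items onto the back of the queue.
  for (int i = 0; i < 8; ++i)
    queue.push(i);

  // Thief thread: steal from the front concurrently.
  std::thread thief([&queue] {
    while (!queue.isEmpty())
      if (llvm::Optional<int> item = queue.steal())
        std::printf("stole %d\n", *item);
  });

  // Owner drains whatever is left from its own end.
  while (llvm::Optional<int> item = queue.pop())
    std::printf("popped %d\n", *item);

  thief.join();
  return 0;
}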