diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp --- a/lld/MachO/Arch/ARM64.cpp +++ b/lld/MachO/Arch/ARM64.cpp @@ -19,6 +19,7 @@ #include "llvm/Support/Endian.h" #include "llvm/Support/MathExtras.h" +using namespace llvm; using namespace llvm::MachO; using namespace llvm::support::endian; using namespace lld; @@ -33,6 +34,7 @@ void writeStubHelperEntry(uint8_t *buf, const DylibSymbol &, uint64_t entryAddr) const override; const RelocAttrs &getRelocAttrs(uint8_t type) const override; + void populateThunk(InputSection *thunk, Symbol *funcSym) override; }; } // namespace @@ -103,11 +105,36 @@ ::writeStubHelperEntry(buf8, stubHelperEntryCode, sym, entryVA); } +// A thunk is the relaxed variation of stubCode. We don't need the +// extra indirection through a lazy pointer because the target address +// is known at link time. +static constexpr uint32_t thunkCode[] = { + 0x90000010, // 00: adrp x16, @page + 0x91000210, // 04: add x16, [x16,@pageoff] + 0xd61f0200, // 08: br x16 +}; + +void ARM64::populateThunk(InputSection *thunk, Symbol *funcSym) { + thunk->align = 4; + thunk->data = {reinterpret_cast(thunkCode), + sizeof(thunkCode)}; + thunk->relocs.push_back({/*type=*/ARM64_RELOC_PAGEOFF12, + /*length=*/2, /*pcrel=*/false, + /*offset=*/4, /*addend=*/0, + /*referent=*/funcSym}); + thunk->relocs.push_back({/*type=*/ARM64_RELOC_PAGE21, + /*length=*/2, /*pcrel=*/true, + /*offset=*/0, /*addend=*/0, + /*referent=*/funcSym}); +} + ARM64::ARM64() : ARM64Common(LP64()) { cpuType = CPU_TYPE_ARM64; cpuSubtype = CPU_SUBTYPE_ARM64_ALL; stubSize = sizeof(stubCode); + thunkSize = sizeof(thunkCode); + branchRange = maxIntN(28) - thunkSize; stubHelperHeaderSize = sizeof(stubHelperHeaderCode); stubHelperEntrySize = sizeof(stubHelperEntryCode); } diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -901,6 +901,7 @@ "too many errors emitted, stopping now " "(use --error-limit=0 to see all errors)"; 
errorHandler().errorLimit = args::getInteger(args, OPT_error_limit_eq, 20); + errorHandler().verbose = args.hasArg(OPT_verbose); if (args.hasArg(OPT_help_hidden)) { parser.printHelp(argsArr[0], /*showHidden=*/true); diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -42,6 +42,8 @@ uint32_t align = 1; uint32_t flags = 0; + uint32_t callSiteCount = 0; + bool isFinal = false; // is address assigned? // How many symbols refer to this InputSection. uint32_t numRefs = 0; diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -34,20 +34,15 @@ uint64_t InputSection::getVA() const { return parent->addr + outSecOff; } -static uint64_t resolveSymbolVA(uint8_t *loc, const Symbol &sym, uint8_t type) { +static uint64_t resolveSymbolVA(const Symbol *sym, uint8_t type) { const RelocAttrs &relocAttrs = target->getRelocAttrs(type); - if (relocAttrs.hasAttr(RelocAttrBits::BRANCH)) { - if (sym.isInStubs()) - return in.stubs->addr + sym.stubsIndex * target->stubSize; - } else if (relocAttrs.hasAttr(RelocAttrBits::GOT)) { - if (sym.isInGot()) - return in.got->addr + sym.gotIndex * target->wordSize; - } else if (relocAttrs.hasAttr(RelocAttrBits::TLV)) { - if (sym.isInGot()) - return in.tlvPointers->addr + sym.gotIndex * target->wordSize; - assert(isa(&sym)); - } - return sym.getVA(); + if (relocAttrs.hasAttr(RelocAttrBits::BRANCH)) + return sym->resolveBranchVA(); + else if (relocAttrs.hasAttr(RelocAttrBits::GOT)) + return sym->resolveGotVA(); + else if (relocAttrs.hasAttr(RelocAttrBits::TLV)) + return sym->resolveTlvVA(); + return sym->getVA(); } void InputSection::writeTo(uint8_t *buf) { @@ -78,7 +73,7 @@ if (target->hasAttr(r.type, RelocAttrBits::LOAD) && !referentSym->isInGot()) target->relaxGotLoad(loc, r.type); - referentVA = resolveSymbolVA(loc, *referentSym, r.type); + referentVA = resolveSymbolVA(referentSym, r.type); 
if (isThreadLocalVariables(flags)) { // References from thread-local variable sections are treated as offsets diff --git a/lld/MachO/MergedOutputSection.h b/lld/MachO/MergedOutputSection.h --- a/lld/MachO/MergedOutputSection.h +++ b/lld/MachO/MergedOutputSection.h @@ -12,11 +12,14 @@ #include "InputSection.h" #include "OutputSection.h" #include "lld/Common/LLVM.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" namespace lld { namespace macho { +class Defined; + // Linking multiple files will inevitably mean resolving sections in different // files that are labeled with the same segment and section name. This class // contains all such sections and writes the data from each section sequentially @@ -34,10 +37,14 @@ void mergeInput(InputSection *input); void finalize() override; + bool needsThunks() const; + uint64_t estimateStubsInRangeVA(size_t callIdx) const; void writeTo(uint8_t *buf) const override; std::vector inputs; + std::vector thunks; + uint64_t callSiteCount = 0; static bool classof(const OutputSection *sec) { return sec->kind() == MergedKind; @@ -50,6 +57,30 @@ uint64_t fileSize = 0; }; +// We maintain one ThunkInfo per real function. +// +// The "active thunk" is represented by the sym/isec pair that +// turns-over during finalize(): as the call-site address advances, +// the active thunk goes out of branch-range, and we create a new +// thunk to take its place. +// +// The remaining members -- all counters -- apply to the +// collection of thunks associated with the real function. + +struct ThunkInfo { + // These denote the active thunk: + Defined *sym = nullptr; // private-extern symbol for active thunk + InputSection *isec = nullptr; // input section for active thunk + + // The following values are cumulative across all thunks on this function + uint32_t callSiteCount = 0; // how many calls to the real function? + uint32_t callSitesUsed = 0; // how many call sites processed so-far? 
+ uint32_t thunkCallCount = 0; // how many call sites went to thunk? + uint8_t sequence = 0; // how many thunks created so-far? +}; + +extern llvm::DenseMap thunkMap; + } // namespace macho } // namespace lld diff --git a/lld/MachO/MergedOutputSection.cpp b/lld/MachO/MergedOutputSection.cpp --- a/lld/MachO/MergedOutputSection.cpp +++ b/lld/MachO/MergedOutputSection.cpp @@ -7,12 +7,19 @@ //===----------------------------------------------------------------------===// #include "MergedOutputSection.h" +#include "Config.h" #include "OutputSegment.h" +#include "SymbolTable.h" +#include "Symbols.h" +#include "SyntheticSections.h" +#include "Target.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/ScopedPrinter.h" +#include + using namespace llvm; using namespace llvm::MachO; using namespace lld; @@ -26,29 +33,288 @@ align = std::max(align, input->align); mergeFlags(input); } - inputs.push_back(input); input->parent = this; + callSiteCount += input->callSiteCount; +} + +// Branch-range extension can be implemented in two ways, either through ... +// +// (1) Branch islands: Single branch instructions (also of limited range), +// that might be chained in multiple hops to reach the desired +// destination. On ARM64, as many as 16 branch islands are needed to hop +// between opposite ends of a 2 GiB program. LD64 uses branch islands exclusively, +// even when it needs excessive hops. +// +// (2) Thunks: Instruction(s) to load the destination address into a scratch +// register, followed by a register-indirect branch. Thunks are +// constructed to reach any arbitrary address, so need not be +// chained. Although thunks need not be chained, a program might need +// multiple thunks to the same destination distributed throughout a large +// program so that all call sites can have one within range. 
+// +// The optimal approach is to mix islands for destinations within two hops, +// and use thunks for destinations at greater distance. For now, we only +// implement thunks. TODO: Add support for branch islands! +// +// Internally -- as expressed in LLD's data structures -- a +// branch-range-extension thunk comprises ... +// +// (1) new Defined privateExtern symbol for the thunk named +// <function>.thunk.<sequence>, which references ... +// (2) new InputSection, which contains ... +// (3.1) new data for the instructions to load & branch to the far address + +// (3.2) new Relocs on instructions to load the far address, which reference ... +// (4.1) existing Defined extern symbol for the real function in __text, or +// (4.2) existing DylibSymbol for the real function in a dylib +// +// Nearly-optimal thunk-placement algorithm features: +// +// * Single pass: O(n) on the number of call sites. +// +// * Accounts for the exact space overhead of thunks - no heuristics +// +// * Exploits the full range of call instructions - forward & backward +// +// Data: +// +// * DenseMap<Symbol *, ThunkInfo> thunkMap: Maps the function symbol +// to its thunk bookkeeper. +// +// * struct ThunkInfo (bookkeeper): Call instructions have limited range, and +// distant call sites might be unable to reach the same thunk, so multiple +// thunks are necessary to serve all call sites in a very large program. A +// ThunkInfo stores state for all thunks associated with a particular +// function: (a) thunk symbol, (b) input section containing stub code, and +// (c) sequence number for the active thunk incarnation. When an old thunk +// goes out of range, we increment the sequence number and create a new +// thunk named <function>.thunk.<sequence>. +// +// * A thunk incarnation comprises (a) private-extern Defined symbol pointing +// to (b) an InputSection holding machine instructions (similar to a MachO +// stub), and (c) Reloc(s) that reference the real function for fixing-up +// the stub code. 
+// +// * std::vector<InputSection *> MergedOutputSection::thunks: A vector parallel +// to the inputs vector. We store new thunks via cheap vector append, rather +// than costly insertion into the inputs vector. +// +// Control Flow: +// +// * Writer::scanRelocations() via its helpers prepareSymbolRelocation() and +// prepareBranchTarget() dig into Reloc records. RelocAttrBits::BRANCH, +// InputSection::callSiteCount, and MergedOutputSection::callSiteCount +// memoize paths to call sites that might need thunks so that +// MergedOutputSection::finalize() can skip any Reloc, InputSection, or +// MergedOutputSection that needs no attention. +// +// * During address assignment, MergedOutputSection::finalize() examines call +// sites by ascending address and creates thunks. When a function is beyond +// the range of a call site, we need a thunk. Place it at the largest +// available forward address from the call site. Call sites increase +// monotonically and thunks are always placed as far forward as possible; +// thus, we place thunks at monotonically increasing addresses. Once a thunk +// is placed, it and all previous input-section addresses are final. +// +// * MergedOutputSection::finalize() and MergedOutputSection::writeTo() merge +// the inputs and thunks vectors (both ordered by ascending address), which +// is simple and cheap. + +// Returns true when the target uses thunks and the total size of all inputs, +// laid out with each input's alignment, exceeds target->branchRange. +bool MergedOutputSection::needsThunks() const { + if (!target->usesThunks()) + return false; + uint64_t isecAddr = addr; + for (InputSection *isec : inputs) + isecAddr = alignTo(isecAddr, isec->align) + isec->getSize(); + uint64_t totalSize = isecAddr - addr; + return totalSize > target->branchRange; +} + +DenseMap lld::macho::thunkMap; + +// When __stubs is placed after __text, we must estimate the address +// beyond which stubs are within range of a simple forward branch. 
+uint64_t MergedOutputSection::estimateStubsInRangeVA(size_t callIdx) const { + uint64_t branchRange = target->branchRange; + size_t endIdx = inputs.size(); + InputSection *isec = inputs[callIdx]; + uint64_t isecVA = isec->getVA(); + // Tally the non-stub functions which still have call sites + // remaining to process, which yields the maximum number + // of thunks we might yet place. + size_t maxPotentialThunks = 0; + for (auto &tp : thunkMap) { + ThunkInfo &ti = tp.second; + maxPotentialThunks += + !tp.first->isInStubs() && ti.callSitesUsed < ti.callSiteCount; + } + // Tally the total size of input sections remaining to process. + uint64_t isecEnd = isec->getVA(); + for (size_t i = callIdx; i < endIdx; i++) { + InputSection *isec = inputs[i]; + isecEnd = alignTo(isecEnd, isec->align) + isec->getSize(); + } + // Estimate the address after which call sites can safely call stubs + // directly rather than through intermediary thunks. + uint64_t stubsInRangeVA = isecEnd + maxPotentialThunks * target->thunkSize + + in.stubs->getSize() - branchRange; + log("thunks = " + std::to_string(thunkMap.size()) + + ", potential = " + std::to_string(maxPotentialThunks) + + ", stubs = " + std::to_string(in.stubs->getSize()) + ", isecVA = " + + to_hexString(isecVA) + ", threshold = " + to_hexString(stubsInRangeVA) + + ", isecEnd = " + to_hexString(isecEnd) + + ", tail = " + to_hexString(isecEnd - isecVA) + + ", slop = " + to_hexString(branchRange - (isecEnd - isecVA))); + return stubsInRangeVA; } void MergedOutputSection::finalize() { uint64_t isecAddr = addr; uint64_t isecFileOff = fileOff; - for (InputSection *isec : inputs) { + auto finalizeOne = [&](InputSection *isec) { isecAddr = alignTo(isecAddr, isec->align); isecFileOff = alignTo(isecFileOff, isec->align); isec->outSecOff = isecAddr - addr; isec->outSecFileOff = isecFileOff - fileOff; + isec->isFinal = true; isecAddr += isec->getSize(); isecFileOff += isec->getFileSize(); + }; + + if (!needsThunks()) { + for (InputSection 
*isec : inputs) + finalizeOne(isec); + size = isecAddr - addr; + fileSize = isecFileOff - fileOff; + return; + } + + uint64_t branchRange = target->branchRange; + uint64_t stubsInRangeVA = target->outOfRangeVA; + size_t thunkSize = target->thunkSize; + size_t relocCount = 0; + size_t callSiteCount = 0; + size_t thunkCallCount = 0; + size_t thunkCount = 0; + + // inputs[finalIdx] is for finalization (address-assignment) + size_t finalIdx = 0; + // Kick-off by ensuring that the first input section has an address + for (size_t callIdx = 0, endIdx = inputs.size(); callIdx < endIdx; + ++callIdx) { + if (finalIdx == callIdx) + finalizeOne(inputs[finalIdx++]); + InputSection *isec = inputs[callIdx]; + assert(isec->isFinal); + uint64_t isecVA = isec->getVA(); + // Assign addresses up-to the forward branch-range limit + while (finalIdx < endIdx && + isecAddr + inputs[finalIdx]->getSize() < isecVA + branchRange) + finalizeOne(inputs[finalIdx++]); + if (isec->callSiteCount == 0) + continue; + if (finalIdx == endIdx && stubsInRangeVA == target->outOfRangeVA) { + // When we have finalized all input sections, __stubs (destined + // to follow __text) comes within range of forward branches and + // we can estimate the threshold address after which we can + // reach any stub with a forward branch. Note that although it + // sits in the middle of a loop, this code executes only once. + // It is in the loop because we need to call it at the proper + // time: the earliest call site from which the end of __text + // (and start of __stubs) comes within range of a forward branch. 
+ stubsInRangeVA = estimateStubsInRangeVA(callIdx); + } + // Process relocs by ascending address, i.e., ascending offset within isec + std::vector &relocs = isec->relocs; + assert(is_sorted(relocs, + [](Reloc &a, Reloc &b) { return a.offset > b.offset; })); + for (Reloc &r : reverse(relocs)) { + ++relocCount; + if (!target->hasAttr(r.type, RelocAttrBits::BRANCH)) + continue; + ++callSiteCount; + // Calculate branch reachability boundaries + uint64_t callVA = isecVA + r.offset; + uint64_t lowVA = branchRange < callVA ? callVA - branchRange : 0; + uint64_t highVA = callVA + branchRange; + // Calculate our call referent address + auto *funcSym = r.referent.get(); + ThunkInfo &thunkInfo = thunkMap[funcSym]; + // The referent is not reachable, so we need to use a thunk ... + if (funcSym->isInStubs() && callVA >= stubsInRangeVA) { + // ... Oh, wait! We are close enough to the end that __stubs + // are now within range of a simple forward branch. + continue; + } + uint64_t funcVA = funcSym->resolveBranchVA(); + ++thunkInfo.callSitesUsed; + if (lowVA < funcVA && funcVA < highVA) { + // The referent is reachable with a simple call instruction. + continue; + } + ++thunkInfo.thunkCallCount; + ++thunkCallCount; + // If an existing thunk is reachable, use it ... + if (thunkInfo.sym) { + uint64_t thunkVA = thunkInfo.isec->getVA(); + if (lowVA < thunkVA && thunkVA < highVA) { + r.referent = thunkInfo.sym; + continue; + } + } + // ... otherwise, create a new thunk + if (isecAddr > highVA) { + // When there is small-to-no margin between highVA and + // isecAddr and the distance between subsequent call sites is + // smaller than thunkSize, then a new thunk can go out of + // range. Fix by unfinalizing inputs[finalIdx] to reduce the + // distance between callVA and highVA, then shift some thunks + // to occupy address-space formerly occupied by the + // unfinalized inputs[finalIdx]. 
+ fatal(Twine(__FUNCTION__) + ": FIXME: thunk range overrun"); + } + thunkInfo.isec = make(); + thunkInfo.isec->name = isec->name; + thunkInfo.isec->segname = isec->segname; + thunkInfo.isec->parent = this; + StringRef thunkName = saver.save(funcSym->getName() + ".thunk." + + std::to_string(thunkInfo.sequence++)); + r.referent = thunkInfo.sym = symtab->addDefined( + thunkName, /*file=*/nullptr, thunkInfo.isec, /*value=*/0, + /*size=*/thunkSize, /*isWeakDef=*/false, /*isPrivateExtern=*/true, + /*isThumb=*/false); + target->populateThunk(thunkInfo.isec, funcSym); + finalizeOne(thunkInfo.isec); + thunks.push_back(thunkInfo.isec); + ++thunkCount; + } } size = isecAddr - addr; fileSize = isecFileOff - fileOff; + + log("thunks for " + parent->name + "," + name + + ": funcs = " + std::to_string(thunkMap.size()) + + ", relocs = " + std::to_string(relocCount) + + ", all calls = " + std::to_string(callSiteCount) + + ", thunk calls = " + std::to_string(thunkCallCount) + + ", thunks = " + std::to_string(thunkCount)); } void MergedOutputSection::writeTo(uint8_t *buf) const { - for (InputSection *isec : inputs) - isec->writeTo(buf + isec->outSecFileOff); + // Merge input sections from thunk & ordinary vectors + size_t i = 0, ie = inputs.size(); + size_t t = 0, te = thunks.size(); + while (i < ie || t < te) { + while (i < ie && (t == te || inputs[i]->getSize() == 0 || + inputs[i]->outSecOff < thunks[t]->outSecOff)) { + inputs[i]->writeTo(buf + inputs[i]->outSecFileOff); + ++i; + } + while (t < te && (i == ie || thunks[t]->outSecOff < inputs[i]->outSecOff)) { + thunks[t]->writeTo(buf + thunks[t]->outSecFileOff); + ++t; + } + } } // TODO: this is most likely wrong; reconsider how section flags diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -10,6 +10,8 @@ def help_hidden : Flag<["--"], "help-hidden">, HelpText<"Display help for hidden options">, Group; +def verbose : Flag<["--"], "verbose">, + Group; def 
error_limit_eq : Joined<["--"], "error-limit=">, HelpText<"Maximum number of errors to print before exiting (default: 20)">, Group; diff --git a/lld/MachO/Relocations.h b/lld/MachO/Relocations.h --- a/lld/MachO/Relocations.h +++ b/lld/MachO/Relocations.h @@ -52,8 +52,9 @@ struct Reloc { uint8_t type = llvm::MachO::GENERIC_RELOC_INVALID; - bool pcrel = false; uint8_t length = 0; + bool pcrel = false; + // The offset from the start of the subsection that this relocation belongs // to. uint64_t offset = 0; diff --git a/lld/MachO/Symbols.h b/lld/MachO/Symbols.h --- a/lld/MachO/Symbols.h +++ b/lld/MachO/Symbols.h @@ -72,6 +72,16 @@ // Whether this symbol is in the StubsSection. bool isInStubs() const { return stubsIndex != UINT32_MAX; } + uint64_t getStubVA() const; + uint64_t getGotVA() const; + uint64_t getTlvVA() const; + uint64_t resolveBranchVA() const { + assert(isa(this) || isa(this)); + return isInStubs() ? getStubVA() : getVA(); + } + uint64_t resolveGotVA() const { return isInGot() ? getGotVA() : getVA(); } + uint64_t resolveTlvVA() const { return isInGot() ? getTlvVA() : getVA(); } + // The index of this symbol in the GOT or the TLVPointer section, depending // on whether it is a thread-local. A given symbol cannot be referenced by // both these sections at once. 
@@ -207,6 +217,7 @@ : Symbol(DylibKind, name, file), refState(refState), weakDef(isWeakDef), tlv(isTlv) {} + uint64_t getVA() const override; bool isWeakDef() const override { return weakDef; } bool isWeakRef() const override { return refState == RefState::Weak; } bool isReferenced() const { return refState != RefState::Unreferenced; } diff --git a/lld/MachO/Symbols.cpp b/lld/MachO/Symbols.cpp --- a/lld/MachO/Symbols.cpp +++ b/lld/MachO/Symbols.cpp @@ -27,9 +27,25 @@ return demangle(b.getName()); } +uint64_t Symbol::getStubVA() const { return in.stubs->getVA(stubsIndex); } +uint64_t Symbol::getGotVA() const { return in.got->getVA(gotIndex); } +uint64_t Symbol::getTlvVA() const { return in.tlvPointers->getVA(gotIndex); } + uint64_t Defined::getVA() const { if (isAbsolute()) return value; + + if (!isec->isFinal) { + // A target arch that does not use thunks ought never ask for + // the address of a function that has not yet been finalized. + assert(target->usesThunks()); + + // MergedOutputSection::finalize() can seek the address of a + // function before its address is assigned. The thunking algorithm + // knows that unfinalized functions will be out of range, so it is + // expedient to return a contrived out-of-range address. + return target->outOfRangeVA; + } return isec->getVA() + value; } @@ -42,4 +58,8 @@ return isec->getFileOffset() + value; } +uint64_t DylibSymbol::getVA() const { + return isInStubs() ? 
getStubVA() : Symbol::getVA(); +} + void LazySymbol::fetchArchiveMember() { getFile()->fetch(sym); } diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -123,6 +123,10 @@ void addEntry(Symbol *sym); + uint64_t getVA(uint32_t gotIndex) const { + return addr + gotIndex * target->wordSize; + } + private: llvm::SetVector entries; }; @@ -285,11 +289,21 @@ StubsSection(); uint64_t getSize() const override; bool isNeeded() const override { return !entries.empty(); } + void finalize() override; void writeTo(uint8_t *buf) const override; const llvm::SetVector &getEntries() const { return entries; } // Returns whether the symbol was added. Note that every stubs entry will // have a corresponding entry in the LazyPointerSection. bool addEntry(Symbol *); + uint64_t getVA(uint32_t stubsIndex) const { + // MergedOutputSection::finalize() can seek the address of a + // stub before its address is assigned. Before __stubs is + // finalized, return a contrived out-of-range address. + return isFinal ? addr + stubsIndex * target->stubSize + : TargetInfo::outOfRangeVA; + } + + bool isFinal = false; // is address assigned? private: llvm::SetVector entries; diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -64,6 +64,7 @@ // Setting the index to 1 to pretend that this section is the text // section. index = 1; + isec->isFinal = true; } void MachHeaderSection::addLoadCommand(LoadCommand *lc) { @@ -425,6 +426,8 @@ } } +void StubsSection::finalize() { isFinal = true; } + bool StubsSection::addEntry(Symbol *sym) { bool inserted = entries.insert(sym); if (inserted) @@ -1101,12 +1104,12 @@ // __TEXT, __text) // Otherwise, it's an absolute symbol. 
if (config->isPic) - symtab->addSynthetic("__mh_execute_header", in.header->isec, 0, + symtab->addSynthetic("__mh_execute_header", in.header->isec, /*value=*/0, /*privateExtern=*/false, /*includeInSymtab=*/true); else symtab->addSynthetic("__mh_execute_header", - /*isec*/ nullptr, 0, + /*isec*/ nullptr, /*value=*/0, /*privateExtern=*/false, /*includeInSymtab=*/true); break; diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -24,6 +24,7 @@ LLVM_ENABLE_BITMASK_ENUMS_IN_NAMESPACE(); class Symbol; +class Defined; class DylibSymbol; class InputSection; @@ -65,10 +66,16 @@ virtual uint64_t getPageSize() const = 0; + virtual void populateThunk(InputSection *thunk, Symbol *funcSym) { + llvm_unreachable("target does not use thunks"); + } + bool hasAttr(uint8_t type, RelocAttrBits bit) const { return getRelocAttrs(type).hasAttr(bit); } + bool usesThunks() const { return thunkSize > 0; } + uint32_t magic; llvm::MachO::CPUType cpuType; uint32_t cpuSubtype; @@ -79,6 +86,13 @@ size_t stubHelperHeaderSize; size_t stubHelperEntrySize; size_t wordSize; + + size_t thunkSize = 0; + uint64_t branchRange = 0; + + // We contrive this value as sufficiently far from any valid address + // that it will always be out-of-range for any architecture. + static constexpr uint64_t outOfRangeVA = 0xfull << 60; }; TargetInfo *createX86_64TargetInfo(); diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -511,7 +511,7 @@ } // namespace -// Adds stubs and bindings where necessary (e.g. if the symbol is a +// Add stubs and bindings where necessary (e.g. if the symbol is a // DylibSymbol.) 
static void prepareBranchTarget(Symbol *sym) { if (auto *dysym = dyn_cast(sym)) { @@ -535,7 +535,7 @@ } } } else { - assert(false && "invalid symbol type for branch"); + llvm_unreachable("invalid branch target symbol type"); } } @@ -548,12 +548,16 @@ return false; } -static void prepareSymbolRelocation(Symbol *sym, const InputSection *isec, - const Reloc &r) { +static void prepareSymbolRelocation(Symbol *sym, InputSection *isec, Reloc &r) { const RelocAttrs &relocAttrs = target->getRelocAttrs(r.type); if (relocAttrs.hasAttr(RelocAttrBits::BRANCH)) { prepareBranchTarget(sym); + if (target->usesThunks()) { + ThunkInfo &thunkInfo = thunkMap[sym]; + ++thunkInfo.callSiteCount; + ++isec->callSiteCount; + } } else if (relocAttrs.hasAttr(RelocAttrBits::GOT)) { if (relocAttrs.hasAttr(RelocAttrBits::POINTER) || needsBinding(sym)) in.got->addEntry(sym); @@ -958,8 +962,6 @@ seg->vmSize = addr - seg->firstSection()->addr; seg->fileSize = fileOff - seg->fileOff; } - - // FIXME(gkm): create branch-extension thunks here, then adjust addresses } void Writer::finalizeLinkEditSegment() { @@ -1062,7 +1064,11 @@ in.stubHelper->setup(); scanSymbols(); createOutputSections(); - // No more sections nor segments are created beyond this point. + // After this point, we create no new segments; HOWEVER, we might + // yet create branch-range extension thunks for architectures whose + // hardware call instructions have limited range, e.g., ARM(64). + // The thunks are created as InputSections interspersed among + // the ordinary __TEXT,__text InputSections. 
sortSegmentsAndSections(); createLoadCommands(); finalizeAddresses(); diff --git a/lld/test/MachO/arm64-thunks.s b/lld/test/MachO/arm64-thunks.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/arm64-thunks.s @@ -0,0 +1,300 @@ +# REQUIRES: aarch64 + +## Check for the following: +## (1) address match between thunk definitions and call destinations +## (2) address match between thunk page+offset computations and function definitions +## (3) a second thunk is created when the first one goes out of range +## (4) early calls to a dylib stub use a thunk, and later calls the stub directly +## Notes: +## 0x4000000 = 64 Mi = half the magnitude of the forward-branch range + +# RUN: rm -rf %t; mkdir %t +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t/input.o +# RUN: %lld -arch arm64 -lSystem -o %t/thunk %t/input.o +# RUN: llvm-objdump -d --no-show-raw-insn %t/thunk | FileCheck %s + +# CHECK: Disassembly of section __TEXT,__text: + +# CHECK: [[#%.13x, A_PAGE:]][[#%.3x, A_OFFSET:]] <_a>: +# CHECK: bl 0x[[#%x, A:]] <_a> +# CHECK: bl 0x[[#%x, B:]] <_b> +# CHECK: bl 0x[[#%x, C:]] <_c> +# CHECK: bl 0x[[#%x, D_THUNK_0:]] <_d.thunk.0> +# CHECK: bl 0x[[#%x, E_THUNK_0:]] <_e.thunk.0> +# CHECK: bl 0x[[#%x, F_THUNK_0:]] <_f.thunk.0> +# CHECK: bl 0x[[#%x, G_THUNK_0:]] <_g.thunk.0> +# CHECK: bl 0x[[#%x, H_THUNK_0:]] <_h.thunk.0> +# CHECK: bl 0x[[#%x, NAN_THUNK_0:]] <___nan.thunk.0> + +# CHECK: [[#%.13x, B_PAGE:]][[#%.3x, B_OFFSET:]] <_b>: +# CHECK: bl 0x[[#%x, A]] <_a> +# CHECK: bl 0x[[#%x, B]] <_b> +# CHECK: bl 0x[[#%x, C]] <_c> +# CHECK: bl 0x[[#%x, D_THUNK_0]] <_d.thunk.0> +# CHECK: bl 0x[[#%x, E_THUNK_0]] <_e.thunk.0> +# CHECK: bl 0x[[#%x, F_THUNK_0]] <_f.thunk.0> +# CHECK: bl 0x[[#%x, G_THUNK_0]] <_g.thunk.0> +# CHECK: bl 0x[[#%x, H_THUNK_0]] <_h.thunk.0> +# CHECK: bl 0x[[#%x, NAN_THUNK_0]] <___nan.thunk.0> + +# CHECK: [[#%.13x, C_PAGE:]][[#%.3x, C_OFFSET:]] <_c>: +# CHECK: bl 0x[[#%x, A]] <_a> +# CHECK: bl 0x[[#%x, B]] <_b> +# CHECK: bl 0x[[#%x, C]] <_c> +# 
CHECK: bl 0x[[#%x, D:]] <_d> +# CHECK: bl 0x[[#%x, E:]] <_e> +# CHECK: bl 0x[[#%x, F_THUNK_0]] <_f.thunk.0> +# CHECK: bl 0x[[#%x, G_THUNK_0]] <_g.thunk.0> +# CHECK: bl 0x[[#%x, H_THUNK_0]] <_h.thunk.0> +# CHECK: bl 0x[[#%x, NAN_THUNK_0]] <___nan.thunk.0> + +# CHECK: [[#%x, D_THUNK_0]] <_d.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, D_PAGE:]] +# CHECK: add x16, x16, #[[#D_OFFSET:]] + +# CHECK: [[#%x, E_THUNK_0]] <_e.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, E_PAGE:]] +# CHECK: add x16, x16, #[[#E_OFFSET:]] + +# CHECK: [[#%x, F_THUNK_0]] <_f.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, F_PAGE:]] +# CHECK: add x16, x16, #[[#F_OFFSET:]] + +# CHECK: [[#%x, G_THUNK_0]] <_g.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, G_PAGE:]] +# CHECK: add x16, x16, #[[#G_OFFSET:]] + +# CHECK: [[#%x, H_THUNK_0]] <_h.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, H_PAGE:]] +# CHECK: add x16, x16, #[[#H_OFFSET:]] + +# CHECK: [[#%x, NAN_THUNK_0]] <___nan.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, NAN_PAGE:]] +# CHECK: add x16, x16, #[[#NAN_OFFSET:]] + +# CHECK: [[#%x, D_PAGE + D_OFFSET]] <_d>: +# CHECK: bl 0x[[#%x, A]] <_a> +# CHECK: bl 0x[[#%x, B]] <_b> +# CHECK: bl 0x[[#%x, C]] <_c> +# CHECK: bl 0x[[#%x, D]] <_d> +# CHECK: bl 0x[[#%x, E]] <_e> +# CHECK: bl 0x[[#%x, F_THUNK_0]] <_f.thunk.0> +# CHECK: bl 0x[[#%x, G_THUNK_0]] <_g.thunk.0> +# CHECK: bl 0x[[#%x, H_THUNK_0]] <_h.thunk.0> +# CHECK: bl 0x[[#%x, NAN_THUNK_0]] <___nan.thunk.0> + +# CHECK: [[#%x, E_PAGE + E_OFFSET]] <_e>: +# CHECK: bl 0x[[#%x, A_THUNK_0:]] <_a.thunk.0> +# CHECK: bl 0x[[#%x, B_THUNK_0:]] <_b.thunk.0> +# CHECK: bl 0x[[#%x, C]] <_c> +# CHECK: bl 0x[[#%x, D]] <_d> +# CHECK: bl 0x[[#%x, E]] <_e> +# CHECK: bl 0x[[#%x, F:]] <_f> +# CHECK: bl 0x[[#%x, G:]] <_g> +# CHECK: bl 0x[[#%x, H_THUNK_0]] <_h.thunk.0> +# CHECK: bl 0x[[#%x, NAN_THUNK_0]] <___nan.thunk.0> + +# CHECK: [[#%x, F_PAGE + F_OFFSET]] <_f>: +# CHECK: bl 0x[[#%x, A_THUNK_0]] <_a.thunk.0> +# CHECK: bl 0x[[#%x, B_THUNK_0]] <_b.thunk.0> +# CHECK: bl 0x[[#%x, C]] <_c> +# CHECK: bl 0x[[#%x, D]] 
<_d> +# CHECK: bl 0x[[#%x, E]] <_e> +# CHECK: bl 0x[[#%x, F]] <_f> +# CHECK: bl 0x[[#%x, G]] <_g> +# CHECK: bl 0x[[#%x, H_THUNK_0]] <_h.thunk.0> +# CHECK: bl 0x[[#%x, NAN_THUNK_0]] <___nan.thunk.0> + +# CHECK: [[#%x, G_PAGE + G_OFFSET]] <_g>: +# CHECK: bl 0x[[#%x, A_THUNK_0]] <_a.thunk.0> +# CHECK: bl 0x[[#%x, B_THUNK_0]] <_b.thunk.0> +# CHECK: bl 0x[[#%x, C_THUNK_0:]] <_c.thunk.0> +# CHECK: bl 0x[[#%x, D_THUNK_1:]] <_d.thunk.1> +# CHECK: bl 0x[[#%x, E]] <_e> +# CHECK: bl 0x[[#%x, F]] <_f> +# CHECK: bl 0x[[#%x, G]] <_g> +# CHECK: bl 0x[[#%x, H:]] <_h> +# CHECK: bl 0x[[#%x, STUBS:]] + +# CHECK: [[#%x, A_THUNK_0]] <_a.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, A_PAGE]]000 +# CHECK: add x16, x16, #[[#%d, A_OFFSET]] + +# CHECK: [[#%x, B_THUNK_0]] <_b.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, B_PAGE]]000 +# CHECK: add x16, x16, #[[#%d, B_OFFSET]] + +# CHECK: [[#%x, H_PAGE + H_OFFSET]] <_h>: +# CHECK: bl 0x[[#%x, A_THUNK_0]] <_a.thunk.0> +# CHECK: bl 0x[[#%x, B_THUNK_0]] <_b.thunk.0> +# CHECK: bl 0x[[#%x, C_THUNK_0]] <_c.thunk.0> +# CHECK: bl 0x[[#%x, D_THUNK_1]] <_d.thunk.1> +# CHECK: bl 0x[[#%x, E]] <_e> +# CHECK: bl 0x[[#%x, F]] <_f> +# CHECK: bl 0x[[#%x, G]] <_g> +# CHECK: bl 0x[[#%x, H]] <_h> +# CHECK: bl 0x[[#%x, STUBS]] + +# CHECK: <_main>: +# CHECK: bl 0x[[#%x, A_THUNK_0]] <_a.thunk.0> +# CHECK: bl 0x[[#%x, B_THUNK_0]] <_b.thunk.0> +# CHECK: bl 0x[[#%x, C_THUNK_0]] <_c.thunk.0> +# CHECK: bl 0x[[#%x, D_THUNK_1]] <_d.thunk.1> +# CHECK: bl 0x[[#%x, E_THUNK_1:]] <_e.thunk.1> +# CHECK: bl 0x[[#%x, F_THUNK_1:]] <_f.thunk.1> +# CHECK: bl 0x[[#%x, G]] <_g> +# CHECK: bl 0x[[#%x, H]] <_h> +# CHECK: bl 0x[[#%x, STUBS]] + +# CHECK: [[#%x, C_THUNK_0]] <_c.thunk.0>: +# CHECK: adrp x16, 0x[[#%x, C_PAGE]]000 +# CHECK: add x16, x16, #[[#%d, C_OFFSET]] + +# CHECK: [[#%x, D_THUNK_1]] <_d.thunk.1>: +# CHECK: adrp x16, 0x[[#%x, D_PAGE]] +# CHECK: add x16, x16, #[[#D_OFFSET]] + +# CHECK: [[#%x, E_THUNK_1]] <_e.thunk.1>: +# CHECK: adrp x16, 0x[[#%x, E_PAGE]] +# CHECK: add x16, x16, 
#[[#E_OFFSET]] + +# CHECK: [[#%x, F_THUNK_1]] <_f.thunk.1>: +# CHECK: adrp x16, 0x[[#%x, F_PAGE]] +# CHECK: add x16, x16, #[[#F_OFFSET]] + +# CHECK: Disassembly of section __TEXT,__stubs: + +# CHECK: [[#%x, NAN_PAGE + NAN_OFFSET]] <__stubs>: + +.subsections_via_symbols +.text + +.globl _a +.p2align 2 +_a: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + ret + +.globl _b +.p2align 2 +_b: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + .space 0x4000000-0x3c + ret + +.globl _c +.p2align 2 +_c: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + ret + +.globl _d +.p2align 2 +_d: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + .space 0x4000000-0x38 + ret + +.globl _e +.p2align 2 +_e: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + ret + +.globl _f +.p2align 2 +_f: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + .space 0x4000000-0x34 + ret + +.globl _g +.p2align 2 +_g: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + ret + +.globl _h +.p2align 2 +_h: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + .space 0x4000000-0x30 + ret + +.globl _main +.p2align 2 +_main: + bl _a + bl _b + bl _c + bl _d + bl _e + bl _f + bl _g + bl _h + bl ___nan + ret diff --git a/lld/test/MachO/tools/generate-thunkable-program.py b/lld/test/MachO/tools/generate-thunkable-program.py new file mode 100755 --- /dev/null +++ b/lld/test/MachO/tools/generate-thunkable-program.py @@ -0,0 +1,429 @@ +#!/usr/bin/env python3 + +"""Generate many skeletal functions with a thick call graph spanning a +large address space to induce lld to create branch-islands for arm64. 
+ +""" +from __future__ import print_function +import random +import argparse +import string +from pprint import pprint +from math import factorial +from itertools import permutations + +# This list comes from libSystem.tbd and contains a sizeable subset +# of dylib calls available for all MacOS target archs. +libSystem_calls = ( + "__CurrentRuneLocale", "__DefaultRuneLocale", "__Exit", "__NSGetArgc", + "__NSGetArgv", "__NSGetEnviron", "__NSGetMachExecuteHeader", + "__NSGetProgname", "__PathLocale", "__Read_RuneMagi", "___Balloc_D2A", + "___Bfree_D2A", "___ULtod_D2A", "____mb_cur_max", "____mb_cur_max_l", + "____runetype", "____runetype_l", "____tolower", "____tolower_l", + "____toupper", "____toupper_l", "___add_ovflpage", "___addel", + "___any_on_D2A", "___assert_rtn", "___b2d_D2A", "___big_delete", + "___big_insert", "___big_keydata", "___big_return", "___big_split", + "___bigtens_D2A", "___bt_close", "___bt_cmp", "___bt_defcmp", + "___bt_defpfx", "___bt_delete", "___bt_dleaf", "___bt_fd", + "___bt_free", "___bt_get", "___bt_new", "___bt_open", "___bt_pgin", + "___bt_pgout", "___bt_put", "___bt_ret", "___bt_search", "___bt_seq", + "___bt_setcur", "___bt_split", "___bt_sync", "___buf_free", + "___call_hash", "___cleanup", "___cmp_D2A", "___collate_equiv_match", + "___collate_load_error", "___collate_lookup", "___collate_lookup_l", + "___copybits_D2A", "___cxa_atexit", "___cxa_finalize", + "___cxa_finalize_ranges", "___cxa_thread_atexit", "___d2b_D2A", + "___dbpanic", "___decrement_D2A", "___default_hash", "___default_utx", + "___delpair", "___diff_D2A", "___dtoa", "___expand_table", + "___fflush", "___fgetwc", "___find_bigpair", "___find_last_page", + "___fix_locale_grouping_str", "___fread", "___free_ovflpage", + "___freedtoa", "___gdtoa", "___gdtoa_locks", "___get_buf", + "___get_page", "___gethex_D2A", "___getonlyClocaleconv", + "___hash_open", "___hdtoa", "___hexdig_D2A", "___hexdig_init_D2A", + "___hexnan_D2A", "___hi0bits_D2A", "___hldtoa", "___i2b_D2A", + 
"___ibitmap", "___increment_D2A", "___isctype", "___istype", + "___istype_l", "___ldtoa", "___libc_init", "___lo0bits_D2A", + "___log2", "___lshift_D2A", "___maskrune", "___maskrune_l", + "___match_D2A", "___mb_cur_max", "___mb_sb_limit", "___memccpy_chk", + "___memcpy_chk", "___memmove_chk", "___memset_chk", "___mult_D2A", + "___multadd_D2A", "___nrv_alloc_D2A", "___opendir2", "___ovfl_delete", + "___ovfl_get", "___ovfl_put", "___pow5mult_D2A", "___put_page", + "___quorem_D2A", "___ratio_D2A", "___rec_close", "___rec_delete", + "___rec_dleaf", "___rec_fd", "___rec_fmap", "___rec_fpipe", + "___rec_get", "___rec_iput", "___rec_open", "___rec_put", + "___rec_ret", "___rec_search", "___rec_seq", "___rec_sync", + "___rec_vmap", "___rec_vpipe", "___reclaim_buf", "___rshift_D2A", + "___rv_alloc_D2A", "___s2b_D2A", "___sF", "___sclose", "___sdidinit", + "___set_ones_D2A", "___setonlyClocaleconv", "___sflags", "___sflush", + "___sfp", "___sfvwrite", "___sglue", "___sinit", "___slbexpand", + "___smakebuf", "___snprintf_chk", "___snprintf_object_size_chk", + "___split_page", "___sprintf_chk", "___sprintf_object_size_chk", + "___sread", "___srefill", "___srget", "___sseek", "___stack_chk_fail", + "___stack_chk_guard", "___stderrp", "___stdinp", "___stdoutp", + "___stpcpy_chk", "___stpncpy_chk", "___strcat_chk", "___strcp_D2A", + "___strcpy_chk", "___strlcat_chk", "___strlcpy_chk", "___strncat_chk", + "___strncpy_chk", "___strtodg", "___strtopdd", "___sum_D2A", + "___svfscanf", "___swbuf", "___swhatbuf", "___swrite", "___swsetup", + "___tens_D2A", "___tinytens_D2A", "___tolower", "___tolower_l", + "___toupper", "___toupper_l", "___trailz_D2A", "___ulp_D2A", + "___ungetc", "___ungetwc", "___vsnprintf_chk", "___vsprintf_chk", + "___wcwidth", "___wcwidth_l", "__allocenvstate", "__atexit_receipt", + "__c_locale", "__cleanup", "__closeutx", "__copyenv", + "__cthread_init_routine", "__deallocenvstate", "__endutxent", + "__flockfile_debug_stub", "__fseeko", "__ftello", "__fwalk", + 
"__getenvp", "__getutxent", "__getutxid", "__getutxline", + "__inet_aton_check", "__init_clock_port", "__int_to_time", + "__libc_fork_child", "__libc_initializer", "__long_to_time", + "__mkpath_np", "__mktemp", "__openutx", "__os_assert_log", + "__os_assert_log_ctx", "__os_assumes_log", "__os_assumes_log_ctx", + "__os_avoid_tail_call", "__os_crash", "__os_crash_callback", + "__os_crash_fmt", "__os_debug_log", "__os_debug_log_error_str", + "__putenvp", "__pututxline", "__rand48_add", "__rand48_mult", + "__rand48_seed", "__readdir_unlocked", "__reclaim_telldir", + "__seekdir", "__setenvp", "__setutxent", "__sigaction_nobind", + "__sigintr", "__signal_nobind", "__sigvec_nobind", "__sread", + "__sseek", "__subsystem_init", "__swrite", "__time32_to_time", + "__time64_to_time", "__time_to_int", "__time_to_long", + "__time_to_time32", "__time_to_time64", "__unsetenvp", "__utmpxname", + "_a64l", "_abort", "_abort_report_np", "_abs", "_acl_add_flag_np", + "_acl_add_perm", "_acl_calc_mask", "_acl_clear_flags_np", + "_acl_clear_perms", "_acl_copy_entry", "_acl_copy_ext", + "_acl_copy_ext_native", "_acl_copy_int", "_acl_copy_int_native", + "_acl_create_entry", "_acl_create_entry_np", "_acl_delete_def_file", + "_acl_delete_entry", "_acl_delete_fd_np", "_acl_delete_file_np", + "_acl_delete_flag_np", "_acl_delete_link_np", "_acl_delete_perm", + "_acl_dup", "_acl_free", "_acl_from_text", "_acl_get_entry", + "_acl_get_fd", "_acl_get_fd_np", "_acl_get_file", "_acl_get_flag_np", + "_acl_get_flagset_np", "_acl_get_link_np", "_acl_get_perm_np", + "_acl_get_permset", "_acl_get_permset_mask_np", "_acl_get_qualifier", + "_acl_get_tag_type", "_acl_init", "_acl_maximal_permset_mask_np", + "_acl_set_fd", "_acl_set_fd_np", "_acl_set_file", "_acl_set_flagset_np", + "_acl_set_link_np", "_acl_set_permset", "_acl_set_permset_mask_np", + "_acl_set_qualifier", "_acl_set_tag_type", "_acl_size", "_acl_to_text", + "_acl_valid", "_acl_valid_fd_np", "_acl_valid_file_np", + "_acl_valid_link", 
"_addr2ascii", "_alarm", "_alphasort", + "_arc4random", "_arc4random_addrandom", "_arc4random_buf", + "_arc4random_stir", "_arc4random_uniform", "_ascii2addr", "_asctime", + "_asctime_r", "_asprintf", "_asprintf_l", "_asxprintf", + "_asxprintf_exec", "_atexit", "_atexit_b", "_atof", "_atof_l", + "_atoi", "_atoi_l", "_atol", "_atol_l", "_atoll", "_atoll_l", + "_backtrace", "_backtrace_from_fp", "_backtrace_image_offsets", + "_backtrace_symbols", "_backtrace_symbols_fd", "_basename", + "_basename_r", "_bcopy", "_brk", "_bsd_signal", "_bsearch", + "_bsearch_b", "_btowc", "_btowc_l", "_catclose", "_catgets", + "_catopen", "_cfgetispeed", "_cfgetospeed", "_cfmakeraw", + "_cfsetispeed", "_cfsetospeed", "_cfsetspeed", "_cgetcap", + "_cgetclose", "_cgetent", "_cgetfirst", "_cgetmatch", "_cgetnext", + "_cgetnum", "_cgetset", "_cgetstr", "_cgetustr", "_chmodx_np", + "_clearerr", "_clearerr_unlocked", "_clock", "_clock_getres", + "_clock_gettime", "_clock_gettime_nsec_np", "_clock_port", + "_clock_sem", "_clock_settime", "_closedir", "_compat_mode", + "_confstr", "_copy_printf_domain", "_creat", "_crypt", "_ctermid", + "_ctermid_r", "_ctime", "_ctime_r", "_daemon", "_daylight", + "_dbm_clearerr", "_dbm_close", "_dbm_delete", "_dbm_dirfno", + "_dbm_error", "_dbm_fetch", "_dbm_firstkey", "_dbm_nextkey", + "_dbm_open", "_dbm_store", "_dbopen", "_devname", "_devname_r", + "_difftime", "_digittoint", "_digittoint_l", "_dirfd", "_dirname", + "_dirname_r", "_div", "_dprintf", "_dprintf_l", "_drand48", + "_duplocale", "_dxprintf", "_dxprintf_exec", "_ecvt", "_encrypt", + "_endttyent", "_endusershell", "_endutxent", "_endutxent_wtmp", + "_erand48", "_err", "_err_set_exit", "_err_set_exit_b", + "_err_set_file", "_errc", "_errx", "_execl", "_execle", "_execlp", + "_execv", "_execvP", "_execvp", "_exit", "_f_prealloc", "_fchmodx_np", + "_fclose", "_fcvt", "_fdopen", "_fdopendir", "_feof", "_feof_unlocked", + "_ferror", "_ferror_unlocked", "_fflagstostr", "_fflush", "_fgetc", + "_fgetln", 
"_fgetpos", "_fgetrune", "_fgets", "_fgetwc", "_fgetwc_l", + "_fgetwln", "_fgetwln_l", "_fgetws", "_fgetws_l", "_fileno", + "_fileno_unlocked", "_filesec_dup", "_filesec_free", + "_filesec_get_property", "_filesec_init", "_filesec_query_property", + "_filesec_set_property", "_filesec_unset_property", "_flockfile", + "_fmemopen", "_fmtcheck", "_fmtmsg", "_fnmatch", "_fopen", "_fork", + "_forkpty", "_fparseln", "_fprintf", "_fprintf_l", "_fpurge", + "_fputc", "_fputrune", "_fputs", "_fputwc", "_fputwc_l", "_fputws", + "_fputws_l", "_fread", "_free_printf_comp", "_free_printf_domain", + "_freelocale", "_freopen", "_fscanf", "_fscanf_l", "_fseek", + "_fseeko", "_fsetpos", "_fstatvfs", "_fstatx_np", "_fsync_volume_np", + "_ftell", "_ftello", "_ftime", "_ftok", "_ftrylockfile", + "_fts_children", "_fts_close", "_fts_open", "_fts_open_b", + "_fts_read", "_fts_set", "_ftw", "_fungetrune", "_funlockfile", + "_funopen", "_fwide", "_fwprintf", "_fwprintf_l", "_fwrite", + "_fwscanf", "_fwscanf_l", "_fxprintf", "_fxprintf_exec", "_gcvt", + "_getbsize", "_getc", "_getc_unlocked", "_getchar", "_getchar_unlocked", + "_getcwd", "_getdate", "_getdate_err", "_getdelim", "_getdiskbyname", + "_getenv", "_gethostid", "_gethostname", "_getipv4sourcefilter", + "_getlastlogx", "_getlastlogxbyname", "_getline", "_getloadavg", + "_getlogin", "_getlogin_r", "_getmntinfo", "_getmntinfo_r_np", + "_getmode", "_getopt", "_getopt_long", "_getopt_long_only", + "_getpagesize", "_getpass", "_getpeereid", "_getprogname", "_gets", + "_getsourcefilter", "_getsubopt", "_gettimeofday", "_getttyent", + "_getttynam", "_getusershell", "_getutmp", "_getutmpx", "_getutxent", + "_getutxent_wtmp", "_getutxid", "_getutxline", "_getvfsbyname", + "_getw", "_getwc", "_getwc_l", "_getwchar", "_getwchar_l", "_getwd", + "_glob", "_glob_b", "_globfree", "_gmtime", "_gmtime_r", "_grantpt", + "_hash_create", "_hash_destroy", "_hash_purge", "_hash_search", + "_hash_stats", "_hash_traverse", "_hcreate", "_hdestroy", + 
"_heapsort", "_heapsort_b", "_hsearch", "_imaxabs", "_imaxdiv", + "_inet_addr", "_inet_aton", "_inet_lnaof", "_inet_makeaddr", + "_inet_net_ntop", "_inet_net_pton", "_inet_neta", "_inet_netof", + "_inet_network", "_inet_nsap_addr", "_inet_nsap_ntoa", "_inet_ntoa", + "_inet_ntop", "_inet_ntop4", "_inet_ntop6", "_inet_pton", + "_initstate", "_insque", "_isalnum", "_isalnum_l", "_isalpha", + "_isalpha_l", "_isascii", "_isatty", "_isblank", "_isblank_l", + "_iscntrl", "_iscntrl_l", "_isdigit", "_isdigit_l", "_isgraph", + "_isgraph_l", "_ishexnumber", "_ishexnumber_l", "_isideogram", + "_isideogram_l", "_islower", "_islower_l", "_isnumber", "_isnumber_l", + "_isphonogram", "_isphonogram_l", "_isprint", "_isprint_l", + "_ispunct", "_ispunct_l", "_isrune", "_isrune_l", "_isspace", + "_isspace_l", "_isspecial", "_isspecial_l", "_isupper", "_isupper_l", + "_iswalnum", "_iswalnum_l", "_iswalpha", "_iswalpha_l", "_iswascii", + "_iswblank", "_iswblank_l", "_iswcntrl", "_iswcntrl_l", "_iswctype", + "_iswctype_l", "_iswdigit", "_iswdigit_l", "_iswgraph", "_iswgraph_l", + "_iswhexnumber", "_iswhexnumber_l", "_iswideogram", "_iswideogram_l", + "_iswlower", "_iswlower_l", "_iswnumber", "_iswnumber_l", + "_iswphonogram", "_iswphonogram_l", "_iswprint", "_iswprint_l", + "_iswpunct", "_iswpunct_l", "_iswrune", "_iswrune_l", "_iswspace", + "_iswspace_l", "_iswspecial", "_iswspecial_l", "_iswupper", + "_iswupper_l", "_iswxdigit", "_iswxdigit_l", "_isxdigit", + "_isxdigit_l", "_jrand48", "_kOSThermalNotificationPressureLevelName", + "_killpg", "_l64a", "_labs", "_lchflags", "_lchmod", "_lcong48", + "_ldiv", "_lfind", "_link_addr", "_link_ntoa", "_llabs", "_lldiv", + "_localeconv", "_localeconv_l", "_localtime", "_localtime_r", + "_lockf", "_login", "_login_tty", "_logout", "_logwtmp", "_lrand48", + "_lsearch", "_lstatx_np", "_lutimes", "_mblen", "_mblen_l", + "_mbmb", "_mbrlen", "_mbrlen_l", "_mbrrune", "_mbrtowc", "_mbrtowc_l", + "_mbrune", "_mbsinit", "_mbsinit_l", "_mbsnrtowcs", 
"_mbsnrtowcs_l", + "_mbsrtowcs", "_mbsrtowcs_l", "_mbstowcs", "_mbstowcs_l", "_mbtowc", + "_mbtowc_l", "_memmem", "_memset_s", "_mergesort", "_mergesort_b", + "_mkdirx_np", "_mkdtemp", "_mkdtempat_np", "_mkfifox_np", + "_mkostemp", "_mkostemps", "_mkostempsat_np", "_mkpath_np", + "_mkpathat_np", "_mkstemp", "_mkstemp_dprotected_np", "_mkstemps", + "_mkstempsat_np", "_mktemp", "_mktime", "_monaddition", "_moncontrol", + "_moncount", "_moninit", "_monitor", "_monoutput", "_monreset", + "_monstartup", "_mpool_close", "_mpool_filter", "_mpool_get", + "_mpool_new", "_mpool_open", "_mpool_put", "_mpool_sync", "_mrand48", + "_nanosleep", "_new_printf_comp", "_new_printf_domain", "_newlocale", + "_nextwctype", "_nextwctype_l", "_nftw", "_nice", "_nl_langinfo", + "_nl_langinfo_l", "_nrand48", "_nvis", "_off32", "_off64", + "_offtime", "_open_memstream", "_open_with_subsystem", + "_open_wmemstream", "_opendev", "_opendir", "_openpty", "_openx_np", + "_optarg", "_opterr", "_optind", "_optopt", "_optreset", "_pause", + "_pclose", "_perror", "_popen", "_posix2time", "_posix_openpt", + "_posix_spawnp", "_printf", "_printf_l", "_psignal", "_psort", + "_psort_b", "_psort_r", "_ptsname", "_ptsname_r", "_putc", + "_putc_unlocked", "_putchar", "_putchar_unlocked", "_putenv", + "_puts", "_pututxline", "_putw", "_putwc", "_putwc_l", "_putwchar", + "_putwchar_l", "_qsort", "_qsort_b", "_qsort_r", "_querylocale", + "_radixsort", "_raise", "_rand", "_rand_r", "_random", "_rb_tree_count", + "_rb_tree_find_node", "_rb_tree_find_node_geq", "_rb_tree_find_node_leq", + "_rb_tree_init", "_rb_tree_insert_node", "_rb_tree_iterate", + "_rb_tree_remove_node", "_readdir", "_readdir_r", "_readpassphrase", + "_reallocf", "_realpath", "_recv", "_regcomp", "_regcomp_l", + "_regerror", "_regexec", "_regfree", "_register_printf_domain_function", + "_register_printf_domain_render_std", "_regncomp", "_regncomp_l", + "_regnexec", "_regwcomp", "_regwcomp_l", "_regwexec", "_regwncomp", + "_regwncomp_l", 
"_regwnexec", "_remove", "_remque", "_rewind", + "_rewinddir", "_rindex", "_rpmatch", "_sbrk", "_scandir", + "_scandir_b", "_scanf", "_scanf_l", "_seed48", "_seekdir", "_send", + "_setbuf", "_setbuffer", "_setenv", "_sethostid", "_sethostname", + "_setinvalidrune", "_setipv4sourcefilter", "_setkey", "_setlinebuf", + "_setlocale", "_setlogin", "_setmode", "_setpgrp", "_setprogname", + "_setrgid", "_setruid", "_setrunelocale", "_setsourcefilter", + "_setstate", "_settimeofday", "_setttyent", "_setusershell", + "_setutxent", "_setutxent_wtmp", "_setvbuf", "_sigaction", + "_sigaddset", "_sigaltstack", "_sigblock", "_sigdelset", + "_sigemptyset", "_sigfillset", "_sighold", "_sigignore", + "_siginterrupt", "_sigismember", "_signal", "_sigpause", "_sigrelse", + "_sigset", "_sigsetmask", "_sigvec", "_skip", "_sl_add", "_sl_find", + "_sl_free", "_sl_init", "_sleep", "_snprintf", "_snprintf_l", + "_snvis", "_sockatmark", "_sprintf", "_sprintf_l", "_sradixsort", + "_srand", "_srand48", "_sranddev", "_srandom", "_srandomdev", + "_sscanf", "_sscanf_l", "_stat_with_subsystem", "_statvfs", + "_statx_np", "_stpcpy", "_stpncpy", "_strcasecmp", "_strcasecmp_l", + "_strcasestr", "_strcasestr_l", "_strcat", "_strcoll", "_strcoll_l", + "_strcspn", "_strdup", "_strenvisx", "_strerror", "_strerror_r", + "_strfmon", "_strfmon_l", "_strftime", "_strftime_l", "_strmode", + "_strncasecmp", "_strncasecmp_l", "_strncat", "_strndup", "_strnstr", + "_strnunvis", "_strnunvisx", "_strnvis", "_strnvisx", "_strpbrk", + "_strptime", "_strptime_l", "_strrchr", "_strsenvisx", "_strsep", + "_strsignal", "_strsignal_r", "_strsnvis", "_strsnvisx", "_strspn", + "_strsvis", "_strsvisx", "_strtod", "_strtod_l", "_strtof", + "_strtof_l", "_strtofflags", "_strtoimax", "_strtoimax_l", + "_strtok", "_strtok_r", "_strtol", "_strtol_l", "_strtold", + "_strtold_l", "_strtoll", "_strtoll_l", "_strtonum", "_strtoq", + "_strtoq_l", "_strtoul", "_strtoul_l", "_strtoull", "_strtoull_l", + "_strtoumax", "_strtoumax_l", 
"_strtouq", "_strtouq_l", "_strunvis", + "_strunvisx", "_strvis", "_strvisx", "_strxfrm", "_strxfrm_l", + "_suboptarg", "_svis", "_swab", "_swprintf", "_swprintf_l", + "_swscanf", "_swscanf_l", "_sxprintf", "_sxprintf_exec", + "_sync_volume_np", "_sys_errlist", "_sys_nerr", "_sys_siglist", + "_sys_signame", "_sysconf", "_sysctl", "_sysctlbyname", + "_sysctlnametomib", "_system", "_tcdrain", "_tcflow", "_tcflush", + "_tcgetattr", "_tcgetpgrp", "_tcgetsid", "_tcsendbreak", "_tcsetattr", + "_tcsetpgrp", "_tdelete", "_telldir", "_tempnam", "_tfind", + "_thread_stack_pcs", "_time", "_time2posix", "_timegm", "_timelocal", + "_timeoff", "_times", "_timespec_get", "_timezone", "_timingsafe_bcmp", + "_tmpfile", "_tmpnam", "_toascii", "_tolower", "_tolower_l", + "_toupper", "_toupper_l", "_towctrans", "_towctrans_l", "_towlower", + "_towlower_l", "_towupper", "_towupper_l", "_tre_ast_new_catenation", + "_tre_ast_new_iter", "_tre_ast_new_literal", "_tre_ast_new_node", + "_tre_ast_new_union", "_tre_compile", "_tre_fill_pmatch", + "_tre_free", "_tre_mem_alloc_impl", "_tre_mem_destroy", + "_tre_mem_new_impl", "_tre_parse", "_tre_stack_destroy", + "_tre_stack_new", "_tre_stack_num_objects", "_tre_tnfa_run_backtrack", + "_tre_tnfa_run_parallel", "_tsearch", "_ttyname", "_ttyname_r", + "_ttyslot", "_twalk", "_tzname", "_tzset", "_tzsetwall", "_ualarm", + "_ulimit", "_umaskx_np", "_uname", "_ungetc", "_ungetwc", + "_ungetwc_l", "_unlockpt", "_unsetenv", "_unvis", "_uselocale", + "_usleep", "_utime", "_utmpxname", "_uuid_clear", "_uuid_compare", + "_uuid_copy", "_uuid_generate", "_uuid_generate_random", + "_uuid_generate_time", "_uuid_is_null", "_uuid_pack", "_uuid_parse", + "_uuid_unpack", "_uuid_unparse", "_uuid_unparse_lower", + "_uuid_unparse_upper", "_vasprintf", "_vasprintf_l", "_vasxprintf", + "_vasxprintf_exec", "_vdprintf", "_vdprintf_l", "_vdxprintf", + "_vdxprintf_exec", "_verr", "_verrc", "_verrx", "_vfprintf", + "_vfprintf_l", "_vfscanf", "_vfscanf_l", "_vfwprintf", 
"_vfwprintf_l", + "_vfwscanf", "_vfwscanf_l", "_vfxprintf", "_vfxprintf_exec", + "_vis", "_vprintf", "_vprintf_l", "_vscanf", "_vscanf_l", + "_vsnprintf", "_vsnprintf_l", "_vsprintf", "_vsprintf_l", "_vsscanf", + "_vsscanf_l", "_vswprintf", "_vswprintf_l", "_vswscanf", + "_vswscanf_l", "_vsxprintf", "_vsxprintf_exec", "_vwarn", "_vwarnc", + "_vwarnx", "_vwprintf", "_vwprintf_l", "_vwscanf", "_vwscanf_l", + "_vxprintf", "_vxprintf_exec", "_wait", "_wait3", "_waitpid", + "_warn", "_warnc", "_warnx", "_wcpcpy", "_wcpncpy", "_wcrtomb", + "_wcrtomb_l", "_wcscasecmp", "_wcscasecmp_l", "_wcscat", "_wcschr", + "_wcscmp", "_wcscoll", "_wcscoll_l", "_wcscpy", "_wcscspn", + "_wcsdup", "_wcsftime", "_wcsftime_l", "_wcslcat", "_wcslcpy", + "_wcslen", "_wcsncasecmp", "_wcsncasecmp_l", "_wcsncat", "_wcsncmp", + "_wcsncpy", "_wcsnlen", "_wcsnrtombs", "_wcsnrtombs_l", "_wcspbrk", + "_wcsrchr", "_wcsrtombs", "_wcsrtombs_l", "_wcsspn", "_wcsstr", + "_wcstod", "_wcstod_l", "_wcstof", "_wcstof_l", "_wcstoimax", + "_wcstoimax_l", "_wcstok", "_wcstol", "_wcstol_l", "_wcstold", + "_wcstold_l", "_wcstoll", "_wcstoll_l", "_wcstombs", "_wcstombs_l", + "_wcstoul", "_wcstoul_l", "_wcstoull", "_wcstoull_l", "_wcstoumax", + "_wcstoumax_l", "_wcswidth", "_wcswidth_l", "_wcsxfrm", "_wcsxfrm_l", + "_wctob", "_wctob_l", "_wctomb", "_wctomb_l", "_wctrans", + "_wctrans_l", "_wctype", "_wctype_l", "_wcwidth", "_wcwidth_l", + "_wmemchr", "_wmemcmp", "_wmemcpy", "_wmemmove", "_wmemset", + "_wordexp", "_wordfree", "_wprintf", "_wprintf_l", "_wscanf", + "_wscanf_l", "_wtmpxname", "_xprintf", "_xprintf_exec" +) + +def print_here_head(name): + print("""\ +(tee %s.s |llvm-mc -filetype=obj -triple %s -o %s.o) <>12) + print_here_head(name) + print("""\ +### %s size=%x calls=%x""" % (name, size, calls)) + print_function_head(4, name) + for i in range(calls): + print(" bl %sx%08x\n .p2align 4" % + ("_" if args.os == "macos" else "", + addrs[random.randint(0, len(addrs)-1)])) + if args.os == "macos": + print(" bl 
%s\n .p2align 4" % + (libSystem_calls[random.randint(0, len(libSystem_calls)-1)])) + fill = size - 4 * (calls + 1) + assert fill > 0 + print("""\ + .fill 0x%x + ret""" % (fill)) + print_here_tail() + +def random_seed(): + """Generate a seed that can easily be passed back in via --seed=STRING""" + return ''.join(random.choice(string.ascii_lowercase) for i in range(10)) + +def generate_sizes(base, megabytes): + total = 0 + while total < megabytes: + size = random.randint(0x100, 0x10000) * 0x10 + yield size + total += size + +def generate_addrs(addr, sizes): + i = 0 + while i < len(sizes): + yield addr + addr += sizes[i] + i += 1 + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + epilog="""\ +WRITEME +""") + parser.add_argument('--seed', type=str, default=random_seed(), + help='Seed the random number generator') + parser.add_argument('--size', type=int, default=None, + help='Total text size to generate, in megabytes') + parser.add_argument('--os', type=str, default="macos", + help='Target OS: macos, windows, or linux') + global args + args = parser.parse_args() + triples = { + "macos": "arm64-apple-macos", + "linux": "aarch64-pc-linux", + "windows": "aarch64-pc-windows" + } + global triple + triple = triples.get(args.os) + + print("""\ +### seed=%s triple=%s +""" % (args.seed, triple)) + + random.seed(args.seed) + + base = 0x4010 + megabytes = (int(args.size) if args.size else 512) * 1024 * 1024 + sizes = [size for size in generate_sizes(base, megabytes)] + addrs = [addr for addr in generate_addrs(base, sizes)] + + for i in range(len(addrs)): + print_function(addrs[i], sizes[i], addrs) + + print_here_head("main") + print("""\ +### _x%08x +""" % (addrs[-1] + sizes[-1])) + print_function_head(14 if args.os == "macos" else 4, "main") + print(" ret") + print_here_tail() + print("wait") + + +if __name__ == '__main__': + main()