diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -5,6 +5,7 @@ add_lld_library(lldMachO2 Arch/X86_64.cpp Driver.cpp + ExportTrie.cpp InputFiles.cpp InputSection.cpp OutputSegment.cpp diff --git a/lld/MachO/ExportTrie.h b/lld/MachO/ExportTrie.h new file mode 100644 --- /dev/null +++ b/lld/MachO/ExportTrie.h @@ -0,0 +1,41 @@ +//===- ExportTrie.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_EXPORT_TRIE_H +#define LLD_MACHO_EXPORT_TRIE_H + +#include "llvm/ADT/ArrayRef.h" + +#include + +namespace lld { +namespace macho { + +struct TrieNode; +class Symbol; + +class TrieBuilder { +public: + void addSymbol(const Symbol &sym) { exported.push_back(&sym); } + // Returns the size in bytes of the serialized trie. + size_t build(); + void writeTo(uint8_t *buf); + +private: + TrieNode *makeNode(); + void sortAndBuild(llvm::MutableArrayRef vec, size_t lastPos, + size_t pos, TrieNode *node); + + std::vector exported; + std::vector nodes; +}; + +} // namespace macho +} // namespace lld + +#endif diff --git a/lld/MachO/ExportTrie.cpp b/lld/MachO/ExportTrie.cpp new file mode 100644 --- /dev/null +++ b/lld/MachO/ExportTrie.cpp @@ -0,0 +1,218 @@ +//===- ExportTrie.cpp -----------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This is a partial implementation of the Mach-O export trie format. It's +// essentially a symbol table encoded as a compressed prefix trie, meaning that +// the common prefixes of each symbol name are shared for a more compact +// representation. For example, given two exported symbols _bar and _baz, we +// will have a trie like this: +// +// +--+--+ +// | _ba | +// +--+--+ +// / \ +// +-+-+ +-+-+ +// | r | | z | +// +-+-+ +-+-+ +// +// More documentation of the format can be found in +// llvm/tools/obj2yaml/macho2yaml.cpp. +// +//===----------------------------------------------------------------------===// + +#include "ExportTrie.h" +#include "Symbols.h" + +#include "lld/Common/Memory.h" +#include "llvm/ADT/Optional.h" +#include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/LEB128.h" + +using namespace llvm; +using namespace llvm::MachO; +using namespace lld; +using namespace lld::macho; + +namespace { + +struct Edge { + Edge(StringRef s, TrieNode *node) : substring(s), child(node) {} + + StringRef substring; + struct TrieNode *child; +}; + +} // namespace + +namespace lld { +namespace macho { + +struct ExportInfo { + uint64_t address; +}; + +struct TrieNode { + std::vector edges; + Optional info; + // Estimated offset from the start of the serialized trie to the current node. + // This will converge to the true offset when updateOffset() is run to a + // fixpoint. + size_t offset = 0; + + bool updateOffset(size_t &nextOffset); + void writeTo(uint8_t *buf); +}; + +bool TrieNode::updateOffset(size_t &nextOffset) { + size_t nodeSize = 1; // Length when no Symbol info + if (info) { + uint64_t flags = 0; + nodeSize = getULEB128Size(flags) + getULEB128Size(info->address); + // Overall node size so far is uleb128 of Symbol info + actual Symbol info. + nodeSize += getULEB128Size(nodeSize); + } + // Compute size of all child edges. + ++nodeSize; // Byte for number of children. + for (Edge &edge : edges) { + nodeSize += edge.substring.size() + 1 // String length. + + getULEB128Size(edge.child->offset); // Offset len. + } + // On input, 'nextOffset' is the new preferred location for this node. + bool result = (offset != nextOffset); + // Store new location in node object for use by parents. + offset = nextOffset; + nextOffset += nodeSize; + return result; +} + +void TrieNode::writeTo(uint8_t *buf) { + buf += offset; + if (info) { + // TrieNodes with Symbol info: size, flags address + uint64_t flags = 0; // TODO: emit proper flags + uint32_t nodeSize = getULEB128Size(flags) + getULEB128Size(info->address); + assert(nodeSize < 256); + *buf++ = nodeSize; + buf += encodeULEB128(flags, buf); + buf += encodeULEB128(info->address, buf); + } else { + // TrieNode with no Symbol info. + *buf++ = 0; // nodeSize + } + // Add number of children. TODO: Handle case where we have more than 256. + assert(edges.size() < 256); + *buf++ = edges.size(); + // Append each child edge substring and node offset. + for (const Edge &edge : edges) { + memcpy(buf, edge.substring.data(), edge.substring.size()); + buf += edge.substring.size(); + *buf++ = '\0'; + buf += encodeULEB128(edge.child->offset, buf); + } +} + +TrieNode *TrieBuilder::makeNode() { + auto node = make(); + nodes.emplace_back(node); + return node; +} + +static int charAt(const Symbol *sym, size_t pos) { + StringRef str = sym->getName(); + if (pos >= str.size()) + return -1; + return str[pos]; +} + +// Build the trie by performing a three-way radix quicksort: We start by sorting +// the strings by their first characters, then sort the strings with the same +// first characters by their second characters, and so on recursively. Each +// time the prefixes diverge, we add a node to the trie. +// +// node: The most recently created node along this path in the trie (i.e. +// the furthest from the root.) +// lastPos: The prefix length of the most recently created node, i.e. the number +// of characters along its path from the root. +// pos: The string index we are currently sorting on. Note that each symbol +// S contained in vec has the same prefix S[0...pos). +void TrieBuilder::sortAndBuild(MutableArrayRef vec, + size_t lastPos, size_t pos, TrieNode *node) { +tailcall: + if (vec.size() == 0) + return; + + // Partition items so that items in [0, i) are less than the pivot, + // [i, j) are the same as the pivot, and [j, vec.size()) are greater than + // the pivot. + const Symbol *pivotSymbol = vec[0]; + int pivot = charAt(pivotSymbol, pos); + size_t i = 0; + size_t j = vec.size(); + for (size_t k = 1; k < j;) { + int c = charAt(vec[k], pos); + if (c < pivot) + std::swap(vec[i++], vec[k++]); + else if (c > pivot) + std::swap(vec[--j], vec[k]); + else + k++; + } + + bool isTerminal = pivot == -1; + bool prefixesDiverge = i != 0 || j != vec.size(); + if (lastPos != pos && (isTerminal || prefixesDiverge)) { + TrieNode *newNode = makeNode(); + node->edges.emplace_back(pivotSymbol->getName().slice(lastPos, pos), + newNode); + node = newNode; + lastPos = pos; + } + + sortAndBuild(vec.slice(0, i), lastPos, pos, node); + sortAndBuild(vec.slice(j), lastPos, pos, node); + + if (isTerminal) { + assert(j - i == 1); // no duplicate symbols + node->info = {pivotSymbol->getVA() + ImageBase}; + } else { + // This is the tail-call-optimized version of the following call: + // multikeySort(vec.slice(i, j - i), lastPos, pos + 1, node); + vec = vec.slice(i, j - i); + ++pos; + goto tailcall; + } +} + +size_t TrieBuilder::build() { + if (exported.empty()) + return 0; + + TrieNode *root = makeNode(); + sortAndBuild(exported, 0, 0, root); + + // Assign each node in the vector an offset in the trie stream, iterating + // until all uleb128 sizes have stabilized. + size_t offset; + bool more; + do { + offset = 0; + more = false; + for (TrieNode *node : nodes) + more |= node->updateOffset(offset); + } while (more); + + return offset; +} + +void TrieBuilder::writeTo(uint8_t *buf) { + for (TrieNode *node : nodes) + node->writeTo(buf); +} + +} // namespace macho +} // namespace lld diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -9,6 +9,7 @@ #ifndef LLD_MACHO_SYNTHETIC_SECTIONS_H #define LLD_MACHO_SYNTHETIC_SECTIONS_H +#include "ExportTrie.h" #include "InputSection.h" #include "Target.h" #include "llvm/ADT/SetVector.h" @@ -101,14 +102,16 @@ public: ExportSection(); void finalizeContents(); - size_t getSize() const override { return contents.size(); } + size_t getSize() const override { return size; } // Like other sections in __LINKEDIT, the export section is special: its // offsets are recorded in the LC_DYLD_INFO_ONLY load command, instead of in // section headers. bool isHidden() const override { return true; } void writeTo(uint8_t *buf) override; - SmallVector contents; +private: + TrieBuilder trieBuilder; + size_t size = 0; }; // Stores the strings referenced by the symbol table. diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -8,6 +8,7 @@ #include "SyntheticSections.h" #include "Config.h" +#include "ExportTrie.h" #include "InputFiles.h" #include "OutputSegment.h" #include "SymbolTable.h" @@ -136,38 +137,14 @@ } void ExportSection::finalizeContents() { - raw_svector_ostream os{contents}; - std::vector exported; // TODO: We should check symbol visibility. for (const Symbol *sym : symtab->getSymbols()) if (auto *defined = dyn_cast(sym)) - exported.push_back(defined); - - if (exported.empty()) - return; - - if (exported.size() > 1) { - error("TODO: Unable to export more than 1 symbol"); - return; - } - - auto *sym = exported.front(); - os << (char)0; // Indicates non-leaf node - os << (char)1; // # of children - os << sym->getName() << '\0'; - encodeULEB128(sym->getName().size() + 4, os); // Leaf offset - - // Leaf node - uint64_t addr = sym->getVA() + ImageBase; - os << (char)(1 + getULEB128Size(addr)); - os << (char)0; // Flags - encodeULEB128(addr, os); - os << (char)0; // Terminator + trieBuilder.addSymbol(*defined); + size = trieBuilder.build(); } -void ExportSection::writeTo(uint8_t *buf) { - memcpy(buf, contents.data(), contents.size()); -} +void ExportSection::writeTo(uint8_t *buf) { trieBuilder.writeTo(buf); } SymtabSection::SymtabSection(StringPoolSection &stringPoolSection) : stringPoolSection(stringPoolSection) { diff --git a/lld/test/MachO/Inputs/libhello.s b/lld/test/MachO/Inputs/libhello.s --- a/lld/test/MachO/Inputs/libhello.s +++ b/lld/test/MachO/Inputs/libhello.s @@ -1,5 +1,8 @@ .section __TEXT,__cstring -.globl _hello_world +.globl _hello_world, _hello_its_me _hello_world: .asciz "Hello world!\n" + +_hello_its_me: +.asciz "Hello, it's me\n" diff --git a/lld/test/MachO/dylink.s b/lld/test/MachO/dylink.s --- a/lld/test/MachO/dylink.s +++ b/lld/test/MachO/dylink.s @@ -15,12 +15,16 @@ # CHECK: movq [[#%u, HELLO_OFF:]](%rip), %rsi # CHECK-NEXT: [[#%x, HELLO_RIP:]]: +# CHECK: movq [[#%u, HELLO_ITS_ME_OFF:]](%rip), %rsi +# CHECK-NEXT: [[#%x, HELLO_ITS_ME_RIP:]]: + # CHECK: movq [[#%u, GOODBYE_OFF:]](%rip), %rsi # CHECK-NEXT: [[#%x, GOODBYE_RIP:]]: # CHECK-LABEL: Bind table: -# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, HELLO_RIP + HELLO_OFF]] pointer 0 libhello _hello_world -# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, GOODBYE_RIP + GOODBYE_OFF]] pointer 0 libgoodbye _goodbye_world +# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, HELLO_RIP + HELLO_OFF]] pointer 0 libhello _hello_world +# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, HELLO_ITS_ME_RIP + HELLO_ITS_ME_OFF]] pointer 0 libhello _hello_its_me +# CHECK-DAG: __DATA_CONST __got 0x{{0*}}[[#%x, GOODBYE_RIP + GOODBYE_OFF]] pointer 0 libgoodbye _goodbye_world .section __TEXT,__text .globl _main @@ -34,6 +38,12 @@ movl $0x2000004, %eax # write() syscall mov $1, %rdi # stdout + movq _hello_its_me@GOTPCREL(%rip), %rsi + mov $15, %rdx # length of str + syscall + + movl $0x2000004, %eax # write() syscall + mov $1, %rdi # stdout movq _goodbye_world@GOTPCREL(%rip), %rsi mov $15, %rdx # length of str syscall diff --git a/lld/test/MachO/export-trie.s b/lld/test/MachO/export-trie.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/export-trie.s @@ -0,0 +1,44 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: lld -flavor darwinnew -dylib %t.o -o %t.dylib + +# RUN: llvm-objdump --syms --exports-trie %t.dylib | \ +# RUN: FileCheck %s --check-prefix=EXPORTS +# EXPORTS-LABEL: SYMBOL TABLE: +# EXPORTS-DAG: [[#%x, HELLO_ADDR:]] {{.*}} _hello +# EXPORTS-DAG: [[#%x, HELLO_WORLD_ADDR:]] {{.*}} _hello_world +# EXPORTS-DAG: [[#%x, HELLO_ITS_ME_ADDR:]] {{.*}} _hello_its_me +# EXPORTS-DAG: [[#%x, HELLO_ITS_YOU_ADDR:]] {{.*}} _hello_its_you +# EXPORTS-LABEL: Exports trie: +# EXPORTS-DAG: 0x{{0*}}[[#%X, HELLO_ADDR]] _hello +# EXPORTS-DAG: 0x{{0*}}[[#%X, HELLO_WORLD_ADDR]] _hello_world +# EXPORTS-DAG: 0x{{0*}}[[#%x, HELLO_ITS_ME_ADDR:]] _hello_its_me +# EXPORTS-DAG: 0x{{0*}}[[#%x, HELLO_ITS_YOU_ADDR:]] _hello_its_you + +## Check that we are sharing prefixes in the trie. +# RUN: obj2yaml %t.dylib | FileCheck %s +# CHECK-LABEL: ExportTrie: +# CHECK: Name: '' +# CHECK: Name: _hello +# CHECK: Name: _ +# CHECK: Name: world +# CHECK: Name: its_ +# CHECK: Name: me +# CHECK: Name: you + +.section __TEXT,__cstring +.globl _hello, _hello_world, _hello_its_me, _hello_its_you + +## Test for when an entire symbol name is a prefix of another. +_hello: +.asciz "Hello!\n" + +_hello_world: +.asciz "Hello world!\n" + +.data +_hello_its_me: +.asciz "Hello, it's me\n" + +_hello_its_you: +.asciz "Hello, it's you\n" diff --git a/lld/test/MachO/no-exports-dylib.s b/lld/test/MachO/no-exports-dylib.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/no-exports-dylib.s @@ -0,0 +1,6 @@ +# REQUIRES: x86 +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: lld -flavor darwinnew -dylib %t.o -o %t.dylib + +# RUN: obj2yaml %t.dylib | FileCheck %s +# CHECK: export_size: 0 diff --git a/lld/test/MachO/symtab.s b/lld/test/MachO/symtab.s --- a/lld/test/MachO/symtab.s +++ b/lld/test/MachO/symtab.s @@ -14,10 +14,41 @@ # CHECK-NEXT: ] # CHECK-NEXT: Value: # CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: bar +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __text (0x1) +# CHECK-NEXT: RefType: +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: foo +# CHECK-NEXT: Extern +# CHECK-NEXT: Type: Section (0xE) +# CHECK-NEXT: Section: __data +# CHECK-NEXT: RefType: +# CHECK-NEXT: Flags [ (0x0) +# CHECK-NEXT: ] +# CHECK-NEXT: Value: +# CHECK-NEXT: } # CHECK-NEXT: ] +.data +.global foo +foo: + .asciz "Hello world!\n" + +.text +.global bar .global _main _main: mov $0, %rax ret + +bar: + mov $2, %rax + ret