diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -849,15 +849,12 @@ // We populate subsections by repeatedly splitting the last (highest // address) subsection. llvm::stable_sort(symbolIndices, [&](uint32_t lhs, uint32_t rhs) { - // Put private-label symbols that have no flags after other symbols at the - // same address. - StringRef lhsName = getSymName(nList[lhs]); - StringRef rhsName = getSymName(nList[rhs]); - if (nList[lhs].n_value == nList[rhs].n_value) { - if (isPrivateLabel(lhsName) && isPrivateLabel(rhsName)) - return nList[lhs].n_desc > nList[rhs].n_desc; - return !isPrivateLabel(lhsName) && isPrivateLabel(rhsName); - } + // Put extern weak symbols after other symbols at the same address so + // that weak symbol coalescing works correctly. See + // SymbolTable::addDefined() for details. + if (nList[lhs].n_value == nList[rhs].n_value && + nList[lhs].n_type & N_EXT && nList[rhs].n_type & N_EXT) + return !(nList[lhs].n_desc & N_WEAK_DEF) && (nList[rhs].n_desc & N_WEAK_DEF); return nList[lhs].n_value < nList[rhs].n_value; }); for (size_t j = 0; j < symbolIndices.size(); ++j) { @@ -883,17 +880,8 @@ if (!subsectionsViaSymbols || symbolOffset == 0 || sym.n_desc & N_ALT_ENTRY || !isa(isec)) { isec->hasAltEntry = symbolOffset != 0; - // If we have an private-label symbol that's an alias, and that alias - // doesn't have any flags of its own, then we can just reuse the aliased - // symbol. Our sorting step above ensures that any such symbols will - // appear after the non-private-label ones. See weak-def-alias-ignored.s - // for the motivation behind this. - if (symbolOffset == 0 && isPrivateLabel(name) && j != 0 && - sym.n_desc == 0) - symbols[symIndex] = symbols[symbolIndices[j - 1]]; - else - symbols[symIndex] = createDefined(sym, name, isec, symbolOffset, - symbolSize, forceHidden); + symbols[symIndex] = createDefined(sym, name, isec, symbolOffset, + symbolSize, forceHidden); continue; } auto *concatIsec = cast(isec); diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -61,6 +61,38 @@ SmallVector dupSymDiags; } // namespace +// Move symbols at \p fromOff in \p fromIsec into \p toIsec, unless that symbol +// is \p skip. +static void transplantSymbolsAtOffset(InputSection *fromIsec, + InputSection *toIsec, Defined *skip, + uint64_t fromOff, uint64_t toOff) { + // Ensure the symbols will still be in address order after our insertions. + auto insertIt = llvm::upper_bound(toIsec->symbols, toOff, + [](uint64_t off, const Symbol *s) { + return cast(s)->value < off; + }); + llvm::erase_if(fromIsec->symbols, [&](Symbol *s) { + auto *d = cast(s); + if (d->value != fromOff) + return false; + if (d != skip) { + // This repeated insertion will be quadratic unless insertIt is the end + // iterator. However, that is typically the case for files that have + // .subsections_via_symbols set. + insertIt = toIsec->symbols.insert(insertIt, d); + d->isec = toIsec; + d->value = toOff; + // We don't want to have more than one unwindEntry at a given address, so + // drop the redundant ones. We We can safely drop the unwindEntries of + // the symbols in fromIsec since we will be adding another unwindEntry as + // we finish parsing toIsec's file. (We can assume that toIsec has its + // own unwindEntry because of the ODR.) + d->unwindEntry = nullptr; + } + return true; + }); +} + Defined *SymbolTable::addDefined(StringRef name, InputFile *file, InputSection *isec, uint64_t value, uint64_t size, bool isWeakDef, @@ -82,18 +114,27 @@ defined->referencedDynamically |= isReferencedDynamically; defined->noDeadStrip |= noDeadStrip; } - // FIXME: Handle this for bitcode files. - if (auto concatIsec = dyn_cast_or_null(isec)) + if (auto concatIsec = dyn_cast_or_null(isec)) { concatIsec->wasCoalesced = true; + // Any local symbols that alias the coalesced symbol should be moved + // into the prevailing section. Note that we have sorted the symbols + // in ObjFile::parseSymbols() such that extern weak symbols appear + // last, so we don't need to worry about subsequent symbols being + // added to an already-coalesced section. + if (defined->isec) + transplantSymbolsAtOffset(concatIsec, defined->isec, + /*skip=*/nullptr, value, defined->value); + } return defined; } if (defined->isWeakDef()) { - // FIXME: Handle this for bitcode files. if (auto concatIsec = dyn_cast_or_null(defined->isec)) { concatIsec->wasCoalesced = true; - concatIsec->symbols.erase(llvm::find(concatIsec->symbols, defined)); + if (isec) + transplantSymbolsAtOffset(concatIsec, isec, defined, defined->value, + value); } } else { std::string srcLoc1 = defined->getSourceLocation(); @@ -382,7 +423,7 @@ }; MapVector undefs; -} +} // namespace void macho::reportPendingDuplicateSymbols() { for (const auto &duplicate : dupSymDiags) { diff --git a/lld/test/MachO/local-alias-to-weak.s b/lld/test/MachO/local-alias-to-weak.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/local-alias-to-weak.s @@ -0,0 +1,149 @@ +# REQUIRES: x86 +## This test checks that when we coalesce weak definitions, their local symbol +## aliases defs don't cause the coalesced data to be retained. This was +## motivated by MC's aarch64 backend which automatically creates `ltmp` +## symbols at the start of each .text section. These symbols are frequently +## aliases of other symbols created by clang or other inputs to MC. I've chosen +## to explicitly create them here since we can then reference those symbols for +## a more complete test. +## +## Not retaining the data matters for more than just size -- we have a use case +## that depends on proper data coalescing to emit a valid file format. We also +## need this behavior to properly deduplicate the __objc_protolist section; +## failure to do this can result in dyld crashing on iOS 13. +## +## Finally, ld64 does all this regardless of whether .subsections_via_symbols is +## specified. We don't. But again, given how rare the lack of that directive is +## (I've only seen it from hand-written assembly inputs), I don't think we need +## to worry about it. + +# RUN: rm -rf %t; split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/weak-then-local.s -o %t/weak-then-local.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/local-then-weak.s -o %t/local-then-weak.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/no-subsections.s -o %t/no-subsections.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/no-dead-strip.s -o %t/no-dead-strip.o + +# RUN: %lld -lSystem -dylib %t/weak-then-local.o %t/local-then-weak.o -o %t/test1 +# RUN: llvm-objdump --macho --syms --section="__DATA,__data" --weak-bind %t/test1 | FileCheck %s +# RUN: %lld -lSystem -dylib %t/local-then-weak.o %t/weak-then-local.o -o %t/test2 +# RUN: llvm-objdump --macho --syms --section="__DATA,__data" --weak-bind %t/test2 | FileCheck %s + +## Check that we only have one copy of 0x123 in the data, not two. +# CHECK: Contents of (__DATA,__data) section +# CHECK-NEXT: 0000000000001000 23 01 00 00 00 00 00 00 00 10 00 00 00 00 00 00 {{$}} +# CHECK-NEXT: 0000000000001010 00 10 00 00 00 00 00 00 {{$}} +# CHECK-EMPTY: +# CHECK-NEXT: SYMBOL TABLE: +# CHECK-NEXT: 0000000000001000 l O __DATA,__data _alias +# CHECK-NEXT: 0000000000001008 l O __DATA,__data _ref +# CHECK-NEXT: 0000000000001000 l O __DATA,__data _alias +# CHECK-NEXT: 0000000000001010 l O __DATA,__data _ref +# CHECK-NEXT: 0000000000001000 w O __DATA,__data _weak +# CHECK-NEXT: 0000000000000000 *UND* dyld_stub_binder +# CHECK-EMPTY: +## Even though the references were to the non-weak `_alias` symbols, ld64 still +## emits weak binds as if they were the `_weak` symbol itself. We do not. I +## don't know of any programs that rely on this behavior, so I'm just +## documenting it here. +# CHECK-NEXT: Weak bind table: +# CHECK-NEXT: segment section address type addend symbol +# CHECK-EMPTY: + +# RUN: %lld -lSystem -dylib %t/local-then-weak.o %t/no-subsections.o -o %t/sub-nosub +# RUN: llvm-objdump --macho --syms --section="__DATA,__data" %t/sub-nosub | FileCheck %s --check-prefix SUB-NOSUB + +## This test case demonstrates a shortcoming of LLD: If .subsections_via_symbols +## isn't enabled, we don't elide the contents of coalesced weak symbols if they +## are part of a section that has other non-coalesced symbols. In contrast, LD64 +## does elide the contents. +# SUB-NOSUB: Contents of (__DATA,__data) section +# SUB-NOSUB-NEXT: 0000000000001000 23 01 00 00 00 00 00 00 00 10 00 00 00 00 00 00 +# SUB-NOSUB-NEXT: 0000000000001010 00 00 00 00 00 00 00 00 23 01 00 00 00 00 00 00 +# SUB-NOSUB-EMPTY: +# SUB-NOSUB-NEXT: SYMBOL TABLE: +# SUB-NOSUB-NEXT: 0000000000001000 l O __DATA,__data _alias +# SUB-NOSUB-NEXT: 0000000000001008 l O __DATA,__data _ref +# SUB-NOSUB-NEXT: 0000000000001010 l O __DATA,__data _zeros +# SUB-NOSUB-NEXT: 0000000000001000 l O __DATA,__data _alias +# SUB-NOSUB-NEXT: 0000000000001000 w O __DATA,__data _weak +# SUB-NOSUB-NEXT: 0000000000000000 *UND* dyld_stub_binder + +# RUN: %lld -lSystem -dylib %t/no-subsections.o %t/local-then-weak.o -o %t/nosub-sub +# RUN: llvm-objdump --macho --syms --section="__DATA,__data" %t/nosub-sub | FileCheck %s --check-prefix NOSUB-SUB + +# NOSUB-SUB: Contents of (__DATA,__data) section +# NOSUB-SUB-NEXT: 0000000000001000 00 00 00 00 00 00 00 00 23 01 00 00 00 00 00 00 +# NOSUB-SUB-NEXT: 0000000000001010 08 10 00 00 00 00 00 00 {{$}} +# NOSUB-SUB-EMPTY: +# NOSUB-SUB-NEXT: SYMBOL TABLE: +# NOSUB-SUB-NEXT: 0000000000001000 l O __DATA,__data _zeros +# NOSUB-SUB-NEXT: 0000000000001008 l O __DATA,__data _alias +# NOSUB-SUB-NEXT: 0000000000001008 l O __DATA,__data _alias +# NOSUB-SUB-NEXT: 0000000000001010 l O __DATA,__data _ref +# NOSUB-SUB-NEXT: 0000000000001008 w O __DATA,__data _weak +# NOSUB-SUB-NEXT: 0000000000000000 *UND* dyld_stub_binder + +## Verify that we don't drop any flags that the aliases have (such as +## .no_dead_strip). This is a regression test. We previously had subsections +## that were mistakenly stripped. + +# RUN: %lld -lSystem -dead_strip %t/no-dead-strip.o -o %t/no-dead-strip +# RUN: llvm-objdump --macho --section-headers %t/no-dead-strip | FileCheck %s \ +# RUN: --check-prefix=NO-DEAD-STRIP +# NO-DEAD-STRIP: __data 00000010 + +#--- weak-then-local.s +.globl _weak +.weak_definition _weak +.data +_weak: +_alias: + .quad 0x123 + +_ref: + .quad _alias + +.subsections_via_symbols + +#--- local-then-weak.s +.globl _weak +.weak_definition _weak +.data +_alias: +_weak: + .quad 0x123 + +_ref: + .quad _alias + +.subsections_via_symbols + +#--- no-subsections.s +.globl _weak +.weak_definition _weak +.data +_zeros: +.space 8 + +_weak: +_alias: + .quad 0x123 + +#--- no-dead-strip.s +.globl _main + +_main: + ret + +.data +.no_dead_strip l_foo, l_bar + +_foo: +l_foo: + .quad 0x123 + +l_bar: +_bar: + .quad 0x123 + +.subsections_via_symbols diff --git a/lld/test/MachO/private-label-alias.s b/lld/test/MachO/private-label-alias.s deleted file mode 100644 --- a/lld/test/MachO/private-label-alias.s +++ /dev/null @@ -1,105 +0,0 @@ -# REQUIRES: x86 -## This test checks that when we coalesce weak definitions, any flag-less -## private-label aliases to those weak defs don't cause the coalesced data to be -## retained. This test explicitly creates those private-label symbols, but it -## was actually motivated by MC's aarch64 backend which automatically creates -## them when emitting object files. I've chosen to explicitly create them here -## since we can then reference those symbols for a more complete test. -## -## Not retaining the data matters for more than just size -- we have a use case -## that depends on proper data coalescing to emit a valid file format. -## -## ld64 actually treats all local symbol aliases (not just the private ones) the -## same way. But implementing this is harder -- we would have to create those -## symbols first (so we can emit their names later), but we would have to -## ensure the linker correctly shuffles them around when their aliasees get -## coalesced. Emulating the behavior of weak binds for non-private symbols would -## be even trickier. Let's just deal with private-label symbols for now until we -## find a use case for more general local symbols. -## -## Finally, ld64 does all this regardless of whether .subsections_via_symbols is -## specified. We don't. But again, given how rare the lack of that directive is -## (I've only seen it from hand-written assembly inputs), I don't think we need -## to worry about it. - -# RUN: rm -rf %t; split-file %s %t -# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/weak-then-private.s -o %t/weak-then-private.o -# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/private-then-weak.s -o %t/private-then-weak.o -# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/no-dead-strip.s -o %t/no-dead-strip.o -# RUN: %lld -dylib %t/weak-then-private.o %t/private-then-weak.o -o %t/test1 -# RUN: %lld -dylib %t/private-then-weak.o %t/weak-then-private.o -o %t/test2 -# RUN: %lld -dead_strip %t/no-dead-strip.o -o %t/no-dead-strip - -# RUN: llvm-objdump --macho --syms --section="__DATA,__data" --weak-bind %t/test1 | FileCheck %s -# RUN: llvm-objdump --macho --syms --section="__DATA,__data" --weak-bind %t/test2 | FileCheck %s - -## Check that we only have one copy of 0x123 in the data, not two. -# CHECK: Contents of (__DATA,__data) section -# CHECK-NEXT: 0000000000001000 23 01 00 00 00 00 00 00 00 10 00 00 00 00 00 00 {{$}} -# CHECK-NEXT: 0000000000001010 00 10 00 00 00 00 00 00 {{$}} -# CHECK-EMPTY: -# CHECK-NEXT: SYMBOL TABLE: -# CHECK-NEXT: 0000000000001008 l O __DATA,__data _ref -# CHECK-NEXT: 0000000000001010 l O __DATA,__data _ref -# CHECK-NEXT: 0000000000001000 w O __DATA,__data _weak -# CHECK-NEXT: 0000000000000000 *UND* dyld_stub_binder -# CHECK-EMPTY: -## Even though the references were to the non-weak `l_ignored` aliases, we -## should still emit weak binds as if they were the `_weak` symbol itself. -# CHECK-NEXT: Weak bind table: -# CHECK-NEXT: segment section address type addend symbol -# CHECK-NEXT: __DATA __data 0x00001008 pointer 0 _weak -# CHECK-NEXT: __DATA __data 0x00001010 pointer 0 _weak - -## Verify that we don't drop any flags that private-label aliases have (such as -## .no_dead_strip). This is a regression test. We previously had subsections -## that were mistakenly stripped. - -# RUN: llvm-objdump --macho --section-headers %t/no-dead-strip | FileCheck %s \ -# RUN: --check-prefix=NO-DEAD-STRIP -# NO-DEAD-STRIP: __data 00000010 - -#--- weak-then-private.s -.globl _weak -.weak_definition _weak -.data -_weak: -l_ignored: - .quad 0x123 - -_ref: - .quad l_ignored - -.subsections_via_symbols - -#--- private-then-weak.s -.globl _weak -.weak_definition _weak -.data -l_ignored: -_weak: - .quad 0x123 - -_ref: - .quad l_ignored - -.subsections_via_symbols - -#--- no-dead-strip.s -.globl _main - -_main: - ret - -.data -.no_dead_strip l_foo, l_bar - -_foo: -l_foo: - .quad 0x123 - -l_bar: -_bar: - .quad 0x123 - -.subsections_via_symbols