diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -531,13 +531,19 @@ class DeduplicatedCStringSection final : public CStringSection { public: - DeduplicatedCStringSection(); - uint64_t getSize() const override { return builder.getSize(); } + uint64_t getSize() const override { return size; } void finalizeContents() override; - void writeTo(uint8_t *buf) const override { builder.write(buf); } + void writeTo(uint8_t *buf) const override; private: - llvm::StringTableBuilder builder; + struct StringOffset { + uint8_t trailingZeros; + uint64_t outSecOff = UINT64_MAX; + + explicit StringOffset(uint8_t zeros) : trailingZeros(zeros) {} + }; + llvm::DenseMap stringOffsetMap; + size_t size = 0; }; /* diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -1341,7 +1341,10 @@ for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) { if (!isec->pieces[i].live) continue; - uint32_t pieceAlign = MinAlign(isec->pieces[i].inSecOff, align); + // See comment above DeduplicatedCStringSection for how alignment is + // handled. + uint32_t pieceAlign = + 1 << countTrailingZeros(isec->align | isec->pieces[i].inSecOff); offset = alignTo(offset, pieceAlign); isec->pieces[i].outSecOff = offset; isec->isFinal = true; @@ -1351,45 +1354,89 @@ } size = offset; } + // Mergeable cstring literals are found under the __TEXT,__cstring section. In // contrast to ELF, which puts strings that need different alignments into // different sections, clang's Mach-O backend puts them all in one section. // Strings that need to be aligned have the .p2align directive emitted before -// them, which simply translates into zero padding in the object file. +// them, which simply translates into zero padding in the object file. In other +// words, we have to infer the desired alignment of these cstrings from their +// addresses. // -// I *think* ld64 extracts the desired per-string alignment from this data by -// preserving each string's offset from the last section-aligned address. I'm -// not entirely certain since it doesn't seem consistent about doing this, and -// in fact doesn't seem to be correct in general: we can in fact can induce ld64 -// to produce a crashing binary just by linking in an additional object file -// that only contains a duplicate cstring at a different alignment. See PR50563 -// for details. +// We differ slightly from ld64 in how we've chosen to align these cstrings. +// Both LLD and ld64 preserve the number of trailing zeros in each cstring's +// address in the input object files. When deduplicating identical cstrings, +// both linkers pick the cstring whose address has more trailing zeros, and +// preserve the alignment of that address in the final binary. However, ld64 +// goes a step further and also preserves the offset of the cstring from the +// last section-aligned address. I.e. if a cstring is at offset 18 in the +// input, with a section alignment of 16, then both LLD and ld64 will ensure the +// final address is 2-byte aligned (since 18 == 16 + 2). But ld64 will also +// ensure that the final address is of the form 16 * k + 2 for some k. // -// On x86_64, the cstrings we've seen so far that require special alignment are -// all accessed by SIMD operations -- x86_64 requires SIMD accesses to be -// 16-byte-aligned. arm64 also seems to require 16-byte-alignment in some cases -// (PR50791), but I haven't tracked down the root cause. So for now, I'm just -// aligning all strings to 16 bytes. This is indeed wasteful, but -// implementation-wise it's simpler than preserving per-string -// alignment+offsets. It also avoids the aforementioned crash after -// deduplication of differently-aligned strings. Finally, the overhead is not -// huge: using 16-byte alignment (vs no alignment) is only a 0.5% size overhead -// when linking chromium_framework on x86_64. -DeduplicatedCStringSection::DeduplicatedCStringSection() - : builder(StringTableBuilder::RAW, /*Alignment=*/16) {} - +// Note that ld64's heuristic means that a dedup'ed cstring's final address is +// dependent on the order of the input object files. E.g. if in addition to the +// cstring at offset 18 above, we have a duplicate one in another file with a +// `.cstring` section alignment of 2 and an offset of zero, then ld64 will pick +// the cstring from the object file earlier on the command line (since both have +// the same number of trailing zeros in their address). So the final cstring may +// either be at some address `16 * k + 2` or at some address `2 * k`. +// +// I've opted not to follow this behavior primarily for implementation +// simplicity, and secondarily to save a few more bytes. It's not clear to me +// that preserving the section alignment + offset is ever necessary, and there +// are many cases that are clearly redundant. In particular, if an x86_64 object +// file contains some strings that are accessed via SIMD instructions, then the +// .cstring section in the object file will be 16-byte-aligned (since SIMD +// requires its operand addresses to be 16-byte aligned). However, there will +// typically also be other cstrings in the same file that aren't used via SIMD +// and don't need this alignment. They will be emitted at some arbitrary address +// `A`, but ld64 will treat them as being 16-byte aligned with an offset of `16 +// % A`. void DeduplicatedCStringSection::finalizeContents() { - // Add all string pieces to the string table builder to create section - // contents. + // Find the largest alignment required for each string. + for (const CStringInputSection *isec : inputs) { + for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) { + const StringPiece &piece = isec->pieces[i]; + if (!piece.live) + continue; + auto s = isec->getCachedHashStringRef(i); + assert(isec->align != 0); + uint8_t trailingZeros = countTrailingZeros(isec->align | piece.inSecOff); + auto it = stringOffsetMap.insert( + std::make_pair(s, StringOffset(trailingZeros))); + if (!it.second && it.first->second.trailingZeros < trailingZeros) + it.first->second.trailingZeros = trailingZeros; + } + } + + // Assign an offset for each string and save it to the corresponding + // StringPieces for easy access. for (CStringInputSection *isec : inputs) { - for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) - if (isec->pieces[i].live) - isec->pieces[i].outSecOff = - builder.add(isec->getCachedHashStringRef(i)); + for (size_t i = 0, e = isec->pieces.size(); i != e; ++i) { + if (!isec->pieces[i].live) + continue; + auto s = isec->getCachedHashStringRef(i); + auto it = stringOffsetMap.find(s); + assert(it != stringOffsetMap.end()); + StringOffset &offsetInfo = it->second; + if (offsetInfo.outSecOff == UINT64_MAX) { + offsetInfo.outSecOff = alignTo(size, 1 << offsetInfo.trailingZeros); + size = offsetInfo.outSecOff + s.size(); + } + isec->pieces[i].outSecOff = offsetInfo.outSecOff; + } isec->isFinal = true; } +} - builder.finalizeInOrder(); +void DeduplicatedCStringSection::writeTo(uint8_t *buf) const { + for (const auto &p : stringOffsetMap) { + StringRef data = p.first.val(); + uint64_t off = p.second.outSecOff; + if (!data.empty()) + memcpy(buf + off, data.data(), data.size()); + } } // This section is actually emitted as __TEXT,__const by ld64, but clang may diff --git a/lld/MachO/ld64-vs-lld.rst b/lld/MachO/ld64-vs-lld.rst --- a/lld/MachO/ld64-vs-lld.rst +++ b/lld/MachO/ld64-vs-lld.rst @@ -13,6 +13,12 @@ occurring. In particular, programs which compare string literals via pointer equality must be fixed to use value equality instead. +String Alignment +**************** +LLD is slightly less conservative about aligning cstrings, allowing it to pack +them more compactly. This should not result in any meaningful semantic +difference. + ``-no_deduplicate`` Flag ********************** - LD64: diff --git a/lld/test/MachO/cstring-align.s b/lld/test/MachO/cstring-align.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/cstring-align.s @@ -0,0 +1,133 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; split-file %s %t +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-empty.s -o %t/align-empty.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-4-0.s -o %t/align-4-0.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-4-2.s -o %t/align-4-2.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-0.s -o %t/align-16-0.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-2.s -o %t/align-16-2.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-4.s -o %t/align-16-4.o +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %t/align-16-8.s -o %t/align-16-8.o + +## Check that we preserve the alignment of cstrings. Alignment is determined +## not by section alignment but by the number of trailing zeros of the cstring's +## address in the input object file. + +## The non-dedup case is not particularly interesting since the null bytes don't +## get dedup'ed, meaning that the output strings get their offsets "naturally" +## preserved. + +# RUN: %lld -dylib %t/align-empty.o %t/align-4-0.o %t/align-16-0.o -o %t/align-4-0-16-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-4-0-16-0 | \ +# RUN: FileCheck %s -D#OFF1=4 -D#OFF2=16 +# RUN: %lld -dylib %t/align-empty.o %t/align-16-0.o %t/align-4-0.o -o %t/align-16-0-4-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-16-0-4-0 | \ +# RUN: FileCheck %s -D#OFF1=16 -D#OFF2=20 + +# RUN: %lld -dylib %t/align-empty.o %t/align-4-2.o %t/align-16-0.o -o %t/align-4-2-16-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-4-2-16-0 | \ +# RUN: FileCheck %s -D#OFF1=6 -D#OFF2=16 +# RUN: %lld -dylib %t/align-empty.o %t/align-16-0.o %t/align-4-2.o -o %t/align-16-0-4-2 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-16-0-4-2 | \ +# RUN: FileCheck %s -D#OFF1=16 -D#OFF2=22 + +# RUN: %lld -dylib %t/align-empty.o %t/align-4-0.o %t/align-16-2.o -o %t/align-4-0-16-2 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-4-0-16-2 | \ +# RUN: FileCheck %s -D#OFF1=4 -D#OFF2=18 +# RUN: %lld -dylib %t/align-empty.o %t/align-16-2.o %t/align-4-0.o -o %t/align-16-2-4-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/align-16-2-4-0 | \ +# RUN: FileCheck %s -D#OFF1=18 -D#OFF2=20 + +# CHECK: Contents of (__TEXT,__cstring) section +# CHECK-NEXT: [[#%.16x,START:]] {{$}} +# CHECK: [[#%.16x,START+OFF1]] a{{$}} +# CHECK: [[#%.16x,START+OFF2]] a{{$}} +# CHECK-EMPTY: + +## The dedup cases are more interesting... + +## Same offset, different alignments => pick higher alignment +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-0.o -o %t/dedup-4-0-16-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-0 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=16 +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-0.o %t/align-4-0.o -o %t/dedup-16-0-4-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-0-4-0 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=16 + +## 16 byte alignment vs 2 byte offset => align to 16 bytes +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-2.o %t/align-16-0.o -o %t/dedup-4-2-16-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-2-16-0 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=16 +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-0.o %t/align-4-2.o -o %t/dedup-16-0-4-2 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-0-4-2 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=16 + +## 4 byte alignment vs 2 byte offset => align to 4 bytes +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-2.o -o %t/dedup-4-0-16-2 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-2 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=4 +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-2.o %t/align-4-0.o -o %t/dedup-16-2-4-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-2-4-0 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=4 + +## Both inputs are 4-byte aligned, one via offset and the other via section alignment +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-4.o -o %t/dedup-4-0-16-4 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-4 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=4 +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-4.o %t/align-4-0.o -o %t/dedup-16-4-4-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-4-4-0 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=4 + +## 8-byte offset vs 4-byte section alignment => align to 8 bytes +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-4-0.o %t/align-16-8.o -o %t/dedup-4-0-16-8 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-4-0-16-8 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=8 +# RUN: %lld -dylib --deduplicate-literals %t/align-empty.o %t/align-16-8.o %t/align-4-0.o -o %t/dedup-16-8-4-0 +# RUN: llvm-objdump --macho --section="__TEXT,__cstring" %t/dedup-16-8-4-0 | \ +# RUN: FileCheck %s --check-prefix=DEDUP -D#OFF=8 + +# DEDUP: Contents of (__TEXT,__cstring) section +# DEDUP-NEXT: [[#%.16x,START:]] {{$}} +# DEDUP: [[#%.16x,START+OFF]] a{{$}} +# DEDUP-EMPTY: + +#--- align-empty.s +## We use this file to create an empty string at the start of every output +## file's .cstring section. This makes the test cases more interesting since LLD +## can't place the string "a" at the trivially-aligned zero offset. +.cstring +.p2align 2 +.asciz "" + +#--- align-4-0.s +.cstring +.p2align 2 +.asciz "a" + +#--- align-4-2.s +.cstring +.p2align 2 +.zero 0x2 +.asciz "a" + +#--- align-16-0.s +.cstring +.p2align 4 +.asciz "a" + +#--- align-16-2.s +.cstring +.p2align 4 +.zero 0x2 +.asciz "a" + +#--- align-16-4.s +.cstring +.p2align 4 +.zero 0x4 +.asciz "a" + +#--- align-16-8.s +.cstring +.p2align 4 +.zero 0x8 +.asciz "a" diff --git a/lld/test/MachO/cstring-dedup.s b/lld/test/MachO/cstring-dedup.s --- a/lld/test/MachO/cstring-dedup.s +++ b/lld/test/MachO/cstring-dedup.s @@ -8,12 +8,11 @@ # RUN: llvm-objdump --macho --section="__DATA,ptrs" --syms %t/test | FileCheck %s # RUN: llvm-readobj --section-headers %t/test | FileCheck %s --check-prefix=HEADER -## Make sure we only have 3 deduplicated strings in __cstring, and that they -## are 16-byte-aligned. +## Make sure we only have 3 deduplicated strings in __cstring. # STR: Contents of (__TEXT,__cstring) section -# STR: {{.*}}0 foo -# STR: {{.*}}0 barbaz -# STR: {{.*}}0 {{$}} +# STR: {{[[:xdigit:]]+}} foo +# STR: {{[[:xdigit:]]+}} barbaz +# STR: {{[[:xdigit:]]+}} {{$}} ## Make sure both symbol and section relocations point to the right thing. # CHECK: Contents of (__DATA,ptrs) section