diff --git a/lld/MachO/CMakeLists.txt b/lld/MachO/CMakeLists.txt --- a/lld/MachO/CMakeLists.txt +++ b/lld/MachO/CMakeLists.txt @@ -15,6 +15,7 @@ DriverUtils.cpp Dwarf.cpp ExportTrie.cpp + ICF.cpp InputFiles.cpp InputSection.cpp LTO.cpp diff --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h --- a/lld/MachO/ConcatOutputSection.h +++ b/lld/MachO/ConcatOutputSection.h @@ -40,6 +40,7 @@ void finalize() override; bool needsThunks() const; uint64_t estimateStubsInRangeVA(size_t callIdx) const; + void eraseOmittedInputSections(); void writeTo(uint8_t *buf) const override; diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp --- a/lld/MachO/ConcatOutputSection.cpp +++ b/lld/MachO/ConcatOutputSection.cpp @@ -17,8 +17,7 @@ #include "lld/Common/Memory.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/Support/ScopedPrinter.h" - -#include +#include "llvm/Support/TimeProfiler.h" using namespace llvm; using namespace llvm::MachO; @@ -357,3 +356,12 @@ flags |= input->flags; flags &= pureMask; } + +void ConcatOutputSection::eraseOmittedInputSections() { + // Remove the duplicates from inputs + inputs.erase(std::remove_if(inputs.begin(), inputs.end(), + [](const ConcatInputSection *isec) -> bool { + return isec->shouldOmitFromOutput(); + }), + inputs.end()); +} diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -57,6 +57,13 @@ dynamic_lookup, }; +enum class ICFLevel { + unknown, + none, + safe, + all, +}; + struct SectionAlign { llvm::StringRef segName; llvm::StringRef sectName; @@ -126,6 +133,7 @@ NamespaceKind namespaceKind = NamespaceKind::twolevel; UndefinedSymbolTreatment undefinedSymbolTreatment = UndefinedSymbolTreatment::error; + ICFLevel icfLevel = ICFLevel::none; llvm::MachO::HeaderFileType outputType; std::vector systemLibraryRoots; std::vector librarySearchPaths; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -698,6 +698,29 @@ return treatment; } +static ICFLevel getICFLevel(const ArgList &args) { + bool noDeduplicate = args.hasArg(OPT_no_deduplicate); + StringRef icfLevelStr = args.getLastArgValue(OPT_icf); + auto icfLevel = StringSwitch(icfLevelStr) + .Cases("none", "", ICFLevel::none) + .Case("safe", ICFLevel::safe) + .Case("all", ICFLevel::all) + .Default(ICFLevel::unknown); + if (icfLevel == ICFLevel::unknown) { + warn(Twine("unknown -icf OPTION `") + icfLevelStr + + "', defaulting to `none'"); + icfLevel = ICFLevel::none; + } else if (icfLevel != ICFLevel::none && noDeduplicate) { + warn(Twine("`-icf " + icfLevelStr + + "' conflicts with -no_deduplicate, setting to `none'")); + icfLevel = ICFLevel::none; + } else if (icfLevel == ICFLevel::safe) { + warn(Twine("`-icf safe' is not yet implemented, reverting to `none'")); + icfLevel = ICFLevel::none; + } + return icfLevel; +} + static void warnIfDeprecatedOption(const Option &opt) { if (!opt.getGroup().isValid()) return; @@ -1096,6 +1119,8 @@ config->undefinedSymbolTreatment = getUndefinedSymbolTreatment(args); + config->icfLevel = getICFLevel(args); + if (config->outputType == MH_EXECUTE) config->entry = symtab->addUndefined(args.getLastArgValue(OPT_e, "_main"), /*file=*/nullptr, diff --git a/lld/MachO/ICF.h b/lld/MachO/ICF.h new file mode 100644 --- /dev/null +++ b/lld/MachO/ICF.h @@ -0,0 +1,42 @@ +//===- ICF.h ----------------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_ICF_H +#define LLD_MACHO_ICF_H + +#include "lld/Common/LLVM.h" +#include + +namespace lld { +namespace macho { + +class ConcatInputSection; + +class ICF { +public: + ICF(std::vector &inputs); + + void run(); + void segregate(size_t begin, size_t end, + std::function + equals); + size_t findBoundary(size_t begin, size_t end); + void forEachClassRange(size_t begin, size_t end, + std::function func); + void forEachClass(std::function func); + + // ICF needs a copy of the inputs vector because its equivalence-class + // segregation algorithm destroys the proper sequence. + std::vector icfInputs; +}; + +} // namespace macho +} // namespace lld + +#endif diff --git a/lld/MachO/ICF.cpp b/lld/MachO/ICF.cpp new file mode 100644 --- /dev/null +++ b/lld/MachO/ICF.cpp @@ -0,0 +1,257 @@ +//===- ICF.cpp ------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ICF.h" +#include "ConcatOutputSection.h" +#include "InputSection.h" +#include "Symbols.h" +#include "llvm/Support/Parallel.h" + +#include + +using namespace llvm; +using namespace lld; +using namespace lld::macho; + +ICF::ICF(std::vector &inputs) { + icfInputs.assign(inputs.begin(), inputs.end()); +} + +// ICF = Identical Code Folding +// +// We only fold __TEXT,__text, so this is really "code" folding, and not +// "COMDAT" folding. String and scalar constant literals are deduplicated +// elsewhere. +// +// Summary of segments & sections: +// +// Since folding never occurs across output-section boundaries, +// ConcatOutputSection is the natural input for ICF. +// +// The __TEXT segment is readonly at the MMU. Some sections are already +// deduplicated elsewhere (__TEXT,__cstring & __TEXT,__literal*) and some are +// synthetic and inherently free of duplicates (__TEXT,__stubs & +// __TEXT,__unwind_info). We only run ICF on __TEXT,__text. One might hope ICF +// could work on __TEXT,__concat, but doing so induces many test failures. +// +// The __LINKEDIT segment is readonly at the MMU, yet entirely synthetic, and +// thus ineligible for ICF. +// +// The __DATA_CONST segment is read/write at the MMU, but is logically const to +// the application after dyld applies fixups to pointer data. Some sections are +// deduplicated elsewhere (__DATA_CONST,__cfstring), and some are synthetic +// (__DATA_CONST,__got). There are no ICF opportunities here. +// +// The __DATA segment is read/write at the MMU, and as application-writeable +// data, none of its sections are eligible for ICF. +// +// Please see the large block comment in lld/ELF/ICF.cpp for an explanation +// of the segregation algorithm. +// +// FIXME(gkm): implement keep-unique attributes +// FIXME(gkm): implement address-significance tables for MachO object files + +static unsigned icfPass = 0; +static std::atomic icfRepeat{false}; + +// Compare everything except the relocation referents +static bool equalsConstant(const ConcatInputSection *ia, + const ConcatInputSection *ib) { + if (ia->data.size() != ib->data.size()) + return false; + if (ia->data != ib->data) + return false; + if (ia->flags != ib->flags) + return false; + if (ia->relocs.size() != ib->relocs.size()) + return false; + auto f = [&](const Reloc &ra, const Reloc &rb) { + if (ra.type != rb.type) + return false; + if (ra.pcrel != rb.pcrel) + return false; + if (ra.length != rb.length) + return false; + if (ra.offset != rb.offset) + return false; + if (ra.addend != rb.addend) + return false; + if (ra.referent.is() != rb.referent.is()) + return false; // a nice place to breakpoint + return true; + }; + return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), + f); +} + +// Compare only the relocation referents +static bool equalsVariable(const ConcatInputSection *ia, + const ConcatInputSection *ib) { + assert(ia->relocs.size() == ib->relocs.size()); + auto f = [&](const Reloc &ra, const Reloc &rb) { + if (ra.referent == rb.referent) + return true; + if (ra.referent.is()) { + const auto *sa = ra.referent.get(); + const auto *sb = rb.referent.get(); + if (sa->kind() != sb->kind()) + return false; + if (isa(sa)) { + const auto *da = dyn_cast(sa); + const auto *db = dyn_cast(sb); + if (da->value != db->value) + return false; + if (da->isAbsolute() != da->isAbsolute()) + return false; + if (da->isec) + if (da->isec->icfEqClass[icfPass % 2] != + db->isec->icfEqClass[icfPass % 2]) + return false; + } else if (isa(sa)) { + // There is one DylibSymbol per gotIndex and we already checked for + // symbol equality, thus we know that these must be different. + return false; + } else { + llvm_unreachable("equalsVariable symbol kind"); + } + } else { + const auto *sa = ra.referent.get(); + const auto *sb = rb.referent.get(); + if (sa->icfEqClass[icfPass % 2] != sb->icfEqClass[icfPass % 2]) + return false; + } + return true; + }; + return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), + f); +} + +// Find the first InputSection after BEGIN whose equivalence class differs +size_t ICF::findBoundary(size_t begin, size_t end) { + uint64_t beginHash = icfInputs[begin]->icfEqClass[icfPass % 2]; + for (size_t i = begin + 1; i < end; ++i) + if (beginHash != icfInputs[i]->icfEqClass[icfPass % 2]) + return i; + return end; +} + +// Invoke FUNC on subranges with matching equivalence class +void ICF::forEachClassRange(size_t begin, size_t end, + std::function func) { + while (begin < end) { + size_t mid = findBoundary(begin, end); + func(begin, mid); + begin = mid; + } +} + +// Split icfInputs into shards, then parallelize invocation of FUNC on subranges +// with matching equivalence class +void ICF::forEachClass(std::function func) { + // Only use threads when the benefits outweigh the overhead. + const size_t threadingThreshold = 1024; + if (icfInputs.size() < threadingThreshold) { + forEachClassRange(0, icfInputs.size(), func); + ++icfPass; + return; + } + + // Shard into non-overlapping intervals, and call FUNC in parallel. The + // sharding must be completed before any calls to FUNC are made so that FUNC + // can modify the InputSection in its shard without causing data races. + const size_t shards = 256; + size_t step = icfInputs.size() / shards; + size_t boundaries[shards + 1]; + boundaries[0] = 0; + boundaries[shards] = icfInputs.size(); + parallelForEachN(1, shards, [&](size_t i) { + boundaries[i] = findBoundary((i - 1) * step, icfInputs.size()); + }); + parallelForEachN(1, shards + 1, [&](size_t i) { + if (boundaries[i - 1] < boundaries[i]) { + forEachClassRange(boundaries[i - 1], boundaries[i], func); + } + }); + ++icfPass; +} + +void ICF::run() { + // Into each origin-section hash, combine all reloc referent section hashes. + for (icfPass = 0; icfPass < 2; ++icfPass) { + parallelForEach(icfInputs, [&](InputSection *isec) { + uint64_t hash = isec->icfEqClass[icfPass % 2]; + for (const Reloc &r : isec->relocs) { + if (auto *sym = r.referent.dyn_cast()) { + if (auto *dylibSym = dyn_cast(sym)) + hash += dylibSym->stubsHelperIndex; + else if (auto *defined = dyn_cast(sym)) + hash += + defined->value + + (defined->isec ? defined->isec->icfEqClass[icfPass % 2] : 0); + else + llvm_unreachable("foldIdenticalSections symbol kind"); + } + } + // Set MSB to 1 to avoid collisions with non-hashed classes. + isec->icfEqClass[(icfPass + 1) % 2] = hash | (1ull << 63); + }); + } + + llvm::stable_sort(icfInputs, + [](const InputSection *a, const InputSection *b) { + return a->icfEqClass[0] < b->icfEqClass[0]; + }); + forEachClass( + [&](size_t begin, size_t end) { segregate(begin, end, equalsConstant); }); + + // Split equivalence groups by comparing relocations until convergence + do { + icfRepeat = false; + forEachClass([&](size_t begin, size_t end) { + segregate(begin, end, equalsVariable); + }); + } while (icfRepeat); + log("ICF needed " + Twine(icfPass) + " iterations"); + + // Fold sections within equivalence classes + forEachClass([&](size_t begin, size_t end) { + if (end - begin < 2) + return; + ConcatInputSection *beginIsec = icfInputs[begin]; + for (size_t i = begin + 1; i < end; ++i) + beginIsec->foldIdentical(icfInputs[i]); + }); +} + +// Split an equivalence class into smaller classes. +void ICF::segregate( + size_t begin, size_t end, + std::function + equals) { + while (begin < end) { + // Divide [begin, end) into two. Let mid be the start index of the + // second group. + auto bound = std::stable_partition(icfInputs.begin() + begin + 1, + icfInputs.begin() + end, + [&](ConcatInputSection *isec) { + return equals(icfInputs[begin], isec); + }); + size_t mid = bound - icfInputs.begin(); + + // Split [begin, end) into [begin, mid) and [mid, end). We use mid as an + // equivalence class ID because every group ends with a unique index. + for (size_t i = begin; i < mid; ++i) + icfInputs[i]->icfEqClass[(icfPass + 1) % 2] = mid; + + // If we created a group, we need to iterate the main loop again. + if (mid != end) + icfRepeat = true; + + begin = mid; + } +} diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -642,10 +642,16 @@ auto *concatIsec = cast(isec); auto *nextIsec = make(*concatIsec); - nextIsec->data = isec->data.slice(symbolOffset); nextIsec->numRefs = 0; nextIsec->wasCoalesced = false; - isec->data = isec->data.slice(0, symbolOffset); + if (isZeroFill(isec->flags)) { + // Zero-fill sections have NULL data.data() non-zero data.size() + nextIsec->data = {nullptr, isec->data.size() - symbolOffset}; + isec->data = {nullptr, symbolOffset}; + } else { + nextIsec->data = isec->data.slice(symbolOffset); + isec->data = isec->data.slice(0, symbolOffset); + } // By construction, the symbol will be at offset zero in the new // subsection. diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -23,6 +23,7 @@ class InputFile; class OutputSection; +class Defined; class InputSection { public: @@ -54,8 +55,20 @@ uint32_t align = 1; uint32_t flags = 0; uint32_t callSiteCount = 0; - bool isFinal = false; // is address assigned? + // is address assigned? + bool isFinal = false; + + bool isHashableForICF(bool isText) const; + void hashForICF(); + InputSection *canonical() { return replacement ? replacement : this; } + + // ICF can't fold functions with LSDA+personality + bool hasPersonality = false; + // Points to the surviving section after this one is folded by ICF + InputSection *replacement = nullptr; + // Equivalence-class ID for ICF + uint64_t icfEqClass[2] = {0, 0}; ArrayRef data; std::vector relocs; @@ -98,6 +111,8 @@ return isec->kind() == ConcatKind; } + void foldIdentical(ConcatInputSection *redundant); + // With subsections_via_symbols, most symbols have their own InputSection, // and for weak symbols (e.g. from inline functions), only the // InputSection from one translation unit will make it to the output, diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -12,6 +12,7 @@ #include "Symbols.h" #include "SyntheticSections.h" #include "Target.h" +#include "UnwindInfoSection.h" #include "Writer.h" #include "lld/Common/Memory.h" #include "llvm/Support/Endian.h" @@ -44,6 +45,67 @@ return sym->getVA(); } +// ICF needs to hash any section that might potentially be duplicated so +// that it can match on content rather than identity. +bool InputSection::isHashableForICF(bool isText) const { + if (auto const *concatIsec = dyn_cast(this)) + if (concatIsec->shouldOmitFromOutput()) + return false; + switch (sectionType(flags)) { + case S_REGULAR: + if (isText) + return !hasPersonality; + // One might hope that we could hash __TEXT,__const subsections to fold + // references to duplicated values, but alas, many tests fail. + return false; + case S_CSTRING_LITERALS: + case S_4BYTE_LITERALS: + case S_8BYTE_LITERALS: + case S_16BYTE_LITERALS: + case S_LITERAL_POINTERS: + // FIXME(gkm): once literal sections are deduplicated, their content and + // identity correlate, so we can assign unique IDs to them rather than hash + // them. + return true; + case S_ZEROFILL: + case S_GB_ZEROFILL: + case S_NON_LAZY_SYMBOL_POINTERS: + case S_LAZY_SYMBOL_POINTERS: + case S_SYMBOL_STUBS: + case S_MOD_INIT_FUNC_POINTERS: + case S_MOD_TERM_FUNC_POINTERS: + case S_COALESCED: + case S_INTERPOSING: + case S_DTRACE_DOF: + case S_LAZY_DYLIB_SYMBOL_POINTERS: + case S_THREAD_LOCAL_REGULAR: + case S_THREAD_LOCAL_ZEROFILL: + case S_THREAD_LOCAL_VARIABLES: + case S_THREAD_LOCAL_VARIABLE_POINTERS: + case S_THREAD_LOCAL_INIT_FUNCTION_POINTERS: + return false; + default: + llvm_unreachable("Section type"); + } +} + +void InputSection::hashForICF() { + assert(data.data()); // zeroFill section data has nullptr with non-zero size + assert(icfEqClass[0] == 0); // don't overwrite a unique ID! + // Turn-on the top bit to guarantee that valid hashes have no collisions + // with the small-integer unique IDs for ICF-ineligible sections + icfEqClass[0] = xxHash64(data) | (1ull << 63); +} + +void ConcatInputSection::foldIdentical(ConcatInputSection *copy) { + align = std::max(align, copy->align); + copy->live = false; + copy->wasCoalesced = true; + numRefs += copy->numRefs; + copy->numRefs = 0; + copy->replacement = this; +} + void ConcatInputSection::writeTo(uint8_t *buf) { assert(!shouldOmitFromOutput()); diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -275,6 +275,13 @@ HelpText<"Disable infra for branches beyond the maximum branch distance.">, Flags<[HelpHidden]>, Group; +def icf: Separate<["-"], "icf">, + HelpText<"Set level for identical code folding (default: none)">, + MetaVarName<"[none,safe,all]">, + Group; +def no_deduplicate : Flag<["-"], "no_deduplicate">, + HelpText<"Disable code deduplicaiton (synonym for `-icf none')">, + Group; def grp_version : OptionGroup<"version">, HelpText<"VERSION TARGETING">; @@ -601,10 +608,6 @@ HelpText<"Fail if any symbols are weak imports, allowed to be NULL at runtime">, Flags<[HelpHidden]>, Group; -def no_deduplicate : Flag<["-"], "no_deduplicate">, - HelpText<"Omit the deduplication pass">, - Flags<[HelpHidden]>, - Group; def verbose_deduplicate : Flag<["-"], "verbose_deduplicate">, HelpText<"Print function names eliminated by deduplication and the total size of code savings">, Flags<[HelpHidden]>, diff --git a/lld/MachO/Symbols.cpp b/lld/MachO/Symbols.cpp --- a/lld/MachO/Symbols.cpp +++ b/lld/MachO/Symbols.cpp @@ -40,7 +40,9 @@ // no_dead_strip or live_support. In that case, the section will know // that it's live but `used` might be false. Non-absolute symbols always // have to use the section's `live` bit as source of truth. - return d->isAbsolute() ? used : d->isec->isLive(d->value); + if (d->isAbsolute()) + return used; + return d->isec->canonical()->isLive(d->value); } assert(!isa(this) && @@ -57,7 +59,7 @@ if (isAbsolute()) return value; - if (!isec->isFinal) { + if (!isec->canonical()->isFinal) { // A target arch that does not use thunks ought never ask for // the address of a function that has not yet been finalized. assert(target->usesThunks()); @@ -68,7 +70,7 @@ // expedient to return a contrived out-of-range address. return TargetInfo::outOfRangeVA; } - return isec->getVA(value); + return isec->canonical()->getVA(value); } uint64_t DylibSymbol::getVA() const { diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -647,6 +647,9 @@ if (const auto *defined = dyn_cast(sym)) { if (!defined->isec || !isCodeSection(defined->isec) || !defined->isLive()) continue; + if (const auto *concatIsec = dyn_cast(defined->isec)) + if (concatIsec->shouldOmitFromOutput()) + continue; // TODO: Add support for thumbs, in that case // the lowest bit of nextAddr needs to be set to 1. addrs.push_back(defined->getVA()); diff --git a/lld/MachO/UnwindInfoSection.h b/lld/MachO/UnwindInfoSection.h --- a/lld/MachO/UnwindInfoSection.h +++ b/lld/MachO/UnwindInfoSection.h @@ -13,9 +13,6 @@ #include "SyntheticSections.h" #include "mach-o/compact_unwind_encoding.h" -#include "llvm/ADT/DenseMap.h" - -#include namespace lld { namespace macho { diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp --- a/lld/MachO/UnwindInfoSection.cpp +++ b/lld/MachO/UnwindInfoSection.cpp @@ -143,12 +143,18 @@ // work. But since there are usually just few personality functions // that are referenced from many places, at least some of them likely // live, it wouldn't reduce number of got entries. - for (Reloc &r : isec->relocs) { + for (size_t i = 0; i < isec->relocs.size(); ++i) { + Reloc &r = isec->relocs[i]; assert(target->hasAttr(r.type, RelocAttrBits::UNSIGNED)); if (r.offset % sizeof(CompactUnwindEntry) != offsetof(CompactUnwindEntry, personality)) continue; + Reloc &rFunc = isec->relocs[++i]; + assert(r.offset == + rFunc.offset + offsetof(CompactUnwindEntry, personality)); + rFunc.referent.get()->hasPersonality = true; + if (auto *s = r.referent.dyn_cast()) { if (auto *undefined = dyn_cast(s)) { treatUndefinedSymbol(*undefined); diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -9,6 +9,7 @@ #include "Writer.h" #include "ConcatOutputSection.h" #include "Config.h" +#include "ICF.h" #include "InputFiles.h" #include "InputSection.h" #include "MapFile.h" @@ -51,6 +52,7 @@ void scanSymbols(); template void createOutputSections(); template void createLoadCommands(); + void foldIdenticalSections(); void finalizeAddresses(); void finalizeLinkEditSegment(); void assignAddresses(OutputSegment *); @@ -76,6 +78,7 @@ LCUuid *uuidCommand = nullptr; OutputSegment *linkEditSegment = nullptr; + DenseMap concatOutputSections; }; // LC_DYLD_INFO_ONLY stores the offsets of symbol import/export information. @@ -885,7 +888,6 @@ } // Then add input sections to output sections. - DenseMap concatOutputSections; for (const auto &p : enumerate(inputSections)) { InputSection *isec = p.value(); OutputSection *osec; @@ -940,6 +942,47 @@ linkEditSegment = getOrCreateOutputSegment(segment_names::linkEdit); } +void Writer::foldIdenticalSections() { + if (config->icfLevel == ICFLevel::none) + return; + ConcatOutputSection *textOutputSection = concatOutputSections.lookup( + maybeRenameSection({segment_names::text, section_names::text})); + if (textOutputSection == nullptr) + return; + + TimeTraceScope timeScope("Fold Identical Code Sections"); + // The ICF equivalence-class segregation algorithm relies on pre-computed + // hashes of InputSection::data for the ConcatOutputSection::inputs and all + // sections referenced by their relocs. We could recursively traverse the + // relocs to find every referenced InputSection, but that precludes easy + // parallelization. Therefore, we hash every InputSection here where we have + // them all accessible as a simple vector. + std::vector hashable; + // If an InputSection is ineligible for ICF, we give it a unique ID to force + // it into an unfoldable singleton equivalence class. Begin the unique-ID + // space at inputSections.size(), so that it will never intersect with + // equivalence-class IDs which begin at 0. Since hashes & unique IDs never + // coexist with equivalence-class IDs, this is not necessary, but might help + // someone keep the numbers straight in case we ever need to debug the + // ICF::segregate() + uint64_t icfUniqueID = inputSections.size(); + for (InputSection *isec : inputSections) { + if (isec->isHashableForICF(isec->parent == textOutputSection)) + hashable.push_back(isec); + else + isec->icfEqClass[0] = ++icfUniqueID; + } + parallelForEach(hashable, [](InputSection *isec) { isec->hashForICF(); }); + // Now that every input section is either hashed or marked as unique, + // run the segregation algorithm to detect foldable subsections + ICF(textOutputSection->inputs).run(); + size_t oldSize = textOutputSection->inputs.size(); + textOutputSection->eraseOmittedInputSections(); + size_t newSize = textOutputSection->inputs.size(); + log("ICF kept " + Twine(newSize) + " removed " + Twine(oldSize - newSize) + + " of " + Twine(oldSize)); +} + void Writer::finalizeAddresses() { TimeTraceScope timeScope("Finalize addresses"); uint64_t pageSize = target->getPageSize(); @@ -1071,6 +1114,7 @@ in.stubHelper->setup(); scanSymbols(); createOutputSections(); + foldIdenticalSections(); // After this point, we create no new segments; HOWEVER, we might // yet create branch-range extension thunks for architectures whose // hardware call instructions have limited range, e.g., ARM(64). diff --git a/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libSystem.tbd b/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libSystem.tbd --- a/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libSystem.tbd +++ b/lld/test/MachO/Inputs/MacOSX.sdk/usr/lib/libSystem.tbd @@ -68,5 +68,5 @@ umbrella: System exports: - targets: [ x86_64-macos, x86_64-maccatalyst, arm64-macos ] - symbols: [ ___nan ] + symbols: [ ___nan, ___isnan, ___inf, ___isinf ] ... diff --git a/lld/test/MachO/icf-options.s b/lld/test/MachO/icf-options.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/icf-options.s @@ -0,0 +1,65 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; mkdir %t + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/main.o +# RUN: %lld -lSystem -icf all -o %t/all %t/main.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DIAG-EMPTY --allow-empty +# RUN: %lld -lSystem -icf none -o %t/none %t/main.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DIAG-EMPTY --allow-empty +# RUN: %lld -lSystem -no_deduplicate -o %t/no_dedup %t/main.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DIAG-EMPTY --allow-empty +# RUN: not %lld -lSystem -icf safe -o %t/safe %t/main.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DIAG-SAFE +# RUN: not %lld -lSystem -icf junk -o %t/junk %t/main.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DIAG-JUNK +# RUN: not %lld -lSystem -icf all -no_deduplicate -o %t/clash %t/main.o 2>&1 \ +# RUN: | FileCheck %s --check-prefix=DIAG-CLASH + +# DIAG-EMPTY-NOT: {{.}} +# DIAG-SAFE: `-icf safe' is not yet implemented, reverting to `none' +# DIAG-JUNK: unknown -icf OPTION `junk', defaulting to `none' +# DIAG-CLASH: `-icf all' conflicts with -no_deduplicate, setting to `none' + +# RUN: llvm-objdump -d --syms %t/all | FileCheck %s --check-prefix=FOLD +# RUN: llvm-objdump -d --syms %t/none | FileCheck %s --check-prefix=NOOP +# RUN: llvm-objdump -d --syms %t/no_dedup | FileCheck %s --check-prefix=NOOP + +# FOLD-LABEL: SYMBOL TABLE: +# FOLD: [[#%x,MAIN:]] g F __TEXT,__text _main +# FOLD: [[#%x,F:]] g F __TEXT,__text _f1 +# FOLD: [[#%x,F]] g F __TEXT,__text _f2 + +# FOLD-LABEL: Disassembly of section __TEXT,__text: +# FOLD: [[#%x,MAIN]] <_main>: +# FOLD-NEXT: callq 0x[[#%x,F]] <_f2> +# FOLD-NEXT: callq 0x[[#%x,F]] <_f2> + +# NOOP-LABEL: SYMBOL TABLE: +# NOOP: [[#%x,MAIN:]] g F __TEXT,__text _main +# NOOP: [[#%x,F1:]] g F __TEXT,__text _f1 +# NOOP: [[#%x,F2:]] g F __TEXT,__text _f2 + +# NOOP-LABEL: Disassembly of section __TEXT,__text: +# NOOP: [[#%x,MAIN]] <_main>: +# NOOP-NEXT: callq 0x[[#%x,F1]] <_f1> +# NOOP-NEXT: callq 0x[[#%x,F2]] <_f2> + +.subsections_via_symbols +.text +.p2align 2 + +.globl _f1 +_f1: + movl $0, %eax + ret + +.globl _f2 +_f2: + movl $0, %eax + ret + +.globl _main +_main: + callq _f1 + callq _f2 + ret diff --git a/lld/test/MachO/icf-scale.s b/lld/test/MachO/icf-scale.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/icf-scale.s @@ -0,0 +1,81 @@ +# REQUIRES: x86 +# RUN: rm -rf %t* + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: %lld -lSystem -icf all -o %t %t.o +# RUN: llvm-objdump -d --syms %t | FileCheck %s + +## When ICF has fewer than 1 Ki functions to segregate into equivalence classes, +## it uses a sequential algorithm to avoid the overhead of threading. +## At 1 Ki functions or more, when threading begins to pay-off, ICF employs its +## parallel segregation algorithm. Here we generate 4 Ki functions to exercise +## the parallel algorithm. There are 4 unique function bodies, each replicated +## 1 Ki times. The resulting folded program should retain one instance for each +## of the four unique functions. + +# CHECK-LABEL: SYMBOL TABLE: +# CHECK: [[#%x,G0:]] g F __TEXT,__text _g000000 +# CHECK: [[#%x,G1:]] g F __TEXT,__text _g100000 +# CHECK: [[#%x,G2:]] g F __TEXT,__text _g200000 +# CHECK: [[#%x,G3:]] g F __TEXT,__text _g300000 +## . . . many intervening _gXXXXXX symbols +# CHECK: [[#%x,G0]] g F __TEXT,__text _g033333 +# CHECK: [[#%x,G1]] g F __TEXT,__text _g133333 +# CHECK: [[#%x,G2]] g F __TEXT,__text _g233333 +# CHECK: [[#%x,G3]] g F __TEXT,__text _g333333 + +# CHECK-LABEL: Disassembly of section __TEXT,__text: +# CHECK-DAG: [[#%x,G0]] <_g033333>: +# CHECK-DAG: [[#%x,G1]] <_g133333>: +# CHECK-DAG: [[#%x,G2]] <_g233333>: +# CHECK-DAG: [[#%x,G3]] <_g333333>: +# CHECK-NOT: [[#]] <_g{{.*}}>: + +.subsections_via_symbols +.text +.p2align 2 + +.macro gen_4 c + .globl _g0\c, _g1\c, _g2\c, _g3\c + _g0\c:; movl $0, %eax; ret + _g1\c:; movl $1, %eax; ret + _g2\c:; movl $2, %eax; ret + _g3\c:; movl $3, %eax; ret +.endm + +.macro gen_16 c + gen_4 0\c + gen_4 1\c + gen_4 2\c + gen_4 3\c +.endm + +.macro gen_64 c + gen_16 0\c + gen_16 1\c + gen_16 2\c + gen_16 3\c +.endm + +.macro gen_256 c + gen_64 0\c + gen_64 1\c + gen_64 2\c + gen_64 3\c +.endm + +.macro gen_1024 c + gen_256 0\c + gen_256 1\c + gen_256 2\c + gen_256 3\c +.endm + +gen_1024 0 +gen_1024 1 +gen_1024 2 +gen_1024 3 + +.globl _main +_main: + ret diff --git a/lld/test/MachO/icf.s b/lld/test/MachO/icf.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/icf.s @@ -0,0 +1,205 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; mkdir %t + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/main.o +# RUN: %lld -lSystem -icf all -o %t/main %t/main.o +# RUN: llvm-objdump -d --syms %t/main | FileCheck %s + +# CHECK-LABEL: SYMBOL TABLE: +# CHECK: [[#%x,MAIN:]] g F __TEXT,__text _main +# CHECK: [[#%x,A:]] g F __TEXT,__text _a1 +# CHECK: [[#%x,A]] g F __TEXT,__text _a2 +# CHECK: [[#%x,C:]] g F __TEXT,__text _c +# CHECK: [[#%x,D:]] g F __TEXT,__text _d +# CHECK: [[#%x,E:]] g F __TEXT,__text _e +# CHECK: [[#%x,F:]] g F __TEXT,__text _f +# CHECK: [[#%x,G:]] g F __TEXT,__text _g +# CHECK: [[#%x,SR:]] g F __TEXT,__text _sr1 +# CHECK: [[#%x,SR]] g F __TEXT,__text _sr2 +# CHECK: [[#%x,MR:]] g F __TEXT,__text _mr1 +# CHECK: [[#%x,MR]] g F __TEXT,__text _mr2 +### FIXME: Mutually-recursive functions with identical bodies (see below) +# COM: [[#%x,XR:]] g F __TEXT,__text _xr1 +# COM: [[#%x,XR]] g F __TEXT,__text _xr2 + +# CHECK-LABEL: Disassembly of section __TEXT,__text: +# CHECK: [[#%x,MAIN]] <_main>: +# CHECK-NEXT: callq 0x[[#%x,A]] <_a2> +# CHECK-NEXT: callq 0x[[#%x,A]] <_a2> +# CHECK-NEXT: callq 0x[[#%x,C]] <_c> +# CHECK-NEXT: callq 0x[[#%x,D]] <_d> +# CHECK-NEXT: callq 0x[[#%x,E]] <_e> +# CHECK-NEXT: callq 0x[[#%x,F]] <_f> +# CHECK-NEXT: callq 0x[[#%x,G]] <_g> +# CHECK-NEXT: callq 0x[[#%x,SR]] <_sr2> +# CHECK-NEXT: callq 0x[[#%x,SR]] <_sr2> +# CHECK-NEXT: callq 0x[[#%x,MR]] <_mr2> +# CHECK-NEXT: callq 0x[[#%x,MR]] <_mr2> +### FIXME: Mutually-recursive functions with identical bodies (see below) +# COM-NEXT: callq 0x[[#%x,XR]] <_xr2> +# COM-NEXT: callq 0x[[#%x,XR]] <_xr2> + +### TODO: +### * Fold: funcs only differ in alignment +### * No fold: func has personality/LSDA +### * No fold: reloc references to absolute symbols with different values +### * No fold: func is weak? preemptable? +### * No fold: relocs to N_ALT_ENTRY symbols + +.subsections_via_symbols +.text + +### Fold: _a1 & _a2 have identical bodies, flags, relocs + +.globl _a1 +.p2align 4, 0x90 +_a1: + callq _d + mov ___nan@GOTPCREL(%rip), %rax + callq ___isnan + movl $0, %eax + ret + +.globl _a2 +.p2align 4, 0x90 +_a2: + callq _d + mov ___nan@GOTPCREL(%rip), %rax + callq ___isnan + movl $0, %eax + ret + +### No fold: _c has slightly different body from _a1 & _a2 + +.globl _c +.p2align 4, 0x90 +_c: + callq _d + mov ___nan@GOTPCREL(%rip), %rax + callq ___isnan + movl $1, %eax + ret + +### No fold: _d has the same body as _a1 & _a2, but _d is recursive! + +.globl _d +.p2align 4, 0x90 +_d: + callq _d + mov ___nan@GOTPCREL(%rip), %rax + callq ___isnan + movl $0, %eax + ret + +### No fold: the body of _e is longer + +.globl _e +.p2align 4, 0x90 +_e: + callq _d + mov ___nan@GOTPCREL(%rip), %rax + callq ___isnan + movl $0, %eax + ret + nop + +### No fold: the dylib symbols differ + +.globl _f +.p2align 4, 0x90 +_f: + callq _d + mov ___inf@GOTPCREL(%rip), %rax + callq ___isnan + movl $0, %eax + ret + +.globl _g +.p2align 4, 0x90 +_g: + callq _d + mov ___inf@GOTPCREL(%rip), %rax + callq ___isinf + movl $0, %eax + ret + +### Fold: Simple recursion + +.globl _sr1 +.p2align 4, 0x90 +_sr1: + callq _sr1 + movl $2, %eax + ret + +.globl _sr2 +.p2align 4, 0x90 +_sr2: + callq _sr2 + movl $2, %eax + ret + +### Fold: Mutually-recursive functions with symmetric bodies + +.globl _mr1 +.p2align 4, 0x90 +_mr1: + callq _mr1 # call myself + callq _mr2 # call my twin + movl $1, %eax + ret + +.globl _mr2 +.p2align 4, 0x90 +_mr2: + callq _mr2 # call myself + callq _mr1 # call my twin + movl $1, %eax + ret + +### Fold: Mutually-recursive functions with identical bodies +### +### FIXME: This test is currently broken. Recursive call sites have no relocs +### and the non-zero displacement field is already written to the section +### data, while non-recursive call sites use symbol relocs and section data +### contains zeros in the displacement field. Thus, ICF's equalsConstant() +### finds that the section data doesn't match. +### +### ELF folds this case properly because it emits symbol relocs for all calls, +### even recursive ones. + +.globl _xr1 +.p2align 4, 0x90 +_xr1: + callq _xr1 # call myself + callq _xr2 # call my twin + movl $3, %eax + ret + +.globl _xr2 +.p2align 4, 0x90 +_xr2: + callq _xr1 # call my twin + callq _xr2 # call myself + movl $3, %eax + ret + +### + +.globl _main +.p2align 4, 0x90 +_main: + callq _a1 + callq _a2 + callq _c + callq _d + callq _e + callq _f + callq _g + callq _sr1 + callq _sr2 + callq _mr1 + callq _mr2 + callq _xr1 + callq _xr2 + ret