diff --git a/lld/MachO/ConcatOutputSection.h b/lld/MachO/ConcatOutputSection.h --- a/lld/MachO/ConcatOutputSection.h +++ b/lld/MachO/ConcatOutputSection.h @@ -45,11 +45,23 @@ std::vector inputs; std::vector thunks; + // ICF needs a copy of the inputs vector because its equivalence-class + // segregation algorithm destroys the proper sequence. + std::vector icfInputs; static bool classof(const OutputSection *sec) { return sec->kind() == ConcatKind; } + void foldIdenticalSections(); + void segregate( + size_t begin, size_t end, + std::function equals); + size_t findBoundary(size_t begin, size_t end); + void forEachClassRange(size_t begin, size_t end, + std::function func); + void forEachClass(std::function func); + private: void mergeFlags(InputSection *input); diff --git a/lld/MachO/ConcatOutputSection.cpp b/lld/MachO/ConcatOutputSection.cpp --- a/lld/MachO/ConcatOutputSection.cpp +++ b/lld/MachO/ConcatOutputSection.cpp @@ -16,9 +16,12 @@ #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/Parallel.h" #include "llvm/Support/ScopedPrinter.h" +#include "llvm/Support/TimeProfiler.h" #include +#include using namespace llvm; using namespace llvm::MachO; @@ -360,3 +363,261 @@ flags |= input->flags; flags &= pureMask; } + +// ICF = Identical C(ode|OMDAT) Folding +// +// Summary of segments & sections: +// +// Only input sections belonging to the same output section are eligible for +// folding. Folding cannot occur across output-section boundaries, therefore +// ConcatOutputSection is the natural place to run ICF. +// +// The __TEXT segment is readonly at the MMU. Some sections are already +// deduplicated elsewhere (__TEXT,__cstring & __TEXT,__literal*) and some are +// synthetic and inherently free of duplicates (__TEXT,__stubs & +// __TEXT,__unwind_info). We only run ICF on __TEXT,__text. One might hope ICF +// could work on __TEXT,__concat, but doing so induces many test failures. +// +// The __LINKEDIT segment is readonly at the MMU, yet entirely synthetic, and +// thus ineligible for ICF. +// +// The __DATA_CONST segment is read/write at the MMU, but is logically const to +// the application after dyld applies fixups to pointer data. Some sections are +// deduplicated elsewhere (__DATA_CONST,__cfstring), and some are synthetic +// (__DATA_CONST,__got). There are no ICF opportunities here. +// +// The __DATA segment is read/write at the MMU, and as application-writeable +// data, none of its sections are eligible for ICF. +// +// Please see the large block comment in lld/ELF/ICF.cpp for an explanation +// of the segregation algorithm. +// +// FIXME(gkm): implement keep-unique attributes +// FIXME(gkm): implement address-significance tables for MachO object files + +static unsigned icfPass = 0; +static std::atomic icfRepeat{false}; + +// (en|dis)able expensive debug logs +const bool debugLogUnequal = false; + +// Compare everything except the relocation referents +static bool equalsConstant(const InputSection *ia, const InputSection *ib) { + if (ia->data.size() != ib->data.size()) + return false; + StringRef na = ia->defined ? ia->defined->getName() : "???"; + StringRef nb = ib->defined ? ib->defined->getName() : "???"; + std::string size = std::to_string(ia->data.size()); + auto logUnequal = [size, na, nb](StringRef caption) -> bool { + if (debugLogUnequal) + log("ICF: " + caption + ": " + size + " " + na + " != " + nb); + return true; + }; + if (ia->data != ib->data && logUnequal("data")) + return false; + if (ia->flags != ib->flags && logUnequal("flags")) + return false; + if (ia->relocs.size() != ib->relocs.size() && logUnequal("reloc#")) + return false; + int n = 0; + return std::equal(ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), + [&](const Reloc &ra, const Reloc &rb) { + std::string nStr = "[" + std::to_string(n++) + "] "; + if (ra.type != rb.type && logUnequal(nStr + "type")) + return false; + if (ra.pcrel != rb.pcrel && logUnequal(nStr + "pcrel")) + return false; + if (ra.length != rb.length && logUnequal(nStr + "length")) + return false; + if (ra.offset != rb.offset && logUnequal(nStr + "offset")) + return false; + if (ra.addend != rb.addend && logUnequal(nStr + "addend")) + return false; + if (ra.referent.is() != + rb.referent.is() && + logUnequal(nStr + "sym/sec")) + return false; + return true; + }); +} + +// Compare only the relocation referents +static bool equalsVariable(const InputSection *ia, const InputSection *ib) { + auto logUnequal = [ia, ib](StringRef caption) -> bool { + if (debugLogUnequal) { + StringRef na = ia->defined ? ia->defined->getName() : "???"; + StringRef nb = ib->defined ? ib->defined->getName() : "???"; + std::string size = std::to_string(ia->data.size()); + log("ICF: " + caption + ": " + size + " " + na + " != " + nb); + } + return true; + }; + assert(ia->relocs.size() == ib->relocs.size()); + int n = 0; + return std::equal( + ia->relocs.begin(), ia->relocs.end(), ib->relocs.begin(), + [&](const Reloc &ra, const Reloc &rb) { + std::string nStr = "[" + std::to_string(n++) + "] "; + if (ra.referent == rb.referent) + return true; + if (ra.referent.is()) { + const auto *sa = ra.referent.get(); + const auto *sb = rb.referent.get(); + if (sa->kind() != sb->kind() && logUnequal(nStr + "sym kind")) + return false; + if (isa(sa) && isa(sb)) { + const auto *da = dyn_cast(sa); + const auto *db = dyn_cast(sb); + return !(da->isec->icfEquivalenceClass[icfPass % 2] != + db->isec->icfEquivalenceClass[icfPass % 2] && + logUnequal(nStr + "def hash")); + } + if (isa(sa) && isa(sb)) { + const auto *da = dyn_cast(sa); + const auto *db = dyn_cast(sb); + return !(da->stubsHelperIndex != db->stubsHelperIndex && + logUnequal(nStr + "stub index")); + } + llvm_unreachable("equalsVariable symbol kind"); + } + const auto *sa = ra.referent.get(); + const auto *sb = rb.referent.get(); + return (!(sa->icfEquivalenceClass[icfPass % 2] != + sb->icfEquivalenceClass[icfPass % 2] && + logUnequal(nStr + "sec hash"))); + }); +} + +// Find the first InputSection after BEGIN whose equivalence class differs +size_t ConcatOutputSection::findBoundary(size_t begin, size_t end) { + uint64_t beginHash = icfInputs[begin]->icfEquivalenceClass[icfPass % 2]; + for (size_t i = begin + 1; i < end; ++i) + if (beginHash != icfInputs[i]->icfEquivalenceClass[icfPass % 2]) + return i; + return end; +} + +// Invoke FUNC on subranges with matching equivalence class +void ConcatOutputSection::forEachClassRange( + size_t begin, size_t end, std::function func) { + while (begin < end) { + size_t mid = findBoundary(begin, end); + func(begin, mid); + begin = mid; + } +} + +// Split icfInputs into shards, then parallelize invocation of FUNC on subranges +// with matching equivalence class +void ConcatOutputSection::forEachClass( + std::function func) { + // Only use threads when the benefits outweigh the overhead. + const size_t threadingThreshold = 1024; + if (icfInputs.size() < threadingThreshold) { + forEachClassRange(0, icfInputs.size(), func); + ++icfPass; + return; + } + + // Shard into non-overlapping intervals, and call FUNC in parallel. The + // sharding must be completed before any calls to FUNC are made so that FUNC + // can modify the InputSection in its shard without causing data races. + const size_t shards = 256; + size_t step = icfInputs.size() / shards; + size_t boundaries[shards + 1]; + boundaries[0] = 0; + boundaries[shards] = icfInputs.size(); + parallelForEachN(1, shards, [&](size_t i) { + boundaries[i] = findBoundary((i - 1) * step, icfInputs.size()); + }); + parallelForEachN(1, shards + 1, [&](size_t i) { + if (boundaries[i - 1] < boundaries[i]) { + forEachClassRange(boundaries[i - 1], boundaries[i], func); + } + }); + ++icfPass; +} + +void ConcatOutputSection::foldIdenticalSections() { + icfInputs.assign(inputs.begin(), inputs.end()); + + // Into each origin-section hash, combine all reloc referent section hashes. + for (icfPass = 0; icfPass < 2; ++icfPass) { + parallelForEach(icfInputs, [&](InputSection *isec) { + uint64_t hash = isec->icfEquivalenceClass[icfPass % 2]; + for (const Reloc &r : isec->relocs) { + if (auto *sym = r.referent.dyn_cast()) { + if (auto *defined = dyn_cast(sym)) + hash += defined->isec->icfEquivalenceClass[icfPass % 2]; + else if (auto *dylibSym = dyn_cast(sym)) + hash += static_cast(dylibSym->stubsHelperIndex) << 32; + else + llvm_unreachable("foldIdenticalSections symbol kind"); + } + } + // Set MSB to 1 to avoid collisions with non-hashed classes. + isec->icfEquivalenceClass[(icfPass + 1) % 2] = hash | (1ull << 63); + }); + } + + llvm::stable_sort( + icfInputs, [](const InputSection *a, const InputSection *b) { + return a->icfEquivalenceClass[0] < b->icfEquivalenceClass[0]; + }); + forEachClass( + [&](size_t begin, size_t end) { segregate(begin, end, equalsConstant); }); + + // Split equivalence groups by comparing relocations until convergence + do { + icfRepeat = false; + forEachClass([&](size_t begin, size_t end) { + segregate(begin, end, equalsVariable); + }); + } while (icfRepeat); + log("ICF needed " + Twine(icfPass) + " iterations"); + + // Fold sections within equivalence classes + forEachClass([&](size_t begin, size_t end) { + if (end - begin < 2) + return; + InputSection *beginIsec = icfInputs[begin]; + for (size_t i = begin + 1; i < end; ++i) + beginIsec->foldIdentical(icfInputs[i]); + }); + + // Remove the duplicates from inputs + size_t oldSize = inputs.size(); + inputs.erase(std::remove_if(inputs.begin(), inputs.end(), + [](const InputSection *isec) -> bool { + return isec->shouldOmitFromOutput(); + }), + inputs.end()); + log("ICF kept " + std::to_string(inputs.size()) + " removed " + + std::to_string(oldSize - inputs.size()) + " of " + + std::to_string(oldSize)); +} + +// Split an equivalence class into smaller classes. +void ConcatOutputSection::segregate( + size_t begin, size_t end, + std::function equals) { + while (begin < end) { + // Divide [Begin, End) into two. Let Mid be the start index of the + // second group. + auto bound = std::stable_partition( + icfInputs.begin() + begin + 1, icfInputs.begin() + end, + [&](InputSection *isec) { return equals(icfInputs[begin], isec); }); + size_t mid = bound - icfInputs.begin(); + + // Split [Begin, End) into [Begin, Mid) and [Mid, End). We use Mid as an + // equivalence class ID because every group ends with a unique index. + for (size_t i = begin; i < mid; ++i) + icfInputs[i]->icfEquivalenceClass[(icfPass + 1) % 2] = mid; + + // If we created a group, we need to iterate the main loop again. + if (mid != end) + icfRepeat = true; + + begin = mid; + } +} diff --git a/lld/MachO/Config.h b/lld/MachO/Config.h --- a/lld/MachO/Config.h +++ b/lld/MachO/Config.h @@ -51,6 +51,13 @@ dynamic_lookup, }; +enum class ICFLevel { + unknown, + none, + safe, + all, +}; + struct SectionAlign { llvm::StringRef segName; llvm::StringRef sectName; @@ -116,6 +123,7 @@ NamespaceKind namespaceKind = NamespaceKind::twolevel; UndefinedSymbolTreatment undefinedSymbolTreatment = UndefinedSymbolTreatment::error; + ICFLevel icfLevel = ICFLevel::none; llvm::MachO::HeaderFileType outputType; std::vector systemLibraryRoots; std::vector librarySearchPaths; diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -544,13 +544,14 @@ // FIXME: CommonSymbol should store isReferencedDynamically, noDeadStrip // and pass them on here. - replaceSymbol(sym, sym->getName(), isec->file, isec, /*value=*/0, - /*size=*/0, - /*isWeakDef=*/false, - /*isExternal=*/true, common->privateExtern, - /*isThumb=*/false, - /*isReferencedDynamically=*/false, - /*noDeadStrip=*/false); + isec->defined = replaceSymbol( + sym, sym->getName(), isec->file, isec, /*value=*/0, + /*size=*/0, + /*isWeakDef=*/false, + /*isExternal=*/true, common->privateExtern, + /*isThumb=*/false, + /*isReferencedDynamically=*/false, + /*noDeadStrip=*/false); } } @@ -687,6 +688,24 @@ return treatment; } +static ICFLevel getICFLevel(const ArgList &args) { + StringRef icfLevelStr = args.getLastArgValue(OPT_icf); + auto icfLevel = StringSwitch(icfLevelStr) + .Cases("none", "", ICFLevel::none) + .Case("safe", ICFLevel::safe) + .Case("all", ICFLevel::all) + .Default(ICFLevel::unknown); + if (icfLevel == ICFLevel::unknown) { + warn(Twine("unknown -icf OPTION `") + icfLevelStr + + "', defaulting to `none'"); + icfLevel = ICFLevel::none; + } else if (icfLevel == ICFLevel::safe) { + warn(Twine("`-icf safe' is not yet implemented, reverting to `none'")); + icfLevel = ICFLevel::none; + } + return icfLevel; +} + static void warnIfDeprecatedOption(const Option &opt) { if (!opt.getGroup().isValid()) return; @@ -1080,6 +1099,8 @@ config->undefinedSymbolTreatment = getUndefinedSymbolTreatment(args); + config->icfLevel = getICFLevel(args); + if (config->outputType == MH_EXECUTE) config->entry = symtab->addUndefined(args.getLastArgValue(OPT_e, "_main"), /*file=*/nullptr, diff --git a/lld/MachO/InputFiles.cpp b/lld/MachO/InputFiles.cpp --- a/lld/MachO/InputFiles.cpp +++ b/lld/MachO/InputFiles.cpp @@ -488,19 +488,21 @@ if (isWeakDefCanBeHidden) isPrivateExtern = true; - return symtab->addDefined( + isec->defined = symtab->addDefined( name, isec->file, isec, value, size, sym.n_desc & N_WEAK_DEF, isPrivateExtern, sym.n_desc & N_ARM_THUMB_DEF, sym.n_desc & REFERENCED_DYNAMICALLY, sym.n_desc & N_NO_DEAD_STRIP); + return isec->defined; } assert(!isWeakDefCanBeHidden && "weak_def_can_be_hidden on already-hidden symbol?"); - return make( + isec->defined = make( name, isec->file, isec, value, size, sym.n_desc & N_WEAK_DEF, /*isExternal=*/false, /*isPrivateExtern=*/false, sym.n_desc & N_ARM_THUMB_DEF, sym.n_desc & REFERENCED_DYNAMICALLY, sym.n_desc & N_NO_DEAD_STRIP); + return isec->defined; } // Absolute symbols are defined symbols that do not have an associated @@ -615,10 +617,15 @@ } auto *nextIsec = make(*isec); - nextIsec->data = isec->data.slice(symbolOffset); nextIsec->numRefs = 0; nextIsec->wasCoalesced = false; - isec->data = isec->data.slice(0, symbolOffset); + if (isZeroFill(isec->flags)) { + nextIsec->data = {nullptr, isec->data.size() - symbolOffset}; + isec->data = {nullptr, symbolOffset}; + } else { + nextIsec->data = isec->data.slice(symbolOffset); + isec->data = isec->data.slice(0, symbolOffset); + } // By construction, the symbol will be at offset zero in the new // subsection. diff --git a/lld/MachO/InputSection.h b/lld/MachO/InputSection.h --- a/lld/MachO/InputSection.h +++ b/lld/MachO/InputSection.h @@ -21,6 +21,7 @@ class InputFile; class OutputSection; +class Defined; class InputSection { public: @@ -48,6 +49,15 @@ // How many symbols refer to this InputSection. uint32_t numRefs = 0; + bool isHashableForICF(const OutputSection *textOutputSection) const; + void hashForICF(); + void foldIdentical(InputSection *redundant); + + // Equivalence-class ID for ICF + uint64_t icfEquivalenceClass[2] = {0, 0}; + // Track the defined symbol that references this - for ICF logs + Defined *defined = nullptr; + // With subsections_via_symbols, most symbols have their own InputSection, // and for weak symbols (e.g. from inline functions), only the // InputSection from one translation unit will make it to the output, diff --git a/lld/MachO/InputSection.cpp b/lld/MachO/InputSection.cpp --- a/lld/MachO/InputSection.cpp +++ b/lld/MachO/InputSection.cpp @@ -12,9 +12,11 @@ #include "Symbols.h" #include "SyntheticSections.h" #include "Target.h" +#include "UnwindInfoSection.h" #include "Writer.h" #include "lld/Common/Memory.h" #include "llvm/Support/Endian.h" +#include "llvm/Support/xxhash.h" using namespace llvm; using namespace llvm::MachO; @@ -45,6 +47,77 @@ return sym->getVA(); } +// ICF needs to hash any section that might potentially be duplicated so +// that it can match on content rather than identity. +bool InputSection::isHashableForICF( + const OutputSection *textOutputSection) const { + if (shouldOmitFromOutput()) + return false; + switch (sectionType(flags)) { + case S_REGULAR: + if (parent == textOutputSection) + return !in.unwindInfo->funcHasPersonality(this); + // One might hope that we could hash __TEXT,__const subsections to fold + // references to duplicated values, but alas, many tests fail. + return false; + case S_CSTRING_LITERALS: + case S_4BYTE_LITERALS: + case S_8BYTE_LITERALS: + case S_16BYTE_LITERALS: + case S_LITERAL_POINTERS: + // FIXME(gkm): once literal sections are deduplicated, their content and + // identity correlate, so we can assign unique IDs to them rather than hash + // them. + return true; + case S_ZEROFILL: + case S_GB_ZEROFILL: + case S_NON_LAZY_SYMBOL_POINTERS: + case S_LAZY_SYMBOL_POINTERS: + case S_SYMBOL_STUBS: + case S_MOD_INIT_FUNC_POINTERS: + case S_MOD_TERM_FUNC_POINTERS: + case S_COALESCED: + case S_INTERPOSING: + case S_DTRACE_DOF: + case S_LAZY_DYLIB_SYMBOL_POINTERS: + case S_THREAD_LOCAL_REGULAR: + case S_THREAD_LOCAL_ZEROFILL: + case S_THREAD_LOCAL_VARIABLES: + case S_THREAD_LOCAL_VARIABLE_POINTERS: + case S_THREAD_LOCAL_INIT_FUNCTION_POINTERS: + return false; + default: + llvm_unreachable("Section type"); + } +} + +void InputSection::hashForICF() { + assert(data.data()); // zeroFill section data has nullptr with non-zero size + assert(icfEquivalenceClass[0] == 0); // don't overwrite a unique ID! + // Turn-on the top bit to guarantee that valid hashes have no + // collisions with the small-integer unique IDs for ICF-inelgible sections + icfEquivalenceClass[0] = xxHash64(data) | (1ull << 63); +} + +// (en|dis)able expensive debug logs +const bool debugLogFolds = false; + +void InputSection::foldIdentical(InputSection *copy) { + if (debugLogFolds) { + StringRef thisName = defined ? defined->getName() : "???"; + StringRef thatName = copy->defined ? copy->defined->getName() : "???"; + log("ICF: " + thisName + " replaces " + thatName + ", saves " + + to_string(getSize())); + align = std::max(align, copy->align); + } + copy->wasCoalesced = true; + if (copy->defined) { + numRefs++; + copy->defined->isec = this; + copy->numRefs--; + } +} + void InputSection::writeTo(uint8_t *buf) { assert(!shouldOmitFromOutput()); diff --git a/lld/MachO/Options.td b/lld/MachO/Options.td --- a/lld/MachO/Options.td +++ b/lld/MachO/Options.td @@ -262,6 +262,10 @@ HelpText<"Disable infra for branches beyond the maximum branch distance.">, Flags<[HelpHidden]>, Group; +def icf: Separate<["-"], "icf">, + HelpText<"Enable identical code folding at level (default: none)">, + MetaVarName<"[none,safe,all]">, + Group; def grp_version : OptionGroup<"version">, HelpText<"VERSION TARGETING">; diff --git a/lld/MachO/SymbolTable.cpp b/lld/MachO/SymbolTable.cpp --- a/lld/MachO/SymbolTable.cpp +++ b/lld/MachO/SymbolTable.cpp @@ -92,6 +92,8 @@ Defined *defined = replaceSymbol( s, name, file, isec, value, size, isWeakDef, /*isExternal=*/true, isPrivateExtern, isThumb, isReferencedDynamically, noDeadStrip); + if (isec) + isec->defined = defined; defined->overridesWeakDef = overridesWeakDef; return defined; } diff --git a/lld/MachO/UnwindInfoSection.h b/lld/MachO/UnwindInfoSection.h --- a/lld/MachO/UnwindInfoSection.h +++ b/lld/MachO/UnwindInfoSection.h @@ -33,6 +33,7 @@ bool isNeeded() const override { return compactUnwindSection != nullptr; } uint64_t getSize() const override { return unwindInfoSize; } virtual void prepareRelocations(InputSection *) = 0; + virtual bool funcHasPersonality(const InputSection *) = 0; void setCompactUnwindSection(ConcatOutputSection *cuSection) { compactUnwindSection = cuSection; diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp --- a/lld/MachO/UnwindInfoSection.cpp +++ b/lld/MachO/UnwindInfoSection.cpp @@ -106,6 +106,13 @@ template class UnwindInfoSectionImpl : public UnwindInfoSection { public: void prepareRelocations(InputSection *) override; + bool funcHasPersonality(const InputSection *isec) override { + bool hasPersonality = funcsWithPersonality.contains(isec); + if (hasPersonality && isec->defined) + log("ICF get personality " + + (isec->defined ? isec->defined->getName() : "???")); + return hasPersonality; + } void finalize() override; void writeTo(uint8_t *buf) const override; @@ -116,6 +123,7 @@ std::vector personalities; SmallDenseMap, Symbol *> personalityTable; + DenseSet funcsWithPersonality; std::vector lsdaEntries; // Map of function offset (from the image base) to an index within the LSDA // array. @@ -143,12 +151,22 @@ // work. But since there are usually just few personality functions // that are referenced from many places, at least some of them likely // live, it wouldn't reduce number of got entries. - for (Reloc &r : isec->relocs) { + for (size_t i = 0; i < isec->relocs.size(); ++i) { + Reloc &r = isec->relocs[i]; assert(target->hasAttr(r.type, RelocAttrBits::UNSIGNED)); if (r.offset % sizeof(CompactUnwindEntry) != offsetof(CompactUnwindEntry, personality)) continue; + Reloc &rFunc = isec->relocs[++i]; + assert(r.offset == + rFunc.offset + offsetof(CompactUnwindEntry, personality)); + if (const auto *fIsec = rFunc.referent.get()) { + const auto *fSym = fIsec->defined; + log("ICF sec personality " + (fSym ? fSym->getName() : "???")); + funcsWithPersonality.insert(fIsec); + } + if (auto *s = r.referent.dyn_cast()) { if (auto *undefined = dyn_cast(s)) { treatUndefinedSymbol(*undefined); diff --git a/lld/MachO/Writer.cpp b/lld/MachO/Writer.cpp --- a/lld/MachO/Writer.cpp +++ b/lld/MachO/Writer.cpp @@ -51,6 +51,7 @@ void scanSymbols(); template void createOutputSections(); template void createLoadCommands(); + void foldIdenticalSections(); void finalizeAddresses(); void finalizeLinkEditSegment(); void assignAddresses(OutputSegment *); @@ -75,6 +76,7 @@ LCUuid *uuidCommand = nullptr; OutputSegment *linkEditSegment = nullptr; + DenseMap concatOutputSections; }; // LC_DYLD_INFO_ONLY stores the offsets of symbol import/export information. @@ -864,7 +866,6 @@ } // Then add input sections to output sections. - DenseMap concatOutputSections; for (const auto &p : enumerate(inputSections)) { InputSection *isec = p.value(); if (isec->shouldOmitFromOutput()) @@ -905,6 +906,37 @@ linkEditSegment = getOrCreateOutputSegment(segment_names::linkEdit); } +void Writer::foldIdenticalSections() { + if (config->icfLevel == ICFLevel::none) + return; + ConcatOutputSection *textOutputSection = concatOutputSections.lookup( + maybeRenameSection({segment_names::text, section_names::text})); + if (textOutputSection == nullptr) + return; + + TimeTraceScope timeScope("Fold Identical Code Sections"); + // The ICF equivalence-class segregation algorithm relies on pre-computed + // hashes of InputSection::data for the ConcatOutputSection::inputs and all + // sections referenced by their relocs. We could recursively traverse the + // relocs to find every referenced InputSection, but that precludes easy + // parallelization. Therefore, we hash every InputSection here where we have + // them all accessible as a simple vector. + std::vector hashable; + // If an InputSection is ineligible for ICF, we give it a unique + // ID to force it into an unfoldable singleton equivalence class. + uint64_t icfInvalidId = 0; + for (InputSection *isec : inputSections) { + if (isec->isHashableForICF(textOutputSection)) + hashable.push_back(isec); + else + isec->icfEquivalenceClass[0] = ++icfInvalidId; + } + parallelForEach(hashable, [](InputSection *isec) { isec->hashForICF(); }); + // Now that every input section is either hashed or marked as unique, + // run the segregation algorithm to detect foldable subsections + textOutputSection->foldIdenticalSections(); +} + void Writer::finalizeAddresses() { TimeTraceScope timeScope("Finalize addresses"); uint64_t pageSize = target->getPageSize(); @@ -1029,6 +1061,7 @@ in.stubHelper->setup(); scanSymbols(); createOutputSections(); + foldIdenticalSections(); // After this point, we create no new segments; HOWEVER, we might // yet create branch-range extension thunks for architectures whose // hardware call instructions have limited range, e.g., ARM(64). diff --git a/lld/test/ELF/Inputs/icf2.s b/lld/test/ELF/Inputs/icf2.s --- a/lld/test/ELF/Inputs/icf2.s +++ b/lld/test/ELF/Inputs/icf2.s @@ -3,3 +3,7 @@ f2: mov $60, %rdi call f1 + call f2 + + call f1 + call f2 diff --git a/lld/test/ELF/icf2.s b/lld/test/ELF/icf2.s --- a/lld/test/ELF/icf2.s +++ b/lld/test/ELF/icf2.s @@ -15,3 +15,7 @@ f1: mov $60, %rdi call f2 + call f1 + + call f1 + call f2 diff --git a/lld/test/MachO/icf-scale.s b/lld/test/MachO/icf-scale.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/icf-scale.s @@ -0,0 +1,81 @@ +# REQUIRES: x86 +# RUN: rm -rf %t* + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: %lld -lSystem -icf all -o %t %t.o +# RUN: llvm-objdump -d --syms %t | FileCheck %s + +## When ICF has fewer than 1 Ki functions to segregate into equivalence classes, +## it uses a sequential algorithm to avoid the overhead of threading. +## At 1 Ki functions or more, when threading begins to pay-off ICF employs its +## parallel segregation algorithm. Here we generate 4 Ki functions to exercise +## the parallel algorithm. There are 4 uniqe function bodies, each replicated +## 1 Ki times. The resulting folded program should one instance for each of the +## four unique functions. + +# CHECK-LABEL: SYMBOL TABLE: +# CHECK: [[#%x,G0:]] g F __TEXT,__text _g000000 +# CHECK: [[#%x,G1:]] g F __TEXT,__text _g100000 +# CHECK: [[#%x,G2:]] g F __TEXT,__text _g200000 +# CHECK: [[#%x,G3:]] g F __TEXT,__text _g300000 +## . . . +# CHECK: [[#%x,G0]] g F __TEXT,__text _g033333 +# CHECK: [[#%x,G1]] g F __TEXT,__text _g133333 +# CHECK: [[#%x,G2]] g F __TEXT,__text _g233333 +# CHECK: [[#%x,G3]] g F __TEXT,__text _g333333 + +# CHECK-LABEL: Disassembly of section __TEXT,__text: +# CHECK-DAG: [[#%x,G0]] <_g033333>: +# CHECK-DAG: [[#%x,G1]] <_g133333>: +# CHECK-DAG: [[#%x,G2]] <_g233333>: +# CHECK-DAG: [[#%x,G3]] <_g333333>: +# CHECK-NOT: [[#]] <_g{{.*}}>: + +.subsections_via_symbols +.text +.p2align 2 + +.macro gen_4 c + .globl _g0\c, _g1\c, _g2\c, _g3\c + _g0\c:; movl $0, %eax; ret + _g1\c:; movl $1, %eax; ret + _g2\c:; movl $2, %eax; ret + _g3\c:; movl $3, %eax; ret +.endm + +.macro gen_16 c + gen_4 0\c + gen_4 1\c + gen_4 2\c + gen_4 3\c +.endm + +.macro gen_64 c + gen_16 0\c + gen_16 1\c + gen_16 2\c + gen_16 3\c +.endm + +.macro gen_256 c + gen_64 0\c + gen_64 1\c + gen_64 2\c + gen_64 3\c +.endm + +.macro gen_1024 c + gen_256 0\c + gen_256 1\c + gen_256 2\c + gen_256 3\c +.endm + +gen_1024 0 +gen_1024 1 +gen_1024 2 +gen_1024 3 + +.globl _main +_main: + ret diff --git a/lld/test/MachO/icf.s b/lld/test/MachO/icf.s new file mode 100644 --- /dev/null +++ b/lld/test/MachO/icf.s @@ -0,0 +1,165 @@ +# REQUIRES: x86 +# RUN: rm -rf %t; mkdir %t + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/main.o +# RUN: %lld -lSystem -icf all -o %t/main %t/main.o +# RUN: llvm-objdump -d --syms %t/main | FileCheck %s + +# CHECK-LABEL: SYMBOL TABLE: +# CHECK: [[#%x,MAIN:]] g F __TEXT,__text _main +# CHECK: [[#%x,F:]] g F __TEXT,__text _f1 +# CHECK: [[#%x,F]] g F __TEXT,__text _f2 +# CHECK: [[#%x,C:]] g F __TEXT,__text _c +# CHECK: [[#%x,D:]] g F __TEXT,__text _d +# CHECK: [[#%x,E:]] g F __TEXT,__text _e +# CHECK: [[#%x,SR:]] g F __TEXT,__text _sr1 +# CHECK: [[#%x,SR]] g F __TEXT,__text _sr2 +# CHECK: [[#%x,MR:]] g F __TEXT,__text _mr1 +# CHECK: [[#%x,MR]] g F __TEXT,__text _mr2 +### Note that "BROKE" is not a FileCheck keyword, just something it ignores +# BROKE: [[#%x,XR:]] g F __TEXT,__text _xr1 +# BROKE: [[#%x,XR]] g F __TEXT,__text _xr2 + +# CHECK-LABEL: Disassembly of section __TEXT,__text: +# CHECK: [[#%x,MAIN]] <_main>: +# CHECK-NEXT: callq 0x[[#%x,F]] <_f2> +# CHECK-NEXT: callq 0x[[#%x,F]] <_f2> +# CHECK-NEXT: callq 0x[[#%x,C]] <_c> +# CHECK-NEXT: callq 0x[[#%x,D]] <_d> +# CHECK-NEXT: callq 0x[[#%x,E]] <_e> +# CHECK-NEXT: callq 0x[[#%x,SR]] <_sr2> +# CHECK-NEXT: callq 0x[[#%x,SR]] <_sr2> +# CHECK-NEXT: callq 0x[[#%x,MR]] <_mr2> +# CHECK-NEXT: callq 0x[[#%x,MR]] <_mr2> +# BROKE-NEXT: callq 0x[[#%x,XR]] <_xr2> +# BROKE-NEXT: callq 0x[[#%x,XR]] <_xr2> + +.subsections_via_symbols +.text + +### Fold: _f1 & _f2 have identical bodies, flags, relocs, +### but different alignment is OK + +.p2align 2 + +.globl _f1, _f1_end +_f1: + callq _d + movl $0, %eax + nopl (%rax,%rax) + ret +_f1_end: # attribute any alignment padding to _f1_end, not to _f1 + +.p2align 3 + +.globl _f2, _f2_end +_f2: + callq _d + movl $0, %eax + nopl (%rax,%rax) + ret +_f2_end: # attribute any alignment padding to _f2_end, not to _f2 + +.p2align 2 + +### No fold: _c has slightly different body from _f1 & _f2 + +.globl _c +_c: + callq _d + movl $1, %eax + nopl (%rax,%rax) + ret + +### No fold: _d has the same body as _f1 & _f2, but _d is recursive! + +.globl _d +_d: + callq _d + movl $0, %eax + nopl (%rax,%rax) + ret + +### No fold: the body of _e is longer + +.globl _e +_e: + callq _d + movl $0, %eax + nopl (%rax,%rax) + ret + nop + +### Fold: Simple recursion + +.globl _sr1 +_sr1: + callq _sr1 + movl $2, %eax + nopl (%rax,%rax) + ret + +.globl _sr2 +_sr2: + callq _sr2 + movl $2, %eax + nopl (%rax,%rax) + ret + +### Fold: Mutually-recursive functions with symmetric bodies + +.globl _mr1 +_mr1: + callq _mr1 # call myself + callq _mr2 # call my twin + movl $1, %eax + ret + +.globl _mr2 +_mr2: + callq _mr2 # call myself + callq _mr1 # call my twin + movl $1, %eax + ret + +### Fold: Mutually-recursive functions with identical bodies +### +### FIXME: This test is currently broken. Recursive call sites have no relocs +### and the non-zero displacement field is already written to the section +### data, while non-recursive call sites use symbol relocs and section data +### contains zeros in the displacement field. Thus, ICF's equalsConstant() +### finds that the section data doesn't match. +### +### ELF folds this case properly because it emits symbol relocs for all calls, +### even recursive ones. + +.globl _xr1 +_xr1: + callq _xr1 # call myself + callq _xr2 # call my twin + movl $3, %eax + ret + +.globl _xr2 +_xr2: + callq _xr1 # call my twin + callq _xr2 # call myself + movl $3, %eax + ret + +### + +.globl _main +_main: + callq _f1 + callq _f2 + callq _c + callq _d + callq _e + callq _sr1 + callq _sr2 + callq _mr1 + callq _mr2 + callq _xr1 + callq _xr2 + ret