Index: lld/MachO/CMakeLists.txt =================================================================== --- lld/MachO/CMakeLists.txt +++ lld/MachO/CMakeLists.txt @@ -10,6 +10,7 @@ Arch/ARM64Common.cpp Arch/ARM64_32.cpp Arch/X86_64.cpp + CallGraphSort.cpp ConcatOutputSection.cpp Driver.cpp DriverUtils.cpp Index: lld/MachO/CallGraphSort.h =================================================================== --- /dev/null +++ lld/MachO/CallGraphSort.h @@ -0,0 +1,22 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_MACHO_CALL_GRAPH_SORT_H +#define LLD_MACHO_CALL_GRAPH_SORT_H + +#include "InputSection.h" +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace macho { + +llvm::DenseMap computeCallGraphProfileOrder(); +} // namespace macho +} // namespace lld + +#endif Index: lld/MachO/CallGraphSort.cpp =================================================================== --- /dev/null +++ lld/MachO/CallGraphSort.cpp @@ -0,0 +1,252 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// This is based on the ELF port, see ELF/CallGraphSort.cpp for the details +/// about the algorithm. +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "Config.h" +#include "InputFiles.h" +#include "Symbols.h" +#include "Target.h" +#include "lld/Common/ErrorHandler.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/MapVector.h" +#include "llvm/Support/TimeProfiler.h" +#include "llvm/Support/raw_ostream.h" +#include + +using namespace llvm; +using namespace lld; +using namespace lld::macho; + +namespace { +struct Edge { + int from; + uint64_t weight; +}; + +struct Cluster { + Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} + + double getDensity() const { + if (size == 0) + return 0; + return double(weight) / double(size); + } + + int next; + int prev; + uint64_t size; + uint64_t weight = 0; + uint64_t initialWeight = 0; + Edge bestPred = {-1, 0}; +}; + +class CallGraphSort { +public: + CallGraphSort(); + + DenseMap run(); + +private: + std::vector clusters; + std::vector sections; +}; +// Maximum amount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; +} // end anonymous namespace + +using SectionPair = std::pair; + +// Take the edge list in config->callGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { + MapVector &profile = config->callGraphProfile; + DenseMap secToCluster; + + auto getOrCreateCluster = [&](const InputSection *isec) -> int { + auto res = secToCluster.try_emplace(isec, clusters.size()); + if (res.second) { + sections.push_back(isec); + clusters.emplace_back(clusters.size(), isec->getSize()); + } + return res.first->second; + }; + + // Create the graph + for (std::pair &c : profile) { + const auto fromSec = c.first.first->canonical(); + const auto toSec = c.first.second->canonical(); + uint64_t weight = c.second; + // Ignore edges between input sections belonging to different output + // sections. This is done because otherwise we would end up with clusters + // containing input sections that can't actually be placed adjacently in the + // output. This messes with the cluster size and density calculations. We + // would also end up moving input sections in other output sections without + // moving them closer to what calls them. + if (fromSec->parent != toSec->parent) + continue; + + int from = getOrCreateCluster(fromSec); + int to = getOrCreateCluster(toSec); + + clusters[to].weight += weight; + + if (from == to) + continue; + + // Remember the best edge. + Cluster &toC = clusters[to]; + if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { + toC.bestPred.from = from; + toC.bestPred.weight = weight; + } + } + for (Cluster &c : clusters) + c.initialWeight = c.weight; +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &a, Cluster &b) { + double newDensity = double(a.weight + b.weight) / double(a.size + b.size); + return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; +} + +// Find the leader of V's belonged cluster (represented as an equivalence +// class). We apply union-find path-halving technique (simple to implement) in +// the meantime as it decreases depths and the time complexity. +static int getLeader(std::vector &leaders, int v) { + while (leaders[v] != v) { + leaders[v] = leaders[leaders[v]]; + v = leaders[v]; + } + return v; +} + +static void mergeClusters(std::vector &cs, Cluster &into, int intoIdx, + Cluster &from, int fromIdx) { + int tail1 = into.prev, tail2 = from.prev; + into.prev = tail2; + cs[tail2].next = intoIdx; + from.prev = tail1; + cs[tail1].next = fromIdx; + into.size += from.size; + into.weight += from.weight; + from.size = 0; + from.weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +DenseMap CallGraphSort::run() { + const uint64_t maxClusterSize = target->getPageSize(); + + // Cluster indices sorted by density. + std::vector sorted(clusters.size()); + // For union-find. + std::vector leaders(clusters.size()); + + std::iota(leaders.begin(), leaders.end(), 0); + std::iota(sorted.begin(), sorted.end(), 0); + + llvm::stable_sort(sorted, [&](int a, int b) { + return clusters[a].getDensity() > clusters[b].getDensity(); + }); + + for (int l : sorted) { + // The cluster index is the same as the index of its leader here because + // clusters[L] has not been merged into another cluster yet. + Cluster &c = clusters[l]; + + // Don't consider merging if the edge is unlikely. + if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) + continue; + + int predL = getLeader(leaders, c.bestPred.from); + // Already in the same cluster. + if (l == predL) + continue; + + Cluster *predC = &clusters[predL]; + if (c.size + predC->size > maxClusterSize) + continue; + + if (isNewDensityBad(*predC, c)) + continue; + + leaders[l] = predL; + mergeClusters(clusters, *predC, predL, c, l); + } + // Sort remaining non-empty clusters by density. + sorted.clear(); + for (int i = 0, e = (int)clusters.size(); i != e; ++i) + if (clusters[i].size > 0) + sorted.push_back(i); + llvm::stable_sort(sorted, [&](int a, int b) { + return clusters[a].getDensity() > clusters[b].getDensity(); + }); + + DenseMap orderMap; + + // Sections will be sorted by decreasing order. Absent sections will have + // priority 0 and be placed at the end of sections. + // NB: This is opposite from COFF/ELF to be compatible with the existing + // order-file code. + int curOrder = clusters.size(); + for (int leader : sorted) { + for (int i = leader;;) { + orderMap[sections[i]] = curOrder--; + i = clusters[i].next; + if (i == leader) + break; + } + } + if (!config->printSymbolOrder.empty()) { + std::error_code ec; + raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); + if (ec) { + error("cannot open " + config->printSymbolOrder + ": " + ec.message()); + return orderMap; + } + // Print the symbols ordered by C3, in the order of decreasing curOrder + // Instead of sorting all the orderMap, just repeat the loops above. + for (int leader : sorted) + for (int i = leader;;) { + const InputSection *isec = sections[i]; + // Search all the symbols in the file of the section + // and find out a Defined symbol with name that is within the + // section. + for (Symbol *sym : isec->getFile()->symbols) { + if (auto *d = dyn_cast_or_null(sym)) { + if (d->isec == isec) + os << sym->getName() << "\n"; + } + } + i = clusters[i].next; + if (i == leader) + break; + } + } + + return orderMap; +} + +// Sort sections by the profile data provided by __LLVM,__cg_profile sections. +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ heuristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap macho::computeCallGraphProfileOrder() { + TimeTraceScope timeScope("Call graph profile sort"); + return CallGraphSort().run(); +} Index: lld/MachO/Config.h =================================================================== --- lld/MachO/Config.h +++ lld/MachO/Config.h @@ -12,6 +12,7 @@ #include "llvm/ADT/CachedHashString.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" #include "llvm/BinaryFormat/MachO.h" @@ -27,6 +28,7 @@ namespace lld { namespace macho { +class InputSection; class Symbol; struct SymbolPriorityEntry; @@ -166,6 +168,12 @@ std::vector segmentProtections; llvm::DenseMap priorities; + llvm::MapVector, + uint64_t> + callGraphProfile; + bool callGraphProfileSort = false; + llvm::StringRef printSymbolOrder; + SectionRenameMap sectionRenameMap; SegmentRenameMap segmentRenameMap; Index: lld/MachO/Driver.cpp =================================================================== --- lld/MachO/Driver.cpp +++ lld/MachO/Driver.cpp @@ -1031,6 +1031,25 @@ assert(inputOrder <= UnspecifiedInputOrder); } +static void extractCallGraphProfile() { + TimeTraceScope timeScope("Extract call graph profile"); + for (const InputFile *file : inputFiles) { + auto *obj = dyn_cast_or_null(file); + if (!obj) + continue; + for (const CallGraphEntry &entry : obj->callGraph) { + assert(entry.fromIndex < obj->symbols.size() && + entry.toIndex < obj->symbols.size()); + auto *fromSym = dyn_cast_or_null(obj->symbols[entry.fromIndex]); + auto *toSym = dyn_cast_or_null(obj->symbols[entry.toIndex]); + + if (!fromSym || !toSym) + continue; + config->callGraphProfile[{fromSym->isec, toSym->isec}] += entry.count; + } + } +} + static void foldIdenticalLiterals() { // We always create a cStringSection, regardless of whether dedupLiterals is // true. If it isn't, we simply create a non-deduplicating CStringSection. @@ -1188,6 +1207,9 @@ config->icfLevel = getICFLevel(args); config->dedupLiterals = args.hasArg(OPT_deduplicate_literals) || config->icfLevel != ICFLevel::none; + config->callGraphProfileSort = args.hasFlag( + OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true); + config->printSymbolOrder = args.getLastArgValue(OPT_print_symbol_order); // FIXME: Add a commandline flag for this too. config->zeroModTime = getenv("ZERO_AR_DATE"); @@ -1374,8 +1396,10 @@ replaceCommonSymbols(); StringRef orderFile = args.getLastArgValue(OPT_order_file); - if (!orderFile.empty()) + if (!orderFile.empty()) { parseOrderFile(orderFile); + config->callGraphProfileSort = false; + } referenceStubBinder(); @@ -1416,6 +1440,8 @@ } gatherInputSections(); + if (config->callGraphProfileSort) + extractCallGraphProfile(); if (config->deadStrip) markLive(); Index: lld/MachO/InputFiles.h =================================================================== --- lld/MachO/InputFiles.h +++ lld/MachO/InputFiles.h @@ -56,6 +56,16 @@ }; using SubsectionMap = std::vector; +// Represents a call graph profile edge. +struct CallGraphEntry { + // The index of the caller in the symbol table. + uint32_t fromIndex; + // The index of the callee in the symbol table. + uint32_t toIndex; + // Number of calls from callee to caller in the profile. + uint64_t count; +}; + class InputFile { public: enum Kind { @@ -104,6 +114,7 @@ const uint32_t modTime; std::vector debugSections; ArrayRef dataInCodeEntries; + std::vector callGraph; private: template void parse(); Index: lld/MachO/InputFiles.cpp =================================================================== --- lld/MachO/InputFiles.cpp +++ lld/MachO/InputFiles.cpp @@ -63,6 +63,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/BinaryFormat/MachO.h" #include "llvm/LTO/LTO.h" +#include "llvm/Support/BinaryStreamReader.h" #include "llvm/Support/Endian.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" @@ -303,6 +304,24 @@ } else if (auto recordSize = getRecordSize(segname, name)) { splitRecords(*recordSize); } else if (segname == segment_names::llvm) { + if (name == "__cg_profile" && config->callGraphProfileSort) { + BinaryStreamReader reader(data, support::little); + while (!reader.empty()) { + uint32_t fromIndex, toIndex; + uint64_t count; + if (Error err = reader.readInteger(fromIndex)) + fatal(toString(this) + ": Expected 32-bit integer"); + if (Error err = reader.readInteger(toIndex)) + fatal(toString(this) + ": Expected 32-bit integer"); + if (Error err = reader.readInteger(count)) + fatal(toString(this) + ": Expected 64-bit integer"); + callGraph.emplace_back(); + CallGraphEntry &entry = callGraph.back(); + entry.fromIndex = fromIndex; + entry.toIndex = toIndex; + entry.count = count; + } + } // ld64 does not appear to emit contents from sections within the __LLVM // segment. Symbols within those sections point to bitcode metadata // instead of actual symbols. Global symbols within those sections could Index: lld/MachO/InputSection.h =================================================================== --- lld/MachO/InputSection.h +++ lld/MachO/InputSection.h @@ -51,6 +51,7 @@ virtual bool isLive(uint64_t off) const = 0; virtual void markLive(uint64_t off) = 0; virtual InputSection *canonical() { return this; } + virtual const InputSection *canonical() const { return this; } OutputSection *parent = nullptr; @@ -117,6 +118,9 @@ InputSection *canonical() override { return replacement ? replacement : this; } + const InputSection *canonical() const override { + return replacement ? replacement : this; + } static bool classof(const InputSection *isec) { return isec->kind() == ConcatKind; Index: lld/MachO/Options.td =================================================================== --- lld/MachO/Options.td +++ lld/MachO/Options.td @@ -73,6 +73,15 @@ Group; def O : JoinedOrSeparate<["-"], "O">, HelpText<"Optimize output file size">; +def call_graph_profile_sort: Flag<["--"], "call-graph-profile-sort">, + HelpText<"Reorder sections with call graph profile (default: true)">, + Group; +def no_call_graph_profile_sort : Flag<["--"], "no-call-graph-profile-sort">, + HelpText<"Do not reorder sections with call graph profile (default: false)">, + Group; +def print_symbol_order: Joined<["--"], "print-symbol-order=">, + HelpText<"Print a symbol order specified by --call-graph-profile-sort into the specified file">, + Group; // This is a complete Options.td compiled from Apple's ld(1) manpage // dated 2018-03-07 and cross checked with ld64 source code in repo Index: lld/MachO/Writer.cpp =================================================================== --- lld/MachO/Writer.cpp +++ lld/MachO/Writer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "Writer.h" +#include "CallGraphSort.h" #include "ConcatOutputSection.h" #include "Config.h" #include "InputFiles.h" @@ -843,6 +844,8 @@ // Each section gets assigned the priority of the highest-priority symbol it // contains. static DenseMap buildInputSectionPriorities() { + if (config->callGraphProfileSort) + return computeCallGraphProfileOrder(); DenseMap sectionPriorities; if (config->priorities.empty()) Index: lld/test/MachO/cgprofile-icf.s =================================================================== --- /dev/null +++ lld/test/MachO/cgprofile-icf.s @@ -0,0 +1,46 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t + +# RUN: %lld -e A %t -o %t.out --icf=all +# RUN: llvm-nm --numeric-sort %t.out | FileCheck %s +# RUN: %lld -e A %t -o %t2.out +# RUN: llvm-nm --numeric-sort %t2.out | FileCheck %s --check-prefix=NOICF + +.text + .globl D +D: + mov $60, %rax + retq + + .globl C +C: + mov $60, %rax + retq + + .globl B +B: + mov $2, %rax + retq + + .globl A +A: + mov $42, %rax + retq + +.cg_profile A, B, 100 +.cg_profile A, C, 40 +.cg_profile C, D, 61 + +.subsections_via_symbols + +# CHECK: 0000000100000290 T A +# CHECK-NEXT: 0000000100000298 T C +# CHECK-NEXT: 0000000100000298 T D +# CHECK-NEXT: 00000001000002a0 T B + +# NOICF: 0000000100000290 T A +# NOICF-NEXT: 0000000100000298 T B +# NOICF-NEXT: 00000001000002a0 T C +# NOICF-NEXT: 00000001000002a8 T D + Index: lld/test/MachO/cgprofile-obj.s =================================================================== --- /dev/null +++ lld/test/MachO/cgprofile-obj.s @@ -0,0 +1,42 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t.o +# RUN: %lld -lSystem -e A -o %t.out %t.o +# RUN: llvm-nm --numeric-sort %t.out | FileCheck %s +# RUN: %lld --no-call-graph-profile-sort -lSystem -e A -o %t.out %t.o +# RUN: llvm-nm --numeric-sort %t.out | FileCheck %s --check-prefix=NO-CG + +.text + +D: + retq +.globl C +C: + retq +.globl B +B: + retq +.globl A +A: +Aa: + retq +.subsections_via_symbols + + + + .cg_profile A, B, 10 + .cg_profile A, B, 10 + .cg_profile Aa, B, 80 + .cg_profile A, C, 40 + .cg_profile B, C, 30 + .cg_profile C, D, 90 + +# CHECK: 00000001000002c8 T A +# CHECK: 00000001000002c9 T B +# CHECK: 00000001000002ca T C +# CHECK: 00000001000002cb t D + +# NO-CG: 00000001000002c8 t D +# NO-CG: 00000001000002c9 T C +# NO-CG: 00000001000002ca T B +# NO-CG: 00000001000002cb T A Index: lld/test/MachO/cgprofile-print.s =================================================================== --- /dev/null +++ lld/test/MachO/cgprofile-print.s @@ -0,0 +1,34 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t +# RUN: %lld -e A %t -o %t2 --print-symbol-order=%t3 +# RUN: FileCheck %s --input-file %t3 + +# CHECK: B +# CHECK-NEXT: C +# CHECK-NEXT: D +# CHECK-NEXT: A + +.text +.globl A +A: + nop + +.globl B +B: + nop + +.globl C +C: + nop + +.globl D +D: + nop + +.subsections_via_symbols + +.cg_profile A, B, 5 +.cg_profile B, C, 50 +.cg_profile C, D, 40 +.cg_profile D, B, 10 Index: llvm/utils/gn/secondary/lld/MachO/BUILD.gn =================================================================== --- llvm/utils/gn/secondary/lld/MachO/BUILD.gn +++ llvm/utils/gn/secondary/lld/MachO/BUILD.gn @@ -27,6 +27,7 @@ "Arch/ARM64Common.cpp", "Arch/ARM64_32.cpp", "Arch/X86_64.cpp", + "CallGraphSort.cpp", "ConcatOutputSection.cpp", "Driver.cpp", "DriverUtils.cpp",