diff --git a/lld/COFF/CMakeLists.txt b/lld/COFF/CMakeLists.txt --- a/lld/COFF/CMakeLists.txt +++ b/lld/COFF/CMakeLists.txt @@ -8,6 +8,7 @@ add_lld_library(lldCOFF Chunks.cpp + CallGraphSort.cpp DebugTypes.cpp DLL.cpp Driver.cpp diff --git a/lld/COFF/CallGraphSort.h b/lld/COFF/CallGraphSort.h new file mode 100644 --- /dev/null +++ b/lld/COFF/CallGraphSort.h @@ -0,0 +1,22 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_COFF_CALL_GRAPH_SORT_H +#define LLD_COFF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace coff { +class SectionChunk; + +llvm::DenseMap computeCallGraphProfileOrder(); +} // namespace coff +} // namespace lld + +#endif diff --git a/lld/COFF/CallGraphSort.cpp b/lld/COFF/CallGraphSort.cpp new file mode 100644 --- /dev/null +++ b/lld/COFF/CallGraphSort.cpp @@ -0,0 +1,271 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement +/// for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are laid out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incoming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the amount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "InputFiles.h" +#include "SymbolTable.h" +#include "Symbols.h" +#include "lld/Common/ErrorHandler.h" + +#include +#include + +using namespace llvm; +using namespace lld; +using namespace lld::coff; + +namespace { +struct Edge { + int from; + uint64_t weight; +}; + +struct Cluster { + Cluster(int sec, size_t s) : next(sec), prev(sec), size(s) {} + + double getDensity() const { + if (size == 0) + return 0; + return double(weight) / double(size); + } + + int next; + int prev; + size_t size = 0; + uint64_t weight = 0; + uint64_t initialWeight = 0; + Edge bestPred = {-1, 0}; +}; + +class CallGraphSort { +public: + CallGraphSort(); + + DenseMap run(); + +private: + std::vector clusters; + std::vector sections; +}; + +// Maximum amount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; + +// Maximum cluster size in bytes. +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; +} // end anonymous namespace + +using SectionPair = std::pair; + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { + MapVector &profile = config->callGraphProfile; + DenseMap secToCluster; + + auto getOrCreateNode = [&](const SectionChunk *isec) -> int { + auto res = secToCluster.try_emplace(isec, clusters.size()); + if (res.second) { + sections.push_back(isec); + clusters.emplace_back(clusters.size(), isec->getSize()); + } + return res.first->second; + }; + + // Create the graph. + for (std::pair &c : profile) { + const auto *fromSB = cast(c.first.first->repl); + const auto *toSB = cast(c.first.second->repl); + uint64_t weight = c.second; + + // Ignore edges between input sections belonging to different output + // sections. This is done because otherwise we would end up with clusters + // containing input sections that can't actually be placed adjacently in the + // output. This messes with the cluster size and density calculations. We + // would also end up moving input sections in other output sections without + // moving them closer to what calls them. + if (fromSB->getOutputSection() != toSB->getOutputSection()) + continue; + + int from = getOrCreateNode(fromSB); + int to = getOrCreateNode(toSB); + + clusters[to].weight += weight; + + if (from == to) + continue; + + // Remember the best edge. + Cluster &toC = clusters[to]; + if (toC.bestPred.from == -1 || toC.bestPred.weight < weight) { + toC.bestPred.from = from; + toC.bestPred.weight = weight; + } + } + for (Cluster &c : clusters) + c.initialWeight = c.weight; +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &a, Cluster &b) { + double newDensity = double(a.weight + b.weight) / double(a.size + b.size); + return newDensity < a.getDensity() / MAX_DENSITY_DEGRADATION; +} + +// Find the leader of V's belonged cluster (represented as an equivalence +// class). We apply union-find path-halving technique (simple to implement) in +// the meantime as it decreases depths and the time complexity. +static int getLeader(std::vector &leaders, int v) { + while (leaders[v] != v) { + leaders[v] = leaders[leaders[v]]; + v = leaders[v]; + } + return v; +} + +static void mergeClusters(std::vector &cs, Cluster &into, int intoIdx, + Cluster &from, int fromIdx) { + int tail1 = into.prev, tail2 = from.prev; + into.prev = tail2; + cs[tail2].next = intoIdx; + from.prev = tail1; + cs[tail1].next = fromIdx; + into.size += from.size; + into.weight += from.weight; + from.size = 0; + from.weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +DenseMap CallGraphSort::run() { + std::vector sorted(clusters.size()); + std::vector leaders(clusters.size()); + + std::iota(leaders.begin(), leaders.end(), 0); + std::iota(sorted.begin(), sorted.end(), 0); + llvm::stable_sort(sorted, [&](int a, int b) { + return clusters[a].getDensity() > clusters[b].getDensity(); + }); + + for (int l : sorted) { + // The cluster index is the same as the index of its leader here because + // clusters[L] has not been merged into another cluster yet. + Cluster &c = clusters[l]; + + // Don't consider merging if the edge is unlikely. + if (c.bestPred.from == -1 || c.bestPred.weight * 10 <= c.initialWeight) + continue; + + int predL = getLeader(leaders, c.bestPred.from); + if (l == predL) + continue; + + Cluster *predC = &clusters[predL]; + if (c.size + predC->size > MAX_CLUSTER_SIZE) + continue; + + if (isNewDensityBad(*predC, c)) + continue; + + leaders[l] = predL; + mergeClusters(clusters, *predC, predL, c, l); + } + + // Sort remaining non-empty clusters by density. + sorted.clear(); + for (int i = 0, e = (int)clusters.size(); i != e; ++i) + if (clusters[i].size > 0) + sorted.push_back(i); + llvm::stable_sort(sorted, [&](int a, int b) { + return clusters[a].getDensity() > clusters[b].getDensity(); + }); + + DenseMap orderMap; + int curOrder = INT_MIN; + for (int leader : sorted) + for (int i = leader;;) { + orderMap[sections[i]] = curOrder++; + i = clusters[i].next; + if (i == leader) + break; + } + + if (!config->printSymbolOrder.empty()) { + std::error_code ec; + raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); + if (ec) { + error("cannot open " + config->printSymbolOrder + ": " + ec.message()); + return orderMap; + } + // Print the symbols ordered by C3, in the order of increasing curOrder + // Instead of sorting all the orderMap, just repeat the loops above. + for (int leader : sorted) + for (int i = leader;;) { + const SectionChunk *sc = sections[i]; + + // Search all the symbols in the file of the section + // and find out a Defined symbol with name that is within the section. + for (Symbol *sym : sc->file->getSymbols()) + if (auto *d = dyn_cast_or_null(sym)) + if (d->isCOMDAT && !d->getCOFFSymbol().isSection()) + if (sc == d->getChunk()) + os << sym->getName() << "\n"; + i = clusters[i].next; + if (i == leader) + break; + } + } + + return orderMap; +} + +// Sort sections by the profile data provided by /callgraph-profile-file +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ heuristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap coff::computeCallGraphProfileOrder() { + return CallGraphSort().run(); +} diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -9,6 +9,7 @@ #ifndef LLD_COFF_CONFIG_H #define LLD_COFF_CONFIG_H +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/COFF.h" @@ -29,6 +30,7 @@ class StringChunk; class Symbol; class InputFile; +class SectionChunk; // Short aliases. static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64; @@ -200,6 +202,14 @@ // Used for /lto-obj-path: llvm::StringRef ltoObjPath; + // Used for /call-graph-ordering-file: + llvm::MapVector, + uint64_t> + callGraphProfile; + + // Used for /print-symbol-order: + StringRef printSymbolOrder; + uint64_t align = 4096; uint64_t imageBase = -1; uint64_t fileAlign = 512; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -924,6 +924,46 @@ } } +static void parseCallGraphFile(StringRef path) { + std::unique_ptr mb = CHECK( + MemoryBuffer::getFile(path, -1, false, true), "could not open " + path); + + // Build a map from symbol name to section + DenseMap map; + for (ObjFile *file : ObjFile::instances) + for (Symbol *sym : file->getSymbols()) + if (sym) + map[sym->getName()] = sym; + + auto findSection = [&](StringRef name) -> SectionChunk * { + Symbol *sym = map.lookup(name); + if (!sym) { + if (config->warnMissingOrderSymbol) + warn(path + ": no such symbol: " + name); + return nullptr; + } + + if (DefinedCOFF *dr = dyn_cast_or_null(sym)) + return dyn_cast_or_null(dr->getChunk()); + return nullptr; + }; + + for (StringRef line : args::getLines(*mb)) { + SmallVector fields; + line.split(fields, ' '); + uint64_t count; + + if (fields.size() != 3 || !to_integer(fields[2], count)) { + error(path + ": parse error"); + return; + } + + if (SectionChunk *from = findSection(fields[0])) + if (SectionChunk *to = findSection(fields[1])) + config->callGraphProfile[std::make_pair(from, to)] += count; + } +} + static void markAddrsig(Symbol *s) { if (auto *d = dyn_cast_or_null(s)) if (SectionChunk *c = dyn_cast_or_null(d->getChunk())) @@ -2020,12 +2060,23 @@ if (config->manifest == Configuration::SideBySide) createSideBySideManifest(); + if (args.hasArg(OPT_order) && args.hasArg(OPT_call_graph_ordering_file)) + error("/order and /call-graph-order-file may not be used together"); + // Handle /order. We want to do this at this moment because we // need a complete list of comdat sections to warn on nonexistent // functions. if (auto *arg = args.getLastArg(OPT_order)) parseOrderFile(arg->getValue()); + // Handle /call-graph-ordering-file + if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) + parseCallGraphFile(arg->getValue()); + + // Handle /print-symbol-order + if (auto *arg = args.getLastArg(OPT_print_symbol_order)) + config->printSymbolOrder = arg->getValue(); + // Identify unreferenced COMDAT sections. if (config->doGC) markLive(symtab->getChunks()); diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -234,6 +234,13 @@ def threads : P<"threads", "Number of threads. '1' disables multi-threading. By " "default all available hardware threads are used">; +def call_graph_ordering_file: P< + "call-graph-ordering-file", + "Layout sections to optimize the given callgraph">; +def print_symbol_order: P< + "print-symbol-order", + "Print a symbol order specified by /call-graph-ordering-file into the " + "specified file">; // Flags for debugging def lldmap : F<"lldmap">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "Writer.h" +#include "CallGraphSort.h" #include "Config.h" #include "DLL.h" #include "InputFiles.h" @@ -229,6 +230,7 @@ void setSectionPermissions(); void writeSections(); void writeBuildId(); + void sortSections(); void sortExceptionTable(); void sortCRTSectionChunks(std::vector &chunks); void addSyntheticIdata(); @@ -798,6 +800,19 @@ name.startswith(".xdata$") || name.startswith(".eh_frame$"); } +void Writer::sortSections() { + if (!config->callGraphProfile.empty()) { + DenseMap order = computeCallGraphProfileOrder(); + for (auto I : order) { + if (DefinedRegular *sym = I.first->sym) + config->order[sym->getName()] = I.second; + } + } + if (!config->order.empty()) + for (auto it : partialSections) + sortBySectionOrder(it.second->chunks); +} + // Create output section objects and add them to OutputSections. void Writer::createSections() { // First, create the builtin sections. @@ -861,10 +876,7 @@ if (hasIdata) addSyntheticIdata(); - // Process an /order option. - if (!config->order.empty()) - for (auto it : partialSections) - sortBySectionOrder(it.second->chunks); + sortSections(); if (hasIdata) locateImportTables(); diff --git a/lld/test/COFF/cgprofile-bad-clusters.s b/lld/test/COFF/cgprofile-bad-clusters.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-bad-clusters.s @@ -0,0 +1,61 @@ +# REQUIRES: x86 +# This test checks that CallGraphSort ignores edges that would form "bad" +# clusters. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A C 1" > %t.call_graph +# RUN: echo "E B 4" >> %t.call_graph +# RUN: echo "C D 2" >> %t.call_graph +# RUN: echo "B D 1" >> %t.call_graph +# RUN: echo "F G 6" >> %t.call_graph +# RUN: echo "G H 5" >> %t.call_graph +# RUN: echo "H I 4" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s + + .section .text,"ax",one_only,A + .globl A +A: + retq + + .section .text,"ax",one_only,D +D: + .fill 1000, 1, 0 + + .section .text,"ax",one_only,E +E: + retq + + .section .text,"ax",one_only,C +C: + retq + + .section .text,"ax",one_only,B +B: + .fill 1000, 1, 0 + + .section .text,"ax",one_only,F +F: + .fill (1024 * 1024) - 1, 1, 0 + + .section .text,"ax",one_only,G +G: + retq + + .section .text,"ax",one_only,H +H: + retq + + .section .text,"ax",one_only,I +I: + .fill 13, 1, 0 + +# CHECK: 140001000 t H +# CHECK: 140001001 t I +# CHECK: 14000100e T A +# CHECK: 14000100f t C +# CHECK: 140001010 t E +# CHECK: 140001011 t B +# CHECK: 1400013f9 t D +# CHECK: 1400017e1 t F +# CHECK: 1401017e0 t G diff --git a/lld/test/COFF/cgprofile-icf.s b/lld/test/COFF/cgprofile-icf.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-icf.s @@ -0,0 +1,45 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "C D 61" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab /opt:icf +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOICF + + .section .text,"x",one_only,D + .globl D +D: + mov $60, %rax + retq + + .section .text,"x",one_only,C + .globl C +C: + mov $60, %rax + retq + + .section .text,"x",one_only,B + .globl B +B: + mov $2, %rax + retq + + .section .text,"x",one_only,A + .globl A +A: + mov $42, %rax + retq + +# CHECK: 140001000 T A +# CHECK: 140001008 T C +# CHECK: 140001008 T D +# CHECK: 140001010 T B + +# NOICF: 140001000 T A +# NOICF: 140001008 T B +# NOICF: 140001010 T C +# NOICF: 140001018 T D diff --git a/lld/test/COFF/cgprofile-print.s b/lld/test/COFF/cgprofile-print.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-print.s @@ -0,0 +1,35 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A B 5" > %t.call_graph +# RUN: echo "B C 50" >> %t.call_graph +# RUN: echo "C D 40" >> %t.call_graph +# RUN: echo "D B 10" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /print-symbol-order:%t3 +# RUN: FileCheck %s --input-file %t3 + +# CHECK: B +# CHECK-NEXT: C +# CHECK-NEXT: D +# CHECK-NEXT: A + +.section .text, "x", one_only, A +.globl A +A: + nop + +.section .text, "x", one_only, B +.globl B +B: + nop + +.section .text, "x", one_only, C +.globl C +C: + nop + +.section .text, "x", one_only, D +.globl D +D: + nop + diff --git a/lld/test/COFF/cgprofile-txt.s b/lld/test/COFF/cgprofile-txt.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-txt.s @@ -0,0 +1,43 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 5" > %t.call_graph +# RUN: echo "B C 50" >> %t.call_graph +# RUN: echo "C D 40" >> %t.call_graph +# RUN: echo "D B 10" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s + +# NOSORT: 140001000 T A +# NOSORT: 140001001 T B +# NOSORT: 140001002 T C +# NOSORT: 140001003 T D + +# CHECK: 140001000 T B +# CHECK: 140001001 T C +# CHECK: 140001002 T D +# CHECK: 140001003 T A + +.section .text, "x", one_only, A +.globl A +A: + nop + +.section .text, "x", one_only, B +.globl B +B: + nop + +.section .text, "x", one_only, C +.globl C +C: + nop + +.section .text, "x", one_only, D +.globl D +D: + nop +