diff --git a/lld/COFF/CMakeLists.txt b/lld/COFF/CMakeLists.txt --- a/lld/COFF/CMakeLists.txt +++ b/lld/COFF/CMakeLists.txt @@ -8,6 +8,7 @@ add_lld_library(lldCOFF Chunks.cpp + CallGraphSort.cpp DebugTypes.cpp DLL.cpp Driver.cpp diff --git a/lld/COFF/CallGraphSort.h b/lld/COFF/CallGraphSort.h new file mode 100644 --- /dev/null +++ b/lld/COFF/CallGraphSort.h @@ -0,0 +1,22 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_COFF_CALL_GRAPH_SORT_H +#define LLD_COFF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace coff { +class SectionChunk; + +llvm::DenseMap computeCallGraphProfileOrder(); +} // namespace coff +} // namespace lld + +#endif diff --git a/lld/ELF/CallGraphSort.cpp b/lld/COFF/CallGraphSort.cpp copy from lld/ELF/CallGraphSort.cpp copy to lld/COFF/CallGraphSort.cpp --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/COFF/CallGraphSort.cpp @@ -6,50 +6,23 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. At the -/// beginning of the algorithm each input section has its own cluster and -/// the weight of the cluster is the sum of the weight of all incoming -/// edges. -/// * Call-Chain Clustering (C³) Heuristic -/// * Defines when and how clusters are combined. Pick the highest weighted -/// input section then add it to its most likely predecessor if it wouldn't -/// penalize it too much. -/// * Density -/// * The weight of the cluster divided by the size of the cluster. This is a -/// proxy for the amount of execution time spent per byte of the cluster. -/// -/// It does so given a call graph profile by the following: -/// * Build a weighted call graph from the call graph profile -/// * Sort input sections by weight -/// * For each input section starting with the highest weight -/// * Find its most likely predecessor cluster -/// * Check if the combined cluster would be too large, or would have too low -/// a density. -/// * If not, then combine the clusters. -/// * Sort non-empty clusters by density +/// This is based on the ELF port, see ELF/ICF.cpp for the details about the +/// algorithm. /// //===----------------------------------------------------------------------===// #include "CallGraphSort.h" -#include "OutputSections.h" +#include "InputFiles.h" #include "SymbolTable.h" #include "Symbols.h" +#include "lld/Common/ErrorHandler.h" +#include #include using namespace llvm; using namespace lld; -using namespace lld::elf; +using namespace lld::coff; namespace { struct Edge { @@ -68,7 +41,7 @@ int next; int prev; - size_t size = 0; + uint64_t size; uint64_t weight = 0; uint64_t initialWeight = 0; Edge bestPred = {-1, 0}; @@ -78,11 +51,11 @@ public: CallGraphSort(); - DenseMap run(); + DenseMap run(); private: std::vector clusters; - std::vector sections; + std::vector sections; }; // Maximum amount the combined cluster density can be worse than the original @@ -93,17 +66,16 @@ constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; } // end anonymous namespace -using SectionPair = - std::pair; +using SectionPair = std::pair; // Take the edge list in Config->CallGraphProfile, resolve symbol names to // Symbols, and generate a graph between InputSections with the provided // weights. CallGraphSort::CallGraphSort() { MapVector &profile = config->callGraphProfile; - DenseMap secToCluster; + DenseMap secToCluster; - auto getOrCreateNode = [&](const InputSectionBase *isec) -> int { + auto getOrCreateNode = [&](const SectionChunk *isec) -> int { auto res = secToCluster.try_emplace(isec, clusters.size()); if (res.second) { sections.push_back(isec); @@ -114,8 +86,8 @@ // Create the graph. for (std::pair &c : profile) { - const auto *fromSB = cast(c.first.first->repl); - const auto *toSB = cast(c.first.second->repl); + const auto *fromSec = cast(c.first.first->repl); + const auto *toSec = cast(c.first.second->repl); uint64_t weight = c.second; // Ignore edges between input sections belonging to different output @@ -124,11 +96,11 @@ // output. This messes with the cluster size and density calculations. We // would also end up moving input sections in other output sections without // moving them closer to what calls them. - if (fromSB->getOutputSection() != toSB->getOutputSection()) + if (fromSec->getOutputSection() != toSec->getOutputSection()) continue; - int from = getOrCreateNode(fromSB); - int to = getOrCreateNode(toSB); + int from = getOrCreateNode(fromSec); + int to = getOrCreateNode(toSec); clusters[to].weight += weight; @@ -178,7 +150,7 @@ // Group InputSections into clusters using the Call-Chain Clustering heuristic // then sort the clusters by density. -DenseMap CallGraphSort::run() { +DenseMap CallGraphSort::run() { std::vector sorted(clusters.size()); std::vector leaders(clusters.size()); @@ -221,16 +193,18 @@ return clusters[a].getDensity() > clusters[b].getDensity(); }); - DenseMap orderMap; - int curOrder = 1; - for (int leader : sorted) + DenseMap orderMap; + // Sections will be sorted by increasing order. Absent sections will have + // priority 0 and be placed at the end of sections. + int curOrder = INT_MIN; + for (int leader : sorted) { for (int i = leader;;) { orderMap[sections[i]] = curOrder++; i = clusters[i].next; if (i == leader) break; } - + } if (!config->printSymbolOrder.empty()) { std::error_code ec; raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); @@ -238,18 +212,21 @@ error("cannot open " + config->printSymbolOrder + ": " + ec.message()); return orderMap; } - // Print the symbols ordered by C3, in the order of increasing curOrder // Instead of sorting all the orderMap, just repeat the loops above. for (int leader : sorted) for (int i = leader;;) { + const SectionChunk *sc = sections[i]; + // Search all the symbols in the file of the section - // and find out a Defined symbol with name that is within the section. - for (Symbol *sym : sections[i]->file->getSymbols()) - if (!sym->isSection()) // Filter out section-type symbols here. - if (auto *d = dyn_cast(sym)) - if (sections[i] == d->section) - os << sym->getName() << "\n"; + // and find out a DefinedCOFF symbol with name that is within the + // section. + for (Symbol *sym : sc->file->getSymbols()) + if (auto *d = dyn_cast_or_null(sym)) + // Filter out non-COMDAT symbols and section symbols. + if (d->isCOMDAT && !d->getCOFFSymbol().isSection() && + sc == d->getChunk()) + os << sym->getName() << "\n"; i = clusters[i].next; if (i == leader) break; @@ -259,11 +236,11 @@ return orderMap; } -// Sort sections by the profile data provided by -callgraph-profile-file +// Sort sections by the profile data provided by /call-graph-ordering-file // // This first builds a call graph based on the profile data then merges sections // according to the C³ heuristic. All clusters are then sorted by a density // metric to further improve locality. -DenseMap elf::computeCallGraphProfileOrder() { +DenseMap coff::computeCallGraphProfileOrder() { return CallGraphSort().run(); } diff --git a/lld/COFF/Config.h b/lld/COFF/Config.h --- a/lld/COFF/Config.h +++ b/lld/COFF/Config.h @@ -9,6 +9,7 @@ #ifndef LLD_COFF_CONFIG_H #define LLD_COFF_CONFIG_H +#include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Object/COFF.h" @@ -29,6 +30,7 @@ class StringChunk; class Symbol; class InputFile; +class SectionChunk; // Short aliases. static const auto AMD64 = llvm::COFF::IMAGE_FILE_MACHINE_AMD64; @@ -200,6 +202,14 @@ // Used for /lto-obj-path: llvm::StringRef ltoObjPath; + // Used for /call-graph-ordering-file: + llvm::MapVector, + uint64_t> + callGraphProfile; + + // Used for /print-symbol-order: + StringRef printSymbolOrder; + uint64_t align = 4096; uint64_t imageBase = -1; uint64_t fileAlign = 512; diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -924,6 +924,46 @@ } } +static void parseCallGraphFile(StringRef path) { + std::unique_ptr mb = CHECK( + MemoryBuffer::getFile(path, -1, false, true), "could not open " + path); + + // Build a map from symbol name to section + DenseMap map; + for (ObjFile *file : ObjFile::instances) + for (Symbol *sym : file->getSymbols()) + if (sym) + map[sym->getName()] = sym; + + auto findSection = [&](StringRef name) -> SectionChunk * { + Symbol *sym = map.lookup(name); + if (!sym) { + if (config->warnMissingOrderSymbol) + warn(path + ": no such symbol: " + name); + return nullptr; + } + + if (DefinedCOFF *dr = dyn_cast_or_null(sym)) + return dyn_cast_or_null(dr->getChunk()); + return nullptr; + }; + + for (StringRef line : args::getLines(*mb)) { + SmallVector fields; + line.split(fields, ' '); + uint64_t count; + + if (fields.size() != 3 || !to_integer(fields[2], count)) { + error(path + ": parse error"); + return; + } + + if (SectionChunk *from = findSection(fields[0])) + if (SectionChunk *to = findSection(fields[1])) + config->callGraphProfile[std::make_pair(from, to)] += count; + } +} + static void markAddrsig(Symbol *s) { if (auto *d = dyn_cast_or_null(s)) if (SectionChunk *c = dyn_cast_or_null(d->getChunk())) @@ -2020,12 +2060,23 @@ if (config->manifest == Configuration::SideBySide) createSideBySideManifest(); + if (args.hasArg(OPT_order) && args.hasArg(OPT_call_graph_ordering_file)) + error("/order and /call-graph-order-file may not be used together"); + // Handle /order. We want to do this at this moment because we // need a complete list of comdat sections to warn on nonexistent // functions. if (auto *arg = args.getLastArg(OPT_order)) parseOrderFile(arg->getValue()); + // Handle /call-graph-ordering-file + if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) + parseCallGraphFile(arg->getValue()); + + // Handle /print-symbol-order + if (auto *arg = args.getLastArg(OPT_print_symbol_order)) + config->printSymbolOrder = arg->getValue(); + // Identify unreferenced COMDAT sections. if (config->doGC) markLive(symtab->getChunks()); diff --git a/lld/COFF/Options.td b/lld/COFF/Options.td --- a/lld/COFF/Options.td +++ b/lld/COFF/Options.td @@ -234,6 +234,13 @@ def threads : P<"threads", "Number of threads. '1' disables multi-threading. By " "default all available hardware threads are used">; +def call_graph_ordering_file: P< + "call-graph-ordering-file", + "Layout sections to optimize the given callgraph">; +def print_symbol_order: P< + "print-symbol-order", + "Print a symbol order specified by /call-graph-ordering-file into the " + "specified file">; // Flags for debugging def lldmap : F<"lldmap">; diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "Writer.h" +#include "CallGraphSort.h" #include "Config.h" #include "DLL.h" #include "InputFiles.h" @@ -229,6 +230,7 @@ void setSectionPermissions(); void writeSections(); void writeBuildId(); + void sortSections(); void sortExceptionTable(); void sortCRTSectionChunks(std::vector &chunks); void addSyntheticIdata(); @@ -798,6 +800,19 @@ name.startswith(".xdata$") || name.startswith(".eh_frame$"); } +void Writer::sortSections() { + if (!config->callGraphProfile.empty()) { + DenseMap order = computeCallGraphProfileOrder(); + for (auto I : order) { + if (DefinedRegular *sym = I.first->sym) + config->order[sym->getName()] = I.second; + } + } + if (!config->order.empty()) + for (auto it : partialSections) + sortBySectionOrder(it.second->chunks); +} + // Create output section objects and add them to OutputSections. void Writer::createSections() { // First, create the builtin sections. @@ -861,10 +876,7 @@ if (hasIdata) addSyntheticIdata(); - // Process an /order option. - if (!config->order.empty()) - for (auto it : partialSections) - sortBySectionOrder(it.second->chunks); + sortSections(); if (hasIdata) locateImportTables(); diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -68,7 +68,7 @@ int next; int prev; - size_t size = 0; + uint64_t size; uint64_t weight = 0; uint64_t initialWeight = 0; Edge bestPred = {-1, 0}; @@ -223,14 +223,14 @@ DenseMap orderMap; int curOrder = 1; - for (int leader : sorted) + for (int leader : sorted) { for (int i = leader;;) { orderMap[sections[i]] = curOrder++; i = clusters[i].next; if (i == leader) break; } - + } if (!config->printSymbolOrder.empty()) { std::error_code ec; raw_fd_ostream os(config->printSymbolOrder, ec, sys::fs::OF_None); diff --git a/lld/test/COFF/cgprofile-bad-clusters.s b/lld/test/COFF/cgprofile-bad-clusters.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-bad-clusters.s @@ -0,0 +1,61 @@ +# REQUIRES: x86 +# This test checks that CallGraphSort ignores edges that would form "bad" +# clusters. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A C 1" > %t.call_graph +# RUN: echo "E B 4" >> %t.call_graph +# RUN: echo "C D 2" >> %t.call_graph +# RUN: echo "B D 1" >> %t.call_graph +# RUN: echo "F G 6" >> %t.call_graph +# RUN: echo "G H 5" >> %t.call_graph +# RUN: echo "H I 4" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s + + .section .text,"ax",one_only,A + .globl A +A: + retq + + .section .text,"ax",one_only,D +D: + .fill 1000, 1, 0 + + .section .text,"ax",one_only,E +E: + retq + + .section .text,"ax",one_only,C +C: + retq + + .section .text,"ax",one_only,B +B: + .fill 1000, 1, 0 + + .section .text,"ax",one_only,F +F: + .fill (1024 * 1024) - 1, 1, 0 + + .section .text,"ax",one_only,G +G: + retq + + .section .text,"ax",one_only,H +H: + retq + + .section .text,"ax",one_only,I +I: + .fill 13, 1, 0 + +# CHECK: 140001000 t H +# CHECK: 140001001 t I +# CHECK: 14000100e T A +# CHECK: 14000100f t C +# CHECK: 140001010 t E +# CHECK: 140001011 t B +# CHECK: 1400013f9 t D +# CHECK: 1400017e1 t F +# CHECK: 1401017e0 t G diff --git a/lld/test/COFF/cgprofile-icf.s b/lld/test/COFF/cgprofile-icf.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-icf.s @@ -0,0 +1,45 @@ +# REQUIRES: x86 +# Test the compatibility of ICF and cgprofile + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "C D 61" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab /opt:icf +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOICF + + .section .text,"x",one_only,D + .globl D +D: + mov $60, %rax + retq + + .section .text,"x",one_only,C + .globl C +C: + mov $60, %rax + retq + + .section .text,"x",one_only,B + .globl B +B: + mov $2, %rax + retq + + .section .text,"x",one_only,A + .globl A +A: + mov $42, %rax + retq + +# CHECK: 140001000 T A +# CHECK: 140001008 T C +# CHECK: 140001008 T D +# CHECK: 140001010 T B + +# NOICF: 140001000 T A +# NOICF: 140001008 T B +# NOICF: 140001010 T C +# NOICF: 140001018 T D diff --git a/lld/test/COFF/cgprofile-print.s b/lld/test/COFF/cgprofile-print.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-print.s @@ -0,0 +1,35 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: echo "A B 5" > %t.call_graph +# RUN: echo "B C 50" >> %t.call_graph +# RUN: echo "C D 40" >> %t.call_graph +# RUN: echo "D B 10" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /print-symbol-order:%t3 +# RUN: FileCheck %s --input-file %t3 + +# CHECK: B +# CHECK-NEXT: C +# CHECK-NEXT: D +# CHECK-NEXT: A + +.section .text, "x", one_only, A +.globl A +A: + nop + +.section .text, "x", one_only, B +.globl B +B: + nop + +.section .text, "x", one_only, C +.globl C +C: + nop + +.section .text, "x", one_only, D +.globl D +D: + nop + diff --git a/lld/test/COFF/cgprofile-txt.s b/lld/test/COFF/cgprofile-txt.s new file mode 100644 --- /dev/null +++ b/lld/test/COFF/cgprofile-txt.s @@ -0,0 +1,44 @@ +# REQUIRES: x86 +# Test correctness of call graph ordering. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-win32 %s -o %t +# RUN: lld-link /subsystem:console /entry:A %t /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 5" > %t.call_graph +# RUN: echo "B C 50" >> %t.call_graph +# RUN: echo "C D 40" >> %t.call_graph +# RUN: echo "D B 10" >> %t.call_graph +# RUN: lld-link /subsystem:console /entry:A %t /call-graph-ordering-file:%t.call_graph /out:%t2 /debug:symtab +# RUN: llvm-nm --numeric-sort %t2 | FileCheck %s + +# NOSORT: 140001000 T A +# NOSORT: 140001001 T B +# NOSORT: 140001002 T C +# NOSORT: 140001003 T D + +# CHECK: 140001000 T B +# CHECK: 140001001 T C +# CHECK: 140001002 T D +# CHECK: 140001003 T A + +.section .text, "x", one_only, A +.globl A +A: + nop + +.section .text, "x", one_only, B +.globl B +B: + nop + +.section .text, "x", one_only, C +.globl C +C: + nop + +.section .text, "x", one_only, D +.globl D +D: + nop +