diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -72,6 +72,7 @@ Option Passes Support + TransformUtils TargetParser LINK_LIBS diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h --- a/lld/ELF/CallGraphSort.h +++ b/lld/ELF/CallGraphSort.h @@ -14,6 +14,8 @@ namespace lld::elf { class InputSectionBase; +llvm::DenseMap applyCDSort(); + llvm::DenseMap computeCallGraphProfileOrder(); } // namespace lld::elf diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. At the -/// beginning of the algorithm each input section has its own cluster and -/// the weight of the cluster is the sum of the weight of all incoming -/// edges. -/// * Call-Chain Clustering (C³) Heuristic -/// * Defines when and how clusters are combined. Pick the highest weighted -/// input section then add it to its most likely predecessor if it wouldn't -/// penalize it too much. -/// * Density -/// * The weight of the cluster divided by the size of the cluster. This is a -/// proxy for the amount of execution time spent per byte of the cluster. -/// -/// It does so given a call graph profile by the following: -/// * Build a weighted call graph from the call graph profile -/// * Sort input sections by weight -/// * For each input section starting with the highest weight -/// * Find its most likely predecessor cluster -/// * Check if the combined cluster would be too large, or would have too low -/// a density. -/// * If not, then combine the clusters. -/// * Sort non-empty clusters by density +/// The file is responsible for sorting sections using the profile data by +/// placing frequently executed code sections together. The goal of the +/// placement is to improve the runtime performance of the final executable by +/// arranging code sections such that page table and i-cache misses are reduced. /// +/// The algorithm first builds a call graph based on the profile data and then +/// iteratively merges "chains" (ordered lists) of input sections which will be +/// laid out as a unit. There are two implementations for deciding how to +/// merge a pair of chains: +/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows +/// "Optimizing Function Placement for Large-Scale Data-Center Applications" +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which +/// typically produces layouts with higher locality, and hence, yields fewer +/// instruction cache misses on large binaries. //===----------------------------------------------------------------------===// #include "CallGraphSort.h" @@ -45,6 +28,7 @@ #include "InputSection.h" #include "Symbols.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include @@ -75,6 +59,33 @@ Edge bestPred = {-1, 0}; }; +/// Implementation of the Call-Chain Clustering (C^3). The goal of this +/// algorithm is to improve runtime performance of the executable by arranging +/// code sections such that page table and i-cache misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are laid out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incoming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the amount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density class CallGraphSort { public: CallGraphSort(); @@ -260,11 +271,85 @@ return orderMap; } +using CallT = std::pair; + +// Fill in necessary data for Cache-Directed Sort. +static void buildCallGraph(std::vector &funcSizes, + std::vector &funcCounts, + std::vector> &callCounts, + std::vector &callOffsets, + std::vector §ions) { + MapVector &profile = config->callGraphProfile; + DenseMap secToTargetId; + + auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { + auto res = secToTargetId.try_emplace(inSec, sections.size()); + if (res.second) { + // inSec does not appear before in the graph. + sections.push_back(inSec); + assert(inSec->getSize() > 0 && "found a function with zero size"); + funcSizes.push_back(inSec->getSize()); + funcCounts.push_back(0); + } + return res.first->second; + }; + + // Create the graph + for (std::pair &c : profile) { + const InputSectionBase *fromSB = cast(c.first.first); + const InputSectionBase *toSB = cast(c.first.second); + // Ignore edges between input sections belonging to different sections. + if (fromSB->getOutputSection() != toSB->getOutputSection()) + continue; + + uint64_t weight = c.second; + // Ignore edges with zero weight. + if (weight == 0) + continue; + + size_t from = getOrCreateNode(fromSB); + size_t to = getOrCreateNode(toSB); + // Ignore self-edges (recursive calls). + if (from == to) + continue; + + callCounts.push_back(std::make_pair(std::make_pair(from, to), weight)); + callOffsets.push_back((funcSizes[from] + 1) / 2); + funcCounts[to] += weight; + } +} + +// Sort sections by the profile data using the Cache-Directed Sort algorithm. +// The placement is done by optimizing the locality by co-locating frequently +// executed code sections together. +DenseMap elf::applyCDSort() { + // Creating a call graph and necessary data. + std::vector funcSizes; + std::vector funcCounts; + std::vector> callCounts; + std::vector callOffsets; + std::vector sections; + buildCallGraph(funcSizes, funcCounts, callCounts, callOffsets, sections); + + // Run the layout algorithm. + std::vector sortedSections = + applyCDSLayout(funcSizes, funcCounts, callCounts, callOffsets); + + // Create the final order. + DenseMap orderMap; + int curOrder = 1; + for (uint64_t secIdx : sortedSections) + orderMap[sections[secIdx]] = curOrder++; + + return orderMap; +} + // Sort sections by the profile data provided by --callgraph-profile-file. // // This first builds a call graph based on the profile data then merges sections -// according to the C³ heuristic. All clusters are then sorted by a density -// metric to further improve locality. +// according either to the C³ or Cache-Directed-Sort ordering algorithm. DenseMap elf::computeCallGraphProfileOrder() { + if (config->useCDSort) + return applyCDSort(); return CallGraphSort().run(); } diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -289,6 +289,7 @@ bool undefinedVersion; bool unique; bool useAndroidRelrTags = false; + bool useCDSort; bool warnBackrefs; llvm::SmallVector warnBackrefsExclude; bool warnCommon; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1364,6 +1364,7 @@ config->unique = args.hasArg(OPT_unique); config->useAndroidRelrTags = args.hasFlag( OPT_use_android_relr_tags, OPT_no_use_android_relr_tags, false); + config->useCDSort = args.hasFlag(OPT_use_cdsort, OPT_no_use_cdsort, true); config->warnBackrefs = args.hasFlag(OPT_warn_backrefs, OPT_no_warn_backrefs, false); config->warnCommon = args.hasFlag(OPT_warn_common, OPT_no_warn_common, false); diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -337,6 +337,10 @@ "Use SHT_ANDROID_RELR / DT_ANDROID_RELR* tags instead of SHT_RELR / DT_RELR*", "Use SHT_RELR / DT_RELR* tags (default)">; +defm use_cdsort: BB<"use-cdsort", + "Use cache-directed-sort to reorder functions in the binary (default)", + "Do not use cache-directed-sort to reorder functions in the binary">; + def pic_veneer: F<"pic-veneer">, HelpText<"Always generate position independent thunks (veneers)">; diff --git a/lld/test/ELF/cgprofile-bad-clusters.s b/lld/test/ELF/cgprofile-bad-clusters.s --- a/lld/test/ELF/cgprofile-bad-clusters.s +++ b/lld/test/ELF/cgprofile-bad-clusters.s @@ -10,7 +10,7 @@ # RUN: echo "F G 6" >> %t.call_graph # RUN: echo "G H 5" >> %t.call_graph # RUN: echo "H I 4" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2 # RUN: llvm-readobj --symbols %t2 | FileCheck %s .section .text.A,"ax",@progbits diff --git a/lld/test/ELF/cgprofile-icf.s b/lld/test/ELF/cgprofile-icf.s --- a/lld/test/ELF/cgprofile-icf.s +++ b/lld/test/ELF/cgprofile-icf.s @@ -5,9 +5,9 @@ # RUN: echo "A B 100" > %t.call_graph # RUN: echo "A C 40" >> %t.call_graph # RUN: echo "C D 61" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t.out -icf=all +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t.out -icf=all # RUN: llvm-readobj --symbols %t.out | FileCheck %s -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2.out +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2.out # RUN: llvm-readobj --symbols %t2.out | FileCheck %s --check-prefix=NOICF .section .text.D,"ax",@progbits diff --git a/lld/test/ELF/cgprofile-rela.test b/lld/test/ELF/cgprofile-rela.test --- a/lld/test/ELF/cgprofile-rela.test +++ b/lld/test/ELF/cgprofile-rela.test @@ -3,7 +3,7 @@ # REQUIRES: x86 # RUN: yaml2obj %s -o %t.o -# RUN: ld.lld %t.o -o %t +# RUN: ld.lld --no-use-cdsort %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s # RUN: ld.lld --no-call-graph-profile-sort %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -24,8 +24,10 @@ # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2 # RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --use-cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT .section .text.D,"ax",@progbits D: @@ -159,6 +161,31 @@ # CHECK: Name: _init2 # CHECK-NEXT: Value: 0x201141 +# CDSORT: Name: D +# CDSORT-NEXT: Value: 0x201123 +# CDSORT: Name: TooManyPreds +# CDSORT-NEXT: Value: 0x20112F +# CDSORT: Name: TooManyPreds10 +# CDSORT-NEXT: Value: 0x20112E +# CDSORT: Name: C +# CDSORT-NEXT: Value: 0x201122 +# CDSORT: Name: B +# CDSORT-NEXT: Value: 0x201121 +# CDSORT: Name: A +# CDSORT-NEXT: Value: 0x201120 +# CDSORT: Name: TS +# CDSORT-NEXT: Value: 0x20113D +# CDSORT: Name: PP +# CDSORT-NEXT: Value: 0x20113C +# CDSORT: Name: QC +# CDSORT-NEXT: Value: 0x20113E +# CDSORT: Name: GB +# CDSORT-NEXT: Value: 0x20113F +# CDSORT: Name: _init +# CDSORT-NEXT: Value: 0x201140 +# CDSORT: Name: _init2 +# CDSORT-NEXT: Value: 0x201141 + # NOSORT: Name: D # NOSORT-NEXT: Value: 0x201120 # NOSORT: Name: TooManyPreds diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s --- a/lld/test/ELF/cgprofile-txt2.s +++ b/lld/test/ELF/cgprofile-txt2.s @@ -5,17 +5,28 @@ # RUN: echo "B C 50" >> %t.call_graph # RUN: echo "C D 40" >> %t.call_graph # RUN: echo "D B 10" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --no-use-cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3 # RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 -# RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS -# CHECK: Name: A -# CHECK-NEXT: Value: 0x201123 -# CHECK: Name: B -# CHECK-NEXT: Value: 0x201120 -# CHECK: Name: C -# CHECK-NEXT: Value: 0x201121 -# CHECK: Name: D -# CHECK-NEXT: Value: 0x201122 +# CHECKC3: Name: A +# CHECKC3-NEXT: Value: 0x201123 +# CHECKC3: Name: B +# CHECKC3-NEXT: Value: 0x201120 +# CHECKC3: Name: C +# CHECKC3-NEXT: Value: 0x201121 +# CHECKC3: Name: D +# CHECKC3-NEXT: Value: 0x201122 + +# CHECKCDS: Name: A +# CHECKCDS-NEXT: Value: 0x201120 +# CHECKCDS: Name: B +# CHECKCDS-NEXT: Value: 0x201121 +# CHECKCDS: Name: C +# CHECKCDS-NEXT: Value: 0x201122 +# CHECKCDS: Name: D +# CHECKCDS-NEXT: Value: 0x201123 .section .text.A,"ax",@progbits .globl A