diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -73,6 +73,7 @@ Passes Support TargetParser + TransformUtils LINK_LIBS lldCommon diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h --- a/lld/ELF/CallGraphSort.h +++ b/lld/ELF/CallGraphSort.h @@ -14,6 +14,8 @@ namespace lld::elf { class InputSectionBase; +llvm::DenseMap computeCacheDirectedSortOrder(); + llvm::DenseMap computeCallGraphProfileOrder(); } // namespace lld::elf diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. At the -/// beginning of the algorithm each input section has its own cluster and -/// the weight of the cluster is the sum of the weight of all incoming -/// edges. -/// * Call-Chain Clustering (C³) Heuristic -/// * Defines when and how clusters are combined. Pick the highest weighted -/// input section then add it to its most likely predecessor if it wouldn't -/// penalize it too much. -/// * Density -/// * The weight of the cluster divided by the size of the cluster. This is a -/// proxy for the amount of execution time spent per byte of the cluster. 
-/// -/// It does so given a call graph profile by the following: -/// * Build a weighted call graph from the call graph profile -/// * Sort input sections by weight -/// * For each input section starting with the highest weight -/// * Find its most likely predecessor cluster -/// * Check if the combined cluster would be too large, or would have too low -/// a density. -/// * If not, then combine the clusters. -/// * Sort non-empty clusters by density +/// The file is responsible for sorting sections using LLVM call graph profile +/// data by placing frequently executed code sections together. The goal of the +/// placement is to improve the runtime performance of the final executable by +/// arranging code sections so that i-TLB misses and i-cache misses are reduced. /// +/// The algorithm first builds a call graph based on the profile data and then +/// iteratively merges "chains" (ordered lists) of input sections which will be +/// laid out as a unit. There are two implementations for deciding how to +/// merge a pair of chains: +/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows +/// "Optimizing Function Placement for Large-Scale Data-Center Applications" +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which +/// typically produces layouts with higher locality, and hence, yields fewer +/// instruction cache misses on large binaries. //===----------------------------------------------------------------------===// #include "CallGraphSort.h" @@ -45,6 +28,7 @@ #include "InputSection.h" #include "Symbols.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include @@ -75,6 +59,33 @@ Edge bestPred = {-1, 0}; }; +/// Implementation of the Call-Chain Clustering (C^3). 
The goal of this +/// algorithm is to improve runtime performance of the executable by arranging +/// code sections such that page table and i-cache misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are laid out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incoming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the amount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density class CallGraphSort { public: CallGraphSort(); @@ -260,11 +271,74 @@ return orderMap; } +// Sort sections by the profile data using the Cache-Directed Sort algorithm. +// The placement is done by optimizing the locality by co-locating frequently +// executed code sections together. +DenseMap elf::computeCacheDirectedSortOrder() { + SmallVector funcSizes; + SmallVector funcCounts; + SmallVector callCounts; + SmallVector callOffsets; + SmallVector sections; + DenseMap secToTargetId; + + auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { + auto res = secToTargetId.try_emplace(inSec, sections.size()); + if (res.second) { + // inSec has not appeared in the graph before.
+ sections.push_back(inSec); + assert(inSec->getSize() > 0 && "found a function with zero size"); + funcSizes.push_back(inSec->getSize()); + funcCounts.push_back(0); + } + return res.first->second; + }; + + // Create the graph. + for (std::pair &c : config->callGraphProfile) { + const InputSectionBase *fromSB = cast(c.first.first); + const InputSectionBase *toSB = cast(c.first.second); + // Ignore edges between input sections belonging to different output + // sections. + if (fromSB->getOutputSection() != toSB->getOutputSection()) + continue; + + uint64_t weight = c.second; + // Ignore edges with zero weight. + if (weight == 0) + continue; + + size_t from = getOrCreateNode(fromSB); + size_t to = getOrCreateNode(toSB); + // Ignore self-edges (recursive calls). + if (from == to) + continue; + + callCounts.push_back({from, to, weight}); + // Assume that the jump is at the middle of the input section. The profile + // data does not contain jump offsets. + callOffsets.push_back((funcSizes[from] + 1) / 2); + funcCounts[to] += weight; + } + + // Run the layout algorithm. + std::vector sortedSections = codelayout::computeCacheDirectedLayout( + funcSizes, funcCounts, callCounts, callOffsets); + + // Create the final order. + DenseMap orderMap; + int curOrder = 1; + for (uint64_t secIdx : sortedSections) + orderMap[sections[secIdx]] = curOrder++; + + return orderMap; +} + // Sort sections by the profile data provided by --callgraph-profile-file. // // This first builds a call graph based on the profile data then merges sections -// according to the C³ heuristic. All clusters are then sorted by a density -// metric to further improve locality. +// according to either the C³ or the Cache-Directed-Sort ordering algorithm.
DenseMap elf::computeCallGraphProfileOrder() { + if (config->callGraphProfileSort == CGProfileSortKind::Cdsort) + return computeCacheDirectedSortOrder(); return CallGraphSort().run(); } diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -59,8 +59,8 @@ // For --build-id. enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid }; -// For --call-graph-profile-sort={none,hfsort}. -enum class CGProfileSortKind { None, Hfsort }; +// For --call-graph-profile-sort={none,hfsort,cdsort}. +enum class CGProfileSortKind { None, Hfsort, Cdsort }; // For --discard-{all,locals,none}. enum class DiscardPolicy { Default, All, Locals, None }; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1098,6 +1098,8 @@ StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); if (s == "hfsort") return CGProfileSortKind::Hfsort; + if (s == "cdsort") + return CGProfileSortKind::Cdsort; if (s != "none") error("unknown --call-graph-profile-sort= value: " + s); return CGProfileSortKind::None; diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -607,7 +607,7 @@ defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)", "turn off warnings about profile cfg mismatch">; -defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -129,6 +129,8 @@ Ignore call graph profile. .It Cm hfsort Use hfsort (default). +.It Cm cdsort +Use cdsort. 
.El .Pp .It Fl -color-diagnostics Ns = Ns Ar value diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -30,6 +30,9 @@ # RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b # RUN: cmp %t2 %t2b +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT + # RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ # RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN @@ -167,6 +170,31 @@ # CHECK: Name: _init2 # CHECK-NEXT: Value: 0x201141 +# CDSORT: Name: D +# CDSORT-NEXT: Value: 0x201123 +# CDSORT: Name: TooManyPreds +# CDSORT-NEXT: Value: 0x20112F +# CDSORT: Name: TooManyPreds10 +# CDSORT-NEXT: Value: 0x20112E +# CDSORT: Name: C +# CDSORT-NEXT: Value: 0x201122 +# CDSORT: Name: B +# CDSORT-NEXT: Value: 0x201121 +# CDSORT: Name: A +# CDSORT-NEXT: Value: 0x201120 +# CDSORT: Name: TS +# CDSORT-NEXT: Value: 0x20113D +# CDSORT: Name: PP +# CDSORT-NEXT: Value: 0x20113C +# CDSORT: Name: QC +# CDSORT-NEXT: Value: 0x20113E +# CDSORT: Name: GB +# CDSORT-NEXT: Value: 0x20113F +# CDSORT: Name: _init +# CDSORT-NEXT: Value: 0x201140 +# CDSORT: Name: _init2 +# CDSORT-NEXT: Value: 0x201141 + # NOSORT: Name: D # NOSORT-NEXT: Value: 0x201120 # NOSORT: Name: TooManyPreds diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s --- a/lld/test/ELF/cgprofile-txt2.s +++ b/lld/test/ELF/cgprofile-txt2.s @@ -5,17 +5,28 @@ # RUN: echo "B C 50" >> %t.call_graph # RUN: echo "C D 40" >> %t.call_graph # RUN: echo "D B 10" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 -# RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s 
--check-prefix=CHECKC3 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS -# CHECK: Name: A -# CHECK-NEXT: Value: 0x201123 -# CHECK: Name: B -# CHECK-NEXT: Value: 0x201120 -# CHECK: Name: C -# CHECK-NEXT: Value: 0x201121 -# CHECK: Name: D -# CHECK-NEXT: Value: 0x201122 +# CHECKC3: Name: A +# CHECKC3-NEXT: Value: 0x201123 +# CHECKC3: Name: B +# CHECKC3-NEXT: Value: 0x201120 +# CHECKC3: Name: C +# CHECKC3-NEXT: Value: 0x201121 +# CHECKC3: Name: D +# CHECKC3-NEXT: Value: 0x201122 + +# CHECKCDS: Name: A +# CHECKCDS-NEXT: Value: 0x201120 +# CHECKCDS: Name: B +# CHECKCDS-NEXT: Value: 0x201121 +# CHECKCDS: Name: C +# CHECKCDS-NEXT: Value: 0x201122 +# CHECKCDS: Name: D +# CHECKCDS-NEXT: Value: 0x201123 .section .text.A,"ax",@progbits .globl A