Index: ELF/CMakeLists.txt =================================================================== --- ELF/CMakeLists.txt +++ ELF/CMakeLists.txt @@ -19,6 +19,7 @@ Arch/SPARCV9.cpp Arch/X86.cpp Arch/X86_64.cpp + CallGraphSort.cpp Driver.cpp DriverUtils.cpp EhFrame.cpp Index: ELF/CallGraphSort.h =================================================================== --- /dev/null +++ ELF/CallGraphSort.h @@ -0,0 +1,23 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_CALL_GRAPH_SORT_H +#define LLD_ELF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace elf { +class InputSectionBase; + +llvm::DenseMap computeCallGraphProfileOrder(); +} // namespace elf +} // namespace lld + +#endif Index: ELF/CallGraphSort.cpp =================================================================== --- /dev/null +++ ELF/CallGraphSort.cpp @@ -0,0 +1,350 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement +/// for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are layed out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incomming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the ammount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "OutputSections.h" +#include "SymbolTable.h" +#include "Symbols.h" + +#include "llvm/Support/MathExtras.h" + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +using ClusterIndex = std::ptrdiff_t; +using SectionIndex = std::ptrdiff_t; +using EdgeIndex = std::ptrdiff_t; + +// Used for calculating an comparing density. Use soft-float for determinism. +struct Double : APFloat { + Double() : APFloat(APFloat::IEEEdouble(), 0) {} + Double(uint64_t Val) : APFloat(APFloat::IEEEdouble(), Val) {} + Double(APFloat A) : APFloat(A) {} + bool operator>(const Double Other) const { + return compare(Other) == cmpGreaterThan; + } + bool operator<(const Double Other) const { + return compare(Other) == cmpLessThan; + } +}; + +struct Cluster { + Cluster(SectionIndex Sec, const InputSectionBase *IS); + + Double getDensity() const { + if (Size == 0) + return 0; + return Double(Weight) / Double(Size); + } + + std::vector ISBs; + std::vector Sections; + int64_t Size = 0; + uint64_t Weight = 0; +}; + +struct Section { + Section(const InputSectionBase *IS) : ISB(IS) { Size = ISB->getSize(); } + + Double getDensity() const { + if (Size == 0) + return 0; + return Double(Weight) / Double(Size); + } + + int64_t Size = 0; + uint64_t Weight = 0; + const InputSectionBase *ISB; + std::vector Preds; + std::vector Succs; +}; + +struct Edge { + SectionIndex From; + SectionIndex To; + uint64_t Weight; + Double NormalizedWeight = 0; + + bool operator==(const Edge Other) const; +}; + +struct EdgeDenseMapInfo { + static Edge getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), 0, 0}; + } + static Edge getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), 0, 0}; + } + static unsigned getHashValue(const Edge &Val) { + return hash_combine(DenseMapInfo::getHashValue(Val.From), + DenseMapInfo::getHashValue(Val.To)); + } + static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; } +}; + +class CallGraphSort { +public: + CallGraphSort(); + + DenseMap run(); + +private: + DenseMap EdgeMap; + std::vector Clusters; + std::vector Edges; + std::vector
Sections; + + void normalizeEdgeWeights(); + void generateClusters(); +}; + +// Maximum ammount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; + +// Maximum cluster size in bytes. +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; +} // end anonymous namespace + +Cluster::Cluster(SectionIndex Sec, const InputSectionBase *IS) { + ISBs.push_back(IS); + Sections.push_back(Sec); + Size = IS->getSize(); +} + +bool Edge::operator==(const Edge Other) const { + return From == Other.From && To == Other.To; +} + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { + MapVector, + uint64_t> &Profile = Config->CallGraphProfile; + DenseMap SecToSec; + + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> SectionIndex { + auto Res = SecToSec.insert(std::make_pair(IS, Sections.size())); + if (Res.second) + Sections.emplace_back(IS); + return Res.first->second; + }; + + // Create the graph. + for (const auto &C : Profile) { + const InputSectionBase *FromSB = C.first.first; + const InputSectionBase *ToSB = C.first.second; + uint64_t Weight = C.second; + + if (Weight == 0) + continue; + + // Ignore edges between input sections belonging to different output + // sections. This is done because otherwise we would end up with clusters + // containing input sections that can't actually be placed adjacently in the + // output. This messes with the cluster size and density calculations. We + // would also end up moving input sections in other output sections without + // moving them closer to what calls them. + if (FromSB->getOutputSection() != ToSB->getOutputSection()) + continue; + + SectionIndex From = GetOrCreateNode(FromSB); + SectionIndex To = GetOrCreateNode(ToSB); + + Sections[To].Weight = SaturatingAdd(Sections[To].Weight, Weight); + + if (From == To) + continue; + + Edge E{From, To, Weight}; + + // Add or increment an edge + auto Res = EdgeMap.insert(std::make_pair(E, Edges.size())); + EdgeIndex EI = Res.first->second; + if (Res.second) { + Edges.push_back(E); + Sections[From].Succs.push_back(To); + Sections[To].Preds.push_back(From); + } else + Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight); + } + normalizeEdgeWeights(); +} + +// Normalize the edge weights so that we can reject edges which have a low +// probibility. +void CallGraphSort::normalizeEdgeWeights() { + for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) { + Section &S = Sections[SI]; + for (SectionIndex PI : S.Preds) { + Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]]; + if (S.Weight == 0) + continue; + E.NormalizedWeight = Double(E.Weight) / Double(S.Weight); + } + } +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &A, Cluster &B) { + Double NewDensity = Double(A.Weight + B.Weight) / Double(A.Size + B.Size); + if (A.getDensity() > + NewDensity * Double(MAX_DENSITY_DEGRADATION)) + return true; + return false; +} + +static void mergeClusters(Cluster &Into, Cluster &From) { + Into.ISBs.insert(Into.ISBs.end(), From.ISBs.begin(), From.ISBs.end()); + Into.Sections.insert(Into.Sections.end(), From.Sections.begin(), + From.Sections.end()); + Into.Size += From.Size; + Into.Weight += From.Weight; + From.ISBs.clear(); + From.Sections.clear(); + From.Size = 0; + From.Weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +void CallGraphSort::generateClusters() { + // Minimum edge probability to consider merging. + const Double MIN_EDGE_PROBABILITY = Double(1) / Double(10); + + std::vector SortedSecs; + std::vector SecToCluster(Sections.size()); + + Clusters.reserve(Sections.size()); + + for (SectionIndex SI = 0, SE = Sections.size(); SI != SE; ++SI) { + Cluster C(SI, Sections[SI].ISB); + C.Size = Sections[SI].Size; + C.Weight = Sections[SI].Weight; + Clusters.push_back(C); + SortedSecs.push_back(SI); + } + + for (Cluster &C : Clusters) { + SecToCluster[C.Sections.front()] = &C; + } + + std::stable_sort(SortedSecs.begin(), SortedSecs.end(), + [&](SectionIndex A, SectionIndex B) { + return Sections[A].getDensity() > Sections[B].getDensity(); + }); + + for (SectionIndex SI : SortedSecs) { + Cluster &C = *SecToCluster[SI]; + + SectionIndex BestPred = -1; + Double BestWeight = 0; + + for (SectionIndex PI : Sections[SI].Preds) { + Edge &E = Edges[EdgeMap[{PI, SI, 0, 0}]]; + if (BestPred == -1 || E.NormalizedWeight > BestWeight) { + BestPred = PI; + BestWeight = E.NormalizedWeight; + } + } + + if (BestWeight < MIN_EDGE_PROBABILITY) + continue; + + Cluster *PredC = SecToCluster[BestPred]; + if (PredC == nullptr || PredC == &C) + continue; + + if (C.Size + PredC->Size > MAX_CLUSTER_SIZE) + continue; + + if (isNewDensityBad(*PredC, C)) + continue; + + for (SectionIndex SI : C.Sections) + SecToCluster[SI] = PredC; + + mergeClusters(*PredC, C); + } + + // Remove empty or dead nodes. + Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(), + [](const Cluster &C) { + return C.Size == 0 || C.Sections.empty(); + }), + Clusters.end()); + + // Sort by density. Invalidates all NodeIndexs. + std::sort(Clusters.begin(), Clusters.end(), + [](const Cluster &A, const Cluster &B) { + return A.getDensity() > B.getDensity(); + }); +} + +DenseMap CallGraphSort::run() { + generateClusters(); + + // Generate order. + llvm::DenseMap OrderMap; + ssize_t CurOrder = 1; + + for (const Cluster &C : Clusters) + for (const InputSectionBase *IS : C.ISBs) + OrderMap[IS] = CurOrder++; + + return OrderMap; +} + +// Sort sections by the profile data provided by -callgraph-profile-file +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ huristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap elf::computeCallGraphProfileOrder() { + return CallGraphSort().run(); +} Index: ELF/Config.h =================================================================== --- ELF/Config.h +++ ELF/Config.h @@ -24,6 +24,7 @@ namespace elf { class InputFile; +class InputSectionBase; enum ELFKind { ELFNoneKind, @@ -103,6 +104,9 @@ std::vector VersionScriptGlobals; std::vector VersionScriptLocals; std::vector BuildIdVector; + llvm::MapVector, + uint64_t> + CallGraphProfile; bool AllowMultipleDefinition; bool AndroidPackDynRelocs = false; bool ARMHasBlx = false; Index: ELF/Driver.cpp =================================================================== --- ELF/Driver.cpp +++ ELF/Driver.cpp @@ -571,6 +571,31 @@ return {BuildIdKind::None, {}}; } +static void readCallGraph(MemoryBufferRef MB) { + // Build a map from symbol name to section + DenseMap SymbolSection; + for (InputFile *File : ObjectFiles) + for (Symbol *Sym : File->getSymbols()) + if (auto *D = dyn_cast(Sym)) + if (auto *IS = dyn_cast_or_null(D->Section)) + SymbolSection[D->getName()] = IS; + + std::vector Lines = args::getLines(MB); + for (StringRef L : Lines) { + SmallVector Fields; + L.split(Fields, ' '); + if (Fields.size() != 3) + fatal("parse error"); + uint64_t Count; + if (!to_integer(Fields[2], Count)) + fatal("parse error"); + InputSectionBase *FromSec = SymbolSection.lookup(Fields[0]); + InputSectionBase *ToSec = SymbolSection.lookup(Fields[1]); + if (FromSec && ToSec) + Config->CallGraphProfile[std::make_pair(FromSec, ToSec)] = Count; + } +} + static bool getCompressDebugSections(opt::InputArgList &Args) { StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none"); if (S == "none") @@ -1118,6 +1143,10 @@ // Apply symbol renames for -wrap. Symtab->applySymbolWrap(); + if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file)) + if (Optional Buffer = readFile(Arg->getValue())) + readCallGraph(*Buffer); + // Now that we have a complete list of input files. // Beyond this point, no new files are added. // Aggregate all input sections into one place. Index: ELF/Options.td =================================================================== --- ELF/Options.td +++ ELF/Options.td @@ -66,6 +66,9 @@ "Only set DT_NEEDED for shared libraries if used", "Always set DT_NEEDED for shared libraries">; +defm call_graph_ordering_file: Eq<"call-graph-ordering-file">, + HelpText<"Layout sections to optimize the given callgraph">; + // -chroot doesn't have a help text because it is an internal option. defm chroot: Eq<"chroot">; Index: ELF/Writer.cpp =================================================================== --- ELF/Writer.cpp +++ ELF/Writer.cpp @@ -9,6 +9,7 @@ #include "Writer.h" #include "AArch64ErrataFix.h" +#include "CallGraphSort.h" #include "Config.h" #include "Filesystem.h" #include "LinkerScript.h" @@ -1110,6 +1111,14 @@ // If no layout was provided by linker script, we want to apply default // sorting for special input sections. This also handles --symbol-ordering-file. template void Writer::sortInputSections() { + // Use the rarely used option -call-graph-ordering-file to sort sections. + if (!Config->CallGraphProfile.empty()) { + DenseMap Order = + computeCallGraphProfileOrder(); + for (BaseCommand *Base : Script->SectionCommands) + if (auto *Sec = dyn_cast(Base)) + sortSection(Sec, Order); + } // Build the order once since it is expensive. DenseMap Order = buildSectionOrder(); for (BaseCommand *Base : Script->SectionCommands) Index: test/ELF/cgprofile-txt.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-txt.s @@ -0,0 +1,106 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: ld.lld -e A %t -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "B C 30" >> %t.call_graph +# RUN: echo "C D 90" >> %t.call_graph +# RUN: echo "PP TS 100" >> %t.call_graph +# RUN: echo "_init2 _init 24567837" >> %t.call_graph +# RUN: echo "TS QC 9001" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s + + .section .text.D,"ax",@progbits +D: + retq + + .section .text.C,"ax",@progbits + .globl C +C: + retq + + .section .text.B,"ax",@progbits + .globl B +B: + retq + + .section .text.A,"ax",@progbits + .globl A +A: + retq + + .section .ponies,"ax",@progbits,unique,1 + .globl TS +TS: + retq + + .section .ponies,"ax",@progbits,unique,2 + .globl PP +PP: + retq + + .section .other,"ax",@progbits,unique,1 + .globl QC +QC: + retq + + .section .other,"ax",@progbits,unique,2 + .globl GB +GB: + retq + + .section .init,"ax",@progbits,unique,1 + .globl _init +_init: + retq + + .section .init,"ax",@progbits,unique,2 + .globl _init2 +_init2: + retq + +# CHECK: Name: D +# CHECK-NEXT: Value: 0x201003 +# CHECK: Name: A +# CHECK-NEXT: Value: 0x201000 +# CHECK: Name: B +# CHECK-NEXT: Value: 0x201001 +# CHECK: Name: C +# CHECK-NEXT: Value: 0x201002 +# CHECK: Name: GB +# CHECK-NEXT: Value: 0x201007 +# CHECK: Name: PP +# CHECK-NEXT: Value: 0x201004 +# CHECK: Name: QC +# CHECK-NEXT: Value: 0x201006 +# CHECK: Name: TS +# CHECK-NEXT: Value: 0x201005 +# CHECK: Name: _init +# CHECK-NEXT: Value: 0x201008 +# CHECK: Name: _init2 +# CHECK-NEXT: Value: 0x201009 + +# NOSORT: Name: D +# NOSORT-NEXT: Value: 0x201000 +# NOSORT: Name: A +# NOSORT-NEXT: Value: 0x201003 +# NOSORT: Name: B +# NOSORT-NEXT: Value: 0x201002 +# NOSORT: Name: C +# NOSORT-NEXT: Value: 0x201001 +# NOSORT: Name: GB +# NOSORT-NEXT: Value: 0x201007 +# NOSORT: Name: PP +# NOSORT-NEXT: Value: 0x201005 +# NOSORT: Name: QC +# NOSORT-NEXT: Value: 0x201006 +# NOSORT: Name: TS +# NOSORT-NEXT: Value: 0x201004 +# NOSORT: Name: _init +# NOSORT-NEXT: Value: 0x201008 +# NOSORT: Name: _init2 +# NOSORT-NEXT: Value: 0x201009