Index: ELF/CMakeLists.txt
===================================================================
--- ELF/CMakeLists.txt
+++ ELF/CMakeLists.txt
@@ -19,6 +19,7 @@
   Arch/SPARCV9.cpp
   Arch/X86.cpp
   Arch/X86_64.cpp
+  CallGraphSort.cpp
   Driver.cpp
   DriverUtils.cpp
   EhFrame.cpp
Index: ELF/CallGraphSort.h
===================================================================
--- /dev/null
+++ ELF/CallGraphSort.h
@@ -0,0 +1,23 @@
+//===- CallGraphSort.h ------------------------------------------*- C++ -*-===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLD_ELF_CALL_GRAPH_SORT_H
+#define LLD_ELF_CALL_GRAPH_SORT_H
+
+#include "llvm/ADT/DenseMap.h"
+
+namespace lld {
+namespace elf {
+class InputSectionBase;
+
+// Returns a section order computed from Config->CallGraphProfile, mapping
+// each profiled input section to its 1-based layout priority.
+llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
+} // namespace elf
+} // namespace lld
+
+#endif
Index: ELF/CallGraphSort.cpp
===================================================================
--- /dev/null
+++ ELF/CallGraphSort.cpp
@@ -0,0 +1,379 @@
+//===- CallGraphSort.cpp --------------------------------------------------===//
+//
+// The LLVM Linker
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
+/// for Large-Scale Data-Center Applications
+/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+///
+/// The goal of this algorithm is to improve runtime performance of the final
+/// executable by arranging code sections such that page table and i-cache
+/// misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C3) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weight
+///     edge from cluster _u_ to _v_ then move the sections in _v_ and append
+///     them to _u_ unless the combined size would be larger than the page
+///     size.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is
+///     a proxy for the amount of execution time spent per byte of the
+///     cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a call graph from the profile
+/// * While there are unresolved edges
+///   * Find the edge with the highest weight
+///   * Check if merging the two clusters would create a cluster larger than
+///     the target page size
+///   * If not, contract that edge putting the callee after the caller
+/// * Sort remaining clusters by density
+///
+//===----------------------------------------------------------------------===//
+
+#include "CallGraphSort.h"
+#include "OutputSections.h"
+#include "SymbolTable.h"
+#include "Symbols.h"
+#include "Target.h"
+
+#include "llvm/Support/MathExtras.h"
+
+#include <algorithm>
+#include <cstddef>
+#include <set>
+
+using namespace llvm;
+using namespace lld;
+using namespace lld::elf;
+
+namespace {
+using NodeIndex = std::ptrdiff_t;
+using EdgeIndex = std::ptrdiff_t;
+
+struct Edge;
+
+// Orders edge indices by the weight of the referenced edge, so that the
+// highest-weight edge sorts last in the priority multiset.
+struct EdgePriorityCmp {
+  std::vector<Edge> &Edges;
+  bool operator()(EdgeIndex A, EdgeIndex B) const;
+};
+
+using PriorityQueue = std::multiset<EdgeIndex, EdgePriorityCmp>;
+
+// A cluster of input sections. Starts as a single section; sections are
+// appended as edges are contracted.
+struct Node {
+  Node() = default;
+  Node(const InputSectionBase *IS);
+  std::vector<const InputSectionBase *> Sections;
+  std::vector<EdgeIndex> IncidentEdges;
+  int64_t Size = 0;
+  uint64_t Weight = 0;
+};
+
+// A weighted directed edge between two nodes. PriorityPos is this edge's
+// position in the work queue, or WorkQueue.end() if the edge is dead.
+struct Edge {
+  NodeIndex From;
+  NodeIndex To;
+  uint64_t Weight;
+  PriorityQueue::iterator PriorityPos;
+  bool operator==(const Edge Other) const;
+  bool operator<(const Edge Other) const;
+};
+
+bool EdgePriorityCmp::operator()(EdgeIndex A, EdgeIndex B) const {
+  return Edges[A].Weight < Edges[B].Weight;
+}
+
+// DenseMap traits for Edge; identity is the (From, To) pair only, so
+// duplicate profile entries for the same section pair map to one edge.
+struct EdgeDenseMapInfo {
+  static Edge getEmptyKey() {
+    return {DenseMapInfo<NodeIndex>::getEmptyKey(),
+            DenseMapInfo<NodeIndex>::getEmptyKey(), 0,
+            PriorityQueue::iterator()};
+  }
+  static Edge getTombstoneKey() {
+    return {DenseMapInfo<NodeIndex>::getTombstoneKey(),
+            DenseMapInfo<NodeIndex>::getTombstoneKey(), 0,
+            PriorityQueue::iterator()};
+  }
+  static unsigned getHashValue(const Edge &Val) {
+    return hash_combine(DenseMapInfo<NodeIndex>::getHashValue(Val.From),
+                        DenseMapInfo<NodeIndex>::getHashValue(Val.To));
+  }
+  static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; }
+};
+
+class CallGraphSort {
+public:
+  CallGraphSort();
+
+  DenseMap<const InputSectionBase *, int> run();
+
+private:
+  std::vector<Node> Nodes;
+  std::vector<Edge> Edges;
+
+  // Live edges ordered by weight; the comparator references Edges.
+  PriorityQueue WorkQueue{EdgePriorityCmp{Edges}};
+
+  bool killEdge(EdgeIndex EI);
+  void contractEdge(EdgeIndex CEI);
+  void generateClusters();
+};
+} // end anonymous namespace
+
+Node::Node(const InputSectionBase *IS) {
+  Sections.push_back(IS);
+  Size = IS->getSize();
+}
+
+bool Edge::operator==(const Edge Other) const {
+  return From == Other.From && To == Other.To;
+}
+
+bool Edge::operator<(const Edge Other) const {
+  if (From != Other.From)
+    return From < Other.From;
+  return To < Other.To;
+}
+
+// Sections in these output sections have a fixed semantic order (init/fini
+// routines, ctor/dtor priorities) and must not be rearranged by the profile.
+static bool isKnownNonreorderableSection(const OutputSection *OS) {
+  return llvm::StringSwitch<bool>(OS->Name)
+      .Cases(".init", ".fini", ".init_array.", ".fini_array.", ".ctors.",
+             ".dtors.", true)
+      .Default(false);
+}
+
+// Take the edge list in Config->CallGraphProfile, resolve symbol names to
+// Symbols, and generate a graph between InputSections with the provided
+// weights.
+CallGraphSort::CallGraphSort() {
+  MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
+            uint64_t> &Profile = Config->CallGraphProfile;
+  DenseMap<const InputSectionBase *, NodeIndex> SecToNode;
+  DenseMap<Edge, EdgeIndex, EdgeDenseMapInfo> EdgeMap;
+
+  auto GetOrCreateNode = [&](const InputSectionBase *IS) -> NodeIndex {
+    auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size()));
+    if (Res.second)
+      Nodes.emplace_back(IS);
+    return Res.first->second;
+  };
+
+  // Create the graph.
+  for (const auto &C : Profile) {
+    const InputSectionBase *FromSB = C.first.first;
+    const InputSectionBase *ToSB = C.first.second;
+    uint64_t Weight = C.second;
+
+    if (Weight == 0)
+      continue;
+
+    // Ignore edges with input sections from different output sections. This is
+    // done so that we don't form clusters which contain input sections which
+    // can't be laid out adjacently. This also prevents accidental reordering
+    // of input sections due to getting assigned an order in the order map.
+    if (FromSB->getOutputSection() != ToSB->getOutputSection())
+      continue;
+
+    if (isKnownNonreorderableSection(FromSB->getOutputSection()))
+      continue;
+
+    NodeIndex From = GetOrCreateNode(FromSB);
+    NodeIndex To = GetOrCreateNode(ToSB);
+
+    Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, Weight);
+
+    // Reject self edges after adding node weight. This ends up with the same
+    // weight as if we had merged the nodes.
+    if (From == To)
+      continue;
+
+    Edge E{From, To, Weight, WorkQueue.end()};
+
+    // Add or increment an edge
+    auto Res = EdgeMap.insert(std::make_pair(E, Edges.size()));
+    EdgeIndex EI = Res.first->second;
+    if (Res.second) {
+      Edges.push_back(E);
+      Nodes[From].IncidentEdges.push_back(EI);
+      Nodes[To].IncidentEdges.push_back(EI);
+    } else
+      Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight);
+  }
+}
+
+/// Like std::unique, but calls Merge on equal values. Merge is allowed
+/// to modify its first argument.
+///
+/// Merge is a callable with signature
+///   Merge(*declval<ForwardIt>(), *declval<ForwardIt>())
+///
+/// Example:
+///
+/// int a[] = {1, 2, 2, 3, 4, 5, 5};
+/// auto end = merge_unique(std::begin(a), std::end(a),
+///                         [](int a, int b) { return a == b; },
+///                         [](int &a, int b) { a += b; });
+///
+/// for (auto i = a; i != end; ++i)
+///   std::cout << *i << " ";
+///
+/// -- 1 4 3 4 10
+template <class ForwardIt, class PredTy, class MergeTy>
+static ForwardIt merge_unique(ForwardIt First, ForwardIt Last, PredTy Pred,
+                              MergeTy Merge) {
+  if (First == Last)
+    return Last;
+
+  ForwardIt I = First;
+  while (++I != Last) {
+    if (Pred(*First, *I))
+      Merge(*First, *I);
+    else if (++First != I)
+      *First = std::move(*I);
+  }
+  return ++First;
+}
+
+/// Marks an edge as dead and removes it from the work queue.
+/// Returns true if the edge was killed, false if it was already dead.
+bool CallGraphSort::killEdge(EdgeIndex EI) {
+  Edge &E = Edges[EI];
+  if (E.PriorityPos != WorkQueue.end()) {
+    WorkQueue.erase(E.PriorityPos);
+    E.PriorityPos = WorkQueue.end();
+    return true;
+  }
+  return false;
+}
+
+/// Remove edge \p CEI from the graph while simultaneously merging its two
+/// incident vertices u and v. This merges any duplicate edges between u and v
+/// by accumulating their weights.
+void CallGraphSort::contractEdge(EdgeIndex CEI) {
+  // Make a copy of the edge as the original will be marked killed while being
+  // used.
+  Edge CE = Edges[CEI];
+  assert(CE.From != CE.To && "Got self edge!");
+  std::vector<EdgeIndex> &FE = Nodes[CE.From].IncidentEdges;
+
+  // Remove the self edge from From.
+  FE.erase(std::remove(FE.begin(), FE.end(), CEI));
+  std::vector<EdgeIndex> &TE = Nodes[CE.To].IncidentEdges;
+
+  // Update all edges incident with To to reference From instead. Then if they
+  // aren't self edges add them to From.
+  for (EdgeIndex EI : TE) {
+    Edge &E = Edges[EI];
+    if (E.From == CE.To)
+      E.From = CE.From;
+    if (E.To == CE.To)
+      E.To = CE.From;
+    if (E.To == E.From) {
+      killEdge(EI);
+      continue;
+    }
+    FE.push_back(EI);
+  }
+
+  // Free memory. Otherwise we end up with N^2 memory usage.
+  std::vector<EdgeIndex>().swap(TE);
+
+  if (FE.empty())
+    return;
+
+  // Sort edges so they can be merged. The stability of this sort doesn't
+  // matter as equal edges will be merged in an order independent manner.
+  std::sort(FE.begin(), FE.end(),
+            [&](EdgeIndex AI, EdgeIndex BI) { return Edges[AI] < Edges[BI]; });
+
+  FE.erase(merge_unique(FE.begin(), FE.end(),
+                        [&](EdgeIndex AI, EdgeIndex BI) {
+                          return Edges[AI] == Edges[BI];
+                        },
+                        [&](EdgeIndex AI, EdgeIndex BI) {
+                          Edge &A = Edges[AI];
+                          Edge &B = Edges[BI];
+                          killEdge(BI);
+                          bool Restore = killEdge(AI);
+                          A.Weight = SaturatingAdd(A.Weight, B.Weight);
+                          if (Restore)
+                            A.PriorityPos = WorkQueue.insert(AI);
+                        }),
+           FE.end());
+}
+
+// Group InputSections into clusters using the Call-Chain Clustering heuristic
+// then sort the clusters by density.
+void CallGraphSort::generateClusters() {
+  for (size_t I = 0; I < Edges.size(); ++I) {
+    Edges[I].PriorityPos = WorkQueue.insert(I);
+  }
+
+  // Collapse the graph.
+  while (!WorkQueue.empty()) {
+    PriorityQueue::const_iterator I = --WorkQueue.end();
+    EdgeIndex MaxI = *I;
+    const Edge MaxE = Edges[MaxI];
+    killEdge(MaxI);
+    // Merge the Nodes.
+    Node &From = Nodes[MaxE.From];
+    Node &To = Nodes[MaxE.To];
+    if (From.Size + To.Size > Target->PageSize)
+      continue;
+    contractEdge(MaxI);
+    From.Sections.insert(From.Sections.end(), To.Sections.begin(),
+                         To.Sections.end());
+    From.Size += To.Size;
+    From.Weight = SaturatingAdd(From.Weight, To.Weight);
+    To.Sections.clear();
+    To.Size = 0;
+    To.Weight = 0;
+  }
+
+  // Remove empty or dead nodes.
+  Nodes.erase(std::remove_if(Nodes.begin(), Nodes.end(),
+                             [](const Node &N) {
+                               return N.Size == 0 || N.Sections.empty();
+                             }),
+              Nodes.end());
+
+  // Sort by density. Invalidates all NodeIndexes.
+  std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) {
+    return (APFloat(APFloat::IEEEdouble(), A.Weight) /
+            APFloat(APFloat::IEEEdouble(), A.Size))
+               .compare(APFloat(APFloat::IEEEdouble(), B.Weight) /
+                        APFloat(APFloat::IEEEdouble(), B.Size)) ==
+           APFloat::cmpLessThan;
+  });
+}
+
+DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
+  generateClusters();
+
+  // Generate order.
+  llvm::DenseMap<const InputSectionBase *, int> OrderMap;
+  ssize_t CurOrder = 1;
+
+  for (const Node &N : Nodes)
+    for (const InputSectionBase *IS : N.Sections)
+      OrderMap[IS] = CurOrder++;
+
+  return OrderMap;
+}
+
+// Sort sections by the profile data provided by -callgraph-profile-file
+//
+// This first builds a call graph based on the profile data then iteratively
+// merges the hottest call edges as long as it would not create a cluster
+// larger than the page size. All clusters are then sorted by a density metric
+// to further improve locality.
+DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  return CallGraphSort().run();
+}
Index: ELF/Config.h
===================================================================
--- ELF/Config.h
+++ ELF/Config.h
@@ -25,6 +25,7 @@
 namespace elf {
 
 class InputFile;
+class InputSectionBase;
 
 enum ELFKind {
   ELFNoneKind,
@@ -104,6 +105,9 @@
   std::vector<SymbolVersion> VersionScriptGlobals;
   std::vector<SymbolVersion> VersionScriptLocals;
   std::vector<uint8_t> BuildIdVector;
+  llvm::MapVector<std::pair<const InputSectionBase *, const InputSectionBase *>,
+                  uint64_t>
+      CallGraphProfile;
   bool AllowMultipleDefinition;
   bool AndroidPackDynRelocs = false;
   bool ARMHasBlx = false;
Index: ELF/Driver.cpp
===================================================================
--- ELF/Driver.cpp
+++ ELF/Driver.cpp
@@ -570,6 +570,31 @@
   return {BuildIdKind::None, {}};
 }
 
+// Parse a call graph profile of "<from> <to> <count>" lines, resolving the
+// symbol names to the input sections that define them. Edges whose endpoints
+// can't be resolved are silently dropped.
+static void readCallGraph(MemoryBufferRef MB) {
+  // Build a map from symbol name to section
+  DenseMap<StringRef, InputSectionBase *> SymbolSection;
+  for (InputFile *File : ObjectFiles)
+    for (Symbol *Sym : File->getSymbols())
+      if (auto *D = dyn_cast<Defined>(Sym))
+        if (auto *IS = dyn_cast_or_null<InputSectionBase>(D->Section))
+          SymbolSection[D->getName()] = IS;
+
+  std::vector<StringRef> Lines = args::getLines(MB);
+  for (StringRef L : Lines) {
+    SmallVector<StringRef, 3> Fields;
+    L.split(Fields, ' ');
+    if (Fields.size() != 3)
+      fatal("parse error");
+    uint64_t Count;
+    if (!to_integer(Fields[2], Count))
+      fatal("parse error");
+    InputSectionBase *FromSec = SymbolSection.lookup(Fields[0]);
+    InputSectionBase *ToSec = SymbolSection.lookup(Fields[1]);
+    if (FromSec && ToSec)
+      Config->CallGraphProfile[std::make_pair(FromSec, ToSec)] = Count;
+  }
+}
+
 static bool getCompressDebugSections(opt::InputArgList &Args) {
   StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none");
   if (S == "none")
@@ -1097,6 +1122,10 @@
   // Apply symbol renames for -wrap.
   Symtab->applySymbolWrap();
 
+  if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file))
+    if (Optional<MemoryBufferRef> Buffer = readFile(Arg->getValue()))
+      readCallGraph(*Buffer);
+
   // Now that we have a complete list of input files.
   // Beyond this point, no new files are added.
   // Aggregate all input sections into one place.
Index: ELF/Options.td
===================================================================
--- ELF/Options.td
+++ ELF/Options.td
@@ -66,6 +66,9 @@
     "Only set DT_NEEDED for shared libraries if used",
     "Always set DT_NEEDED for shared libraries">;
 
+def call_graph_ordering_file: S<"call-graph-ordering-file">,
+  HelpText<"Layout sections to optimize the given callgraph">;
+
 // -chroot doesn't have a help text because it is an internal option.
 def chroot: S<"chroot">;
 
Index: ELF/Writer.cpp
===================================================================
--- ELF/Writer.cpp
+++ ELF/Writer.cpp
@@ -9,6 +9,7 @@
 
 #include "Writer.h"
 #include "AArch64ErrataFix.h"
+#include "CallGraphSort.h"
 #include "Config.h"
 #include "Filesystem.h"
 #include "LinkerScript.h"
@@ -1058,6 +1059,17 @@
 // If no layout was provided by linker script, we want to apply default
 // sorting for special input sections. This also handles --symbol-ordering-file.
template void Writer::sortInputSections() { + // Use the rarely used option -call-graph-ordering-file to sort sections. + if (!Config->CallGraphProfile.empty()) { + DenseMap OrderMap = + computeCallGraphProfileOrder(); + + for (BaseCommand *Base : Script->SectionCommands) + if (auto *Sec = dyn_cast(Base)) + if (Sec->Live) + Sec->sort([&](InputSectionBase *S) { return OrderMap.lookup(S); }); + } + // Sort input sections by priority using the list provided // by --symbol-ordering-file. DenseMap Order = buildSectionOrder(); Index: test/ELF/cgprofile-txt.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-txt.s @@ -0,0 +1,106 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: ld.lld -e A %t -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "B C 30" >> %t.call_graph +# RUN: echo "C D 90" >> %t.call_graph +# RUN: echo "PP TS 100" >> %t.call_graph +# RUN: echo "_init2 _init 24567837" >> %t.call_graph +# RUN: echo "TS QC 9001" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s + + .section .text.D,"ax",@progbits +D: + retq + + .section .text.C,"ax",@progbits + .globl C +C: + retq + + .section .text.B,"ax",@progbits + .globl B +B: + retq + + .section .text.A,"ax",@progbits + .globl A +A: + retq + + .section .ponies,"ax",@progbits,unique,1 + .globl TS +TS: + retq + + .section .ponies,"ax",@progbits,unique,2 + .globl PP +PP: + retq + + .section .other,"ax",@progbits,unique,1 + .globl QC +QC: + retq + + .section .other,"ax",@progbits,unique,2 + .globl GB +GB: + retq + + .section .init,"ax",@progbits,unique,1 + .globl _init +_init: + retq + + .section .init,"ax",@progbits,unique,2 + .globl _init2 +_init2: + retq + +# CHECK: Name: D +# CHECK-NEXT: Value: 0x201003 +# CHECK: 
Name: A +# CHECK-NEXT: Value: 0x201000 +# CHECK: Name: B +# CHECK-NEXT: Value: 0x201001 +# CHECK: Name: C +# CHECK-NEXT: Value: 0x201002 +# CHECK: Name: GB +# CHECK-NEXT: Value: 0x201007 +# CHECK: Name: PP +# CHECK-NEXT: Value: 0x201004 +# CHECK: Name: QC +# CHECK-NEXT: Value: 0x201006 +# CHECK: Name: TS +# CHECK-NEXT: Value: 0x201005 +# CHECK: Name: _init +# CHECK-NEXT: Value: 0x201008 +# CHECK: Name: _init2 +# CHECK-NEXT: Value: 0x201009 + +# NOSORT: Name: D +# NOSORT-NEXT: Value: 0x201000 +# NOSORT: Name: A +# NOSORT-NEXT: Value: 0x201003 +# NOSORT: Name: B +# NOSORT-NEXT: Value: 0x201002 +# NOSORT: Name: C +# NOSORT-NEXT: Value: 0x201001 +# NOSORT: Name: GB +# NOSORT-NEXT: Value: 0x201007 +# NOSORT: Name: PP +# NOSORT-NEXT: Value: 0x201005 +# NOSORT: Name: QC +# NOSORT-NEXT: Value: 0x201006 +# NOSORT: Name: TS +# NOSORT-NEXT: Value: 0x201004 +# NOSORT: Name: _init +# NOSORT-NEXT: Value: 0x201008 +# NOSORT: Name: _init2 +# NOSORT-NEXT: Value: 0x201009