Index: ELF/CMakeLists.txt =================================================================== --- ELF/CMakeLists.txt +++ ELF/CMakeLists.txt @@ -19,6 +19,7 @@ Arch/SPARCV9.cpp Arch/X86.cpp Arch/X86_64.cpp + CallGraphSort.cpp Driver.cpp DriverUtils.cpp EhFrame.cpp Index: ELF/CallGraphSort.h =================================================================== --- /dev/null +++ ELF/CallGraphSort.h @@ -0,0 +1,24 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_CALL_GRAPH_SORT_H +#define LLD_ELF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace elf { +class InputSectionBase; + +llvm::DenseMap +computeCallGraphProfileOrder(); +} +} + +#endif Index: ELF/CallGraphSort.cpp =================================================================== --- /dev/null +++ ELF/CallGraphSort.cpp @@ -0,0 +1,334 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file This file implements Call-Chain Clustering from: +/// Optimizing Function Placement for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are layed out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incomming +/// edges. +/// * Call-Chain Clustering (Cウ) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weight edge +/// from cluster _u_ to _v_ then move the sections in _v_ and append them to +/// _u_ unless the combined size would be larger than the page size. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the ammount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a call graph from the profile +/// * While there are unresolved edges +/// * Find the edge with the highest weight +/// * Check if merging the two clusters would create a cluster larger than the +/// target page size +/// * If not, contract that edge putting the callee after the caller +/// * Sort remaining clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "Symbols.h" +#include "SymbolTable.h" +#include "Target.h" + +#include "llvm/Support/MathExtras.h" + +#include +#include +#include + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +using NodeIndex = std::ptrdiff_t; +using EdgeIndex = std::ptrdiff_t; + +struct Edge; + +struct EdgePriorityCmp { + std::vector &Edges; + bool operator()(EdgeIndex A, EdgeIndex B) const; +}; + +using PriorityQueue = std::set; + +struct Node { + Node() = default; + Node(const InputSectionBase *IS); + std::vector Sections; + std::vector IncidentEdges; + int64_t Size = 0; + uint64_t Weight = 0; +}; + +struct Edge { + NodeIndex From; + NodeIndex To; + mutable uint64_t Weight; + PriorityQueue::iterator PriorityPos; + bool operator==(const Edge Other) const; + bool operator<(const Edge Other) const; + void kill(); + bool isDead() const; +}; + +bool EdgePriorityCmp::operator()(EdgeIndex A, EdgeIndex B) const { + return Edges[A].Weight < Edges[B].Weight; +} + +struct EdgeDenseMapInfo { + static Edge getEmptyKey() { + return {DenseMapInfo::getEmptyKey(), + DenseMapInfo::getEmptyKey(), 0}; + } + static Edge getTombstoneKey() { + return {DenseMapInfo::getTombstoneKey(), + DenseMapInfo::getTombstoneKey(), 0}; + } + static unsigned getHashValue(const Edge &Val) { + return hash_combine(DenseMapInfo::getHashValue(Val.From), + DenseMapInfo::getHashValue(Val.To)); + } + static bool isEqual(const Edge &LHS, const Edge &RHS) { return LHS == RHS; } +}; + +class CallGraphSort { +public: + CallGraphSort( + DenseMap, uint64_t> &Profile); + + DenseMap run(); + +private: + std::vector Nodes; + std::vector Edges; + + PriorityQueue WorkQueue{EdgePriorityCmp{Edges}}; + + void contractEdge(EdgeIndex CEI); + void generateClusters(); +}; +} // end anonymous namespace + +Node::Node(const InputSectionBase *IS) { + Sections.push_back(IS); + Size = IS->getSize(); +} + +bool Edge::operator==(const Edge Other) const { + return From == Other.From && To == Other.To; +} + +bool Edge::operator<(const Edge Other) const { + if (From != Other.From) + return From < Other.From; + return To < Other.To; +} + +void Edge::kill() { + From = 0; + To = 0; +} + +bool Edge::isDead() const { return From == 0 && To == 0; } + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort( + DenseMap, uint64_t> &Profile) { + DenseMap SecToNode; + DenseMap EdgeMap; + + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> NodeIndex { + auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size())); + if (Res.second) + Nodes.emplace_back(IS); + return Res.first->second; + }; + + // Create the graph. + for (const auto &C : Profile) { + const Symbol *FromSym = C.first.first; + const Symbol *ToSym = C.first.second; + uint64_t Weight = C.second; + + if (Weight == 0) + continue; + + // Get the input section for a given symbol. + auto *FromDR = dyn_cast_or_null(FromSym); + auto *ToDR = dyn_cast_or_null(ToSym); + if (!FromDR || !ToDR) + continue; + + auto *FromSB = dyn_cast_or_null(FromDR->Section); + auto *ToSB = dyn_cast_or_null(ToDR->Section); + if (!FromSB || !ToSB || FromSB->getSize() == 0 || ToSB->getSize() == 0) + continue; + + NodeIndex From = GetOrCreateNode(FromSB); + NodeIndex To = GetOrCreateNode(ToSB); + + Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, Weight); + + if (From == To) + continue; + + Edge E{From, To, Weight, WorkQueue.end()}; + + // Add or increment an edge + auto Res = EdgeMap.insert(std::make_pair(E, Edges.size())); + EdgeIndex EI = Res.first->second; + if (Res.second) { + Edges.push_back(E); + Nodes[From].IncidentEdges.push_back(EI); + Nodes[To].IncidentEdges.push_back(EI); + } else + Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, Weight); + } +} + +/// Remove edge \p CEI from the graph while simultaneously merging its two +/// incident vertices u and v. This merges any duplicate edges between u and v +/// by accumulating their weights. +void CallGraphSort::contractEdge(EdgeIndex CEI) { + // Make a copy of the edge as the original will be marked killed while being + // used. + Edge CE = Edges[CEI]; + assert(CE.From != CE.To && "Got self edge!"); + std::vector &FE = Nodes[CE.From].IncidentEdges; + + // Remove the self edge from From. + FE.erase(std::remove(FE.begin(), FE.end(), CEI)); + std::vector &TE = Nodes[CE.To].IncidentEdges; + + // Update all edges incident with To to reference From instead. Then if they + // aren't self edges add them to From. + for (EdgeIndex EI : TE) { + Edge &E = Edges[EI]; + if (E.From == CE.To) + E.From = CE.From; + if (E.To == CE.To) + E.To = CE.From; + if (E.To == E.From) { + E.kill(); + continue; + } + FE.push_back(EI); + } + + // Free memory. Otherwise we end up with N^2 memory usage. + std::vector().swap(TE); + + if (FE.empty()) + return; + + // Sort edges so they can be merged. The stability of this sort doesn't matter + // as equal edges will be merged in an order independent manner. + std::sort(FE.begin(), FE.end(), + [&](EdgeIndex AI, EdgeIndex BI) { return Edges[AI] < Edges[BI]; }); + + FE.erase(merge_unique(FE.begin(), FE.end(), + [&](EdgeIndex AI, EdgeIndex BI) { + return Edges[AI] == Edges[BI]; + }, + [&](EdgeIndex AI, EdgeIndex BI) { + Edge &A = Edges[AI]; + Edge &B = Edges[BI]; + WorkQueue.erase(A.PriorityPos); + WorkQueue.erase(B.PriorityPos); + A.Weight = SaturatingAdd(A.Weight, B.Weight); + B.kill(); + A.PriorityPos = WorkQueue.insert(AI).first; + B.PriorityPos = WorkQueue.end(); + }), + FE.end()); +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +void CallGraphSort::generateClusters() { + for (size_t I = 0; I < Edges.size(); ++I) { + Edges[I].PriorityPos = WorkQueue.insert(I).first; + } + + // Collapse the graph. + while (!WorkQueue.empty()) { + PriorityQueue::const_iterator I = --WorkQueue.end(); + EdgeIndex MaxI = *I; + const Edge MaxE = Edges[MaxI]; + WorkQueue.erase(I); + if (MaxE.isDead()) + continue; + // Merge the Nodes. + Node &From = Nodes[MaxE.From]; + Node &To = Nodes[MaxE.To]; + if (From.Size + To.Size > Target->PageSize) + continue; + contractEdge(MaxI); + From.Sections.insert(From.Sections.end(), To.Sections.begin(), + To.Sections.end()); + From.Size += To.Size; + From.Weight = SaturatingAdd(From.Weight, To.Weight); + To.Sections.clear(); + To.Size = 0; + To.Weight = 0; + } + + // Remove empty or dead nodes. + Nodes.erase(std::remove_if(Nodes.begin(), Nodes.end(), + [](const Node &N) { + return N.Size == 0 || N.Sections.empty(); + }), + Nodes.end()); + + // Sort by density. Invalidates all NodeIndexs. + std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) { + return (APFloat(APFloat::IEEEdouble(), A.Weight) / + APFloat(APFloat::IEEEdouble(), A.Size)) + .compare(APFloat(APFloat::IEEEdouble(), B.Weight) / + APFloat(APFloat::IEEEdouble(), B.Size)) == + APFloat::cmpLessThan; + }); +} + +DenseMap CallGraphSort::run() { + generateClusters(); + + // Generate order. + llvm::DenseMap OrderMap; + ssize_t CurOrder = 1; + + for (const Node &N : Nodes) + for (const InputSectionBase *IS : N.Sections) + OrderMap[IS] = CurOrder++; + + return OrderMap; +} + +// Sort sections by the profile data provided by -callgraph-profile-file +// +// This first builds a call graph based on the profile data then iteratively +// merges the hottest call edges as long as it would not create a cluster larger +// than the page size. All clusters are then sorted by a density metric to +// further improve locality. +DenseMap elf::computeCallGraphProfileOrder() { + return CallGraphSort(Config->CallGraphProfile).run(); +} Index: ELF/Config.h =================================================================== --- ELF/Config.h +++ ELF/Config.h @@ -10,6 +10,7 @@ #ifndef LLD_ELF_CONFIG_H #define LLD_ELF_CONFIG_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" @@ -24,6 +25,7 @@ namespace elf { class InputFile; +class Symbol; enum ELFKind { ELFNoneKind, @@ -103,6 +105,8 @@ std::vector VersionScriptGlobals; std::vector VersionScriptLocals; std::vector BuildIdVector; + llvm::DenseMap, uint64_t> + CallGraphProfile; bool AllowMultipleDefinition; bool AndroidPackDynRelocs = false; bool ARMHasBlx = false; @@ -111,6 +115,7 @@ bool AsNeeded = false; bool Bsymbolic; bool BsymbolicFunctions; + bool CallGraphProfileSort = true; bool CompressDebugSections; bool DefineCommon; bool Demangle = true; Index: ELF/Driver.cpp =================================================================== --- ELF/Driver.cpp +++ ELF/Driver.cpp @@ -596,6 +596,8 @@ Config->AuxiliaryList = args::getStrings(Args, OPT_auxiliary); Config->Bsymbolic = Args.hasArg(OPT_Bsymbolic); Config->BsymbolicFunctions = Args.hasArg(OPT_Bsymbolic_functions); + Config->CallGraphProfileSort = Args.hasFlag( + OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true); Config->Chroot = Args.getLastArgValue(OPT_chroot); Config->CompressDebugSections = getCompressDebugSections(Args); Config->DefineCommon = Args.hasFlag(OPT_define_common, OPT_no_define_common, Index: ELF/InputFiles.h =================================================================== --- ELF/InputFiles.h +++ ELF/InputFiles.h @@ -158,6 +158,7 @@ typedef typename ELFT::Sym Elf_Sym; typedef typename ELFT::Shdr Elf_Shdr; typedef typename ELFT::Word Elf_Word; + typedef typename ELFT::CGProfile Elf_CGProfile; StringRef getShtGroupSignature(ArrayRef Sections, const Elf_Shdr &Sec); @@ -203,6 +204,7 @@ initializeSections(llvm::DenseSet &ComdatGroups); void initializeSymbols(); void initializeDwarf(); + void parseCGProfile(); InputSectionBase *getRelocTarget(const Elf_Shdr &Sec); InputSectionBase *createInputSection(const Elf_Shdr &Sec); StringRef getSectionName(const Elf_Shdr &Sec); @@ -220,6 +222,8 @@ std::unique_ptr DwarfLine; llvm::DenseMap> VariableLoc; llvm::once_flag InitDwarfLine; + + ArrayRef CGProfile; }; // LazyObjFile is analogous to ArchiveFile in the sense that Index: ELF/InputFiles.cpp =================================================================== --- ELF/InputFiles.cpp +++ ELF/InputFiles.cpp @@ -222,6 +222,15 @@ return ""; } +template +void lld::elf::ObjFile::parseCGProfile() { + for (const Elf_CGProfile &CGPE : CGProfile) { + uint64_t &C = Config->CallGraphProfile[std::make_pair( + &getSymbol(CGPE.cgp_from), &getSymbol(CGPE.cgp_to))]; + C = std::max(C, (uint64_t)CGPE.cgp_weight); + } +} + // Returns "", "foo.a(bar.o)" or "baz.o". std::string lld::toString(const InputFile *F) { if (!F) @@ -286,6 +295,7 @@ // Read section and symbol tables. initializeSections(ComdatGroups); initializeSymbols(); + parseCGProfile(); } // Sections with SHT_GROUP and comdat bits define comdat section groups. @@ -626,6 +636,13 @@ if (Name == ".eh_frame" && !Config->Relocatable) return make(*this, Sec, Name); + // Profile data. + if (Name == ".note.llvm.cgprofile") { + CGProfile = check( + this->getObj().template getSectionContentsAsArray(&Sec)); + return &InputSection::Discarded; + } + if (shouldMerge(Sec)) return make(*this, Sec, Name); return make(*this, Sec, Name); Index: ELF/Options.td =================================================================== --- ELF/Options.td +++ ELF/Options.td @@ -51,6 +51,9 @@ def as_needed: F<"as-needed">, HelpText<"Only set DT_NEEDED for shared libraries if used">; +def call_graph_profile_sort: F<"call-graph-profile-sort">, + HelpText<"Sort sections by call graph profile information">; + // -chroot doesn't have a help text because it is an internal option. def chroot: S<"chroot">; @@ -175,6 +178,9 @@ def no_as_needed: F<"no-as-needed">, HelpText<"Always DT_NEEDED for shared libraries">; +def no_call_graph_profile_sort: F<"no-call-graph-profile-sort">, + HelpText<"Don't sort sections by call graph profile information">; + def no_color_diagnostics: F<"no-color-diagnostics">, HelpText<"Do not use colors in diagnostics">; Index: ELF/Writer.cpp =================================================================== --- ELF/Writer.cpp +++ ELF/Writer.cpp @@ -9,6 +9,7 @@ #include "Writer.h" #include "AArch64ErrataFix.h" +#include "CallGraphSort.h" #include "Config.h" #include "Filesystem.h" #include "LinkerScript.h" @@ -1020,6 +1021,15 @@ template void Writer::sortInputSections() { assert(!Script->HasSectionsCommand); + // Use the rarely used option -call-graph-ordering-file to sort sections. + if (Config->CallGraphProfileSort && !Config->CallGraphProfile.empty()) { + DenseMap OrderMap = + computeCallGraphProfileOrder(); + + if (OutputSection *Sec = findSection(".text")) + Sec->sort([&](InputSectionBase *S) { return OrderMap.lookup(S); }); + } + // Sort input sections by priority using the list provided // by --symbol-ordering-file. DenseMap Order = buildSectionOrder(); Index: test/ELF/cgprofile-object.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-object.s @@ -0,0 +1,50 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: ld.lld %t -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s +# RUN: ld.lld %t -o %t2 -no-call-graph-profile-sort +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT + + .section .text.hot._Z4fooav,"ax",@progbits + .globl _Z4fooav +_Z4fooav: + retq + + .section .text.hot._Z4foobv,"ax",@progbits + .globl _Z4foobv +_Z4foobv: + retq + + .section .text.hot._Z3foov,"ax",@progbits + .globl _Z3foov +_Z3foov: + retq + + .section .text.hot._start,"ax",@progbits + .globl _start +_start: + retq + + + .cg_profile _start, _Z3foov, 1 + .cg_profile _Z4fooav, _Z4foobv, 1 + .cg_profile _Z3foov, _Z4fooav, 1 + +# CHECK: Name: _Z3foov +# CHECK-NEXT: Value: 0x201001 +# CHECK: Name: _Z4fooav +# CHECK-NEXT: Value: 0x201002 +# CHECK: Name: _Z4foobv +# CHECK-NEXT: Value: 0x201003 +# CHECK: Name: _start +# CHECK-NEXT: Value: 0x201000 + +# NOSORT: Name: _Z3foov +# NOSORT-NEXT: Value: 0x201002 +# NOSORT: Name: _Z4fooav +# NOSORT-NEXT: Value: 0x201000 +# NOSORT: Name: _Z4foobv +# NOSORT-NEXT: Value: 0x201001 +# NOSORT: Name: _start +# NOSORT-NEXT: Value: 0x201003