Index: ELF/CMakeLists.txt =================================================================== --- ELF/CMakeLists.txt +++ ELF/CMakeLists.txt @@ -18,6 +18,7 @@ Arch/SPARCV9.cpp Arch/X86.cpp Arch/X86_64.cpp + CallGraphSort.cpp Driver.cpp DriverUtils.cpp EhFrame.cpp Index: ELF/CallGraphSort.h =================================================================== --- /dev/null +++ ELF/CallGraphSort.h @@ -0,0 +1,24 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_CALL_GRAPH_SORT_H +#define LLD_ELF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace elf { +class InputSectionBase; + +llvm::DenseMap +computeCallGraphProfileOrder(); +} +} + +#endif Index: ELF/CallGraphSort.cpp =================================================================== --- /dev/null +++ ELF/CallGraphSort.cpp @@ -0,0 +1,274 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file This file implements Call-Chain Clustering from: +/// Optimizing Function Placement for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// It does so given a call graph profile by the following: +/// * Build a call graph from the profile +/// * While there are unresolved edges +/// * Find the edge with the highest weight +/// * Check if merging the two clusters would create a cluster larger than the +/// target page size +/// * If not, contract that edge putting the callee after the caller +/// * Sort remaining clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "SymbolTable.h" +#include "Target.h" + +#include "llvm/Support/MathExtras.h" + +#include +#include + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +class CallGraphSort { + using NodeIndex = std::ptrdiff_t; + using EdgeIndex = std::ptrdiff_t; + + struct Node { + Node() = default; + Node(const InputSectionBase *IS); + std::vector Sections; + std::vector IncidentEdges; + int64_t Size = 0; + uint64_t Weight = 0; + }; + + struct Edge { + NodeIndex From; + NodeIndex To; + mutable uint64_t Weight; + bool operator==(const Edge Other) const; + bool operator<(const Edge Other) const; + void kill(); + bool isDead() const; + }; + + std::vector Nodes; + std::vector Edges; + struct EdgePriorityCmp { + std::vector &Edges; + bool operator()(EdgeIndex A, EdgeIndex B) const { + return Edges[A].Weight < Edges[B].Weight; + } + }; + std::priority_queue, EdgePriorityCmp> + WorkQueue{EdgePriorityCmp{Edges}}; + + void contractEdge(EdgeIndex CEI); + void generateClusters(); + +public: + CallGraphSort(DenseMap, uint64_t> &Profile); + + DenseMap run(); +}; +} // end anonymous namespace + +CallGraphSort::Node::Node(const InputSectionBase *IS) { + Sections.push_back(IS); + Size = IS->getSize(); +} + +bool CallGraphSort::Edge::operator==(const Edge Other) const { + return From == Other.From && To == Other.To; +} + +bool CallGraphSort::Edge::operator<(const Edge Other) const { + if (From != Other.From) + return From < Other.From; + if (To != Other.To) + return To < Other.To; + return false; +} + +void CallGraphSort::Edge::kill() { + From = 0; + To = 0; +} + +bool CallGraphSort::Edge::isDead() const { return From == 0 && To == 0; } + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// SymbolBodys, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort( + DenseMap, uint64_t> &Profile) { + DenseMap SecToNode; + std::map EdgeMap; + + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> NodeIndex { + auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size())); + if (Res.second) + Nodes.emplace_back(IS); + return Res.first->second; + }; + + // Create the graph. + for (const auto &C : Profile) { + if (C.second == 0) + continue; + auto FromDR = dyn_cast_or_null(Symtab->find(C.first.first)); + auto ToDR = dyn_cast_or_null(Symtab->find(C.first.second)); + if (!FromDR || !ToDR) + continue; + auto FromSB = dyn_cast_or_null(FromDR->Section); + auto ToSB = dyn_cast_or_null(ToDR->Section); + if (!FromSB || !ToSB || FromSB->getSize() == 0 || ToSB->getSize() == 0) + continue; + NodeIndex From = GetOrCreateNode(FromSB); + NodeIndex To = GetOrCreateNode(ToSB); + Edge E{From, To, C.second}; + auto Res = EdgeMap.insert(std::make_pair(E, Edges.size())); + EdgeIndex EI = Res.first->second; + if (Res.second) { + Edges.push_back(E); + Nodes[From].IncidentEdges.push_back(EI); + Nodes[To].IncidentEdges.push_back(EI); + } else + Edges[EI].Weight = SaturatingAdd(Edges[EI].Weight, C.second); + Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, C.second); + } +} + +void CallGraphSort::contractEdge(EdgeIndex CEI) { + // Make a copy of the edge as the original will be marked killed while being + // used. + Edge CE = Edges[CEI]; + std::vector &FE = Nodes[CE.From].IncidentEdges; + // Remove the self edge from From. + FE.erase(std::remove(FE.begin(), FE.end(), CEI)); + std::vector &TE = Nodes[CE.To].IncidentEdges; + // Update all edges incident with To to reference From instead. Then if they + // aren't self edges add them to From. + for (EdgeIndex EI : TE) { + Edge &E = Edges[EI]; + // E.From = E.From == CE.To ? CE.From : E.From; + // E.To = E.To == CE.To ? CE.From : E.To; + if (E.From == CE.To) + E.From = CE.From; + if (E.To == CE.To) + E.To = CE.From; + if (E.To == E.From) { + E.kill(); + continue; + } + FE.push_back(EI); + } + + // Free memory. + std::vector().swap(TE); + + if (FE.empty()) + return; + + // Sort edges so they can be merged. The stability of this sort doesn't matter + // as equal edges will be merged in an order independent manner. + std::sort(FE.begin(), FE.end(), + [&](EdgeIndex AI, EdgeIndex BI) { return Edges[AI] < Edges[BI]; }); + + // std::unique, but also merge equal values. + auto First = FE.begin(); + auto Last = FE.end(); + auto Result = First; + while (++First != Last) { + if (Edges[*Result] == Edges[*First]) { + Edges[*Result].Weight = + SaturatingAdd(Edges[*Result].Weight, Edges[*First].Weight); + Edges[*First].kill(); + // Add the updated edge to the work queue without removing the previous + // entry. Edges will never be contracted twice as they are marked as dead. + WorkQueue.push(*Result); + } else if (++Result != First) + *Result = *First; + } + FE.erase(++Result, FE.end()); +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +void CallGraphSort::generateClusters() { + for (size_t I = 0; I < Edges.size(); ++I) + WorkQueue.push(I); + // Collapse the graph. + while (!WorkQueue.empty()) { + EdgeIndex MaxI = WorkQueue.top(); + const Edge MaxE = Edges[MaxI]; + WorkQueue.pop(); + if (MaxE.isDead()) + continue; + // Merge the Nodes. + Node &From = Nodes[MaxE.From]; + Node &To = Nodes[MaxE.To]; + if (From.Size + To.Size > Target->PageSize) + continue; + contractEdge(MaxI); + From.Sections.insert(From.Sections.end(), To.Sections.begin(), + To.Sections.end()); + From.Size += To.Size; + From.Weight = SaturatingAdd(From.Weight, To.Weight); + To.Sections.clear(); + To.Size = 0; + To.Weight = 0; + } + + // Remove empty or dead nodes. + Nodes.erase(std::remove_if(Nodes.begin(), Nodes.end(), + [](const Node &N) { + return N.Size == 0 || N.Sections.empty(); + }), + Nodes.end()); + + // Sort by density. Invalidates all NodeIndexs. + std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) { + return (APFloat(APFloat::IEEEdouble(), A.Weight) / + APFloat(APFloat::IEEEdouble(), A.Size)) + .compare(APFloat(APFloat::IEEEdouble(), B.Weight) / + APFloat(APFloat::IEEEdouble(), B.Size)) == + APFloat::cmpLessThan; + }); +} + +DenseMap CallGraphSort::run() { + generateClusters(); + + // Generate order. + llvm::DenseMap OrderMap; + ssize_t CurOrder = 1; + + for (const Node &N : Nodes) + for (const InputSectionBase *IS : N.Sections) + OrderMap[IS] = CurOrder++; + + return OrderMap; +} + +// Sort sections by the profile data provided by -callgraph-profile-file +// +// This first builds a call graph based on the profile data then iteratively +// merges the hottest call edges as long as it would not create a cluster larger +// than the page size. All clusters are then sorted by a density metric to +// further improve locality. +DenseMap elf::computeCallGraphProfileOrder() { + CallGraphSort CGS(Config->CallGraphProfile); + return CGS.run(); +} Index: ELF/Config.h =================================================================== --- ELF/Config.h +++ ELF/Config.h @@ -10,6 +10,7 @@ #ifndef LLD_ELF_CONFIG_H #define LLD_ELF_CONFIG_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" @@ -108,6 +109,8 @@ std::vector VersionScriptLocals; std::vector BuildIdVector; llvm::MapVector RenamedSymbols; + llvm::DenseMap, uint64_t> + CallGraphProfile; bool AllowMultipleDefinition; bool AsNeeded = false; bool Bsymbolic; Index: ELF/Driver.cpp =================================================================== --- ELF/Driver.cpp +++ ELF/Driver.cpp @@ -600,6 +600,39 @@ return Ret; } +// This reads a list of call edges with weights one line at a time from a file +// with the following format for each line: +// +// ^[.*]+ [.*]+ [.*]+$ +// +// It interprets the first value as an unsigned 64 bit weight, the second as +// the symbol the call is from, and the third as the symbol the call is to. +// +// Example: +// +// 5000 c a +// 4000 c b +// 18446744073709551615 e d +// +static void readCallGraphProfile(MemoryBufferRef MB) { + for (StringRef L : getLines(MB)) { + SmallVector Fields; + L.split(Fields, ' '); + if (Fields.size() != 3) { + error("parse error: " + MB.getBufferIdentifier() + ": " + L); + return; + } + uint64_t Count; + if (!to_integer(Fields[0], Count)) { + error("parse error: " + MB.getBufferIdentifier() + ": " + L); + return; + } + StringRef From = Fields[1]; + StringRef To = Fields[2]; + Config->CallGraphProfile[std::make_pair(From, To)] = Count; + } +} + static bool getCompressDebugSections(opt::InputArgList &Args) { StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none"); if (S == "none") @@ -755,6 +788,10 @@ if (Optional Buffer = readFile(Arg->getValue())) Config->SymbolOrderingFile = getLines(*Buffer); + if (auto *Arg = Args.getLastArg(OPT_call_graph_profile_file)) + if (Optional Buffer = readFile(Arg->getValue())) + readCallGraphProfile(*Buffer); + // If --retain-symbol-file is used, we'll keep only the symbols listed in // the file and discard all others. if (auto *Arg = Args.getLastArg(OPT_retain_symbols_file)) { Index: ELF/Options.td =================================================================== --- ELF/Options.td +++ ELF/Options.td @@ -52,6 +52,9 @@ def as_needed: F<"as-needed">, HelpText<"Only set DT_NEEDED for shared libraries if used">; +def call_graph_profile_file: S<"call-graph-profile-file">, + HelpText<"Layout sections to optimize the given callgraph">; + // -chroot doesn't have a help text because it is an internal option. def chroot: S<"chroot">; Index: ELF/Writer.cpp =================================================================== --- ELF/Writer.cpp +++ ELF/Writer.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "Writer.h" +#include "CallGraphSort.h" #include "Config.h" #include "Filesystem.h" #include "LinkerScript.h" @@ -920,6 +921,21 @@ Old.end()); Script->fabricateDefaultCommands(); + + // Use the rarely used option -call-graph-ordering-file to sort sections. + if (!Config->CallGraphProfile.empty()) { + llvm::DenseMap OrderMap = + computeCallGraphProfileOrder(); + + for (BaseCommand *Base : Script->Opt.Commands) { + auto *OS = dyn_cast(Base); + if (!OS || OS->Name != ".text") + continue; + OS->sort([&](InputSectionBase *IS) { return OrderMap.lookup(IS); }); + break; + } + } + sortBySymbolsOrder(); sortInitFini(findSection(".init_array")); sortInitFini(findSection(".fini_array")); Index: test/ELF/Inputs/cgprofile.txt =================================================================== --- /dev/null +++ test/ELF/Inputs/cgprofile.txt @@ -0,0 +1,7 @@ +5000 c a +4000 c b +0 d e +18446744073709551615 e d +18446744073709551611 f d +18446744073709551612 f e +6000 c h Index: test/ELF/cgprofile.s =================================================================== --- /dev/null +++ test/ELF/cgprofile.s @@ -0,0 +1,128 @@ +# REQUIRES: x86 +# +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1 +# RUN: ld.lld %t1 -e a -o %t -call-graph-profile-file %p/Inputs/cgprofile.txt +# RUN: llvm-readobj -symbols %t | FileCheck %s + + .section .text.a,"ax",@progbits + .global a +a: + .zero 20 + + .section .text.b,"ax",@progbits + .global b +b: + .zero 1 + + .section .text.c,"ax",@progbits + .global c +c: + .zero 4095 + + .section .text.d,"ax",@progbits + .global d +d: + .zero 51 + + .section .text.e,"ax",@progbits + .global e +e: + .zero 42 + + .section .text.f,"ax",@progbits + .global f +f: + .zero 42 + + .section .text.g,"ax",@progbits + .global g +g: + .zero 34 + + .section .text.h,"ax",@progbits + .global h +h: + +# CHECK: Symbols [ +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: (0) +# CHECK-NEXT: Value: 0x0 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Local (0x0) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: Undefined (0x0) +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: a +# CHECK-NEXT: Value: 0x202022 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: b +# CHECK-NEXT: Value: 0x202021 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: c +# CHECK-NEXT: Value: 0x201022 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: d +# CHECK-NEXT: Value: 0x20208A +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: e +# CHECK-NEXT: Value: 0x202060 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: f +# CHECK-NEXT: Value: 0x202036 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: g +# CHECK-NEXT: Value: 0x201000 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: h +# CHECK-NEXT: Value: 0x201022 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT:]