Index: ELF/CMakeLists.txt =================================================================== --- ELF/CMakeLists.txt +++ ELF/CMakeLists.txt @@ -18,6 +18,7 @@ Arch/SPARCV9.cpp Arch/X86.cpp Arch/X86_64.cpp + CallGraphSort.cpp Driver.cpp DriverUtils.cpp EhFrame.cpp Index: ELF/CallGraphSort.h =================================================================== --- /dev/null +++ ELF/CallGraphSort.h @@ -0,0 +1,24 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_CALL_GRAPH_SORT_H +#define LLD_ELF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace elf { +class InputSectionBase; + +llvm::DenseMap +computeCallGraphProfileOrder(); +} +} + +#endif Index: ELF/CallGraphSort.cpp =================================================================== --- /dev/null +++ ELF/CallGraphSort.cpp @@ -0,0 +1,188 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// \file This file implements Call-Chain Clustering from: +/// Optimizing Function Placement for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// It does so given a call graph profile by the following: +/// * Build a call graph from the profile +/// * While there are unresolved edges +/// * Find the edge with the highest weight +/// * Check if merging the two clusters would create a cluster larger than the +/// target page size +/// * If not, contract that edge putting the callee after the caller +/// * Sort remaining clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "SymbolTable.h" +#include "Target.h" + +#include "llvm/Support/MathExtras.h" + +#include + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +using NodeIndex = std::ptrdiff_t; + +struct Node { + Node() = default; + Node(const InputSectionBase *IS) { + Sections.push_back(IS); + Size = IS->getSize(); + } + std::vector Sections; + int64_t Size = 0; + uint64_t Weight = 0; +}; + +struct Edge { + NodeIndex From; + NodeIndex To; + mutable uint64_t Weight; + bool operator==(const Edge Other) const { + return From == Other.From && To == Other.To; + } +}; + +struct EdgeHash { + std::size_t operator()(const Edge E) const { + return llvm::hash_combine(E.From, E.To); + }; +}; +} // end anonymous namespace + +static void insertOrIncrementEdge(std::unordered_set &Edges, + const Edge E) { + if (E.From == E.To) + return; + auto Res = Edges.insert(E); + if (!Res.second) + Res.first->Weight = SaturatingAdd(Res.first->Weight, E.Weight); +} + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// SymbolBodys, and generate a graph between InputSections with the provided +// weights. +static void buildCallGraph(std::vector &Nodes, + std::unordered_set &Edges) { + DenseMap SecToNode; + + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> NodeIndex { + auto Res = SecToNode.insert(std::make_pair(IS, Nodes.size())); + if (Res.second) + Nodes.emplace_back(IS); + return Res.first->second; + }; + + // Create the graph. + for (const auto &C : Config->CallGraphProfile) { + if (C.second == 0) + continue; + auto FromDR = dyn_cast_or_null(Symtab->find(C.first.first)); + auto ToDR = dyn_cast_or_null(Symtab->find(C.first.second)); + if (!FromDR || !ToDR) + continue; + auto FromSB = dyn_cast_or_null(FromDR->Section); + auto ToSB = dyn_cast_or_null(ToDR->Section); + if (!FromSB || !ToSB || FromSB->getSize() == 0 || ToSB->getSize() == 0) + continue; + NodeIndex From = GetOrCreateNode(FromSB); + NodeIndex To = GetOrCreateNode(ToSB); + insertOrIncrementEdge(Edges, {From, To, C.second}); + Nodes[To].Weight = SaturatingAdd(Nodes[To].Weight, C.second); + } +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +static void generateClusters(std::vector &Nodes, + std::unordered_set &Edges) { + // Collapse the graph. + while (!Edges.empty()) { + // Find the largest edge + // FIXME: non deterministic order for equal edges. + // FIXME: n^2 on Edges. + auto Max = std::max_element( + Edges.begin(), Edges.end(), + [](const Edge A, const Edge B) { return A.Weight < B.Weight; }); + const Edge MaxE = *Max; + Edges.erase(Max); + // Merge the Nodes. + Node &From = Nodes[MaxE.From]; + Node &To = Nodes[MaxE.To]; + if (From.Size + To.Size > Target->PageSize) + continue; + From.Sections.insert(From.Sections.end(), To.Sections.begin(), + To.Sections.end()); + To.Sections.clear(); + From.Size += To.Size; + From.Weight = SaturatingAdd(From.Weight, To.Weight); + // Collect all edges from or to the removed node and update them for the new + // node. + std::vector OldEdges; + // FIXME: n^2 on Edges. + for (auto EI = Edges.begin(), EE = Edges.end(); EI != EE;) { + if (EI->From == MaxE.To || EI->To == MaxE.To) { + OldEdges.push_back(*EI); + EI = Edges.erase(EI); + } else + ++EI; + } + for (const Edge E : OldEdges) { + insertOrIncrementEdge(Edges, + {E.From == MaxE.To ? MaxE.From : E.From, + E.To == MaxE.To ? MaxE.From : E.To, E.Weight}); + } + } + + // Sort by density. + std::sort(Nodes.begin(), Nodes.end(), [](const Node &A, const Node &B) { + return (APFloat(APFloat::IEEEdouble(), A.Weight) / + APFloat(APFloat::IEEEdouble(), A.Size)) + .compare(APFloat(APFloat::IEEEdouble(), B.Weight) / + APFloat(APFloat::IEEEdouble(), B.Size)) == + APFloat::cmpLessThan; + }); +} + +// Sort sections by the profile data provided by -callgraph-ordering-file +// +// This first builds a call graph based on the profile data then iteratively +// merges the hottest call edges as long as it would not create a cluster larger +// than the page size. All clusters are then sorted by a density metric to +// further improve locality. +llvm::DenseMap +elf::computeCallGraphProfileOrder() { + std::vector Nodes; + std::unordered_set Edges; + + buildCallGraph(Nodes, Edges); + generateClusters(Nodes, Edges); + + // Generate order. + llvm::DenseMap OrderMap; + ssize_t CurOrder = 1; + + for (const Node &N : Nodes) + for (const InputSectionBase *IS : N.Sections) + OrderMap[IS] = CurOrder++; + + return OrderMap; +} Index: ELF/Config.h =================================================================== --- ELF/Config.h +++ ELF/Config.h @@ -10,6 +10,7 @@ #ifndef LLD_ELF_CONFIG_H #define LLD_ELF_CONFIG_H +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSet.h" @@ -108,6 +109,8 @@ std::vector VersionScriptLocals; std::vector BuildIdVector; llvm::MapVector RenamedSymbols; + llvm::DenseMap, uint64_t> + CallGraphProfile; bool AllowMultipleDefinition; bool AsNeeded = false; bool Bsymbolic; Index: ELF/Driver.cpp =================================================================== --- ELF/Driver.cpp +++ ELF/Driver.cpp @@ -600,6 +600,39 @@ return Ret; } +// This reads a list of call edges with weights one line at a time from a file +// with the following format for each line: +// +// ^[.*]+ [.*]+ [.*]+$ +// +// It interprets the first value as an unsigned 64 bit weight, the second as +// the symbol the call is from, and the third as the symbol the call is to. +// +// Example: +// +// 5000 c a +// 4000 c b +// 18446744073709551615 e d +// +static void readCallGraphProfile(MemoryBufferRef MB) { + for (StringRef L : getLines(MB)) { + SmallVector Fields; + L.split(Fields, ' '); + if (Fields.size() != 3) { + error("parse error"); + return; + } + uint64_t Count; + if (!to_integer(Fields[0], Count)) { + error("parse error"); + return; + } + StringRef From = Fields[1]; + StringRef To = Fields[2]; + Config->CallGraphProfile[std::make_pair(From, To)] = Count; + } +} + static bool getCompressDebugSections(opt::InputArgList &Args) { StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none"); if (S == "none") @@ -727,6 +760,10 @@ if (Optional Buffer = readFile(Arg->getValue())) Config->SymbolOrderingFile = getLines(*Buffer); + if (auto *Arg = Args.getLastArg(OPT_callgraph_profile_file)) + if (Optional Buffer = readFile(Arg->getValue())) + readCallGraphProfile(*Buffer); + // If --retain-symbol-file is used, we'll keep only the symbols listed in // the file and discard all others. if (auto *Arg = Args.getLastArg(OPT_retain_symbols_file)) { Index: ELF/Options.td =================================================================== --- ELF/Options.td +++ ELF/Options.td @@ -52,6 +52,9 @@ def as_needed: F<"as-needed">, HelpText<"Only set DT_NEEDED for shared libraries if used">; +def callgraph_profile_file: S<"callgraph-profile-file">, + HelpText<"Layout sections to optimize the given callgraph">; + // -chroot doesn't have a help text because it is an internal option. def chroot: S<"chroot">; Index: ELF/Writer.cpp =================================================================== --- ELF/Writer.cpp +++ ELF/Writer.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "Writer.h" +#include "CallGraphSort.h" #include "Config.h" #include "Filesystem.h" #include "LinkerScript.h" @@ -911,6 +912,21 @@ Old.end()); Script->fabricateDefaultCommands(); + + // Use the rarely used option -callgraph-ordering-file to sort sections. + if (!Config->CallGraphProfile.empty()) { + llvm::DenseMap OrderMap = + computeCallGraphProfileOrder(); + + for (BaseCommand *Base : Script->Opt.Commands) { + auto *OS = dyn_cast(Base); + if (!OS || OS->Name != ".text") + continue; + OS->sort([&](InputSectionBase *IS) { return OrderMap.lookup(IS); }); + break; + } + } + sortBySymbolsOrder(); sortInitFini(findSection(".init_array")); sortInitFini(findSection(".fini_array")); Index: test/ELF/Inputs/cgprofile.txt =================================================================== --- /dev/null +++ test/ELF/Inputs/cgprofile.txt @@ -0,0 +1,7 @@ +5000 c a +4000 c b +0 d e +18446744073709551615 e d +18446744073709551611 f d +18446744073709551612 f e +6000 c h Index: test/ELF/cgprofile.s =================================================================== --- /dev/null +++ test/ELF/cgprofile.s @@ -0,0 +1,128 @@ +# REQUIRES: x86 +# +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t1 +# RUN: ld.lld %t1 -e a -o %t -callgraph-profile-file %p/Inputs/cgprofile.txt +# RUN: llvm-readobj -symbols %t | FileCheck %s + + .section .text.a,"ax",@progbits + .global a +a: + .zero 20 + + .section .text.b,"ax",@progbits + .global b +b: + .zero 1 + + .section .text.c,"ax",@progbits + .global c +c: + .zero 4095 + + .section .text.d,"ax",@progbits + .global d +d: + .zero 51 + + .section .text.e,"ax",@progbits + .global e +e: + .zero 42 + + .section .text.f,"ax",@progbits + .global f +f: + .zero 42 + + .section .text.g,"ax",@progbits + .global g +g: + .zero 34 + + .section .text.h,"ax",@progbits + .global h +h: + +# CHECK: Symbols [ +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: (0) +# CHECK-NEXT: Value: 0x0 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Local (0x0) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: Undefined (0x0) +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: a +# CHECK-NEXT: Value: 0x202022 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: b +# CHECK-NEXT: Value: 0x202021 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: c +# CHECK-NEXT: Value: 0x201022 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: d +# CHECK-NEXT: Value: 0x20208A +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: e +# CHECK-NEXT: Value: 0x202060 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: f +# CHECK-NEXT: Value: 0x202036 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: g +# CHECK-NEXT: Value: 0x201000 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT: Symbol { +# CHECK-NEXT: Name: h +# CHECK-NEXT: Value: 0x201022 +# CHECK-NEXT: Size: 0 +# CHECK-NEXT: Binding: Global (0x1) +# CHECK-NEXT: Type: None (0x0) +# CHECK-NEXT: Other: 0 +# CHECK-NEXT: Section: .text +# CHECK-NEXT: } +# CHECK-NEXT:]