Index: ELF/CMakeLists.txt =================================================================== --- ELF/CMakeLists.txt +++ ELF/CMakeLists.txt @@ -19,6 +19,7 @@ Arch/SPARCV9.cpp Arch/X86.cpp Arch/X86_64.cpp + CallGraphSort.cpp Driver.cpp DriverUtils.cpp EhFrame.cpp Index: ELF/CallGraphSort.h =================================================================== --- /dev/null +++ ELF/CallGraphSort.h @@ -0,0 +1,23 @@ +//===- CallGraphSort.h ------------------------------------------*- C++ -*-===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLD_ELF_CALL_GRAPH_SORT_H +#define LLD_ELF_CALL_GRAPH_SORT_H + +#include "llvm/ADT/DenseMap.h" + +namespace lld { +namespace elf { +class InputSectionBase; + +llvm::DenseMap computeCallGraphProfileOrder(); +} // namespace elf +} // namespace lld + +#endif Index: ELF/CallGraphSort.cpp =================================================================== --- /dev/null +++ ELF/CallGraphSort.cpp @@ -0,0 +1,252 @@ +//===- CallGraphSort.cpp --------------------------------------------------===// +// +// The LLVM Linker +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +/// +/// Implementation of Call-Chain Clustering from: Optimizing Function Placement +/// for Large-Scale Data-Center Applications +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// +/// The goal of this algorithm is to improve runtime performance of the final +/// executable by arranging code sections such that page table and i-cache +/// misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are layed out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incomming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the ammount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density +/// +//===----------------------------------------------------------------------===// + +#include "CallGraphSort.h" +#include "OutputSections.h" +#include "SymbolTable.h" +#include "Symbols.h" + +using namespace llvm; +using namespace lld; +using namespace lld::elf; + +namespace { +struct Edge { + int From; + uint64_t Weight; +}; + +struct Cluster { + Cluster(int Sec, size_t S) { + Sections.push_back(Sec); + Size = S; + } + + double getDensity() const { + if (Size == 0) + return 0; + return double(Weight) / double(Size); + } + + std::vector Sections; + size_t Size = 0; + uint64_t Weight = 0; + uint64_t InitialWeight = 0; + std::vector Preds; +}; + +class CallGraphSort { +public: + CallGraphSort(); + + DenseMap run(); + +private: + std::vector Clusters; + std::vector Sections; + + void groupClusters(); +}; + +// Maximum ammount the combined cluster density can be worse than the original +// cluster to consider merging. +constexpr int MAX_DENSITY_DEGRADATION = 8; + +// Maximum cluster size in bytes. +constexpr uint64_t MAX_CLUSTER_SIZE = 1024 * 1024; +} // end anonymous namespace + +// Take the edge list in Config->CallGraphProfile, resolve symbol names to +// Symbols, and generate a graph between InputSections with the provided +// weights. +CallGraphSort::CallGraphSort() { + llvm::MapVector, + uint64_t> &Profile = Config->CallGraphProfile; + DenseMap SecToCluster; + + auto GetOrCreateNode = [&](const InputSectionBase *IS) -> int { + auto Res = SecToCluster.insert(std::make_pair(IS, Clusters.size())); + if (Res.second) { + Sections.push_back(IS); + Clusters.emplace_back(Clusters.size(), IS->getSize()); + } + return Res.first->second; + }; + + // Create the graph. + for (const auto &C : Profile) { + const auto *FromSB = cast(C.first.first->Repl); + const auto *ToSB = cast(C.first.second->Repl); + uint64_t Weight = C.second; + + // Ignore edges between input sections belonging to different output + // sections. This is done because otherwise we would end up with clusters + // containing input sections that can't actually be placed adjacently in the + // output. This messes with the cluster size and density calculations. We + // would also end up moving input sections in other output sections without + // moving them closer to what calls them. + if (FromSB->getOutputSection() != ToSB->getOutputSection()) + continue; + + int From = GetOrCreateNode(FromSB); + int To = GetOrCreateNode(ToSB); + + Clusters[To].Weight += Weight; + + if (From == To) + continue; + + // Add an edge + Clusters[To].Preds.push_back({From, Weight}); + } + for (Cluster &C : Clusters) + C.InitialWeight = C.Weight; +} + +// It's bad to merge clusters which would degrade the density too much. +static bool isNewDensityBad(Cluster &A, Cluster &B) { + double NewDensity = double(A.Weight + B.Weight) / double(A.Size + B.Size); + if (NewDensity < A.getDensity() / MAX_DENSITY_DEGRADATION) + return true; + return false; +} + +static void mergeClusters(Cluster &Into, Cluster &From) { + Into.Sections.insert(Into.Sections.end(), From.Sections.begin(), + From.Sections.end()); + Into.Size += From.Size; + Into.Weight += From.Weight; + From.Sections.clear(); + From.Size = 0; + From.Weight = 0; +} + +// Group InputSections into clusters using the Call-Chain Clustering heuristic +// then sort the clusters by density. +void CallGraphSort::groupClusters() { + std::vector SortedSecs(Clusters.size()); + std::vector SecToCluster(Clusters.size()); + + for (int SI = 0, SE = Clusters.size(); SI != SE; ++SI) { + SortedSecs[SI] = SI; + SecToCluster[SI] = &Clusters[SI]; + } + + std::stable_sort(SortedSecs.begin(), SortedSecs.end(), [&](int A, int B) { + return Clusters[B].getDensity() < Clusters[A].getDensity(); + }); + + for (int SI : SortedSecs) { + Cluster &C = *SecToCluster[SI]; + + int BestPred = -1; + uint64_t BestWeight = 0; + + for (Edge &E : Clusters[SI].Preds) { + if (BestPred == -1 || E.Weight > BestWeight) { + BestPred = E.From; + BestWeight = E.Weight; + } + } + + if (BestPred == -1) + continue; + + // don't consider merging if the edge is unlikely. + if (BestWeight * 10 <= C.InitialWeight) + continue; + + Cluster *PredC = SecToCluster[BestPred]; + if (PredC == nullptr || PredC == &C) + continue; + + if (C.Size + PredC->Size > MAX_CLUSTER_SIZE) + continue; + + if (isNewDensityBad(*PredC, C)) + continue; + + // NOTE: Consider using a disjoint-set to track section -> cluster mapping + // if this is ever slow. + for (int SI : C.Sections) + SecToCluster[SI] = PredC; + + mergeClusters(*PredC, C); + } + + // Remove empty or dead nodes. + Clusters.erase(std::remove_if(Clusters.begin(), Clusters.end(), + [](const Cluster &C) { + return C.Size == 0 || C.Sections.empty(); + }), + Clusters.end()); + + // Sort by density. Invalidates all NodeIndexs. + std::sort(Clusters.begin(), Clusters.end(), + [](const Cluster &A, const Cluster &B) { + return A.getDensity() > B.getDensity(); + }); +} + +DenseMap CallGraphSort::run() { + groupClusters(); + + // Generate order. + llvm::DenseMap OrderMap; + ssize_t CurOrder = 1; + + for (const Cluster &C : Clusters) + for (int SecIndex : C.Sections) + OrderMap[Sections[SecIndex]] = CurOrder++; + + return OrderMap; +} + +// Sort sections by the profile data provided by -callgraph-profile-file +// +// This first builds a call graph based on the profile data then merges sections +// according to the C³ huristic. All clusters are then sorted by a density +// metric to further improve locality. +DenseMap elf::computeCallGraphProfileOrder() { + return CallGraphSort().run(); +} Index: ELF/Config.h =================================================================== --- ELF/Config.h +++ ELF/Config.h @@ -24,6 +24,7 @@ namespace elf { class InputFile; +class InputSectionBase; enum ELFKind { ELFNoneKind, @@ -104,6 +105,9 @@ std::vector VersionScriptGlobals; std::vector VersionScriptLocals; std::vector BuildIdVector; + llvm::MapVector, + uint64_t> + CallGraphProfile; bool AllowMultipleDefinition; bool AndroidPackDynRelocs = false; bool ARMHasBlx = false; Index: ELF/Driver.cpp =================================================================== --- ELF/Driver.cpp +++ ELF/Driver.cpp @@ -572,6 +572,45 @@ return {BuildIdKind::None, {}}; } +static void readCallGraph(MemoryBufferRef MB) { + // Build a map from symbol name to section + DenseMap SymbolNameToSymbol; + for (InputFile *File : ObjectFiles) + for (Symbol *Sym : File->getSymbols()) + SymbolNameToSymbol[Sym->getName()] = Sym; + + for (StringRef L : args::getLines(MB)) { + SmallVector Fields; + L.split(Fields, ' '); + if (Fields.size() != 3) + fatal("parse error"); + uint64_t Count; + if (!to_integer(Fields[2], Count)) + fatal("parse error"); + const Symbol *FromSym = SymbolNameToSymbol.lookup(Fields[0]); + const Symbol *ToSym = SymbolNameToSymbol.lookup(Fields[1]); + if (Config->WarnSymbolOrdering) { + if (!FromSym) + warn("call graph file: no such symbol: " + Fields[0]); + if (!ToSym) + warn("call graph file: no such symbol: " + Fields[1]); + } + if (!FromSym || !ToSym || Count == 0) + continue; + warnUnorderableSymbol(FromSym); + warnUnorderableSymbol(ToSym); + const Defined *FromSymD = dyn_cast(FromSym); + const Defined *ToSymD = dyn_cast(ToSym); + if (!FromSymD || !ToSymD) + continue; + const auto *FromSB = dyn_cast_or_null(FromSymD->Section); + const auto *ToSB = dyn_cast_or_null(ToSymD->Section); + if (!FromSB || !ToSB) + continue; + Config->CallGraphProfile[std::make_pair(FromSB, ToSB)] += Count; + } +} + static bool getCompressDebugSections(opt::InputArgList &Args) { StringRef S = Args.getLastArgValue(OPT_compress_debug_sections, "none"); if (S == "none") @@ -1237,6 +1276,11 @@ if (Config->ICF) doIcf(); + // Read the callgraph now that we know what was gced or icfed + if (auto *Arg = Args.getLastArg(OPT_call_graph_ordering_file)) + if (Optional Buffer = readFile(Arg->getValue())) + readCallGraph(*Buffer); + // Write the result to the file. writeResult(); } Index: ELF/Options.td =================================================================== --- ELF/Options.td +++ ELF/Options.td @@ -66,6 +66,9 @@ "Only set DT_NEEDED for shared libraries if used", "Always set DT_NEEDED for shared libraries">; +defm call_graph_ordering_file: Eq<"call-graph-ordering-file">, + HelpText<"Layout sections to optimize the given callgraph">; + // -chroot doesn't have a help text because it is an internal option. defm chroot: Eq<"chroot">; Index: ELF/Symbols.h =================================================================== --- ELF/Symbols.h +++ ELF/Symbols.h @@ -356,6 +356,8 @@ if (S->Traced) printTraceSymbol(S); } + +void warnUnorderableSymbol(const Symbol *Sym); } // namespace elf std::string toString(const elf::Symbol &B); Index: ELF/Symbols.cpp =================================================================== --- ELF/Symbols.cpp +++ ELF/Symbols.cpp @@ -248,6 +248,27 @@ message(toString(Sym->File) + S + Sym->getName()); } +void elf::warnUnorderableSymbol(const Symbol *Sym) { + if (!Config->WarnSymbolOrdering) + return; + const InputFile *File = Sym->File; + auto *D = dyn_cast(Sym); + if (Sym->isUndefined()) + warn(toString(File) + + ": unable to order undefined symbol: " + Sym->getName()); + else if (Sym->isShared()) + warn(toString(File) + ": unable to order shared symbol: " + Sym->getName()); + else if (D && !D->Section) + warn(toString(File) + + ": unable to order absolute symbol: " + Sym->getName()); + else if (D && isa(D->Section)) + warn(toString(File) + + ": unable to order synthetic symbol: " + Sym->getName()); + else if (D && !D->Section->Repl->Live) + warn(toString(File) + + ": unable to order discarded symbol: " + Sym->getName()); +} + // Returns a symbol for an error message. std::string lld::toString(const Symbol &B) { if (Config->Demangle) Index: ELF/Writer.cpp =================================================================== --- ELF/Writer.cpp +++ ELF/Writer.cpp @@ -9,6 +9,7 @@ #include "Writer.h" #include "AArch64ErrataFix.h" +#include "CallGraphSort.h" #include "Config.h" #include "Filesystem.h" #include "LinkerScript.h" @@ -1029,6 +1030,10 @@ // Builds section order for handling --symbol-ordering-file. static DenseMap buildSectionOrder() { DenseMap SectionOrder; + // Use the rarely used option -call-graph-ordering-file to sort sections. + if (!Config->CallGraphProfile.empty()) + return computeCallGraphProfileOrder(); + if (Config->SymbolOrderingFile.empty()) return SectionOrder; @@ -1053,25 +1058,7 @@ SymbolOrderEntry &Ent = It->second; Ent.Present = true; - if (Config->WarnSymbolOrdering) { - auto *D = dyn_cast(&Sym); - InputFile *File = Sym.File; - if (Sym.isUndefined()) - warn(toString(File) + - ": unable to order undefined symbol: " + Sym.getName()); - else if (Sym.isShared()) - warn(toString(File) + - ": unable to order shared symbol: " + Sym.getName()); - else if (D && !D->Section) - warn(toString(File) + - ": unable to order absolute symbol: " + Sym.getName()); - else if (D && isa(D->Section)) - warn(toString(File) + - ": unable to order synthetic symbol: " + Sym.getName()); - else if (D && !D->Section->Repl->Live) - warn(toString(File) + - ": unable to order discarded symbol: " + Sym.getName()); - } + warnUnorderableSymbol(&Sym); if (auto *D = dyn_cast(&Sym)) { if (auto *Sec = dyn_cast_or_null(D->Section)) { Index: test/ELF/cgprofile-icf.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-icf.s @@ -0,0 +1,53 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "C D 61" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t.out -icf=all +# RUN: llvm-readobj -symbols %t.out | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2.out +# RUN: llvm-readobj -symbols %t2.out | FileCheck %s --check-prefix=NOICF + + .section .text.D,"ax",@progbits + .globl D +D: + mov $60, %rax + retq + + .section .text.C,"ax",@progbits + .globl C +C: + mov $60, %rax + retq + + .section .text.B,"ax",@progbits + .globl B +B: + mov $2, %rax + retq + + .section .text.A,"ax",@progbits + .globl A +A: + mov $42, %rax + retq + +# CHECK: Name: A +# CHECK-NEXT: Value: 0x201000 +# CHECK: Name: B +# CHECK-NEXT: Value: 0x201010 +# CHECK: Name: C +# CHECK-NEXT: Value: 0x201008 +# CHECK: Name: D +# CHECK-NEXT: Value: 0x201008 + +# NOICF: Name: A +# NOICF-NEXT: Value: 0x201000 +# NOICF: Name: B +# NOICF-NEXT: Value: 0x201008 +# NOICF: Name: C +# NOICF-NEXT: Value: 0x201010 +# NOICF: Name: D +# NOICF-NEXT: Value: 0x201018 Index: test/ELF/cgprofile-txt.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-txt.s @@ -0,0 +1,185 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t +# RUN: ld.lld -e A %t -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s --check-prefix=NOSORT + +# RUN: echo "A B 10" > %t.call_graph +# RUN: echo "A B 10" >> %t.call_graph +# RUN: echo "Aa B 80" >> %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "B C 30" >> %t.call_graph +# RUN: echo "C D 90" >> %t.call_graph +# RUN: echo "PP TS 100" >> %t.call_graph +# RUN: echo "_init2 _init 24567837" >> %t.call_graph +# RUN: echo "TS QC 9001" >> %t.call_graph +# RUN: echo "TooManyPreds0 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds1 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds2 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds3 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds4 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds5 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds6 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds7 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph +# RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: llvm-readobj -symbols %t2 | FileCheck %s + + .section .text.D,"ax",@progbits +D: + retq + + .section .text.C,"ax",@progbits + .globl C +C: + retq + + .section .text.B,"ax",@progbits + .globl B +B: + retq + + .section .text.A,"ax",@progbits + .globl A +A: +Aa: + retq + + .section .ponies,"ax",@progbits,unique,1 + .globl TS +TS: + retq + + .section .ponies,"ax",@progbits,unique,2 + .globl PP +PP: + retq + + .section .other,"ax",@progbits,unique,1 + .globl QC +QC: + retq + + .section .other,"ax",@progbits,unique,2 + .globl GB +GB: + retq + + .section .init,"ax",@progbits,unique,1 + .globl _init +_init: + retq + + .section .init,"ax",@progbits,unique,2 + .globl _init2 +_init2: + retq + + .section .text.TooManyPreds,"ax",@progbits +TooManyPreds: + retq + retq + retq + retq + retq + retq + retq + retq + retq + retq + + .section .text.TooManyPreds0,"ax",@progbits +TooManyPreds0: + retq + + .section .text.TooManyPreds1,"ax",@progbits +TooManyPreds1: + retq + + .section .text.TooManyPreds2,"ax",@progbits +TooManyPreds2: + retq + + .section .text.TooManyPreds3,"ax",@progbits +TooManyPreds3: + retq + + .section .text.TooManyPreds4,"ax",@progbits +TooManyPreds4: + retq + + .section .text.TooManyPreds5,"ax",@progbits +TooManyPreds5: + retq + + .section .text.TooManyPreds6,"ax",@progbits +TooManyPreds6: + retq + + .section .text.TooManyPreds7,"ax",@progbits +TooManyPreds7: + retq + + .section .text.TooManyPreds8,"ax",@progbits +TooManyPreds8: + retq + + .section .text.TooManyPreds9,"ax",@progbits +TooManyPreds9: + retq + + .section .text.TooManyPreds10,"ax",@progbits +TooManyPreds10: + retq + +# CHECK: Name: D +# CHECK-NEXT: Value: 0x201003 +# CHECK: Name: TooManyPreds +# CHECK-NEXT: Value: 0x201004 +# CHECK: Name: TooManyPreds10 +# CHECK-NEXT: Value: 0x201018 +# CHECK: Name: A +# CHECK-NEXT: Value: 0x201000 +# CHECK: Name: B +# CHECK-NEXT: Value: 0x201001 +# CHECK: Name: C +# CHECK-NEXT: Value: 0x201002 +# CHECK: Name: GB +# CHECK-NEXT: Value: 0x20101F +# CHECK: Name: PP +# CHECK-NEXT: Value: 0x20101C +# CHECK: Name: QC +# CHECK-NEXT: Value: 0x20101E +# CHECK: Name: TS +# CHECK-NEXT: Value: 0x20101D +# CHECK: Name: _init +# CHECK-NEXT: Value: 0x201020 +# CHECK: Name: _init2 +# CHECK-NEXT: Value: 0x201021 + +# NOSORT: Name: D +# NOSORT-NEXT: Value: 0x201000 +# NOSORT: Name: TooManyPreds +# NOSORT-NEXT: Value: 0x201004 +# NOSORT: Name: TooManyPreds10 +# NOSORT-NEXT: Value: 0x201018 +# NOSORT: Name: A +# NOSORT-NEXT: Value: 0x201003 +# NOSORT: Name: B +# NOSORT-NEXT: Value: 0x201002 +# NOSORT: Name: C +# NOSORT-NEXT: Value: 0x201001 +# NOSORT: Name: GB +# NOSORT-NEXT: Value: 0x20101C +# NOSORT: Name: PP +# NOSORT-NEXT: Value: 0x20101A +# NOSORT: Name: QC +# NOSORT-NEXT: Value: 0x20101B +# NOSORT: Name: TS +# NOSORT-NEXT: Value: 0x201019 +# NOSORT: Name: _init +# NOSORT-NEXT: Value: 0x20101D +# NOSORT: Name: _init2 +# NOSORT-NEXT: Value: 0x20101E Index: test/ELF/cgprofile-warn.s =================================================================== --- /dev/null +++ test/ELF/cgprofile-warn.s @@ -0,0 +1,29 @@ +# REQUIRES: x86 + +# RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t + +# RUN: echo "A B 100" > %t.call_graph +# RUN: echo "A C 40" >> %t.call_graph +# RUN: echo "B C 30" >> %t.call_graph +# RUN: echo "adena A 30" >> %t.call_graph +# RUN: echo "poppy A 30" >> %t.call_graph +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t.out \ +# RUN: -noinhibit-exec -icf=all 2>&1 | FileCheck %s + + .section .text.C,"ax",@progbits + .globl C +C: + mov poppy, %rax + retq + +B = 0x1234 + + .section .text.A,"ax",@progbits + .globl A +A: + mov poppy, %rax + retq + +# CHECK: unable to order absolute symbol: B +# CHECK: call graph file: no such symbol: adena +# CHECK: unable to order undefined symbol: poppy