diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt
--- a/lld/ELF/CMakeLists.txt
+++ b/lld/ELF/CMakeLists.txt
@@ -71,6 +71,7 @@
   Option
   Passes
   Support
+  TransformUtils
   TargetParser

 LINK_LIBS
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -14,6 +14,8 @@
 namespace lld::elf {
 class InputSectionBase;

+llvm::DenseMap<const InputSectionBase *, int> applyCDS();
+
 llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
 } // namespace lld::elf

diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -45,6 +45,7 @@
 #include "InputSection.h"
 #include "Symbols.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"

 #include <numeric>

@@ -260,11 +261,91 @@
   return orderMap;
 }

+using CallT = std::pair<uint64_t, uint64_t>;
+
+void buildCallGraph(std::vector<uint64_t> &FuncSizes,
+                    std::vector<uint64_t> &FuncCounts,
+                    std::vector<std::pair<CallT, uint64_t>> &CallCounts,
+                    std::vector<uint64_t> &CallOffsets,
+                    std::vector<const InputSectionBase *> &Sections) {
+  MapVector<SectionPair, uint64_t> &Profile = config->callGraphProfile;
+  DenseMap<const InputSectionBase *, size_t> SecToTargetId;
+
+  auto getOrCreateNode = [&](const InputSectionBase *InSec) -> size_t {
+    auto Res = SecToTargetId.try_emplace(InSec, Sections.size());
+    if (Res.second) {
+      // InSec has not been seen before; add a new node for it.
+      Sections.push_back(InSec);
+      assert(InSec->getSize() > 0);
+      FuncSizes.push_back(InSec->getSize());
+      FuncCounts.push_back(0);
+    }
+    return Res.first->second;
+  };
+
+  // Create the graph.
+  for (std::pair<SectionPair, uint64_t> &C : Profile) {
+    const InputSectionBase *FromSB = cast<InputSectionBase>(C.first.first);
+    const InputSectionBase *ToSB = cast<InputSectionBase>(C.first.second);
+    // Ignore edges between input sections belonging to different output
+    // sections. This is done because otherwise we would end up with clusters
+    // containing input sections that can't actually be placed adjacently in the
+    // output. This messes with the cluster size and density calculations. We
+    // would also end up moving input sections in other output sections without
+    // moving them closer to what calls them.
+    if (FromSB->getOutputSection() != ToSB->getOutputSection())
+      continue;
+
+    uint64_t Weight = C.second;
+    if (Weight == 0)
+      continue;
+    assert(Weight > 0 && "an arc with zero weight");
+
+    size_t From = getOrCreateNode(FromSB);
+    size_t To = getOrCreateNode(ToSB);
+
+    if (From == To)
+      continue;
+
+    auto It = std::make_pair(From, To);
+    CallCounts.push_back(std::make_pair(It, Weight));
+    CallOffsets.push_back((FuncSizes[From] + 1) / 2);
+    FuncCounts[To] += Weight;
+  }
+}
+
+DenseMap<const InputSectionBase *, int> elf::applyCDS() {
+  // Create the call graph.
+  std::vector<uint64_t> FuncSizes;
+  std::vector<uint64_t> FuncCounts;
+  std::vector<std::pair<CallT, uint64_t>> CallCounts;
+  std::vector<uint64_t> CallOffsets;
+  std::vector<const InputSectionBase *> Sections;
+  buildCallGraph(FuncSizes, FuncCounts, CallCounts, CallOffsets, Sections);
+
+  // Run the layout algorithm.
+  CDSortConfig Config;
+  std::vector<uint64_t> SortedSections =
+      applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
+
+  // Create the final order.
+  DenseMap<const InputSectionBase *, int> OrderMap;
+  int CurOrder = 1;
+  for (uint64_t SecIdx : SortedSections) {
+    assert(SecIdx < Sections.size());
+    OrderMap[Sections[SecIdx]] = CurOrder++;
+  }
+
+  return OrderMap;
+}
+
 // Sort sections by the profile data provided by --callgraph-profile-file.
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according to either the C³ or the CDS (cache-directed sort) algorithm.
 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
-  return CallGraphSort().run();
+  if (config->useCDS)
+    return applyCDS();
+  else
+    return CallGraphSort().run();
 }
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -276,6 +276,7 @@
   bool undefinedVersion;
   bool unique;
   bool useAndroidRelrTags = false;
+  bool useCDS = false;
   bool warnBackrefs;
   llvm::SmallVector<llvm::GlobPattern, 0> warnBackrefsExclude;
   bool warnCommon;
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -350,7 +350,7 @@

   if (config->fixCortexA8 && !config->isLE)
     error("--fix-cortex-a8 is not supported on big endian targets");
-
+
   if (config->tocOptimize && config->emachine != EM_PPC64)
     error("--toc-optimize is only supported on PowerPC64 targets");

@@ -1310,6 +1310,7 @@
   config->unique = args.hasArg(OPT_unique);
   config->useAndroidRelrTags = args.hasFlag(
       OPT_use_android_relr_tags, OPT_no_use_android_relr_tags, false);
+  config->useCDS = args.hasFlag(OPT_use_cds, OPT_no_use_cds, false);
   config->warnBackrefs =
       args.hasFlag(OPT_warn_backrefs, OPT_no_warn_backrefs, false);
   config->warnCommon = args.hasFlag(OPT_warn_common, OPT_no_warn_common, false);
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -320,6 +320,10 @@
     "Use SHT_ANDROID_RELR / DT_ANDROID_RELR* tags instead of SHT_RELR / DT_RELR*",
     "Use SHT_RELR / DT_RELR* tags (default)">;

+defm use_cds: B<"use-cds",
+    "Use cache-directed sort (CDS) to reorder functions in the binary",
+    "Do not use CDS to reorder functions in the binary">;
+
 def pic_veneer: F<"pic-veneer">,
   HelpText<"Always generate position independent thunks (veneers)">;
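
Usage sketch (the profile and object file names below are placeholders): CDS only runs when a call graph profile is available, so the new flag is combined with the existing ordering input, e.g.

  ld.lld --call-graph-ordering-file=cgprofile.txt --use-cds a.o b.o -o a.out

When the input objects already carry .llvm.call-graph-profile sections (e.g. built with profile feedback), adding --use-cds to the link line is enough; --no-use-cds, the default, keeps the existing C³ ordering.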