Index: lld/ELF/PropellerBBReordering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.h @@ -0,0 +1,264 @@ +#ifndef LLD_ELF_PROPELLER_BB_REORDERING_H +#define LLD_ELF_PROPELLER_BB_REORDERING_H + +#include "PropellerELFCfg.h" + +#include +#include +#include +#include +#include + +using std::list; +using std::map; +using std::set; +using std::unique_ptr; +using std::unordered_map; +using std::unordered_set; +using std::vector; + +using llvm::StringMap; + +namespace lld { +namespace propeller { + +enum MergeOrder { + Begin, + X2X1Y = Begin, + BeginNext, + X1YX2 = BeginNext, + X2YX1, + YX2X1, + End +}; + +/* Represents a chain of ELFCfgNodes (basic blocks). */ +class NodeChain { +public: + const ELFCfgNode *DelegateNode = nullptr; + list Nodes; + + // Total binary size of the chain + uint32_t Size{0}; + + // Total execution frequency of the chain + uint64_t Freq{0}; + + double Score{0}; + + // Constructor for building a NodeChain with a single Node. 
+ NodeChain(const ELFCfgNode *Node) { + DelegateNode = Node; + Nodes.push_back(Node); + Size = Node->ShSize; + Freq = Node->Freq; + } + + void Dump() const; + + double GetExecDensity() const { return ((double)Freq) / std::max(Size, (uint32_t)1); } + + const ELFCfgNode *GetFirstNode() const { return Nodes.front(); } + + const ELFCfgNode *GetLastNode() const { return Nodes.back(); } +}; + + +/* Chain builder based on ExtTSP metric */ +class NodeChainBuilder { +private: + /* Cfg representing a function.*/ + const ELFCfg *Cfg; + /* Set of built chains, keyed by section index of their Delegate Nodes.*/ + map> Chains; + unordered_map NodeToChainMap; + unordered_map NodeOffset; + + unordered_map MutuallyForcedOut; + + class NodeChainAssembly; + class NodeChainSlice; + + map, std::unique_ptr> + NodeChainAssemblies; + unordered_map> CandidateChains; + + void SortChainsByExecutionDensity(vector &ChainOrder); + void InitializeExtTSP(); + void AttachFallThroughs(); + + /* This function tries to place two nodes immediately adjacent to + * each other (used for fallthroughs). + * Returns true if this can be done. 
*/ + bool AttachNodes(const ELFCfgNode *Src, const ELFCfgNode *Sink); + + void MergeChainEdges(NodeChain *SplitChain, NodeChain *UnsplitChain); + + void MergeChains(NodeChain *LeftChain, NodeChain *RightChain); + void MergeChains(std::unique_ptr A); + + double ExtTSPScore(NodeChain *Chain) const; + bool UpdateNodeChainAssembly(NodeChain *SplitChain, NodeChain *UnsplitChain); + void ComputeChainOrder(vector &); + + void initMutuallyForcedEdges(); + + void initNodeChains(); + +public: + virtual ~NodeChainBuilder() = default; + NodeChainBuilder(NodeChainBuilder &) = delete; + + uint32_t getNodeOffset(const ELFCfgNode *N) const { return NodeOffset.at(N); } + + NodeChainBuilder(const ELFCfg *_Cfg) : Cfg(_Cfg) { + initNodeChains(); + initMutuallyForcedEdges(); + } + + void doSplitOrder(list &SymbolList, + list::iterator HotPlaceHolder, + list::iterator ColdPlaceHolder); +}; + +class NodeChainBuilder::NodeChainSlice { +private: + NodeChain *Chain; + list::iterator Begin, End; + uint32_t BeginOffset, EndOffset; + + NodeChainSlice(NodeChain *C, list::iterator _Begin, + list::iterator _End, + const NodeChainBuilder &ChainBuilder) + : Chain(C), Begin(_Begin), End(_End) { + + BeginOffset = ChainBuilder.getNodeOffset(*Begin); + if (End == Chain->Nodes.end()) + EndOffset = Chain->Size; + else + EndOffset = ChainBuilder.getNodeOffset(*End); + assert(EndOffset >= BeginOffset); + } + + uint32_t Size() const { return EndOffset - BeginOffset; } + + friend class NodeChainBuilder; + friend class NodeChainAssembly; +}; + +class NodeChainBuilder::NodeChainAssembly { +private: + double Score; + bool ScoreComputed{false}; + const NodeChainBuilder *ChainBuilder; + NodeChain *SplitChain, *UnsplitChain; + MergeOrder MOrder; + list::iterator SlicePos; + + vector Slices; + + NodeChainAssembly(NodeChain *ChainX, NodeChain *ChainY, + list::iterator _SlicePos, + MergeOrder _MOrder, + const NodeChainBuilder *_ChainBuilder) { + SplitChain = ChainX; + UnsplitChain = ChainY; + SlicePos = _SlicePos; + 
MOrder = _MOrder; + ChainBuilder = _ChainBuilder; + NodeChainSlice X1(ChainX, ChainX->Nodes.begin(), SlicePos, *ChainBuilder); + NodeChainSlice X2(ChainX, SlicePos, ChainX->Nodes.end(), *ChainBuilder); + NodeChainSlice Y(ChainY, ChainY->Nodes.begin(), ChainY->Nodes.end(), + *ChainBuilder); + + switch (MOrder) { + case MergeOrder::X2X1Y: + Slices = {X2, X1, Y}; + break; + case MergeOrder::X1YX2: + Slices = {X1, Y, X2}; + break; + case MergeOrder::X2YX1: + Slices = {X2, Y, X1}; + break; + case MergeOrder::YX2X1: + Slices = {Y, X2, X1}; + break; + default: + assert("Invalid MergeOrder!" && false); + } + } + + double GetExtTSPScore() { + if (ScoreComputed) + return Score; + else { + ScoreComputed = true; + return (Score = ExtTSPScore()); + } + } + + double GetExtTSPGain() { + return GetExtTSPScore() - SplitChain->Score - UnsplitChain->Score; + } + + bool FindSliceIndex(const ELFCfgNode *Node, uint8_t &Idx) const { + NodeChain *Chain = ChainBuilder->NodeToChainMap.at(Node); + if (SplitChain != Chain && UnsplitChain != Chain) + return false; + auto Offset = ChainBuilder->NodeOffset.at(Node); + for (Idx = 0; Idx < 3; ++Idx) { + if (Chain != Slices[Idx].Chain) + continue; + if (Offset < Slices[Idx].BeginOffset) + continue; + if (Offset > Slices[Idx].EndOffset) + continue; + if (Offset < Slices[Idx].EndOffset && Offset > Slices[Idx].BeginOffset) + return true; + if (Offset == Slices[Idx].EndOffset) { + for(auto NI = std::prev(Slices[Idx].End); + NI != std::prev(Slices[Idx].Begin) && !(*NI)->ShSize ; + NI--){ + if(*NI == Node) + return true; + } + } + if (Offset == Slices[Idx].BeginOffset) { + for(auto NI = Slices[Idx].Begin; + NI != Slices[Idx].End; + NI++){ + if(*NI == Node) + return true; + if((*NI)->ShSize) + break; + } + } + } + return false; + } + + double ExtTSPScore() const; + + const ELFCfgNode *GetFirstNode() const { + for (auto &Slice : Slices) + if (Slice.Begin != Slice.End) + return *Slice.Begin; + return nullptr; + } + + friend class NodeChainBuilder; + 
+public: + NodeChainAssembly(NodeChainAssembly &&) = default; + // copy constructor is implicitly deleted + // NodeChainAssembly(const NodeChainAssembly&) = delete; + NodeChainAssembly() = delete; +}; + + +ostream & operator << (ostream &Out, const NodeChain &Chain); + +} // namespace propeller +} // namespace lld +#endif Index: lld/ELF/PropellerBBReordering.cpp =================================================================== --- /dev/null +++ lld/ELF/PropellerBBReordering.cpp @@ -0,0 +1,542 @@ +#include "PropellerBBReordering.h" + +#include "llvm/Support/CommandLine.h" +#include "Config.h" + +#include + +#include +#include +#include +#include + +using lld::elf::config; + +using std::list; +using std::set; +using std::unique_ptr; +using std::unordered_map; +using std::vector; +using namespace llvm; + +namespace lld { +namespace propeller { + +ostream &operator<<(ostream &Out, const NodeChain &Chain) { + Out << "Chain for Cfg Node: {" << *(Chain.DelegateNode) << "}: total binary size: " << Chain.Size + << ", total frequency: " << Chain.Freq << "\n"; + Out << " Nodes: " ; + + for (const ELFCfgNode *N : Chain.Nodes) + Out << *N << " ---> " ; + return Out; +} + +double GetEdgeExtTSPScore(const ELFCfgEdge *Edge, bool IsEdgeForward, + uint32_t SrcSinkDistance) { + if (Edge->Weight == 0) + return 0; + + if (SrcSinkDistance == 0 && (Edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC)) + return Edge->Weight * config->propellerFallthroughWeight; + + if (IsEdgeForward && SrcSinkDistance < config->propellerForwardJumpDistance) + return Edge->Weight * config->propellerForwardJumpWeight * + (1.0 - ((double)SrcSinkDistance) / config->propellerForwardJumpDistance); + + if (!IsEdgeForward && SrcSinkDistance < config->propellerBackwardJumpDistance) + return Edge->Weight * config->propellerBackwardJumpWeight * + (1.0 - ((double)SrcSinkDistance) / config->propellerBackwardJumpDistance); + return 0; +} + +void NodeChainBuilder::SortChainsByExecutionDensity( + vector &ChainOrder) { + for 
(auto CI = Chains.cbegin(), CE = Chains.cend(); CI != CE; ++CI) { + ChainOrder.push_back(CI->second.get()); + } + + std::sort( + ChainOrder.begin(), ChainOrder.end(), + [this](const NodeChain *C1, const NodeChain *C2) { + if (C1->GetFirstNode() == this->Cfg->getEntryNode()) + return true; + if (C2->GetFirstNode() == this->Cfg->getEntryNode()) + return false; + double C1ExecDensity = C1->GetExecDensity(); + double C2ExecDensity = C2->GetExecDensity(); + if (C1ExecDensity == C2ExecDensity) { + if (C1->DelegateNode->MappedAddr == C2->DelegateNode->MappedAddr) + return C1->DelegateNode->Shndx < C2->DelegateNode->Shndx; + return C1->DelegateNode->MappedAddr < C2->DelegateNode->MappedAddr; + } + return C1ExecDensity > C2ExecDensity; + }); +} + +void NodeChainBuilder::AttachFallThroughs() { + for (auto &Node : Cfg->Nodes) { + if (Node->FTEdge != nullptr) { + AttachNodes(Node.get(), Node->FTEdge->Sink); + } + } + + for (auto &Edge : Cfg->IntraEdges) { + AttachNodes(Edge->Src, Edge->Sink); + } +} + +/* Merge two chains in the specified order. */ +void NodeChainBuilder::MergeChains(NodeChain *LeftChain, + NodeChain *RightChain) { + for (const ELFCfgNode *Node : RightChain->Nodes) { + LeftChain->Nodes.push_back(Node); + NodeToChainMap[Node] = LeftChain; + NodeOffset[Node] += LeftChain->Size; + } + LeftChain->Size += RightChain->Size; + LeftChain->Freq += RightChain->Freq; + Chains.erase(RightChain->DelegateNode->Shndx); +} + +/* This function tries to place two nodes immediately adjacent to + * each other (used for fallthroughs). + * Returns true if this can be done. 
*/ +bool NodeChainBuilder::AttachNodes(const ELFCfgNode *Src, + const ELFCfgNode *Sink) { + if (Sink == Cfg->getEntryNode()) + return false; + if (Src->Freq == 0 ^ Sink->Freq == 0) + return false; + NodeChain *SrcChain = NodeToChainMap.at(Src); + NodeChain *SinkChain = NodeToChainMap.at(Sink); + if (SrcChain == SinkChain) + return false; + if (SrcChain->GetLastNode() != Src || SinkChain->GetFirstNode() != Sink) + return false; + MergeChains(SrcChain, SinkChain); + return true; +} + +void NodeChainBuilder::MergeChains(std::unique_ptr A) { + /* Merge the Node sequences according to the given NodeChainAssembly.*/ + list NewNodeOrder; + for (NodeChainSlice &Slice : A->Slices){ + std::copy(Slice.Begin, Slice.End, std::back_inserter(NewNodeOrder)); + } + A->SplitChain->Nodes = std::move(NewNodeOrder); + + /* Update NodeOffset and NodeToChainMap for all the nodes in the sequence */ + uint32_t RunningOffset = 0; + for (const ELFCfgNode *Node : A->SplitChain->Nodes) { + NodeToChainMap[Node] = A->SplitChain; + NodeOffset[Node] = RunningOffset; + RunningOffset += Node->ShSize; + } + A->SplitChain->Size = RunningOffset; + + /* Update the tota frequency, and TSP score of the merged chain */ + A->SplitChain->Freq += A->UnsplitChain->Freq; + A->SplitChain->Score = A->GetExtTSPScore(); + + /* Merge the assembly candidate chains of the two chains into the candidate + * chains of the remaining NodeChain and remove the records for the defunct NodeChain. */ + for (NodeChain *C : CandidateChains[A->UnsplitChain]) { + NodeChainAssemblies.erase(std::make_pair(C, A->UnsplitChain)); + NodeChainAssemblies.erase(std::make_pair(A->UnsplitChain, C)); + CandidateChains[C].erase(A->UnsplitChain); + if(C!=A->SplitChain) + CandidateChains[A->SplitChain].insert(C); + } + + /* Update the NodeChainAssembly for all candidate chains of the merged + * NodeChain. 
Remove a NodeChain from the merge chain's candidates if the + * NodeChainAssembly update finds no gain.*/ + auto &SplitChainCandidateChains = CandidateChains[A->SplitChain]; + + for (auto CI = SplitChainCandidateChains.begin(), + CE = SplitChainCandidateChains.end(); + CI != CE;) { + NodeChain *OtherChain = *CI; + auto &OtherChainCandidateChains = CandidateChains[OtherChain]; + bool OS = UpdateNodeChainAssembly(OtherChain, A->SplitChain); + bool SO = UpdateNodeChainAssembly(A->SplitChain, OtherChain); + if (OS || SO) { + OtherChainCandidateChains.insert(A->SplitChain); + CI++; + } else { + OtherChainCandidateChains.erase(A->SplitChain); + CI = SplitChainCandidateChains.erase(CI); + } + } + + /* Remove all the candidate chain records for the merged-in chain.*/ + CandidateChains.erase(A->UnsplitChain); + + /* Finally, remove the merged-in chain record from the Chains.*/ + Chains.erase(A->UnsplitChain->DelegateNode->Shndx); +} + +/* Calculate the Extended TSP metric for a NodeChain */ +double NodeChainBuilder::ExtTSPScore(NodeChain *Chain) const { + double Score = 0; + uint32_t SrcOffset = 0; + for (const ELFCfgNode *Node : Chain->Nodes) { + for (const ELFCfgEdge *Edge : Node->Outs) { + if (!Edge->Weight) + continue; + NodeChain *SinkChain = NodeToChainMap.at(Edge->Sink); + if (SinkChain != Chain) + continue; + auto SinkOffset = getNodeOffset(Edge->Sink); + bool EdgeForward = SrcOffset + Node->ShSize <= SinkOffset; + uint32_t Distance = EdgeForward ? SinkOffset - SrcOffset - Node->ShSize + : SrcOffset - SinkOffset + Node->ShSize; + + Score += GetEdgeExtTSPScore(Edge, EdgeForward, Distance); + } + SrcOffset += Node->ShSize; + } + return Score; +} + +/* Updates the best NodeChainAssembly between two NodeChains. The existing + * record will be replaced by the new NodeChainAssembly if a non-zero gain + * is achieved. Otherwise, it will be removed. + * If a nodechain assembly record is (kept) inserted, returns true. Otherwise + * returns false. 
*/ +bool NodeChainBuilder::UpdateNodeChainAssembly(NodeChain *SplitChain, + NodeChain *UnsplitChain) { + // Remove the assembly record + NodeChainAssemblies.erase(std::make_pair(SplitChain, UnsplitChain)); + + // Only consider split the chain if the size of the chain is smaller than a treshold. + bool DoSplit = (SplitChain->Size <= config->propellerChainSplitThreshold); + auto SlicePosEnd = + DoSplit ? SplitChain->Nodes.end() : std::next(SplitChain->Nodes.begin()); + + list> CandidateNCAs; + + for (auto SlicePos = SplitChain->Nodes.begin(); SlicePos != SlicePosEnd; + ++SlicePos) { + // Do not split the mutually-forced edges in the chain. + if (SlicePos != SplitChain->Nodes.begin() && + MutuallyForcedOut[*std::prev(SlicePos)] == *SlicePos) + continue; + + // If the split position is at the beginning (no splitting), only consider one MergeOrder + auto MergeOrderEnd = (SlicePos == SplitChain->Nodes.begin()) + ? MergeOrder::BeginNext + : MergeOrder::End; + + for (uint8_t MI = MergeOrder::Begin; MI != MergeOrderEnd; MI++) { + MergeOrder MOrder = static_cast(MI); + + // Create the NodeChainAssembly representing this particular assembly. + auto NCA = unique_ptr(new NodeChainAssembly( + SplitChain, UnsplitChain, SlicePos, MOrder, this)); + + ELFCfgNode *EntryNode = Cfg->getEntryNode(); + if ((NCA->SplitChain->GetFirstNode() == EntryNode || + NCA->UnsplitChain->GetFirstNode() == EntryNode) && + NCA->GetFirstNode() != EntryNode) + continue; + CandidateNCAs.push_back(std::move(NCA)); + } + } + + // Insert the best assembly of the two chains (only if it has positive gain). 
+ auto BestCandidate = std::max_element( + CandidateNCAs.begin(), CandidateNCAs.end(), + [](unique_ptr &C1, unique_ptr &C2) { + return C1->GetExtTSPGain() < C2->GetExtTSPGain(); + }); + + if (BestCandidate != CandidateNCAs.end() && + (*BestCandidate)->GetExtTSPGain() > 0) { + NodeChainAssemblies.emplace( + std::piecewise_construct, + std::forward_as_tuple(SplitChain, UnsplitChain), + std::forward_as_tuple(std::move(*BestCandidate))); + return true; + } else + return false; +} + +void NodeChainBuilder::initNodeChains() { + for (auto &Node : Cfg->Nodes) { + NodeChain *Chain = new NodeChain(Node.get()); + NodeToChainMap[Node.get()] = Chain; + NodeOffset[Node.get()] = 0; + Chains.emplace(std::piecewise_construct, + std::forward_as_tuple(Node->Shndx), + std::forward_as_tuple(Chain)); + } +} + +void NodeChainBuilder::initMutuallyForcedEdges() { + // Find all the mutually-forced edges. + // These are all the edges which are -- based on the profile -- the only (executed) outgoing edge + // from their source node and the only (executed) incoming edges to their sink nodes + unordered_map> ProfiledOuts; + unordered_map> ProfiledIns; + + for (auto &Node : Cfg->Nodes) { + std::copy_if(Node->Outs.begin(), Node->Outs.end(), + std::back_inserter(ProfiledOuts[Node.get()]), + [](const ELFCfgEdge *Edge) { + return Edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC && + Edge->Weight != 0; + }); + std::copy_if(Node->Ins.begin(), Node->Ins.end(), + std::back_inserter(ProfiledIns[Node.get()]), + [](const ELFCfgEdge *Edge) { + return Edge->Type == ELFCfgEdge::EdgeType::INTRA_FUNC && + Edge->Weight != 0; + }); + } + + for (auto &Node : Cfg->Nodes) { + if (ProfiledOuts[Node.get()].size() == 1) { + ELFCfgEdge *Edge = ProfiledOuts[Node.get()].front(); + if (ProfiledIns[Edge->Sink].size() == 1) + MutuallyForcedOut[Node.get()] = Edge->Sink; + } + } + + // Break cycles in the mutually forced edges by cutting the edge sinking to + // the smallest address in every cycle (hopefully a loop backedge) + 
map NodeToCycleMap; + set CycleCutNodes; + unsigned CycleCount = 0; + for (auto It = MutuallyForcedOut.begin(); It != MutuallyForcedOut.end(); + ++It) { + // Check to see if the node (and its cycle) have already been visited. + if (NodeToCycleMap[It->first]) + continue; + const ELFCfgEdge *VictimEdge = nullptr; + auto NodeIt = It; + CycleCount++; + while (NodeIt != MutuallyForcedOut.end()) { + const ELFCfgNode *Node = NodeIt->first; + auto NodeCycle = NodeToCycleMap[Node]; + if (NodeCycle != 0) { + // If this node is marked with a number, either it is the same number, + // in which case we have found a cycle. Or it is a different number, + // which means we have found a path to a previously visited path + // (non-cycle). + if (NodeCycle == CycleCount) { + // We have found a cycle: add the victim edge + CycleCutNodes.insert(VictimEdge->Src); + } + break; + } else + NodeToCycleMap[Node] = CycleCount; + const ELFCfgEdge *Edge = ProfiledOuts[Node].front(); + if (!VictimEdge || + (Edge->Sink->MappedAddr < VictimEdge->Sink->MappedAddr)) { + VictimEdge = Edge; + } + NodeIt = MutuallyForcedOut.find(NodeIt->second); + } + } + + // Remove the victim edges to break cycles + for (const ELFCfgNode *Node : CycleCutNodes) + MutuallyForcedOut.erase(Node); +} + +// This function initializes the ExtTSP algorithm's data. +void NodeChainBuilder::InitializeExtTSP(){ + // For each chain, compute its ExtTSP score, add its chain assembly records + // and its merge candidate chain. 
+
+  // Each unordered pair of chains is examined at most once.
+  std::set<std::pair<NodeChain *, NodeChain *>> Visited;
+  for (auto &C : Chains) {
+    NodeChain *Chain = C.second.get();
+    Chain->Score = ExtTSPScore(Chain);
+    for (const ELFCfgNode *Node : Chain->Nodes) {
+      for (const ELFCfgEdge *Edge : Node->Outs) {
+        if (!Edge->Weight)
+          continue;
+        NodeChain *OtherChain = NodeToChainMap.at(Edge->Sink);
+        if (Chain == OtherChain)
+          continue;
+        if (Visited.count(std::make_pair(Chain, OtherChain)))
+          continue;
+        // Try both merge directions; either succeeding makes the two chains
+        // merge candidates of each other.
+        bool CO = UpdateNodeChainAssembly(Chain, OtherChain);
+        bool OC = UpdateNodeChainAssembly(OtherChain, Chain);
+        if (CO || OC) {
+          CandidateChains[Chain].insert(OtherChain);
+          CandidateChains[OtherChain].insert(Chain);
+        }
+        Visited.insert(std::make_pair(Chain, OtherChain));
+        Visited.insert(std::make_pair(OtherChain, Chain));
+      }
+    }
+  }
+}
+
+void NodeChainBuilder::ComputeChainOrder(vector<NodeChain *> &ChainOrder) {
+
+  // Attach the mutually-forced edges (which will not be split anymore).
+  for (auto &KV : MutuallyForcedOut)
+    AttachNodes(KV.first, KV.second);
+
+  // Initialize the Extended TSP algorithm's data.
+  InitializeExtTSP();
+
+  // Keep merging the chain assembly record with the highest ExtTSP gain, until
+  // no more gain is possible.
+  bool Merged;
+  do {
+    Merged = false;
+    auto BestCandidate = std::max_element(
+        NodeChainAssemblies.begin(), NodeChainAssemblies.end(),
+        [](std::pair<const std::pair<NodeChain *, NodeChain *>,
+                     unique_ptr<NodeChainAssembly>> &C1,
+           std::pair<const std::pair<NodeChain *, NodeChain *>,
+                     unique_ptr<NodeChainAssembly>> &C2) {
+          return C1.second->GetExtTSPGain() < C2.second->GetExtTSPGain();
+        });
+
+    if (BestCandidate != NodeChainAssemblies.end() &&
+        BestCandidate->second->GetExtTSPGain() > 0) {
+      // Move the assembly out before erasing its map entry, then apply it.
+      unique_ptr<NodeChainAssembly> BestCandidateNCA =
+          std::move(BestCandidate->second);
+      NodeChainAssemblies.erase(BestCandidate);
+      MergeChains(std::move(BestCandidateNCA));
+      Merged = true;
+    }
+  } while (Merged);
+
+  // Merge fallthrough basic blocks if we have missed any
+  AttachFallThroughs();
+
+  SortChainsByExecutionDensity(ChainOrder);
+}
+
+
+// This function computes the ExtTSP score for a chain assembly record.
+double NodeChainBuilder::NodeChainAssembly::ExtTSPScore() const { + double Score = 0; + for (uint8_t SrcSliceIdx = 0; SrcSliceIdx < 3; ++SrcSliceIdx) { + const NodeChainSlice &SrcSlice = Slices[SrcSliceIdx]; + uint32_t SrcNodeOffset = SrcSlice.BeginOffset; + for (auto NodeIt = SrcSlice.Begin; NodeIt != SrcSlice.End; + SrcNodeOffset += (*NodeIt)->ShSize, ++NodeIt) { + const ELFCfgNode *Node = *NodeIt; + for (const ELFCfgEdge *Edge : Node->Outs) { + if (!Edge->Weight) + continue; + + uint8_t SinkSliceIdx; + + if (FindSliceIndex(Edge->Sink, SinkSliceIdx)) { + auto SinkNodeOffset = ChainBuilder->getNodeOffset(Edge->Sink); + bool EdgeForward = + (SrcSliceIdx < SinkSliceIdx) || + (SrcSliceIdx == SinkSliceIdx && (SrcNodeOffset + Node->ShSize <= SinkNodeOffset)); + + uint32_t Distance = 0; + + if (SrcSliceIdx == SinkSliceIdx) { + Distance = EdgeForward + ? SinkNodeOffset - SrcNodeOffset - Node->ShSize + : SrcNodeOffset - SinkNodeOffset + Node->ShSize; + } else { + const NodeChainSlice &SinkSlice = Slices[SinkSliceIdx]; + Distance = EdgeForward + ? SrcSlice.EndOffset - SrcNodeOffset - Node->ShSize + + SinkNodeOffset - SinkSlice.BeginOffset + : SrcNodeOffset - SrcSlice.BeginOffset + Node->ShSize + + SinkSlice.EndOffset - SinkNodeOffset ; + // Increment the distance by the size of the middle slice if the src + // and sink are from the two ends. + if (std::abs(SinkSliceIdx - SrcSliceIdx) == 2) + Distance += Slices[1].Size(); + } + Score += GetEdgeExtTSPScore(Edge, EdgeForward, Distance); + } + } + } + } + return Score; +} + +void NodeChainBuilder::doSplitOrder(list &SymbolList, + list::iterator HotPlaceHolder, + list::iterator ColdPlaceHolder) { + + vector ChainOrder; + ComputeChainOrder(ChainOrder); + + std::unordered_map Address; + unsigned CurrentAddress = 0; + for (const NodeChain *C : ChainOrder) { + list::iterator InsertPos = + C->Freq ? 
HotPlaceHolder : ColdPlaceHolder; + for (const ELFCfgNode *N : C->Nodes){ + SymbolList.insert(InsertPos, N->ShName); + if (C->Freq){ + Address[N]=CurrentAddress; + CurrentAddress+=N->ShSize; + } + } + } + + if(config->propellerAlignBasicBlocks) { + + enum VisitStatus { + NONE = 0, + DURING, + FINISHED }; + + std::unordered_map BackEdgeFreq; + std::unordered_map Visited; + + std::function visit; + visit = [&Address, &Visited, &BackEdgeFreq, &visit](const ELFCfgNode * N){ + if (Visited[N]!=NONE) + return; + if (!N->Freq) + return; + Visited[N] = DURING; + if(N->FTEdge) + visit(N->FTEdge->Sink); + for(const ELFCfgEdge * E: N->Outs){ + if(E->Sink->Freq && Address[E->Sink] > Address[N]) + visit(E->Sink); + } + for(const ELFCfgEdge * E: N->Outs){ + if(E->Sink->Freq && Address[E->Sink] <= Address[N]){ + if(Visited[E->Sink]==DURING){ + BackEdgeFreq[E->Sink]+=E->Weight; + } + } + } + Visited[N] = FINISHED; + }; + + for(const NodeChain *C: ChainOrder) + if (C->Freq != 0){ + for (const ELFCfgNode *N : C->Nodes) + visit(N); + } + + for(auto &N: Cfg->Nodes){ + if(N.get() == Cfg->getEntryNode()) + continue; + if(N->Freq && + (N->Freq >= 10 * Cfg->getEntryNode()->Freq) && + (BackEdgeFreq[N.get()] * 5 >= N->Freq * 4)){ + config->symbolAlignmentFile.insert(std::make_pair(N->ShName, 16)); + } else + config->symbolAlignmentFile.insert(std::make_pair(N->ShName, 1)); + } + } +} + +} // namespace propeller +} // namespace lld Index: lld/ELF/PropellerFuncOrdering.h =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.h @@ -0,0 +1,67 @@ +#ifndef LLD_ELF_PROPELLER_FUNC_ORDERING_H +#define LLD_ELF_PROPELLER_FUNC_ORDERING_H + +#include +#include +#include +#include + +using std::list; +using std::map; +using std::unique_ptr; +using std::vector; + +namespace lld { +namespace propeller { + +class ELFCfg; + +class CCubeAlgorithm { +public: + class Cluster { + public: + Cluster(ELFCfg *Cfg); + ~Cluster(); + list Cfgs; + uint64_t 
Size; + uint64_t Weight; + + // Merge "Other" cluster into this cluster. + Cluster & operator << (Cluster &Other) { + Cfgs.insert(Cfgs.end(), Other.Cfgs.begin(), Other.Cfgs.end()); + this->Weight += Other.Weight; + this->Size += Other.Size; + return *this; + } + + double getDensity() {return ((double)Weight)/Size;} + + // Handler is used to remove itself from ownership list without + // the need to iterate through the list. + typename list>::iterator Handler; + }; + +public: + CCubeAlgorithm() {} + + template + void init(CfgContainerTy &CfgContainer); + + unsigned doOrder(list& CfgOrder); + +private: + ELFCfg *getMostLikelyPredecessor( + Cluster *Cluster, ELFCfg *Cfg, + map &ClusterMap); + + void mergeClusters(); + void sortClusters(); + + vector HotCfgs, ColdCfgs; + list> Clusters; +}; + +} +} + +#endif Index: lld/ELF/PropellerFuncOrdering.cpp =================================================================== --- /dev/null +++ lld/ELF/PropellerFuncOrdering.cpp @@ -0,0 +1,154 @@ +#include "PropellerFuncOrdering.h" + +#include "Config.h" +#include "Propeller.h" +#include "PropellerELFCfg.h" + +#include +#include + +using lld::elf::config; +using std::map; + +namespace lld { +namespace propeller { + +template +void CCubeAlgorithm::init(CfgContainerTy &CfgContainer) { + CfgContainer.forEachCfgRef([this](ELFCfg &Cfg) { + if (Cfg.isHot()) + this->HotCfgs.push_back(&Cfg); + else + this->ColdCfgs.push_back(&Cfg); + }); + + auto CfgComparator = [](ELFCfg *Cfg1, ELFCfg *Cfg2) ->bool { + return Cfg1->getEntryNode()->MappedAddr < Cfg2->getEntryNode()->MappedAddr; + }; + auto sortCfg = [&CfgComparator](vector &Cfg) { + std::sort(Cfg.begin(), Cfg.end(), CfgComparator); + }; + sortCfg(HotCfgs); + sortCfg(ColdCfgs); +} + +CCubeAlgorithm::Cluster::Cluster(ELFCfg *Cfg) + : Cfgs(1, Cfg) {} + +CCubeAlgorithm::Cluster::~Cluster() {} + +ELFCfg *CCubeAlgorithm::getMostLikelyPredecessor( + Cluster *Cluster, ELFCfg *Cfg, + map + &ClusterMap) { + ELFCfgNode *Entry = Cfg->getEntryNode(); + 
if (!Entry) + return nullptr; + ELFCfgEdge *E = nullptr; + for (ELFCfgEdge *CallIn : Entry->CallIns) { + auto *Caller = CallIn->Src->Cfg; + auto *CallerCluster = ClusterMap[Caller]; + assert(Caller->isHot()); + if (Caller == Cfg || CallerCluster == Cluster) + continue; + // Ignore calls which are cold relative to the callee + if (CallIn->Weight * 10 < Entry->Freq) + continue; + // Do not merge if the caller cluster's density would degrade by more than + // 1/8. + if (8 * CallerCluster->Size * (Cluster->Weight * CallerCluster->Weight) < + CallerCluster->Weight * (Cluster->Size + CallerCluster->Size)) + continue; + if (!E || E->Weight < CallIn->Weight) { + if (ClusterMap[CallIn->Src->Cfg]->Size > (1 << 21)) continue; + E = CallIn; + } + } + return E ? E->Src->Cfg : nullptr; +} + +void CCubeAlgorithm::mergeClusters() { + // Signed key is used here, because negated density are used as + // sorting keys. + map CfgWeightMap; + map ClusterMap; + for(ELFCfg * Cfg: HotCfgs){ + uint64_t CfgWeight = 0; + uint64_t CfgSize = config->propellerSplitFuncs ? 
0 : (double)Cfg->Size; + Cfg->forEachNodeRef([&CfgSize, &CfgWeight](ELFCfgNode &N) { + CfgWeight += N.Freq * N.ShSize; + if (config->propellerSplitFuncs && N.Freq) + CfgSize += N.ShSize; + }); + + assert(CfgSize!=0); + Cluster *C = new Cluster(Cfg); + C->Weight = CfgWeight; + C->Size = std::max(CfgSize, (uint64_t)1); + CfgWeightMap.emplace(Cfg, C->getDensity()); + C->Handler = Clusters.emplace(Clusters.end(), C); + ClusterMap[Cfg] = C; + } + + std::stable_sort(HotCfgs.begin(), HotCfgs.end(), + [&CfgWeightMap] (ELFCfg* Cfg1, ELFCfg* Cfg2){ + return CfgWeightMap[Cfg1] > CfgWeightMap[Cfg2]; + }); + for (ELFCfg* Cfg : HotCfgs){ + if (CfgWeightMap[Cfg] <= 0.005) + break; + auto *Cluster = ClusterMap[Cfg]; + if (Cluster->Size > (1 << 21)) + continue; + assert(Cluster); + + ELFCfg *PredecessorCfg = + getMostLikelyPredecessor(Cluster, Cfg, ClusterMap); + if (!PredecessorCfg) + continue; + assert(PredecessorCfg != Cfg); + // log("propeller: most-likely caller of " + Twine(Cfg->Name) + " -> " + Twine(PredecessorCfg->Name)); + auto *PredecessorCluster = ClusterMap[PredecessorCfg]; + assert(PredecessorCluster); + + if (PredecessorCluster == Cluster) + continue; + // if (PredecessorCluster->Size + Cluster->Size > (1 << 21)) + // continue; + + // Join 2 clusters into PredecessorCluster. + *PredecessorCluster << *Cluster; + + // Update Cfg <-> Cluster mapping, because all cfgs that were + // previsously in Cluster are now in PredecessorCluster. 
+ for (ELFCfg *Cfg : Cluster->Cfgs) { + ClusterMap[Cfg] = PredecessorCluster; + } + Clusters.erase(Cluster->Handler); + } +} + +void CCubeAlgorithm::sortClusters() { + Clusters.sort([](unique_ptr &C1, unique_ptr &C2) { + if (C1->getDensity() == C2->getDensity()) + return C1->Cfgs.front()->getEntryNode()->MappedAddr < + C2->Cfgs.front()->getEntryNode()->MappedAddr; + return C1->getDensity() > C2->getDensity(); + }); +} + +unsigned CCubeAlgorithm::doOrder(list& CfgOrder) { + mergeClusters(); + sortClusters(); + for (auto &Cptr : Clusters) { + CfgOrder.insert(CfgOrder.end(), Cptr->Cfgs.begin(), Cptr->Cfgs.end()); + } + + CfgOrder.insert(CfgOrder.end(), ColdCfgs.begin(), ColdCfgs.end()); + return HotCfgs.size(); +} + +template void CCubeAlgorithm::init(Propeller &); + +} +} Index: lld/test/ELF/propeller/propeller-layout-function-ordering.s =================================================================== --- /dev/null +++ lld/test/ELF/propeller/propeller-layout-function-ordering.s @@ -0,0 +1,75 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises function reordering on three functions. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.out + +# RUN: llvm-nm -nS %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000008 t foo +# BEFORE-NEXT: 0000000000201128 0000000000000008 t bar +# BEFORE-NEXT: 0000000000201130 0000000000000008 t baz + +## Create a propeller profile based on the following inter-procedural calls. 
+## +## 100 100 +## baz -----> bar -----> foo +## + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 8 Nfoo" >> %t_prof.propeller +# RUN: echo "2 8 Nbar" >> %t_prof.propeller +# RUN: echo "3 8 Nbaz" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "3 2 100 C" >> %t_prof.propeller +# RUN: echo "2 1 100 C" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller + +## Link with the propeller profile and expect the functions to be reordered. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -o %t.propeller.reorder.out +# RUN: llvm-nm -nS %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER + +# REORDER: 0000000000201120 0000000000000008 t baz +# REORDER-NEXT: 0000000000201128 0000000000000008 t bar +# REORDER-NEXT: 0000000000201130 0000000000000008 t foo + +## Disable function reordering and expect that the original function order is retained. +# +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-funcs -o %t.propeller.noreorder.out +# RUN: llvm-nm -nS %t.propeller.noreorder.out| FileCheck %s --check-prefix=NOREORDER + +# NOREORDER: 0000000000201120 0000000000000008 t foo +# NOREORDER-NEXT: 0000000000201128 0000000000000008 t bar +# NOREORDER-NEXT: 0000000000201130 0000000000000008 t baz + +.section .text,"ax",@progbits,unique,1 +# -- Begin function foo +.type foo,@function +foo: + .quad 0x11 + +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo + +.section .text,"ax",@progbits,unique,2 +# -- Begin function bar +.type bar,@function +bar: + .quad 0x22 + +.Lbar_end: + .size bar, .Lbar_end-bar +# -- End function bar + +.section .text,"ax",@progbits,unique,3 +# -- Begin function baz +.type baz,@function +baz: + .quad 0x33 + +.Lbaz_end: + .size baz, .Lbaz_end-baz +# -- End function baz Index: lld/test/ELF/propeller/propeller-layout-function-with-loop.s =================================================================== --- /dev/null +++ 
lld/test/ELF/propeller/propeller-layout-function-with-loop.s @@ -0,0 +1,131 @@ +# REQUIRES: x86 +## Basic propeller tests. +## This test exercises basic block reordering on a single function with a simple loop. + +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.out +# RUN: llvm-objdump -d %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 0000000000201123 a.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: je 3 + +# BEFORE: 0000000000201128 aa.BB.foo: +# BEFORE-NEXT: nopl (%rax) + +# BEFORE: 000000000020112b aaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: jne -13 + +# BEFORE: 0000000000201130 aaaa.BB.foo: +# BEFORE-NEXT: nopl (%rax) +# BEFORE-NEXT: retq + + +## Create a propeller profile for foo, based on the cfg below: +## +## foo +## | +## |5 +## V +## +--> a.BB.foo <--+ +## | / \ | +## 0| 0/ \95 |90 +## | / \ | +## | v \ | +## aa.BB.foo v | +## \ aaa.BB.foo +## \ / +## \ / +## 0\ /10 +## \ / +## v v +## aaaa.BB.foo +## +## The optimal layout must include foo -> a.BB.foo -> aaa.BB.foo -> aaaa.BB.foo +## as a fallthrough chain in the layout. 
+
+# RUN: echo "Symbols" > %t_prof.propeller
+# RUN: echo "1 14 Nfoo" >> %t_prof.propeller
+# RUN: echo "2 5 1.1" >> %t_prof.propeller
+# RUN: echo "3 3 1.2" >> %t_prof.propeller
+# RUN: echo "4 5 1.3" >> %t_prof.propeller
+# RUN: echo "5 4 1.4" >> %t_prof.propeller
+# RUN: echo "Branches" >> %t_prof.propeller
+# RUN: echo "4 2 90" >> %t_prof.propeller
+# RUN: echo "2 4 95" >> %t_prof.propeller
+# RUN: echo "Fallthroughs" >> %t_prof.propeller
+# RUN: echo "4 5 10" >> %t_prof.propeller
+# RUN: echo "1 2 5" >> %t_prof.propeller
+
+# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.reorder.out
+# RUN: llvm-objdump -d %t.propeller.reorder.out| FileCheck %s --check-prefix=REORDER
+
+# REORDER: 0000000000201120 foo:
+# REORDER-NEXT: nopl (%rax)
+
+# REORDER: 0000000000201123 a.BB.foo:
+# REORDER-NEXT: nopl (%rax)
+# REORDER-NEXT: jne 9
+
+# REORDER: 0000000000201128 aaa.BB.foo:
+# REORDER-NEXT: nopl (%rax)
+# REORDER-NEXT: jne -10
+
+# REORDER: 000000000020112d aaaa.BB.foo:
+# REORDER-NEXT: nopl (%rax)
+# REORDER-NEXT: retq
+
+# REORDER: 0000000000201131 aa.BB.foo:
+# REORDER-NEXT: nopl (%rax)
+# REORDER-NEXT: jmp -14
+
+## Disable basic block reordering and expect that the original bb order is retained.
+#
+# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-opt=no-reorder-blocks -propeller-keep-named-symbols -o %t.propeller.noreorder.out
+# RUN: diff %t.propeller.noreorder.out %t.out
+
+.section .text,"ax",@progbits
+# -- Begin function foo
+.type foo,@function
+foo:
+ nopl (%rax)
+ jmp a.BB.foo
+
+.section .text,"ax",@progbits,unique,1
+a.BB.foo:
+ nopl (%rax)
+ je aaa.BB.foo
+ jmp aa.BB.foo
+.La.BB.foo_end:
+ .size a.BB.foo, .La.BB.foo_end-a.BB.foo
+
+.section .text,"ax",@progbits,unique,2
+aa.BB.foo:
+ nopl (%rax)
+ jmp aaa.BB.foo
+.Laa.BB.foo_end:
+ .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo
+
+.section .text,"ax",@progbits,unique,3
+aaa.BB.foo:
+ nopl (%rax)
+ jne a.BB.foo
+ jmp aaaa.BB.foo
+.Laaa.BB.foo_end:
+ .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo
+
+.section .text,"ax",@progbits,unique,4
+aaaa.BB.foo:
+ nopl (%rax)
+ ret
+.Laaaa.BB.foo_end:
+ .size aaaa.BB.foo, .Laaaa.BB.foo_end-aaaa.BB.foo
+
+.section .text,"ax",@progbits
+.Lfoo_end:
+ .size foo, .Lfoo_end-foo
+# -- End function (foo)
Index: lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s
===================================================================
--- /dev/null
+++ lld/test/ELF/propeller/propeller-layout-optimal-fallthrough.s
@@ -0,0 +1,87 @@
+# REQUIRES: x86
+## Basic propeller tests.
+## This test exercises optimal basic block reordering on a single function.
+ +# RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t.o +# RUN: ld.lld %t.o -o %t.out +# RUN: llvm-nm -Sn %t.out| FileCheck %s --check-prefix=BEFORE + +# BEFORE: 0000000000201120 0000000000000005 t foo +# BEFORE-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# BEFORE-NEXT: 000000000020112a 0000000000000004 t aa.BB.foo +# BEFORE-NEXT: 000000000020112e 0000000000000004 t aaa.BB.foo + +## Create a propeller profile for foo, based on the cfg below: +## +## +## foo +## |\ +## | \100 +## | \ +## | v +## 150| a.BB.foo +## | / \ +## | /100 \0 +## | / \ +## vv v +## aaa.BB.foo aa.BB.foo +## +## A naive fallthrough maximization approach would attach foo -> aaa.BB.foo and +## as a fallthrough edge. However, the optimal ordering is foo -> a.BB.foo -> aaa.BB.foo. +## This example is motivated by Figure 6 in https://arxiv.org/pdf/1809.04676.pdf. + +# RUN: echo "Symbols" > %t_prof.propeller +# RUN: echo "1 12 Nfoo" >> %t_prof.propeller +# RUN: echo "2 5 1.1" >> %t_prof.propeller +# RUN: echo "3 4 1.2" >> %t_prof.propeller +# RUN: echo "4 4 1.3" >> %t_prof.propeller +# RUN: echo "Branches" >> %t_prof.propeller +# RUN: echo "1 4 150" >> %t_prof.propeller +# RUN: echo "2 4 100" >> %t_prof.propeller +# RUN: echo "Fallthroughs" >> %t_prof.propeller +# RUN: echo "1 2 100" >> %t_prof.propeller + +# RUN: ld.lld %t.o -propeller=%t_prof.propeller -propeller-keep-named-symbols -o %t.propeller.out +# RUN: llvm-nm -nS %t.propeller.out| FileCheck %s --check-prefix=AFTER + +# AFTER: 0000000000201120 0000000000000005 t foo +# AFTER-NEXT: 0000000000201125 0000000000000005 t a.BB.foo +# AFTER-NEXT: 000000000020112a 0000000000000004 t aaa.BB.foo +# AFTER-NEXT: 000000000020112e 0000000000000004 t aa.BB.foo + +#.global _start +#_start: +# -- Begin function foo +.section .text,"ax",@progbits +.type foo,@function +foo: + nopl (%rax) + je aaa.BB.foo + jmp a.BB.foo + +.section .text,"ax",@progbits,unique,1 +a.BB.foo: + nopl (%rax) + je aaa.BB.foo + jmp aa.BB.foo +.La.BB.foo_end: + .size 
a.BB.foo, .La.BB.foo_end-a.BB.foo + +.section .text,"ax",@progbits,unique,2 +aa.BB.foo: + nopl (%rax) + ret +.Laa.BB.foo_end: + .size aa.BB.foo, .Laa.BB.foo_end-aa.BB.foo + +.section .text,"ax",@progbits,unique,3 +aaa.BB.foo: + nopl (%rax) + ret +.Laaa.BB.foo_end: + .size aaa.BB.foo, .Laaa.BB.foo_end-aaa.BB.foo + +.section .text,"ax",@progbits +.Lfoo_end: + .size foo, .Lfoo_end-foo +# -- End function foo