Index: include/llvm/Analysis/BlockFrequencyInfoImpl.h =================================================================== --- include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -22,10 +22,12 @@ #include "llvm/Support/BlockFrequency.h" #include "llvm/Support/BranchProbability.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/Format.h" #include "llvm/Support/ScaledNumber.h" #include "llvm/Support/raw_ostream.h" #include #include +#include #include #include @@ -194,28 +196,28 @@ /// Contains the data necessary to represent a loop as a pseudo-node once it's /// packaged. struct LoopData { - typedef SmallVector, 4> ExitMap; + typedef std::pair EdgeMass; + typedef SmallVector ExitMap; + typedef SmallVector HeaderMassList; typedef SmallVector NodeList; - typedef SmallVector HeaderMassList; LoopData *Parent; ///< The parent loop. bool IsPackaged; ///< Whether this has been packaged. uint32_t NumHeaders; ///< Number of headers. ExitMap Exits; ///< Successor edges (and weights). NodeList Nodes; ///< Header and the members of the loop. - HeaderMassList BackedgeMass; ///< Mass returned to each loop header. + HeaderMassList Backedges; ///< Mass returned to each loop header. BlockMass Mass; Scaled64 Scale; LoopData(LoopData *Parent, const BlockNode &Header) - : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header), - BackedgeMass(1) {} + : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {} template LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther, It2 LastOther) : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) { NumHeaders = Nodes.size(); Nodes.insert(Nodes.end(), FirstOther, LastOther); - BackedgeMass.resize(NumHeaders); + Backedges.reserve(NumHeaders); } bool isHeader(const BlockNode &Node) const { if (isIrreducible()) @@ -225,15 +227,6 @@ } BlockNode getHeader() const { return Nodes[0]; } bool isIrreducible() const { return NumHeaders > 1; } - - HeaderMassList::difference_type getHeaderIndex(const BlockNode &B) { - assert(isHeader(B) && "this is only valid on loop header blocks"); - if (isIrreducible()) - return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) - - Nodes.begin(); - return 0; - } - NodeList::const_iterator members_begin() const { return Nodes.begin() + NumHeaders; } @@ -252,17 +245,11 @@ WorkingData(const BlockNode &Node) : Node(Node), Loop(nullptr) {} bool isLoopHeader() const { return Loop && Loop->isHeader(Node); } - bool isDoubleLoopHeader() const { - return isLoopHeader() && Loop->Parent && Loop->Parent->isIrreducible() && - Loop->Parent->isHeader(Node); - } LoopData *getContainingLoop() const { - if (!isLoopHeader()) - return Loop; - if (!isDoubleLoopHeader()) + if (Loop && Loop->getHeader() == Node) return Loop->Parent; - return Loop->Parent->Parent; + return Loop; } /// \brief Resolve a node to its representative. @@ -297,21 +284,15 @@ /// has been packaged), returns the mass of its pseudo-node. If it's a /// node inside a packaged loop, it returns the loop's mass. BlockMass &getMass() { - if (!isAPackage()) - return Mass; - if (!isADoublePackage()) + if (isAPackage()) return Loop->Mass; - return Loop->Parent->Mass; + return Mass; } /// \brief Has ContainingLoop been packaged up? bool isPackaged() const { return getResolvedNode() != Node; } /// \brief Has Loop been packaged up? bool isAPackage() const { return isLoopHeader() && Loop->IsPackaged; } - /// \brief Has Loop been packaged up twice? - bool isADoublePackage() const { - return isDoubleLoopHeader() && Loop->Parent->IsPackaged; - } }; /// \brief Unscaled probability weight. @@ -335,6 +316,9 @@ Weight() : Type(Local), Amount(0) {} Weight(DistType Type, BlockNode TargetNode, uint64_t Amount) : Type(Type), TargetNode(TargetNode), Amount(Amount) {} + bool operator==(const Weight &X) const { + return Type == X.Type && TargetNode == X.TargetNode; + } }; /// \brief Distribution of unscaled probability weight. @@ -347,9 +331,9 @@ /// the distribution. It should never overflow twice. struct Distribution { typedef SmallVector WeightList; - WeightList Weights; ///< Individual successor weights. - uint64_t Total; ///< Sum of all weights. - bool DidOverflow; ///< Whether \a Total did overflow. + WeightList Weights; ///< Individual successor weights. + uint64_t Total; ///< Sum of all weights. + bool DidOverflow; ///< Whether \a Total did overflow. Distribution() : Total(0), DidOverflow(false) {} void addLocal(const BlockNode &Node, uint64_t Amount) { @@ -405,52 +389,38 @@ bool addToDist(Distribution &Dist, const LoopData *OuterLoop, const BlockNode &Pred, const BlockNode &Succ, uint64_t Weight); - LoopData &getLoopPackage(const BlockNode &Head) { - assert(Head.Index < Working.size()); - assert(Working[Head.Index].isLoopHeader()); - return *Working[Head.Index].Loop; - } - - /// \brief Analyze irreducible SCCs. - /// - /// Separate irreducible SCCs from \c G, which is an explict graph of \c - /// OuterLoop (or the top-level function, if \c OuterLoop is \c nullptr). - /// Insert them into \a Loops before \c Insert. - /// - /// \return the \c LoopData nodes representing the irreducible SCCs. - iterator_range::iterator> - analyzeIrreducible(const bfi_detail::IrreducibleGraph &G, LoopData *OuterLoop, - std::list::iterator Insert); + typedef std::vector NodeVecTy; + typedef std::vector> SCCLoopVecTy; /// \brief Update a loop after packaging irreducible SCCs inside of it. /// - /// Update \c OuterLoop. Before finding irreducible control flow, it was - /// partway through \a computeMassInLoop(), so \a LoopData::Exits and \a - /// LoopData::BackedgeMass need to be reset. Also, nodes that were packaged - /// up need to be removed from \a OuterLoop::Nodes. - void updateLoopWithIrreducible(LoopData &OuterLoop); + /// Update \c OuterLoop. For subloops that contain nodes that are also the + /// header of OuterLoop, move their first header to OuterLoop header list. + /// Also, nodes that were packaged up need to be removed from \a + /// OuterLoop::Nodes. + void updateLoopWithIrreducible(LoopData &OuterLoop, SCCLoopVecTy &SCCs); /// \brief Distribute mass according to a distribution. /// - /// Distributes the mass in Source according to Dist. If LoopHead.isValid(), + /// Distributes the mass according to Dist. If LoopHead.isValid(), /// backedges and exits are stored in its entry in Loops. /// /// Mass is distributed in parallel from two copies of the source mass. - void distributeMass(const BlockNode &Source, LoopData *OuterLoop, - Distribution &Dist); + void distributeMass(const BlockMass &Mass, LoopData *OuterLoop, + Distribution &Dist, bool PropagateBackedge = false); /// \brief Compute the loop scale for a loop. void computeLoopScale(LoopData &Loop); - /// Adjust the mass of all headers in an irreducible loop. - /// - /// Initially, irreducible loops are assumed to distribute their mass - /// equally among its headers. This can lead to wrong frequency estimates - /// since some headers may be executed more frequently than others. + /// \brief Create subloops and single nodes from irreducible graph. /// - /// This adjusts header mass distribution so it matches the weights of - /// the backedges going into each of the loop headers. - void adjustLoopHeaderMass(LoopData &Loop); + /// Divide nodes of irreducible graph into nodes of subloops and standalone + /// nodes. For each subloop, save those trivial SCC that are before it + /// in topological order. LeftSCCs are those trivial SCCs that are after all + /// subloops. + void createSCCLoop(LoopData *OuterLoop, const bfi_detail::IrreducibleGraph &G, + std::list::iterator Insert, NodeVecTy &LeftSCCs, + SCCLoopVecTy &SCCs); /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -789,7 +759,7 @@ typedef typename bfi_detail::TypeMap::BlockT BlockT; typedef typename bfi_detail::TypeMap::FunctionT FunctionT; typedef typename bfi_detail::TypeMap::BranchProbabilityInfoT - BranchProbabilityInfoT; + BranchProbabilityInfoT; typedef typename bfi_detail::TypeMap::LoopT LoopT; typedef typename bfi_detail::TypeMap::LoopInfoT LoopInfoT; @@ -844,7 +814,8 @@ /// currently assigned to \c Node between its successors. /// /// \return \c true unless there's an irreducible backedge. - bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node); + bool propagateMassToSuccessors(LoopData *OuterLoop, const BlockNode &Node, + bool PropagateBackedge = false); /// \brief Compute mass in a particular loop. /// @@ -866,6 +837,32 @@ /// \return \c true unless there's an irreducible backedge. bool tryToComputeMassInFunction(); + typedef std::map> + GeometricMeanInfo; + + /// \brief Compute geometric series start term for headers. + /// + /// Propagate mass on each node of \c Loop in topological order. For + /// subloops, recurisvely call \c computeIrreducibleMass to calculate its + /// local frequency and exits weights distribution. Store calculated terms + /// in \c HeaderData. + void computeStartTerm(GeometricMeanInfo &HeaderData, LoopData *Loop, + SCCLoopVecTy &SubLoops, NodeVecTy &LeftSCCs, + std::list::iterator Insert); + + /// \brief Compute geometric series ratio for headers. + /// + /// Use \c Loop::scale to scale down mass of blocks. Otherwise there will be + /// overflow when propagating for computing ratio. Set default \c Loop::scale + /// to 1.0. + void computeGeometricRatio(GeometricMeanInfo &HeaderData, LoopData &Loop); + + /// \brief Compute loop scale and block masses of irregular loop. + /// + /// With computed start term and ratio in \c HeaderData, compute scaled block + /// mass. + void computeScaledBlockMass(GeometricMeanInfo &HeaderData, LoopData &Loop); + /// \brief Compute mass in (and package up) irreducible SCCs. /// /// Find the irreducible SCCs in \c OuterLoop, add them to \a Loops (in front @@ -1060,44 +1057,26 @@ template void BlockFrequencyInfoImpl::computeMassInLoops() { // Visit loops with the deepest first, and the top-level loops last. - for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) { - if (computeMassInLoop(*L)) - continue; - auto Next = std::next(L); - computeIrreducibleMass(&*L, L.base()); - L = std::prev(Next); - if (computeMassInLoop(*L)) - continue; - llvm_unreachable("unhandled irreducible control flow"); - } + for (auto L = Loops.rbegin(), E = Loops.rend(); L != E; ++L) + if (!computeMassInLoop(*L)) { + auto Next = std::next(L); + computeIrreducibleMass(&*L, L.base()); + L = std::prev(Next); + } } template bool BlockFrequencyInfoImpl::computeMassInLoop(LoopData &Loop) { // Compute mass in loop. DEBUG(dbgs() << "compute-mass-in-loop: " << getLoopName(Loop) << "\n"); - - if (Loop.isIrreducible()) { - BlockMass Remaining = BlockMass::getFull(); - for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &Mass = Working[Loop.Nodes[H].Index].getMass(); - Mass = Remaining * BranchProbability(1, Loop.NumHeaders - H); - Remaining -= Mass; - } - for (const BlockNode &M : Loop.Nodes) - if (!propagateMassToSuccessors(&Loop, M)) - llvm_unreachable("unhandled irreducible control flow"); - - adjustLoopHeaderMass(Loop); - } else { - Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); - if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) - llvm_unreachable("irreducible control flow to loop header!?"); - for (const BlockNode &M : Loop.members()) - if (!propagateMassToSuccessors(&Loop, M)) - // Irreducible backedge. - return false; - } + assert(!Loop.isIrreducible() && "Irreducible loop don't need this"); + Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); + if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) + llvm_unreachable("irreducible control flow to loop header!?"); + for (const BlockNode &M : Loop.members()) + if (!propagateMassToSuccessors(&Loop, M)) + // Irreducible backedge. + return false; computeLoopScale(Loop); packageLoop(Loop); @@ -1128,9 +1107,6 @@ if (tryToComputeMassInFunction()) return; computeIrreducibleMass(nullptr, Loops.begin()); - if (tryToComputeMassInFunction()) - return; - llvm_unreachable("unhandled irreducible control flow"); } /// \note This should be a lambda, but that crashes GCC 4.7. @@ -1141,8 +1117,7 @@ typedef GraphTraits Successor; const BlockFrequencyInfoImpl &BFI; - explicit BlockEdgesAdder(const BlockFrequencyInfoImpl &BFI) - : BFI(BFI) {} + explicit BlockEdgesAdder(const BlockFrequencyInfoImpl &BFI) : BFI(BFI) {} void operator()(IrreducibleGraph &G, IrreducibleGraph::IrrNode &Irr, const LoopData *OuterLoop) { const BlockT *BB = BFI.RPOT[Irr.Node.Index]; @@ -1153,24 +1128,184 @@ }; } template +void BlockFrequencyInfoImpl::computeStartTerm( + GeometricMeanInfo &HeaderData, LoopData *Loop, SCCLoopVecTy &SubLoops, + NodeVecTy &LeftSCCs, std::list::iterator Insert) { + DEBUG(dbgs() << "compute start terms\n"); + for (auto &S : SubLoops) { + // First propagate mass on subloop's dependence nodes. + for (const auto Node : S.second) + if (!propagateMassToSuccessors(Loop, Node)) + llvm_unreachable("unhandled irreducible control flow"); + + // Save accumulated header mass of current subloop. After local mass of + // current subloop is recursively computed, we use this total mass to + // propagate through it. + BlockMass TotalInputMass = std::accumulate( + S.first->Nodes.begin(), S.first->Nodes.begin() + S.first->NumHeaders, + BlockMass::getEmpty(), [this](BlockMass &T, BlockNode &N) { + return T += Working[N.Index].getMass(); + }); + + // Recusively compute local mass of subloop + auto Next = std::next(Insert); + computeIrreducibleMass(S.first, Insert); + Insert = Next; + + DEBUG(dbgs() << "propagate on subloop node\n"); + S.first->Mass = TotalInputMass; + if (!propagateMassToSuccessors(Loop, S.first->getHeader())) + llvm_unreachable("unhandled irreducible control flow"); + } + // Propogate on the rest of the nodes. + DEBUG(dbgs() << "propagate on the rest of the nodes\n"); + for (auto Node : LeftSCCs) + if (!propagateMassToSuccessors(Loop, Node)) + llvm_unreachable("unhandled irreducible control flow"); + + if (!(Loop && Loop->isIrreducible())) + return; + + // Add backedge weight to their target node. + for (const auto &I : Loop->Backedges) + Working[I.first.Index].getMass() += I.second; + + // Save all start terms + for (uint32_t H = 0; H < Loop->NumHeaders; ++H) { + auto &HeaderNode = Loop->Nodes[H]; + auto &HeaderMass = Working[HeaderNode.Index].getMass(); + HeaderData[HeaderNode].first = HeaderMass; + DEBUG(dbgs() << "header start term mass = " << HeaderMass << " [" + << HeaderMass.toScaled() << "] (" << getBlockName(HeaderNode) + << ")\n"); + } +} + +template +void BlockFrequencyInfoImpl::computeGeometricRatio( + GeometricMeanInfo &HeaderData, LoopData &Loop) { + DEBUG(dbgs() << "compute geometric ratio\n"); + Loop.Scale = BlockMass::getFull().toScaled(); + LoopData::NodeList SortedNodes = Loop.Nodes; + std::sort(SortedNodes.begin(), SortedNodes.end()); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + Loop.Backedges.clear(); + for (auto N : SortedNodes) + Working[N.Index].getMass() = BlockMass::getEmpty(); + + auto &HeaderNode = Loop.Nodes[H]; + auto &HeaderMass = Working[HeaderNode.Index].getMass(); + HeaderMass = BlockMass::getFull(); + auto It = std::find(SortedNodes.begin(), SortedNodes.end(), HeaderNode); + for (int i = 0; i < SortedNodes.size(); ++i, ++It) { + if (It == SortedNodes.end()) + It = SortedNodes.begin(); + if (!propagateMassToSuccessors(&Loop, *It, true)) + llvm_unreachable("irreducible control flow to loop header!?"); + } + for (const auto &I : Loop.Backedges) + if (I.first == HeaderNode) + HeaderData[HeaderNode].second += I.second; + + // compute ratio to find loop scale. + // TODO use a(1-r^n)/(1-r) requires n, which a the loop scale we could have + // computed during start term computing propagation. Here it is simplified + // to use a/(1-r). + BlockMass LeftMass = BlockMass::getFull() - HeaderData[HeaderNode].second; + Scaled64 S = LeftMass.toScaled().inverse(); + if (S > Loop.Scale) + Loop.Scale = S; + DEBUG(dbgs() << "header ratio = " << S << " (" << getBlockName(HeaderNode) + << ")\n"); + } +} + +template +void BlockFrequencyInfoImpl::computeScaledBlockMass( + GeometricMeanInfo &HeaderData, LoopData &Loop) { + // Clear mass. + Loop.Exits.clear(); + for (const BlockNode &N : Loop.Nodes) + Working[N.Index].getMass() = BlockMass::getEmpty(); + + // Set scaled header mass. + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + auto &AR = HeaderData[HeaderNode]; + BlockMass LeftMass = BlockMass::getFull() - AR.second; + Scaled64 S = LeftMass.toScaled().inverse(); + // Scale down header mass by loop scale. It will be scaled back when + // unwrapping loops. + S /= Loop.Scale; + uint64_t HeaderMass = S.scale(AR.first.getMass()); + Working[HeaderNode.Index].getMass() = BlockMass(HeaderMass); + } + + for (const BlockNode &M : Loop.Nodes) + if (!propagateMassToSuccessors(&Loop, M)) + llvm_unreachable("irreducible control flow to loop header!?"); +} + +template void BlockFrequencyInfoImpl::computeIrreducibleMass( - LoopData *OuterLoop, std::list::iterator Insert) { - DEBUG(dbgs() << "analyze-irreducible-in-"; - if (OuterLoop) dbgs() << "loop: " << getLoopName(*OuterLoop) << "\n"; + LoopData *Loop, std::list::iterator Insert) { + DEBUG(dbgs() << "computeIrreducibleMass-in-"; + if (Loop) dbgs() << "loop: " << getLoopName(*Loop) << "\n"; else dbgs() << "function\n"); + // Collect weight distribution among headers since IrreducibleGraph ctor + // would reset block mass. + Distribution Dist; + if (Loop) { + for (uint32_t H = 0; H < Loop->NumHeaders; ++H) { + auto &HeaderNode = Loop->Nodes[H]; + auto HeaderMass = Working[HeaderNode.Index].getMass().getMass(); + if (HeaderMass > 0) + Dist.addLocal(HeaderNode, HeaderMass); + else + DEBUG(dbgs() << " Nothing added. Header mass is zero\n"); + } + } else { + WorkingData &Entry = Working[0]; + Dist.addLocal(Entry.Node, UINT64_C(100)); + } + using namespace bfi_detail; // Ideally, addBlockEdges() would be declared here as a lambda, but that // crashes GCC 4.7. BlockEdgesAdder addBlockEdges(*this); - IrreducibleGraph G(*this, OuterLoop, addBlockEdges); - - for (auto &L : analyzeIrreducible(G, OuterLoop, Insert)) - computeMassInLoop(L); - - if (!OuterLoop) + IrreducibleGraph G(*this, Loop, addBlockEdges); + + // Resolve dependency using SCC. + NodeVecTy LeftSCCs; + SCCLoopVecTy SubLoops; + createSCCLoop(Loop, G, Insert, LeftSCCs, SubLoops); + + // To calculate the geometirc series, we need start term(a) and ratio(r). The + // final block freq is a/(1-r). pair.first is start term, pair.second is + // ratio. + GeometricMeanInfo HeaderData; + + // Distribute mass among headers according to their input relative weight. + DEBUG(dbgs() << "distribute full mass among headers\n"); + for (const Weight &W : Dist.Weights) + DEBUG(dbgs() << " header weight = " + << format("%*" PRIu64, + std::snprintf(nullptr, 0, "%" PRIu64, + std::numeric_limits::max()), + W.Amount) + << " (" << getBlockName(W.TargetNode) << ")\n"); + distributeMass(BlockMass::getFull(), nullptr, Dist); + + computeStartTerm(HeaderData, Loop, SubLoops, LeftSCCs, Insert); + if (!Loop) return; - updateLoopWithIrreducible(*OuterLoop); + if (Loop->isIrreducible()) { + computeGeometricRatio(HeaderData, *Loop); + computeScaledBlockMass(HeaderData, *Loop); + } else + computeLoopScale(*Loop); + packageLoop(*Loop); } namespace { @@ -1181,9 +1316,8 @@ } // namespace template -bool -BlockFrequencyInfoImpl::propagateMassToSuccessors(LoopData *OuterLoop, - const BlockNode &Node) { +bool BlockFrequencyInfoImpl::propagateMassToSuccessors( + LoopData *OuterLoop, const BlockNode &Node, bool PropagateBackedge) { DEBUG(dbgs() << " - node: " << getBlockName(Node) << "\n"); // Calculate probability for successors. Distribution Dist; @@ -1204,7 +1338,8 @@ // Distribute mass to successors, saving exit and backedge data in the // loop header. - distributeMass(Node, OuterLoop, Dist); + BlockMass Mass = Working[Node.Index].getMass(); + distributeMass(Mass, OuterLoop, Dist, PropagateBackedge); return true; } Index: lib/Analysis/BlockFrequencyInfoImpl.cpp =================================================================== --- lib/Analysis/BlockFrequencyInfoImpl.cpp +++ lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -135,8 +135,9 @@ static void combineWeightsBySorting(WeightList &Weights) { // Sort so edges to the same node are adjacent. std::sort(Weights.begin(), Weights.end(), - [](const Weight &L, - const Weight &R) { return L.TargetNode < R.TargetNode; }); + [](const Weight &L, const Weight &R) { + return L.TargetNode < R.TargetNode; + }); // Combine adjacent edges. WeightList::iterator O = Weights.begin(); @@ -221,8 +222,8 @@ // sum of the weights, but let's double-check. assert(Total == std::accumulate(Weights.begin(), Weights.end(), UINT64_C(0), [](uint64_t Sum, const Weight &W) { - return Sum + W.Amount; - }) && + return Sum + W.Amount; + }) && "Expected total to be correct"); return; } @@ -295,11 +296,14 @@ return true; } - if (Working[Resolved.Index].getContainingLoop() != OuterLoop) { - DEBUG(debugSuccessor(" exit ")); - Dist.addExit(Resolved, Weight); - return true; - } + LoopData *SuccLoop = Working[Resolved.Index].getContainingLoop(); + // Edge goes into subloop should not be exit edge. + if (!SuccLoop || SuccLoop->Parent != OuterLoop) + if (SuccLoop != OuterLoop) { + DEBUG(debugSuccessor(" exit ")); + Dist.addExit(Resolved, Weight); + return true; + } if (Resolved < Pred) { if (!isLoopHeader(Pred)) { @@ -355,8 +359,8 @@ // LoopScale == 1 / ExitMass // ExitMass == HeadMass - BackedgeMass BlockMass TotalBackedgeMass; - for (auto &Mass : Loop.BackedgeMass) - TotalBackedgeMass += Mass; + for (auto &Mass : Loop.Backedges) + TotalBackedgeMass += Mass.second; BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; // Block scale stores the inverse of the scale. If this is an infinite loop, @@ -378,7 +382,7 @@ for (const BlockNode &M : Loop.Nodes) { if (auto *Loop = Working[M.Index].getPackagedLoop()) Loop->Exits.clear(); - DEBUG(dbgs() << " - node: " << getBlockName(M.Index) << "\n"); + DEBUG(dbgs() << " *- node: " << getBlockName(M.Index) << "\n"); } Loop.IsPackaged = true; } @@ -396,10 +400,10 @@ } #endif -void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, +void BlockFrequencyInfoImplBase::distributeMass(const BlockMass &Mass, LoopData *OuterLoop, - Distribution &Dist) { - BlockMass Mass = Working[Source.Index].getMass(); + Distribution &Dist, + bool PropagateBackedge) { DEBUG(dbgs() << " => mass: " << Mass << "\n"); // Distribute mass to successors as laid out in Dist. @@ -419,8 +423,10 @@ // Check for a backedge. if (W.Type == Weight::Backedge) { - OuterLoop->BackedgeMass[OuterLoop->getHeaderIndex(W.TargetNode)] += Taken; + OuterLoop->Backedges.push_back(std::make_pair(W.TargetNode, Taken)); DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); + if (PropagateBackedge && OuterLoop->isIrreducible()) + Working[W.TargetNode.Index].getMass() += Taken; continue; } @@ -591,8 +597,18 @@ void IrreducibleGraph::addEdge(IrrNode &Irr, const BlockNode &Succ, const BFIBase::LoopData *OuterLoop) { - if (OuterLoop && OuterLoop->isHeader(Succ)) - return; + if (OuterLoop) { + if (OuterLoop->isIrreducible()) { + // do not add retreating edge. + if (OuterLoop->getHeader() == Succ) + return; + } else { + // do not add back edge of OuterLoop + if (OuterLoop->isHeader(Succ)) + return; + } + } + // do not add exiting edge of OuterLoop auto L = Lookup.find(Succ.Index); if (L == Lookup.end()) return; @@ -609,9 +625,7 @@ typedef const GraphT::IrrNode NodeType; typedef GraphT::IrrNode::iterator ChildIteratorType; - static const NodeType *getEntryNode(const GraphT &G) { - return G.StartIrr; - } + static const NodeType *getEntryNode(const GraphT &G) { return G.StartIrr; } static ChildIteratorType child_begin(NodeType *N) { return N->succ_begin(); } static ChildIteratorType child_end(NodeType *N) { return N->succ_end(); } }; @@ -622,8 +636,7 @@ /// Find entry blocks and other blocks with backedges, which exist when \c G /// contains irreducible sub-SCCs. static void findIrreducibleHeaders( - const BlockFrequencyInfoImplBase &BFI, - const IrreducibleGraph &G, + const BlockFrequencyInfoImplBase &BFI, LoopData *OuterLoop, const std::vector &SCC, LoopData::NodeList &Headers, LoopData::NodeList &Others) { // Map from nodes in the SCC to whether it's an entry block. @@ -635,60 +648,37 @@ for (auto I = InSCC.begin(), E = InSCC.end(); I != E; ++I) { auto &Irr = *I->first; - for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { - if (InSCC.count(P)) - continue; - - // This is an entry block. - I->second = true; + for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) + if (!InSCC.count(P)) { + // This is an entry block. + I->second = true; + break; + } + + // This SCC may share entry with ancestor SCC. In this case, pred of + // this node is not in OuterLoop, but more outer scope. + if (OuterLoop && !I->second) + for (uint32_t H = 0; H < OuterLoop->NumHeaders; ++H) + if (OuterLoop->Nodes[H] == Irr.Node) { + I->second = true; + break; + } + + if (I->second) { Headers.push_back(Irr.Node); DEBUG(dbgs() << " => entry = " << BFI.getBlockName(Irr.Node) << "\n"); - break; + } else { + Others.push_back(Irr.Node); + DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); } } assert(Headers.size() >= 2 && "Expected irreducible CFG; -loop-info is likely invalid"); - if (Headers.size() == InSCC.size()) { - // Every block is a header. - std::sort(Headers.begin(), Headers.end()); - return; - } - - // Look for extra headers from irreducible sub-SCCs. - for (const auto &I : InSCC) { - // Entry blocks are already headers. - if (I.second) - continue; - - auto &Irr = *I.first; - for (const auto *P : make_range(Irr.pred_begin(), Irr.pred_end())) { - // Skip forward edges. - if (P->Node < Irr.Node) - continue; - - // Skip predecessors from entry blocks. These can have inverted - // ordering. - if (InSCC.lookup(P)) - continue; - - // Store the extra header. - Headers.push_back(Irr.Node); - DEBUG(dbgs() << " => extra = " << BFI.getBlockName(Irr.Node) << "\n"); - break; - } - if (Headers.back() == Irr.Node) - // Added this as a header. - continue; - - // This is not a header. - Others.push_back(Irr.Node); - DEBUG(dbgs() << " => other = " << BFI.getBlockName(Irr.Node) << "\n"); - } std::sort(Headers.begin(), Headers.end()); std::sort(Others.begin(), Others.end()); } -static void createIrreducibleLoop( +static LoopData *createIrreducibleLoop( BlockFrequencyInfoImplBase &BFI, const IrreducibleGraph &G, LoopData *OuterLoop, std::list::iterator Insert, const std::vector &SCC) { @@ -697,83 +687,92 @@ LoopData::NodeList Headers; LoopData::NodeList Others; - findIrreducibleHeaders(BFI, G, SCC, Headers, Others); + findIrreducibleHeaders(BFI, OuterLoop, SCC, Headers, Others); auto Loop = BFI.Loops.emplace(Insert, OuterLoop, Headers.begin(), Headers.end(), Others.begin(), Others.end()); // Update loop hierarchy. for (const auto &N : Loop->Nodes) - if (BFI.Working[N.Index].isLoopHeader()) + if (BFI.Working[N.Index].isLoopHeader() && + !BFI.Working[N.Index].Loop->isIrreducible()) BFI.Working[N.Index].Loop->Parent = &*Loop; else BFI.Working[N.Index].Loop = &*Loop; -} - -iterator_range::iterator> -BlockFrequencyInfoImplBase::analyzeIrreducible( - const IrreducibleGraph &G, LoopData *OuterLoop, - std::list::iterator Insert) { - assert((OuterLoop == nullptr) == (Insert == Loops.begin())); - auto Prev = OuterLoop ? std::prev(Insert) : Loops.end(); - for (auto I = scc_begin(G); !I.isAtEnd(); ++I) { - if (I->size() < 2) - continue; + return &*Loop; +} + +void BlockFrequencyInfoImplBase::updateLoopWithIrreducible( + LoopData &OuterLoop, SCCLoopVecTy &SubSCCs) { + auto &OuterNodes = OuterLoop.Nodes; + auto &NumHeader = OuterLoop.NumHeaders; + for (auto &kv : SubSCCs) { + LoopData *L = kv.first; + auto isSubLoopNode = [L](const BlockNode &N) { + return std::find(L->Nodes.begin(), L->Nodes.end(), N) != L->Nodes.end(); + }; + + // Remove headers shared with subloop. + auto IE = OuterNodes.begin() + NumHeader; + auto It = std::remove_if(OuterNodes.begin(), IE, isSubLoopNode); + auto NumShareHeader = std::distance(It, IE); + OuterNodes.erase(It, IE); + if (NumShareHeader > 0) { + assert(NumHeader >= NumShareHeader); + NumHeader -= NumShareHeader; + } - // Translate the SCC into RPO. - createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); + // Remove other nodes of subloop. + OuterNodes.erase(std::remove_if(OuterNodes.begin() + NumHeader, + OuterNodes.end(), isSubLoopNode), + OuterNodes.end()); + + // Add back subloop header. + if (NumShareHeader > 0) { + OuterNodes.insert(OuterNodes.begin(), L->getHeader()); + ++NumHeader; + } else + OuterNodes.push_back(L->getHeader()); } + std::sort(OuterNodes.begin(), OuterNodes.begin() + NumHeader); + std::sort(OuterNodes.begin() + NumHeader, OuterNodes.end()); +} + +void BlockFrequencyInfoImplBase::createSCCLoop( + LoopData *OuterLoop, const IrreducibleGraph &G, + std::list::iterator Insert, NodeVecTy &LeftSingleSCCs, + SCCLoopVecTy &SubSCCs) { + // Must get all sccs before descend since SCCIterator returns inverse + // topological order. + LoopData *PrevLoop = nullptr; + NodeVecTy SingleSCCs; + for (auto I = scc_begin(G); !I.isAtEnd(); ++I) + if (I->size() < 2) { + assert(I->size() == 1); + DEBUG(dbgs() << " - found-trivial-scc " << getBlockName((*I)[0]->Node) + << "\n"); + SingleSCCs.insert(SingleSCCs.begin(), (*I)[0]->Node); + } else { + // Translate the SCC into RPO. + auto L = createIrreducibleLoop(*this, G, OuterLoop, Insert, *I); + // SCCIterator gives reverse topological order, make it normal + if (PrevLoop) + SubSCCs.insert(SubSCCs.begin(), std::make_pair(PrevLoop, SingleSCCs)); + else + LeftSingleSCCs = SingleSCCs; + PrevLoop = L; + SingleSCCs.clear(); + } - if (OuterLoop) - return make_range(std::next(Prev), Insert); - return make_range(Loops.begin(), Insert); -} + if (SingleSCCs.empty()) + return; + if (PrevLoop) + SubSCCs.insert(SubSCCs.begin(), std::make_pair(PrevLoop, SingleSCCs)); + else + LeftSingleSCCs = SingleSCCs; -void -BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { - OuterLoop.Exits.clear(); - for (auto &Mass : OuterLoop.BackedgeMass) - Mass = BlockMass::getEmpty(); - auto O = OuterLoop.Nodes.begin() + 1; - for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) - if (!Working[I->Index].isPackaged()) - *O++ = *I; - OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); + if (OuterLoop) + updateLoopWithIrreducible(*OuterLoop, SubSCCs); } -void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { - assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); - - // Since the loop has more than one header block, the mass flowing back into - // each header will be different. Adjust the mass in each header loop to - // reflect the masses flowing through back edges. - // - // To do this, we distribute the initial mass using the backedge masses - // as weights for the distribution. - BlockMass LoopMass = BlockMass::getFull(); - Distribution Dist; - - DEBUG(dbgs() << "adjust-loop-header-mass:\n"); - for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { - auto &HeaderNode = Loop.Nodes[H]; - auto &BackedgeMass = Loop.BackedgeMass[Loop.getHeaderIndex(HeaderNode)]; - DEBUG(dbgs() << " - Add back edge mass for node " - << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); - if (BackedgeMass.getMass() > 0) - Dist.addLocal(HeaderNode, BackedgeMass.getMass()); - else - DEBUG(dbgs() << " Nothing added. Back edge mass is zero\n"); - } - - DitheringDistributer D(Dist, LoopMass); - - DEBUG(dbgs() << " Distribute loop mass " << LoopMass - << " to headers using above weights\n"); - for (const Weight &W : Dist.Weights) { - BlockMass Taken = D.takeMass(W.Amount); - assert(W.Type == Weight::Local && "all weights should be local"); - Working[W.TargetNode.Index].getMass() = Taken; - DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); - } -} Index: test/Analysis/BlockFrequencyInfo/irreducible.ll =================================================================== --- test/Analysis/BlockFrequencyInfo/irreducible.ll +++ test/Analysis/BlockFrequencyInfo/irreducible.ll @@ -98,12 +98,12 @@ br i1 %x, label %c1, label %c2, !prof !2 c1: -; CHECK-NEXT: c1: float = 2.0, +; CHECK-NEXT: c1: float = 2.1429, ; The "correct" answer is: float = 2.142857{{[0-9]*}}, br i1 %x, label %c2, label %exit, !prof !2 c2: -; CHECK-NEXT: c2: float = 2.0, +; CHECK-NEXT: c2: float = 1.8571, ; The "correct" answer is: float = 1.857142{{[0-9]*}}, br i1 %x, label %c1, label %exit, !prof !2 @@ -359,15 +359,15 @@ br i1 %x, label %left, label %right, !prof !18 left: -; CHECK-NEXT: left: float = 1.0, +; CHECK-NEXT: left: float = 0.94{{[0-9]*}}, br i1 %x, label %bottom, label %exit, !prof !19 right: -; CHECK-NEXT: right: float = 1.0, +; CHECK-NEXT: right: float = 1.17{{[0-9]*}}, br i1 %x, label %bottom, label %exit, !prof !20 bottom: -; CHECK-NEXT: bottom: float = 1.0, +; CHECK-NEXT: bottom: float = 0.88{{[0-9]*}}, br i1 %x, label %left, label %right, !prof !18 exit: @@ -378,12 +378,7 @@ !19 = !{!"branch_weights", i32 1, i32 3} !20 = !{!"branch_weights", i32 3, i32 1} -; An irreducible SCC with an irreducible sub-SCC. In the current version of -; -block-freq, this means an extra header. -; -; This testcases uses non-trivial branch weights. The CHECK statements here -; will start to fail if we change -block-freq to be more accurate. Currently, -; loop headers are affected by the weight of their corresponding back edges. +; This testcases uses non-trivial branch weights. define void @nonentry_header(i1 %x, i2 %y) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header': ; CHECK-NEXT: block-frequency-info: nonentry_header @@ -392,21 +387,21 @@ br i1 %x, label %left, label %right, !prof !21 left: -; CHECK-NEXT: left: float = 0.14 +; CHECK-NEXT: left: float = 0.8333{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 right: -; CHECK-NEXT: right: float = 0.42 +; CHECK-NEXT: right: float = 80.92, br i1 %x, label %top, label %bottom, !prof !22 top: -; CHECK-NEXT: top: float = 8.43 +; CHECK-NEXT: top: float = 283.22, switch i2 %y, label %exit [ i2 0, label %left i2 1, label %right i2 2, label %bottom ], !prof !23 bottom: -; CHECK-NEXT: bottom: float = 4.5, +; CHECK-NEXT: bottom: float = 1.6892, br label %top exit: @@ -415,4 +410,4 @@ } !21 = !{!"branch_weights", i32 2, i32 1} !22 = !{!"branch_weights", i32 1, i32 1} -!23 = !{!"branch_weights", i32 8, i32 1, i32 3, i32 12} +!23 = !{!"branch_weights", i32 8, i32 1, i32 3000, i32 12}