Index: llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h =================================================================== --- llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ llvm/trunk/include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -196,23 +196,26 @@ struct LoopData { typedef SmallVector, 4> ExitMap; typedef SmallVector NodeList; - LoopData *Parent; ///< The parent loop. - bool IsPackaged; ///< Whether this has been packaged. - uint32_t NumHeaders; ///< Number of headers. - ExitMap Exits; ///< Successor edges (and weights). - NodeList Nodes; ///< Header and the members of the loop. - BlockMass BackedgeMass; ///< Mass returned to loop header. + typedef SmallVector HeaderMassList; + LoopData *Parent; ///< The parent loop. + bool IsPackaged; ///< Whether this has been packaged. + uint32_t NumHeaders; ///< Number of headers. + ExitMap Exits; ///< Successor edges (and weights). + NodeList Nodes; ///< Header and the members of the loop. + HeaderMassList BackedgeMass; ///< Mass returned to each loop header. BlockMass Mass; Scaled64 Scale; LoopData(LoopData *Parent, const BlockNode &Header) - : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header) {} + : Parent(Parent), IsPackaged(false), NumHeaders(1), Nodes(1, Header), + BackedgeMass(1) {} template LoopData(LoopData *Parent, It1 FirstHeader, It1 LastHeader, It2 FirstOther, It2 LastOther) : Parent(Parent), IsPackaged(false), Nodes(FirstHeader, LastHeader) { NumHeaders = Nodes.size(); Nodes.insert(Nodes.end(), FirstOther, LastOther); + BackedgeMass.resize(NumHeaders); } bool isHeader(const BlockNode &Node) const { if (isIrreducible()) @@ -223,6 +226,14 @@ BlockNode getHeader() const { return Nodes[0]; } bool isIrreducible() const { return NumHeaders > 1; } + HeaderMassList::difference_type headerIndexFor(const BlockNode &B) { + assert(isHeader(B) && "this is only valid on loop header blocks"); + if (isIrreducible()) + return std::lower_bound(Nodes.begin(), Nodes.begin() + NumHeaders, B) - + Nodes.begin(); + return 0; + } + NodeList::const_iterator members_begin() const { return Nodes.begin() + NumHeaders; } @@ -431,6 +442,16 @@ /// \brief Compute the loop scale for a loop. void computeLoopScale(LoopData &Loop); + /// Adjust the mass of all headers in an irreducible loop. + /// + /// Initially, irreducible loops are assumed to distribute their mass + /// equally among its headers. This can lead to wrong frequency estimates + /// since some headers may be executed more frequently than others. + /// + /// This adjusts header mass distribution so it matches the weights of + /// the backedges going into each of the loop headers. + void adjustLoopHeaderMass(LoopData &Loop); + /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -735,11 +756,6 @@ /// as sub-loops, rather than arbitrarily shoving the problematic /// blocks into the headers of the main irreducible SCC. /// -/// - Backedge frequencies are assumed to be evenly split between the -/// headers of a given irreducible SCC. Instead, we could track the -/// backedge mass separately for each header, and adjust their relative -/// frequencies. -/// /// - Entry frequencies are assumed to be evenly split between the /// headers of a given irreducible SCC, which is the only option if we /// need to compute mass in the SCC before its parent loop. Instead, @@ -1042,6 +1058,8 @@ for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); + + adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) Index: llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp =================================================================== --- llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp +++ llvm/trunk/lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -286,7 +286,7 @@ if (isLoopHeader(Resolved)) { DEBUG(debugSuccessor("backedge")); - Dist.addBackedge(OuterLoop->getHeader(), Weight); + Dist.addBackedge(Resolved, Weight); return true; } @@ -349,7 +349,10 @@ // LoopScale == 1 / ExitMass // ExitMass == HeadMass - BackedgeMass - BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass; + BlockMass TotalBackedgeMass; + for (auto &Mass : Loop.BackedgeMass) + TotalBackedgeMass += Mass; + BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; // Block scale stores the inverse of the scale. If this is an infinite loop, // its exit mass will be zero. In this case, use an arbitrary scale for the @@ -358,7 +361,7 @@ ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse(); DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() - << " - " << Loop.BackedgeMass << ")\n" + << " - " << TotalBackedgeMass << ")\n" << " - scale = " << Loop.Scale << "\n"); } @@ -375,6 +378,19 @@ Loop.IsPackaged = true; } +#ifndef NDEBUG +static void debugAssign(const BlockFrequencyInfoImplBase &BFI, + const DitheringDistributer &D, const BlockNode &T, + const BlockMass &M, const char *Desc) { + dbgs() << " => assign " << M << " (" << D.RemMass << ")"; + if (Desc) + dbgs() << " [" << Desc << "]"; + if (T.isValid()) + dbgs() << " to " << BFI.getBlockName(T); + dbgs() << "\n"; +} +#endif + void BlockFrequencyInfoImplBase::distributeMass(const BlockNode &Source, LoopData *OuterLoop, Distribution &Dist) { @@ -384,25 +400,12 @@ // Distribute mass to successors as laid out in Dist. DitheringDistributer D(Dist, Mass); -#ifndef NDEBUG - auto debugAssign = [&](const BlockNode &T, const BlockMass &M, - const char *Desc) { - dbgs() << " => assign " << M << " (" << D.RemMass << ")"; - if (Desc) - dbgs() << " [" << Desc << "]"; - if (T.isValid()) - dbgs() << " to " << getBlockName(T); - dbgs() << "\n"; - }; - (void)debugAssign; -#endif - for (const Weight &W : Dist.Weights) { // Check for a local edge (non-backedge and non-exit). BlockMass Taken = D.takeMass(W.Amount); if (W.Type == Weight::Local) { Working[W.TargetNode.Index].getMass() += Taken; - DEBUG(debugAssign(W.TargetNode, Taken, nullptr)); + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); continue; } @@ -411,15 +414,16 @@ // Check for a backedge. if (W.Type == Weight::Backedge) { - OuterLoop->BackedgeMass += Taken; - DEBUG(debugAssign(BlockNode(), Taken, "back")); + auto ix = OuterLoop->headerIndexFor(W.TargetNode); + OuterLoop->BackedgeMass[ix] += Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "back")); continue; } // This must be an exit. assert(W.Type == Weight::Exit); OuterLoop->Exits.push_back(std::make_pair(W.TargetNode, Taken)); - DEBUG(debugAssign(W.TargetNode, Taken, "exit")); + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, "exit")); } } @@ -713,10 +717,44 @@ void BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { OuterLoop.Exits.clear(); - OuterLoop.BackedgeMass = BlockMass::getEmpty(); + for (auto &Mass : OuterLoop.BackedgeMass) + Mass = BlockMass::getEmpty(); auto O = OuterLoop.Nodes.begin() + 1; for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) if (!Working[I->Index].isPackaged()) *O++ = *I; OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); } + +void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { + assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); + + // Since the loop has more than one header block, the mass flowing back into + // each header will be different. Adjust the mass in each header loop to + // reflect the masses flowing through back edges. + // + // To do this, we distribute the initial mass using the backedge masses + // as weights for the distribution. + BlockMass LoopMass = BlockMass::getFull(); + Distribution Dist; + + DEBUG(dbgs() << "adjust-loop-header-mass:\n"); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + auto &BackedgeMass = Loop.BackedgeMass[Loop.headerIndexFor(HeaderNode)]; + DEBUG(dbgs() << " - Add back edge mass for node " + << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); + Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + } + + DitheringDistributer D(Dist, LoopMass); + + DEBUG(dbgs() << " Distribute loop mass " << LoopMass + << " to headers using above weights\n"); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + DEBUG(debugAssign(*this, D, W.TargetNode, Taken, nullptr)); + } +} Index: llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll =================================================================== --- llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll +++ llvm/trunk/test/Analysis/BlockFrequencyInfo/PR23525.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +@g = global i32 0, align 4 + +; Function Attrs: inlinehint noinline nounwind uwtable +define i32 @_Z8hot_loopi(i32 %n) !prof !1 { +entry: + %div = sdiv i32 %n, 2 + %rem12 = and i32 %n, 1 + %cmp = icmp eq i32 %rem12, 0 + br i1 %cmp, label %Next, label %for.cond, !prof !2 + +; CHECK: - for.cond: float = 25.85{{[0-9]*}}, int = 206 +for.cond: ; preds = %entry, %for.inc + %i.0 = phi i32 [ %inc, %for.inc ], [ %div, %entry ] + %cmp1 = icmp slt i32 %i.0, %n + br i1 %cmp1, label %for.body, label %for.end, !prof !3, !llvm.loop !4 + +; CHECK: - for.body: float = 24.52, int = 196 +for.body: ; preds = %for.cond + %rem213 = and i32 %i.0, 1 + %cmp3 = icmp eq i32 %rem213, 0 + br i1 %cmp3, label %if.then.4, label %Next, !prof !6 + +; CHECK: - if.then.4: float = 12.26{{[0-9]*}}, int = 98 +if.then.4: ; preds = %for.body + %0 = load i32, i32* @g, align 4, !tbaa !7 + %mul = shl nsw i32 %0, 1 + br label %for.inc + +; CHECK: - Next: float = 12.41{{[0-9]*}}, int = 99 +Next: ; preds = %for.body, %entry + %i.1 = phi i32 [ %div, %entry ], [ %i.0, %for.body ] + %1 = load i32, i32* @g, align 4, !tbaa !7 + %add = add nsw i32 %1, %n + br label %for.inc + +; CHECK: - for.inc: float = 38.28{{[0-9]*}}, int = 306 +for.inc: ; preds = %if.then.4, %Next + %storemerge = phi i32 [ %add, %Next ], [ %mul, %if.then.4 ] + %i.2 = phi i32 [ %i.1, %Next ], [ %i.0, %if.then.4 ] + store i32 %storemerge, i32* @g, align 4, !tbaa !7 + %inc = add nsw i32 %i.2, 1 + br label %for.cond + +; CHECK: - for.end: float = 1.0, int = 8 +for.end: ; preds = %for.cond + %2 = load i32, i32* @g, align 4, !tbaa !7 + ret i32 %2 +} + +; Function Attrs: nounwind uwtable +define i32 @main() !prof !11 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 0 + +for.body: ; preds = %for.body, %entry + %i.04 = phi i32 [ 1, %entry ], [ %inc, %for.body ] + %call = tail call i32 @_Z8hot_loopi(i32 %i.04) + %inc = add nuw nsw i32 %i.04, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !12 +} + + +!1 = !{!"function_entry_count", i64 99} +!2 = !{!"branch_weights", i32 50, i32 51} +!3 = !{!"branch_weights", i32 2452, i32 100} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = !{!"branch_weights", i32 1227, i32 1226} +!7 = !{!8, !8, i64 0} +!8 = !{!"int", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 2, i32 100} Index: llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll =================================================================== --- llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll +++ llvm/trunk/test/Analysis/BlockFrequencyInfo/irreducible.ll @@ -130,9 +130,6 @@ ; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This ; infinite series sums to 1. -; -; Since the currently algorithm *always* assumes entry blocks are equal, -; -block-freq gets the right answers here. define void @crossloops(i2 %x) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops': ; CHECK-NEXT: block-frequency-info: crossloops @@ -386,7 +383,7 @@ ; ; This testcases uses non-trivial branch weights. The CHECK statements here ; will start to fail if we change -block-freq to be more accurate. Currently, -; we expect left, right and top to be treated as equal headers. +; loop headers are affected by the weight of their corresponding back edges. define void @nonentry_header(i1 %x, i2 %y) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header': ; CHECK-NEXT: block-frequency-info: nonentry_header @@ -395,15 +392,15 @@ br i1 %x, label %left, label %right, !prof !21 left: -; CHECK-NEXT: left: float = 3.0, +; CHECK-NEXT: left: float = 0.14{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 right: -; CHECK-NEXT: right: float = 3.0, +; CHECK-NEXT: right: float = 0.42{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 top: -; CHECK-NEXT: top: float = 3.0, +; CHECK-NEXT: top: float = 8.43{{[0-9]*}}, switch i2 %y, label %exit [ i2 0, label %left i2 1, label %right i2 2, label %bottom ], !prof !23