Index: include/llvm/Analysis/BlockFrequencyInfoImpl.h =================================================================== --- include/llvm/Analysis/BlockFrequencyInfoImpl.h +++ include/llvm/Analysis/BlockFrequencyInfoImpl.h @@ -196,12 +196,13 @@ struct LoopData { typedef SmallVector, 4> ExitMap; typedef SmallVector NodeList; - LoopData *Parent; ///< The parent loop. - bool IsPackaged; ///< Whether this has been packaged. - uint32_t NumHeaders; ///< Number of headers. - ExitMap Exits; ///< Successor edges (and weights). - NodeList Nodes; ///< Header and the members of the loop. - BlockMass BackedgeMass; ///< Mass returned to loop header. + typedef SmallVector HeaderMassList; + LoopData *Parent; ///< The parent loop. + bool IsPackaged; ///< Whether this has been packaged. + uint32_t NumHeaders; ///< Number of headers. + ExitMap Exits; ///< Successor edges (and weights). + NodeList Nodes; ///< Header and the members of the loop. + HeaderMassList BackedgeMass; ///< Mass returned to each loop header. BlockMass Mass; Scaled64 Scale; @@ -431,6 +432,16 @@ /// \brief Compute the loop scale for a loop. void computeLoopScale(LoopData &Loop); + /// Adjust the mass of all headers in an irreducible loop. + /// + /// Initially, irreducible loops are assumed to distribute their mass + /// equally among its headers. This can lead to wrong frequency estimates + /// since some headers may be executed more frequently than others. + /// + /// This adjusts header mass distribution so it matches the weights of + /// the backedges going into each of the loop headers. + void adjustLoopHeaderMass(LoopData &Loop); + /// \brief Package up a loop. void packageLoop(LoopData &Loop); @@ -1042,6 +1053,8 @@ for (const BlockNode &M : Loop.Nodes) if (!propagateMassToSuccessors(&Loop, M)) llvm_unreachable("unhandled irreducible control flow"); + + adjustLoopHeaderMass(Loop); } else { Working[Loop.getHeader().Index].getMass() = BlockMass::getFull(); if (!propagateMassToSuccessors(&Loop, Loop.getHeader())) Index: lib/Analysis/BlockFrequencyInfoImpl.cpp =================================================================== --- lib/Analysis/BlockFrequencyInfoImpl.cpp +++ lib/Analysis/BlockFrequencyInfoImpl.cpp @@ -286,7 +286,7 @@ if (isLoopHeader(Resolved)) { DEBUG(debugSuccessor("backedge")); - Dist.addBackedge(OuterLoop->getHeader(), Weight); + Dist.addBackedge(Resolved, Weight); return true; } @@ -349,7 +349,10 @@ // LoopScale == 1 / ExitMass // ExitMass == HeadMass - BackedgeMass - BlockMass ExitMass = BlockMass::getFull() - Loop.BackedgeMass; + BlockMass TotalBackedgeMass; + for (auto &Mass : Loop.BackedgeMass) + TotalBackedgeMass += Mass; + BlockMass ExitMass = BlockMass::getFull() - TotalBackedgeMass; // Block scale stores the inverse of the scale. If this is an infinite loop, // its exit mass will be zero. In this case, use an arbitrary scale for the @@ -358,7 +361,7 @@ ExitMass.isEmpty() ? InifiniteLoopScale : ExitMass.toScaled().inverse(); DEBUG(dbgs() << " - exit-mass = " << ExitMass << " (" << BlockMass::getFull() - << " - " << Loop.BackedgeMass << ")\n" + << " - " << TotalBackedgeMass << ")\n" << " - scale = " << Loop.Scale << "\n"); } @@ -411,8 +414,10 @@ // Check for a backedge. if (W.Type == Weight::Backedge) { - OuterLoop->BackedgeMass += Taken; - DEBUG(debugAssign(BlockNode(), Taken, "back")); + if (W.TargetNode.Index >= OuterLoop->BackedgeMass.size()) + OuterLoop->BackedgeMass.resize(W.TargetNode.Index + 1); + OuterLoop->BackedgeMass[W.TargetNode.Index] += Taken; + DEBUG(debugAssign(W.TargetNode, Taken, "back")); continue; } @@ -713,10 +718,56 @@ void BlockFrequencyInfoImplBase::updateLoopWithIrreducible(LoopData &OuterLoop) { OuterLoop.Exits.clear(); - OuterLoop.BackedgeMass = BlockMass::getEmpty(); + OuterLoop.BackedgeMass.clear(); auto O = OuterLoop.Nodes.begin() + 1; for (auto I = O, E = OuterLoop.Nodes.end(); I != E; ++I) if (!Working[I->Index].isPackaged()) *O++ = *I; OuterLoop.Nodes.erase(O, OuterLoop.Nodes.end()); } + +void BlockFrequencyInfoImplBase::adjustLoopHeaderMass(LoopData &Loop) { + assert(Loop.isIrreducible() && "this only makes sense on irreducible loops"); + + // Since the loop has more than one header block, the mass flowing back into + // each header will be different. Adjust the mass in each header loop to + // reflect the masses flowing through back edges. + // + // To do this, we distribute the initial mass using the backedge masses + // as weights for the distribution. + BlockMass LoopMass = BlockMass::getFull(); + Distribution Dist; + + DEBUG(dbgs() << "adjust-loop-header-mass:\n"); + for (uint32_t H = 0; H < Loop.NumHeaders; ++H) { + auto &HeaderNode = Loop.Nodes[H]; + auto &BackedgeMass = Loop.BackedgeMass[HeaderNode.Index]; + DEBUG(dbgs() << " - Add back edge mass for node " + << getBlockName(HeaderNode) << ": " << BackedgeMass << "\n"); + Dist.addLocal(HeaderNode, BackedgeMass.getMass()); + } + + DitheringDistributer D(Dist, LoopMass); + +#ifndef NDEBUG + auto debugAssign = [&](const BlockNode &T, const BlockMass &M, + const char *Desc) { + dbgs() << " => assign " << M << " (" << D.RemMass << ")"; + if (Desc) + dbgs() << " [" << Desc << "]"; + if (T.isValid()) + dbgs() << " to " << getBlockName(T); + dbgs() << "\n"; + }; + (void)debugAssign; +#endif + + DEBUG(dbgs() << " Distribute loop mass " << LoopMass + << " to headers using above weights\n"); + for (const Weight &W : Dist.Weights) { + BlockMass Taken = D.takeMass(W.Amount); + assert(W.Type == Weight::Local && "all weights should be local"); + Working[W.TargetNode.Index].getMass() = Taken; + DEBUG(debugAssign(W.TargetNode, Taken, nullptr)); + } +} Index: test/Analysis/BlockFrequencyInfo/PR23525.ll =================================================================== --- /dev/null +++ test/Analysis/BlockFrequencyInfo/PR23525.ll @@ -0,0 +1,80 @@ +; RUN: opt < %s -analyze -block-freq | FileCheck %s + +@g = global i32 0, align 4 + +; Function Attrs: inlinehint noinline nounwind uwtable +define i32 @_Z8hot_loopi(i32 %n) !prof !1 { +entry: + %div = sdiv i32 %n, 2 + %rem12 = and i32 %n, 1 + %cmp = icmp eq i32 %rem12, 0 + br i1 %cmp, label %Next, label %for.cond, !prof !2 + +; CHECK: - for.cond: float = 25.85{{[0-9]*}}, int = 206 +for.cond: ; preds = %entry, %for.inc + %i.0 = phi i32 [ %inc, %for.inc ], [ %div, %entry ] + %cmp1 = icmp slt i32 %i.0, %n + br i1 %cmp1, label %for.body, label %for.end, !prof !3, !llvm.loop !4 + +; CHECK: - for.body: float = 24.52, int = 196 +for.body: ; preds = %for.cond + %rem213 = and i32 %i.0, 1 + %cmp3 = icmp eq i32 %rem213, 0 + br i1 %cmp3, label %if.then.4, label %Next, !prof !6 + +; CHECK: - if.then.4: float = 12.26{{[0-9]*}}, int = 98 +if.then.4: ; preds = %for.body + %0 = load i32, i32* @g, align 4, !tbaa !7 + %mul = shl nsw i32 %0, 1 + br label %for.inc + +; CHECK: - Next: float = 12.41{{[0-9]*}}, int = 99 +Next: ; preds = %for.body, %entry + %i.1 = phi i32 [ %div, %entry ], [ %i.0, %for.body ] + %1 = load i32, i32* @g, align 4, !tbaa !7 + %add = add nsw i32 %1, %n + br label %for.inc + +; CHECK: - for.inc: float = 38.28{{[0-9]*}}, int = 306 +for.inc: ; preds = %if.then.4, %Next + %storemerge = phi i32 [ %add, %Next ], [ %mul, %if.then.4 ] + %i.2 = phi i32 [ %i.1, %Next ], [ %i.0, %if.then.4 ] + store i32 %storemerge, i32* @g, align 4, !tbaa !7 + %inc = add nsw i32 %i.2, 1 + br label %for.cond + +; CHECK: - for.end: float = 1.0, int = 8 +for.end: ; preds = %for.cond + %2 = load i32, i32* @g, align 4, !tbaa !7 + ret i32 %2 +} + +; Function Attrs: nounwind uwtable +define i32 @main() !prof !11 { +entry: + br label %for.body + +for.cond.cleanup: ; preds = %for.body + ret i32 0 + +for.body: ; preds = %for.body, %entry + %i.04 = phi i32 [ 1, %entry ], [ %inc, %for.body ] + %call = tail call i32 @_Z8hot_loopi(i32 %i.04) + %inc = add nuw nsw i32 %i.04, 1 + %exitcond = icmp eq i32 %inc, 100 + br i1 %exitcond, label %for.cond.cleanup, label %for.body, !prof !12 +} + + +!1 = !{!"function_entry_count", i64 99} +!2 = !{!"branch_weights", i32 50, i32 51} +!3 = !{!"branch_weights", i32 2452, i32 100} +!4 = distinct !{!4, !5} +!5 = !{!"llvm.loop.unroll.disable"} +!6 = !{!"branch_weights", i32 1227, i32 1226} +!7 = !{!8, !8, i64 0} +!8 = !{!"int", !9, i64 0} +!9 = !{!"omnipotent char", !10, i64 0} +!10 = !{!"Simple C/C++ TBAA"} +!11 = !{!"function_entry_count", i64 1} +!12 = !{!"branch_weights", i32 2, i32 100} Index: test/Analysis/BlockFrequencyInfo/irreducible.ll =================================================================== --- test/Analysis/BlockFrequencyInfo/irreducible.ll +++ test/Analysis/BlockFrequencyInfo/irreducible.ll @@ -130,9 +130,6 @@ ; At the first step, c1 and c2 each get 1/3 of the entry. At each subsequent ; step, c1 and c2 each get 1/3 of what's left in c1 and c2 combined. This ; infinite series sums to 1. -; -; Since the currently algorithm *always* assumes entry blocks are equal, -; -block-freq gets the right answers here. define void @crossloops(i2 %x) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'crossloops': ; CHECK-NEXT: block-frequency-info: crossloops @@ -386,7 +383,7 @@ ; ; This testcases uses non-trivial branch weights. The CHECK statements here ; will start to fail if we change -block-freq to be more accurate. Currently, -; we expect left, right and top to be treated as equal headers. +; loop headers are affected by the weight of their corresponding back edges. define void @nonentry_header(i1 %x, i2 %y) { ; CHECK-LABEL: Printing analysis {{.*}} for function 'nonentry_header': ; CHECK-NEXT: block-frequency-info: nonentry_header @@ -395,15 +392,15 @@ br i1 %x, label %left, label %right, !prof !21 left: -; CHECK-NEXT: left: float = 3.0, +; CHECK-NEXT: left: float = 0.14{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 right: -; CHECK-NEXT: right: float = 3.0, +; CHECK-NEXT: right: float = 0.42{{[0-9]*}}, br i1 %x, label %top, label %bottom, !prof !22 top: -; CHECK-NEXT: top: float = 3.0, +; CHECK-NEXT: top: float = 8.43{{[0-9]*}}, switch i2 %y, label %exit [ i2 0, label %left i2 1, label %right i2 2, label %bottom ], !prof !23