Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -247,8 +247,10 @@ MachineBasicBlock *findBestLoopExit(MachineFunction &F, MachineLoop &L, const BlockFilterSet &LoopBlockSet); void buildLoopChains(MachineFunction &F, MachineLoop &L); - void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, + void rotateLoop(BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); public: @@ -753,9 +755,10 @@ /// with fallthrough out of the loop if doing so doesn't introduce unnecessary /// branches. For example, if the loop has fallthrough into its header and out /// of its bottom already, don't rotate it. -void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, - MachineBasicBlock *ExitingBB, +void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto F = (*LoopChain.begin())->getParent(); + MachineBasicBlock *ExitingBB = findBestLoopExit(*F, L, LoopBlockSet); if (!ExitingBB) return; @@ -791,6 +794,92 @@ std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we could determine the branch cost when rotating a loop +/// chain and select the best rotation. Basically, there are three kinds of +/// branch costs to consider for each rotation: +/// 1. The fall through edge (if it exists) from BB out of the loop to the +/// loop header. +/// 2. The fall through edge (if it exists) from the loop exit to BB out of +/// the loop. +/// 3. The once fall through edge (if it exists) from the last BB to the +/// first BB in the loop chain. +/// Therefore, the cost for a given rotation is 3 - 1 - 2. We select the best +/// rotation with the least cost. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + double BestRotationCost(std::numeric_limits::max()); + auto RotationPos = LoopChain.end(); + + // Get the largest frequency of fall-through edges to the loop header. + BlockFrequency HeaderFallThruFreq(0); + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + auto EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB); + HeaderFallThruFreq = std::max(HeaderFallThruFreq, EdgeFreq); + } + } + + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB to the top: + double Cost = 0; + + // If the current BB is not the loop header, we need to take into account + // the fall through edge from outside of the loop to the header. + if (Iter == HeaderIter) + Cost -= static_cast(HeaderFallThruFreq.getFrequency()); + + // The cost of the fall through edge with the largest frequency from the + // tail BB to outside. + BlockFrequency ExitFallThruFreq(0); + for (auto *Succ : TailBB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + auto EdgeFreq = + MBFI->getBlockFreq(TailBB) * MBPI->getEdgeProbability(TailBB, Succ); + ExitFallThruFreq = std::max(ExitFallThruFreq, EdgeFreq); + } + } + Cost -= static_cast(ExitFallThruFreq.getFrequency()); + + // The cost of breaking the once fall-through edge from the tail to the top + // of the loop chain. + if (TailBB->isSuccessor(*Iter)) { + auto FallThruEdgeFreq = + MBFI->getBlockFreq(TailBB) * MBPI->getEdgeProbability(TailBB, *Iter); + // If the tail BB has only an unconditional jump to the top, we need to + // consider the cost of this jump. + Cost += (TailBB->succ_size() == 1 ? 2 : 1) * + static_cast(FallThruEdgeFreq.getFrequency()); + } + + if (Cost < BestRotationCost) { + BestRotationCost = Cost; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -812,14 +901,6 @@ // predecessors to the header if there is one which will result in strictly // fewer branches in the loop body. MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); - - // If we selected just the header for the loop top, look for a potentially - // profitable exit block in the event that rotating the loop can eliminate - // branches by placing an exit edge at the bottom. - MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) - ExitingBB = findBestLoopExit(F, L, LoopBlockSet); - BlockChain &LoopChain = *BlockToChain[LoopTop]; // FIXME: This is a really lame way of walking the chains in the loop: we @@ -848,7 +929,17 @@ } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + // Check if we have profile data for this function. If yes, we rotate this + // loop by heavily using the profile data for better layout. + if (F.getFunction()->getEntryCount()) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else if (LoopTop == L.getHeader()) { + // If we selected just the header for the loop top, look for a potentially + // profitable exit block in the event that rotating the loop can eliminate + // branches by placing an exit edge at the bottom. + rotateLoop(LoopChain, L, LoopBlockSet); + } DEBUG({ // Crash at the end so we get all of the debugging output first. Index: test/CodeGen/X86/code_placement_loop_rotation.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/code_placement_loop_rotation.ll @@ -0,0 +1,69 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK + +define void @foo() { +; Test that the back edge of the loop that is quite hot will not become +; fall-through without profile data. +; +; CHECK-LABEL: foo: +; CHECK: callq a +; CHECK: callq b +; CHECK: callq c +; CHECK: callq d + +entry: + br label %while.body + +while.body: + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then, label %if.end, !prof !2 + +if.then: + call void @b() + br label %if.end + +if.end: + %call = call zeroext i1 @c() + br i1 %call, label %while.body, label %while.end + +while.end: + call void @d() + ret void +} + +define void @bar() !prof !1 { +; Test that the back edge of the loop that is quite hot will become +; fall-through with profile data. +; +; CHECK-LABEL: bar: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq a +; CHECK: callq d + +entry: + br label %while.body + +while.body: + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then, label %if.end, !prof !2 + +if.then: + call void @b() + br label %if.end + +if.end: + %call = call zeroext i1 @c() + br i1 %call, label %while.body, label %while.end + +while.end: + call void @d() + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare zeroext i1 @c() +declare void @d() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 128, i32 128}