Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -81,6 +81,11 @@ "instruction count below this threshold"), cl::init(4), cl::Hidden); +static cl::opt UseProfileBasedCostAnalysis( + "use-profile-based-cost-analysis", + cl::desc("Use profile data to guide machine block placement."), + cl::init(false), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -249,7 +254,12 @@ void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); + bool useProfileData(MachineFunction &F) { + return UseProfileBasedCostAnalysis && F.getFunction()->getEntryCount(); + } public: static char ID; // Pass identification, replacement for typeid @@ -791,6 +801,100 @@ std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we could determine the benefit in terms of fall through +/// opportunities when rotating a loop chain and select the best rotation. +/// Basically, there are three kinds of benefit costs to consider for each +/// rotation: +/// 1. The fall through edge (if it exists) from BB out of the loop to the +/// loop header. +/// 2. The fall through edge (if it exists) from the loop exit to BB out of +/// the loop. +/// 3. The fall through edge (if it exists) from the last BB to the first BB +/// in the loop chain. +/// Therefore, the benefit for a given rotation is 1 + 2 - 3. We select the +/// best rotation with the largest benefit. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + double BestRotationBenefit = 0; + auto RotationPos = LoopChain.end(); + + double HeaderFallThruFreq = 0; + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + double EdgeFreq = static_cast( + (MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB)) + .getFrequency()); + // If the predecessor has only an unconditional jump to the header, we + // need to consider the cost of this jump. + if (Pred->succ_size() == 1) + EdgeFreq *= 2; + HeaderFallThruFreq = std::max(HeaderFallThruFreq, EdgeFreq); + } + } + + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB to the top: + double Benefit = 0; + + // If the current BB is not the loop header, we need to take into account + // the fall through edge from outside of the loop to the header. + if (Iter == HeaderIter) + Benefit += HeaderFallThruFreq; + + // The cost of the fall through edge with the largest frequency from the + // tail BB to outside. + BlockFrequency ExitFallThruFreq(0); + for (auto *Succ : TailBB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + auto EdgeFreq = + MBFI->getBlockFreq(TailBB) * MBPI->getEdgeProbability(TailBB, Succ); + ExitFallThruFreq = std::max(ExitFallThruFreq, EdgeFreq); + } + } + Benefit += static_cast(ExitFallThruFreq.getFrequency()); + + // The cost of breaking the once fall-through edge from the tail to the top + // of the loop chain. + if (TailBB->isSuccessor(*Iter)) { + auto FallThruEdgeFreq = + MBFI->getBlockFreq(TailBB) * MBPI->getEdgeProbability(TailBB, *Iter); + // If the tail BB has only an unconditional jump to the top, we need to + // consider the cost of this jump. In other words, if we put such a block + // to the bottom of the chain, we will end up with one more jump + // instruction, which is unnecessary if it is in other position of the + // chain. + Benefit -= (TailBB->succ_size() == 1 ? 2 : 1) * + static_cast(FallThruEdgeFreq.getFrequency()); + } + + if (Benefit > BestRotationBenefit) { + BestRotationBenefit = Benefit; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockName(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -848,7 +952,13 @@ } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + // Check if we have profile data for this function. If yes, we rotate this + // loop by heavily using the profile data for better layout. + if (useProfileData(F)) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. Index: test/CodeGen/X86/code_placement_loop_rotation.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/code_placement_loop_rotation.ll @@ -0,0 +1,69 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK + +define void @foo() { +; Test that the back edge of the loop that is quite hot will not become +; fall-through without profile data. +; +; CHECK-LABEL: foo: +; CHECK: callq a +; CHECK: callq b +; CHECK: callq c +; CHECK: callq d + +entry: + br label %while.body + +while.body: + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then, label %if.end, !prof !2 + +if.then: + call void @b() + br label %if.end + +if.end: + %call = call zeroext i1 @c() + br i1 %call, label %while.body, label %while.end + +while.end: + call void @d() + ret void +} + +define void @bar() !prof !1 { +; Test that the back edge of the loop that is quite hot will become +; fall-through with profile data. +; +; CHECK-LABEL: bar: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq a +; CHECK: callq d + +entry: + br label %while.body + +while.body: + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then, label %if.end, !prof !2 + +if.then: + call void @b() + br label %if.end + +if.end: + %call = call zeroext i1 @c() + br i1 %call, label %while.body, label %while.end + +while.end: + call void @d() + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare zeroext i1 @c() +declare void @d() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 128, i32 128}