Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -81,6 +81,20 @@ "instruction count below this threshold"), cl::init(4), cl::Hidden); +static cl::opt UseProfileBasedCostAnalysis( + "use-profile-based-cost-analysis", + cl::desc("Use profile data to guide machine block placement."), + cl::init(false), cl::Hidden); + +static cl::opt MisfetchCost( + "misfetch-cost", + cl::desc("The cost of misfetch due to conditional/unconditional jumps."), + cl::init(1), cl::Hidden); + +static cl::opt + JumpInstCost("jump-inst-cost", cl::desc("The cost of jump instructions."), + cl::init(1), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -249,7 +263,12 @@ void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); + bool useProfileData(MachineFunction &F) { + return UseProfileBasedCostAnalysis && F.getFunction()->getEntryCount(); + } public: static char ID; // Pass identification, replacement for typeid @@ -791,6 +810,126 @@ std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we could determine the benefit in terms of fall through +/// opportunities when rotating a loop chain and select the best rotation. +/// Basically, there are three kinds of benefit costs to consider for each +/// rotation: +/// 1. The fall through edge (if it exists) from BB out of the loop to the +/// loop header. +/// 2. The fall through edge (if it exists) from the loop exit to BB out of +/// the loop. +/// 3. The fall through edge (if it exists) from the last BB to the first BB +/// in the loop chain. +/// Therefore, the benefit for a given rotation is 1 + 2 - 3. We select the +/// best rotation with the largest benefit. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + double BestRotationBenefit = -std::numeric_limits::max(); + auto RotationPos = LoopChain.end(); + + // Compute the benefit of the fall-through edge to the loop header. As we only + // consider natural loops with single header, this computation can be done + // only once. + double HeaderFallThruBenefit = 0; + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + double EdgeFreq = static_cast( + (MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB)) + .getFrequency()); + double FallThruBenefit = EdgeFreq * MisfetchCost; + // If the predecessor has only an unconditional jump to the header, we + // need to consider the cost of this jump. + if (Pred->succ_size() == 1) + FallThruBenefit += EdgeFreq * JumpInstCost; + HeaderFallThruBenefit = std::max(HeaderFallThruBenefit, FallThruBenefit); + } + } + + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB to the top: + double Benefit = 0; + + // If the current BB is not the loop header, we need to take into account + // the fall through edge from outside of the loop to the header. + if (Iter == HeaderIter) + Benefit += HeaderFallThruBenefit; + + // The benefit of the fall through edge with the largest frequency from the + // tail BB to outside. + BlockFrequency ExitFallThruFreq(0); + for (auto *Succ : TailBB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + auto EdgeFreq = + MBFI->getBlockFreq(TailBB) * MBPI->getEdgeProbability(TailBB, Succ); + ExitFallThruFreq = std::max(ExitFallThruFreq, EdgeFreq); + } + } + Benefit += + static_cast(ExitFallThruFreq.getFrequency()) * MisfetchCost; + + // The cost of breaking the once fall-through edge from the tail to the top + // of the loop chain. Here we need to consider two cases: + // 1. If the tail node has only one successor, then we will get an + // additional jmp instruction. So the cost here is (MisfetchCost + + // JumpInstCost) * tail node frequency. + // 2. If the tail node has two successors, then we still get an additional + // jmp instruction. Note that the more frequent executed jmp instruction + // will be put ahead of the other one. Assume the frequency of those two + // branches are x and y (x >= y), then the cost will be (x * MisfetechCost + // + y * JumpInstCost) * tail node frequency. + // 3. If the tail node has more than two successors (this rarely happens), + // we won't consider any additional cost. + if (TailBB->isSuccessor(*Iter)) { + auto TailBBFreq = MBFI->getBlockFreq(TailBB); + if (TailBB->succ_size() == 1) + Benefit -= static_cast(TailBBFreq.getFrequency()) * + (MisfetchCost + JumpInstCost); + else if (TailBB->succ_size() == 2) { + double Freq1 = static_cast( + (TailBBFreq * + MBPI->getEdgeProbability(TailBB, *(TailBB->succ_begin()))) + .getFrequency()); + double Freq2 = static_cast( + (TailBBFreq * + MBPI->getEdgeProbability(TailBB, *(TailBB->succ_rbegin()))) + .getFrequency()); + if (Freq1 < Freq2) + std::swap(Freq1, Freq2); + Benefit -= Freq1 * MisfetchCost + Freq2 * JumpInstCost; + } + } + + DEBUG(dbgs() << "The benefit of loop rotation by making " + << getBlockNum(*Iter) << " to the top: " << Benefit << "\n"); + + if (Benefit > BestRotationBenefit) { + BestRotationBenefit = Benefit; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -848,7 +987,13 @@ } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + // Check if we have profile data for this function. If yes, we rotate this + // loop by heavily using the profile data for better layout. + if (useProfileData(F)) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. Index: test/CodeGen/X86/code_placement_loop_rotation.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/code_placement_loop_rotation.ll @@ -0,0 +1,80 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -use-profile-based-cost-analysis < %s | FileCheck %s -check-prefix=CHECK-PROFILE + +define void @foo() { +; Test that not all edges in the loop chain are fall through without profile +; data. +; +; CHECK-LABEL: foo: +; CHECK: callq e +; CHECK: callq f +; CHECK: callq g +; CHECK: callq h + +entry: + br label %header + +header: + call void @e() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @f() + br label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end + +end: + ret void +} + +define void @bar() !prof !1 { +; Test that all edges in the loop chain are fall through with profile data. +; +; CHECK-PROFILE-LABEL: bar: +; CHECK-PROFILE: callq g +; CHECK-PROFILE: callq h +; CHECK-PROFILE: callq e +; CHECK-PROFILE: callq f + +entry: + br label %header + +header: + call void @e() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @f() + br label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end + +end: + ret void +} + +declare zeroext i1 @a() +declare void @e() +declare void @f() +declare void @g() +declare void @h() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 16, i32 16} Index: test/CodeGen/X86/code_placement_loop_rotation2.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/code_placement_loop_rotation2.ll @@ -0,0 +1,122 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -use-profile-based-cost-analysis < %s | FileCheck %s -check-prefix=CHECK-PROFILE + +define void @foo() { +; Test a nested loop case when profile data is not available. +; +; CHECK-LABEL: foo: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq d +; CHECK: callq e +; CHECK: callq f +; CHECK: callq g +; CHECK: callq h + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + br label %header2 + +header2: + call void @c() + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then2, label %if.else2, !prof !2 + +if.then2: + call void @d() + br label %if.end2 + +if.else2: + call void @e() + br label %if.end2 + +if.end2: + call void @f() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end + +end: + ret void +} + +define void @bar() !prof !1 { +; Test a nested loop case when profile data is available. +; +; CHECK-PROFILE-LABEL: bar: +; CHECK-PROFILE: callq e +; CHECK-PROFILE: callq f +; CHECK-PROFILE: callq c +; CHECK-PROFILE: callq d +; CHECK-PROFILE: callq h +; CHECK-PROFILE: callq b +; CHECK-PROFILE: callq g + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + br label %header2 + +header2: + call void @c() + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then2, label %if.else2, !prof !2 + +if.then2: + call void @d() + br label %if.end2 + +if.else2: + call void @e() + br label %if.end2 + +if.end2: + call void @f() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end + +end: + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() +declare void @g() +declare void @h() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 16, i32 16}