Index: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp +++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp @@ -81,6 +81,23 @@ "instruction count below this threshold"), cl::init(4), cl::Hidden); +static cl::opt + PreciseRotationCost("precise-rotation-cost", + cl::desc("Model the cost of loop rotation more " + "precisely by using profile data."), + cl::init(false), cl::Hidden); + +static cl::opt MisfetchCost( + "misfetch-cost", + cl::desc("Cost that models the probablistic risk of an instruction " + "misfetch due to a jump comparing to falling through, whose cost " + "is zero."), + cl::init(1), cl::Hidden); + +static cl::opt JumpInstCost("jump-inst-cost", + cl::desc("Cost of jump instructions."), + cl::init(1), cl::Hidden); + namespace { class BlockChain; /// \brief Type for our function-wide basic block -> block chain mapping. @@ -249,6 +266,8 @@ void buildLoopChains(MachineFunction &F, MachineLoop &L); void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void buildCFGChains(MachineFunction &F); public: @@ -791,6 +810,156 @@ std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end()); } +/// \brief Attempt to rotate a loop based on profile data to reduce branch cost. +/// +/// With profile data, we can determine the cost in terms of missed fall through +/// opportunities when rotating a loop chain and select the best rotation. +/// Basically, there are three kinds of cost to consider for each rotation: +/// 1. The possibly missed fall through edge (if it exists) from BB out of +/// the loop to the loop header. +/// 2. The possibly missed fall through edges (if they exist) from the loop +/// exits to BB out of the loop. +/// 3. The missed fall through edge (if it exists) from the last BB to the +/// first BB in the loop chain. +/// Therefore, the cost for a given rotation is the sum of costs listed above. +/// We select the best rotation with the smallest cost. +void MachineBlockPlacement::rotateLoopWithProfile( + BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + auto HeaderBB = L.getHeader(); + auto HeaderIter = std::find(LoopChain.begin(), LoopChain.end(), HeaderBB); + auto RotationPos = LoopChain.end(); + + BlockFrequency SmallestRotationCost = BlockFrequency::getMaxFrequency(); + + // A utility lambda that scales up a block frequency by dividing it by a + // branch probability which is the reciprocal of the scale. + auto ScaleBlockFrequency = [](BlockFrequency Freq, + unsigned Scale) -> BlockFrequency { + if (Scale == 0) + return 0; + // Use operator / between BlockFrequency and BranchProbability to implement + // saturating multiplication. + return Freq / BranchProbability(1, Scale); + }; + + // Compute the cost of the missed fall-through edge to the loop header if the + // chain head is not the loop header. As we only consider natural loops with + // single header, this computation can be done only once. + BlockFrequency HeaderFallThroughCost(0); + for (auto *Pred : HeaderBB->predecessors()) { + BlockChain *PredChain = BlockToChain[Pred]; + if (!LoopBlockSet.count(Pred) && + (!PredChain || Pred == *std::prev(PredChain->end()))) { + auto EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, HeaderBB); + auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); + // If the predecessor has only an unconditional jump to the header, we + // need to consider the cost of this jump. + if (Pred->succ_size() == 1) + FallThruCost += ScaleBlockFrequency(EdgeFreq, JumpInstCost); + HeaderFallThroughCost = std::max(HeaderFallThroughCost, FallThruCost); + } + } + + // Here we collect all exit blocks in the loop, and for each exit we find out + // its hottest exit edge. For each loop rotation, we define the loop exit cost + // as the sum of frequencies of exit edges we collect here, excluding the exit + // edge from the tail of the loop chain. + SmallVector, 4> ExitsWithFreq; + for (auto BB : LoopChain) { + uint32_t LargestExitEdgeWeight = 0; + for (auto *Succ : BB->successors()) { + BlockChain *SuccChain = BlockToChain[Succ]; + if (!LoopBlockSet.count(Succ) && + (!SuccChain || Succ == *SuccChain->begin())) { + uint32_t SuccWeight = MBPI->getEdgeWeight(BB, Succ); + LargestExitEdgeWeight = std::max(LargestExitEdgeWeight, SuccWeight); + } + } + if (LargestExitEdgeWeight > 0) { + uint32_t WeightScale = 0; + uint32_t SumWeight = MBPI->getSumForBlock(BB, WeightScale); + auto ExitFreq = + MBFI->getBlockFreq(BB) * + BranchProbability(LargestExitEdgeWeight / WeightScale, SumWeight); + ExitsWithFreq.emplace_back(BB, ExitFreq); + } + } + + // In this loop we iterate every block in the loop chain and calculate the + // cost assuming the block is the head of the loop chain. When the loop ends, + // we should have found the best candidate as the loop chain's head. + for (auto Iter = LoopChain.begin(), TailIter = std::prev(LoopChain.end()), + EndIter = LoopChain.end(); + Iter != EndIter; Iter++, TailIter++) { + // TailIter is used to track the tail of the loop chain if the block we are + // checking (pointed by Iter) is the head of the chain. + if (TailIter == LoopChain.end()) + TailIter = LoopChain.begin(); + + auto TailBB = *TailIter; + + // Calculate the cost by putting this BB to the top. + BlockFrequency Cost = 0; + + // If the current BB is the loop header, we need to take into account the + // cost of the missed fall through edge from outside of the loop to the + // header. + if (Iter != HeaderIter) + Cost += HeaderFallThroughCost; + + // Collect the loop exit cost by summing up frequencies of all exit edges + // except the one from the chain tail. + for (auto &ExitWithFreq : ExitsWithFreq) + if (TailBB != ExitWithFreq.first) + Cost += ExitWithFreq.second; + + // The cost of breaking the once fall-through edge from the tail to the top + // of the loop chain. Here we need to consider three cases: + // 1. If the tail node has only one successor, then we will get an + // additional jmp instruction. So the cost here is (MisfetchCost + + // JumpInstCost) * tail node frequency. + // 2. If the tail node has two successors, then we may still get an + // additional jmp instruction if the layout successor after the loop + // chain is not its CFG successor. Note that the more frequently executed + // jmp instruction will be put ahead of the other one. Assume the + // frequency of those two branches are x and y, where x is the frequency + // of the edge to the chain head, then the cost will be + // (x * MisfetechCost + min(x, y) * JumpInstCost) * tail node frequency. + // 3. If the tail node has more than two successors (this rarely happens), + // we won't consider any additional cost. + if (TailBB->isSuccessor(*Iter)) { + auto TailBBFreq = MBFI->getBlockFreq(TailBB); + if (TailBB->succ_size() == 1) + Cost += ScaleBlockFrequency(TailBBFreq.getFrequency(), + MisfetchCost + JumpInstCost); + else if (TailBB->succ_size() == 2) { + auto TailToHeadProb = MBPI->getEdgeProbability(TailBB, *Iter); + auto TailToHeadFreq = TailBBFreq * TailToHeadProb; + auto ColderEdgeFreq = TailToHeadProb > BranchProbability(1, 2) + ? TailBBFreq * TailToHeadProb.getCompl() + : TailToHeadFreq; + Cost += ScaleBlockFrequency(TailToHeadFreq, MisfetchCost) + + ScaleBlockFrequency(ColderEdgeFreq, JumpInstCost); + } + } + + DEBUG(dbgs() << "The cost of loop rotation by making " << getBlockNum(*Iter) + << " to the top: " << Cost.getFrequency() << "\n"); + + if (Cost < SmallestRotationCost) { + SmallestRotationCost = Cost; + RotationPos = Iter; + } + } + + if (RotationPos != LoopChain.end()) { + DEBUG(dbgs() << "Rotate loop by making " << getBlockNum(*RotationPos) + << " to the top\n"); + std::rotate(LoopChain.begin(), RotationPos, LoopChain.end()); + } +} + /// \brief Forms basic block chains from the natural loop structures. /// /// These chains are designed to preserve the existing *structure* of the code @@ -807,17 +976,25 @@ SmallVector BlockWorkList; BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end()); + // Check if we have profile data for this function. If yes, we will rotate + // this loop by modeling costs more precisely which requires the profile data + // for better layout. + bool RotateLoopWithProfile = + PreciseRotationCost && F.getFunction()->getEntryCount(); + // First check to see if there is an obviously preferable top block for the // loop. This will default to the header, but may end up as one of the // predecessors to the header if there is one which will result in strictly // fewer branches in the loop body. - MachineBasicBlock *LoopTop = findBestLoopTop(L, LoopBlockSet); + // When we use profile data to rotate the loop, this is unnecessary. + MachineBasicBlock *LoopTop = + RotateLoopWithProfile ? L.getHeader() : findBestLoopTop(L, LoopBlockSet); // If we selected just the header for the loop top, look for a potentially // profitable exit block in the event that rotating the loop can eliminate // branches by placing an exit edge at the bottom. MachineBasicBlock *ExitingBB = nullptr; - if (LoopTop == L.getHeader()) + if (!RotateLoopWithProfile && LoopTop == L.getHeader()) ExitingBB = findBestLoopExit(F, L, LoopBlockSet); BlockChain &LoopChain = *BlockToChain[LoopTop]; @@ -848,7 +1025,11 @@ } buildChain(LoopTop, LoopChain, BlockWorkList, &LoopBlockSet); - rotateLoop(LoopChain, ExitingBB, LoopBlockSet); + + if (RotateLoopWithProfile) + rotateLoopWithProfile(LoopChain, L, LoopBlockSet); + else + rotateLoop(LoopChain, ExitingBB, LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. Index: llvm/trunk/test/CodeGen/X86/code_placement_loop_rotation.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/code_placement_loop_rotation.ll +++ llvm/trunk/test/CodeGen/X86/code_placement_loop_rotation.ll @@ -0,0 +1,80 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE + +define void @foo() { +; Test that not all edges in the loop chain are fall through without profile +; data. +; +; CHECK-LABEL: foo: +; CHECK: callq e +; CHECK: callq f +; CHECK: callq g +; CHECK: callq h + +entry: + br label %header + +header: + call void @e() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @f() + br label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end + +end: + ret void +} + +define void @bar() !prof !1 { +; Test that all edges in the loop chain are fall through with profile data. +; +; CHECK-PROFILE-LABEL: bar: +; CHECK-PROFILE: callq g +; CHECK-PROFILE: callq h +; CHECK-PROFILE: callq e +; CHECK-PROFILE: callq f + +entry: + br label %header + +header: + call void @e() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + call void @f() + br label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header, label %end + +end: + ret void +} + +declare zeroext i1 @a() +declare void @e() +declare void @f() +declare void @g() +declare void @h() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 16, i32 16} Index: llvm/trunk/test/CodeGen/X86/code_placement_loop_rotation2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/code_placement_loop_rotation2.ll +++ llvm/trunk/test/CodeGen/X86/code_placement_loop_rotation2.ll @@ -0,0 +1,122 @@ +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK +; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -precise-rotation-cost < %s | FileCheck %s -check-prefix=CHECK-PROFILE + +define void @foo() { +; Test a nested loop case when profile data is not available. +; +; CHECK-LABEL: foo: +; CHECK: callq b +; CHECK: callq c +; CHECK: callq d +; CHECK: callq e +; CHECK: callq f +; CHECK: callq g +; CHECK: callq h + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + br label %header2 + +header2: + call void @c() + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then2, label %if.else2, !prof !2 + +if.then2: + call void @d() + br label %if.end2 + +if.else2: + call void @e() + br label %if.end2 + +if.end2: + call void @f() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end + +end: + ret void +} + +define void @bar() !prof !1 { +; Test a nested loop case when profile data is available. +; +; CHECK-PROFILE-LABEL: bar: +; CHECK-PROFILE: callq e +; CHECK-PROFILE: callq f +; CHECK-PROFILE: callq c +; CHECK-PROFILE: callq d +; CHECK-PROFILE: callq h +; CHECK-PROFILE: callq b +; CHECK-PROFILE: callq g + +entry: + br label %header + +header: + call void @b() + %call = call zeroext i1 @a() + br i1 %call, label %if.then, label %if.else, !prof !2 + +if.then: + br label %header2 + +header2: + call void @c() + %call1 = call zeroext i1 @a() + br i1 %call1, label %if.then2, label %if.else2, !prof !2 + +if.then2: + call void @d() + br label %if.end2 + +if.else2: + call void @e() + br label %if.end2 + +if.end2: + call void @f() + %call2 = call zeroext i1 @a() + br i1 %call2, label %header2, label %if.end + +if.else: + call void @g() + br label %if.end + +if.end: + call void @h() + %call3 = call zeroext i1 @a() + br i1 %call3, label %header, label %end + +end: + ret void +} + +declare zeroext i1 @a() +declare void @b() +declare void @c() +declare void @d() +declare void @e() +declare void @f() +declare void @g() +declare void @h() + +!1 = !{!"function_entry_count", i64 1} +!2 = !{!"branch_weights", i32 16, i32 16}