Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -78,10 +78,20 @@ "over the original exit to be considered the new exit."), cl::init(0), cl::Hidden); +// Some definitions: +// +// - Outlining: placement of a basic block outside the chain or hot path. +// +// - CFG Constraints: for diamand shaped CFG parts, don't layout the blocks +// that break topological ordering. The topological ordering +// is broken only when the probability is high engough that this +// looks beneficial to do so. + static cl::opt OutlineOptionalBranches( "outline-optional-branches", - cl::desc("Put completely optional branches, i.e. branches with a common " - "post dominator, out of line."), + cl::desc("Outlining optional branches will place blocks that are optional " + "branches, i.e. branches with a common post dominator, outside " + "the hot path or chain"), cl::init(false), cl::Hidden); static cl::opt OutlineOptionalThreshold( @@ -562,15 +572,26 @@ // | Pred // | / // Succ - // In this case, we are evaluating whether to select edge -> Succ, e.g. - // set Succ as the layout successor of BB. Picking Succ as BB's - // successor breaks the CFG constraints (FIXME: define these constraints). + // + // Here, we are evaluating to select Succ or Pred as a predecessor of BB. + // If we choose Succ, then we break the topological ordering and Pred needs + // to be outlined which is shown here: + // + // ---<---- + // | | + // BB-->Succ Pred + // | | + // ------>------ + // + // The cost = taken_branch(BB->Pred) + taken_back_branch(Pred->Succ) + + // uncond_branch(Pred->Succ) + // // With this layout, Pred BB // is forced to be outlined, so the overall cost will be cost of the // branch taken from BB to Pred, plus the cost of back taken branch // from Pred to Succ, as well as the additional cost associated // with the needed unconditional jump instruction from Pred To Succ. - + // // The cost of the topological order layout is the taken branch cost // from BB to Succ, so to make BB->Succ a viable candidate, the following // must hold: @@ -632,8 +653,9 @@ // Forward checking. For case 2, SuccProb will be 1. if (SuccProb < HotProb) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb - << " (prob) (CFG conflict)\n"); + DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " " + << "Respecting topological ordering because " + << "probability = " << SuccProb << "\n"); return true; } @@ -669,7 +691,7 @@ } if (BadCFGConflict) { - DEBUG(dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb + DEBUG(dbgs() << " Not a candidate: " << getBlockName(Succ) << " -> " << SuccProb << " (prob) (non-cold CFG conflict)\n"); return true; } @@ -699,7 +721,7 @@ auto AdjustedSumProb = collectViableSuccessors(BB, Chain, BlockFilter, Successors); - DEBUG(dbgs() << "Attempting merge from: " << getBlockName(BB) << "\n"); + DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n"); for (MachineBasicBlock *Succ : Successors) { auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ); BranchProbability SuccProb = @@ -718,15 +740,23 @@ continue; DEBUG( - dbgs() << " " << getBlockName(Succ) << " -> " << SuccProb - << " (prob)" + dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " + << SuccProb << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "") << "\n"); - if (BestSucc && BestProb >= SuccProb) + + if (BestSucc && BestProb >= SuccProb) { + DEBUG(dbgs() << " Not the best candidate, continuing\n"); continue; + } + + DEBUG(dbgs() << " Setting it as best candidate\n"); BestSucc = Succ; BestProb = SuccProb; } + if (BestSucc) + DEBUG(dbgs() << " Selected: " << getBlockName(BestSucc) << "\n"); + return BestSucc; } @@ -922,6 +952,12 @@ MachineBasicBlock * MachineBlockPlacement::findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + // Moving a block up introduces an unconditional jump that skips this block + // the first time the loop is executed. This extra jump increases + // code size, so we don't do this reordering when optimising for size. + if (L.getHeader()->getParent()->getFunction()->optForSize()) + return L.getHeader(); + // Check that the header hasn't been fused with a preheader block due to // crazy branches. If it has, we need to start with the header at the top to // prevent pulling the preheader into the loop body. @@ -937,7 +973,7 @@ for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) { if (!LoopBlockSet.count(Pred)) continue; - DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", " + DEBUG(dbgs() << " header pred: " << getBlockName(Pred) << ", has " << Pred->succ_size() << " successors, "; MBFI->printBlockFreq(dbgs(), Pred) << " freq\n"); if (Pred->succ_size() > 1) @@ -1066,8 +1102,14 @@ } // Without a candidate exiting block or with only a single block in the // loop, just use the loop header to layout the loop. - if (!ExitingBB || L.getNumBlocks() == 1) + if (!ExitingBB) { + DEBUG(dbgs() << " No other candidate exit blocks, using loop header\n"); + return nullptr; + } + if (L.getNumBlocks() == 1) { + DEBUG(dbgs() << " Loop has 1 block, using loop header as exit\n"); return nullptr; + } // Also, if we have exit blocks which lead to outer loops but didn't select // one of them as the exiting block we are rotating toward, disable loop Index: test/CodeGen/X86/loop-blocks.ll =================================================================== --- test/CodeGen/X86/loop-blocks.ll +++ test/CodeGen/X86/loop-blocks.ll @@ -228,6 +228,43 @@ ret void } +; This is exactly the same function as slightly_more_involved. +; The difference is that when optimising for size, we do not want +; to see this reordering. + +; CHECK-LABEL: slightly_more_involved_2: +; CHECK-NOT: jmp .LBB5_1 +; CHECK: .LBB5_1: +; CHECK-NEXT: callq body +; CECK-NOT-NEXT: callq bar99 + +define void @slightly_more_involved_2() #0 { +entry: + br label %loop + +loop: + call void @body() + %t0 = call i32 @get() + %t1 = icmp slt i32 %t0, 2 + br i1 %t1, label %block_a, label %bb + +bb: + %t2 = call i32 @get() + %t3 = icmp slt i32 %t2, 99 + br i1 %t3, label %exit, label %loop + +block_a: + call void @bar99() + br label %loop + +exit: + call void @exit() + ret void +} + +attributes #0 = { minsize norecurse nounwind optsize readnone uwtable } + + declare void @bar99() nounwind declare void @bar100() nounwind declare void @bar101() nounwind