Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -953,6 +953,22 @@
                << getBlockName(*Chain.begin()) << "\n");
 }
 
+// Helper function that checks for fallthrough layout predecessors of MB.
+static bool hasFallthroughLayoutPredecessor(MachineBasicBlock &MB) {
+  for (MachineBasicBlock *Pred : MB.predecessors()) {
+    if (Pred->isLayoutSuccessor(&MB)) {
+      DEBUG(dbgs() << " Found layout pred: " << Pred->getName() << "\n");
+      if (Pred->canFallThrough()) {
+        DEBUG(dbgs() << " It can fallthrough: use loop header and don't "
+                     << "try to rotate latch block\n");
+        return true;
+      }
+    }
+  }
+  DEBUG(dbgs() << " Cannot find layout pred.\n");
+  return false;
+}
+
 /// \brief Find the best loop top block for layout.
 ///
 /// Look for a block which is strictly better than the loop header for laying
@@ -966,6 +982,19 @@
 MachineBasicBlock *
 MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
                                        const BlockFilterSet &LoopBlockSet) {
+  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
+               << "\n");
+  // Placing the latch block before the header may introduce an extra branch
+  // that skips this block the first time the loop is executed, which we want
+  // to avoid when optimising for size. So we keep the header, i.e. don't move
+  // the latch block, when the header layout predecessor can fallthrough
+  // because that would introduce this new branch. But if there is already a
+  // branch, then we can still move up the latch block without introducing an
+  // extra branch.
+  if (L.getHeader()->getParent()->getFunction()->optForSize() &&
+      hasFallthroughLayoutPredecessor(*L.getHeader()))
+    return L.getHeader();
+
   // Check that the header hasn't been fused with a preheader block due to
   // crazy branches. If it has, we need to start with the header at the top to
   // prevent pulling the preheader into the loop body.
@@ -973,9 +1002,6 @@
   if (!LoopBlockSet.count(*HeaderChain.begin()))
     return L.getHeader();
 
-  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
-               << "\n");
-
   BlockFrequency BestPredFreq;
   MachineBasicBlock *BestPred = nullptr;
   for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
Index: test/CodeGen/X86/loop-blocks.ll
===================================================================
--- test/CodeGen/X86/loop-blocks.ll
+++ test/CodeGen/X86/loop-blocks.ll
@@ -228,6 +228,41 @@
   ret void
 }
 
+; This is exactly the same function as slightly_more_involved.
+; The difference is that when optimising for size, we do not want
+; to see this reordering.
+
+; CHECK-LABEL: slightly_more_involved_2:
+; CHECK-NOT: jmp .LBB5_1
+; CHECK: .LBB5_1:
+; CHECK-NEXT: callq body
+
+define void @slightly_more_involved_2() #0 {
+entry:
+  br label %loop
+
+loop:
+  call void @body()
+  %t0 = call i32 @get()
+  %t1 = icmp slt i32 %t0, 2
+  br i1 %t1, label %block_a, label %bb
+
+bb:
+  %t2 = call i32 @get()
+  %t3 = icmp slt i32 %t2, 99
+  br i1 %t3, label %exit, label %loop
+
+block_a:
+  call void @bar99()
+  br label %loop
+
+exit:
+  call void @exit()
+  ret void
+}
+
+attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }
+
 declare void @bar99() nounwind
 declare void @bar100() nounwind
 declare void @bar101() nounwind