Index: lib/CodeGen/MachineBlockPlacement.cpp =================================================================== --- lib/CodeGen/MachineBlockPlacement.cpp +++ lib/CodeGen/MachineBlockPlacement.cpp @@ -934,6 +934,18 @@ << getBlockName(*Chain.begin()) << "\n"); } +static MachineBasicBlock * findLayoutPredecessor(MachineBasicBlock &MB) +{ + for (MachineBasicBlock *Pred : MB.predecessors()) { + if (Pred->isLayoutSuccessor(&MB)) { + DEBUG(dbgs() << " Found layout pred: " << Pred->getName() << "\n"); + return Pred; + } + } + DEBUG(dbgs() << " Cannot find layout pred.\n"); + return nullptr; +} + /// \brief Find the best loop top block for layout. /// /// Look for a block which is strictly better than the loop header for laying @@ -947,6 +959,28 @@ MachineBasicBlock * MachineBlockPlacement::findBestLoopTop(MachineLoop &L, const BlockFilterSet &LoopBlockSet) { + DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader()) + << "\n"); + // Placing the latch block before the header may introduce an extra branch + // that skips this block the first time the loop is executed, which we want to + // avoid when optimising for size. So we keep the header, i.e. don't move the + // latch block, when the header layout predecessor is a fallthrough or + // unconditional branch to the header, because that would introduce this new + // branch. But if there is already a branch, then we can still move up the + // latch block. + MachineBasicBlock * LayoutPredBB = findLayoutPredecessor(*L.getHeader()); + if (L.getHeader()->getParent()->getFunction()->optForSize() && LayoutPredBB) { + SmallVector Cond; + MachineBasicBlock *TBB = nullptr, *FBB = nullptr; + if (!TII->analyzeBranch(*LayoutPredBB, TBB, FBB, Cond) || // fallthrough + (TBB == L.getHeader() && !FBB)) { // branch to loop header + DEBUG(dbgs() << " Using loop header: layout predecessor is a " + << "fallthrough or unconditional branch and we optimize " + << "for size\n"); + return L.getHeader(); + } + } + // Check that the header hasn't been fused with a preheader block due to // crazy branches. If it has, we need to start with the header at the top to // prevent pulling the preheader into the loop body. @@ -954,9 +988,6 @@ if (!LoopBlockSet.count(*HeaderChain.begin())) return L.getHeader(); - DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader()) - << "\n"); - BlockFrequency BestPredFreq; MachineBasicBlock *BestPred = nullptr; for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) { Index: test/CodeGen/X86/loop-blocks.ll =================================================================== --- test/CodeGen/X86/loop-blocks.ll +++ test/CodeGen/X86/loop-blocks.ll @@ -228,6 +228,43 @@ ret void } +; This is exactly the same function as slightly_more_involved. +; The difference is that when optimising for size, we do not want +; to see this reordering. + +; CHECK-LABEL: slightly_more_involved_2: +; CHECK-NOT: jmp .LBB5_1 +; CHECK: .LBB5_1: +; CHECK-NEXT: callq body +; HECK-NOT-NEXT: callq bar99 + +define void @slightly_more_involved_2() #0 { +entry: + br label %loop + +loop: + call void @body() + %t0 = call i32 @get() + %t1 = icmp slt i32 %t0, 2 + br i1 %t1, label %block_a, label %bb + +bb: + %t2 = call i32 @get() + %t3 = icmp slt i32 %t2, 99 + br i1 %t3, label %exit, label %loop + +block_a: + call void @bar99() + br label %loop + +exit: + call void @exit() + ret void +} + +attributes #0 = { minsize norecurse nounwind optsize readnone uwtable } + + declare void @bar99() nounwind declare void @bar100() nounwind declare void @bar101() nounwind