Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -934,6 +934,18 @@
                << getBlockName(*Chain.begin()) << "\n");
 }
 
+static MachineBasicBlock * findLayoutPredecessor(MachineBasicBlock &MB)
+{
+  for (MachineBasicBlock *Pred : MB.predecessors()) {
+    if (Pred->isLayoutSuccessor(&MB)) {
+      DEBUG(dbgs() << "   Found layout pred: " << Pred->getName() << "\n");
+      return Pred;
+    }
+  }
+  DEBUG(dbgs() << "    Cannot find layout pred.\n");
+  return nullptr;
+}
+
 /// \brief Find the best loop top block for layout.
 ///
 /// Look for a block which is strictly better than the loop header for laying
@@ -947,6 +959,28 @@
 MachineBasicBlock *
 MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
                                        const BlockFilterSet &LoopBlockSet) {
+  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
+               << "\n");
+  // Placing the latch block before the header may introduce an extra branch
+  // that skips this block the first time the loop is executed, which we want to
+  // avoid when optimising for size. So we keep the header, i.e. don't move the
+  // latch block, when the header layout predecessor is a fallthrough or
+  // unconditional branch to the header, because that would introduce this new
+  // branch. But if there is already a branch, then we can still move up the
+  // latch block.
+  MachineBasicBlock * LayoutPredBB = findLayoutPredecessor(*L.getHeader());
+  if (L.getHeader()->getParent()->getFunction()->optForSize() && LayoutPredBB) {
+    SmallVector<MachineOperand, 4> Cond;
+    MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+    if (!TII->analyzeBranch(*LayoutPredBB, TBB, FBB, Cond) ||    // fallthrough
+        (TBB == L.getHeader() && !FBB)) {              // branch to loop header
+      DEBUG(dbgs() << "    Using loop header: layout predecessor is a "
+                   << "fallthrough or unconditional branch and we optimize "
+                   << "for size\n");
+      return L.getHeader();
+    }
+  }
+
   // Check that the header hasn't been fused with a preheader block due to
   // crazy branches. If it has, we need to start with the header at the top to
   // prevent pulling the preheader into the loop body.
@@ -954,9 +988,6 @@
   if (!LoopBlockSet.count(*HeaderChain.begin()))
     return L.getHeader();
 
-  DEBUG(dbgs() << "Finding best loop top for: " << getBlockName(L.getHeader())
-               << "\n");
-
   BlockFrequency BestPredFreq;
   MachineBasicBlock *BestPred = nullptr;
   for (MachineBasicBlock *Pred : L.getHeader()->predecessors()) {
Index: test/CodeGen/X86/loop-blocks.ll
===================================================================
--- test/CodeGen/X86/loop-blocks.ll
+++ test/CodeGen/X86/loop-blocks.ll
@@ -228,6 +228,43 @@
   ret void
 }
 
+; This is exactly the same function as slightly_more_involved.
+; The difference is that when optimising for size, we do not want
+; to see this reordering.
+
+; CHECK-LABEL: slightly_more_involved_2:
+; CHECK-NOT:      jmp .LBB5_1
+; CHECK:          .LBB5_1:
+; CHECK-NEXT:     callq body
+; HECK-NOT-NEXT: callq bar99
+
+define void @slightly_more_involved_2() #0 {
+entry:
+  br label %loop
+
+loop:
+  call void @body()
+  %t0 = call i32 @get()
+  %t1 = icmp slt i32 %t0, 2
+  br i1 %t1, label %block_a, label %bb
+
+bb:
+  %t2 = call i32 @get()
+  %t3 = icmp slt i32 %t2, 99
+  br i1 %t3, label %exit, label %loop
+
+block_a:
+  call void @bar99()
+  br label %loop
+
+exit:
+  call void @exit()
+  ret void
+}
+
+attributes #0 = { minsize norecurse nounwind optsize readnone uwtable }
+
+
 declare void @bar99() nounwind
 declare void @bar100() nounwind
 declare void @bar101() nounwind