Index: llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
+++ llvm/trunk/lib/CodeGen/MachineBlockPlacement.cpp
@@ -1917,6 +1917,12 @@
     return;
 
   MachineBasicBlock *Top = *LoopChain.begin();
+  MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
+
+  // If ExitingBB is already the last one in the chain, there is nothing to do.
+  if (Bottom == ExitingBB)
+    return;
+
   bool ViableTopFallthrough = false;
   for (MachineBasicBlock *Pred : Top->predecessors()) {
     BlockChain *PredChain = BlockToChain[Pred];
@@ -1931,7 +1937,6 @@
   // bottom is a viable exiting block. If so, bail out as rotating will
   // introduce an unnecessary branch.
   if (ViableTopFallthrough) {
-    MachineBasicBlock *Bottom = *std::prev(LoopChain.end());
     for (MachineBasicBlock *Succ : Bottom->successors()) {
       BlockChain *SuccChain = BlockToChain[Succ];
       if (!LoopBlockSet.count(Succ) &&
@@ -1944,6 +1949,36 @@
   if (ExitIt == LoopChain.end())
     return;
 
+  // Rotating a loop exit to the bottom when there is a fallthrough to top
+  // trades the entry fallthrough for an exit fallthrough.
+  // If there is no bottom->top edge, but the chosen exit block does have
+  // a fallthrough, we break that fallthrough for nothing in return.
+
+  // Let's consider an example. We have a built chain of basic blocks
+  // B1, B2, ..., Bn, where Bk is the chosen exit block (ExitingBB).
+  // By doing a rotation we get
+  // Bk+1, ..., Bn, B1, ..., Bk
+  // Breaking the fallthrough to B1 is compensated by the fallthrough from Bk.
+  // If we had a fallthrough Bk -> Bk+1 it is broken now.
+  // It might be compensated by fallthrough Bn -> B1.
+  // So we have a condition to avoid creation of extra branch by loop rotation.
+  // All below must be true to avoid loop rotation:
+  //   If there is a fallthrough to top (B1)
+  //   There was fallthrough from chosen exit block (Bk) to next one (Bk+1)
+  //   There is no fallthrough from bottom (Bn) to top (B1).
+  // Please note that there is no exit fallthrough from Bn because we checked it
+  // above.
+  if (ViableTopFallthrough) {
+    assert(std::next(ExitIt) != LoopChain.end() &&
+           "Exit should not be last BB");
+    MachineBasicBlock *NextBlockInChain = *std::next(ExitIt);
+    if (ExitingBB->isSuccessor(NextBlockInChain))
+      if (!Bottom->isSuccessor(Top))
+        return;
+  }
+
+  DEBUG(dbgs() << "Rotating loop to put exit " << getBlockName(ExitingBB)
+               << " at bottom\n");
   std::rotate(LoopChain.begin(), std::next(ExitIt), LoopChain.end());
 }
 
Index: llvm/trunk/test/CodeGen/X86/block-placement.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/block-placement.ll
+++ llvm/trunk/test/CodeGen/X86/block-placement.ll
@@ -354,6 +354,7 @@
 ; single-source GCC.
 ; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
+; CHECK: %loop.header
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
 ; CHECK: %loop.body4
@@ -361,7 +362,6 @@
 ; CHECK: %loop.inner2.begin
 ; CHECK: %loop.body3
 ; CHECK: %loop.inner1.begin
-; CHECK: %loop.header
 ; CHECK: %bail
 
 entry:
@@ -1491,6 +1491,102 @@
   ret void
 }
 
+define i32 @not_rotate_if_extra_branch(i32 %count) {
+; Test checks that there is no loop rotation
+; if it introduces an extra branch.
+; Specifically in this case because the best exit is .header
+; but it has a fallthrough to the .middle block, and the last block in
+; the loop chain, .slow, does not have a fallthrough to .header.
+; CHECK-LABEL: not_rotate_if_extra_branch
+; CHECK: %.entry
+; CHECK: %.header
+; CHECK: %.middle
+; CHECK: %.backedge
+; CHECK: %.slow
+; CHECK: %.bailout
+; CHECK: %.stop
+.entry:
+  %sum.0 = shl nsw i32 %count, 1
+  br label %.header
+
+.header:
+  %i = phi i32 [ %i.1, %.backedge ], [ 0, %.entry ]
+  %sum = phi i32 [ %sum.1, %.backedge ], [ %sum.0, %.entry ]
+  %is_exc = icmp sgt i32 %i, 9000000
+  br i1 %is_exc, label %.bailout, label %.middle, !prof !13
+
+.bailout:
+  %sum.2 = add nsw i32 %count, 1
+  br label %.stop
+
+.middle:
+  %pr.1 = and i32 %i, 1023
+  %pr.2 = icmp eq i32 %pr.1, 0
+  br i1 %pr.2, label %.slow, label %.backedge, !prof !14
+
+.slow:
+  tail call void @effect(i32 %sum)
+  br label %.backedge
+
+.backedge:
+  %sum.1 = add nsw i32 %i, %sum
+  %i.1 = add nsw i32 %i, 1
+  %end = icmp slt i32 %i.1, %count
+  br i1 %end, label %.header, label %.stop, !prof !15
+
+.stop:
+  %sum.phi = phi i32 [ %sum.1, %.backedge ], [ %sum.2, %.bailout ]
+  ret i32 %sum.phi
+}
+
+define i32 @not_rotate_if_extra_branch_regression(i32 %count, i32 %init) {
+; This is a regression test against the patch that avoids loop rotation
+; if it introduces an extra branch.
+; CHECK-LABEL: not_rotate_if_extra_branch_regression
+; CHECK: %.entry
+; CHECK: %.first_backedge
+; CHECK: %.slow
+; CHECK: %.second_header
+.entry:
+  %sum.0 = shl nsw i32 %count, 1
+  br label %.first_header
+
+.first_header:
+  %i = phi i32 [ %i.1, %.first_backedge ], [ 0, %.entry ]
+  %is_bo1 = icmp sgt i32 %i, 9000000
+  br i1 %is_bo1, label %.bailout, label %.first_backedge, !prof !14
+
+.first_backedge:
+  %i.1 = add nsw i32 %i, 1
+  %end = icmp slt i32 %i.1, %count
+  br i1 %end, label %.first_header, label %.second_header, !prof !13
+
+.second_header:
+  %j = phi i32 [ %j.1, %.second_backedge ], [ %init, %.first_backedge ]
+  %end.2 = icmp sgt i32 %j, %count
+  br i1 %end.2, label %.stop, label %.second_middle, !prof !14
+
+.second_middle:
+  %is_slow = icmp sgt i32 %j, 9000000
+  br i1 %is_slow, label %.slow, label %.second_backedge, !prof !14
+
+.slow:
+  tail call void @effect(i32 %j)
+  br label %.second_backedge
+
+.second_backedge:
+  %j.1 = add nsw i32 %j, 1
+  %end.3 = icmp slt i32 %j, 10000000
+  br i1 %end.3, label %.second_header, label %.stop, !prof !13
+
+.stop:
+  %res = add nsw i32 %j, %i.1
+  ret i32 %res
+
+.bailout:
+  ret i32 0
+}
+
 declare void @effect(i32)
 
 !5 = !{!"branch_weights", i32 84, i32 16}
@@ -1501,3 +1597,6 @@
 !10 = !{!"branch_weights", i32 90, i32 10}
 !11 = !{!"branch_weights", i32 1, i32 1}
 !12 = !{!"branch_weights", i32 5, i32 3}
+!13 = !{!"branch_weights", i32 1, i32 1}
+!14 = !{!"branch_weights", i32 1, i32 1023}
+!15 = !{!"branch_weights", i32 4095, i32 1}
Index: llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
+++ llvm/trunk/test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -37,7 +37,7 @@
   ret void
 }
 
-define void @nested_loop_0() !prof !1 {
+define void @nested_loop_0(i1 %flag) !prof !1 {
 ; Test if a block that is cold in the inner loop but not cold in the outer loop
 ; will merged to the outer loop chain.
 ;
@@ -68,8 +68,7 @@
 
 if.else:
   call void @e()
-  %call2 = call zeroext i1 @a()
-  br i1 %call2, label %header2, label %header, !prof !3
+  br i1 %flag, label %header2, label %header, !prof !3
 
 end:
   call void @f()