Index: lib/CodeGen/MachineBlockPlacement.cpp
===================================================================
--- lib/CodeGen/MachineBlockPlacement.cpp
+++ lib/CodeGen/MachineBlockPlacement.cpp
@@ -804,8 +804,34 @@
   for (MachineLoop *InnerLoop : L)
     buildLoopChains(F, *InnerLoop);
 
+  MachineBasicBlock *LoopHeader = L.getHeader();
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
-  BlockFilterSet LoopBlockSet(L.block_begin(), L.block_end());
+  BlockFilterSet LoopBlockSet;
+
+  // Filter cold blocks off from LoopBlockSet when profile data is available. 
+  // Collect the sum of frequencies of incoming edges to the loop header from
+  // outside. If we treat the loop as a super block, this is the frequency of
+  // the loop. Then for each block in the loop, we calculate the ratio between
+  // its frequency and the frequency of the loop block. When it is too small,
+  // don't add it to the loop chain. This needs precise profile data and we
+  // only do this when profile data is available.
+  if (F.getFunction()->getEntryCount()) {
+    BlockFrequency LoopFreq(0);
+    for (auto LoopPred : LoopHeader->predecessors())
+      if (!L.contains(LoopPred))
+        LoopFreq += MBFI->getBlockFreq(LoopPred) *
+                    MBPI->getEdgeProbability(LoopPred, LoopHeader);
+
+    // We use the threshold 20% here to filter cold blocks.
+    const unsigned LoopToColdBBRatio = 5;
+    for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+      auto Freq = MBFI->getBlockFreq(LoopBB).getFrequency();
+      if (Freq == 0 || LoopFreq.getFrequency() / Freq > LoopToColdBBRatio)
+        continue;
+      LoopBlockSet.insert(LoopBB);
+    }
+  } else
+    LoopBlockSet.insert(L.block_begin(), L.block_end());
 
   // First check to see if there is an obviously preferable top block for the
   // loop. This will default to the header, but may end up as one of the
@@ -817,7 +843,7 @@
   // profitable exit block in the event that rotating the loop can eliminate
   // branches by placing an exit edge at the bottom.
   MachineBasicBlock *ExitingBB = nullptr;
-  if (LoopTop == L.getHeader())
+  if (LoopTop == LoopHeader)
     ExitingBB = findBestLoopExit(F, L, LoopBlockSet);
 
   BlockChain &LoopChain = *BlockToChain[LoopTop];
@@ -828,7 +854,8 @@
   SmallPtrSet<BlockChain *, 4> UpdatedPreds;
   assert(LoopChain.LoopPredecessors == 0);
   UpdatedPreds.insert(&LoopChain);
-  for (MachineBasicBlock *LoopBB : L.getBlocks()) {
+
+  for (MachineBasicBlock *LoopBB : LoopBlockSet) {
     BlockChain &Chain = *BlockToChain[LoopBB];
     if (!UpdatedPreds.insert(&Chain).second)
       continue;
Index: test/CodeGen/X86/code_placement_cold_loop_blocks.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/code_placement_cold_loop_blocks.ll
@@ -0,0 +1,81 @@
+; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s -check-prefix=CHECK
+
+define void @foo() !prof !1 {
+; Test if a cold block in a loop will be placed at the end of the function
+; chain.
+;
+; CHECK-LABEL: foo:
+; CHECK: callq b
+; CHECK: callq c
+; CHECK: callq e
+; CHECK: callq f
+; CHECK: callq d
+
+entry:
+  br label %header
+
+header:
+  call void @b()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %if.then, label %if.else, !prof !2
+
+if.then:
+  call void @c()
+  br label %if.end
+
+if.else:
+  call void @d()
+  br label %if.end
+
+if.end:
+  call void @e()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header, label %end, !prof !3
+
+end:
+  call void @f()
+  ret void
+}
+
+define void @bar() !prof !1 {
+; Test if a cold block in an inner loop will be placed at the end of the
+; function chain.
+;
+; CHECK-LABEL: bar:
+; CHECK: callq b
+; CHECK: callq c
+; CHECK: callq e
+; CHECK: callq d
+
+entry:
+  br label %header
+
+header:
+  call void @b()
+  br label %header2
+
+header2:
+  call void @c()
+  %call = call zeroext i1 @a()
+  br i1 %call, label %end, label %if.else, !prof !2
+
+if.else:
+  call void @d()
+  %call2 = call zeroext i1 @a()
+  br i1 %call2, label %header2, label %header, !prof !3
+
+end:
+  call void @e()
+  ret void
+}
+
+declare zeroext i1 @a()
+declare void @b()
+declare void @c()
+declare void @d()
+declare void @e()
+declare void @f()
+
+!1 = !{!"function_entry_count", i64 1}
+!2 = !{!"branch_weights", i32 1000, i32 1}
+!3 = !{!"branch_weights", i32 100, i32 1}