Index: lib/Transforms/Utils/LoopSimplify.cpp
===================================================================
--- lib/Transforms/Utils/LoopSimplify.cpp
+++ lib/Transforms/Utils/LoopSimplify.cpp
@@ -232,6 +232,123 @@
   return nullptr;
 }
 
+static uint64_t getIntMDOperand(MDNode* MD, int operand) {
+  Constant *C = cast<ConstantAsMetadata>(MD->getOperand(operand))->getValue();
+  return cast<ConstantInt>(C)->getSExtValue();
+}
+
+static bool isLoopLatch(Loop &L, BasicBlock *BB) {
+  assert(L.contains(BB) && "must be in loop");
+  // A basic block is a latch exactly when one of it's successors is the loop
+  // header. 
+  for (auto SI = succ_begin(BB), SE = succ_end(BB);
+       SI != SE; SI++)
+    if(*SI == L.getHeader())
+      return true;
+  return false;
+}
+
+/// Rareness - the ratio between the expected number of times the loop header
+/// executes and the expected number of times this latch block executes.
+/// This is based on *profiling* information and may be wrong, but this routine
+/// general attempts to compute a lower bound on this ratio - i.e. the latch is
+/// at least this rare (according to the profile data.)  A larger number
+/// implies the latch executes less frequently.
+static uint64_t computeLatchRareness(DominatorTree &DT, Loop &L,
+                                     BasicBlock *Latch) {
+  // The basic structure of the estimation algorithm is a walk of the
+  // dominating conditions which control this latch.  We only consider
+  // dominating conditions in LoopExit and Latch blocks so that we don't need
+  // to worry about the other path contributing to the execution count of the
+  // Latch we're interested in.  i.e. We know that to reach the Latch in
+  // question, the given edge must be traversed.  We do loose some 
+  // information to gain this simplicity.
+
+  BasicBlock *Header = L.getHeader();
+  if (Latch == Header)
+    return 1;
+  
+  BasicBlock *Current = DT.getNode(Latch)->getIDom()->getBlock();
+  BasicBlock *Last = Latch;
+  assert(DT.dominates(Header, Current) && "malformed loop");
+
+  uint64_t Rareness = 1;
+  while (true) {
+    //How likely is the path to Latch *not* to be taken?
+    uint64_t Likelyhood = 1;
+    if (L.isLoopExiting(Current) || isLoopLatch(L, Current))
+      if (BranchInst *BR = dyn_cast<BranchInst>(Current->getTerminator()))
+        if (BR->isConditional() &&
+            BR->getSuccessor(0) != BR->getSuccessor(1))
+          if (MDNode *MD = BR->getMetadata(LLVMContext::MD_prof)) {
+            uint64_t Dest1 = getIntMDOperand(MD, 1);
+            uint64_t Dest2 = getIntMDOperand(MD, 2);
+            if (BR->getSuccessor(0) == Last) {
+              Likelyhood = Dest2/Dest1;
+            } else if (BR->getSuccessor(1) == Last) {
+              Likelyhood = Dest1/Dest2;
+            } else {
+              // We get no information from this branch
+            }
+          }
+    // If the path we're interested was the most frequent one, just count it as
+    // not contributing to the rareness at all (rather than zeroing the count
+    // entirely). 
+    Likelyhood = std::max((uint64_t)1, Likelyhood);
+    // TODO: Should cap this to avoid overflow
+    Rareness *= Likelyhood;
+
+    if (Current == Header)
+      break;
+    Last = Current;
+    Current = DT.getNode(Current)->getIDom()->getBlock();
+  }
+
+  return Rareness;
+}
+
+// Go looking for a 'rare' latch block.  We could do this via block frequency,
+// but that pass is relatively expensive and not preserved.  Instead, directly
+// examine dominating prof metadata nodes to determine an upper bound on the
+// fraction of times a backedge is taken that the loop executes.  If that's
+// less than 1/10 (a randomly choosen threshold), split it out as an
+// independent outer loop.
+static BasicBlock *findRareLatchBlock(DominatorTree &DT, Loop &L) {
+  SmallVector<BasicBlock*, 8> Latches;
+  L.getLoopLatches(Latches);
+
+  uint64_t MostRare = 1;
+  BasicBlock *CandidateBB = nullptr;
+  for (BasicBlock *Latch : Latches) {
+    uint64_t Likelyhood = computeLatchRareness(DT, L, Latch);
+    // TODO: If this is a conditional latch, we're actually interested in the
+    // rareness of the *backedge*.
+    if (Likelyhood > MostRare)
+      CandidateBB = Latch;
+    MostRare = std::max(MostRare, Likelyhood);
+
+    errs() << Latch->getName() << ": " << Likelyhood << "\n";
+  }
+
+  // If each latch was executed equal often, each latch would have a rareness
+  // which is equal and #Latch*(1/R) == 1
+  const uint64_t ExpectedAvgRareness = Latches.size();
+
+  // If we found a disporpotionately rare backedge, we return it's Latch as
+  // being the one we one to split into an outer loop.  The reason we're
+  // checking relative frequency is to avoid loops with equally disributed
+  // latch probabilies where each latch is rare, but no one latch is unusually
+  // rare.  The threshold of '8' is chosen such that a single __builtin_expect
+  // is enough to indicate a latch is rare in a two latch loop with no other
+  // information.
+  if (MostRare >= ExpectedAvgRareness * 8) {
+    assert(CandidateBB);
+    return CandidateBB;
+  }
+
+  return nullptr;
+}
+
 /// \brief If this loop has multiple backedges, try to pull one of them out into
 /// a nested loop.
 ///
@@ -259,26 +376,48 @@
   if (!Preheader)
     return nullptr;
 
+  BasicBlock *Header = L->getHeader();
+
   // The header is not a landing pad; preheader insertion should ensure this.
-  assert(!L->getHeader()->isLandingPad() &&
+  assert(!Header->isLandingPad() &&
          "Can't insert backedge to landing pad");
 
-  PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AT);
-  if (!PN) return nullptr;  // No known way to partition.
-
-  // Pull out all predecessors that have varying values in the loop.  This
-  // handles the case when a PHI node has multiple instances of itself as
-  // arguments.
   SmallVector<BasicBlock*, 8> OuterLoopPreds;
-  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
-    if (PN->getIncomingValue(i) != PN ||
-        !L->contains(PN->getIncomingBlock(i))) {
-      // We can't split indirectbr edges.
-      if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator()))
-        return nullptr;
-      OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+  if (PHINode *PN = findPHIToPartitionLoops(L, AA, DT, AT)) {
+    // Pull out all predecessors that have varying values in the loop.  This
+    // handles the case when a PHI node has multiple instances of itself as
+    // arguments.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+      if (PN->getIncomingValue(i) != PN ||
+          !L->contains(PN->getIncomingBlock(i))) {
+        // We can't split indirectbr edges.
+        if (isa<IndirectBrInst>(PN->getIncomingBlock(i)->getTerminator()))
+          return nullptr;
+        OuterLoopPreds.push_back(PN->getIncomingBlock(i));
+      }
     }
-  }
+  } else if (BasicBlock *RareLatch = findRareLatchBlock(*DT, *L)) {
+    // If we can find an unusually rarely taken latch, pull that out to it's
+    // own loop so that we can optimize the inner loop without worrying about
+    // rarely executed code that dominates the rare latch.
+    // Gather the list of predeccessors we want to be outside the new inner
+    // loop, this *must* include all the blocks outside the current loop, plus
+    // any predeccessors in L we want to become exits from the inner loop.
+    OuterLoopPreds.push_back(RareLatch);
+    for (pred_iterator PI=pred_begin(Header), E = pred_end(Header);
+         PI!=E; ++PI) {
+      BasicBlock *P = *PI;
+      if (!L->contains(P)) {
+        // We can't split indirectbr edges, but a preheader should never end
+        // with one.
+        if (isa<IndirectBrInst>(P->getTerminator()))
+          return nullptr;
+        OuterLoopPreds.push_back(P);
+      }
+    }
+  } else
+    return nullptr;  // No known way to partition.
+  
   DEBUG(dbgs() << "LoopSimplify: Splitting out a new outer loop\n");
 
   // If ScalarEvolution is around and knows anything about values in
@@ -287,7 +426,6 @@
   if (SE)
     SE->forgetLoop(L);
 
-  BasicBlock *Header = L->getHeader();
   BasicBlock *NewBB =
     SplitBlockPredecessors(Header, OuterLoopPreds,  ".outer", PP);