Index: llvm/include/llvm/Analysis/LoopInfo.h
===================================================================
--- llvm/include/llvm/Analysis/LoopInfo.h
+++ llvm/include/llvm/Analysis/LoopInfo.h
@@ -330,6 +330,8 @@
   void verifyLoopNest(DenseSet<const LoopT*> *Loops) const;
 
   void print(raw_ostream &OS, unsigned Depth = 0) const;
+  /// Print loop with all the BBs inside it.
+  void printVerbose(raw_ostream &OS, unsigned Depth = 0) const;
 
 protected:
   friend class LoopInfoBase<BlockT, LoopT>;
@@ -451,6 +453,7 @@
   BasicBlock *getUniqueExitBlock() const;
 
   void dump() const;
+  void dumpVerbose() const;
 
   /// Return the debug location of the start of this loop.
   /// This looks for a BB terminating instruction with a known debug
Index: llvm/include/llvm/Analysis/LoopInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/LoopInfoImpl.h
+++ llvm/include/llvm/Analysis/LoopInfoImpl.h
@@ -252,6 +252,8 @@
         HasInsideLoopSuccs = true;
         break;
       }
+    assert(HasInsideLoopSuccs && "Loop block has no in-loop successors!");
+
     typedef GraphTraits<Inverse<BlockT*> > InvBlockTraits;
     for (typename InvBlockTraits::ChildIteratorType PI =
            InvBlockTraits::child_begin(BB), PE = InvBlockTraits::child_end(BB);
@@ -262,6 +264,7 @@
       else
         OutsideLoopPreds.push_back(N);
     }
+    assert(HasInsideLoopPreds && "Loop block has no in-loop predecessors!");
 
     if (BB == getHeader()) {
         assert(!OutsideLoopPreds.empty() && "Loop is unreachable!");
@@ -275,8 +278,6 @@
           assert(CB != OutsideLoopPreds[i] &&
                  "Loop has multiple entry points!");
     }
-    assert(HasInsideLoopPreds && "Loop block has no in-loop predecessors!");
-    assert(HasInsideLoopSuccs && "Loop block has no in-loop successors!");
     assert(BB != &getHeader()->getParent()->front() &&
            "Loop contains function entry block!");
 
@@ -334,6 +335,27 @@
     (*I)->print(OS, Depth+2);
 }
 
+/// Print each block of the loop in full, tagging header, latch and exiting.
+template<class BlockT, class LoopT>
+void LoopBase<BlockT, LoopT>::printVerbose(raw_ostream &OS,
+                                           unsigned Depth) const {
+  OS.indent(Depth*2) << "Loop at depth " << getLoopDepth() << " containing: ";
+  BlockT *Header = getHeader();
+  BlockT *Latch = getLoopLatch();
+  for (BlockT *Block : getBlocks()) {
+    if (Block == Header)
+      OS << "<header>\n";
+    if (Block == Latch)
+      OS << "<latch>\n";
+    if (isLoopExiting(Block))
+      OS << "<exiting>\n";
+    Block->print(OS);
+  }
+  OS << "\n";
+  for (LoopT *SubLoop : *this)
+    SubLoop->print(OS, Depth+2);
+}
+
 //===----------------------------------------------------------------------===//
 /// Stable LoopInfo Analysis - Build a loop tree using stable iterators so the
 /// result does / not depend on use list (block predecessor) order.
Index: llvm/include/llvm/Transforms/Utils/Cloning.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/Cloning.h
+++ llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -231,6 +231,22 @@
 void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
                                ValueToValueMapTy &VMap);
 
+/// \brief Returns true if the region formed by [Entry, Exit] is
+/// a single-entry-single-exit (SESE) region. All the traces
+/// from \p Entry to \p Exit should be dominated by \p Entry
+/// and post-dominated by \p Exit.
+bool isSESE(const BasicBlock *Entry, const BasicBlock *Exit, DominatorTree *DT,
+            DominatorTree *PDT);
+
+/// \brief Returns true if the region formed by [Entry, Exit] is
+/// a single-entry-multiple-exit (SEME) region. All the traces
+/// from \p Entry which lead to \p Exit are analyzed.
+bool isSEME(const BasicBlock *Entry, const BasicBlock *Exit, DominatorTree *DT);
+
+BasicBlock* copySEME(const SmallVectorImpl<BasicBlock *> &Blocks,
+                     const SmallPtrSetImpl<BasicBlock *> &Exits,
+                     ValueToValueMapTy &VMap, const Twine &NameSuffix,
+                     DominatorTree *DT, LoopInfo *LI);
 } // End llvm namespace
 
 #endif
Index: llvm/lib/Analysis/LoopInfo.cpp
===================================================================
--- llvm/lib/Analysis/LoopInfo.cpp
+++ llvm/lib/Analysis/LoopInfo.cpp
@@ -387,6 +387,10 @@
 LLVM_DUMP_METHOD void Loop::dump() const {
   print(dbgs());
 }
+
+LLVM_DUMP_METHOD void Loop::dumpVerbose() const {
+  printVerbose(dbgs());
+}
 #endif
 
 //===----------------------------------------------------------------------===//
Index: llvm/lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -7,7 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements Loop Rotation Pass.
+// This file implements Loop Rotation Pass:
+// 1. Canonicalize the loop latch to have only one successor.
+// 2. Clone all the BBs which are exiting the loop.
+// 3. Adjust the phi nodes of the cloned BBs.
+// 4. Add phi nodes to the new loop header.
+// 5. Update the dominator tree.
 //
 //===----------------------------------------------------------------------===//
 
@@ -30,11 +35,13 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -43,13 +50,51 @@
 
 #define DEBUG_TYPE "loop-rotate"
 
-static cl::opt<unsigned> DefaultRotationThreshold(
+static cl::opt<unsigned> RotationMaxHeaderSize(
     "rotation-max-header-size", cl::init(16), cl::Hidden,
     cl::desc("The default maximum header size for automatic loop rotation"));
 
+static cl::opt<unsigned> RotationMaxSize(
+    "rotation-max-size", cl::init(100), cl::Hidden,
+    cl::desc("The default maximum loop size for automatic loop rotation"));
+
+static cl::opt<int> MaxLoopsRotated(
+    "max-loops-rotated", cl::init(-1), cl::Hidden,
+    cl::desc("The maximum loops to be rotated (-1 means no limit)"));
+
+static int LoopsRotated = 0;
+
 STATISTIC(NumRotated, "Number of loops rotated");
 
-namespace {
+// Redirect the first PredBefore->Succ branch edge through NewBB, which gets an
+// unconditional branch to Succ. PredBefore must end in a BranchInst.
+static void insertBetween(BasicBlock *NewBB, BasicBlock *PredBefore,
+                          BasicBlock *Succ) {
+  BranchInst *PredBI = cast<BranchInst>(PredBefore->getTerminator());
+  BranchInst::Create(Succ, NewBB)->setDebugLoc(PredBI->getDebugLoc());
+  for (unsigned I = 0, E = PredBI->getNumSuccessors(); I < E; ++I)
+    if (PredBI->getSuccessor(I) == Succ) {
+      PredBI->setSuccessor(I, NewBB);
+      break;
+    }
+  // Place NewBB immediately before Succ in the function's block list.
+  Function *F = Succ->getParent();
+  F->getBasicBlockList().splice(Succ->getIterator(), F->getBasicBlockList(),
+                                NewBB);
+}
+
+// Drop, from each PHI node at the top of PhiBB, the incoming value that is
+// associated with block From. Scanning stops at the first non-PHI instruction.
+// NOTE(review): presumably every such PHI has an entry for From -- confirm.
+static void discardIncomingValues(BasicBlock *PhiBB, BasicBlock *From) {
+  for (Instruction &Inst : *PhiBB) {
+    PHINode *Phi = dyn_cast<PHINode>(&Inst);
+    if (!Phi)
+      break;
+    Phi->removeIncomingValue(Phi->getBasicBlockIndex(From));
+  }
+}
+
 /// A simple loop rotation transformation.
 class LoopRotate {
   const unsigned MaxHeaderSize;
@@ -57,564 +102,347 @@
   const TargetTransformInfo *TTI;
   AssumptionCache *AC;
   DominatorTree *DT;
-  ScalarEvolution *SE;
+  Function *F;
+  Loop *L;
 
 public:
   LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
              const TargetTransformInfo *TTI, AssumptionCache *AC,
-             DominatorTree *DT, ScalarEvolution *SE)
-      : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) {
-  }
-  bool processLoop(Loop *L);
+             DominatorTree *DT, Function *F, Loop *L)
+      : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT),
+        F(F), L(L) {}
+  bool processLoop();
 
 private:
-  bool rotateLoop(Loop *L, bool SimplifiedLatch);
-  bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader.  If there were uses of the values produced by
-/// these instruction that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values.  Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
-                                            BasicBlock *OrigPreheader,
-                                            ValueToValueMapTy &ValueMap) {
-  // Remove PHI node entries that are no longer live.
-  BasicBlock::iterator I, E = OrigHeader->end();
-  for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
-  // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
-  // as necessary.
-  SSAUpdater SSA;
-  for (I = OrigHeader->begin(); I != E; ++I) {
-    Value *OrigHeaderVal = &*I;
-
-    // If there are no uses of the value (e.g. because it returns void), there
-    // is nothing to rewrite.
-    if (OrigHeaderVal->use_empty())
-      continue;
+  void rotateLoop();
 
-    Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
-    // The value now exits in two versions: the initial value in the preheader
-    // and the loop "next" value in the original header.
-    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
-    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
-    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
-    // Visit each use of the OrigHeader instruction.
-    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
-                             UE = OrigHeaderVal->use_end();
-         UI != UE;) {
-      // Grab the use before incrementing the iterator.
-      Use &U = *UI;
-
-      // Increment the iterator before removing the use from the list.
-      ++UI;
-
-      // SSAUpdater can't handle a non-PHI use in the same block as an
-      // earlier def. We can easily handle those cases manually.
-      Instruction *UserInst = cast<Instruction>(U.getUser());
-      if (!isa<PHINode>(UserInst)) {
-        BasicBlock *UserBB = UserInst->getParent();
-
-        // The original users in the OrigHeader are already using the
-        // original definitions.
-        if (UserBB == OrigHeader)
-          continue;
+  void addPhis(SmallVectorImpl<BasicBlock *> &Blocks, ValueToValueMapTy &VMap,
+               const Twine &NameSuffix, BasicBlock *NewHeader,
+               BasicBlock *NewPreheader, BasicBlock *NewLatch);
 
-        // Users in the OrigPreHeader need to use the value to which the
-        // original definitions are mapped.
-        if (UserBB == OrigPreheader) {
-          U = OrigPreHeaderVal;
-          continue;
-        }
-      }
+  bool isProfitableToRotate();
 
-      // Anything else can be handled by SSAUpdater.
-      SSA.RewriteUse(U);
-    }
+  bool isLegalToRotate();
 
-    // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
-    // intrinsics.
-    LLVMContext &C = OrigHeader->getContext();
-    if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) {
-      if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) {
-        for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) {
-          // Grab the use before incrementing the iterator. Otherwise, altering
-          // the Use will invalidate the iterator.
-          Use &U = *UI++;
-          DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser());
-          if (!UserInst)
-            continue;
+  void adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH,
+                           BasicBlock *NewPH);
 
-          // The original users in the OrigHeader are already using the original
-          // definitions.
-          BasicBlock *UserBB = UserInst->getParent();
-          if (UserBB == OrigHeader)
-            continue;
+  BasicBlock *collectSEMEBlocks(BasicBlock *OrigH, BasicBlock *OrigLatch,
+                                SmallVectorImpl<BasicBlock *> &Blocks,
+                                SmallPtrSetImpl<BasicBlock *> &Exits);
 
-          // Users in the OrigPreHeader need to use the value to which the
-          // original definitions are mapped and anything else can be handled by
-          // the SSAUpdater. To avoid adding PHINodes, check if the value is
-          // available in UserBB, if not substitute undef.
-          Value *NewVal;
-          if (UserBB == OrigPreheader)
-            NewVal = OrigPreHeaderVal;
-          else if (SSA.HasValueForBlock(UserBB))
-            NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
-          else
-            NewVal = UndefValue::get(OrigHeaderVal->getType());
-          U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal));
-        }
-      }
-    }
-  }
-}
+  void addPhisToNewHeader(SmallVectorImpl<BasicBlock *> &Blocks,
+                          BasicBlock *NewHeader, BasicBlock *NewPreheader,
+                          BasicBlock *NewLatch, ValueToValueMapTy &VMap);
+
+};
 
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+bool LoopRotate::isLegalToRotate() {
   // If the loop has only one block then there is not much to rotate.
   if (L->getBlocks().size() == 1)
     return false;
 
-  BasicBlock *OrigHeader = L->getHeader();
-  BasicBlock *OrigLatch = L->getLoopLatch();
-
-  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
-  if (!BI || BI->isUnconditional())
-    return false;
-
   // If the loop header is not one of the loop exiting blocks then
   // either this loop is already rotated or it is not
   // suitable for loop rotation transformations.
+  const BasicBlock *OrigHeader = L->getHeader();
   if (!L->isLoopExiting(OrigHeader))
     return false;
 
-  // If the loop latch already contains a branch that leaves the loop then the
-  // loop is already rotated.
-  if (!OrigLatch)
-    return false;
-
-  // Rotate if either the loop latch does *not* exit the loop, or if the loop
-  // latch was just simplified.
-  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+  const BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+  if (!BI || BI->isUnconditional())
     return false;
 
-  // Check size of original header and reject loop if it is very big or we can't
-  // duplicate blocks inside it.
-  {
-    SmallPtrSet<const Value *, 32> EphValues;
-    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
-    CodeMetrics Metrics;
-    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
-    if (Metrics.notDuplicatable) {
-      DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
-                   << " instructions: ";
-            L->dump());
-      return false;
-    }
-    if (Metrics.convergent) {
-      DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
-                      "instructions: ";
-            L->dump());
-      return false;
-    }
-    if (Metrics.NumInsts > MaxHeaderSize)
-      return false;
-  }
-
-  // Now, this loop is suitable for rotation.
-  BasicBlock *OrigPreheader = L->getLoopPreheader();
-
   // If the loop could not be converted to canonical form, it must have an
   // indirectbr in it, just give up.
-  if (!OrigPreheader)
+  if (!L->getLoopPreheader())
     return false;
 
-  // Anything ScalarEvolution may know about this loop or the PHI nodes
-  // in its header will soon be invalidated.
-  if (SE)
-    SE->forgetLoop(L);
+  const BasicBlock *LoopLatch = L->getLoopLatch(); // Null if no unique latch.
+  if (!LoopLatch || !isSEME(OrigHeader, LoopLatch, DT))
+    return false;
 
-  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+  return true;
+}
 
-  // Find new Loop header. NewHeader is a Header's one and only successor
-  // that is inside loop.  Header's other successor is outside the
-  // loop.  Otherwise loop is not suitable for rotation.
-  BasicBlock *Exit = BI->getSuccessor(0);
-  BasicBlock *NewHeader = BI->getSuccessor(1);
-  if (L->contains(Exit))
-    std::swap(Exit, NewHeader);
-  assert(NewHeader && "Unable to determine new loop header");
-  assert(L->contains(NewHeader) && !L->contains(Exit) &&
-         "Unable to determine loop header and exit blocks");
-
-  // This code assumes that the new header has exactly one predecessor.
-  // Remove any single-entry PHI nodes in it.
-  assert(NewHeader->getSinglePredecessor() &&
-         "New header doesn't have one pred!");
-  FoldSingleEntryPHINodes(NewHeader);
-
-  // Begin by walking OrigHeader and populating ValueMap with an entry for
-  // each Instruction.
-  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
-  ValueToValueMapTy ValueMap;
-
-  // For PHI nodes, the value available in OldPreHeader is just the
-  // incoming value from OldPreHeader.
-  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
-
-  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
-  // For the rest of the instructions, either hoist to the OrigPreheader if
-  // possible or create a clone in the OldPreHeader if not.
-  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
-  while (I != E) {
-    Instruction *Inst = &*I++;
-
-    // If the instruction's operands are invariant and it doesn't read or write
-    // memory, then it is safe to hoist.  Doing this doesn't change the order of
-    // execution in the preheader, but does prevent the instruction from
-    // executing in each iteration of the loop.  This means it is safe to hoist
-    // something that might trap, but isn't safe to hoist something that reads
-    // memory (without proving that the loop doesn't write).
-    if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
-        !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
-        !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
-      Inst->moveBefore(LoopEntryBranch);
-      continue;
-    }
+// FIXME: count the number of instructions in Blocks and discard when reaching
+// upper bound.
+bool LoopRotate::isProfitableToRotate() {
+  const BasicBlock *OrigHeader = L->getHeader();
 
-    // Otherwise, create a duplicate of the instruction.
-    Instruction *C = Inst->clone();
-
-    // Eagerly remap the operands of the instruction.
-    RemapInstruction(C, ValueMap,
-                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
-    // With the operands remapped, see if the instruction constant folds or is
-    // otherwise simplifyable.  This commonly occurs because the entry from PHI
-    // nodes allows icmps and other instructions to fold.
-    // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
-    Value *V = SimplifyInstruction(C, DL);
-    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
-      // If so, then delete the temporary instruction and stick the folded value
-      // in the map.
-      ValueMap[Inst] = V;
-      if (!C->mayHaveSideEffects()) {
-        delete C;
-        C = nullptr;
-      }
-    } else {
-      ValueMap[Inst] = C;
-    }
-    if (C) {
-      // Otherwise, stick the new instruction into the new block!
-      C->setName(Inst->getName());
-      C->insertBefore(LoopEntryBranch);
-    }
+  // Check size of original header and reject loop if it is very big or we can't
+  // duplicate blocks inside it.
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+  CodeMetrics Metrics;
+  Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+  if (Metrics.notDuplicatable) {
+    DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+          << " instructions: ";
+        L->dump());
+    return false;
   }
 
-  // Along with all the other instructions, we just cloned OrigHeader's
-  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
-  // successors by duplicating their incoming values for OrigHeader.
-  TerminatorInst *TI = OrigHeader->getTerminator();
-  for (BasicBlock *SuccBB : TI->successors())
-    for (BasicBlock::iterator BI = SuccBB->begin();
-         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
-      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
-  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
-  // OrigPreHeader's old terminator (the original branch into the loop), and
-  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
-  LoopEntryBranch->eraseFromParent();
-
-  // If there were any uses of instructions in the duplicated block outside the
-  // loop, update them, inserting PHI nodes as required
-  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
-
-  // NewHeader is now the header of the loop.
-  L->moveToHeader(NewHeader);
-  assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
-  // At this point, we've finished our major CFG changes.  As part of cloning
-  // the loop into the preheader we've simplified instructions and the
-  // duplicated conditional branch may now be branching on a constant.  If it is
-  // branching on a constant and if that constant means that we enter the loop,
-  // then we fold away the cond branch to an uncond branch.  This simplifies the
-  // loop in cases important for nested loops, and it also means we don't have
-  // to split as many edges.
-  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
-  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
-  if (!isa<ConstantInt>(PHBI->getCondition()) ||
-      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
-          NewHeader) {
-    // The conditional branch can't be folded, handle the general case.
-    // Update DominatorTree to reflect the CFG change we just made.  Then split
-    // edges as necessary to preserve LoopSimplify form.
-    if (DT) {
-      // Everything that was dominated by the old loop header is now dominated
-      // by the original loop preheader. Conceptually the header was merged
-      // into the preheader, even though we reuse the actual block as a new
-      // loop latch.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
-                                                   OrigHeaderNode->end());
-      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
-      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
-        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
-
-      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
-      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
-
-      // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
-    }
-
-    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
-    // thus is not a preheader anymore.
-    // Split the edge to form a real preheader.
-    BasicBlock *NewPH = SplitCriticalEdge(
-        OrigPreheader, NewHeader,
-        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
-    NewPH->setName(NewHeader->getName() + ".lr.ph");
-
-    // Preserve canonical loop form, which means that 'Exit' should have only
-    // one predecessor. Note that Exit could be an exit block for multiple
-    // nested loops, causing both of the edges to now be critical and need to
-    // be split.
-    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
-    bool SplitLatchEdge = false;
-    for (BasicBlock *ExitPred : ExitPreds) {
-      // We only need to split loop exit edges.
-      Loop *PredLoop = LI->getLoopFor(ExitPred);
-      if (!PredLoop || PredLoop->contains(Exit))
-        continue;
-      if (isa<IndirectBrInst>(ExitPred->getTerminator()))
-        continue;
-      SplitLatchEdge |= L->getLoopLatch() == ExitPred;
-      BasicBlock *ExitSplit = SplitCriticalEdge(
-          ExitPred, Exit,
-          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
-      ExitSplit->moveBefore(Exit);
-    }
-    assert(SplitLatchEdge &&
-           "Despite splitting all preds, failed to split latch exit?");
-  } else {
-    // We can fold the conditional branch in the preheader, this makes things
-    // simpler. The first step is to remove the extra edge to the Exit block.
-    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
-    NewBI->setDebugLoc(PHBI->getDebugLoc());
-    PHBI->eraseFromParent();
-
-    // With our CFG finalized, update DomTree if it is available.
-    if (DT) {
-      // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(NewHeader, OrigPreheader);
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
-
-      // Brute force incremental dominator tree update. Call
-      // findNearestCommonDominator on all CFG predecessors of each child of the
-      // original header.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
-                                                   OrigHeaderNode->end());
-      bool Changed;
-      do {
-        Changed = false;
-        for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
-          DomTreeNode *Node = HeaderChildren[I];
-          BasicBlock *BB = Node->getBlock();
-
-          pred_iterator PI = pred_begin(BB);
-          BasicBlock *NearestDom = *PI;
-          for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
-            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
-
-          // Remember if this changes the DomTree.
-          if (Node->getIDom()->getBlock() != NearestDom) {
-            DT->changeImmediateDominator(BB, NearestDom);
-            Changed = true;
-          }
-        }
-
-        // If the dominator changed, this may have an effect on other
-        // predecessors, continue until we reach a fixpoint.
-      } while (Changed);
-    }
+  if (Metrics.convergent) {
+    DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+                    "instructions: ";
+        L->dump());
+    return false;
   }
 
-  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
-  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
-
-  // Now that the CFG and DomTree are in a consistent state again, try to merge
-  // the OrigHeader block into OrigLatch.  This will succeed if they are
-  // connected by an unconditional branch.  This is just a cleanup so the
-  // emitted code isn't too gross in this common case.
-  MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+  if (Metrics.NumInsts > MaxHeaderSize)
+    return false;
 
-  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+  unsigned LoopSize = MaxHeaderSize;
+  for(const BasicBlock* BB : L->getBlocks()) {
+    if (BB->size() + LoopSize < RotationMaxSize)
+      LoopSize += BB->size();
+    else
+      return false;
+  }
 
-  ++NumRotated;
   return true;
 }
 
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
-                                  BasicBlock::iterator End, Loop *L) {
-  bool seenIncrement = false;
-  bool MultiExitLoop = false;
-
-  if (!L->getExitingBlock())
-    MultiExitLoop = true;
+// Add phis to the new header and adjust the phi nodes from the OrigHeader.
+void LoopRotate::addPhisToNewHeader(SmallVectorImpl<BasicBlock *> &Blocks,
+                                    BasicBlock *NewHeader, BasicBlock *NewPreheader,
+                                    BasicBlock *NewLatch, ValueToValueMapTy &VMap) {
+  // Add to NewHeader phi nodes for all copied variables which are used.
+  for (BasicBlock *BB : Blocks) {
+    for (Instruction &Inst : *BB) {
+      // Skip instructions with no uses, e.g., branches.
+      if (Inst.use_begin() == Inst.use_end())
+        continue;
 
-  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+      for (auto UI = Inst.use_begin(), E = Inst.use_end(); UI != E;) {
+        Use &U = *UI++;
+        Instruction *UserInst = cast<Instruction>(U.getUser());
 
-    if (!isSafeToSpeculativelyExecute(&*I))
-      return false;
+        // Nothing to rename when the use is dominated by the definition.
+        if (DT->dominates(&Inst, UserInst))
+          continue;
 
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
+        if (!L->contains(UserInst->getParent())) {
+          // Handle uses in the loop-closed-phi.
+          PHINode *ClosePhi = cast<PHINode>(UserInst);
+          BasicBlock *Pred = ClosePhi->getIncomingBlock(U.getOperandNo());
 
-    switch (I->getOpcode()) {
-    default:
-      return false;
-    case Instruction::GetElementPtr:
-      // GEPs are cheap if all indices are constant.
-      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
-        return false;
-    // fall-thru to increment case
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr: {
-      Value *IVOpnd =
-          !isa<Constant>(I->getOperand(0))
-              ? I->getOperand(0)
-              : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
-      if (!IVOpnd)
-        return false;
-
-      // If increment operand is used outside of the loop, this speculation
-      // could cause extra live range interference.
-      if (MultiExitLoop) {
-        for (User *UseI : IVOpnd->users()) {
-          auto *UserInst = cast<Instruction>(UseI);
-          if (!L->contains(UserInst))
-            return false;
+          // Do not rename a loop close phi node if its predecessor in the loop
+          // is dominated by Inst.
+          if (L->contains(Pred) && DT->dominates(BB, Pred))
+            continue;
         }
+
+        PHINode *PN = PHINode::Create(Inst.getType(), 2, "phi.nh",
+                                      &*NewHeader->begin());
+        PN->addIncoming(&Inst, NewLatch);
+        PN->addIncoming(cast<Instruction>(VMap[&Inst]), NewPreheader);
+
+        // When Inst does not dominate U, it is going to use the updated
+        // definition coming from PN.
+        U.set(PN);
       }
+    }
+  }
+}
 
-      if (seenIncrement)
-        return false;
-      seenIncrement = true;
+/// Give every PHI of \p NewH (each has a single incoming value) an incoming
+/// value for \p NewPH: the \p VMap copy if one exists, else the value itself.
+void LoopRotate::adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH,
+                                     BasicBlock *NewPH) {
+  for (Instruction &Inst : *NewH) {
+    PHINode *PN = dyn_cast<PHINode>(&Inst);
+    if (!PN)
       break;
+    assert((PN->getNumOperands() == 1) && "NewH had multiple predecessors.");
+    Value *Op = PN->getIncomingValue(0);
+    // Unlike operator[], lookup() adds no null VMap entry for an unmapped Op.
+    Value *RenamedVal = VMap.lookup(Op);
+    PN->addIncoming(RenamedVal ? RenamedVal : Op, NewPH);
+  }
+}
+
+/// Walk the loop depth-first from \p OrigH. Exiting blocks ending in a branch
+/// (other than \p OrigLatch) are appended to \p Blocks and their out-of-loop
+/// successors added to \p Exits. \returns the last in-loop block left uncopied.
+BasicBlock *
+LoopRotate::collectSEMEBlocks(BasicBlock *OrigH, BasicBlock *OrigLatch,
+                              SmallVectorImpl<BasicBlock *> &Blocks,
+                              SmallPtrSetImpl<BasicBlock *> &Exits) {
+  BasicBlock *NewH = nullptr;
+  for (auto DFI = df_begin(OrigH), DFE = df_end(OrigH); DFI != DFE;) {
+    BasicBlock *BB = *DFI;
+    if (!L->contains(BB)) {
+      DFI.skipChildren();
+      continue;
     }
-    case Instruction::Trunc:
-    case Instruction::ZExt:
-    case Instruction::SExt:
-      // ignore type conversions
-      break;
+    // The latch, a non-exiting block, or a block not ending in a branch is not
+    // copied: it becomes the new header and the walk does not go past it.
+    if (BB == OrigLatch || !L->isLoopExiting(BB) ||
+        !isa<BranchInst>(BB->getTerminator())) {
+      NewH = BB;
+      DFI.skipChildren();
+      continue;
+    }
+    Blocks.push_back(BB);
+    ++DFI;
+    for (BasicBlock *Succ : successors(BB)) {
+      if (!L->contains(Succ))
+        Exits.insert(Succ);
     }
   }
-  return true;
+  return NewH;
 }
 
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
-  BasicBlock *Latch = L->getLoopLatch();
-  if (!Latch || Latch->hasAddressTaken())
-    return false;
+/// Rotate loop L: clone the exiting blocks ahead of the new header in front of
+/// the loop, enter through the clones, and update DT, LoopInfo and PHIs.
+/// TODO: arrange newly inserted bbs.
+void LoopRotate::rotateLoop() {
+  BasicBlock *OrigH = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
+  BasicBlock *OrigPH = L->getLoopPreheader();
 
-  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
-  if (!Jmp || !Jmp->isUnconditional())
-    return false;
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
 
-  BasicBlock *LastExit = Latch->getSinglePredecessor();
-  if (!LastExit || !L->isLoopExiting(LastExit))
-    return false;
+  // Basic blocks to be copied.
+  SmallVector<BasicBlock *, 4> Blocks;
+  SmallPtrSet<BasicBlock *, 4> Exits;
+  // Collect all nodes of the loop from header to latch.
+  BasicBlock *NewH = collectSEMEBlocks(OrigH, OrigLatch, Blocks, Exits);
+  assert(NewH && "Found no block to become the new loop header.");
+
+  ValueToValueMapTy VMap;
+  copySEME(Blocks, Exits, VMap, ".lr", DT, LI);
+
+  // Redirect original preheader to the entry of SEME.
+  BranchInst *PBI = cast<BranchInst>(OrigPH->getTerminator());
+  assert(1 == PBI->getNumSuccessors() && "Preheader has several successors.");
+  BasicBlock *CopyOrigH = cast<BasicBlock>(VMap[OrigH]);
+  PBI->setSuccessor(0, CopyOrigH);
+  DT->changeImmediateDominator(CopyOrigH, OrigPH);
+  L->moveToHeader(NewH);
+
+  // Remove this code.
+  BasicBlock *BeforeLoop = nullptr;
+  for (BasicBlock *BB: predecessors(NewH))
+    if (!L->contains(BB)) {
+      BeforeLoop = BB;
+      break;
+    }
+  assert(BeforeLoop && "New header has no predecessor outside the loop.");
+
+  BasicBlock *NewPH = BasicBlock::Create(NewH->getContext(),
+                                         NewH->getName() + ".lr.ph",
+                                         NewH->getParent(), BeforeLoop);
+  Loop *OuterLoop = LI->getLoopFor(OrigPH);
+  if (OuterLoop)
+    OuterLoop->addBasicBlockToLoop(NewPH, *LI);
+
+  // Move NewH physically to the beginning of the loop.
+  F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(),
+                                  NewH);
+  // BeforeLoop --> NewPH --> NewH.
+  insertBetween(NewPH, BeforeLoop, NewH);
+
+  DT->addNewBlock(NewPH, BeforeLoop);
+  DT->changeImmediateDominator(NewH, NewPH);
+
+  // Also, the original entry lost its immediate dominator so its dominator
+  // should be adjusted. We use SEME property => idom (OrigH) = its single pred.
+  DT->changeImmediateDominator(OrigH, OrigH->getSinglePredecessor());
+
+  for (BasicBlock *BB : Blocks) {
+    typedef DomTreeNodeBase<BasicBlock> DTNode;
+    DTNode *IDom = DT->getNode(BB);
+    std::vector<DTNode*>::iterator I = IDom->begin();
+    for (; I != IDom->end();) {
+      BasicBlock *ExitBB = (*I)->getBlock();
+      if (L->contains(ExitBB)) {
+        ++I;
+        continue;
+      }
 
-  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
-  if (!BI)
-    return false;
+      BasicBlock *StaleIDom = DT->getNode(ExitBB)->getIDom()->getBlock();
+      BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+      // VERIFY: NewIDom will be correct because this part of CFG is up-to-date.
+      BasicBlock *NewIDom = DT->findNearestCommonDominator(StaleIDom, NewBB);
+      NewIDom = DT->findNearestCommonDominator(NewIDom, BB);
+      if (NewIDom != StaleIDom) {
+        DT->changeImmediateDominator(ExitBB, NewIDom);
+        DEBUG(dbgs() << "\nChanging IDom of " << *ExitBB << "to" << *NewIDom);
+        I = IDom->begin();
+      } else
+        ++I;
+    }
+  }
 
-  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
-    return false;
+  adjustNewHeaderPhis(VMap, NewH, NewPH);
 
-  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
-               << LastExit->getName() << "\n");
+  BasicBlock *NewLatch = L->getLoopLatch();
+  assert(L->getLoopPreheader() && "Invalid loop preheader after rotation");
+  assert(NewLatch && "Invalid loop latch after rotation");
 
-  // Hoist the instructions from Latch into LastExit.
-  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
-                                 Latch->begin(), Jmp->getIterator());
+  addPhisToNewHeader(Blocks, NewH, NewPH, NewLatch, VMap);
 
-  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
-  BasicBlock *Header = Jmp->getSuccessor(0);
-  assert(Header == L->getHeader() && "expected a backward branch");
+  // Discard incoming values in the CopyOrigHeader, which are coming from
+  // OrigLatch since it has only one predecessor.
+  discardIncomingValues(CopyOrigH, OrigLatch);
+  discardIncomingValues(OrigH, OrigPH);
 
-  // Remove Latch from the CFG so that LastExit becomes the new Latch.
-  BI->setSuccessor(FallThruPath, Header);
-  Latch->replaceSuccessorsPhiUsesWith(LastExit);
-  Jmp->eraseFromParent();
+  assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
+#ifndef NDEBUG
+  LI->verify();
+  assert(!verifyFunction(*F, &dbgs()) && "Invalid IR after loop rotation.");
+#endif
+  DEBUG(dbgs() << "LoopRotation: into "; L->dumpVerbose());
 
-  // Nuke the Latch block.
-  assert(Latch->empty() && "unable to evacuate Latch");
-  LI->removeBlock(Latch);
-  if (DT)
-    DT->eraseNode(Latch);
-  Latch->eraseFromParent();
-  return true;
+  assert(isSEME(L->getHeader(), NewLatch, DT) && "Rotated loop is not SEME.");
+  assert(isSEME(CopyOrigH, NewPH, DT) && "Copied blocks are not SEME.");
+
+  ++NumRotated;
 }
 
 /// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
+/// Returns false without rotating when the loop has no latch block, is not
+/// legal or profitable to rotate, or the cap on rotations has been reached.
+bool LoopRotate::processLoop() {
   // Save the loop metadata.
   MDNode *LoopMD = L->getLoopID();
 
-  // Simplify the loop latch before attempting to rotate the header
-  // upward. Rotation may not be needed if the loop tail can be folded into the
-  // loop exit.
-  bool SimplifiedLatch = simplifyLoopLatch(L);
+  // Bail out unless the loop has a latch and rotating it is both legal and
+  // profitable.
+  BasicBlock *Latch = L->getLoopLatch();
+  if (!Latch || !isLegalToRotate() || !isProfitableToRotate())
+    return false;
 
-  bool MadeChange = rotateLoop(L, SimplifiedLatch);
-  assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+  // Honor the cap on the number of rotations; a cap of -1 means no limit.
+  const bool Capped = MaxLoopsRotated != -1;
+  if (Capped && LoopsRotated >= MaxLoopsRotated)
+    return false;
+  if (Capped)
+    ++LoopsRotated;
+
+  // Make sure the latch has only one successor, splitting its edge to the
+  // header otherwise.
+  if (!Latch->getSingleSuccessor()) {
+    DEBUG(dbgs() << "\nSplitting the edge of Loop:"; L->dumpVerbose(););
+    Latch = SplitEdge(Latch, L->getHeader(), DT, LI);
+  }
+
+  assert(Latch->getSingleSuccessor());
+
+  rotateLoop();
+  assert(L->isLoopExiting(L->getLoopLatch()) &&
          "Loop latch should be exiting after loop-rotate.");
 
   // Restore the loop metadata.
   // NB! We presume LoopRotation DOESN'T ADD its own metadata.
-  if ((MadeChange || SimplifiedLatch) && LoopMD)
+  if (LoopMD)
     L->setLoopID(LoopMD);
 
-  return MadeChange;
+  return true;
 }
 
 LoopRotatePass::LoopRotatePass() {}
@@ -630,10 +458,9 @@
 
   // Optional analyses.
   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
-  auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  LoopRotate LR(DefaultRotationThreshold, LI, TTI, AC, DT, SE);
+  LoopRotate LR(RotationMaxHeaderSize, LI, TTI, AC, DT, F, &L);
 
-  bool Changed = LR.processLoop(&L);
+  bool Changed = LR.processLoop();
   if (!Changed)
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
@@ -649,7 +476,7 @@
   LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
     initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
     if (SpecifiedMaxHeaderSize == -1)
-      MaxHeaderSize = DefaultRotationThreshold;
+      MaxHeaderSize = RotationMaxHeaderSize;
     else
       MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
   }
@@ -658,6 +485,7 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     getLoopAnalysisUsage(AU);
   }
 
@@ -671,10 +499,8 @@
     auto *AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
     auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
-    auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
-    auto *SE = SEWP ? &SEWP->getSE() : nullptr;
-    LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
-    return LR.processLoop(L);
+    LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, &F, L);
+    return LR.processLoop();
   }
 };
 }
@@ -684,6 +510,7 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
                     false)
Index: llvm/lib/Transforms/Utils/CloneFunction.cpp
===================================================================
--- llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -710,3 +710,133 @@
 
   return NewLoop;
 }
+
+/// Returns true if \p Entry dominates \p Exit in \p DT, \p Exit dominates
+/// \p Entry in \p PDT, and \p Entry dominates every block met on a depth-first
+/// walk from \p Entry that does not go past \p Exit.
+bool llvm::isSESE(const BasicBlock *Entry, const BasicBlock *Exit,
+                  DominatorTree *DT, DominatorTree *PDT) {
+  if (!DT->dominates(Entry, Exit) || !PDT->dominates(Exit, Entry))
+    return false;
+
+  for (auto It = df_begin(Entry), End = df_end(Entry); It != End;) {
+    const BasicBlock *BB = *It;
+    if (BB != Exit && !DT->dominates(Entry, BB))
+      return false;
+    if (BB == Exit)
+      It.skipChildren(); // Do not descend past the exit.
+    else
+      ++It;
+  }
+  return true;
+}
+
+/// Returns true if \p Entry dominates \p Exit and every block met on an
+/// inverse depth-first walk from \p Exit that does not go above \p Entry.
+bool llvm::isSEME(const BasicBlock *Entry, const BasicBlock *Exit,
+                  DominatorTree *DT) {
+  if (!DT->dominates(Entry, Exit))
+    return false;
+
+  for (auto It = idf_begin(Exit), End = idf_end(Exit); It != End;) {
+    const BasicBlock *BB = *It;
+    if (BB != Entry && !DT->dominates(Entry, BB))
+      return false;
+    if (BB == Entry)
+      It.skipChildren(); // Do not walk above the entry.
+    else
+      ++It;
+  }
+  return true;
+}
+
+/// For every PHI in each block of \p Exits, add an incoming value for each
+/// copied predecessor: the \p VMap copy of the original incoming value, or
+/// the value itself when it has no mapping (e.g. a constant).
+static void adjustExitingPhis(ValueToValueMapTy &VMap,
+                              const SmallPtrSetImpl<BasicBlock *> &Exits) {
+  for (BasicBlock *BB : Exits) {
+    for (Instruction &Inst : *BB) {
+      PHINode *PN = dyn_cast<PHINode>(&Inst);
+      if (!PN)
+        break;
+      bool EdgeFromOrigBB = false;
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        // lookup(), unlike operator[], does not insert a null entry on a miss.
+        Value *CopyB = VMap.lookup(PN->getIncomingBlock(i));
+        if (!CopyB) // Skip args coming from outside the SEME.
+          continue;
+        EdgeFromOrigBB = true;
+        Value *Op = PN->getIncomingValue(i);
+        Value *RenamedVal = VMap.lookup(Op);
+        PN->addIncoming(RenamedVal ? RenamedVal : Op, cast<BasicBlock>(CopyB));
+      }
+      assert(EdgeFromOrigBB && "Illegal exit from SEME.");
+    }
+  }
+}
+
+/// Clone the SEME region \p Blocks (entry block first), placing the copies in
+/// front of the entry and recording them in \p VMap; update \p DT and \p LI
+/// and the PHIs of \p Exits. \returns the copy of the region's entry block.
+BasicBlock* llvm::copySEME(const SmallVectorImpl<BasicBlock *> &Blocks,
+                           const SmallPtrSetImpl<BasicBlock *> &Exits,
+                           ValueToValueMapTy &VMap,
+                           const Twine &NameSuffix,
+                           DominatorTree *DT, LoopInfo *LI) {
+  assert(!Blocks.empty() && "cannot copy an empty region");
+  BasicBlock *OrigH = Blocks[0];
+  // Check the idom before using it: a function's entry block has none.
+  auto *EntryIDom = DT->getNode(OrigH)->getIDom();
+  assert(EntryIDom && "no dominator");
+  BasicBlock *DomEntry = EntryIDom->getBlock();
+  Function *F = DomEntry->getParent();
+  SmallVector<BasicBlock *, 4> NewBlocks;
+  for (BasicBlock *BB : Blocks) {
+    BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+    // Move them physically from the end of the block list.
+    F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(),
+                                  NewBB);
+    // The copy joins the parent of BB's loop; BB may be in no loop at all.
+    Loop *BBLoop = LI->getLoopFor(BB);
+    if (Loop *BBParentLoop = BBLoop ? BBLoop->getParentLoop() : nullptr)
+      BBParentLoop->addBasicBlockToLoop(NewBB, *LI);
+    VMap[BB] = NewBB;
+    NewBlocks.push_back(NewBB);
+  }
+
+  remapInstructionsInBlocks(NewBlocks, VMap);
+  for (BasicBlock *BB : Blocks) {
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+    BranchInst *BI = dyn_cast<BranchInst>(NewBB->getTerminator());
+    if (!BI)
+      continue;
+    for (unsigned I = 0, E = BI->getNumSuccessors(); I < E; ++I) {
+      // lookup() adds no null entry to VMap for an unmapped successor.
+      if (Value *NewSucc = VMap.lookup(BI->getSuccessor(I)))
+        BI->setSuccessor(I, cast<BasicBlock>(NewSucc));
+    }
+  }
+
+  // For all the basic blocks in the SEME, update the DOM.  Except for the entry
+  // block the tree structure is the same so the dominators also follow the same
+  // structural property. If the imm-dom of orig BB is not in SEME that means it
+  // is the entry block, in that case the new idom of the new BB must be its
+  // single predecessor because we are dealing with an SEME region.
+  BasicBlock *EntryNewSEME = nullptr;
+  for (BasicBlock *BB : Blocks) {
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+    BasicBlock *Dom = DT->getNode(BB)->getIDom()->getBlock();
+    Value *MappedDom = VMap.lookup(Dom);
+    BasicBlock *NewDom = cast_or_null<BasicBlock>(MappedDom);
+    if (!NewDom) { // Dom does not belong to SEME => entry block.
+      EntryNewSEME = NewBB;
+      NewDom = Dom;
+      assert(Dom == DomEntry && "idom outside the region is not the entry's");
+    }
+    DT->addNewBlock(NewBB, NewDom);
+  }
+
+  adjustExitingPhis(VMap, Exits);
+  return EntryNewSEME;
+}
Index: llvm/test/Analysis/GlobalsModRef/memset-escape.ll
===================================================================
--- llvm/test/Analysis/GlobalsModRef/memset-escape.ll
+++ llvm/test/Analysis/GlobalsModRef/memset-escape.ll
@@ -6,15 +6,19 @@
 @a = internal global [3 x i32] zeroinitializer, align 4
 @b = common global i32 0, align 4
 
-; The important thing we're checking for here is the reload of (some element of)
-; @a after the memset.
+; Check that the load and the call to abort are redundant.
+; CHECK:   store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
+; CHECK:   store i32 0, i32* @b, align 4
+; CHECK:   br label %for.body
 
-; CHECK-LABEL: @main
-; CHECK: call void @llvm.memset.p0i8.i64{{.*}} @a
-; CHECK: store i32 3
-; CHECK: load i32, i32* getelementptr {{.*}} @a
-; CHECK: icmp eq i32
-; CHECK: br i1
+; CHECK: for.body:                                         ; preds = %for.body.preheader
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 0), align 4
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
+; CHECK:   store i32 3, i32* @b, align 4
+; CHECK:   br i1 true, label %if.end, label %if.then
+; CHECK-NOT: load
+; CHECK-NOT: call void @abort()
 
 define i32 @main() {
 entry:
Index: llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
+++ llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -instcombine -indvars -loop-deletion -constmerge -S | FileCheck %s
+; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -simplifycfg -S | FileCheck %s
+
 ; PR11882: ComputeLoadConstantCompareExitLimit crash.
 ;
 ; for.body is deleted leaving a loop-invariant load.
Index: llvm/test/Transforms/LoopRotate/alloca.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/alloca.ll
+++ llvm/test/Transforms/LoopRotate/alloca.ll
@@ -5,8 +5,9 @@
 ; We expect a different value for %ptr each iteration (according to the
 ; definition of alloca). I.e. each @use must be paired with an alloca.
 
-; CHECK: call void @use(i8* %
-; CHECK: %ptr = alloca i8
+; CHECK: alloca i8
+; CHECK: call void @use
+; CHECK: alloca i8
 
 @e = global i16 10
 
Index: llvm/test/Transforms/LoopRotate/basic.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/basic.ll
+++ llvm/test/Transforms/LoopRotate/basic.ll
@@ -18,9 +18,10 @@
   %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0
   br i1 %cmp, label %for.body, label %for.end
 
-; CHECK: for.body:
+; CHECK: for.cond.lr:
 ; CHECK-NEXT: phi i32 [ 0
-; CHECK-NEXT: store i32 0
+; CHECK: for.body:
+; CHECK: store i32 0
 
 for.body:                                         ; preds = %for.cond
   store i32 0, i32* %arrayidx, align 16
Index: llvm/test/Transforms/LoopRotate/dbgvalue.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/dbgvalue.ll
+++ llvm/test/Transforms/LoopRotate/dbgvalue.ll
@@ -5,9 +5,9 @@
 
 define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp !dbg !0 {
 ; CHECK-LABEL: define i32 @tak(
-; CHECK: entry
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %x
-; CHECK: tail call void @llvm.dbg.value(metadata i32 %call
+; CHECK: tailrecurse.lr
+; CHECK: call void @llvm.dbg.value(metadata i32 %x
+; CHECK: tail call void @llvm.dbg.value(metadata i32 %y
 
 entry:
   br label %tailrecurse
@@ -42,7 +42,7 @@
 ; CHECK-LABEL: define i32 @tak2(
 ; CHECK: entry
 ; CHECK: tail call void @llvm.dbg.value(metadata i32 %x.tr
-; CHECK: tail call void @llvm.dbg.value(metadata i32 undef
+; CHECK: tail call void @llvm.dbg.value(metadata i32 %y.tr
 
 entry:
   br label %tailrecurse
@@ -83,12 +83,13 @@
 ; Ensure that the loop increment basic block is rotated into the tail of the
 ; body, even though it contains a debug intrinsic call.
 ; CHECK-LABEL: define void @FindFreeHorzSeg(
+; CHECK: for.inc:
+; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
 ; CHECK: %dec = add
 ; CHECK-NEXT: tail call void @llvm.dbg.value
+; CHECK-NEXT: br label %for.cond
 ; CHECK: %cmp = icmp
 ; CHECK: br i1 %cmp
-; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
-; CHECK-NEXT: br label %for.end
 
 
 entry:
Index: llvm/test/Transforms/LoopRotate/loop-rotate-0.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-0.ll
@@ -0,0 +1,33 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. The header and the two blocks after it all
+; exit the loop, and two of those exits go to the same block (%err_sl).
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @a2i_ASN1_ENUMERATED() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end94, %entry
+  %first.0 = phi i32 [ 1, %entry ], [ 0, %if.end94 ]
+  br i1 undef, label %err_sl, label %if.end
+
+if.end:                                           ; preds = %for.cond
+  br i1 undef, label %err_sl, label %if.end75
+
+if.end75:                                         ; preds = %if.end
+  br i1 undef, label %if.end94, label %if.then93
+
+if.then93:                                        ; preds = %if.end75
+  ret i32 0
+
+if.end94:                                         ; preds = %if.end75
+  %call179 = tail call i32 @gets()
+  br label %for.cond
+
+err_sl:                                           ; preds = %if.end, %for.cond
+  unreachable
+}
+
+declare i32 @gets()
Index: llvm/test/Transforms/LoopRotate/loop-rotate-1.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-1.ll
@@ -0,0 +1,33 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. The loop header is the only exiting block,
+; and the header PHI is also used in the loop body.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%fp = type { i64 }
+
+declare void @foo(%fp* %this, %fp* %that)
+
+define void @bar(%fp* %__begin1, %fp* %__end1, %fp** %__end2) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %__end1.addr.0 = phi %fp* [ %__end1, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq %fp* %__end1.addr.0, %__begin1
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %0 = load %fp*, %fp** %__end2, align 8
+  %add.ptr = getelementptr inbounds %fp, %fp* %0, i64 -1
+  %incdec.ptr = getelementptr inbounds %fp, %fp* %__end1.addr.0, i64 -1
+  tail call void @foo(%fp* %add.ptr, %fp* %incdec.ptr)
+  %1 = load %fp*, %fp** %__end2, align 8
+  %incdec.ptr2 = getelementptr inbounds %fp, %fp* %1, i64 -1
+  store %fp* %incdec.ptr2, %fp** %__end2, align 8
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-10.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-10.ll
@@ -0,0 +1,38 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. The header and the three blocks after it all
+; exit the loop (two of them to %err_sl); the latch %if.then178 holds a call.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @foo()
+define i32 @bar() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.then178, %entry
+  %first.0 = phi i32 [ 1, %entry ], [ 0, %if.then178 ]
+  br i1 undef, label %err_sl, label %if.end
+
+if.end:                                           ; preds = %for.cond
+  br i1 undef, label %err_sl, label %if.end75
+
+if.end75:                                         ; preds = %if.end
+  br i1 undef, label %if.end94, label %if.then93
+
+if.then93:                                        ; preds = %if.end75
+  unreachable
+
+if.end94:                                         ; preds = %if.end75
+  br i1 undef, label %if.then178, label %for.end182
+
+if.then178:                                       ; preds = %if.end94
+  %call179 = tail call i32 @foo()
+  br label %for.cond
+
+for.end182:                                       ; preds = %if.end94
+  ret i32 1
+
+err_sl:                                           ; preds = %if.end, %for.cond
+  unreachable
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-2.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-2.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. All blocks and values are unnamed; the header
+; (%5) is the only exiting block and its PHI (%6) is also used in the body (%8).
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%0 = type { i64, i64, i8* }
+%1 = type { %2 }
+%2 = type { %3 }
+%3 = type { %4 }
+%4 = type { %5 }
+%5 = type { %6 }
+%6 = type { i64, i64, i8* }
+%7 = type { i8 }
+
+declare void @foo(%0*, %0*)
+
+define linkonce_odr void @bar(%7*, %0*, %0*, %0**) {
+  br label %5
+
+; <label>:5:                                      ; preds = %8, %4
+  %6 = phi %0* [ %2, %4 ], [ %11, %8 ]
+  %7 = icmp eq %0* %6, %1
+  br i1 %7, label %14, label %8
+
+; <label>:8:                                      ; preds = %5
+  %9 = load %0*, %0** %3, align 8
+  %10 = getelementptr inbounds %0, %0* %9, i64 -1
+  %11 = getelementptr inbounds %0, %0* %6, i64 -1
+  tail call void @foo(%0* %10, %0* nonnull dereferenceable(24) %11)
+  %12 = load %0*, %0** %3, align 8
+  %13 = getelementptr inbounds %0, %0* %12, i64 -1
+  store %0* %13, %0** %3, align 8
+  br label %5
+
+; <label>:14:                                     ; preds = %5
+  ret void
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-3.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-3.ll
@@ -0,0 +1,36 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. The loop header %do.body.i10 has two PHIs;
+; on the backedge %b1.0.i4 takes the value of the other PHI, %b2.0.i7.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo() {
+entry:
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry
+  br i1 undef, label %inverse.exit21, label %if.else.i3
+
+if.else.i3:                                       ; preds = %for.body
+  br label %do.body.i10
+
+do.body.i10:                                      ; preds = %do.cond.i19, %if.else.i3
+  %b1.0.i4 = phi i64 [ 0, %if.else.i3 ], [ %b2.0.i7, %do.cond.i19 ]
+  %b2.0.i7 = phi i64 [ 1, %if.else.i3 ], [ %sub8.i18, %do.cond.i19 ]
+  br i1 undef, label %do.cond.thread.i15, label %do.cond.i19
+
+do.cond.thread.i15:                               ; preds = %do.body.i10
+  br label %inverse.exit21
+
+do.cond.i19:                                      ; preds = %do.body.i10
+  %mul.i17 = mul nsw i64 undef, %b2.0.i7
+  %sub8.i18 = sub nsw i64 %b1.0.i4, %mul.i17
+  br label %do.body.i10
+
+inverse.exit21:                                   ; preds = %do.cond.thread.i15, %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %inverse.exit21, %entry
+  ret void
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-4.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-4.ll
@@ -0,0 +1,92 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. The loop has a chain of exiting conditional
+; branches followed by a block (%if.else106) that ends in a switch; the latch
+; %if.then467 is reached from that switch and from %if.then130.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo() {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %if.then467, %entry
+  br i1 undef, label %if.then, label %if.end6
+
+if.then:                                          ; preds = %while.cond
+  unreachable
+
+if.end6:                                          ; preds = %while.cond
+  %conv7 = ashr exact i64 undef, 32
+  br i1 undef, label %if.else, label %if.then19
+
+if.then19:                                        ; preds = %if.end6
+  unreachable
+
+if.else:                                          ; preds = %if.end6
+  br i1 undef, label %thr, label %if.end2.i
+
+if.end2.i:                                        ; preds = %if.else
+  br i1 undef, label %thr, label %if.end37
+
+thr:                      ; preds = %if.end2.i, %if.else
+  unreachable
+
+if.end37:                                         ; preds = %if.end2.i
+  br i1 undef, label %if.else96, label %if.then40
+
+if.then40:                                        ; preds = %if.end37
+  unreachable
+
+if.else96:                                        ; preds = %if.end37
+  br i1 undef, label %if.else106, label %if.then99
+
+if.then99:                                        ; preds = %if.else96
+  unreachable
+
+if.else106:                                       ; preds = %if.else96
+  switch i32 undef, label %if.else427 [
+    i32 26, label %if.then130
+    i32 24, label %if.then130
+    i32 23, label %if.then130
+    i32 22, label %if.then130
+    i32 20, label %if.then130
+    i32 19, label %if.then130
+    i32 18, label %if.then130
+    i32 12, label %if.then130
+    i32 6, label %if.then149
+    i32 1, label %if.then172
+    i32 30, label %if.then467
+    i32 4, label %if.then193
+    i32 2, label %if.then310
+    i32 10, label %if.then369
+  ]
+
+if.then130:                                       ; preds = %if.else106, %if.else106, %if.else106, %if.else106, %if.else106, %if.else106, %if.else106, %if.else106
+  br i1 undef, label %land.lhs.true138, label %if.then467
+
+land.lhs.true138:                                 ; preds = %if.then130
+  unreachable
+
+if.then149:                                       ; preds = %if.else106
+  %add151 = add nsw i64 undef, %conv7
+  unreachable
+
+if.then172:                                       ; preds = %if.else106
+  unreachable
+
+if.then193:                                       ; preds = %if.else106
+  unreachable
+
+if.then310:                                       ; preds = %if.else106
+  unreachable
+
+if.then369:                                       ; preds = %if.else106
+  unreachable
+
+if.else427:                                       ; preds = %if.else106
+  unreachable
+
+if.then467:                                       ; preds = %if.then130, %if.else106
+  br label %while.cond
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-5.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-5.ll
@@ -0,0 +1,29 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; No output is checked: opt only has to run loop-rotate with dominator-tree and
+; loop-info verification enabled. The header (%5) and the next block (%7) both
+; exit the loop, and the header PHI (%6) is used in %7.
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i8*, i8*, i64) {
+  %4 = and i32 undef, 7
+  br label %5
+
+; <label>:5:                                      ; preds = %10, %3
+  %6 = phi i32 [ 0, %10 ], [ %4, %3 ]
+  br i1 undef, label %11, label %7
+
+; <label>:7:                                      ; preds = %5
+  %8 = sub nsw i32 8, %6
+  br i1 undef, label %9, label %10
+
+; <label>:9:                                      ; preds = %7
+  unreachable
+
+; <label>:10:                                     ; preds = %7
+  store i32 undef, i32* undef, align 8
+  br label %5
+
+; <label>:11:                                     ; preds = %5
+  ret void
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-6.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-6.ll
@@ -0,0 +1,75 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @foo(i8*, i32)
+
+define i32 @bar(i8*, i32, i8*, i32) {
+  switch i32 %3, label %6 [
+    i32 0, label %25
+    i32 -1, label %5
+  ]
+
+; <label>:5:                                      ; preds = %4
+  unreachable
+
+; <label>:6:                                      ; preds = %4
+  br i1 undef, label %7, label %8
+
+; <label>:7:                                      ; preds = %6
+  br i1 undef, label %9, label %10
+
+; <label>:8:                                      ; preds = %6
+  unreachable
+
+; <label>:9:                                      ; preds = %7
+  unreachable
+
+; <label>:10:                                     ; preds = %7
+  br i1 undef, label %24, label %11
+
+; <label>:11:                                     ; preds = %10
+  switch i32 undef, label %12 [
+    i32 46, label %21
+    i32 32, label %21
+  ]
+
+; <label>:12:                                     ; preds = %11
+  unreachable
+
+; <label>:13:                                     ; preds = %21
+  switch i8 undef, label %14 [
+    i8 32, label %23
+    i8 46, label %23
+  ]
+
+; <label>:14:                                     ; preds = %13
+  br i1 undef, label %15, label %16
+
+; <label>:15:                                     ; preds = %14
+  unreachable
+
+; <label>:16:                                     ; preds = %14
+  br i1 undef, label %19, label %17
+
+; <label>:17:                                     ; preds = %16
+  %18 = call i32 @foo(i8* %22, i32 10) #4
+  unreachable
+
+; <label>:19:                                     ; preds = %16
+  %20 = phi i8* [ %22, %16 ]
+  br label %21
+
+; <label>:21:                                     ; preds = %19, %11, %11
+  %22 = phi i8* [ %20, %19 ], [ null, %11 ], [ null, %11 ]
+  br i1 undef, label %23, label %13
+
+; <label>:23:                                     ; preds = %21, %13, %13
+  unreachable
+
+; <label>:24:                                     ; preds = %10
+  unreachable
+
+; <label>:25:                                     ; preds = %4
+  ret i32 0
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-7.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-7.ll
@@ -0,0 +1,20 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @bar() {
+entry:
+  br i1 undef, label %return, label %for.cond65
+
+for.cond65:                                       ; preds = %entry, %for.body68
+  %n.0.pn = phi i8* [ %n.1, %for.body68 ], [ undef, %entry ]
+  %n.1 = getelementptr inbounds i8, i8* %n.0.pn, i64 -1
+  br i1 undef, label %for.body68, label %return
+
+for.body68:                                       ; preds = %for.cond65
+  %xor71 = xor i8 undef, -1
+  br label %for.cond65
+
+return:                                           ; preds = %for.cond65, %entry
+  ret i32 undef
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-8.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-8.ll
@@ -0,0 +1,49 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i8* @foo()
+
+define void @bar() #1 {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %return, label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  br i1 undef, label %for.body, label %return
+
+for.body:                                         ; preds = %for.cond
+  %call15 = call i8* @foo()
+  br i1 undef, label %if.end20, label %return
+
+if.end20:                                         ; preds = %for.body
+  %issuer.i = getelementptr inbounds i8, i8* %call15, i64 24
+  br i1 undef, label %if.then.i, label %for.cond.i
+
+if.then.i:                                        ; preds = %if.end20
+  br i1 undef, label %if.then23, label %crl
+
+for.cond.i:                                       ; preds = %if.end20
+  br i1 undef, label %for.body.i, label %for.inc
+
+for.body.i:                                       ; preds = %for.cond.i
+  unreachable
+
+crl:                    ; preds = %if.then.i
+  br i1 undef, label %if.then23, label %for.inc
+
+if.then23:                                        ; preds = %crl, %if.then.i
+  %reason = getelementptr inbounds i8, i8* %call15, i64 32
+  br label %return
+
+for.inc:                                          ; preds = %crl, %for.cond.i
+  br label %for.cond
+
+return:                                           ; preds = %if.then23, %for.body, %for.cond, %if.end
+  ret void
+}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-9.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-9.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info
+; ModuleID = 'bugpoint-reduced-instructions.bc'
+source_filename = "bugpoint-output-cebc3aa.bc"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @bar() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret i32 -1
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %if.then39, label %if.end48
+
+if.then39:                                        ; preds = %if.end
+  unreachable
+
+if.end48:                                         ; preds = %if.end
+  br i1 undef, label %if.else61, label %if.then59
+
+if.then59:                                        ; preds = %if.end48
+  unreachable
+
+if.else61:                                        ; preds = %if.end48
+  br i1 undef, label %if.then91, label %if.end92
+
+if.then91:                                        ; preds = %if.else61
+  unreachable
+
+if.end92:                                         ; preds = %if.else61
+  br i1 undef, label %if.then96, label %while.cond
+
+if.then96:                                        ; preds = %if.end92
+  unreachable
+
+while.cond:                                       ; preds = %if.end107, %if.end92
+  %len.3 = phi i64 [ %add109, %if.end107 ], [ 0, %if.end92 ]
+  br i1 undef, label %if.then106, label %if.end107
+
+if.then106:                                       ; preds = %while.cond
+  unreachable
+
+if.end107:                                        ; preds = %while.cond
+  %add109 = add i64 undef, %len.3
+  %sub111 = sub i64 undef, undef
+  br label %while.cond
+}
Index: llvm/test/Transforms/LoopRotate/multiple-exits-merge-phi.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/multiple-exits-merge-phi.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+source_filename = "bugpoint-output-fdebd3b.bc"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind readonly
+define void @test1() #0 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1, %entry
+  %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+  br i1 undef, label %for.cond1, label %return
+
+for.cond1:                                        ; preds = %land.rhs, %for.cond
+  %sum.1 = phi i32 [ 0, %land.rhs ], [ %sum.0, %for.cond ]
+  br i1 undef, label %land.rhs, label %for.cond
+
+land.rhs:                                         ; preds = %for.cond1
+  br i1 undef, label %return, label %for.cond1
+
+return:                                           ; preds = %land.rhs, %for.cond
+  %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+  ret void
+}
+
+attributes #0 = { nounwind readonly }
+
+; Check that a loop with multiple exits is rotated and has a preheader.
+; CHECK: entry:
+; CHECK: br label %for.cond.lr
+; CHECK: for.cond.loopexit:
+; CHECK:   br label %for.cond
+; CHECK: for.cond.lr:
+; CHECK:   br i1 false, label %for.cond1.preheader.lr.ph, label %return.loopexit1
+; CHECK: for.cond1.preheader.lr.ph:
+; CHECK:   br label %for.cond1.preheader
+; CHECK: for.cond1.preheader:
Index: llvm/test/Transforms/LoopRotate/multiple-exits.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/multiple-exits.ll
+++ llvm/test/Transforms/LoopRotate/multiple-exits.ll
@@ -3,7 +3,18 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-; PR7447
+; PR7447: there should be two loops rotated.
+; CHECK-LABEL: @test1
+
+; Check that the outer loop is rotated.
+; CHECK: for.cond.lr:
+; CHECK: for.cond1.preheader.lr.ph:
+
+; Check that the inner loop is rotated.
+; CHECK: for.cond1.lr:
+; CHECK: land.rhs.lr:
+; CHECK: land.rhs.for.cond1_crit_edge.lr.ph:
+
 define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
 entry:
   br label %for.cond
@@ -31,17 +42,6 @@
 return:                                           ; preds = %for.cond, %land.rhs
   %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
   ret i32 %retval.0
-
-; CHECK-LABEL: @test1(
-; CHECK: for.cond1.preheader:
-; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
-; CHECK: br label %for.cond1
-
-; CHECK: for.cond1:
-; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
-; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
-; CHECK: %cmp2 = icmp ult i32 %i.1, 100
-; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
 }
 
 define void @test2(i32 %x) nounwind {
@@ -75,9 +75,9 @@
 
 ; CHECK-LABEL: @test2(
 ; CHECK: if.end:
-; CHECK: %inc = add i32 %i.02, 1
-; CHECK: %cmp = icmp eq i32 %inc, %x
-; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+; CHECK: %inc = add i32 %phi.nh, 1
+; CHECK: %cmp = icmp eq i32 %i.0, %x
+; CHECK: br i1 %cmp, label %return.loopexit, label %for.body
 }
 
 declare i32 @foo(i32)
Index: llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
+++ llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
@@ -3,7 +3,8 @@
 target triple = "arm64-apple-ios8.0.0"
 
 ;CHECK: for.inc:
-;CHECK-NEXT: %incdec.ptr.i = getelementptr 
+;CHECK: %incdec.ptr.i = getelementptr
+;CHECK: br
 
 ; Function Attrs: alwaysinline inlinehint nounwind readonly ssp
 define linkonce_odr hidden i64 @_ZNSt3__14findINS_11__wrap_iterIPiEEiEET_S4_S4_RKT0_(i64 %__first.coerce, i64 %__last.coerce, i32* nocapture readonly dereferenceable(4) %__value_) {
Index: llvm/test/Transforms/LoopRotate/phi-duplicate.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/phi-duplicate.ll
+++ llvm/test/Transforms/LoopRotate/phi-duplicate.ll
@@ -31,10 +31,15 @@
 ; Should only end up with one phi.
 ; CHECK-LABEL:      define void @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   br label %for.body
+; CHECK-NEXT:   br label %for.cond.lr
 ; CHECK:      for.body:
-; CHECK-NEXT:   %j.01 = phi i64
-; CHECK-NOT:  br
-; CHECK:   br i1 %cmp, label %for.body, label %for.end
-; CHECK:      for.end:
-; CHECK-NEXT:        ret void
+; CHECK:  %phi.nh3 = phi i64
+; CHECK:  %phi.nh2 = phi i64
+; CHECK:  %phi.nh1 = phi i64
+; CHECK:  %phi.nh = phi i64
+; CHECK:  %inc = add nsw i64 %phi.nh, 1
+; CHECK:   br label %for.cond
+; CHECK: for.cond:
+; CHECK:   br i1 %cmp
+; CHECK: for.end:
+; CHECK-NEXT:  ret void
Index: llvm/test/Transforms/LoopRotate/pr7447.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/pr7447.ll
@@ -0,0 +1,34 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-rotate -stats 2>&1 | FileCheck %s
+
+; PR7447: there should be two loops rotated.
+; CHECK: 2 loop-rotate
+
+define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1, %entry
+  %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+  %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ]
+  br i1 %i.0, label %for.cond1, label %return
+
+for.cond1:                                        ; preds = %for.cond, %land.rhs
+  %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ]
+  %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ]
+  %cmp2 = icmp ult i32 %i.1, 100
+  br i1 %cmp2, label %land.rhs, label %for.cond
+
+land.rhs:                                         ; preds = %for.cond1
+  %conv = zext i32 %i.1 to i64
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %a, i64 0, i64 %conv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum.1
+  %cmp4 = icmp ugt i32 %add, 1000
+  %inc = add i32 %i.1, 1
+  br i1 %cmp4, label %return, label %for.cond1
+
+return:                                           ; preds = %for.cond, %land.rhs
+  %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+  ret i32 %retval.0
+}
Index: llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
+++ llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
@@ -16,22 +16,28 @@
 ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER:[^,]*]], label %outer.body
 
 ; CHECK: [[INNER_PREROTATE_PREHEADER]]:
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER_SPLIT_RETURN:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]]
+; CHECK: br label %inner.header.lr
+
+; CHECK: inner.header.lr:
+; CHECK: br i1 true, label %return, label %inner.body.lr
+
+; CHECK: inner.body.lr:
+; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]]
 
 ; CHECK: [[INNER_ROTATED_PREHEADER]]:
-; CHECK-NEXT: br label %inner.body
+; CHECK-NEXT: br label %inner.latch
 
 inner.header:
 ; Now the latch!
 ; CHECK: inner.header:
   br i1 undef, label %return, label %inner.body
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_RETURN:[^,]*]], label %inner.body
+; CHECK-NEXT: br i1 {{[^,]*}}, label %return, label %inner.body
 
 inner.body:
 ; Now the header!
 ; CHECK: inner.body:
   br i1 undef, label %outer.latch, label %inner.latch
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_OUTER_LATCH:[^,]*]], label %inner.header
+; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT]], label %inner.latch
 
 inner.latch:
 ; Dead!
@@ -42,8 +48,7 @@
   br label %outer.latch
 ; CHECK-NEXT: br label %outer.latch
 
-; L2 -> L1 exit edge needs a simplified exit block.
-; CHECK: [[INNER_SPLIT_OUTER_LATCH]]:
+; CHECK: [[OUTER_LATCH_LOOPEXIT]]:
 ; CHECK-NEXT: br label %outer.latch
 
 outer.latch:
@@ -51,14 +56,6 @@
   br label %outer.header
 ; CHECK-NEXT: br label %outer.header
 
-; L1 -> L0 exit edge need sa simplified exit block.
-; CHECK: [[INNER_PREROTATE_PREHEADER_SPLIT_RETURN]]:
-; CHECK-NEXT: br label %return
-
-; L2 -> L0 exit edge needs a simplified exit block.
-; CHECK: [[INNER_SPLIT_RETURN]]:
-; CHECK-NEXT: br label %return
-
 return:
 ; CHECK: return:
   unreachable
Index: llvm/test/Transforms/LoopRotate/simplifylatch.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/simplifylatch.ll
+++ llvm/test/Transforms/LoopRotate/simplifylatch.ll
@@ -3,8 +3,13 @@
 
 @mode_table = global [4 x i32] zeroinitializer		; <[4 x i32]*> [#uses=1]
 
+; Check that a loop with multiple exits is rotated.
 ; CHECK-LABEL: @f(
-; CHECK-NOT: bb:
+; CHECK: bb.lr:
+; CHECK: bb2.lr:
+; CHECK: bb4.lr.ph:
+; CHECK: bb:
+
 define i8 @f() {
 entry:
 	tail call i32 @fegetround( )		; <i32>:0 [#uses=1]
@@ -38,11 +43,21 @@
 
 declare void @raise_exception() noreturn
 
-;CHECK: for.body.lr.ph:
-;CHECK-NEXT:  %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0
-;CHECK-NEXT:  %0 = load i8, i8* %arrayidx1, align 1
-;CHECK-NEXT:  %conv2 = sext i8 %0 to i32
-;CHECK-NEXT:  br label %for.body
+;CHECK-LABEL: @foo(
+;CHECK: for.body.lr:
+;CHECK: %arrayidx.lr = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom.lr
+;CHECK: %0 = load i8, i8* %arrayidx.lr, align 1
+;CHECK: %conv.lr = sext i8 %0 to i32
+;CHECK: %arrayidx1.lr = getelementptr inbounds i8, i8* %CurPtr, i64 0
+;CHECK: %1 = load i8, i8* %arrayidx1.lr, align 1
+;CHECK: %conv2.lr = sext i8 %1 to i32
+;CHECK: for.body:
+;CHECK: %arrayidx = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom
+;CHECK: %2 = load i8, i8* %arrayidx, align 1
+;CHECK: %conv = sext i8 %2 to i32
+;CHECK: %3 = load i8, i8* %arrayidx1, align 1
+;CHECK: %conv2 = sext i8 %3 to i32
+;CHECK: br i1 %cmp3, label %return, label %for.inc
 
 define i32 @foo(i8* %CurPtr, i32 %a) #0 {
 entry:
Index: llvm/test/Transforms/LoopSimplify/ashr-crash.ll
===================================================================
--- llvm/test/Transforms/LoopSimplify/ashr-crash.ll
+++ llvm/test/Transforms/LoopSimplify/ashr-crash.ll
@@ -29,9 +29,9 @@
 ; CHECK-LABEL: entry:
 ; CHECK-LABEL: for.cond1.preheader:
 ; CHECK-LABEL: for.body3:
-; CHECK: %cmp4.le.le
-; CHECK: %conv.le.le = zext i1 %cmp4.le.le to i32
-; CHECK: %xor.le.le = xor i32 %conv6.le.le, 1
+; CHECK: %cmp4
+; CHECK: %conv = zext i1 %cmp4 to i32
+; CHECK: %xor = xor i32 %conv6, 1
 define void @foo() {
 entry:
   br label %for.cond
Index: llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -4,11 +4,11 @@
 ; Loop from "rotated"
 ; CHECK: LV: Loop hints: force=enabled
 ; Loop from "nonrotated"
-; CHECK: LV: Loop hints: force=enabled
-; No more loops in the module
 ; CHECK-NOT: LV: Loop hints: force=
-; In total only 1 loop should be rotated.
-; CHECK: 1 loop-rotate
+; Check the stats for the function @nonrotated.
+; CHECK: 2 gvn{{.*}}Number of equalities propagated
+; With the new loop rotation, both loops should be rotated.
+; CHECK: 2 loop-rotate
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"