Index: llvm/include/llvm/Analysis/LoopInfo.h
===================================================================
--- llvm/include/llvm/Analysis/LoopInfo.h
+++ llvm/include/llvm/Analysis/LoopInfo.h
@@ -441,7 +441,9 @@
   /// header then the node is returned. If any latch instruction does not
   /// contain llvm.loop or or if multiple latches contain different nodes then
   /// 0 is returned.
-  MDNode *getLoopID() const;
+  std::pair<MDNode *, Instruction *> getLoopIDWithInstr() const;
+  MDNode *getLoopID() const { return getLoopIDWithInstr().first; }
+
   /// Set the llvm.loop loop id metadata for this loop.
   ///
   /// The LoopID metadata node will be added to each terminator instruction in
Index: llvm/include/llvm/IR/Instruction.h
===================================================================
--- llvm/include/llvm/IR/Instruction.h
+++ llvm/include/llvm/IR/Instruction.h
@@ -188,6 +188,10 @@
       getAllMetadataOtherThanDebugLocImpl(MDs);
   }
 
+  /// Get all the metadata IDs attached to this Instruction except the ones
+  /// associated with debug location. The Vector \p MDIDs is not cleared.
+  void getAllNonDebugMetadataIDs(SmallVectorImpl<unsigned> &MDIDs) const;
+
   /// Fills the AAMDNodes structure with AA metadata from this instruction.
   /// When Merge is true, the existing AA metadata is merged with that from this
   /// instruction providing the most-general result.
Index: llvm/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar.h
+++ llvm/include/llvm/Transforms/Scalar.h
@@ -183,7 +183,7 @@
 //
 // LoopRotate - This pass is a simple loop rotating pass.
 //
-Pass *createLoopRotatePass(int MaxHeaderSize = -1);
+Pass *createLoopRotatePass();
 
 //===----------------------------------------------------------------------===//
 //
Index: llvm/lib/Analysis/LoopInfo.cpp
===================================================================
--- llvm/lib/Analysis/LoopInfo.cpp
+++ llvm/lib/Analysis/LoopInfo.cpp
@@ -203,10 +203,12 @@
   return true;
 }
 
-MDNode *Loop::getLoopID() const {
+std::pair<MDNode *, Instruction *> Loop::getLoopIDWithInstr() const {
   MDNode *LoopID = nullptr;
+  Instruction *I = nullptr;
   if (isLoopSimplifyForm()) {
-    LoopID = getLoopLatch()->getTerminator()->getMetadata(LLVMContext::MD_loop);
+    I = getLoopLatch()->getTerminator();
+    LoopID = I->getMetadata(LLVMContext::MD_loop);
   } else {
     // Go through each predecessor of the loop header and check the
     // terminator for the metadata.
@@ -218,17 +220,19 @@
 
       TerminatorInst *TI = BB->getTerminator();
       if (MDNode *MD = TI->getMetadata(LLVMContext::MD_loop)) {
-        if (!LoopID)
+        if (!LoopID) {
           LoopID = MD;
+          I = TI;
+        }
         else if (MD != LoopID) // Multiple MD_loop found => corrupt metadata.
-          return nullptr;
+          return { nullptr, nullptr };
       }
     }
   }
   if (!LoopID || LoopID->getNumOperands() == 0 ||
       LoopID->getOperand(0) != LoopID)
-    return nullptr;
-  return LoopID;
+    return { nullptr, nullptr };
+  return { LoopID, I };
 }
 
 void Loop::setLoopID(MDNode *LoopID) const {
Index: llvm/lib/IR/LLVMContextImpl.h
===================================================================
--- llvm/lib/IR/LLVMContextImpl.h
+++ llvm/lib/IR/LLVMContextImpl.h
@@ -993,6 +993,7 @@
   /// Copies all the current attachments into \c Result, sorting by attachment
   /// ID.  This function does \em not clear \c Result.
   void getAll(SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const;
+  void getAllIDs(SmallVectorImpl<unsigned> &Result) const;
 
   /// \brief Erase matching attachments.
   ///
Index: llvm/lib/IR/Metadata.cpp
===================================================================
--- llvm/lib/IR/Metadata.cpp
+++ llvm/lib/IR/Metadata.cpp
@@ -1106,6 +1106,11 @@
     array_pod_sort(Result.begin(), Result.end());
 }
 
+void MDAttachmentMap::getAllIDs(SmallVectorImpl<unsigned> &Result) const {
+  for (auto &A : Attachments)
+    Result.push_back(A.first);
+}
+
 void MDGlobalAttachmentMap::insert(unsigned ID, MDNode &MD) {
   Attachments.push_back({ID, TrackingMDNodeRef(&MD)});
 }
@@ -1154,14 +1159,13 @@
 }
 
 void Instruction::dropUnknownNonDebugMetadata(ArrayRef<unsigned> KnownIDs) {
-  SmallSet<unsigned, 5> KnownSet;
-  KnownSet.insert(KnownIDs.begin(), KnownIDs.end());
-
   if (!hasMetadataHashEntry())
     return; // Nothing to remove!
 
   auto &InstructionMetadata = getContext().pImpl->InstructionMetadata;
 
+  SmallSet<unsigned, 4> KnownSet;
+  KnownSet.insert(KnownIDs.begin(), KnownIDs.end());
   if (KnownSet.empty()) {
     // Just drop our entry at the store.
     InstructionMetadata.erase(this);
@@ -1270,6 +1274,15 @@
   Info.getAll(Result);
 }
 
+void
+Instruction::getAllNonDebugMetadataIDs(SmallVectorImpl<unsigned> &MDIDs) const {
+  if (!hasMetadataOtherThanDebugLoc())
+    return;
+  const auto &Info = getContext().pImpl->InstructionMetadata.find(this)->second;
+  assert(!Info.empty() && "Shouldn't have called this");
+  Info.getAllIDs(MDIDs);
+}
+
 bool Instruction::extractProfMetadata(uint64_t &TrueVal,
                                       uint64_t &FalseVal) const {
   assert(
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -303,8 +303,10 @@
   MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   MPM.add(createReassociatePass());           // Reassociate expressions
-  // Rotate Loop - disable header duplication at -Oz
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+
+  // Rotate Loop - disable loop rotation at -Oz
+  if (SizeLevel != 2)
+    MPM.add(createLoopRotatePass());
   MPM.add(createLICMPass());                  // Hoist loop invariants
   MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
   MPM.add(createCFGSimplificationPass());
@@ -545,7 +547,8 @@
   // Re-rotate loops in all our loop nests. These may have fallout out of
   // rotated form due to GVN or other transformations, and the vectorizer relies
   // on the rotated form. Disable header duplication at -Oz.
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+  if (SizeLevel != 2)
+    MPM.add(createLoopRotatePass());
 
   // Distribute loops to allow partial vectorization.  I.e. isolate dependences
   // into separate loop that would otherwise inhibit vectorization.  This is
Index: llvm/lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -7,18 +7,23 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements Loop Rotation Pass.
+// This file implements Loop Rotation Pass:
+// 1. Canonicalize loop latch to have only one successor.
+// 2. Clone all the BBs which are exiting the loop.
+// 3. Adjust phi of cloned BBs
+// 4. Add phi to the new loop header
+// 5. Update DT
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/LoopRotation.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/CodeMetrics.h"
-#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopPass.h"
 #include "llvm/Analysis/LoopPassManager.h"
 #include "llvm/Analysis/ScalarEvolution.h"
@@ -30,592 +35,620 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+
+#include<tuple>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "loop-rotate"
 
-static cl::opt<unsigned> DefaultRotationThreshold(
-    "rotation-max-header-size", cl::init(16), cl::Hidden,
-    cl::desc("The default maximum header size for automatic loop rotation"));
+static cl::opt<unsigned> RotationMaxSize(
+    "rotation-max-size", cl::init(100), cl::Hidden,
+    cl::desc("The default maximum loop size for automatic loop rotation"));
+
+static cl::opt<unsigned> MaxExits("lr-max-exits", cl::init(10), cl::Hidden,
+                                  cl::desc("The maximum exits to be cloned"));
+
+static cl::opt<unsigned> RotationsMaxAllowed(
+    "lr-max-allowed", cl::init(1), cl::Hidden,
+    cl::desc("The maximum rotations of a loop"));
 
 STATISTIC(NumRotated, "Number of loops rotated");
 
-namespace {
+// Insert \p NewBB in between \p PredBefore and \p Succ, and redirect edges
+// accordingly. PredBefore -> NewBB -> Succ. Also move NewBB before Succ.
+static void insertBetween(BasicBlock *NewBB, BasicBlock *PredBefore,
+                          BasicBlock *Succ) {
+  BranchInst *NewBI = BranchInst::Create(Succ, NewBB);
+  NewBI->setDebugLoc(PredBefore->getTerminator()->getDebugLoc());
+
+  BranchInst *BLI = dyn_cast<BranchInst>(PredBefore->getTerminator());
+  for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I)
+    if (BLI->getSuccessor(I) == Succ) {
+      BLI->setSuccessor(I, NewBB);
+      break;
+    }
+  // Move NewBB physically from the end of the block list.
+  Function *F = Succ->getParent();
+  F->getBasicBlockList().splice(Succ->getIterator(), F->getBasicBlockList(),
+                                NewBB);
+}
+
+// Remove the arguments of all phi nodes in PhiBB coming from block From.
+static void discardIncomingValues(BasicBlock *PhiBB, BasicBlock *From) {
+  for (Instruction &I : *PhiBB) {
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+    PN->removeIncomingValue(PN->getBasicBlockIndex(From));
+  }
+}
+
+/// Returns true of the region formed by [Entry, Exit] is a
+/// single-entry-single-exit (SESE) region. All the traces from \p Entry to
+/// \p Exit should be dominated by \p Entry and post-dominated by \p Exit.
+static bool isSingleEntrySingleExit(const BasicBlock *Entry,
+                                    const BasicBlock *Exit,
+                                    DominatorTree *DT, DominatorTree *PDT) {
+  if (!DT->dominates(Entry, Exit))
+    return false;
+
+  if (!PDT->dominates(Exit, Entry))
+    return false;
+
+  for (auto I = df_begin(Entry), E = df_end(Entry); I != E;) {
+    if (*I == Exit) {
+      I.skipChildren();
+      continue;
+    }
+    if (!DT->dominates(Entry, *I))
+      return false;
+    ++I;
+  }
+  return true;
+}
+
+/// Returns true of the region formed by [Entry, Exit] is a
+/// single-entry-multiple-exit (SEME) region. All the traces from \p Entry
+/// which leads to the \p Exit are analyzed.
+static bool isSingleEntryMultipleExit(const BasicBlock *Entry,
+                                      const BasicBlock *Exit,
+                                      DominatorTree *DT) {
+  if (!DT->dominates(Entry, Exit))
+    return false;
+
+  for (auto I = idf_begin(Exit), E = idf_end(Exit); I != E;) {
+    if (*I == Entry) {
+      I.skipChildren();
+      continue;
+    }
+
+    if (!DT->dominates(Entry, *I))
+      return false;
+
+    ++I;
+  }
+  return true;
+}
+
 /// A simple loop rotation transformation.
 class LoopRotate {
-  const unsigned MaxHeaderSize;
   LoopInfo *LI;
   const TargetTransformInfo *TTI;
   AssumptionCache *AC;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  Loop *L;
+  const char *LoopRotatedTimes = "llvm.loop.rotated.times";
+  typedef SmallVectorImpl<BasicBlock *> SmallVecBB;
+  typedef SmallPtrSetImpl<BasicBlock *> SmallPtrSetBB;
 
 public:
-  LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
-             const TargetTransformInfo *TTI, AssumptionCache *AC,
-             DominatorTree *DT, ScalarEvolution *SE)
-      : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) {
-  }
-  bool processLoop(Loop *L);
+  LoopRotate(LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC,
+             DominatorTree *DT, ScalarEvolution *SE, Loop *L)
+      : LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), L(L) {}
+  bool processLoop();
 
 private:
-  bool rotateLoop(Loop *L, bool SimplifiedLatch);
-  bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader.  If there were uses of the values produced by
-/// these instruction that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values.  Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
-                                            BasicBlock *OrigPreheader,
-                                            ValueToValueMapTy &ValueMap) {
-  // Remove PHI node entries that are no longer live.
-  BasicBlock::iterator I, E = OrigHeader->end();
-  for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
-  // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
-  // as necessary.
-  SSAUpdater SSA;
-  for (I = OrigHeader->begin(); I != E; ++I) {
-    Value *OrigHeaderVal = &*I;
-
-    // If there are no uses of the value (e.g. because it returns void), there
-    // is nothing to rewrite.
-    if (OrigHeaderVal->use_empty())
-      continue;
+  void rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks,
+                  const SmallPtrSetBB &Exits) const;
 
-    Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
-    // The value now exits in two versions: the initial value in the preheader
-    // and the loop "next" value in the original header.
-    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
-    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
-    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
-    // Visit each use of the OrigHeader instruction.
-    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
-                             UE = OrigHeaderVal->use_end();
-         UI != UE;) {
-      // Grab the use before incrementing the iterator.
-      Use &U = *UI;
-
-      // Increment the iterator before removing the use from the list.
-      ++UI;
-
-      // SSAUpdater can't handle a non-PHI use in the same block as an
-      // earlier def. We can easily handle those cases manually.
-      Instruction *UserInst = cast<Instruction>(U.getUser());
-      if (!isa<PHINode>(UserInst)) {
-        BasicBlock *UserBB = UserInst->getParent();
-
-        // The original users in the OrigHeader are already using the
-        // original definitions.
-        if (UserBB == OrigHeader)
-          continue;
-
-        // Users in the OrigPreHeader need to use the value to which the
-        // original definitions are mapped.
-        if (UserBB == OrigPreheader) {
-          U = OrigPreHeaderVal;
-          continue;
-        }
-      }
+  bool isProfitableToRotate(const SmallVecBB &Blocks,
+                            const SmallPtrSetBB &Exits) const;
 
-      // Anything else can be handled by SSAUpdater.
-      SSA.RewriteUse(U);
-    }
+  bool isLegalToRotate() const;
 
-    // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
-    // intrinsics.
-    LLVMContext &C = OrigHeader->getContext();
-    if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) {
-      if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) {
-        for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) {
-          // Grab the use before incrementing the iterator. Otherwise, altering
-          // the Use will invalidate the iterator.
-          Use &U = *UI++;
-          DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser());
-          if (!UserInst)
-            continue;
+  void adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH,
+                           BasicBlock *NewPH) const;
 
-          // The original users in the OrigHeader are already using the original
-          // definitions.
-          BasicBlock *UserBB = UserInst->getParent();
-          if (UserBB == OrigHeader)
-            continue;
+  BasicBlock *collectSEMEBlocks(BasicBlock *OrigH, BasicBlock *OrigLatch,
+                                SmallVecBB &Blocks, SmallPtrSetBB &Exits) const;
 
-          // Users in the OrigPreHeader need to use the value to which the
-          // original definitions are mapped and anything else can be handled by
-          // the SSAUpdater. To avoid adding PHINodes, check if the value is
-          // available in UserBB, if not substitute undef.
-          Value *NewVal;
-          if (UserBB == OrigPreheader)
-            NewVal = OrigPreHeaderVal;
-          else if (SSA.HasValueForBlock(UserBB))
-            NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
-          else
-            NewVal = UndefValue::get(OrigHeaderVal->getType());
-          U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal));
-        }
-      }
-    }
-  }
-}
+  void addNewPhisToNewHeader(const SmallVecBB &Blocks, BasicBlock *NewHeader,
+                             BasicBlock *NewPreheader, BasicBlock *NewLatch,
+                             ValueToValueMapTy &VMap) const;
+};
 
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+// Check if there is something preventing loop from being rotated.
+bool LoopRotate::isLegalToRotate() const {
   // If the loop has only one block then there is not much to rotate.
   if (L->getBlocks().size() == 1)
     return false;
 
-  BasicBlock *OrigHeader = L->getHeader();
-  BasicBlock *OrigLatch = L->getLoopLatch();
+  const BasicBlock *LoopLatch = L->getLoopLatch();
+  if (!LoopLatch || isa<IndirectBrInst>(LoopLatch->getTerminator()))
+    return false;
 
-  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
-  if (!BI || BI->isUnconditional())
+  const BasicBlock *OrigH = L->getHeader();
+  // The header may have invoke instruction to BBs within the loop.
+  const BranchInst *BI = dyn_cast<const BranchInst>(OrigH->getTerminator());
+  if (!BI || !BI->isConditional())
     return false;
 
   // If the loop header is not one of the loop exiting blocks then
   // either this loop is already rotated or it is not
   // suitable for loop rotation transformations.
-  if (!L->isLoopExiting(OrigHeader))
+  if (!L->isLoopExiting(OrigH))
     return false;
 
-  // If the loop latch already contains a branch that leaves the loop then the
-  // loop is already rotated.
-  if (!OrigLatch)
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (!L->getLoopPreheader())
     return false;
 
-  // Rotate if either the loop latch does *not* exit the loop, or if the loop
-  // latch was just simplified.
-  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+  if (!isSingleEntryMultipleExit(OrigH, LoopLatch, DT))
     return false;
 
+  return true;
+}
+
+// Check if the size of rotated loop will be within limits.
+bool LoopRotate::isProfitableToRotate(const SmallVecBB &Blocks,
+                                      const SmallPtrSetBB &Exits) const {
   // Check size of original header and reject loop if it is very big or we can't
   // duplicate blocks inside it.
-  {
-    SmallPtrSet<const Value *, 32> EphValues;
-    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
+  if (Exits.size() > MaxExits)
+    return false;
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+  unsigned LoopSize = 0;
+  for (const BasicBlock *BB : Blocks) {
     CodeMetrics Metrics;
-    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+    Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+    // TODO: Modify this because there might be blocks with indirectbr, invoke
+    // in the loop but we can cut the cloning part at that point and that will
+    // be the last exit BB.
     if (Metrics.notDuplicatable) {
       DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
                    << " instructions: ";
             L->dump());
       return false;
     }
+
     if (Metrics.convergent) {
       DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
                       "instructions: ";
             L->dump());
       return false;
     }
-    if (Metrics.NumInsts > MaxHeaderSize)
+
+    // TODO: Once this is crossed we can copy until the previous BB.
+    if (LoopSize + Metrics.NumInsts >= RotationMaxSize)
       return false;
+    LoopSize += Metrics.NumInsts;
   }
+  return true;
+}
 
-  // Now, this loop is suitable for rotation.
-  BasicBlock *OrigPreheader = L->getLoopPreheader();
+// Add phis to the new header and adjust the phi nodes from the OrigHeader.
+void LoopRotate::addNewPhisToNewHeader(const SmallVecBB &Blocks,
+                                       BasicBlock *NewHeader,
+                                       BasicBlock *NewPreheader,
+                                       BasicBlock *NewLatch,
+                                       ValueToValueMapTy &VMap) const {
+  // Add to NewHeader phi nodes for all copied variables which are used.
+  for (BasicBlock *BB : Blocks) {
+    for (Instruction &Inst : *BB) {
+      // Skip Ins with no use e.g., branches.
+      if (Inst.use_begin() == Inst.use_end())
+        continue;
 
-  // If the loop could not be converted to canonical form, it must have an
-  // indirectbr in it, just give up.
-  if (!OrigPreheader)
-    return false;
+      for (auto UI = Inst.use_begin(), E = Inst.use_end(); UI != E;) {
+        Use &U = *UI++;
+        Instruction *UserInst = cast<Instruction>(U.getUser());
 
-  // Anything ScalarEvolution may know about this loop or the PHI nodes
-  // in its header will soon be invalidated.
-  if (SE)
-    SE->forgetLoop(L);
+        // Nothing to rename when the use is dominated by the definition.
+        if (DT->dominates(&Inst, UserInst))
+          continue;
 
-  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+        if (!L->contains(UserInst->getParent())) {
+          // Handle uses in the loop-closed-phi.
+          PHINode *ClosePhi = cast<PHINode>(UserInst);
+          BasicBlock *Pred = ClosePhi->getIncomingBlock(U.getOperandNo());
 
-  // Find new Loop header. NewHeader is a Header's one and only successor
-  // that is inside loop.  Header's other successor is outside the
-  // loop.  Otherwise loop is not suitable for rotation.
-  BasicBlock *Exit = BI->getSuccessor(0);
-  BasicBlock *NewHeader = BI->getSuccessor(1);
-  if (L->contains(Exit))
-    std::swap(Exit, NewHeader);
-  assert(NewHeader && "Unable to determine new loop header");
-  assert(L->contains(NewHeader) && !L->contains(Exit) &&
-         "Unable to determine loop header and exit blocks");
-
-  // This code assumes that the new header has exactly one predecessor.
-  // Remove any single-entry PHI nodes in it.
-  assert(NewHeader->getSinglePredecessor() &&
-         "New header doesn't have one pred!");
-  FoldSingleEntryPHINodes(NewHeader);
-
-  // Begin by walking OrigHeader and populating ValueMap with an entry for
-  // each Instruction.
-  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
-  ValueToValueMapTy ValueMap;
-
-  // For PHI nodes, the value available in OldPreHeader is just the
-  // incoming value from OldPreHeader.
-  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
-
-  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
-  // For the rest of the instructions, either hoist to the OrigPreheader if
-  // possible or create a clone in the OldPreHeader if not.
-  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
-  while (I != E) {
-    Instruction *Inst = &*I++;
-
-    // If the instruction's operands are invariant and it doesn't read or write
-    // memory, then it is safe to hoist.  Doing this doesn't change the order of
-    // execution in the preheader, but does prevent the instruction from
-    // executing in each iteration of the loop.  This means it is safe to hoist
-    // something that might trap, but isn't safe to hoist something that reads
-    // memory (without proving that the loop doesn't write).
-    if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
-        !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
-        !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
-      Inst->moveBefore(LoopEntryBranch);
-      continue;
-    }
+          // Do not rename a loop close phi node if its predecessor in the loop
+          // is dominated by Inst.
+          if (L->contains(Pred) && DT->dominates(BB, Pred))
+            continue;
+        }
+
+        PHINode *PN =
+            PHINode::Create(Inst.getType(), 2, "phi.nh", &*NewHeader->begin());
+        PN->addIncoming(&Inst, NewLatch);
+        PN->addIncoming(cast<Instruction>(VMap[&Inst]), NewPreheader);
 
-    // Otherwise, create a duplicate of the instruction.
-    Instruction *C = Inst->clone();
-
-    // Eagerly remap the operands of the instruction.
-    RemapInstruction(C, ValueMap,
-                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
-    // With the operands remapped, see if the instruction constant folds or is
-    // otherwise simplifyable.  This commonly occurs because the entry from PHI
-    // nodes allows icmps and other instructions to fold.
-    // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
-    Value *V = SimplifyInstruction(C, DL);
-    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
-      // If so, then delete the temporary instruction and stick the folded value
-      // in the map.
-      ValueMap[Inst] = V;
-      if (!C->mayHaveSideEffects()) {
-        delete C;
-        C = nullptr;
+        // When Inst does not dominate U, it is going to use the updated
+        // definition coming from PN.
+        U.set(PN);
       }
-    } else {
-      ValueMap[Inst] = C;
-    }
-    if (C) {
-      // Otherwise, stick the new instruction into the new block!
-      C->setName(Inst->getName());
-      C->insertBefore(LoopEntryBranch);
     }
   }
+}
+
+// Add incoming values to the (already present) PHIs of NewH.
+void LoopRotate::adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH,
+                                     BasicBlock *NewPH) const {
+  for (Instruction &Inst : *NewH) {
+    PHINode *PN = dyn_cast<PHINode>(&Inst);
+    if (!PN)
+      break;
+    assert((PN->getNumOperands() == 1) && "NewH had multiple predecessors.");
+    Value *Op = PN->getIncomingValue(0);
+    if (Value *RenamedVal = VMap[Op])
+      PN->addIncoming(RenamedVal, NewPH);
+    else // When no mapping is available (e.g., in case of a constant).
+      PN->addIncoming(Op, NewPH);
+  }
+}
 
-  // Along with all the other instructions, we just cloned OrigHeader's
-  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
-  // successors by duplicating their incoming values for OrigHeader.
-  TerminatorInst *TI = OrigHeader->getTerminator();
-  for (BasicBlock *SuccBB : TI->successors())
-    for (BasicBlock::iterator BI = SuccBB->begin();
-         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
-      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
-  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
-  // OrigPreHeader's old terminator (the original branch into the loop), and
-  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
-  LoopEntryBranch->eraseFromParent();
-
-  // If there were any uses of instructions in the duplicated block outside the
-  // loop, update them, inserting PHI nodes as required
-  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
-
-  // NewHeader is now the header of the loop.
-  L->moveToHeader(NewHeader);
-  assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
-  // At this point, we've finished our major CFG changes.  As part of cloning
-  // the loop into the preheader we've simplified instructions and the
-  // duplicated conditional branch may now be branching on a constant.  If it is
-  // branching on a constant and if that constant means that we enter the loop,
-  // then we fold away the cond branch to an uncond branch.  This simplifies the
-  // loop in cases important for nested loops, and it also means we don't have
-  // to split as many edges.
-  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
-  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
-  if (!isa<ConstantInt>(PHBI->getCondition()) ||
-      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
-          NewHeader) {
-    // The conditional branch can't be folded, handle the general case.
-    // Update DominatorTree to reflect the CFG change we just made.  Then split
-    // edges as necessary to preserve LoopSimplify form.
-    if (DT) {
-      // Everything that was dominated by the old loop header is now dominated
-      // by the original loop preheader. Conceptually the header was merged
-      // into the preheader, even though we reuse the actual block as a new
-      // loop latch.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
-                                                   OrigHeaderNode->end());
-      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
-      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
-        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
-
-      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
-      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
-
-      // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+// OrigH and OrigLatch represent an SEME region.  Collect all BasicBlocks
+// (bounded by \p OrigH and \p OrigLatch), which are exiting the loop in
+// \p Blocks and collect all the exits from the region in \p Exits Returns the
+// first basic block which was not copied.
+BasicBlock *LoopRotate::collectSEMEBlocks(BasicBlock *OrigH,
+                                          BasicBlock *OrigLatch,
+                                          SmallVecBB &Blocks,
+                                          SmallPtrSetBB &Exits) const {
+  BasicBlock *NewH = nullptr;
+  for (auto BB = df_begin(OrigH), E = df_end(OrigH); BB != E;) {
+    if (!L->contains(*BB)) {
+      BB.skipChildren();
+      continue;
     }
 
-    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
-    // thus is not a preheader anymore.
-    // Split the edge to form a real preheader.
-    BasicBlock *NewPH = SplitCriticalEdge(
-        OrigPreheader, NewHeader,
-        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
-    NewPH->setName(NewHeader->getName() + ".lr.ph");
-
-    // Preserve canonical loop form, which means that 'Exit' should have only
-    // one predecessor. Note that Exit could be an exit block for multiple
-    // nested loops, causing both of the edges to now be critical and need to
-    // be split.
-    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
-    bool SplitLatchEdge = false;
-    for (BasicBlock *ExitPred : ExitPreds) {
-      // We only need to split loop exit edges.
-      Loop *PredLoop = LI->getLoopFor(ExitPred);
-      if (!PredLoop || PredLoop->contains(Exit))
-        continue;
-      if (isa<IndirectBrInst>(ExitPred->getTerminator()))
-        continue;
-      SplitLatchEdge |= L->getLoopLatch() == ExitPred;
-      BasicBlock *ExitSplit = SplitCriticalEdge(
-          ExitPred, Exit,
-          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
-      ExitSplit->moveBefore(Exit);
+    // Copy until any BB where the branch does not exit loop, or the loop-latch.
+    if (OrigLatch == *BB || !L->isLoopExiting(*BB) ||
+        !isa<BranchInst>((*BB)->getTerminator())) {
+      // This will become the new header.
+      NewH = *BB;
+      BB.skipChildren();
+    } else {
+      Blocks.push_back(*BB);
+
+      BranchInst *BI = cast<BranchInst>((*BB)->getTerminator());
+      for (unsigned B = 0, E = BI->getNumSuccessors(); B < E; ++B) {
+        BasicBlock *Succ = BI->getSuccessor(B);
+        if (!L->contains(Succ))
+          Exits.insert(Succ);
+      }
+      ++BB;
     }
-    assert(SplitLatchEdge &&
-           "Despite splitting all preds, failed to split latch exit?");
-  } else {
-    // We can fold the conditional branch in the preheader, this makes things
-    // simpler. The first step is to remove the extra edge to the Exit block.
-    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
-    NewBI->setDebugLoc(PHBI->getDebugLoc());
-    PHBI->eraseFromParent();
-
-    // With our CFG finalized, update DomTree if it is available.
-    if (DT) {
-      // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(NewHeader, OrigPreheader);
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
-
-      // Brute force incremental dominator tree update. Call
-      // findNearestCommonDominator on all CFG predecessors of each child of the
-      // original header.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
-                                                   OrigHeaderNode->end());
-      bool Changed;
-      do {
-        Changed = false;
-        for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
-          DomTreeNode *Node = HeaderChildren[I];
-          BasicBlock *BB = Node->getBlock();
-
-          pred_iterator PI = pred_begin(BB);
-          BasicBlock *NearestDom = *PI;
-          for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
-            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
-
-          // Remember if this changes the DomTree.
-          if (Node->getIDom()->getBlock() != NearestDom) {
-            DT->changeImmediateDominator(BB, NearestDom);
-            Changed = true;
-          }
-        }
+  }
+  return NewH;
+}
 
-        // If the dominator changed, this may have an effect on other
-        // predecessors, continue until we reach a fixpoint.
-      } while (Changed);
+// Helper function for copySEME. Adjusts the PHIs of all the \p Exits bounding
+// an SEME. \p VMap contains mapping of original BB vs copied BB.
+static void adjustExitingPhis(ValueToValueMapTy &VMap,
+                              const SmallPtrSetImpl<BasicBlock *> &Exits) {
+  for (BasicBlock *BB : Exits) {
+    for (Instruction &Inst : *BB) {
+      PHINode *PN = dyn_cast<PHINode>(&Inst);
+      if (!PN)
+        break;
+      bool EdgeFromOrigBB = false;
+      for (unsigned i = 0, e = PN->getNumOperands(); i != e; ++i) {
+        Value *CopyB = VMap[PN->getIncomingBlock(i)];
+        if (!CopyB) // Skip args coming from outside the SEME.
+          continue;
+        BasicBlock *CopyBB = cast<BasicBlock>(CopyB);
+        EdgeFromOrigBB = true;
+        Value *Op = PN->getIncomingValue(i);
+        if (Value *RenamedVal = VMap[Op])
+          PN->addIncoming(RenamedVal, CopyBB);
+        else
+          // When no mapping is available it must be a constant.
+          PN->addIncoming(Op, CopyBB);
+      }
+      assert(EdgeFromOrigBB && "Illegal exit from SEME.");
     }
   }
+}
 
-  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
-  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+/// Clones the basic blocks (\p Blocks) of an SEME bounded by \p Exits.
+/// Blocks[0] is the entry basic block of the SEME.
+/// The mapping between original BBs and correponding copied BBs are
+/// populated in \p VMap. During copy the DOM and LI of CFG are updated.
+/// \returns The entry point of the copied SEME. \p NameSuffix is used to suffix
+/// name of the copied BBs. The copied SEME is also an SEME.
+static BasicBlock* copyBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
+                              const SmallPtrSetImpl<BasicBlock *> &Exits,
+                              ValueToValueMapTy &VMap,
+                              const Twine &NameSuffix,
+                              DominatorTree *DT, LoopInfo *LI) {
+  // Step1: Clone the basic blocks and populate VMap.
+  BasicBlock *OrigH = Blocks[0];
+
+  Function *F = OrigH->getParent();
+  SmallVector<BasicBlock *, 4> NewBlocks;
+  for (BasicBlock *BB : Blocks) {
+    assert(!isa<IndirectBrInst>(BB->getTerminator()) &&
+           "Cannot clone SEME with indirect branches.");
+
+    BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+    // Move them physically from the end of the block list.
+    F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(),
+                                  NewBB);
+    Loop *BBLoop = LI->getLoopFor(BB);
+    Loop *BBParentLoop = BBLoop->getParentLoop();
+    if (BBParentLoop)
+      BBParentLoop->addBasicBlockToLoop(NewBB, *LI);
+    VMap[BB] = NewBB;
+    NewBlocks.push_back(NewBB);
+  }
 
-  // Now that the CFG and DomTree are in a consistent state again, try to merge
-  // the OrigHeader block into OrigLatch.  This will succeed if they are
-  // connected by an unconditional branch.  This is just a cleanup so the
-  // emitted code isn't too gross in this common case.
-  MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+  // Step2: Remap the names in copied BBs.
+  remapInstructionsInBlocks(NewBlocks, VMap);
 
-  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+  // Step3: Redirect the edges.
+  for (BasicBlock *BB : Blocks) {
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+    BranchInst *BI = dyn_cast<BranchInst>(NewBB->getTerminator());
+    if (!BI)
+      continue;
 
-  ++NumRotated;
-  return true;
-}
+    for (unsigned I = 0, E = BI->getNumSuccessors(); I < E; ++I)
+      if (auto *NewSucc = cast_or_null<BasicBlock>(VMap[BI->getSuccessor(I)]))
+        BI->setSuccessor(I, NewSucc);
+  }
 
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
-                                  BasicBlock::iterator End, Loop *L) {
-  bool seenIncrement = false;
-  bool MultiExitLoop = false;
+  // Step4: Update the DOM of copied SEME. Except for the entry block its tree
+  // structure is the same as of original SEME so the dominators also follow the
+  // same structural property. If the IDom of original BB is not in SEME that
+  // means it is the entry block, in that case the new IDom of the new BB must
+  // be its single predecessor because we are dealing with an SEME region.
+  BasicBlock *EntryNewSEME = nullptr;
+  if (auto *DomT = DT->getNode(Blocks[0])->getIDom()) {
+    // Entry to SEME has a dominator, update the copied entry.
+    BasicBlock *Dom = DomT->getBlock();
+    assert(!VMap[Dom]); // Dom does not belong to SEME => entry block.
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[Blocks[0]]);
+    EntryNewSEME = NewBB;
+    DT->addNewBlock(NewBB, Dom);
+    DT->changeImmediateDominator(NewBB, Dom);
+  }
 
-  if (!L->getExitingBlock())
-    MultiExitLoop = true;
+  for (auto BBI = std::next(Blocks.begin()); BBI != Blocks.end(); ++BBI) {
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[*BBI]);
+    BasicBlock *Dom = DT->getNode(*BBI)->getIDom()->getBlock();
+    BasicBlock *NewDom = cast<BasicBlock>(VMap[Dom]);
+    DT->addNewBlock(NewBB, NewDom);
+    DT->changeImmediateDominator(NewBB, NewDom);
+  }
 
-  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+  // Step5: Adjust PHI nodes.
+  adjustExitingPhis(VMap, Exits);
+  return EntryNewSEME;
+}
 
-    if (!isSafeToSpeculativelyExecute(&*I))
-      return false;
+/// Rotate loop L.
+/// TODO: arrange newly inserted bbs.
+void LoopRotate::rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks,
+                            const SmallPtrSetBB &Exits) const {
+  BasicBlock *OrigH = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
+  BasicBlock *OrigPH = L->getLoopPreheader();
 
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
 
-    switch (I->getOpcode()) {
-    default:
-      return false;
-    case Instruction::GetElementPtr:
-      // GEPs are cheap if all indices are constant.
-      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
-        return false;
-      // fall-thru to increment case
-      LLVM_FALLTHROUGH;
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr: {
-      Value *IVOpnd =
-          !isa<Constant>(I->getOperand(0))
-              ? I->getOperand(0)
-              : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
-      if (!IVOpnd)
-        return false;
-
-      // If increment operand is used outside of the loop, this speculation
-      // could cause extra live range interference.
-      if (MultiExitLoop) {
-        for (User *UseI : IVOpnd->users()) {
-          auto *UserInst = cast<Instruction>(UseI);
-          if (!L->contains(UserInst))
-            return false;
-        }
-      }
+  ValueToValueMapTy VMap;
+  copyBlocks(Blocks, Exits, VMap, ".lr", DT, LI);
+
+  // Redirect original preheader to the entry of SEME.
+  BranchInst *PBI = dyn_cast<BranchInst>(OrigPH->getTerminator());
+  assert(PBI && (1 == PBI->getNumSuccessors()) &&
+         "Preheader does not have single successor.");
 
-      if (seenIncrement)
-        return false;
-      seenIncrement = true;
+  BasicBlock *CopyOrigH = cast<BasicBlock>(VMap[OrigH]);
+  PBI->setSuccessor(0, CopyOrigH);
+  DT->changeImmediateDominator(CopyOrigH, OrigPH);
+  L->moveToHeader(NewH);
+
+  BasicBlock *BeforeLoop = nullptr;
+  for (BasicBlock *BB : predecessors(NewH))
+    if (!L->contains(BB)) {
+      BeforeLoop = BB;
       break;
     }
-    case Instruction::Trunc:
-    case Instruction::ZExt:
-    case Instruction::SExt:
-      // ignore type conversions
-      break;
+  assert(BeforeLoop && "No entry point to the loop from New Header.");
+  Function *F = BeforeLoop->getParent();
+  // Move NewH physically to the beginning of the loop.
+  F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(),
+                                NewH);
+
+  // SplitEdge does not work properly with single-entry PHIs.
+  BasicBlock *NewPH = BasicBlock::Create(
+      NewH->getContext(), NewH->getName() + ".lr.ph", F, BeforeLoop);
+
+  Loop *OuterLoop = LI->getLoopFor(OrigPH);
+  if (OuterLoop)
+    OuterLoop->addBasicBlockToLoop(NewPH, *LI);
+
+  // BeforeLoop --> NewPH --> NewH.
+  insertBetween(NewPH, BeforeLoop, NewH);
+
+  DT->addNewBlock(NewPH, BeforeLoop);
+  DT->changeImmediateDominator(NewPH, BeforeLoop);
+  DT->changeImmediateDominator(NewH, NewPH);
+
+  // Also, the original entry lost its immediate dominator so its dominator
+  // should be adjusted. We use SEME property => idom (OrigH) = its single pred.
+  DT->changeImmediateDominator(OrigH, OrigH->getSinglePredecessor());
+
+  for (BasicBlock *BB : Blocks) {
+    typedef DomTreeNodeBase<BasicBlock> DTNode;
+    DTNode *IDom = DT->getNode(BB);
+    std::vector<DTNode *>::iterator I = IDom->begin();
+    for (; I != IDom->end();) {
+      BasicBlock *ExitBB = (*I)->getBlock();
+      if (L->contains(ExitBB)) {
+        ++I;
+        continue;
+      }
+
+      BasicBlock *StaleIDom = DT->getNode(ExitBB)->getIDom()->getBlock();
+      BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+      // NewIDom is correct because this part of CFG is up-to-date.
+      BasicBlock *NewIDom = DT->findNearestCommonDominator(StaleIDom, NewBB);
+      NewIDom = DT->findNearestCommonDominator(NewIDom, BB);
+      if (NewIDom != StaleIDom) {
+        DT->changeImmediateDominator(ExitBB, NewIDom);
+        DEBUG(dbgs() << "\nChanging IDom of " << *ExitBB << "to" << *NewIDom);
+        I = IDom->begin();
+      } else
+        ++I;
     }
   }
-  return true;
-}
-
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
-  BasicBlock *Latch = L->getLoopLatch();
-  if (!Latch || Latch->hasAddressTaken())
-    return false;
-
-  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
-  if (!Jmp || !Jmp->isUnconditional())
-    return false;
 
-  BasicBlock *LastExit = Latch->getSinglePredecessor();
-  if (!LastExit || !L->isLoopExiting(LastExit))
-    return false;
+  adjustNewHeaderPhis(VMap, NewH, NewPH);
 
-  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
-  if (!BI)
-    return false;
+  BasicBlock *NewLatch = L->getLoopLatch();
+  assert(L->getLoopPreheader() && "Invalid loop preheader after rotation");
+  assert(NewLatch && "Invalid loop latch after rotation");
 
-  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
-    return false;
+  addNewPhisToNewHeader(Blocks, NewH, NewPH, NewLatch, VMap);
 
-  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
-               << LastExit->getName() << "\n");
+  // Discard incoming values in the CopyOrigHeader, which are coming from
+  // OrigLatch since it has only one predecessor.
+  discardIncomingValues(CopyOrigH, OrigLatch);
+  discardIncomingValues(OrigH, OrigPH);
 
-  // Hoist the instructions from Latch into LastExit.
-  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
-                                 Latch->begin(), Jmp->getIterator());
+  DEBUG(LI->verify(*DT));
+  DEBUG(DT->verifyDomTree());
+  assert(L->isRecursivelyLCSSAForm(*DT) && "Loop is not in LCSSA form.");
 
-  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
-  BasicBlock *Header = Jmp->getSuccessor(0);
-  assert(Header == L->getHeader() && "expected a backward branch");
+  DEBUG(dbgs() << "\nLoopRotation: rotated "; L->dumpVerbose());
 
-  // Remove Latch from the CFG so that LastExit becomes the new Latch.
-  BI->setSuccessor(FallThruPath, Header);
-  Latch->replaceSuccessorsPhiUsesWith(LastExit);
-  Jmp->eraseFromParent();
+  assert(isSingleEntryMultipleExit(L->getHeader(), NewLatch, DT) &&
+         "Rotated loop not an SEME");
+  assert(isSingleEntryMultipleExit(CopyOrigH, NewPH, DT) &&
+         "Copied SEME not an SEME");
 
-  // Nuke the Latch block.
-  assert(Latch->empty() && "unable to evacuate Latch");
-  LI->removeBlock(Latch);
-  if (DT)
-    DT->eraseNode(Latch);
-  Latch->eraseFromParent();
-  return true;
+  ++NumRotated;
 }
 
 /// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
+bool LoopRotate::processLoop() {
+  int Times = 0;
+  if (auto Value = findStringMetadataForLoop(L, LoopRotatedTimes)) {
+    const MDOperand *Op = *Value;
+    assert(Op && mdconst::hasa<ConstantInt>(*Op) && "invalid metadata");
+    Times = mdconst::extract<ConstantInt>(*Op)->getZExtValue();
+    if (Times >= RotationsMaxAllowed)
+      return false;
+  }
+
+  if (!isLegalToRotate())
+    return false;
+
+  BasicBlock *OrigH = L->getHeader();
+  BasicBlock *LoopLatch = L->getLoopLatch();
+
+  // Basic blocks to be copied.
+  SmallVector<BasicBlock *, 4> Blocks;
+  SmallPtrSet<BasicBlock *, 4> Exits;
+  // Collect all nodes of the loop from header to latch.
+  BasicBlock *NewH = collectSEMEBlocks(OrigH, LoopLatch, Blocks, Exits);
+  assert(NewH && "Invalid SEME region.");
+
+  if (!isProfitableToRotate(Blocks, Exits))
+    return false;
+
   // Save the loop metadata.
-  MDNode *LoopMD = L->getLoopID();
+  MDNode *LoopMD = nullptr;
+  Instruction *TI = nullptr;
+  std::tie(LoopMD, TI) = L->getLoopIDWithInstr();
+
+  // Make sure the latch has only one successor.
+  if (!LoopLatch->getSingleSuccessor()) {
+    // Since LoopLatch was skipped while collecting SEME blocks, exits out of
+    // the loop-latch is collected here.
+    BranchInst *BLI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
+    if (!BLI)
+      return false;
+    // If the NewH is the loop-latch => Exits does not include the exiting block
+    // from loop-latch see (collectSEMEBlocks).
+    if (NewH == LoopLatch)
+      for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I) {
+        BasicBlock *Succ = BLI->getSuccessor(I);
+        if (Succ != OrigH && !L->contains(Succ))
+          Exits.insert(Succ);
+      }
+
+    // The old loop-latch will become the parent of new loop-latch.
+    DEBUG(dbgs() << "\nSplitting the edge of Loop:"; L->dumpVerbose(););
+    BasicBlock *NewLoopLatch = SplitEdge(LoopLatch, OrigH, DT, LI);
+
+    // If the NewH is the loop-latch => NewH would change as the loop-latch has
+    // changed after splitting. Also, we need to add the (old) loop-latch to the
+    // blocks because it was skipped (all the blocks before NewH are copied to
+    // new SEME).
+    if (NewH == LoopLatch) {
+      NewH = NewLoopLatch;
+      Blocks.push_back(LoopLatch);
+    }
+    LoopLatch = NewLoopLatch;
+  }
 
-  // Simplify the loop latch before attempting to rotate the header
-  // upward. Rotation may not be needed if the loop tail can be folded into the
-  // loop exit.
-  bool SimplifiedLatch = simplifyLoopLatch(L);
+  assert(LoopLatch->getSingleSuccessor() && "Invalid SEME region.");
 
-  bool MadeChange = rotateLoop(L, SimplifiedLatch);
-  assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+  // Anything ScalarEvolution may know about this loop or the PHI nodes in its
+  // header will soon be invalidated.
+  if (SE)
+    SE->forgetLoop(L);
+
+  rotateLoop(NewH, Blocks, Exits);
+  assert(L->isLoopExiting(L->getLoopLatch()) &&
          "Loop latch should be exiting after loop-rotate.");
 
-  // Restore the loop metadata.
-  // NB! We presume LoopRotation DOESN'T ADD its own metadata.
-  if ((MadeChange || SimplifiedLatch) && LoopMD)
+  if (LoopMD) {
+    // Drop the metadata from the old loop header.
+    SmallVector<unsigned, 2> MDs;
+    TI->getAllNonDebugMetadataIDs(MDs);
+    MDs.erase(llvm::remove_if(MDs, [](unsigned ID) {
+          return ID == LLVMContext::MD_loop; }), MDs.end());
+
+    TI->dropUnknownNonDebugMetadata(MDs);
+
+    // Restore the loop metadata to the new loop header.
     L->setLoopID(LoopMD);
+  }
 
-  return MadeChange;
+  // Add new metadata that loop has been rotated once.
+  // The metadata reflects how many times loop has been rotated.
+  addStringMetadataToLoop(L, LoopRotatedTimes, Times + 1);
+  return true;
 }
 
 LoopRotatePass::LoopRotatePass() {}
@@ -632,9 +665,9 @@
   // Optional analyses.
   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(*F);
   auto *SE = FAM.getCachedResult<ScalarEvolutionAnalysis>(*F);
-  LoopRotate LR(DefaultRotationThreshold, LI, TTI, AC, DT, SE);
+  LoopRotate LR(LI, TTI, AC, DT, SE, &L);
 
-  bool Changed = LR.processLoop(&L);
+  bool Changed = LR.processLoop();
   if (!Changed)
     return PreservedAnalyses::all();
   return getLoopPassPreservedAnalyses();
@@ -643,22 +676,17 @@
 namespace {
 
 class LoopRotateLegacyPass : public LoopPass {
-  unsigned MaxHeaderSize;
-
 public:
   static char ID; // Pass ID, replacement for typeid
-  LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+  LoopRotateLegacyPass() : LoopPass(ID) {
     initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
-    if (SpecifiedMaxHeaderSize == -1)
-      MaxHeaderSize = DefaultRotationThreshold;
-    else
-      MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
   }
 
   // LCSSA form makes instruction renaming easier.
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     getLoopAnalysisUsage(AU);
   }
 
@@ -674,8 +702,8 @@
     auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
     auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
     auto *SE = SEWP ? &SEWP->getSE() : nullptr;
-    LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
-    return LR.processLoop(L);
+    LoopRotate LR(LI, TTI, AC, DT, SE, L);
+    return LR.processLoop();
   }
 };
 }
@@ -685,10 +713,9 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
                     false)
 
-Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
-  return new LoopRotateLegacyPass(MaxHeaderSize);
-}
+Pass *llvm::createLoopRotatePass() { return new LoopRotateLegacyPass(); }
Index: llvm/test/Analysis/GlobalsModRef/memset-escape.ll
===================================================================
--- llvm/test/Analysis/GlobalsModRef/memset-escape.ll
+++ llvm/test/Analysis/GlobalsModRef/memset-escape.ll
@@ -6,15 +6,19 @@
 @a = internal global [3 x i32] zeroinitializer, align 4
 @b = common global i32 0, align 4
 
-; The important thing we're checking for here is the reload of (some element of)
-; @a after the memset.
+; Check that load and the call to abort is redundant.
+; CHECK:   store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
+; CHECK:   store i32 0, i32* @b, align 4
+; CHECK:   br label %for.body
 
-; CHECK-LABEL: @main
-; CHECK: call void @llvm.memset.p0i8.i64{{.*}} @a
-; CHECK: store i32 3
-; CHECK: load i32, i32* getelementptr {{.*}} @a
-; CHECK: icmp eq i32
-; CHECK: br i1
+; CHECK: for.body:                                         ; preds = %for.body.preheader
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 0), align 4
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
+; CHECK:   store i32 3, i32* @b, align 4
+; CHECK:   br i1 true, label %if.end, label %if.then
+; CHECK-NOT: load
+; CHECK-NOT: call void @abort()
 
 define i32 @main() {
 entry:
Index: llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
+++ llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -instcombine -indvars -loop-deletion -constmerge -S | FileCheck %s
+; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -simplifycfg -S | FileCheck %s
+
 ; PR11882: ComputeLoadConstantCompareExitLimit crash.
 ;
 ; for.body is deleted leaving a loop-invariant load.
Index: llvm/test/Transforms/LoopRotate/alloca.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/alloca.ll
+++ llvm/test/Transforms/LoopRotate/alloca.ll
@@ -5,8 +5,9 @@
 ; We expect a different value for %ptr each iteration (according to the
 ; definition of alloca). I.e. each @use must be paired with an alloca.
 
-; CHECK: call void @use(i8* %
-; CHECK: %ptr = alloca i8
+; CHECK: alloca i8
+; CHECK: call void @use
+; CHECK: alloca i8
 
 @e = global i16 10
 
Index: llvm/test/Transforms/LoopRotate/basic.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/basic.ll
+++ llvm/test/Transforms/LoopRotate/basic.ll
@@ -18,9 +18,10 @@
   %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0
   br i1 %cmp, label %for.body, label %for.end
 
-; CHECK: for.body:
+; CHECK: for.cond.lr:
 ; CHECK-NEXT: phi i32 [ 0
-; CHECK-NEXT: store i32 0
+; CHECK: for.body:
+; CHECK: store i32 0
 
 for.body:                                         ; preds = %for.cond
   store i32 0, i32* %arrayidx, align 16
Index: llvm/test/Transforms/LoopRotate/dbgvalue.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/dbgvalue.ll
+++ llvm/test/Transforms/LoopRotate/dbgvalue.ll
@@ -5,9 +5,9 @@
 
 define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp !dbg !0 {
 ; CHECK-LABEL: define i32 @tak(
-; CHECK: entry
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %x
-; CHECK: tail call void @llvm.dbg.value(metadata i32 %call
+; CHECK: tailrecurse.lr
+; CHECK: call void @llvm.dbg.value(metadata i32 %x
+; CHECK: tail call void @llvm.dbg.value(metadata i32 %y
 
 entry:
   br label %tailrecurse
@@ -42,7 +42,7 @@
 ; CHECK-LABEL: define i32 @tak2(
 ; CHECK: entry
 ; CHECK: tail call void @llvm.dbg.value(metadata i32 %x.tr
-; CHECK: tail call void @llvm.dbg.value(metadata i32 undef
+; CHECK: tail call void @llvm.dbg.value(metadata i32 %y.tr
 
 entry:
   br label %tailrecurse
@@ -83,12 +83,13 @@
 ; Ensure that the loop increment basic block is rotated into the tail of the
 ; body, even though it contains a debug intrinsic call.
 ; CHECK-LABEL: define void @FindFreeHorzSeg(
+; CHECK: for.inc:
+; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
 ; CHECK: %dec = add
 ; CHECK-NEXT: tail call void @llvm.dbg.value
+; CHECK-NEXT: br label %for.cond
 ; CHECK: %cmp = icmp
 ; CHECK: br i1 %cmp
-; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
-; CHECK-NEXT: br label %for.end
 
 
 entry:
Index: llvm/test/Transforms/LoopRotate/indirectbr-1.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/indirectbr-1.ll
@@ -0,0 +1,77 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s
+; ModuleID = 'ind.c'
+source_filename = "ind.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+; Check that the loop-latch with indirect branch is not rotated.
+; CHECK-NOT: {{.*}}.lr
+; CHECK-NOT: {{.*}}.lr.ph
+
+@f.x = internal global [3 x i8*] [i8* blockaddress(@f, %F), i8* blockaddress(@f, %G), i8* blockaddress(@f, %H)], align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @f(i32 %i, i32 %j, i32 %k) {
+entry:
+  %retval = alloca i32, align 4
+  %i.addr = alloca i32, align 4
+  %j.addr = alloca i32, align 4
+  %k.addr = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  store i32 %j, i32* %j.addr, align 4
+  store i32 %k, i32* %k.addr, align 4
+  br label %F
+
+F:                                                ; preds = %entry, %indirectgoto
+  %0 = load i32, i32* %i.addr, align 4
+  %1 = load i32, i32* %j.addr, align 4
+  %cmp = icmp sgt i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %F
+  br label %G
+
+if.else:                                          ; preds = %F
+  %2 = load i32, i32* %i.addr, align 4
+  %3 = load i32, i32* %k.addr, align 4
+  %cmp1 = icmp sgt i32 %2, %3
+  br i1 %cmp1, label %if.then2, label %if.end
+
+if.then2:                                         ; preds = %if.else
+  br label %H
+
+if.end:                                           ; preds = %if.else
+  br label %if.end3
+
+if.end3:                                          ; preds = %if.end
+  %call = call i32 (...) @z()
+  %idxprom = sext i32 %call to i64
+  %arrayidx = getelementptr inbounds [3 x i8*], [3 x i8*]* @f.x, i64 0, i64 %idxprom
+  %4 = load i8*, i8** %arrayidx, align 8
+  br label %indirectgoto
+
+G:                                                ; preds = %if.then, %indirectgoto
+  %call4 = call i32 (...) @g()
+  store i32 %call4, i32* %retval, align 4
+  br label %return
+
+H:                                                ; preds = %if.then2, %indirectgoto
+  %call5 = call i32 (...) @h()
+  store i32 %call5, i32* %retval, align 4
+  br label %return
+
+return:                                           ; preds = %H, %G
+  %5 = load i32, i32* %retval, align 4
+  ret i32 %5
+
+indirectgoto:                                     ; preds = %if.end3
+  %indirect.goto.dest = phi i8* [ %4, %if.end3 ]
+  indirectbr i8* %indirect.goto.dest, [label %F, label %G, label %H]
+}
+
+declare i32 @g(...)
+declare i32 @h(...)
+declare i32 @z(...)
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 "}
Index: llvm/test/Transforms/LoopRotate/loop-rotate-before-latch.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate-before-latch.ll
@@ -0,0 +1,46 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s
+; RUN: opt -S < %s  -loop-rotate -verify-dom-info -verify-loop-info -loop-rotate | FileCheck %s -check-prefix=CHECK1
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL:foo
+; CHECK: "llvm.loop.rotated.times", i32 1
+
+; Check that the loop is rotated only once despite multiple invocations of loop-rotate on a loop.
+; CHECK1-LABEL:foo
+; CHECK1: "llvm.loop.rotated.times", i32 1
+
+; Function Attrs: nounwind uwtable
+define hidden void @foo(i32 %arg1, i32 %arg3) local_unnamed_addr {
+
+entry:
+  br label %bb25
+
+bb25:                                             ; preds = %bb140, %entry
+  br i1 undef, label %bb165, label %bb34
+
+bb34:                                             ; preds = %bb25
+  switch i32 undef, label %bb35 [
+    i32 46, label %bb36
+    i32 32, label %bb36
+  ]
+
+bb35:                                             ; preds = %bb34
+  unreachable
+
+bb36:                                             ; preds = %bb34, %bb34
+  br label %bb140
+
+bb140:                                            ; preds = %bb36
+  %tmp145 = add nsw i32 undef, undef
+  br i1 undef, label %bb25, label %bb146
+
+bb146:                                            ; preds = %bb140
+  %tmp147 = icmp sgt i32 %tmp145, %arg1
+  unreachable
+
+bb165:                                            ; preds = %bb25
+  unreachable
+}
+
Index: llvm/test/Transforms/LoopRotate/loop-rotate.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate.ll
@@ -0,0 +1,415 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK1 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK2 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK3 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK4 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK5 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK6 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK7 %s
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck -check-prefix=CHECK8 %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Check that the loop is rotated.
+; CHECK-LABEL: bar0
+; CHECK: while.cond.lr:                                    ; preds = %entry
+; CHECK: while.body.lr.ph:                                 ; preds = %while.cond.lr
+
+%fp = type { i64 }
+
+declare void @foo0(%fp* %this, %fp* %that)
+
+define void @bar0(%fp* %__begin1, %fp* %__end1, %fp** %__end2) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %__end1.addr.0 = phi %fp* [ %__end1, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq %fp* %__end1.addr.0, %__begin1
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %0 = load %fp*, %fp** %__end2, align 8
+  %add.ptr = getelementptr inbounds %fp, %fp* %0, i64 -1
+  %incdec.ptr = getelementptr inbounds %fp, %fp* %__end1.addr.0, i64 -1
+  tail call void @foo0(%fp* %add.ptr, %fp* %incdec.ptr)
+  %1 = load %fp*, %fp** %__end2, align 8
+  %incdec.ptr2 = getelementptr inbounds %fp, %fp* %1, i64 -1
+  store %fp* %incdec.ptr2, %fp** %__end2, align 8
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+
+; Check that the loop is rotated.
+; CHECK1-LABEL: bar1
+; CHECK1: for.cond.lr:                                      ; preds = %entry
+; CHECK1: if.end.lr:                                        ; preds = %for.cond.lr
+; CHECK1: if.end75.lr:                                      ; preds = %if.end.lr
+; CHECK1: if.end94.lr:                                      ; preds = %if.end75.lr
+; CHECK1: if.then178.lr.ph:                                 ; preds = %if.end94.lr
+
+declare i32 @foo1()
+define i32 @bar1() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.then178, %entry
+  %first.0 = phi i32 [ 1, %entry ], [ 0, %if.then178 ]
+  br i1 undef, label %err_sl, label %if.end
+
+if.end:                                           ; preds = %for.cond
+  br i1 undef, label %err_sl, label %if.end75
+
+if.end75:                                         ; preds = %if.end
+  br i1 undef, label %if.end94, label %if.then93
+
+if.then93:                                        ; preds = %if.end75
+  unreachable
+
+if.end94:                                         ; preds = %if.end75
+  br i1 undef, label %if.then178, label %for.end182
+
+if.then178:                                       ; preds = %if.end94
+  %call179 = tail call i32 @foo1()
+  br label %for.cond
+
+for.end182:                                       ; preds = %if.end94
+  ret i32 1
+
+err_sl:                                           ; preds = %if.end, %for.cond
+  unreachable
+}
+
+
+; Check that the loop is rotated and multiple phis are updated.
+; CHECK2-LABEL: foo2
+; CHECK2: do.body.i10.lr:                                   ; preds = %if.else.i3
+; CHECK2: do.cond.i19.lr.ph:                                ; preds = %do.body.i10.lr
+
+define void @foo2() {
+entry:
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry
+  br i1 undef, label %inverse.exit21, label %if.else.i3
+
+if.else.i3:                                       ; preds = %for.body
+  br label %do.body.i10
+
+do.body.i10:                                      ; preds = %do.cond.i19, %if.else.i3
+  %b1.0.i4 = phi i64 [ 0, %if.else.i3 ], [ %b2.0.i7, %do.cond.i19 ]
+  %b2.0.i7 = phi i64 [ 1, %if.else.i3 ], [ %sub8.i18, %do.cond.i19 ]
+  br i1 undef, label %do.cond.thread.i15, label %do.cond.i19
+
+do.cond.thread.i15:                               ; preds = %do.body.i10
+  br label %inverse.exit21
+
+do.cond.i19:                                      ; preds = %do.body.i10
+  %mul.i17 = mul nsw i64 undef, %b2.0.i7
+  %sub8.i18 = sub nsw i64 %b1.0.i4, %mul.i17
+  br label %do.body.i10
+
+inverse.exit21:                                   ; preds = %do.cond.thread.i15, %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %inverse.exit21, %entry
+  ret void
+}
+
+; Check that the loop with indirect branches is rotated.
+; CHECK3-LABEL: foo3
+; CHECK3: while.cond.lr:                                    ; preds = %entry
+; CHECK3: if.end6.lr:                                       ; preds = %while.cond.lr
+; CHECK3: if.else.lr:                                       ; preds = %if.end6.lr
+; CHECK3: if.end2.i.lr:                                     ; preds = %if.else.lr
+; CHECK3: if.end37.lr:                                      ; preds = %if.end2.i.lr
+; CHECK3: if.else96.lr:                                     ; preds = %if.end37.lr
+; CHECK3: if.else106.lr.ph:                                 ; preds = %if.else96.lr
+
+define i32 @foo3() {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %if.then467, %entry
+  br i1 undef, label %if.then, label %if.end6
+
+if.then:                                          ; preds = %while.cond
+  unreachable
+
+if.end6:                                          ; preds = %while.cond
+  %conv7 = ashr exact i64 undef, 32
+  br i1 undef, label %if.else, label %if.then19
+
+if.then19:                                        ; preds = %if.end6
+  unreachable
+
+if.else:                                          ; preds = %if.end6
+  br i1 undef, label %thr, label %if.end2.i
+
+if.end2.i:                                        ; preds = %if.else
+  br i1 undef, label %thr, label %if.end37
+
+thr:                                              ; preds = %if.end2.i, %if.else
+  unreachable
+
+if.end37:                                         ; preds = %if.end2.i
+  br i1 undef, label %if.else96, label %if.then40
+
+if.then40:                                        ; preds = %if.end37
+  unreachable
+
+if.else96:                                        ; preds = %if.end37
+  br i1 undef, label %if.else106, label %if.then99
+
+if.then99:                                        ; preds = %if.else96
+  unreachable
+
+if.else106:                                       ; preds = %if.else96
+  switch i32 undef, label %if.else427 [
+    i32 12, label %if.then130
+    i32 30, label %if.then467
+  ]
+
+if.then130:                                       ; preds = %if.else106
+  br label %if.then467
+
+if.else427:                                       ; preds = %if.else106
+  unreachable
+
+if.then467:                                       ; preds = %if.then130, %if.else106
+  br label %while.cond
+}
+
+; Check that the loop is rotated.
+; CHECK4-LABEL: foo4
+; CHECK4: bb3.lr:                                           ; preds = %bb
+; CHECK4: bb5.lr:                                           ; preds = %bb3.lr
+; CHECK4: bb8.lr.ph:                                        ; preds = %bb5.lr
+
+define void @foo4(i8* %arg, i8* %arg1, i64 %arg2) {
+bb:
+  %tmp = and i32 undef, 7
+  br label %bb3
+
+bb3:                                              ; preds = %bb8, %bb
+  %tmp4 = phi i32 [ 0, %bb8 ], [ %tmp, %bb ]
+  br i1 undef, label %bb9, label %bb5
+
+bb5:                                              ; preds = %bb3
+  %tmp6 = sub nsw i32 8, %tmp4
+  br i1 undef, label %bb7, label %bb8
+
+bb7:                                              ; preds = %bb5
+  unreachable
+
+bb8:                                              ; preds = %bb5
+  store i32 undef, i32* undef, align 8
+  br label %bb3
+
+bb9:                                              ; preds = %bb3
+  ret void
+}
+
+
+; Check that the loop with pre-header and loop-body having a switch block is rotated.
+; CHECK5-LABEL: bar5
+; CHECK5: bb19.preheader:                                   ; preds = %bb10, %bb10
+; CHECK5: bb19.lr:                                          ; preds = %bb19.preheader
+; CHECK5: bb12.lr.ph:                                       ; preds = %bb19.lr
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @foo5(i8*, i32)
+
+define i32 @bar5(i8* %arg, i32 %arg1, i8* %arg2, i32 %arg3) {
+bb6:
+  br i1 undef, label %bb8, label %bb10
+
+bb8:                                              ; preds = %bb6
+  unreachable
+
+bb10:                                             ; preds = %bb6
+  switch i32 undef, label %bb11 [
+    i32 46, label %bb19
+    i32 32, label %bb19
+  ]
+
+bb11:                                             ; preds = %bb10
+  unreachable
+
+bb12:                                             ; preds = %bb19
+  switch i8 undef, label %bb13 [
+    i8 32, label %bb21
+    i8 46, label %bb21
+  ]
+
+bb13:                                             ; preds = %bb12
+  br label %bb15
+
+bb15:                                             ; preds = %bb13
+  br i1 undef, label %bb17, label %bb16
+
+bb16:                                             ; preds = %bb15
+  %tmp = call i32 @foo5(i8* %tmp20, i32 10)
+  ret i32 0
+
+bb17:                                             ; preds = %bb15
+  %tmp18 = phi i8* [ %tmp20, %bb15 ]
+  br label %bb19
+
+bb19:                                             ; preds = %bb17, %bb10, %bb10
+  %tmp20 = phi i8* [ %tmp18, %bb17 ], [ null, %bb10 ], [ null, %bb10 ]
+  br i1 undef, label %bb21, label %bb12
+
+bb21:                                             ; preds = %bb19, %bb12, %bb12
+  unreachable
+}
+
+
+; Check that the loop is rotated.
+; CHECK6-LABEL: bar6
+; CHECK6: while.cond.preheader:                             ; preds = %if.end92
+; CHECK6: while.cond.lr:                                    ; preds = %while.cond.preheader
+; CHECK6: if.end107.lr.ph:                                  ; preds = %while.cond.lr
+
+; Function Attrs: nounwind uwtable
+define i32 @bar6() {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  ret i32 -1
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %if.then39, label %if.end48
+
+if.then39:                                        ; preds = %if.end
+  unreachable
+
+if.end48:                                         ; preds = %if.end
+  br i1 undef, label %if.else61, label %if.then59
+
+if.then59:                                        ; preds = %if.end48
+  unreachable
+
+if.else61:                                        ; preds = %if.end48
+  br i1 undef, label %if.then91, label %if.end92
+
+if.then91:                                        ; preds = %if.else61
+  unreachable
+
+if.end92:                                         ; preds = %if.else61
+  br i1 undef, label %if.then96, label %while.cond
+
+if.then96:                                        ; preds = %if.end92
+  unreachable
+
+while.cond:                                       ; preds = %if.end107, %if.end92
+  %len.3 = phi i64 [ %add109, %if.end107 ], [ 0, %if.end92 ]
+  br i1 undef, label %if.then106, label %if.end107
+
+if.then106:                                       ; preds = %while.cond
+  unreachable
+
+if.end107:                                        ; preds = %while.cond
+  %add109 = add i64 undef, %len.3
+  %sub111 = sub i64 undef, undef
+  br label %while.cond
+}
+
+
+declare i8* @foo7()
+
+; Check that the loop is rotated.
+; CHECK7-LABEL: bar7
+; CHECK7: for.cond.preheader:                               ; preds = %if.end
+; CHECK7: for.cond.lr:                                      ; preds = %for.cond.preheader
+; CHECK7: for.body.lr:                                      ; preds = %for.cond.lr
+; CHECK7: if.end20.lr.ph:                                   ; preds = %for.body.lr
+
+define void @bar7() #1 {
+entry:
+  br i1 undef, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  unreachable
+
+if.end:                                           ; preds = %entry
+  br i1 undef, label %return, label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %if.end
+  br i1 undef, label %for.body, label %return
+
+for.body:                                         ; preds = %for.cond
+  %call15 = call i8* @foo7()
+  br i1 undef, label %if.end20, label %return
+
+if.end20:                                         ; preds = %for.body
+  %issuer.i = getelementptr inbounds i8, i8* %call15, i64 24
+  br i1 undef, label %if.then.i, label %for.cond.i
+
+if.then.i:                                        ; preds = %if.end20
+  br i1 undef, label %if.then23, label %crl
+
+for.cond.i:                                       ; preds = %if.end20
+  br i1 undef, label %for.body.i, label %for.inc
+
+for.body.i:                                       ; preds = %for.cond.i
+  unreachable
+
+crl:                    ; preds = %if.then.i
+  br i1 undef, label %if.then23, label %for.inc
+
+if.then23:                                        ; preds = %crl, %if.then.i
+  %reason = getelementptr inbounds i8, i8* %call15, i64 32
+  br label %return
+
+for.inc:                                          ; preds = %crl, %for.cond.i
+  br label %for.cond
+
+return:                                           ; preds = %if.then23, %for.body, %for.cond, %if.end
+  ret void
+}
+
+
+; Check that the loop with an if-block within the loop is rotated.
+; There are multiple exits from the loop.
+
+; CHECK8-LABEL: foo8
+; CHECK8: for.cond.lr:                                      ; preds = %entry
+; CHECK8: %first.0.lr = phi i32 [ 1, %entry ]
+; CHECK8: if.end.lr:                                        ; preds = %for.cond.lr
+; CHECK8: if.end75.lr:                                      ; preds = %if.end.lr
+; CHECK8: if.end94.lr.ph:                                   ; preds = %if.end75.lr
+
+define i32 @foo8() {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %if.end94, %entry
+  %first.0 = phi i32 [ 1, %entry ], [ 0, %if.end94 ]
+  br i1 undef, label %err_sl, label %if.end
+
+if.end:                                           ; preds = %for.cond
+  br i1 undef, label %err_sl, label %if.end75
+
+if.end75:                                         ; preds = %if.end
+  br i1 undef, label %if.end94, label %if.then93
+
+if.then93:                                        ; preds = %if.end75
+  ret i32 0
+
+if.end94:                                         ; preds = %if.end75
+  %call179 = tail call i32 @gets()
+  br label %for.cond
+
+err_sl:                                           ; preds = %if.end, %for.cond
+  unreachable
+}
+
+declare i32 @gets()
Index: llvm/test/Transforms/LoopRotate/multiple-exits-merge-phi.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/multiple-exits-merge-phi.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S -loop-rotate < %s -verify-loop-info -verify-dom-info | FileCheck %s
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+source_filename = "bugpoint-output-fdebd3b.bc"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+; Function Attrs: nounwind readonly
+define void @test1() #0 {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1, %entry
+  %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+  br i1 undef, label %for.cond1, label %return
+
+for.cond1:                                        ; preds = %land.rhs, %for.cond
+  %sum.1 = phi i32 [ 0, %land.rhs ], [ %sum.0, %for.cond ]
+  br i1 undef, label %land.rhs, label %for.cond
+
+land.rhs:                                         ; preds = %for.cond1
+  br i1 undef, label %return, label %for.cond1
+
+return:                                           ; preds = %land.rhs, %for.cond
+  %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+  ret void
+}
+
+attributes #0 = { nounwind readonly }
+
+; check that loop with multiple exit is rotated and has a preheader.
+; CHECK-LABEL: test1
+; CHECK: entry:
+; CHECK: br label %for.cond.lr
+; CHECK: for.cond.loopexit:
+; CHECK:   br label %for.cond
+; CHECK: for.cond.lr:
+; CHECK:   br i1 false, label %for.cond1.preheader.lr.ph, label %return.loopexit1
+; CHECK: for.cond1.preheader.lr.ph:
+; CHECK:   br label %for.cond1.preheader
+; CHECK: for.cond1.preheader:
Index: llvm/test/Transforms/LoopRotate/multiple-exits.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/multiple-exits.ll
+++ llvm/test/Transforms/LoopRotate/multiple-exits.ll
@@ -3,7 +3,18 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-; PR7447
+; PR7447: there should be two loops rotated.
+; CHECK-LABEL: @test1
+
+; Check that the outer loop is rotated.
+; CHECK: for.cond.lr:
+; CHECK: for.cond1.preheader.lr.ph:
+
+; Check that the inner loop is rotated.
+; CHECK: for.cond1.lr:
+; CHECK: land.rhs.lr:
+; CHECK: land.rhs.for.cond1_crit_edge.lr.ph:
+
 define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
 entry:
   br label %for.cond
@@ -31,17 +42,6 @@
 return:                                           ; preds = %for.cond, %land.rhs
   %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
   ret i32 %retval.0
-
-; CHECK-LABEL: @test1(
-; CHECK: for.cond1.preheader:
-; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
-; CHECK: br label %for.cond1
-
-; CHECK: for.cond1:
-; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
-; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
-; CHECK: %cmp2 = icmp ult i32 %i.1, 100
-; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
 }
 
 define void @test2(i32 %x) nounwind {
@@ -75,9 +75,9 @@
 
 ; CHECK-LABEL: @test2(
 ; CHECK: if.end:
-; CHECK: %inc = add i32 %i.02, 1
-; CHECK: %cmp = icmp eq i32 %inc, %x
-; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+; CHECK: %inc = add i32 %phi.nh, 1
+; CHECK: %cmp = icmp eq i32 %i.0, %x
+; CHECK: br i1 %cmp, label %return.loopexit, label %for.body
 }
 
 declare i32 @foo(i32)
Index: llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
+++ llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
@@ -3,7 +3,8 @@
 target triple = "arm64-apple-ios8.0.0"
 
 ;CHECK: for.inc:
-;CHECK-NEXT: %incdec.ptr.i = getelementptr 
+;CHECK: %incdec.ptr.i = getelementptr
+;CHECK: br
 
 ; Function Attrs: alwaysinline inlinehint nounwind readonly ssp
 define linkonce_odr hidden i64 @_ZNSt3__14findINS_11__wrap_iterIPiEEiEET_S4_S4_RKT0_(i64 %__first.coerce, i64 %__last.coerce, i32* nocapture readonly dereferenceable(4) %__value_) {
Index: llvm/test/Transforms/LoopRotate/phi-duplicate.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/phi-duplicate.ll
+++ llvm/test/Transforms/LoopRotate/phi-duplicate.ll
@@ -31,10 +31,15 @@
 ; Should only end up with one phi.
 ; CHECK-LABEL:      define void @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   br label %for.body
+; CHECK-NEXT:   br label %for.cond.lr
 ; CHECK:      for.body:
-; CHECK-NEXT:   %j.01 = phi i64
-; CHECK-NOT:  br
-; CHECK:   br i1 %cmp, label %for.body, label %for.end
-; CHECK:      for.end:
-; CHECK-NEXT:        ret void
+; CHECK:  %phi.nh3 = phi i64
+; CHECK:  %phi.nh2 = phi i64
+; CHECK:  %phi.nh1 = phi i64
+; CHECK:  %phi.nh = phi i64
+; CHECK:  %inc = add nsw i64 %phi.nh, 1
+; CHECK:   br label %for.cond
+; CHECK: for.cond:
+; CHECK:   br i1 %cmp
+; CHECK: for.end:
+; CHECK-NEXT:  ret void
Index: llvm/test/Transforms/LoopRotate/pr7447.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/pr7447.ll
@@ -0,0 +1,34 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-rotate -stats 2>&1 | FileCheck %s
+
+; PR7447: there should be two loops rotated.
+; CHECK: 2 loop-rotate
+
+define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond1, %entry
+  %sum.0 = phi i32 [ 0, %entry ], [ %sum.1, %for.cond1 ]
+  %i.0 = phi i1 [ true, %entry ], [ false, %for.cond1 ]
+  br i1 %i.0, label %for.cond1, label %return
+
+for.cond1:                                        ; preds = %for.cond, %land.rhs
+  %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.0, %for.cond ]
+  %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond ]
+  %cmp2 = icmp ult i32 %i.1, 100
+  br i1 %cmp2, label %land.rhs, label %for.cond
+
+land.rhs:                                         ; preds = %for.cond1
+  %conv = zext i32 %i.1 to i64
+  %arrayidx = getelementptr inbounds [100 x i32], [100 x i32]* %a, i64 0, i64 %conv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add i32 %0, %sum.1
+  %cmp4 = icmp ugt i32 %add, 1000
+  %inc = add i32 %i.1, 1
+  br i1 %cmp4, label %return, label %for.cond1
+
+return:                                           ; preds = %for.cond, %land.rhs
+  %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
+  ret i32 %retval.0
+}
Index: llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
+++ llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
@@ -16,22 +16,28 @@
 ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER:[^,]*]], label %outer.body
 
 ; CHECK: [[INNER_PREROTATE_PREHEADER]]:
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER_SPLIT_RETURN:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]]
+; CHECK: br label %inner.header.lr
+
+; CHECK: inner.header.lr:
+; CHECK: br i1 true, label %return, label %inner.body.lr
+
+; CHECK: inner.body.lr:
+; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]]
 
 ; CHECK: [[INNER_ROTATED_PREHEADER]]:
-; CHECK-NEXT: br label %inner.body
+; CHECK-NEXT: br label %inner.latch
 
 inner.header:
 ; Now the latch!
 ; CHECK: inner.header:
   br i1 undef, label %return, label %inner.body
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_RETURN:[^,]*]], label %inner.body
+; CHECK-NEXT: br i1 {{[^,]*}}, label %return, label %inner.body
 
 inner.body:
 ; Now the header!
 ; CHECK: inner.body:
   br i1 undef, label %outer.latch, label %inner.latch
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_OUTER_LATCH:[^,]*]], label %inner.header
+; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT]], label %inner.latch
 
 inner.latch:
 ; Dead!
@@ -42,8 +48,7 @@
   br label %outer.latch
 ; CHECK-NEXT: br label %outer.latch
 
-; L2 -> L1 exit edge needs a simplified exit block.
-; CHECK: [[INNER_SPLIT_OUTER_LATCH]]:
+; CHECK: [[OUTER_LATCH_LOOPEXIT]]:
 ; CHECK-NEXT: br label %outer.latch
 
 outer.latch:
@@ -51,14 +56,6 @@
   br label %outer.header
 ; CHECK-NEXT: br label %outer.header
 
-; L1 -> L0 exit edge need sa simplified exit block.
-; CHECK: [[INNER_PREROTATE_PREHEADER_SPLIT_RETURN]]:
-; CHECK-NEXT: br label %return
-
-; L2 -> L0 exit edge needs a simplified exit block.
-; CHECK: [[INNER_SPLIT_RETURN]]:
-; CHECK-NEXT: br label %return
-
 return:
 ; CHECK: return:
   unreachable
Index: llvm/test/Transforms/LoopRotate/simplifylatch.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/simplifylatch.ll
+++ llvm/test/Transforms/LoopRotate/simplifylatch.ll
@@ -3,8 +3,13 @@
 
 @mode_table = global [4 x i32] zeroinitializer		; <[4 x i32]*> [#uses=1]
 
+; Check that loop with multiple exits is rotated.
 ; CHECK-LABEL: @f(
-; CHECK-NOT: bb:
+; CHECK: bb.lr:
+; CHECK: bb2.lr:
+; CHECK: bb4.lr.ph:
+; CHECK: bb:
+
 define i8 @f() {
 entry:
 	tail call i32 @fegetround( )		; <i32>:0 [#uses=1]
@@ -38,11 +43,21 @@
 
 declare void @raise_exception() noreturn
 
-;CHECK: for.body.lr.ph:
-;CHECK-NEXT:  %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0
-;CHECK-NEXT:  %0 = load i8, i8* %arrayidx1, align 1
-;CHECK-NEXT:  %conv2 = sext i8 %0 to i32
-;CHECK-NEXT:  br label %for.body
+;CHECK-LABEL: @foo(
+;CHECK: for.body.lr:
+;CHECK: %arrayidx.lr = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom.lr
+;CHECK: %0 = load i8, i8* %arrayidx.lr, align 1
+;CHECK: %conv.lr = sext i8 %0 to i32
+;CHECK: %arrayidx1.lr = getelementptr inbounds i8, i8* %CurPtr, i64 0
+;CHECK: %1 = load i8, i8* %arrayidx1.lr, align 1
+;CHECK: %conv2.lr = sext i8 %1 to i32
+;CHECK: for.body:
+;CHECK: %arrayidx = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom
+;CHECK: %2 = load i8, i8* %arrayidx, align 1
+;CHECK: %conv = sext i8 %2 to i32
+;CHECK: %3 = load i8, i8* %arrayidx1, align 1
+;CHECK: %conv2 = sext i8 %3 to i32
+;CHECK: br i1 %cmp3, label %return, label %for.inc
 
 define i32 @foo(i8* %CurPtr, i32 %a) #0 {
 entry:
Index: llvm/test/Transforms/LoopSimplify/ashr-crash.ll
===================================================================
--- llvm/test/Transforms/LoopSimplify/ashr-crash.ll
+++ llvm/test/Transforms/LoopSimplify/ashr-crash.ll
@@ -29,9 +29,9 @@
 ; CHECK-LABEL: entry:
 ; CHECK-LABEL: for.cond1.preheader:
 ; CHECK-LABEL: for.body3:
-; CHECK: %cmp4.le.le
-; CHECK: %conv.le.le = zext i1 %cmp4.le.le to i32
-; CHECK: %xor.le.le = xor i32 %conv6.le.le, 1
+; CHECK: %cmp4
+; CHECK: %conv = zext i1 %cmp4 to i32
+; CHECK: %xor = xor i32 %conv6, 1
 define void @foo() {
 entry:
   br label %for.cond
Index: llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -4,11 +4,11 @@
 ; Loop from "rotated"
 ; CHECK: LV: Loop hints: force=enabled
 ; Loop from "nonrotated"
-; CHECK: LV: Loop hints: force=enabled
-; No more loops in the module
 ; CHECK-NOT: LV: Loop hints: force=
-; In total only 1 loop should be rotated.
-; CHECK: 1 loop-rotate
+; checking the stats for the function @nonrotated
+; CHECK: 2 gvn{{.*}}Number of equalities propagated
+; With the new loop rotation both the loops should be rotated.
+; CHECK: 2 loop-rotate
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"