Index: llvm/include/llvm/Analysis/LoopInfo.h
===================================================================
--- llvm/include/llvm/Analysis/LoopInfo.h
+++ llvm/include/llvm/Analysis/LoopInfo.h
@@ -456,13 +456,19 @@
   /// iterations.
   bool isAnnotatedParallel() const;
 
+  /// Return the llvm.loop loop id metadata node for this loop and the
+  /// instruction which contains the metadata. If there is no such instruction
+  /// having a loop metadata then return a pair of nullptrs.
+  std::pair<MDNode *, Instruction *> getLoopIDWithInstr() const;
+
   /// Return the llvm.loop loop id metadata node for this loop if it is present.
   ///
   /// If this loop contains the same llvm.loop metadata on each branch to the
   /// header then the node is returned. If any latch instruction does not
   /// contain llvm.loop or or if multiple latches contain different nodes then
   /// 0 is returned.
-  MDNode *getLoopID() const;
+  MDNode *getLoopID() const { return getLoopIDWithInstr().first; }
+
   /// Set the llvm.loop loop id metadata for this loop.
   ///
   /// The LoopID metadata node will be added to each terminator instruction in
Index: llvm/include/llvm/IR/Instruction.h
===================================================================
--- llvm/include/llvm/IR/Instruction.h
+++ llvm/include/llvm/IR/Instruction.h
@@ -201,6 +201,10 @@
       getAllMetadataOtherThanDebugLocImpl(MDs);
   }
 
+  /// Get all the metadata IDs attached to this Instruction except the ones
+  /// associated with debug location. The Vector \p MDIDs is not cleared.
+  void getAllNonDebugMetadataIDs(SmallVectorImpl<unsigned> &MDIDs) const;
+
   /// Fills the AAMDNodes structure with AA metadata from this instruction.
   /// When Merge is true, the existing AA metadata is merged with that from this
   /// instruction providing the most-general result.
Index: llvm/include/llvm/Transforms/Scalar.h
===================================================================
--- llvm/include/llvm/Transforms/Scalar.h
+++ llvm/include/llvm/Transforms/Scalar.h
@@ -197,7 +197,7 @@
 //
 // LoopRotate - This pass is a simple loop rotating pass.
 //
-Pass *createLoopRotatePass(int MaxHeaderSize = -1);
+Pass *createLoopRotatePass();
 
 //===----------------------------------------------------------------------===//
 //
Index: llvm/lib/Analysis/LoopInfo.cpp
===================================================================
--- llvm/lib/Analysis/LoopInfo.cpp
+++ llvm/lib/Analysis/LoopInfo.cpp
@@ -209,10 +209,12 @@
   return true;
 }
 
-MDNode *Loop::getLoopID() const {
+std::pair<MDNode *, Instruction *> Loop::getLoopIDWithInstr() const {
   MDNode *LoopID = nullptr;
+  Instruction *I = nullptr;
   if (BasicBlock *Latch = getLoopLatch()) {
-    LoopID = Latch->getTerminator()->getMetadata(LLVMContext::MD_loop);
+    I = Latch->getTerminator();
+    LoopID = I->getMetadata(LLVMContext::MD_loop);
   } else {
     // Go through each predecessor of the loop header and check the
     // terminator for the metadata.
@@ -222,17 +224,18 @@
 
       TerminatorInst *TI = BB->getTerminator();
       if (MDNode *MD = TI->getMetadata(LLVMContext::MD_loop)) {
-        if (!LoopID)
+        if (!LoopID) {
           LoopID = MD;
-        else if (MD != LoopID) // Multiple MD_loop found => corrupt metadata.
-          return nullptr;
+          I = TI;
+        } else // Multiple MD_loop found.
+          return { nullptr, nullptr };
       }
     }
   }
   if (!LoopID || LoopID->getNumOperands() == 0 ||
       LoopID->getOperand(0) != LoopID)
-    return nullptr;
-  return LoopID;
+    return { nullptr, nullptr };
+  return { LoopID, I };
 }
 
 void Loop::setLoopID(MDNode *LoopID) const {
Index: llvm/lib/IR/LLVMContextImpl.h
===================================================================
--- llvm/lib/IR/LLVMContextImpl.h
+++ llvm/lib/IR/LLVMContextImpl.h
@@ -1050,6 +1050,9 @@
   /// ID.  This function does \em not clear \c Result.
   void getAll(SmallVectorImpl<std::pair<unsigned, MDNode *>> &Result) const;
 
+  /// \brief Appends the IDs of all current attachments into \c Result.
+  void getAllIDs(SmallVectorImpl<unsigned> &Result) const;
+
   /// \brief Erase matching attachments.
   ///
   /// Erases all attachments matching the \c shouldRemove predicate.
Index: llvm/lib/IR/Metadata.cpp
===================================================================
--- llvm/lib/IR/Metadata.cpp
+++ llvm/lib/IR/Metadata.cpp
@@ -1135,6 +1135,11 @@
     array_pod_sort(Result.begin(), Result.end());
 }
 
+void MDAttachmentMap::getAllIDs(SmallVectorImpl<unsigned> &Result) const {
+  for (auto &A : Attachments)
+    Result.push_back(A.first);
+}
+
 void MDGlobalAttachmentMap::insert(unsigned ID, MDNode &MD) {
   Attachments.push_back({ID, TrackingMDNodeRef(&MD)});
 }
@@ -1298,6 +1303,15 @@
   Info.getAll(Result);
 }
 
+void
+Instruction::getAllNonDebugMetadataIDs(SmallVectorImpl<unsigned> &MDIDs) const {
+  if (!hasMetadataOtherThanDebugLoc())
+    return;
+  const auto &Info = getContext().pImpl->InstructionMetadata.find(this)->second;
+  assert(!Info.empty() && "Shouldn't have called this");
+  Info.getAllIDs(MDIDs);
+}
+
 bool Instruction::extractProfMetadata(uint64_t &TrueVal,
                                       uint64_t &FalseVal) const {
   assert(
Index: llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -304,8 +304,10 @@
   MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   MPM.add(createReassociatePass());           // Reassociate expressions
-  // Rotate Loop - disable header duplication at -Oz
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+
+  // Rotate Loop - disable loop rotation at -Oz
+  if (SizeLevel != 2)
+    MPM.add(createLoopRotatePass());
   MPM.add(createLICMPass());                  // Hoist loop invariants
   MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
   MPM.add(createCFGSimplificationPass());
@@ -556,7 +558,8 @@
   // Re-rotate loops in all our loop nests. These may have fallout out of
   // rotated form due to GVN or other transformations, and the vectorizer relies
   // on the rotated form. Disable header duplication at -Oz.
-  MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+  if (SizeLevel != 2)
+    MPM.add(createLoopRotatePass());
 
   // Distribute loops to allow partial vectorization.  I.e. isolate dependences
   // into separate loop that would otherwise inhibit vectorization.  This is
Index: llvm/lib/Transforms/Scalar/LoopRotation.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -8,7 +8,23 @@
 //===----------------------------------------------------------------------===//
 //
 // This file implements Loop Rotation Pass.
+// All the basic blocks which are between the loop-header and the basic blocks
+// exiting the loop are copied during rotation. This is helpful for passes like
+// gvn/licm which can now remove loop invariant code because some computations
+// (which might not dominate all the basic blocks of loop) can now be PRE'd as
+// they will be available outside of the loop (due to loop rotation).
 //
+// Only the loops satisfying following properties are rotated:
+// - The loop is an SEME (Single Entry Multiple Exit) region.
+// - The loop has a header and its terminator conditionally exits the loop.
+// - The loop has a latch and its terminator does not exit the loop.
+// - The loop has preheader.
+//
+// Post conditions:
+// The dominator tree is updated during loop rotation.
+// A loop already in loop-simplified form remains in loop-simplified form.
+// LoopInfo is preserved.
+// The rotated loop and the cloned region are SEME regions.
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/LoopRotation.h"
@@ -29,597 +45,660 @@
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Verifier.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/LoopPassManager.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
+
+#include<tuple>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "loop-rotate"
 
-static cl::opt<unsigned> DefaultRotationThreshold(
-    "rotation-max-header-size", cl::init(16), cl::Hidden,
-    cl::desc("The default maximum header size for automatic loop rotation"));
+static cl::opt<unsigned> RotationMaxSize(
+    "rotation-max-size", cl::init(100), cl::Hidden,
+    cl::desc("The default maximum loop size for automatic loop rotation"));
+
+static cl::opt<unsigned> MaxExits("lr-max-exits", cl::init(10), cl::Hidden,
+                                  cl::desc("The maximum exits to be cloned"));
 
 STATISTIC(NumRotated, "Number of loops rotated");
 
-namespace {
+// Insert \p NewBB in between \p PredBefore and \p Succ, and redirect edges
+// accordingly. PredBefore -> NewBB -> Succ. Also move NewBB before Succ.
+static void insertBetween(BasicBlock *NewBB, BasicBlock *PredBefore,
+                          BasicBlock *Succ) {
+  BranchInst *NewBI = BranchInst::Create(Succ, NewBB);
+  NewBI->setDebugLoc(PredBefore->getTerminator()->getDebugLoc());
+
+  BranchInst *BLI = dyn_cast<BranchInst>(PredBefore->getTerminator());
+  for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I)
+    if (BLI->getSuccessor(I) == Succ) {
+      BLI->setSuccessor(I, NewBB);
+      break;
+    }
+  // Move NewBB physically from the end of the block list.
+  Function *F = Succ->getParent();
+  F->getBasicBlockList().splice(Succ->getIterator(), F->getBasicBlockList(),
+                                NewBB);
+}
+
+// Remove the arguments of all phi nodes in PhiBB coming from block From.
+static void discardIncomingValues(BasicBlock *PhiBB, BasicBlock *From) {
+  for (Instruction &I : *PhiBB) {
+    PHINode *PN = dyn_cast<PHINode>(&I);
+    if (!PN)
+      break;
+    PN->removeIncomingValue(PN->getBasicBlockIndex(From));
+  }
+}
+
+/// Returns true of the region formed by [Entry, Exit] is a
+/// single-entry-multiple-exit (SEME) region. All the traces from \p Entry
+/// which leads to the \p Exit are analyzed.
+static bool isSingleEntryMultipleExit(const BasicBlock *Entry,
+                                      const BasicBlock *Exit,
+                                      DominatorTree *DT) {
+  if (!DT->dominates(Entry, Exit))
+    return false;
+
+  for (auto I = idf_begin(Exit), E = idf_end(Exit); I != E;) {
+    if (*I == Entry) {
+      I.skipChildren();
+      continue;
+    }
+
+    if (!DT->dominates(Entry, *I))
+      return false;
+
+    ++I;
+  }
+  return true;
+}
+
 /// A simple loop rotation transformation.
 class LoopRotate {
-  const unsigned MaxHeaderSize;
   LoopInfo *LI;
   const TargetTransformInfo *TTI;
   AssumptionCache *AC;
   DominatorTree *DT;
   ScalarEvolution *SE;
+  Loop *L;
+  typedef SmallVectorImpl<BasicBlock *> SmallVecBB;
+  typedef SmallPtrSetImpl<BasicBlock *> SmallPtrSetBB;
 
 public:
-  LoopRotate(unsigned MaxHeaderSize, LoopInfo *LI,
-             const TargetTransformInfo *TTI, AssumptionCache *AC,
-             DominatorTree *DT, ScalarEvolution *SE)
-      : MaxHeaderSize(MaxHeaderSize), LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE) {
-  }
-  bool processLoop(Loop *L);
+  LoopRotate(LoopInfo *LI, const TargetTransformInfo *TTI, AssumptionCache *AC,
+             DominatorTree *DT, ScalarEvolution *SE, Loop *L)
+      : LI(LI), TTI(TTI), AC(AC), DT(DT), SE(SE), L(L) {}
+  bool processLoop();
 
 private:
-  bool rotateLoop(Loop *L, bool SimplifiedLatch);
-  bool simplifyLoopLatch(Loop *L);
-};
-} // end anonymous namespace
-
-/// RewriteUsesOfClonedInstructions - We just cloned the instructions from the
-/// old header into the preheader.  If there were uses of the values produced by
-/// these instruction that were outside of the loop, we have to insert PHI nodes
-/// to merge the two values.  Do this now.
-static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
-                                            BasicBlock *OrigPreheader,
-                                            ValueToValueMapTy &ValueMap) {
-  // Remove PHI node entries that are no longer live.
-  BasicBlock::iterator I, E = OrigHeader->end();
-  for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    PN->removeIncomingValue(PN->getBasicBlockIndex(OrigPreheader));
-
-  // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
-  // as necessary.
-  SSAUpdater SSA;
-  for (I = OrigHeader->begin(); I != E; ++I) {
-    Value *OrigHeaderVal = &*I;
-
-    // If there are no uses of the value (e.g. because it returns void), there
-    // is nothing to rewrite.
-    if (OrigHeaderVal->use_empty())
-      continue;
+  void rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks,
+                  const SmallPtrSetBB &Exits) const;
 
-    Value *OrigPreHeaderVal = ValueMap.lookup(OrigHeaderVal);
-
-    // The value now exits in two versions: the initial value in the preheader
-    // and the loop "next" value in the original header.
-    SSA.Initialize(OrigHeaderVal->getType(), OrigHeaderVal->getName());
-    SSA.AddAvailableValue(OrigHeader, OrigHeaderVal);
-    SSA.AddAvailableValue(OrigPreheader, OrigPreHeaderVal);
-
-    // Visit each use of the OrigHeader instruction.
-    for (Value::use_iterator UI = OrigHeaderVal->use_begin(),
-                             UE = OrigHeaderVal->use_end();
-         UI != UE;) {
-      // Grab the use before incrementing the iterator.
-      Use &U = *UI;
-
-      // Increment the iterator before removing the use from the list.
-      ++UI;
-
-      // SSAUpdater can't handle a non-PHI use in the same block as an
-      // earlier def. We can easily handle those cases manually.
-      Instruction *UserInst = cast<Instruction>(U.getUser());
-      if (!isa<PHINode>(UserInst)) {
-        BasicBlock *UserBB = UserInst->getParent();
-
-        // The original users in the OrigHeader are already using the
-        // original definitions.
-        if (UserBB == OrigHeader)
-          continue;
+  bool preserveLoopSimplifyForm(const SmallPtrSetBB &Exits) const;
 
-        // Users in the OrigPreHeader need to use the value to which the
-        // original definitions are mapped.
-        if (UserBB == OrigPreheader) {
-          U = OrigPreHeaderVal;
-          continue;
-        }
-      }
+  bool isLoopSizeWithinLimits(const SmallVecBB &Blocks,
+                              const SmallPtrSetBB &Exits) const;
 
-      // Anything else can be handled by SSAUpdater.
-      SSA.RewriteUse(U);
-    }
+  bool isLegalToRotate() const;
 
-    // Replace MetadataAsValue(ValueAsMetadata(OrigHeaderVal)) uses in debug
-    // intrinsics.
-    LLVMContext &C = OrigHeader->getContext();
-    if (auto *VAM = ValueAsMetadata::getIfExists(OrigHeaderVal)) {
-      if (auto *MAV = MetadataAsValue::getIfExists(C, VAM)) {
-        for (auto UI = MAV->use_begin(), E = MAV->use_end(); UI != E;) {
-          // Grab the use before incrementing the iterator. Otherwise, altering
-          // the Use will invalidate the iterator.
-          Use &U = *UI++;
-          DbgInfoIntrinsic *UserInst = dyn_cast<DbgInfoIntrinsic>(U.getUser());
-          if (!UserInst)
-            continue;
+  void adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH,
+                           BasicBlock *NewPH) const;
 
-          // The original users in the OrigHeader are already using the original
-          // definitions.
-          BasicBlock *UserBB = UserInst->getParent();
-          if (UserBB == OrigHeader)
-            continue;
+  BasicBlock *collectSEMEBlocks(BasicBlock *OrigH, BasicBlock *OrigLatch,
+                                SmallVecBB &Blocks, SmallPtrSetBB &Exits) const;
 
-          // Users in the OrigPreHeader need to use the value to which the
-          // original definitions are mapped and anything else can be handled by
-          // the SSAUpdater. To avoid adding PHINodes, check if the value is
-          // available in UserBB, if not substitute undef.
-          Value *NewVal;
-          if (UserBB == OrigPreheader)
-            NewVal = OrigPreHeaderVal;
-          else if (SSA.HasValueForBlock(UserBB))
-            NewVal = SSA.GetValueInMiddleOfBlock(UserBB);
-          else
-            NewVal = UndefValue::get(OrigHeaderVal->getType());
-          U = MetadataAsValue::get(C, ValueAsMetadata::get(NewVal));
-        }
-      }
-    }
-  }
-}
+  PHINode *getOrCreatePHI(Instruction *Inst, BasicBlock *NewHeader,
+                          BasicBlock *NewPreheader, BasicBlock *NewLatch,
+                          ValueToValueMapTy &VMap) const;
 
-/// Rotate loop LP. Return true if the loop is rotated.
-///
-/// \param SimplifiedLatch is true if the latch was just folded into the final
-/// loop exit. In this case we may want to rotate even though the new latch is
-/// now an exiting branch. This rotation would have happened had the latch not
-/// been simplified. However, if SimplifiedLatch is false, then we avoid
-/// rotating loops in which the latch exits to avoid excessive or endless
-/// rotation. LoopRotate should be repeatable and converge to a canonical
-/// form. This property is satisfied because simplifying the loop latch can only
-/// happen once across multiple invocations of the LoopRotate pass.
-bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
+  void addNewPhisToNewHeader(const SmallVecBB &Blocks, BasicBlock *NewHeader,
+                             BasicBlock *NewPreheader, BasicBlock *NewLatch,
+                             ValueToValueMapTy &VMap) const;
+};
+
+// Check if there is something preventing loop from being rotated.
+bool LoopRotate::isLegalToRotate() const {
   // If the loop has only one block then there is not much to rotate.
   if (L->getBlocks().size() == 1)
     return false;
 
-  BasicBlock *OrigHeader = L->getHeader();
-  BasicBlock *OrigLatch = L->getLoopLatch();
+  const BasicBlock *LoopLatch = L->getLoopLatch();
+  if (!LoopLatch || isa<IndirectBrInst>(LoopLatch->getTerminator()))
+    return false;
+
+  const BasicBlock *OrigH = L->getHeader();
+  // The header should have a conditional branch to within the loop for rotation
+  // to happen. In cases where the header does not conditionally branch to
+  // within the loop e.g., header with an invoke instruction rotation will fail.
+  const BranchInst *BI = dyn_cast<const BranchInst>(OrigH->getTerminator());
+  if (!BI || !BI->isConditional())
+    return false;
 
-  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
-  if (!BI || BI->isUnconditional())
+  // If the loop latch is exiting then, probably, this loop is already rotated.
+  if (L->isLoopExiting(LoopLatch))
     return false;
 
-  // If the loop header is not one of the loop exiting blocks then
-  // either this loop is already rotated or it is not
-  // suitable for loop rotation transformations.
-  if (!L->isLoopExiting(OrigHeader))
+  // If the loop header is not one of the loop exiting blocks then it might
+  // have been rotated already.
+  if (!L->isLoopExiting(OrigH))
     return false;
 
-  // If the loop latch already contains a branch that leaves the loop then the
-  // loop is already rotated.
-  if (!OrigLatch)
+  // If the loop could not be converted to canonical form, it must have an
+  // indirectbr in it, just give up.
+  if (!L->getLoopPreheader())
     return false;
 
-  // Rotate if either the loop latch does *not* exit the loop, or if the loop
-  // latch was just simplified.
-  if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch)
+  if (!isSingleEntryMultipleExit(OrigH, LoopLatch, DT))
     return false;
 
+  return true;
+}
+
+// Check if the size of rotated loop will be within limits.
+bool LoopRotate::isLoopSizeWithinLimits(const SmallVecBB &Blocks,
+                                        const SmallPtrSetBB &Exits) const {
   // Check size of original header and reject loop if it is very big or we can't
   // duplicate blocks inside it.
-  {
-    SmallPtrSet<const Value *, 32> EphValues;
-    CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
+  if (Exits.size() > MaxExits)
+    return false;
+  SmallPtrSet<const Value *, 32> EphValues;
+  CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+  unsigned LoopSize = 0;
+  for (const BasicBlock *BB : Blocks) {
     CodeMetrics Metrics;
-    Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+    Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
+    // TODO: Modify this because there might be blocks with indirectbr, invoke
+    // in the loop but we can cut the cloning part at that point and that will
+    // be the last exit BB.
     if (Metrics.notDuplicatable) {
       DEBUG(dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
                    << " instructions: ";
             L->dump());
       return false;
     }
+
     if (Metrics.convergent) {
       DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
                       "instructions: ";
             L->dump());
       return false;
     }
-    if (Metrics.NumInsts > MaxHeaderSize)
+
+    LoopSize += Metrics.NumInsts;
+    // TODO: Even if the loop's size is greater than RotationMaxSize, what we
+    // can do is peel fewer basic blocks than needed to completely rotate the
+    // loop. That way, at least, some redundancies will be exposed.
+    if (LoopSize >= RotationMaxSize)
       return false;
   }
+  return true;
+}
 
-  // Now, this loop is suitable for rotation.
-  BasicBlock *OrigPreheader = L->getLoopPreheader();
+// Return a PHI for incoming values from NewPreHeader and NewLatch. If such a
+// PHI already exsits then return that otherwise create one. NewHeader has two
+// predecessors NewPreHeader and NewLatch. Inst is the original instruction
+// for which the PHI is to be created, its value is coming from NewLatch
+// and its cloned value VMap[Inst] is coming from NewPreheader.
+PHINode *LoopRotate::getOrCreatePHI(Instruction *Inst, BasicBlock *NewHeader,
+                                    BasicBlock *NewPreheader,
+                                    BasicBlock *NewLatch,
+                                    ValueToValueMapTy &VMap) const {
+    // Look within existing PHIs having same incoming values.
+    for (Instruction &I: *NewHeader) {
+        PHINode *PN = dyn_cast<PHINode>(&I);
+        if (!PN)
+          break;
+        assert(PN->getNumOperands() == 2);
+        int NLVal = PN->getBasicBlockIndex(NewLatch);
+        assert(NLVal >= 0);
+        int NPHVal = PN->getBasicBlockIndex(NewPreheader);
+        assert(NPHVal >= 0);
+        if (PN->getIncomingValue(NLVal) == Inst &&
+            PN->getIncomingValue(NPHVal) == VMap[Inst])
+          return PN;
+    }
 
-  // If the loop could not be converted to canonical form, it must have an
-  // indirectbr in it, just give up.
-  if (!OrigPreheader)
-    return false;
+    PHINode *PN = PHINode::Create(Inst->getType(), 2, "phi.nh",
+                                  &*NewHeader->begin());
+    PN->addIncoming(Inst, NewLatch);
+    PN->addIncoming(cast<Instruction>(VMap[Inst]), NewPreheader);
 
-  // Anything ScalarEvolution may know about this loop or the PHI nodes
-  // in its header will soon be invalidated.
-  if (SE)
-    SE->forgetLoop(L);
+    return PN;
+}
 
-  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+// Add phis to the new header and adjust the phi nodes from the OrigHeader.
+void LoopRotate::addNewPhisToNewHeader(const SmallVecBB &Blocks,
+                                       BasicBlock *NewHeader,
+                                       BasicBlock *NewPreheader,
+                                       BasicBlock *NewLatch,
+                                       ValueToValueMapTy &VMap) const {
+  // Add to NewHeader, phi nodes for all copied variables which are used.
+  for (BasicBlock *BB : Blocks) {
+    for (Instruction &Inst : *BB) {
+      // Skip Inst with no use e.g., branches.
+      if (Inst.use_begin() == Inst.use_end())
+        continue;
 
-  // Find new Loop header. NewHeader is a Header's one and only successor
-  // that is inside loop.  Header's other successor is outside the
-  // loop.  Otherwise loop is not suitable for rotation.
-  BasicBlock *Exit = BI->getSuccessor(0);
-  BasicBlock *NewHeader = BI->getSuccessor(1);
-  if (L->contains(Exit))
-    std::swap(Exit, NewHeader);
-  assert(NewHeader && "Unable to determine new loop header");
-  assert(L->contains(NewHeader) && !L->contains(Exit) &&
-         "Unable to determine loop header and exit blocks");
-
-  // This code assumes that the new header has exactly one predecessor.
-  // Remove any single-entry PHI nodes in it.
-  assert(NewHeader->getSinglePredecessor() &&
-         "New header doesn't have one pred!");
-  FoldSingleEntryPHINodes(NewHeader);
-
-  // Begin by walking OrigHeader and populating ValueMap with an entry for
-  // each Instruction.
-  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
-  ValueToValueMapTy ValueMap;
-
-  // For PHI nodes, the value available in OldPreHeader is just the
-  // incoming value from OldPreHeader.
-  for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
-    ValueMap[PN] = PN->getIncomingValueForBlock(OrigPreheader);
-
-  const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
-
-  // For the rest of the instructions, either hoist to the OrigPreheader if
-  // possible or create a clone in the OldPreHeader if not.
-  TerminatorInst *LoopEntryBranch = OrigPreheader->getTerminator();
-  while (I != E) {
-    Instruction *Inst = &*I++;
-
-    // If the instruction's operands are invariant and it doesn't read or write
-    // memory, then it is safe to hoist.  Doing this doesn't change the order of
-    // execution in the preheader, but does prevent the instruction from
-    // executing in each iteration of the loop.  This means it is safe to hoist
-    // something that might trap, but isn't safe to hoist something that reads
-    // memory (without proving that the loop doesn't write).
-    if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
-        !Inst->mayWriteToMemory() && !isa<TerminatorInst>(Inst) &&
-        !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
-      Inst->moveBefore(LoopEntryBranch);
-      continue;
-    }
+      for (auto UI = Inst.use_begin(), E = Inst.use_end(); UI != E;) {
+        Use &U = *UI++;
+        Instruction *UserInst = cast<Instruction>(U.getUser());
+
+        // Nothing to rename when the use is dominated by the definition.
+        if (DT->dominates(&Inst, UserInst))
+          continue;
+
+        if (!L->contains(UserInst->getParent())) {
+          // Handle uses in the loop-closed-phi.
+          PHINode *ClosePhi = cast<PHINode>(UserInst);
+          BasicBlock *Pred = ClosePhi->getIncomingBlock(U.getOperandNo());
+
+          // Do not rename a loop close phi node if its predecessor in the loop
+          // is dominated by Inst.
+          if (L->contains(Pred) && DT->dominates(BB, Pred))
+            continue;
+        }
+
+        PHINode* PN = getOrCreatePHI(&Inst, NewHeader, NewPreheader, NewLatch, VMap);
 
-    // Otherwise, create a duplicate of the instruction.
-    Instruction *C = Inst->clone();
-
-    // Eagerly remap the operands of the instruction.
-    RemapInstruction(C, ValueMap,
-                     RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
-
-    // With the operands remapped, see if the instruction constant folds or is
-    // otherwise simplifyable.  This commonly occurs because the entry from PHI
-    // nodes allows icmps and other instructions to fold.
-    // FIXME: Provide TLI, DT, AC to SimplifyInstruction.
-    Value *V = SimplifyInstruction(C, DL);
-    if (V && LI->replacementPreservesLCSSAForm(C, V)) {
-      // If so, then delete the temporary instruction and stick the folded value
-      // in the map.
-      ValueMap[Inst] = V;
-      if (!C->mayHaveSideEffects()) {
-        delete C;
-        C = nullptr;
+        // When Inst does not dominate U, it is going to use the updated
+        // definition coming from PN.
+        U.set(PN);
       }
-    } else {
-      ValueMap[Inst] = C;
-    }
-    if (C) {
-      // Otherwise, stick the new instruction into the new block!
-      C->setName(Inst->getName());
-      C->insertBefore(LoopEntryBranch);
-
-      if (auto *II = dyn_cast<IntrinsicInst>(C))
-        if (II->getIntrinsicID() == Intrinsic::assume)
-          AC->registerAssumption(II);
     }
   }
+}
+
+// Add incoming values to the (already present) PHIs of NewH.
+void LoopRotate::adjustNewHeaderPhis(ValueToValueMapTy &VMap, BasicBlock *NewH,
+                                     BasicBlock *NewPH) const {
+  for (Instruction &Inst : *NewH) {
+    PHINode *PN = dyn_cast<PHINode>(&Inst);
+    if (!PN)
+      break;
+    assert((PN->getNumOperands() == 1) && "NewH had multiple predecessors.");
+    Value *Op = PN->getIncomingValue(0);
+    if (Value *RenamedVal = VMap[Op])
+      PN->addIncoming(RenamedVal, NewPH);
+    else // When no mapping is available (e.g., in case of a constant).
+      PN->addIncoming(Op, NewPH);
+  }
+}
 
-  // Along with all the other instructions, we just cloned OrigHeader's
-  // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
-  // successors by duplicating their incoming values for OrigHeader.
-  TerminatorInst *TI = OrigHeader->getTerminator();
-  for (BasicBlock *SuccBB : TI->successors())
-    for (BasicBlock::iterator BI = SuccBB->begin();
-         PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
-      PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
-  // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
-  // OrigPreHeader's old terminator (the original branch into the loop), and
-  // remove the corresponding incoming values from the PHI nodes in OrigHeader.
-  LoopEntryBranch->eraseFromParent();
-
-  // If there were any uses of instructions in the duplicated block outside the
-  // loop, update them, inserting PHI nodes as required
-  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
-
-  // NewHeader is now the header of the loop.
-  L->moveToHeader(NewHeader);
-  assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
-  // At this point, we've finished our major CFG changes.  As part of cloning
-  // the loop into the preheader we've simplified instructions and the
-  // duplicated conditional branch may now be branching on a constant.  If it is
-  // branching on a constant and if that constant means that we enter the loop,
-  // then we fold away the cond branch to an uncond branch.  This simplifies the
-  // loop in cases important for nested loops, and it also means we don't have
-  // to split as many edges.
-  BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
-  assert(PHBI->isConditional() && "Should be clone of BI condbr!");
-  if (!isa<ConstantInt>(PHBI->getCondition()) ||
-      PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
-          NewHeader) {
-    // The conditional branch can't be folded, handle the general case.
-    // Update DominatorTree to reflect the CFG change we just made.  Then split
-    // edges as necessary to preserve LoopSimplify form.
-    if (DT) {
-      // Everything that was dominated by the old loop header is now dominated
-      // by the original loop preheader. Conceptually the header was merged
-      // into the preheader, even though we reuse the actual block as a new
-      // loop latch.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
-                                                   OrigHeaderNode->end());
-      DomTreeNode *OrigPreheaderNode = DT->getNode(OrigPreheader);
-      for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I)
-        DT->changeImmediateDominator(HeaderChildren[I], OrigPreheaderNode);
-
-      assert(DT->getNode(Exit)->getIDom() == OrigPreheaderNode);
-      assert(DT->getNode(NewHeader)->getIDom() == OrigPreheaderNode);
-
-      // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
+// OrigH and OrigLatch represent an SEME region.  Collect all BasicBlocks
+// (bounded by \p OrigH and \p OrigLatch), which are exiting the loop in
+// \p Blocks and collect all the exits from the region in \p Exits. Returns the
+// first basic block which will not be cloned.
+BasicBlock *LoopRotate::collectSEMEBlocks(BasicBlock *OrigH,
+                                          BasicBlock *OrigLatch,
+                                          SmallVecBB &Blocks,
+                                          SmallPtrSetBB &Exits) const {
+  BasicBlock *NewH = nullptr;
+  for (auto BB = df_begin(OrigH), E = df_end(OrigH); BB != E;) {
+    if (!L->contains(*BB)) {
+      BB.skipChildren();
+      continue;
     }
 
-    // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
-    // thus is not a preheader anymore.
-    // Split the edge to form a real preheader.
-    BasicBlock *NewPH = SplitCriticalEdge(
-        OrigPreheader, NewHeader,
-        CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
-    NewPH->setName(NewHeader->getName() + ".lr.ph");
-
-    // Preserve canonical loop form, which means that 'Exit' should have only
-    // one predecessor. Note that Exit could be an exit block for multiple
-    // nested loops, causing both of the edges to now be critical and need to
-    // be split.
-    SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
-    bool SplitLatchEdge = false;
-    for (BasicBlock *ExitPred : ExitPreds) {
-      // We only need to split loop exit edges.
-      Loop *PredLoop = LI->getLoopFor(ExitPred);
-      if (!PredLoop || PredLoop->contains(Exit))
-        continue;
-      if (isa<IndirectBrInst>(ExitPred->getTerminator()))
-        continue;
-      SplitLatchEdge |= L->getLoopLatch() == ExitPred;
-      BasicBlock *ExitSplit = SplitCriticalEdge(
-          ExitPred, Exit,
-          CriticalEdgeSplittingOptions(DT, LI).setPreserveLCSSA());
-      ExitSplit->moveBefore(Exit);
+    // Copy until any BB where the branch does not exit loop, or the loop-latch.
+    if (OrigLatch == *BB || !L->isLoopExiting(*BB) ||
+        !isa<BranchInst>((*BB)->getTerminator())) {
+      // This will become the new header.
+      NewH = *BB;
+      BB.skipChildren();
+    } else {
+      Blocks.push_back(*BB);
+
+      BranchInst *BI = cast<BranchInst>((*BB)->getTerminator());
+      for (unsigned B = 0, E = BI->getNumSuccessors(); B < E; ++B) {
+        BasicBlock *Succ = BI->getSuccessor(B);
+        if (!L->contains(Succ))
+          Exits.insert(Succ);
+      }
+      ++BB;
     }
-    assert(SplitLatchEdge &&
-           "Despite splitting all preds, failed to split latch exit?");
-  } else {
-    // We can fold the conditional branch in the preheader, this makes things
-    // simpler. The first step is to remove the extra edge to the Exit block.
-    Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
-    BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
-    NewBI->setDebugLoc(PHBI->getDebugLoc());
-    PHBI->eraseFromParent();
-
-    // With our CFG finalized, update DomTree if it is available.
-    if (DT) {
-      // Update OrigHeader to be dominated by the new header block.
-      DT->changeImmediateDominator(NewHeader, OrigPreheader);
-      DT->changeImmediateDominator(OrigHeader, OrigLatch);
-
-      // Brute force incremental dominator tree update. Call
-      // findNearestCommonDominator on all CFG predecessors of each child of the
-      // original header.
-      DomTreeNode *OrigHeaderNode = DT->getNode(OrigHeader);
-      SmallVector<DomTreeNode *, 8> HeaderChildren(OrigHeaderNode->begin(),
-                                                   OrigHeaderNode->end());
-      bool Changed;
-      do {
-        Changed = false;
-        for (unsigned I = 0, E = HeaderChildren.size(); I != E; ++I) {
-          DomTreeNode *Node = HeaderChildren[I];
-          BasicBlock *BB = Node->getBlock();
-
-          pred_iterator PI = pred_begin(BB);
-          BasicBlock *NearestDom = *PI;
-          for (pred_iterator PE = pred_end(BB); PI != PE; ++PI)
-            NearestDom = DT->findNearestCommonDominator(NearestDom, *PI);
-
-          // Remember if this changes the DomTree.
-          if (Node->getIDom()->getBlock() != NearestDom) {
-            DT->changeImmediateDominator(BB, NearestDom);
-            Changed = true;
-          }
-        }
+  }
+  return NewH;
+}
 
-        // If the dominator changed, this may have an effect on other
-        // predecessors, continue until we reach a fixpoint.
-      } while (Changed);
+// Helper function for copySEME. Adjusts the PHIs of all the \p Exits bounding
+// an SEME. \p VMap contains mapping of original BB vs copied BB.
+static void adjustExitingPhis(ValueToValueMapTy &VMap,
+                              const SmallPtrSetImpl<BasicBlock *> &Exits) {
+  for (BasicBlock *BB : Exits) {
+    for (Instruction &Inst : *BB) {
+      PHINode *PN = dyn_cast<PHINode>(&Inst);
+      if (!PN)
+        break;
+      bool EdgeFromOrigBB = false;
+      for (unsigned i = 0, e = PN->getNumOperands(); i != e; ++i) {
+        Value *CopyB = VMap[PN->getIncomingBlock(i)];
+        if (!CopyB) // Skip args coming from outside the SEME.
+          continue;
+        BasicBlock *CopyBB = cast<BasicBlock>(CopyB);
+        EdgeFromOrigBB = true;
+        Value *Op = PN->getIncomingValue(i);
+        if (Value *RenamedVal = VMap[Op])
+          PN->addIncoming(RenamedVal, CopyBB);
+        else
+          // When no mapping is available it may be a constant,
+          // a function argument, or a global, add as it is.
+          PN->addIncoming(Op, CopyBB);
+      }
+      assert(EdgeFromOrigBB && "Illegal exit from SEME.");
     }
   }
+}
 
-  assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
-  assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+/// Clones the basic blocks (\p Blocks) of an SEME bounded by \p Exits.
+/// Blocks[0] is the entry basic block of the SEME.
+/// The mapping between original BBs and correponding copied BBs are
+/// populated in \p VMap. During copy the DOM and LI of CFG are updated.
+/// \returns The entry point of the copied SEME. \p NameSuffix is used to suffix
+/// name of the copied BBs. The copied SEME is also an SEME.
+static BasicBlock* copyBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
+                              const SmallPtrSetImpl<BasicBlock *> &Exits,
+                              ValueToValueMapTy &VMap,
+                              const Twine &NameSuffix,
+                              DominatorTree *DT, LoopInfo *LI) {
+  // Step1: Clone the basic blocks and populate VMap.
+  BasicBlock *OrigH = Blocks[0];
+
+  Function *F = OrigH->getParent();
+  SmallVector<BasicBlock *, 4> NewBlocks;
+  for (BasicBlock *BB : Blocks) {
+    assert(!isa<IndirectBrInst>(BB->getTerminator()) &&
+           "Cannot clone SEME with indirect branches.");
+
+    BasicBlock *NewBB = CloneBasicBlock(BB, VMap, NameSuffix, F);
+    // Move them physically from the end of the block list.
+    F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(),
+                                  NewBB);
+    Loop *BBLoop = LI->getLoopFor(BB);
+    Loop *BBParentLoop = BBLoop->getParentLoop();
+    if (BBParentLoop)
+      BBParentLoop->addBasicBlockToLoop(NewBB, *LI);
+    VMap[BB] = NewBB;
+    NewBlocks.push_back(NewBB);
+  }
 
-  // Now that the CFG and DomTree are in a consistent state again, try to merge
-  // the OrigHeader block into OrigLatch.  This will succeed if they are
-  // connected by an unconditional branch.  This is just a cleanup so the
-  // emitted code isn't too gross in this common case.
-  MergeBlockIntoPredecessor(OrigHeader, DT, LI);
+  // Step2: Remap the names in copied BBs.
+  remapInstructionsInBlocks(NewBlocks, VMap);
 
-  DEBUG(dbgs() << "LoopRotation: into "; L->dump());
+  // Step3: Redirect the edges.
+  for (BasicBlock *BB : Blocks) {
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+    BranchInst *BI = dyn_cast<BranchInst>(NewBB->getTerminator());
+    if (!BI)
+      continue;
 
-  ++NumRotated;
-  return true;
-}
+    for (unsigned I = 0, E = BI->getNumSuccessors(); I < E; ++I)
+      if (auto *NewSucc = cast_or_null<BasicBlock>(VMap[BI->getSuccessor(I)]))
+        BI->setSuccessor(I, NewSucc);
+  }
 
-/// Determine whether the instructions in this range may be safely and cheaply
-/// speculated. This is not an important enough situation to develop complex
-/// heuristics. We handle a single arithmetic instruction along with any type
-/// conversions.
-static bool shouldSpeculateInstrs(BasicBlock::iterator Begin,
-                                  BasicBlock::iterator End, Loop *L) {
-  bool seenIncrement = false;
-  bool MultiExitLoop = false;
+  // Step4: Update the DOM of copied SEME. Except for the entry block its tree
+  // structure is the same as of original SEME so the dominators also follow the
+  // same structural property. If the IDom of original BB is not in SEME that
+  // means it is the entry block, in that case the new IDom of the new BB must
+  // be its single predecessor because we are dealing with an SEME region.
+  BasicBlock *EntryNewSEME = nullptr;
+  if (auto *DomT = DT->getNode(Blocks[0])->getIDom()) {
+    // Entry to SEME has a dominator, update the copied entry.
+    BasicBlock *Dom = DomT->getBlock();
+    assert(!VMap[Dom]); // Dom does not belong to SEME => entry block.
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[Blocks[0]]);
+    EntryNewSEME = NewBB;
+    DT->addNewBlock(NewBB, Dom);
+    DT->changeImmediateDominator(NewBB, Dom);
+  }
 
-  if (!L->getExitingBlock())
-    MultiExitLoop = true;
+  for (auto BBI = std::next(Blocks.begin()); BBI != Blocks.end(); ++BBI) {
+    BasicBlock *NewBB = cast<BasicBlock>(VMap[*BBI]);
+    BasicBlock *Dom = DT->getNode(*BBI)->getIDom()->getBlock();
+    BasicBlock *NewDom = cast<BasicBlock>(VMap[Dom]);
+    DT->addNewBlock(NewBB, NewDom);
+    DT->changeImmediateDominator(NewBB, NewDom);
+  }
 
-  for (BasicBlock::iterator I = Begin; I != End; ++I) {
+  // Step5: Adjust PHI nodes for edges exiting the SEME.
+  adjustExitingPhis(VMap, Exits);
+  return EntryNewSEME;
+}
 
-    if (!isSafeToSpeculativelyExecute(&*I))
-      return false;
+bool LoopRotate::preserveLoopSimplifyForm(const SmallPtrSetBB &Exits) const {
+    bool HasIndirectBr = false;
+    // Split the edge from exiting BB to exit BB if not in loop-simplify form.
+    // We do not guarantee loop-simplify form to be preserved if the original
+    // loop has indirect exiting branches.
+    for (BasicBlock *BB: Exits) {
+      if (!BB->getSinglePredecessor()) {
+        // Splitting edges on the fly causes predecessor list to be corrupted
+        // so collect all the predecessors of BB to be split.
+        SmallVector<BasicBlock *, 2> EdgesToBeSplit;
+        for (BasicBlock *Pred : predecessors(BB))
+          if (L->contains(Pred)) {
+            if (isa<IndirectBrInst>(Pred->getTerminator()))
+              HasIndirectBr = true;
+            else
+              EdgesToBeSplit.push_back(Pred);
+          }
+        for (BasicBlock *Pred: EdgesToBeSplit)
+          SplitEdge(Pred, BB, DT, LI); // Split Pred->BB
+      }
+    }
+  return HasIndirectBr;
+}
 
-    if (isa<DbgInfoIntrinsic>(I))
-      continue;
+/// Rotate loop L.
+/// Rotate the SEME (\p Blocks) bounded by \p Exits.
+/// Blocks[0] is the entry basic block of the SEME.
+void LoopRotate::rotateLoop(BasicBlock *NewH, const SmallVecBB &Blocks,
+                            const SmallPtrSetBB &Exits) const {
+  bool LoopSimplifyForm = false;
+  DEBUG(LoopSimplifyForm = L->isLoopSimplifyForm());
+  BasicBlock *OrigH = L->getHeader();
+  BasicBlock *OrigLatch = L->getLoopLatch();
+  BasicBlock *OrigPH = L->getLoopPreheader();
 
-    switch (I->getOpcode()) {
-    default:
-      return false;
-    case Instruction::GetElementPtr:
-      // GEPs are cheap if all indices are constant.
-      if (!cast<GEPOperator>(I)->hasAllConstantIndices())
-        return false;
-      // fall-thru to increment case
-      LLVM_FALLTHROUGH;
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::And:
-    case Instruction::Or:
-    case Instruction::Xor:
-    case Instruction::Shl:
-    case Instruction::LShr:
-    case Instruction::AShr: {
-      Value *IVOpnd =
-          !isa<Constant>(I->getOperand(0))
-              ? I->getOperand(0)
-              : !isa<Constant>(I->getOperand(1)) ? I->getOperand(1) : nullptr;
-      if (!IVOpnd)
-        return false;
-
-      // If increment operand is used outside of the loop, this speculation
-      // could cause extra live range interference.
-      if (MultiExitLoop) {
-        for (User *UseI : IVOpnd->users()) {
-          auto *UserInst = cast<Instruction>(UseI);
-          if (!L->contains(UserInst))
-            return false;
-        }
-      }
+  DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
 
-      if (seenIncrement)
-        return false;
-      seenIncrement = true;
+  /// The mapping between original BBs in Blocks and correponding copied BBs are
+  /// populated in VMap.
+  ValueToValueMapTy VMap;
+  copyBlocks(Blocks, Exits, VMap, ".lr", DT, LI);
+
+  // Redirect original preheader to the entry of copied SEME.
+  BranchInst *OrigPHBI = dyn_cast<BranchInst>(OrigPH->getTerminator());
+  assert(OrigPHBI && (1 == OrigPHBI->getNumSuccessors()) &&
+         "Preheader does not have single successor.");
+
+  BasicBlock *CopyOrigH = cast<BasicBlock>(VMap[OrigH]);
+  OrigPHBI->setSuccessor(0, CopyOrigH);
+  DT->changeImmediateDominator(CopyOrigH, OrigPH);
+  L->moveToHeader(NewH);
+
+  BasicBlock *BeforeLoop = nullptr;
+  for (BasicBlock *BB : predecessors(NewH))
+    if (!L->contains(BB)) {
+      BeforeLoop = BB;
       break;
     }
-    case Instruction::Trunc:
-    case Instruction::ZExt:
-    case Instruction::SExt:
-      // ignore type conversions
-      break;
+  assert(BeforeLoop && "No entry point to the loop from New Header.");
+
+  Function *F = BeforeLoop->getParent();
+  // Move NewH physically to the beginning of the loop.
+  F->getBasicBlockList().splice(OrigH->getIterator(), F->getBasicBlockList(),
+                                NewH);
+
+  // SplitEdge does not work properly with single-entry PHIs.
+  BasicBlock *NewPH = BasicBlock::Create(
+      NewH->getContext(), NewH->getName() + ".lr.ph", F, BeforeLoop);
+
+  Loop *OuterLoop = LI->getLoopFor(OrigPH);
+  if (OuterLoop)
+    OuterLoop->addBasicBlockToLoop(NewPH, *LI);
+
+  // BeforeLoop --> NewPH --> NewH.
+  insertBetween(NewPH, BeforeLoop, NewH);
+
+  DT->addNewBlock(NewPH, BeforeLoop);
+  DT->changeImmediateDominator(NewPH, BeforeLoop);
+  DT->changeImmediateDominator(NewH, NewPH);
+
+  // Also, the original entry lost its immediate dominator so its dominator
+  // should be adjusted. We use SEME property => idom (OrigH) = its single pred.
+  DT->changeImmediateDominator(OrigH, OrigH->getSinglePredecessor());
+
+  // Adjust the dominators of original SEME.
+  for (BasicBlock *BB : Blocks) {
+    typedef DomTreeNodeBase<BasicBlock> DTNode;
+    // Get the subtree of BB in the dominator tree.
+    DTNode *DTBB = (*DT)[BB];
+    std::vector<DTNode *>::const_iterator I = DTBB->begin();
+    while (I != DTBB->end()) {
+      BasicBlock *ExitBB = (*I)->getBlock();
+      if (L->contains(ExitBB)) {
+        ++I;
+        continue;
+      }
+      BasicBlock *StaleIDom = DT->getNode(ExitBB)->getIDom()->getBlock();
+      BasicBlock *NewBB = cast<BasicBlock>(VMap[BB]);
+      // NewIDom is correct because this part of CFG is up-to-date.
+      BasicBlock *NewIDom = DT->findNearestCommonDominator(StaleIDom, NewBB);
+      NewIDom = DT->findNearestCommonDominator(NewIDom, BB);
+      if (NewIDom != StaleIDom) {
+        DT->changeImmediateDominator(ExitBB, NewIDom);
+        DEBUG(dbgs() << "\nChanging IDom of " << *ExitBB << "to" << *NewIDom);
+        I = DTBB->begin();
+      } else
+        ++I;
     }
   }
-  return true;
-}
-
-/// Fold the loop tail into the loop exit by speculating the loop tail
-/// instructions. Typically, this is a single post-increment. In the case of a
-/// simple 2-block loop, hoisting the increment can be much better than
-/// duplicating the entire loop header. In the case of loops with early exits,
-/// rotation will not work anyway, but simplifyLoopLatch will put the loop in
-/// canonical form so downstream passes can handle it.
-///
-/// I don't believe this invalidates SCEV.
-bool LoopRotate::simplifyLoopLatch(Loop *L) {
-  BasicBlock *Latch = L->getLoopLatch();
-  if (!Latch || Latch->hasAddressTaken())
-    return false;
 
-  BranchInst *Jmp = dyn_cast<BranchInst>(Latch->getTerminator());
-  if (!Jmp || !Jmp->isUnconditional())
-    return false;
+  adjustNewHeaderPhis(VMap, NewH, NewPH);
 
-  BasicBlock *LastExit = Latch->getSinglePredecessor();
-  if (!LastExit || !L->isLoopExiting(LastExit))
-    return false;
+  BasicBlock *NewLatch = L->getLoopLatch();
+  assert(L->getLoopPreheader() && "Invalid loop preheader after rotation");
+  assert(NewLatch && "Invalid loop latch after rotation");
 
-  BranchInst *BI = dyn_cast<BranchInst>(LastExit->getTerminator());
-  if (!BI)
-    return false;
+  addNewPhisToNewHeader(Blocks, NewH, NewPH, NewLatch, VMap);
 
-  if (!shouldSpeculateInstrs(Latch->begin(), Jmp->getIterator(), L))
-    return false;
+  // Discard incoming values in the CopyOrigHeader, which are coming from
+  // OrigLatch since it has only one predecessor.
+  discardIncomingValues(CopyOrigH, OrigLatch);
+  discardIncomingValues(OrigH, OrigPH);
+  const bool HasIndirectBr = preserveLoopSimplifyForm(Exits);
 
-  DEBUG(dbgs() << "Folding loop latch " << Latch->getName() << " into "
-               << LastExit->getName() << "\n");
+  DEBUG(DT->verifyDomTree());
+  DEBUG(LI->verify(*DT));
 
-  // Hoist the instructions from Latch into LastExit.
-  LastExit->getInstList().splice(BI->getIterator(), Latch->getInstList(),
-                                 Latch->begin(), Jmp->getIterator());
+  // If the loop was not in loop-simplify form then after the loop
+  // rotation, it is still not going to be in that form.
+  if (!HasIndirectBr && LoopSimplifyForm)
+    assert(L->isLoopSimplifyForm() && "Loop simplify form not preserved.");
+  assert(L->isRecursivelyLCSSAForm(*DT, *LI) && "Loop is not in LCSSA form.");
 
-  unsigned FallThruPath = BI->getSuccessor(0) == Latch ? 0 : 1;
-  BasicBlock *Header = Jmp->getSuccessor(0);
-  assert(Header == L->getHeader() && "expected a backward branch");
+  DEBUG(dbgs() << "\nLoopRotation: rotated "; L->dumpVerbose());
 
-  // Remove Latch from the CFG so that LastExit becomes the new Latch.
-  BI->setSuccessor(FallThruPath, Header);
-  Latch->replaceSuccessorsPhiUsesWith(LastExit);
-  Jmp->eraseFromParent();
+  assert(isSingleEntryMultipleExit(L->getHeader(), NewLatch, DT) &&
+         "Rotated loop not an SEME");
+  assert(isSingleEntryMultipleExit(CopyOrigH, NewPH, DT) &&
+         "Copied SEME not an SEME");
 
-  // Nuke the Latch block.
-  assert(Latch->empty() && "unable to evacuate Latch");
-  LI->removeBlock(Latch);
-  if (DT)
-    DT->eraseNode(Latch);
-  Latch->eraseFromParent();
-  return true;
+  ++NumRotated;
 }
 
 /// Rotate \c L, and return true if any modification was made.
-bool LoopRotate::processLoop(Loop *L) {
+bool LoopRotate::processLoop() {
+  if (!isLegalToRotate())
+    return false;
+
+  BasicBlock *OrigH = L->getHeader();
+  BasicBlock *LoopLatch = L->getLoopLatch();
+
+  // Basic blocks to be copied.
+  SmallVector<BasicBlock *, 4> Blocks;
+  SmallPtrSet<BasicBlock *, 4> Exits;
+  // Collect all nodes of the loop from header to latch.
+  BasicBlock *NewH = collectSEMEBlocks(OrigH, LoopLatch, Blocks, Exits);
+  assert(NewH && "Invalid SEME region.");
+
+  if (!isLoopSizeWithinLimits(Blocks, Exits))
+    return false;
+
   // Save the loop metadata.
-  MDNode *LoopMD = L->getLoopID();
+  MDNode *LoopMD = nullptr;
+  Instruction *TI = nullptr;
+  std::tie(LoopMD, TI) = L->getLoopIDWithInstr();
+
+  // Make sure the latch has only one successor.
+  if (!LoopLatch->getSingleSuccessor()) {
+    // Since LoopLatch was skipped while collecting SEME blocks, exits out of
+    // the loop-latch is collected here.
+    BranchInst *BLI = dyn_cast<BranchInst>(LoopLatch->getTerminator());
+    if (!BLI)
+      return false;
+    // If the NewH is the loop-latch => Exits does not include the exiting block
+    // from loop-latch (see collectSEMEBlocks).
+    if (NewH == LoopLatch)
+      for (unsigned I = 0, E = BLI->getNumSuccessors(); I < E; ++I) {
+        BasicBlock *Succ = BLI->getSuccessor(I);
+        if (Succ != OrigH && !L->contains(Succ))
+          Exits.insert(Succ);
+      }
+
+    // The old loop-latch will become the parent of new loop-latch.
+    DEBUG(dbgs() << "\nSplitting the edge of Loop:"; L->dumpVerbose(););
+    BasicBlock *NewLoopLatch = SplitEdge(LoopLatch, OrigH, DT, LI);
+
+    // If the NewH is the loop-latch => NewH would change as the loop-latch has
+    // changed after splitting. Also, we need to add the (old) loop-latch to the
+    // blocks because it was skipped (all the blocks before NewH are copied to
+    // new SEME).
+    if (NewH == LoopLatch) {
+      NewH = NewLoopLatch;
+      Blocks.push_back(LoopLatch);
+    }
+    LoopLatch = NewLoopLatch;
+  }
 
-  // Simplify the loop latch before attempting to rotate the header
-  // upward. Rotation may not be needed if the loop tail can be folded into the
-  // loop exit.
-  bool SimplifiedLatch = simplifyLoopLatch(L);
+  assert(LoopLatch->getSingleSuccessor() && "Invalid SEME region.");
 
-  bool MadeChange = rotateLoop(L, SimplifiedLatch);
-  assert((!MadeChange || L->isLoopExiting(L->getLoopLatch())) &&
+  // Anything ScalarEvolution may know about this loop or the PHI nodes in its
+  // header will soon be invalidated.
+  if (SE)
+    SE->forgetLoop(L);
+
+  rotateLoop(NewH, Blocks, Exits);
+  assert(L->isLoopExiting(L->getLoopLatch()) &&
          "Loop latch should be exiting after loop-rotate.");
 
-  // Restore the loop metadata.
-  // NB! We presume LoopRotation DOESN'T ADD its own metadata.
-  if ((MadeChange || SimplifiedLatch) && LoopMD)
+  if (LoopMD) {
+    // Drop loop metadata from the old loop header.
+    SmallVector<unsigned, 2> MDs;
+    TI->getAllNonDebugMetadataIDs(MDs);
+    MDs.erase(llvm::remove_if(MDs, [](unsigned ID) {
+          return ID == LLVMContext::MD_loop; }), MDs.end());
+
+    TI->dropUnknownNonDebugMetadata(MDs);
+
+    // Restore the loop metadata to the new loop header.
     L->setLoopID(LoopMD);
+  }
 
-  return MadeChange;
+  return true;
 }
 
 LoopRotatePass::LoopRotatePass(bool EnableHeaderDuplication)
@@ -628,10 +707,9 @@
 PreservedAnalyses LoopRotatePass::run(Loop &L, LoopAnalysisManager &AM,
                                       LoopStandardAnalysisResults &AR,
                                       LPMUpdater &) {
-  int Threshold = EnableHeaderDuplication ? DefaultRotationThreshold : 0;
-  LoopRotate LR(Threshold, &AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE);
+  LoopRotate LR(&AR.LI, &AR.TTI, &AR.AC, &AR.DT, &AR.SE, &L);
 
-  bool Changed = LR.processLoop(&L);
+  bool Changed = LR.processLoop();
   if (!Changed)
     return PreservedAnalyses::all();
 
@@ -641,22 +719,17 @@
 namespace {
 
 class LoopRotateLegacyPass : public LoopPass {
-  unsigned MaxHeaderSize;
-
 public:
   static char ID; // Pass ID, replacement for typeid
-  LoopRotateLegacyPass(int SpecifiedMaxHeaderSize = -1) : LoopPass(ID) {
+  LoopRotateLegacyPass() : LoopPass(ID) {
     initializeLoopRotateLegacyPassPass(*PassRegistry::getPassRegistry());
-    if (SpecifiedMaxHeaderSize == -1)
-      MaxHeaderSize = DefaultRotationThreshold;
-    else
-      MaxHeaderSize = unsigned(SpecifiedMaxHeaderSize);
   }
 
   // LCSSA form makes instruction renaming easier.
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     getLoopAnalysisUsage(AU);
   }
 
@@ -672,8 +745,8 @@
     auto *DT = DTWP ? &DTWP->getDomTree() : nullptr;
     auto *SEWP = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
     auto *SE = SEWP ? &SEWP->getSE() : nullptr;
-    LoopRotate LR(MaxHeaderSize, LI, TTI, AC, DT, SE);
-    return LR.processLoop(L);
+    LoopRotate LR(LI, TTI, AC, DT, SE, L);
+    return LR.processLoop();
   }
 };
 }
@@ -683,10 +756,9 @@
                       false, false)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops", false,
                     false)
 
-Pass *llvm::createLoopRotatePass(int MaxHeaderSize) {
-  return new LoopRotateLegacyPass(MaxHeaderSize);
-}
+Pass *llvm::createLoopRotatePass() { return new LoopRotateLegacyPass(); }
Index: llvm/test/Analysis/GlobalsModRef/memset-escape.ll
===================================================================
--- llvm/test/Analysis/GlobalsModRef/memset-escape.ll
+++ llvm/test/Analysis/GlobalsModRef/memset-escape.ll
@@ -6,15 +6,19 @@
 @a = internal global [3 x i32] zeroinitializer, align 4
 @b = common global i32 0, align 4
 
-; The important thing we're checking for here is the reload of (some element of)
-; @a after the memset.
+; Check that load and the call to abort is redundant.
+; CHECK:   store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
+; CHECK:   store i32 0, i32* @b, align 4
+; CHECK:   br label %for.body
 
-; CHECK-LABEL: @main
-; CHECK: call void @llvm.memset.p0i8.i64{{.*}} @a
-; CHECK: store i32 3
-; CHECK: load i32, i32* getelementptr {{.*}} @a
-; CHECK: icmp eq i32
-; CHECK: br i1
+; CHECK: for.body:                                         ; preds = %for.body.lr.ph
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 0), align 4
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
+; CHECK:   store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
+; CHECK:   store i32 3, i32* @b, align 4
+; CHECK:   br i1 true, label %if.end, label %if.then
+; CHECK-NOT: load
+; CHECK-NOT: call void @abort()
 
 define i32 @main() {
 entry:
Index: llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
===================================================================
--- llvm/test/Analysis/ScalarEvolution/2012-03-26-LoadConstant.ll
+++ /dev/null
@@ -1,43 +0,0 @@
-; RUN: opt < %s -basicaa -globalopt -instcombine -loop-rotate -licm -instcombine -indvars -loop-deletion -constmerge -S | FileCheck %s
-; PR11882: ComputeLoadConstantCompareExitLimit crash.
-;
-; for.body is deleted leaving a loop-invariant load.
-; CHECK-NOT: for.body
-target datalayout = "e-p:64:64:64-n32:64"
-
-@func_21_l_773 = external global i32, align 4
-@g_814 = external global i32, align 4
-@g_244 = internal global [1 x [0 x i32]] zeroinitializer, align 4
-
-define void @func_21() nounwind uwtable ssp {
-entry:
-  br label %lbl_818
-
-lbl_818:                                          ; preds = %for.end, %entry
-  call void (...) @func_27()
-  store i32 0, i32* @g_814, align 4
-  br label %for.cond
-
-for.cond:                                         ; preds = %for.body, %lbl_818
-  %0 = load i32, i32* @g_814, align 4
-  %cmp = icmp sle i32 %0, 0
-  br i1 %cmp, label %for.body, label %for.end
-
-for.body:                                         ; preds = %for.cond
-  %idxprom = sext i32 %0 to i64
-  %arrayidx = getelementptr inbounds [0 x i32], [0 x i32]* getelementptr inbounds ([1 x [0 x i32]], [1 x [0 x i32]]* @g_244, i32 0, i64 0), i32 0, i64 %idxprom
-  %1 = load i32, i32* %arrayidx, align 1
-  store i32 %1, i32* @func_21_l_773, align 4
-  store i32 1, i32* @g_814, align 4
-  br label %for.cond
-
-for.end:                                          ; preds = %for.cond
-  %2 = load i32, i32* @func_21_l_773, align 4
-  %tobool = icmp ne i32 %2, 0
-  br i1 %tobool, label %lbl_818, label %if.end
-
-if.end:                                           ; preds = %for.end
-  ret void
-}
-
-declare void @func_27(...)
Index: llvm/test/Transforms/LoopRotate/basic.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/basic.ll
+++ llvm/test/Transforms/LoopRotate/basic.ll
@@ -18,9 +18,10 @@
   %arrayidx = getelementptr inbounds [20 x i32], [20 x i32]* %array, i64 0, i64 0
   br i1 %cmp, label %for.body, label %for.end
 
-; CHECK: for.body:
+; CHECK: for.cond.lr:
 ; CHECK-NEXT: phi i32 [ 0
-; CHECK-NEXT: store i32 0
+; CHECK: for.body:
+; CHECK: store i32 0
 
 for.body:                                         ; preds = %for.cond
   store i32 0, i32* %arrayidx, align 16
Index: llvm/test/Transforms/LoopRotate/dbgvalue.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/dbgvalue.ll
+++ llvm/test/Transforms/LoopRotate/dbgvalue.ll
@@ -5,9 +5,9 @@
 
 define i32 @tak(i32 %x, i32 %y, i32 %z) nounwind ssp !dbg !0 {
 ; CHECK-LABEL: define i32 @tak(
-; CHECK: entry
-; CHECK-NEXT: call void @llvm.dbg.value(metadata i32 %x
-; CHECK: tail call void @llvm.dbg.value(metadata i32 %call
+; CHECK: tailrecurse.lr
+; CHECK: call void @llvm.dbg.value(metadata i32 %x
+; CHECK: tail call void @llvm.dbg.value(metadata i32 %y
 
 entry:
   br label %tailrecurse
@@ -42,7 +42,7 @@
 ; CHECK-LABEL: define i32 @tak2(
 ; CHECK: entry
 ; CHECK: tail call void @llvm.dbg.value(metadata i32 %x.tr
-; CHECK: tail call void @llvm.dbg.value(metadata i32 undef
+; CHECK: tail call void @llvm.dbg.value(metadata i32 %y.tr
 
 entry:
   br label %tailrecurse
@@ -83,12 +83,13 @@
 ; Ensure that the loop increment basic block is rotated into the tail of the
 ; body, even though it contains a debug intrinsic call.
 ; CHECK-LABEL: define void @FindFreeHorzSeg(
+; CHECK: for.inc:
+; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
 ; CHECK: %dec = add
 ; CHECK-NEXT: tail call void @llvm.dbg.value
+; CHECK-NEXT: br label %for.cond
 ; CHECK: %cmp = icmp
 ; CHECK: br i1 %cmp
-; CHECK: phi i64 [ %{{[^,]*}}, %{{[^,]*}} ]
-; CHECK-NEXT: br label %for.end
 
 
 entry:
Index: llvm/test/Transforms/LoopRotate/loop-rotate.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/loop-rotate.ll
@@ -0,0 +1,284 @@
+; RUN: opt -S < %s -loop-rotate -verify-dom-info -verify-loop-info | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Check that a basic while loop is rotated.
+; CHECK-LABEL: @bar0
+; CHECK: while.cond.lr:                                    ; preds = %entry
+; CHECK: while.body.lr.ph:                                 ; preds = %while.cond.lr
+
+declare void @foo0(i64* %this, i64* %that)
+
+define void @bar0(i64* %__begin1, i64* %__end1, i64** %__end2) {
+entry:
+  br label %while.cond
+
+while.cond:                                       ; preds = %while.body, %entry
+  %__end1.addr.0 = phi i64* [ %__end1, %entry ], [ %incdec.ptr, %while.body ]
+  %cmp = icmp eq i64* %__end1.addr.0, %__begin1
+  br i1 %cmp, label %while.end, label %while.body
+
+while.body:                                       ; preds = %while.cond
+  %incdec.ptr = load i64*, i64** %__end2, align 8
+  tail call void @foo0(i64* %__begin1, i64* %incdec.ptr)
+  store i64* %incdec.ptr, i64** %__end2, align 8
+  br label %while.cond
+
+while.end:                                        ; preds = %while.cond
+  ret void
+}
+
+; Check that the loop is rotated and multiple phis are updated.
+; CHECK-LABEL: @foo2
+; CHECK: do.body.i10.lr:                                   ; preds = %if.else.i3
+; CHECK: do.cond.i19.lr.ph:                                ; preds = %do.body.i10.lr
+
+define void @foo2() {
+entry:
+  br i1 undef, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry
+  br i1 undef, label %inverse.exit21, label %if.else.i3
+
+if.else.i3:                                       ; preds = %for.body
+  br label %do.body.i10
+
+do.body.i10:                                      ; preds = %do.cond.i19, %if.else.i3
+  %b1.0.i4 = phi i64 [ 0, %if.else.i3 ], [ %b2.0.i7, %do.cond.i19 ]
+  %b2.0.i7 = phi i64 [ 1, %if.else.i3 ], [ %sub8.i18, %do.cond.i19 ]
+  br i1 undef, label %do.cond.thread.i15, label %do.cond.i19
+
+do.cond.thread.i15:                               ; preds = %do.body.i10
+  br label %inverse.exit21
+
+do.cond.i19:                                      ; preds = %do.body.i10
+  %mul.i17 = mul nsw i64 undef, %b2.0.i7
+  %sub8.i18 = sub nsw i64 %b1.0.i4, %mul.i17
+  br label %do.body.i10
+
+inverse.exit21:                                   ; preds = %do.cond.thread.i15, %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %inverse.exit21, %entry
+  ret void
+}
+
+; Check that the loop with switch block within loop is rotated and the critical edge is first split.
+; CHECK-LABEL: @foo3
+; CHECK: entry:
+; CHECK: br i1 %cmp1, label %while.cond.preheader, label %while.exit
+; CHECK: while.cond.lr:                                    ; preds = %while.cond.preheader
+; CHECK: %conv7.lr = ashr exact i64 undef, 32
+; CHECK: br i1 %cmp, label %if.else106.lr.ph, label %while.exit.loopexit
+; CHECK: if.else106.lr.ph:                                 ; preds = %while.cond.lr
+; CHECK: if.else106:                                       ; preds = %if.else106.lr.ph, %while.cond
+; CHECK: switch i32 %val, label %if.else106.while.exit.loopexit_crit_edge [
+; CHECK: if.else106.while.exit.loopexit_crit_edge:         ; preds = %if.else106
+; CHECK: while.cond:                                       ; preds = %if.then467
+; CHECK: %conv7 = ashr exact i64 undef, 32
+; CHECK: br i1 %cmp, label %if.else106, label %while.cond.while.exit.loopexit_crit_edge
+; CHECK: while.cond.while.exit.loopexit_crit_edge:         ; preds = %while.cond
+; CHECK: if.then130:                                       ; preds = %if.else106
+; CHECK: if.then467:                                       ; preds = %if.then130, %if.else106
+; CHECK: while.exit.loopexit:                              ; preds = %if.else106.while.exit.loopexit_crit_edge, %while.cond.while.exit.loopexit_crit_edge, %while.cond.lr
+; CHECK: while.exit:                                       ; preds = %while.exit.loopexit, %entry
+
+define i32 @foo3(i1 %cmp, i1 %cmp1, i32 %val) {
+entry:
+  br i1 %cmp1, label %while.cond, label %while.exit
+
+while.cond:
+  %conv7 = ashr exact i64 undef, 32
+  br i1 %cmp, label %if.else106, label %while.exit
+
+if.else106:
+  switch i32 %val, label %while.exit [
+    i32 12, label %if.then130
+    i32 30, label %if.then467
+  ]
+
+if.then130:
+  br label %if.then467
+
+if.then467:
+  br label %while.cond
+
+while.exit:
+  ret i32 0
+}
+
+; Check that the loop with pre-header and loop-body having a switch block is rotated.
+; CHECK-LABEL: @bar5
+; CHECK: bb19.preheader:                                   ; preds = %bb10, %bb10
+; CHECK: bb19.lr:                                          ; preds = %bb19.preheader
+; CHECK: bb12.lr.ph:                                       ; preds = %bb19.lr
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare i32 @foo5(i8*, i32)
+
+define i32 @bar5(i8* %arg, i32 %arg1, i8* %arg2, i32 %arg3) {
+bb10:
+  switch i32 %arg3, label %bb11 [
+    i32 46, label %bb19
+    i32 32, label %bb19
+  ]
+
+bb11:                                             ; preds = %bb10
+  ret i32 -1
+
+bb12:                                             ; preds = %bb19
+  switch i8 undef, label %bb13 [
+    i8 32, label %bb21
+    i8 46, label %bb21
+  ]
+
+bb13:                                             ; preds = %bb12
+  br label %bb15
+
+bb15:                                             ; preds = %bb13
+  br i1 undef, label %bb17, label %bb16
+
+bb16:                                             ; preds = %bb15
+  %tmp = call i32 @foo5(i8* %tmp20, i32 10)
+  ret i32 0
+
+bb17:                                             ; preds = %bb15
+  %tmp18 = phi i8* [ %tmp20, %bb15 ]
+  br label %bb19
+
+bb19:                                             ; preds = %bb17, %bb10, %bb10
+  %tmp20 = phi i8* [ %tmp18, %bb17 ], [ null, %bb10 ], [ null, %bb10 ]
+  br i1 undef, label %bb21, label %bb12
+
+bb21:                                             ; preds = %bb19, %bb12, %bb12
+  ret i32 0
+}
+
+declare i8* @foo7()
+
+; Check that the loop with multiple exits and a if-block branching within the loop is rotated.
+; CHECK-LABEL: @bar7
+; CHECK: entry:
+; CHECK: for.cond.preheader:                               ; preds = %entry
+; CHECK: br label %for.cond.lr
+; CHECK: for.cond.lr:                                      ; preds = %for.cond.preheader
+; CHECK: for.body.lr:                                      ; preds = %for.cond.lr
+; CHECK: if.end20.lr.ph:                                   ; preds = %for.body.lr
+; CHECK: if.end20:                                         ; preds = %if.end20.lr.ph, %for.body
+; CHECK: %phi.nh = phi i8* [ %call15, %for.body ], [ %call15.lr, %if.end20.lr.ph ]
+; CHECK: for.cond:                                         ; preds = %for.inc
+; CHECK: br i1 false, label %for.body, label %for.cond.return.loopexit_crit_edge
+; CHECK: for.cond.return.loopexit_crit_edge:               ; preds = %for.cond
+; CHECK: for.body:                                         ; preds = %for.cond
+; CHECK: br i1 false, label %if.end20, label %for.body.return.loopexit_crit_edge
+; CHECK: for.body.return.loopexit_crit_edge:               ; preds = %for.body
+; CHECK: if.then23:                                        ; preds = %crl, %if.then.i
+; CHECK: %call15.lcssa = phi i8* [ %phi.nh, %crl ], [ %phi.nh, %if.then.i ]
+
+define void @bar7(i1 %cmp1) {
+entry:
+  br i1 %cmp1, label %return, label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  br i1 undef, label %for.body, label %return
+
+for.body:                                         ; preds = %for.cond
+  %call15 = call i8* @foo7()
+  br i1 undef, label %if.end20, label %return
+
+if.end20:                                         ; preds = %for.body
+  %issuer.i = getelementptr inbounds i8, i8* %call15, i64 24
+  br i1 undef, label %if.then.i, label %for.cond.i
+
+if.then.i:                                        ; preds = %if.end20
+  br i1 undef, label %if.then23, label %crl
+
+for.cond.i:                                       ; preds = %if.end20
+  br i1 undef, label %crl, label %for.inc
+
+crl:                                              ; preds = %if.then.i, %for.cond.i
+  br i1 undef, label %if.then23, label %for.inc
+
+if.then23:                                        ; preds = %crl, %if.then.i
+  %reason = getelementptr inbounds i8, i8* %call15, i64 32
+  br label %return
+
+for.inc:                                          ; preds = %crl, %for.cond.i
+  br label %for.cond
+
+return:                                           ; preds = %if.then23, %for.body, %for.cond, %if.end
+  ret void
+}
+
+; Check that the loop-latch with indirect branch is not rotated.
+; CHECK-LABEL: @foo8
+; CHECK-NOT: {{.*}}.lr
+; CHECK-NOT: {{.*}}.lr.ph
+
+@f.x = internal global [3 x i8*] [i8* blockaddress(@foo8, %F), i8* blockaddress(@foo8, %G), i8* blockaddress(@foo8, %H)], align 16
+
+; Function Attrs: nounwind uwtable
+define i32 @foo8(i32 %i, i32 %j, i32 %k) {
+entry:
+  %retval = alloca i32, align 4
+  %i.addr = alloca i32, align 4
+  %j.addr = alloca i32, align 4
+  %k.addr = alloca i32, align 4
+  store i32 %i, i32* %i.addr, align 4
+  store i32 %j, i32* %j.addr, align 4
+  store i32 %k, i32* %k.addr, align 4
+  br label %F
+
+F:                                                ; preds = %entry, %indirectgoto
+  %0 = load i32, i32* %i.addr, align 4
+  %1 = load i32, i32* %j.addr, align 4
+  %cmp = icmp sgt i32 %0, %1
+  br i1 %cmp, label %if.then, label %if.else
+
+if.then:                                          ; preds = %F
+  br label %G
+
+if.else:                                          ; preds = %F
+  %2 = load i32, i32* %i.addr, align 4
+  %3 = load i32, i32* %k.addr, align 4
+  %cmp1 = icmp sgt i32 %2, %3
+  br i1 %cmp1, label %if.then2, label %if.end
+
+if.then2:                                         ; preds = %if.else
+  br label %H
+
+if.end:                                           ; preds = %if.else
+  br label %if.end3
+
+if.end3:                                          ; preds = %if.end
+  %call = call i32 (...) @z()
+  %idxprom = sext i32 %call to i64
+  %arrayidx = getelementptr inbounds [3 x i8*], [3 x i8*]* @f.x, i64 0, i64 %idxprom
+  %4 = load i8*, i8** %arrayidx, align 8
+  br label %indirectgoto
+
+G:                                                ; preds = %if.then, %indirectgoto
+  %call4 = call i32 (...) @g()
+  store i32 %call4, i32* %retval, align 4
+  br label %return
+
+H:                                                ; preds = %if.then2, %indirectgoto
+  %call5 = call i32 (...) @h()
+  store i32 %call5, i32* %retval, align 4
+  br label %return
+
+return:                                           ; preds = %H, %G
+  %5 = load i32, i32* %retval, align 4
+  ret i32 %5
+
+indirectgoto:                                     ; preds = %if.end3
+  %indirect.goto.dest = phi i8* [ %4, %if.end3 ]
+  indirectbr i8* %indirect.goto.dest, [label %F, label %G, label %H]
+}
+
+declare i32 @g(...)
+declare i32 @h(...)
+declare i32 @z(...)
Index: llvm/test/Transforms/LoopRotate/multiple-exits.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/multiple-exits.ll
+++ llvm/test/Transforms/LoopRotate/multiple-exits.ll
@@ -3,7 +3,7 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
-; PR7447
+; PR7447: there should be one loop rotated, for.cond is rotated.
 define i32 @test1([100 x i32]* nocapture %a) nounwind readonly {
 entry:
   br label %for.cond
@@ -32,16 +32,13 @@
   %retval.0 = phi i32 [ 1000, %land.rhs ], [ %sum.0, %for.cond ]
   ret i32 %retval.0
 
-; CHECK-LABEL: @test1(
-; CHECK: for.cond1.preheader:
-; CHECK: %sum.04 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.loopexit ]
-; CHECK: br label %for.cond1
-
-; CHECK: for.cond1:
-; CHECK: %sum.1 = phi i32 [ %add, %land.rhs ], [ %sum.04, %for.cond1.preheader ]
-; CHECK: %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
-; CHECK: %cmp2 = icmp ult i32 %i.1, 100
-; CHECK: br i1 %cmp2, label %land.rhs, label %for.cond.loopexit
+; CHECK-LABEL: @test1
+; Check that the outer loop is rotated.
+; CHECK: for.cond.lr:
+; CHECK: for.cond1.preheader.lr.ph:
+; CHECK: for.cond1:                                        ; preds = %for.cond1.preheader, %land.rhs
+; CHECK:  %sum.1 = phi i32 [ %add, %land.rhs ], [ %phi.nh, %for.cond1.preheader ]
+; CHECK:  %i.1 = phi i32 [ %inc, %land.rhs ], [ 0, %for.cond1.preheader ]
 }
 
 define void @test2(i32 %x) nounwind {
@@ -73,11 +70,16 @@
 return:                                           ; preds = %return.loopexit, %a
   ret void
 
+; Both for.cond and for.body are cloned outside of the loop and if.end is the new loop header.
 ; CHECK-LABEL: @test2(
 ; CHECK: if.end:
-; CHECK: %inc = add i32 %i.02, 1
-; CHECK: %cmp = icmp eq i32 %inc, %x
-; CHECK: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+; CHECK-NEXT: phi
+; CHECK: %inc = add i32 %phi.nh, 1
+; CHECK: for.cond:
+; CHECK-NEXT: phi
+; CHECK-NEXT: %cmp = icmp eq i32 %i.0, %x
+; CHECK-NEXT: br i1 %cmp, label %for.cond.return.loopexit_crit_edge, label %for.body
+; CHECK: for.cond.return.loopexit_crit_edge:
 }
 
 declare i32 @foo(i32)
Index: llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
+++ llvm/test/Transforms/LoopRotate/nosimplifylatch.ll
@@ -3,7 +3,9 @@
 target triple = "arm64-apple-ios8.0.0"
 
 ;CHECK: for.inc:
-;CHECK-NEXT: %incdec.ptr.i = getelementptr 
+;CHECK-NEXT: phi
+;CHECK-NEXT: %incdec.ptr.i = getelementptr
+;CHECK-NEXT: br
 
 ; Function Attrs: alwaysinline inlinehint nounwind readonly ssp
 define linkonce_odr hidden i64 @_ZNSt3__14findINS_11__wrap_iterIPiEEiEET_S4_S4_RKT0_(i64 %__first.coerce, i64 %__last.coerce, i32* nocapture readonly dereferenceable(4) %__value_) {
Index: llvm/test/Transforms/LoopRotate/phi-duplicate.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/phi-duplicate.ll
+++ llvm/test/Transforms/LoopRotate/phi-duplicate.ll
@@ -31,10 +31,13 @@
 ; Should only end up with one phi.
 ; CHECK-LABEL:      define void @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT:   br label %for.body
+; CHECK-NEXT:   br label %for.cond.lr
 ; CHECK:      for.body:
-; CHECK-NEXT:   %j.01 = phi i64
-; CHECK-NOT:  br
-; CHECK:   br i1 %cmp, label %for.body, label %for.end
-; CHECK:      for.end:
-; CHECK-NEXT:        ret void
+; CHECK-NEXT:  %phi.nh = phi i64
+; CHECK-NOT: = phi
+; CHECK:  %inc = add nsw i64 %phi.nh, 1
+; CHECK:   br label %for.cond
+; CHECK: for.cond:
+; CHECK:   br i1 %cmp
+; CHECK: for.end:
+; CHECK-NEXT:  ret void
Index: llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
+++ llvm/test/Transforms/LoopRotate/preserve-loop-simplify.ll
@@ -4,6 +4,9 @@
 ; structures. We manually validate the CFG with FileCheck because currently we
 ; can't cause a failure when LoopSimplify fails to be preserved.
 
+; Check that inner.header and inner.body both are cloned outside the loop
+; such that inner.latch becomes the new loop header.
+
 define void @PR18643() {
 ; CHECK-LABEL: @PR18643(
 entry:
@@ -16,10 +19,16 @@
 ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER:[^,]*]], label %outer.body
 
 ; CHECK: [[INNER_PREROTATE_PREHEADER]]:
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_PREROTATE_PREHEADER_SPLIT_RETURN:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]]
+; CHECK: br label %inner.header.lr
+
+; CHECK: inner.header.lr:
+; CHECK: br i1 true, label %return, label %inner.body.lr
+
+; CHECK: inner.body.lr:
+; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %[[INNER_ROTATED_PREHEADER:[^,]*]]
 
 ; CHECK: [[INNER_ROTATED_PREHEADER]]:
-; CHECK-NEXT: br label %inner.body
+; CHECK-NEXT: br label %inner.latch
 
 inner.header:
 ; Now the latch!
@@ -27,11 +36,17 @@
   br i1 undef, label %return, label %inner.body
 ; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_RETURN:[^,]*]], label %inner.body
 
+; CHECK: [[INNER_SPLIT_RETURN]]:
+; CHECK: br label %return
+
 inner.body:
 ; Now the header!
 ; CHECK: inner.body:
   br i1 undef, label %outer.latch, label %inner.latch
-; CHECK-NEXT: br i1 {{[^,]*}}, label %[[INNER_SPLIT_OUTER_LATCH:[^,]*]], label %inner.header
+; CHECK-NEXT: br i1 {{[^,]*}}, label %[[OUTER_LATCH_LOOPEXIT:[^,]*]], label %inner.latch
+
+; CHECK: [[OUTER_LATCH_LOOPEXIT]]:
+; CHECK-NEXT: br label %outer.latch.loopexit
 
 inner.latch:
 ; Dead!
@@ -42,23 +57,12 @@
   br label %outer.latch
 ; CHECK-NEXT: br label %outer.latch
 
-; L2 -> L1 exit edge needs a simplified exit block.
-; CHECK: [[INNER_SPLIT_OUTER_LATCH]]:
-; CHECK-NEXT: br label %outer.latch
 
 outer.latch:
 ; CHECK: outer.latch:
   br label %outer.header
 ; CHECK-NEXT: br label %outer.header
 
-; L1 -> L0 exit edge need sa simplified exit block.
-; CHECK: [[INNER_PREROTATE_PREHEADER_SPLIT_RETURN]]:
-; CHECK-NEXT: br label %return
-
-; L2 -> L0 exit edge needs a simplified exit block.
-; CHECK: [[INNER_SPLIT_RETURN]]:
-; CHECK-NEXT: br label %return
-
 return:
 ; CHECK: return:
   unreachable
Index: llvm/test/Transforms/LoopRotate/simplifylatch.ll
===================================================================
--- llvm/test/Transforms/LoopRotate/simplifylatch.ll
+++ llvm/test/Transforms/LoopRotate/simplifylatch.ll
@@ -3,8 +3,13 @@
 
 @mode_table = global [4 x i32] zeroinitializer		; <[4 x i32]*> [#uses=1]
 
-; CHECK-LABEL: @f(
-; CHECK-NOT: bb:
+; Check that loop with multiple exits is rotated.
+; CHECK-LABEL: @f
+; CHECK: bb.lr:
+; CHECK: bb2.lr:
+; CHECK: bb4.lr.ph:
+; CHECK: bb:
+
 define i8 @f() {
 entry:
 	tail call i32 @fegetround( )		; <i32>:0 [#uses=1]
@@ -35,42 +40,5 @@
 }
 
 declare i32 @fegetround()
-
 declare void @raise_exception() noreturn
 
-;CHECK: for.body.lr.ph:
-;CHECK-NEXT:  %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0
-;CHECK-NEXT:  %0 = load i8, i8* %arrayidx1, align 1
-;CHECK-NEXT:  %conv2 = sext i8 %0 to i32
-;CHECK-NEXT:  br label %for.body
-
-define i32 @foo(i8* %CurPtr, i32 %a) #0 {
-entry:
-  br label %for.cond
-
-for.cond:					  ; preds = %for.inc, %entry
-  %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.inc ]
-  %cmp = icmp ne i32 %i.0, %a
-  br i1 %cmp, label %for.body, label %return
-
-for.body:					  ; preds = %for.cond
-  %idxprom = zext i32 %i.0 to i64
-  %arrayidx = getelementptr inbounds i8, i8* %CurPtr, i64 %idxprom
-  %0 = load i8, i8* %arrayidx, align 1
-  %conv = sext i8 %0 to i32
-  %arrayidx1 = getelementptr inbounds i8, i8* %CurPtr, i64 0
-  %1 = load i8, i8* %arrayidx1, align 1
-  %conv2 = sext i8 %1 to i32
-  %cmp3 = icmp ne i32 %conv, %conv2
-  br i1 %cmp3, label %return, label %for.inc
-
-for.inc:					  ; preds = %for.body
-  %inc = add i32 %i.0, 1
-  br label %for.cond
-
-return:						  ; preds = %for.cond, %for.body
-  %retval.0 = phi i32 [ 0, %for.body ], [ 1, %for.cond ]
-  ret i32 %retval.0
-}
-
-attributes #0 = { nounwind uwtable }
Index: llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopRotate/vect.omp.persistence.ll
@@ -0,0 +1,98 @@
+; RUN: opt < %s -loop-rotate -S | FileCheck -check-prefix=CHECK1 %s
+; REQUIRES: asserts
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+
+; The loop rotate should not remove the metadata.
+; See http://reviews.llvm.org/D3348 for details.
+
+;
+; Test #1
+;
+; Ensure that "llvm.loop.vectorize.enable" metadata was not lost after loop-rotate.
+; In past LoopRotate was clearing that metadata.
+;
+; The source C code is:
+; void rotated(float *a, int size)
+; {
+;   int t = 0;
+;   #pragma omp simd
+;   for (int i = 0; i < size; ++i) {
+;     a[i] = a[i-5] * a[i+2];
+;     ++t;
+;   }
+;}
+
+; CHECK1: @rotated1
+; CHECK1: for.header.lr:
+; CHECK1: for.body.lr.ph:
+; CHECK1: for.header:
+; CHECK1: br i1 %cmp2, label %for.header.for.end.loopexit_crit_edge, label %for.body, !llvm.loop !0
+; CHECK1: for.header.for.end.loopexit_crit_edge:
+
+define void @rotated1(float* nocapture %a, i64 %size) {
+entry:
+  %cmp1 = icmp sgt i64 %size, 0
+  br i1 %cmp1, label %for.header, label %for.end
+
+for.header:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+  %cmp2 = icmp sgt i64 %indvars.iv, %size
+  br i1 %cmp2, label %for.end, label %for.body
+
+for.body:
+
+  %0 = add nsw i64 %indvars.iv, -5
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %0
+  %1 = load float, float* %arrayidx, align 4, !llvm.mem.parallel_loop_access !1
+  %2 = add nsw i64 %indvars.iv, 2
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %2
+  %3 = load float, float* %arrayidx2, align 4, !llvm.mem.parallel_loop_access !1
+  %mul = fmul float %1, %3
+  %arrayidx4 = getelementptr inbounds float, float* %a, i64 %indvars.iv
+  store float %mul, float* %arrayidx4, align 4, !llvm.mem.parallel_loop_access !1
+
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  br label %for.header, !llvm.loop !1
+
+for.end:
+  ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+; CHECK1: @rotated2
+; CHECK1: loop_cond.lr:
+; CHECK1: loop_inc.lr.ph:
+; CHECK1: loop_inc:
+; CHECK1: loop_cond:
+; CHECK1: br i1 %cmp, label %loop_cond.return_crit_edge, label %loop_inc, !llvm.loop !2
+; CHECK1: loop_cond.return_crit_edge:
+
+;
+; Test #2
+;
+; Ensure that "llvm.loop.vectorize.enable" metadata was not lost due to loop rotation
+; (see http://reviews.llvm.org/D3348#comment-4).
+;
+define i32 @rotated2(i32 %a) {
+entry:
+  br label %loop_cond
+loop_cond:
+  %indx = phi i32 [ 1, %entry ], [ %inc, %loop_inc ]
+  %cmp = icmp ne i32 %indx, %a
+  br i1 %cmp, label %return, label %loop_inc
+loop_inc:
+  %inc = add i32 %indx, 1
+  br label %loop_cond, !llvm.loop !3
+return:
+  ret i32 0
+}
+
+!3 = !{!3, !4}
+!4 = !{!"llvm.loop.vectorize.enable", i1 true}
+
+; CHECK1: !0 = distinct !{!0, !1}
+; CHECK1: !1 = !{!"llvm.loop.vectorize.enable", i1 true}
+; CHECK1: !2 = distinct !{!2, !1}
Index: llvm/test/Transforms/LoopSimplify/ashr-crash.ll
===================================================================
--- llvm/test/Transforms/LoopSimplify/ashr-crash.ll
+++ llvm/test/Transforms/LoopSimplify/ashr-crash.ll
@@ -29,9 +29,9 @@
 ; CHECK-LABEL: entry:
 ; CHECK-LABEL: for.cond1.preheader:
 ; CHECK-LABEL: for.body3:
-; CHECK: %cmp4.le.le
-; CHECK: %conv.le.le = zext i1 %cmp4.le.le to i32
-; CHECK: %xor.le.le = xor i32 %conv6.le.le, 1
+; CHECK: %cmp4
+; CHECK: %conv = zext i1 %cmp4 to i32
+; CHECK: %xor = xor i32 %conv6, 1
 define void @foo() {
 entry:
   br label %for.cond
Index: llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
===================================================================
--- llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
+++ llvm/test/Transforms/LoopVectorize/vect.omp.persistence.ll
@@ -3,8 +3,6 @@
 
 ; Loop from "rotated"
 ; CHECK: LV: Loop hints: force=enabled
-; Loop from "nonrotated"
-; CHECK: LV: Loop hints: force=enabled
 ; No more loops in the module
 ; CHECK-NOT: LV: Loop hints: force=
 ; In total only 1 loop should be rotated.
@@ -63,25 +61,3 @@
 !1 = !{!1, !2}
 !2 = !{!"llvm.loop.vectorize.enable", i1 true}
 
-;
-; Test #2
-;
-; Ensure that "llvm.loop.vectorize.enable" metadata was not lost even
-; if loop was not rotated (see http://reviews.llvm.org/D3348#comment-4).
-;
-define i32 @nonrotated(i32 %a) {
-entry:
-  br label %loop_cond
-loop_cond:
-  %indx = phi i32 [ 1, %entry ], [ %inc, %loop_inc ]
-  %cmp = icmp ne i32 %indx, %a
-  br i1 %cmp, label %return, label %loop_inc
-loop_inc:
-  %inc = add i32 %indx, 1
-  br label %loop_cond, !llvm.loop !3
-return:
-  ret i32 0
-}
-
-!3 = !{!3, !4}
-!4 = !{!"llvm.loop.vectorize.enable", i1 true}