Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -155,6 +155,7 @@
 void initializeJumpThreadingPass(PassRegistry&);
 void initializeLCSSAPass(PassRegistry&);
 void initializeLICMPass(PassRegistry&);
+void initializeLoopVersioningPass(PassRegistry&);
 void initializeLazyValueInfoPass(PassRegistry&);
 void initializeLibCallAliasAnalysisPass(PassRegistry&);
 void initializeLintPass(PassRegistry&);
Index: include/llvm/LinkAllPasses.h
===================================================================
--- include/llvm/LinkAllPasses.h
+++ include/llvm/LinkAllPasses.h
@@ -95,6 +95,7 @@
       (void) llvm::createJumpInstrTablesPass();
       (void) llvm::createLCSSAPass();
       (void) llvm::createLICMPass();
+      (void) llvm::createLoopVersioningPass();
       (void) llvm::createLazyValueInfoPass();
       (void) llvm::createLoopExtractorPass();
       (void) llvm::createLoopSimplifyPass();
Index: include/llvm/Transforms/Scalar.h
===================================================================
--- include/llvm/Transforms/Scalar.h
+++ include/llvm/Transforms/Scalar.h
@@ -140,6 +140,13 @@
 
 //===----------------------------------------------------------------------===//
 //
+// LoopVersioning - This pass is a loop multi versioning pass.
+//
+Pass *createLoopVersioningPass();
+extern char &LoopVersioningID;
+
+//===----------------------------------------------------------------------===//
+//
 // LoopStrengthReduce - This pass is strength reduces GEP instructions that use
 // a loop's canonical induction variable as one of their indices.
 //
Index: lib/Transforms/IPO/PassManagerBuilder.cpp
===================================================================
--- lib/Transforms/IPO/PassManagerBuilder.cpp
+++ lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -55,6 +55,10 @@
   cl::init(true), cl::Hidden,
   cl::desc("Enable the new, experimental SROA pass"));
 
+static cl::opt<bool> UseLoopVersioning("loop-versioning",
+  cl::init(false), cl::Hidden,
+  cl::desc("Enable the new, experimental Loop Versioning pass"));
+
 static cl::opt<bool>
 RunLoopRerolling("reroll-loops", cl::Hidden,
                  cl::desc("Run the loop rerolling pass"));
@@ -194,9 +198,15 @@
 
     MPM.add(createIPSCCPPass());              // IP SCCP
     MPM.add(createGlobalOptimizerPass());     // Optimize out global vars
-
     MPM.add(createDeadArgEliminationPass());  // Dead argument elimination
-
+    // Do loop versioning if right option provided, default disabled.
+    if (UseLoopVersioning)
+    {
+      MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
+      MPM.add(createLICMPass());                  // Hoist loop invariants
+      MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
+      MPM.add(createLoopVersioningPass());        // Do Loop Versioning
+    }
     MPM.add(createInstructionCombiningPass());// Clean up after IPCP & DAE
     addExtensionsToPM(EP_Peephole, MPM);
     MPM.add(createCFGSimplificationPass());   // Clean up after IPCP & DAE
Index: lib/Transforms/Scalar/CMakeLists.txt
===================================================================
--- lib/Transforms/Scalar/CMakeLists.txt
+++ lib/Transforms/Scalar/CMakeLists.txt
@@ -14,6 +14,7 @@
   IndVarSimplify.cpp
   JumpThreading.cpp
   LICM.cpp
+  LoopVersioning.cpp
   LoadCombine.cpp
   LoopDeletion.cpp
   LoopIdiomRecognize.cpp
Index: lib/Transforms/Scalar/LoopVersioning.cpp
===================================================================
--- lib/Transforms/Scalar/LoopVersioning.cpp
+++ lib/Transforms/Scalar/LoopVersioning.cpp
@@ -0,0 +1,855 @@
+//===-- LoopVersioning.cpp - Creates new version of a loop ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/MDBuilder.h"
+
+
+using namespace llvm;
+
+#define DEBUG_TYPE "LoopVersioning"
+typedef SmallPtrSet<Value*, 16> ValueSet;
+
+// Maximum memory/pointer limit for memory check.
+static const unsigned RuntimeMemoryCheckThreshold = 5;
+// Maximum loop nest threshold
+static const unsigned LoopDepthThreshold = 2;
+
+/// \brief Find the operand of the GEP that should be checked for consecutive
+/// stores. This ignores trailing indices that have no effect on the final
+/// pointer.
+static unsigned getGEPInductionOperand(const DataLayout *DL,
+                                       const GetElementPtrInst *Gep) {
+  unsigned LastOperand = Gep->getNumOperands() - 1;
+  unsigned GEPAllocSize = DL->getTypeAllocSize(
+      cast<PointerType>(Gep->getType()->getScalarType())->getElementType());
+
+  // Walk backwards and try to peel off zeros.
+  while (LastOperand > 1 && 
+         match(Gep->getOperand(LastOperand), llvm::PatternMatch::m_Zero())) {
+    // Find the type we're currently indexing into.
+    gep_type_iterator GEPTI = gep_type_begin(Gep);
+    std::advance(GEPTI, LastOperand - 1);
+
+    // If it's a type with the same allocation size as the result of the GEP we
+    // can peel off the zero index.
+    if (DL->getTypeAllocSize(*GEPTI) != GEPAllocSize)
+      break;
+    --LastOperand;
+  }
+
+  return LastOperand;
+}
+
+///\brief Remove GEPs whose indices but the last one are loop invariant and
+/// return the induction operand of the gep pointer.
+static Value *stripGetElementPtr(Value *Ptr, ScalarEvolution *SE,
+                                 const DataLayout *DL, Loop *Lp) {
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!GEP)
+    return Ptr;
+
+  unsigned InductionOperand = getGEPInductionOperand(DL, GEP);
+
+  // Check that all of the gep indices are uniform except for our induction
+  // operand.
+  for (unsigned i = 0, e = GEP->getNumOperands(); i != e; ++i)
+    if (i != InductionOperand &&
+        !SE->isLoopInvariant(SE->getSCEV(GEP->getOperand(i)), Lp))
+      return Ptr;
+  return GEP->getOperand(InductionOperand);
+}
+
+static Instruction * getFirstInst(Instruction *FirstInst, Value *V, 
+                                  Instruction *Loc) {
+  if (FirstInst)
+    return FirstInst;
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() == Loc->getParent() ? I : nullptr;
+  return nullptr;
+}
+  
+/// cloneLoop - Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+                       LoopInfo *LI, LPPassManager *LPM) {
+  Loop *New = new Loop();
+  LPM->insertLoop(New, PL);
+
+  // Add all of the blocks in L to the new loop.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I)
+    if (LI->getLoopFor(*I) == L)
+      New->addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+  // Add all of the subloops to the new loop.
+  for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I)
+    cloneLoop(*I, New, VM, LI, LPM);
+
+  return New;
+}
+
+struct RuntimeMemoryCheck {
+  private:
+    Loop *CurLoop;
+    ScalarEvolution *SE;
+    const DataLayout *DL;
+    ValueSet *PointerSet;
+    SmallVector<TrackingVH<Value> , 2> Starts;
+    SmallVector<TrackingVH<Value> , 2> Ends;
+  public:
+    std::pair<Instruction *, Instruction *> createRuntimeCheck(Instruction *Loc);
+    RuntimeMemoryCheck(Loop *CurLoop, ScalarEvolution *SE, const DataLayout *, 
+                       ValueSet *IPointerSet);
+  };
+
+RuntimeMemoryCheck::RuntimeMemoryCheck(Loop *ICurLoop, ScalarEvolution *ISE, 
+                                       const DataLayout *IDL, 
+                                       ValueSet *IPointerSet) 
+                    :CurLoop(ICurLoop), SE(ISE), 
+                     DL(IDL), PointerSet(IPointerSet) {
+    Starts.clear();
+    Ends.clear();
+}
+
+std::pair<Instruction *, Instruction *>
+RuntimeMemoryCheck::createRuntimeCheck(Instruction *Loc)
+{
+  LLVMContext &Ctx = Loc->getContext();
+  SCEVExpander Exp(*SE, "LVer.induction");
+  Instruction *FirstInst = nullptr;
+  for (ValueSet::const_iterator I = PointerSet->begin(), E = PointerSet->end(); 
+       I != E; ++I) {
+      Value* Ptr = *I;
+      const SCEV *Sc = SE->getSCEV(Ptr);
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Sc);
+
+      assert(AR != 0 && "Null SCEVAddRecExpr found");
+
+      if(CurLoop->isLoopInvariant(Ptr))
+      {
+         Starts.push_back(Ptr);
+         Ends.push_back(Ptr);
+      }
+      else
+      {
+        const SCEV *Ex = SE->getBackedgeTakenCount(CurLoop);
+        const SCEV *ScEnd = AR->evaluateAtIteration(Ex, *SE);
+        unsigned AS = Ptr->getType()->getPointerAddressSpace();
+        Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+        Value *Start = Exp.expandCodeFor(AR->getStart(), PtrArithTy, Loc);
+        Value *End = Exp.expandCodeFor(ScEnd, PtrArithTy, Loc);
+        Starts.push_back(Start);
+        Ends.push_back(End);
+      }
+  }
+
+  IRBuilder<> ChkBuilder(Loc);
+
+  Value *MemoryRuntimeCheck = nullptr;
+  for (unsigned i = 0; i < PointerSet->size(); ++i) {
+    for (unsigned j = i+1; j < PointerSet->size(); ++j) {
+
+      unsigned AS0 = Starts[i]->getType()->getPointerAddressSpace();
+      unsigned AS1 = Starts[j]->getType()->getPointerAddressSpace();
+
+      assert((AS0 == Ends[j]->getType()->getPointerAddressSpace()) &&
+             (AS1 == Ends[i]->getType()->getPointerAddressSpace()) &&
+             "Trying to bounds check pointers with different address spaces");
+
+      Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+      Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+      Value *Start0 = ChkBuilder.CreateBitCast(Starts[i], PtrArithTy0, "bc");
+      Value *Start1 = ChkBuilder.CreateBitCast(Starts[j], PtrArithTy1, "bc");
+      Value *End0 =   ChkBuilder.CreateBitCast(Ends[i],   PtrArithTy1, "bc");
+      Value *End1 =   ChkBuilder.CreateBitCast(Ends[j],   PtrArithTy0, "bc");
+
+      Value *Cmp0 = ChkBuilder.CreateICmpULE(Start0, End1, "LVer.bound0");
+      FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
+
+      Value *Cmp1 = ChkBuilder.CreateICmpULE(Start1, End0, "LVer.bound1");
+      FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
+
+      Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "LVer.found.conflict");
+      FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+
+      if (MemoryRuntimeCheck) {
+        IsConflict = ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict,
+                                         "LVer.conflict.rdx");
+        FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
+      }
+      MemoryRuntimeCheck = IsConflict;
+    }
+  }
+
+  
+  Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
+                                                 ConstantInt::getTrue(Ctx));
+  ChkBuilder.Insert(Check, "LVer.memcheck.conflict");
+  FirstInst = getFirstInst(FirstInst, Check, Loc);
+  return std::make_pair(FirstInst, Check);
+}
+
+
+namespace {
+  struct LoopVersioning : public LoopPass {
+    static char ID; 
+
+    LoopVersioning() : LoopPass(ID) {
+      initializeLoopVersioningPass(*PassRegistry::getPassRegistry());
+    }
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+    void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.setPreservesCFG(); 
+      AU.addRequired<DominatorTreeWrapperPass>();
+      AU.addRequired<LoopInfoWrapperPass>();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<ScalarEvolution>();
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
+    }
+
+    using llvm::Pass::doFinalization;
+
+    bool doFinalization() override {
+      return false;
+    }
+
+    AliasAnalysis *AA;       // Current AliasAnalysis information
+    ScalarEvolution *SE;
+    LoopInfo      *LI;       // Current LoopInfo
+    DominatorTree *DT;       // Dominator Tree for the current Loop.
+    const DataLayout *DL;    // DataLayout for constant folding.
+    TargetLibraryInfo *TLI;  // TargetLibraryInfo for constant folding.
+
+    bool Changed;            // Set to true when we change anything.
+    BasicBlock *Preheader;   // The preheader block of the current loop...
+    Loop *CurLoop;           // The current loop we are working on...
+    AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    ValueSet *PointerSet;
+    MapVector<Value *, unsigned> * UniquePointer;
+
+    bool hasComputableBounds(ScalarEvolution *, Value *); 
+    bool isUniquePointer(Value *);
+    bool isGuaranteedToExecute(Instruction &Inst);
+    bool isVersioningBeneficial();
+    Loop * versionizeLoop(LPPassManager &);
+    void splitExitEdges(Loop *, const SmallVectorImpl<BasicBlock *> &);
+
+    const char *getPassName() const override {
+      return "Loop Versioning";
+    }
+  };
+}
+
+bool LoopVersioning::hasComputableBounds(ScalarEvolution *SE, Value *Ptr) {
+  const SCEV *PtrScev = SE->getSCEV(Ptr);
+  const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(PtrScev);
+  if (AR)
+    return AR->isAffine();
+  return false;
+}
+
+/// splitExitEdges - Split all of the edges from inside the loop to their exit
+/// blocks.  Update the appropriate Phi nodes as we do so.
+void LoopVersioning::splitExitEdges(Loop *L,
+                               const SmallVectorImpl<BasicBlock *> &ExitBlocks){
+
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBlock = ExitBlocks[i];
+    SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
+                                       pred_end(ExitBlock));
+
+    // Although SplitBlockPredecessors doesn't preserve loop-simplify in
+    // general, if we call it on all predecessors of all exits then it does.
+    if (!ExitBlock->isLandingPad()) {
+      SplitBlockPredecessors(ExitBlock, Preds, ".us-lcssa", AA, DT, 
+                             LI, this->mustPreserveAnalysisID(LCSSAID)); 
+    } else {
+      SmallVector<BasicBlock*, 2> NewBBs;
+      SplitLandingPadPredecessors(ExitBlock, Preds, ".us-lcssa", ".us-lcssa",
+                                  NewBBs, AA, DT, LI);
+    }
+  }
+}
+
+bool LoopVersioning::isUniquePointer(Value * Ptr)
+{
+  // Get GetElementPtrInst, If not available return false.  
+  GetElementPtrInst *I = dyn_cast<GetElementPtrInst>(Ptr);
+  if (!I)
+    return false;
+
+  // Check for input Value pointer:
+  // Query unique pointer map to see pointer already visited.
+  // if its a visited pointer, simply return false.
+  MapVector<Value *, unsigned>::const_iterator It = UniquePointer->find(Ptr);
+  if(It != UniquePointer->end())
+  {
+    ++(*UniquePointer)[Ptr];
+    return false;
+  }
+  // Its a new memory, update map with it.
+  ++(*UniquePointer)[Ptr];
+
+  // Check for GetElementPtrInst operand:
+  // Get pointer operand from instruction
+  // Query unique pointer map to see pointer operand already visited.
+  // if visited, simply return false.
+  Value *PtrOprnd = I->getPointerOperand();
+  It = UniquePointer->find(PtrOprnd);
+  if(It != UniquePointer->end())
+  {
+    ++(*UniquePointer)[PtrOprnd];
+    return false;
+  }
+  // Its a new memory, update map with it.
+  ++(*UniquePointer)[PtrOprnd];
+
+  // Check for GEP induction operand:
+  // Get pointer operand from GEP for induction operand
+  // Query unique pointer map to see pointer operand already visited.
+  // if visited, simply return false.
+  Value *StripPtr = stripGetElementPtr(Ptr, SE, DL, CurLoop);
+  It = UniquePointer->find(StripPtr);
+  if(It != UniquePointer->end())
+  {
+    ++(*UniquePointer)[StripPtr];
+    return false;
+  }
+  // Its a new memory, update map with it.
+  ++(*UniquePointer)[StripPtr];
+  return true;
+}
+
+bool LoopVersioning::isVersioningBeneficial()
+{
+  // Loop must have a preheader, if not return false.
+  if (!CurLoop->getLoopPreheader()) {
+    DEBUG(dbgs() << "loop preheader is missing");
+    return false;
+  }
+  // Loop should be innermost loops, if not return false.
+  if(CurLoop->getSubLoops().size()){
+    DEBUG(dbgs() << "loop is not innermost");
+    return false;
+  }
+  // Loop should have a single backedge, if not return false. 
+  if (CurLoop->getNumBackEdges() != 1) {
+    DEBUG(dbgs() << "loop has multiple backedges");
+    return false;
+  }
+  // Loop must have a single exiting block, if not return false.
+  if (!CurLoop->getExitingBlock()) {
+    DEBUG(dbgs() << "loop has multiple exiting block");
+    return false;
+  }
+  // We only handle bottom-tested loops, i.e. loop in which the condition is
+  // checked at the end of each iteration. With that we can assume that all
+  // instructions in the loop are executed the same number of times.
+  if (CurLoop->getExitingBlock() != CurLoop->getLoopLatch()) {
+    DEBUG(dbgs() << "loop control flow is not understood by LoopVersioning");
+    return false;
+  }
+  // Loop more then LoopDepthThreshold are not allowed 
+  if(CurLoop->getLoopDepth() > LoopDepthThreshold) {
+    DEBUG(dbgs() << "loop depth is more then threshold");
+    return false;
+  }
+  // PAS: Loop should have a dedicated exit block, if not return false.
+  if(!CurLoop->hasDedicatedExits()) {
+    DEBUG(dbgs() << "loop does not has dedicated exit blocks");
+    return false;
+  }
+  // Loop should have a trip count, if not return false.
+  // ScalarEvolution needs to be able to find the exit count.
+  const SCEV *ExitCount = SE->getBackedgeTakenCount(CurLoop);
+  if (ExitCount == SE->getCouldNotCompute()) {
+    DEBUG(dbgs() << "loop does not has trip count");
+    return false;
+  }
+
+  // Counters with initial values.  
+  unsigned NumLoads = 0;
+  unsigned NumStores = 0;
+  bool StoreToInvar = false;
+  bool GuaranteedToExecution = false;
+  // Check parallel loop
+  const bool IsAnnotatedParallel = CurLoop->isAnnotatedParallel();
+  // Instruction check:
+  // Iterate over loop blocks and instructions of each block.
+  // 1) Check all load store in loop body are non atomic & non volatile.
+  // 2) Make sure loop body should not have any call instruction.
+  // 3) Check store instruction execution guarantee.
+  // 4) Loop body should have a invariant store.
+  // 5) Loop body should not have any may throw instruction.
+  for(Loop::block_iterator bb = CurLoop->block_begin(),
+       be = CurLoop->block_end(); bb != be; ++bb) {
+     for(BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end(); 
+         it != e; ++it) {
+         // We dont support call instructions. however, we ignore dbg intrinsic.
+         // If any call instruction found then simply return false.
+         CallInst *CI = dyn_cast<CallInst>(it);
+         if (CI && !isa<DbgInfoIntrinsic>(CI)) {
+           DEBUG(dbgs() << "loop body has a call instruction");
+           return false;
+         }
+         if(it->mayThrow()){
+           DEBUG(dbgs() << "may throw instruction found in loop body");
+           return false;
+         }
+         // If current instruction is load instructions
+         // make sure its a simple load(non atomic & non volatile) 
+         if (it->mayReadFromMemory()) {
+           LoadInst *Ld = dyn_cast<LoadInst>(it);
+           if (!Ld || (!Ld->isSimple() && !IsAnnotatedParallel)) {
+             DEBUG(dbgs() << "Found a non-simple load.\n");
+             return false;
+           }
+           NumLoads++;
+         }
+         // If current instruction is store instruction
+         // make sure its a simple store(non atomic & non volatile) 
+         else if (it->mayWriteToMemory()) {
+           StoreInst *St = dyn_cast<StoreInst>(it);
+           if (!St || (!St->isSimple() && !IsAnnotatedParallel)) {
+             DEBUG(dbgs() << "Found a non-simple store.\n");
+             return false;
+           }
+           // Check store guaranteed execution, here we are interested 
+           // in atleast one guaranteed execution store. 
+           GuaranteedToExecution |=  isGuaranteedToExecute(*St);
+           // Check for store to a loop invariant, again interested in
+           // atleast one loop invariant store.
+           Value* Ptr = St->getPointerOperand();
+           StoreToInvar |= SE->isLoopInvariant(SE->getSCEV(Ptr), CurLoop);
+           NumStores++;
+        }
+     }  // Next instr.
+  } // Next block.
+
+  // Loop should have store instruction, if not return false.
+  if(!NumStores){
+    DEBUG(dbgs() << "Found a read-only loop!");
+    return false;
+  }
+  // PAS: Make sure atleast one store guarnteed to execute.
+  if(!GuaranteedToExecution){
+    DEBUG(dbgs() << "Non guaranteed store execution found");
+    return false;
+  }
+  // Loop should have atleast invariant store instruction.
+  if(!StoreToInvar){
+    DEBUG(dbgs() << "Invariant store not found !!");
+    return false;
+  }
+
+  // Counters with initial values.
+  bool IsMod = 0;
+  bool HasMayAlias = 0;
+  bool TypeSafety = 0;
+  PointerSet->clear();
+  // Memory check:
+  // Iterate over alias set tracker and individual alias sets.
+  // 1) Make sure all pointers has computable bounds, if not return false. 
+  // 2) Make sure atleast one alias tracker should have pointers of same data type.
+  // 3) Make sure atlease one alias tracker pointer should modified.
+  // 4) Collect all unique memory/pointers in PointerSet for memCheck.
+  // 5) Make sure PointerSet should have atleast 2 pointers.
+  // 6) PointerSet should'nt have pointers more then RuntimeMemoryCheckThreshold 
+  for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end(); 
+                                                      I != E; ++I) {
+     AliasSet &AS = *I;
+     // Skip Forward Alias Sets, dont consider ignored alias sets object.
+     if(AS.isForwardingAliasSet())
+        continue;
+     // Return false in case of MustAlias
+     if(AS.isMustAlias())
+        return false;
+     Value *SomePtr = AS.begin()->getValue();
+     bool typeCheck = 1;
+
+     HasMayAlias |= AS.isMayAlias();
+     IsMod |= AS.isMod();
+
+     for (auto A : AS) {
+       Value *Ptr = A.getValue();
+       // alias tracker should have pointers of same data type.
+       typeCheck = (typeCheck && (SomePtr->getType() == Ptr->getType()));
+       // Check for repeatation of pointer.
+       // If the same pointer already visited then continue.
+       if(!isUniquePointer(Ptr))
+         continue;
+       // Check pointer has computable bounds
+       // In case of non computable bounds, return false.
+       if(!hasComputableBounds(SE, Ptr)) {
+         DEBUG(dbgs() << "Non computable bound found !");
+         return false;
+       }
+       // Insert memory to PointerSet
+       PointerSet->insert(Ptr);
+       // Check any memory modified in loop body.
+     }
+     // Atleast one alias tracker should have pointers of same data type.
+     TypeSafety |= typeCheck; 
+  }
+  // PAS: Ensure types should be of same type. 
+  if(!TypeSafety) {
+    DEBUG(dbgs() << "Alias tracker type safety failed!");
+    return false;
+  }
+  // PAS: Ensure loop body should not be ready only. 
+  if(!IsMod) {
+    DEBUG(dbgs() << "No memory modified in loop body");
+    return false;
+  }
+  // PAS: Make sure alias set has may alias case.
+  // If there no alias memory ambiguity, return false.
+  if(!HasMayAlias) {
+    DEBUG(dbgs() << "No ambiguity in memory access.");
+    return false;
+  }
+  // Atleast 2 pointers needed for runtime check.
+  if(PointerSet->size() <= 1) {
+    DEBUG(dbgs() << "Didnt found sufficient pointers in loop body");
+    return false;
+  }
+  // Check memory threshold, should be in limit.
+  if(PointerSet->size() > RuntimeMemoryCheckThreshold) {
+    DEBUG(dbgs() << "Loop body has pointers more then defined threshold");
+    return false;
+  }
+  // Collected pointers should be in the same address space.
+  // If different address space pointers found then return false.
+  for (ValueSet::const_iterator I = PointerSet->begin(), 
+       E = PointerSet->end(); I != E;) {
+     Value *PtrI = *I;
+     ValueSet::const_iterator J = ++I;
+     for (; J != E; ++J) {
+      Value *PtrJ = *J;
+      unsigned ASi = PtrI->getType()->getPointerAddressSpace();
+      unsigned ASj = PtrJ->getType()->getPointerAddressSpace();
+      if (ASi != ASj) {
+        DEBUG(dbgs() << "Different address space pointer memcheck not possible\n");
+        return false;
+      }
+    }
+  }
+  // Loop versioning is feasible, return true.
+  DEBUG(dbgs() << "Loop Versioning found to be beneficial");
+  return true;
+}
+
+bool LoopVersioning::isGuaranteedToExecute(Instruction &Inst) {
+
+  // Otherwise we have to check to make sure that the instruction dominates all
+  // of the exit blocks.  If it doesn't, then there is a path out of the loop
+  // which does not execute this instruction, so we can't hoist it.
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks.  Since this
+  // is a common case, and can save some work, check it now.
+  if (Inst.getParent() == CurLoop->getHeader())
+    return true;
+
+  // Get the exit blocks for the current loop.
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // Verify that the block dominates each of the exit blocks of the loop.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (!DT->dominates(Inst.getParent(), ExitBlocks[i]))
+      return false;
+
+  // As a degenerate case, if the loop is statically infinite then we haven't
+  // proven anything since there are no exit blocks.
+  if (ExitBlocks.empty())
+    return false;
+
+  return true;
+}
+
+// Here we does actual loop versioning.
+// 1) Clone original loop
+// 2) Create a runtime memory check.
+// 3) Add both loops under runtime check results target.
+// 4) Set all blocks properly.
+// It transform loop as shown below 
+//                         +----------------+                               
+//                         |Runtime Memcheck|                               
+//                         +----------------+                               
+//                                 |                                        
+//              +----------+----------------+----------+                    
+//              |                                      |                    
+//    +---------+----------+               +-----------+----------+         
+//    |Orig Loop Preheader |               |Cloned Loop Preheader |         
+//    +--------------------+               +----------------------+         
+//              |                                      |                    
+//    +--------------------+               +----------------------+         
+//    |Orig Loop Body      |               |Cloned Loop Body      |         
+//    +--------------------+               +----------------------+         
+//              |                                      |                    
+//    +--------------------+               +----------------------+         
+//    |Orig Loop|Exit Block|               |Cloned Loop Exit Block|         
+//    +--------------------+               +-----------+----------+         
+//              |                                      |                    
+//              +----------+--------------+-----------+                    
+//                                 |                                        
+//                           +-----+----+                                   
+//                           |Join Block|                                   
+//                           +----------+    
+
+Loop * LoopVersioning::versionizeLoop(LPPassManager &LPM)
+{
+  std::vector<BasicBlock*> LoopBlocks;
+  std::vector<BasicBlock*> NewBlocks;
+  SmallVector<BasicBlock*, 8> ExitBlocks;
+  LoopBlocks.clear();
+  NewBlocks.clear();
+  ExitBlocks.clear();
+
+  BasicBlock *LoopHeader = CurLoop->getHeader();
+  BasicBlock *LoopPreheader = CurLoop->getLoopPreheader();
+  Function *F = LoopHeader->getParent();
+  // First step, split the preheader and exit blocks, and add these blocks to
+  // the LoopBlocks list.
+  BasicBlock *NewPreheader = SplitEdge(LoopPreheader, LoopHeader, DT, LI);
+  LoopBlocks.push_back(NewPreheader);
+  // We want the loop to come after the preheader, but before the exit blocks.
+  LoopBlocks.insert(LoopBlocks.end(), CurLoop->block_begin(), 
+                    CurLoop->block_end());
+
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+  // Split all of the edges from inside the loop to their exit blocks.  Update
+  // the appropriate Phi nodes as we do so.
+  splitExitEdges(CurLoop, ExitBlocks);
+
+  // The exit blocks may have been changed due to edge splitting, recompute.
+  ExitBlocks.clear();
+  CurLoop->getUniqueExitBlocks(ExitBlocks);
+
+  // Add exit blocks to the loop blocks.
+  LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());
+
+  // Next step, clone all of the basic blocks that make up the loop (including
+  // the loop preheader and exit blocks), keeping track of the mapping between
+  // the instructions and blocks.
+  NewBlocks.reserve(LoopBlocks.size());
+  ValueToValueMapTy VMap;
+  for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
+    BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".loopVersion", F);
+    NewBlocks.push_back(NewBB);
+    VMap[LoopBlocks[i]] = NewBB;  // Keep the BB mapping.
+    LPM.cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, CurLoop);
+  }
+  // Splice the newly inserted blocks into the function right before the
+  // original preheader.
+  F->getBasicBlockList().splice(NewPreheader, F->getBasicBlockList(),
+                                NewBlocks[0], F->end());
+
+  // Now we create the new Loop object for the versioned loop.
+  Loop *NewLoop = cloneLoop(CurLoop, CurLoop->getParentLoop(), VMap, LI, &LPM);
+
+  Loop *ParentLoop = CurLoop->getParentLoop();
+  if (ParentLoop) {
+    // Make sure to add the cloned preheader and exit blocks to the parent loop
+    // as well.
+    ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
+  }
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+    // The new exit block should be in the same loop as the old one.
+    if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+      ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
+
+    assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
+           "Exit block should have been split to have one successor!");
+    BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
+
+    // If the successor of the exit block had PHI nodes, add an entry for
+    // NewExit.
+    for (BasicBlock::iterator I = ExitSucc->begin();
+         PHINode *PN = dyn_cast<PHINode>(I); ++I) {
+      Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
+      ValueToValueMapTy::iterator It = VMap.find(V);
+      if (It != VMap.end()) V = It->second;
+      PN->addIncoming(V, NewExit);
+    }
+
+    if (LandingPadInst *LPad = NewExit->getLandingPadInst()) {
+      PHINode *PN = PHINode::Create(LPad->getType(), 0, "",
+                                    ExitSucc->getFirstInsertionPt());
+      for (pred_iterator I = pred_begin(ExitSucc), E = pred_end(ExitSucc);
+           I != E; ++I) {
+        BasicBlock *BB = *I;
+        LandingPadInst *LPI = BB->getLandingPadInst();
+        LPI->replaceAllUsesWith(PN);
+        PN->addIncoming(LPI, BB);
+      }
+    }
+  }
+  // Rewrite the code to refer to itself.
+  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
+    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+      RemapInstruction(I, VMap,RF_NoModuleLevelChanges|RF_IgnoreMissingEntries);
+
+  // Rewrite the original preheader to select between versions of the loop.
+  BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
+  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
+         "Preheader splitting did not work correctly!");
+  // create runtime check.
+  RuntimeMemoryCheck RTCheck (CurLoop, SE, DL, PointerSet);
+  Instruction *MemRuntimeCheck;
+  Instruction *FirstCheckInst;
+  std::tie(FirstCheckInst, MemRuntimeCheck) = 
+                    RTCheck.createRuntimeCheck(LoopPreheader->getTerminator());
+
+  if (MemRuntimeCheck) {
+    BasicBlock * CheckBlock = 
+              LoopPreheader->splitBasicBlock(MemRuntimeCheck, "LVer.memcheck");
+    if (ParentLoop)
+      ParentLoop->addBasicBlockToLoop(CheckBlock, *LI);
+    BranchInst::Create(LoopBlocks[0], NewBlocks[0], MemRuntimeCheck, CheckBlock);
+    LPM.deleteSimpleAnalysisValue(OldBR, CurLoop);
+    OldBR->eraseFromParent();
+  }
+
+  return NewLoop;
+}
+
+bool LoopVersioning::runOnLoop(Loop *L, LPPassManager &LPM) {
+  if (skipOptnoneFunction(L))
+    return false;
+  Changed = false;
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  AA = &getAnalysis<AliasAnalysis>();
+  SE = &getAnalysis<ScalarEvolution>();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+
+  DataLayoutPass *DLP = getAnalysisIfAvailable<DataLayoutPass>();
+  if(!DLP) {
+    DEBUG(dbgs() << "DataLayout missing, loop versioning not possible !\n");
+    return false;
+  }
+  DL = &DLP->getDataLayout();
+  // Set Current Loop
+  CurLoop = L;
+  // Get the preheader block to move instructions into...
+  Preheader = L->getLoopPreheader();
+  // Initial allocation 
+  CurAST = new AliasSetTracker(*AA);
+  PointerSet = new ValueSet();
+  UniquePointer = new MapVector<Value *, unsigned>;
+ 
+  // Loop over the body of this loop, construct AST.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    BasicBlock *BB = *I;
+    if (LI->getLoopFor(BB) == L)        // Ignore blocks in subloops.
+      CurAST->add(*BB);                 // Incorporate the specified basic block
+  }
+  // Check feasiblity of loop versioning. 
+  // If versioning found to be feasible and beneficial then procees 
+  // else simply return, by cleaning up memory.
+  if(isVersioningBeneficial())
+  {
+    // Do loop versioning.
+    // Create memcheck for unique memory accessed inside loop. 
+    // Clone original loop, and set blocks properly.
+    Loop * VerLoop = versionizeLoop(LPM);
+    // Set metadata to both loop versions.
+    Function *F = L->getHeader()->getParent();
+    // Get latch terminator instruction.
+    Instruction *I = VerLoop->getLoopLatch()->getTerminator();
+    // Create alias scope domain.
+    MDBuilder MDB(I->getContext());
+    MDNode *NewDomain = MDB.createAnonymousAliasScopeDomain("LVDomain");
+    DenseMap<const Instruction *, MDNode *> NewScopes;
+    SmallVector<Metadata *, 4> Scopes, NoAliases;
+    std::string Name = "LVAliasScope";
+    // Iterate over each instruction of loop.
+    // set no-alias for all load & store instructions.
+    for(Loop::block_iterator bb = VerLoop->block_begin(),
+        be = VerLoop->block_end(); bb != be; ++bb) {
+      for(BasicBlock::iterator it = (*bb)->begin(), e = (*bb)->end();
+          it != e; ++it) {
+        // Only interested in load & stores
+        if(!it->mayReadFromMemory() && !it->mayWriteToMemory())
+           continue;
+        // Create alias scope.
+        Instruction *I = dyn_cast<Instruction>(it);
+        MDNode *NewScope = MDB.createAnonymousAliasScope(NewDomain, Name);
+        NewScopes.insert(std::make_pair(I, NewScope));
+        NoAliases.push_back(NewScope);
+        Scopes.push_back(NewScope);
+        // Set no-alias for current instruction.
+        I->setMetadata(LLVMContext::MD_noalias,
+                       MDNode::concatenate(I->getMetadata(LLVMContext::MD_noalias),
+                       MDNode::get(I->getContext(), NoAliases)));
+        // set alias-scope for current instruction.
+        I->setMetadata(LLVMContext::MD_alias_scope,
+                       MDNode::concatenate(I->getMetadata(LLVMContext::MD_alias_scope),
+                       MDNode::get(I->getContext(), Scopes)));
+      }
+    }
+    // Recalculate dominator tree. 
+    DT->recalculate(*F);
+    // Delete newly created loop from loop pass manager. 
+    LPM.deleteLoopFromQueue(VerLoop);
+    Changed = true;
+  }
+  // Delete allocated memory.
+  delete CurAST;
+  delete PointerSet;
+  delete UniquePointer;
+  return Changed;
+}
+
+
+char LoopVersioning::ID = 0;
+INITIALIZE_PASS_BEGIN(LoopVersioning, "do-loop-versioning", "Loop Versioning", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_DEPENDENCY(LCSSA)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolution)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_AG_DEPENDENCY(AliasAnalysis)
+INITIALIZE_PASS_END(LoopVersioning, "do-loop-versioning", "Loop Versioning", false, false)
+
+Pass *llvm::createLoopVersioningPass() { return new LoopVersioning(); }
+char &llvm::LoopVersioningID = LoopVersioning::ID;
+
Index: lib/Transforms/Scalar/Scalar.cpp
===================================================================
--- lib/Transforms/Scalar/Scalar.cpp
+++ lib/Transforms/Scalar/Scalar.cpp
@@ -45,6 +45,7 @@
   initializeIndVarSimplifyPass(Registry);
   initializeJumpThreadingPass(Registry);
   initializeLICMPass(Registry);
+  initializeLoopVersioningPass(Registry);
   initializeLoopDeletionPass(Registry);
   initializeLoopAccessAnalysisPass(Registry);
   initializeLoopInstSimplifyPass(Registry);