cl::desc("Force not to create loop versions if the user guarantees that " "the length of each array dimension is a positive value, and " "the multiplication of lengths of all array
SCEVExprConverter(ScalarEvolution &SE) : CurLoop(nullptr), SE(SE) {} const SCEV *convertSCEV(const SCEV *Expr, bool AddRuntimeCheck);
+/// Tries to fold the SCEV with regard to loop guards of CurLoop +const SCEV *SCEVExprConverter::convertSCEV(const SCEV *Expr, bool AddRuntimeCheck) { + switch (Expr->getSCEVType()) { + case scConstant: + case scUnknown: + case scCouldNotCompute: + return Expr; + case scTruncate: { + const SCEVTruncateExpr *Trunc = cast(Expr); + Type *Ty = Trunc->getType(); + const SCEV *NewTrunc = convertSCEV(Trunc->getOperand(), AddRuntimeCheck); + return SE.getTruncateExpr(NewTrunc, Ty); + } + case scZeroExtend: { + const SCEVZeroExtendExpr *Zext = cast(Expr); + Type *Ty = Zext->getType(); + const SCEV *NewZext = convertSCEV(Zext->getOperand(), AddRuntimeCheck); + return SE.getZeroExtendExpr(NewZext, Ty); + } + case scSignExtend: { + // Return original SCEV if expression is not guarded by loop with Zero and + // dont want to add runtime check. Otherwise we fold constant and add appropriate + // runtime check. + const SCEVSignExtendExpr *Sext = cast(Expr); + if (SE.isLoopEntryGuardedByCond(CurLoop, ICmpInst::ICMP_SGE, Sext, SE.getZero(Sext->getType())) == false) { + if (AddRuntimeCheck == false) + return Sext; + if (CheckSltMap[Sext] < 0) + CheckSltMap[Sext] = 0; + } + const SCEV *NewZext = convertSCEV(Sext->getOperand(), AddRuntimeCheck); + return SE.getZeroExtendExpr(NewZext, Sext->getType()); + } + case scAddExpr: { + const SCEVAddExpr *Add = cast(Expr); + const SCEV *NewAdd = convertSCEV(Add->getOperand(0), AddRuntimeCheck); + for (int I = 1, E = Add->getNumOperands(); I != E; ++I) { + NewAdd = SE.getAddExpr(NewAdd, convertSCEV(Add->getOperand(I), AddRuntimeCheck)); + } + return NewAdd; + } + case scMulExpr: { + const SCEVMulExpr *Mul = cast(Expr); + const SCEV *NewMul = convertSCEV(Mul->getOperand(0), AddRuntimeCheck); + for (int I = 1, E = Mul->getNumOperands(); I != E; ++I) { + NewMul = SE.getMulExpr(NewMul, convertSCEV(Mul->getOperand(I), AddRuntimeCheck)); + } + return NewMul; + } + case scUDivExpr: { + const SCEVUDivExpr *UDiv = cast(Expr); + const SCEV *NewLHS = 
llvm_unreachable("Do not expect AddRec here!");
/// Helper function to generate the predicate "X < ConstVal".
/// This function is used to add runtime-check conditions for values that we
/// assume in order to canonicalize the SCEV for comparison.
/// For example: turn sext to zext, eliminate smax.
LLVM_DEBUG(dbgs() << "block is in a subloop, skip\n");
for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) - if (!DT->dominates(BB, ExitBlocks[i])) + if (!DT->dominates(BB, ExitBlocks[i])) { + LLVM_DEBUG(dbgs() << "does not dominate all exit blocks to promote the store, abort.\n"); return false; + } bool MadeChange = false; // Look for store instructions, which may be optimized to memset/memcpy. @@ -858,6 +1067,7 @@ Instruction *Inst = &*I++; // Look for memory instructions, which may be optimized to a larger one. if (MemInst *MI = dyn_cast(Inst)) { + LLVM_DEBUG(dbgs() << "found MemInst: " << *MI << "\n"); WeakTrackingVH InstPtr(&*I); if (!(this->*Processor)(MI, BECount)) continue; @@ -944,7 +1154,7 @@ bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI, const SCEV *BECount) { // We can only handle non-volatile memsets with a constant size. - if (MSI->isVolatile() || !isa(MSI->getLength())) + if (MSI->isVolatile()) return false; // If we're not allowed to hack on memset, we fail. @@ -956,28 +1166,104 @@ // See if the pointer expression is an AddRec like {base,+,1} on the current // loop, which indicates a strided store. If we have something else, it's a // random store we can't handle. + const SCEV *PointerSCEV = SE->getSCEV(Pointer); + if (PointerSCEV) + LLVM_DEBUG(dbgs() << "PointerSCEV: " << *PointerSCEV << "\n"); const SCEVAddRecExpr *Ev = dyn_cast(SE->getSCEV(Pointer)); - if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) + if (!Ev || Ev->getLoop() != CurLoop || !Ev->isAffine()) { + LLVM_DEBUG(dbgs() << "PointerSCEV cannot be converted to SCEVAddRecExpr, abort\n"); return false; - + } + const SCEV *StrideSCEV = Ev->getOperand(1); const SCEV *MemsetSizeSCEV = SE->getSCEV(MSI->getLength()); - if (!MemsetSizeSCEV) + if (!StrideSCEV || !MemsetSizeSCEV) return false; - // Reject memsets that are so large that they overflow an unsigned. 
- uint64_t SizeInBytes = cast(MSI->getLength())->getZExtValue(); - if ((SizeInBytes >> 32) != 0) - return false; + bool NegStride; + const bool IsConstantSize = isa(MSI->getLength()); + if (IsConstantSize) { + // Memset size is constant + // Reject memsets that are so large that they overflow an unsigned. + LLVM_DEBUG(dbgs() << " memset size is constant\n"); + uint64_t SizeInBytes = cast(MSI->getLength())->getZExtValue(); + if ((SizeInBytes >> 32) != 0) + return false; - // Check to see if the stride matches the size of the memset. If so, then we - // know that every byte is touched in the loop. - const SCEVConstant *ConstStride = dyn_cast(Ev->getOperand(1)); - if (!ConstStride) - return false; + // Check to see if the stride matches the size of the memset. If so, then + // we know that every byte is touched in the loop. + const SCEVConstant *ConstStride = dyn_cast(Ev->getOperand(1)); + if (!ConstStride) + return false; - APInt Stride = ConstStride->getAPInt(); - if (SizeInBytes != Stride && SizeInBytes != -Stride) - return false; + APInt Stride = ConstStride->getAPInt(); + if (SizeInBytes != Stride && SizeInBytes != -Stride) + return false; + + NegStride = SizeInBytes == -Stride; + } else { + // Memset size is non-constant + // Check if the stride matches the memset size, by comparing the SCEV + // expressions of the stride and memset size. The two expressions match + // if they are equal. If they match, then we know that every byte is + // touched in the loop. We only handle memset length and stride that + // are invariant for the top level loop. 
// Compare the positive-direction StrideSCEV with MemsetSizeSCEV.
// If processLoopStridedStore successfully made the change, add the required runtime-check information into the list.
LLVM_DEBUG(dbgs() << "Handle negative stride,"
                      << " original Start: " << *Start << "\n");
    Start = getStartForNegStride(Start, BECount, IntIdxTy, StoreSizeSCEV, SE);
    LLVM_DEBUG(dbgs() << "  converted Start: " << *Start << "\n");
LLVM_DEBUG(dbgs() << "abort because TripCount is not TopLoop's invariant "
// Here we check whether the top-level clone has been created yet