diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -157,7 +157,6 @@ namespace {

-typedef SmallSet SCEVSet;
 typedef DenseMap<const SCEV *, int64_t> SCEVConstValPairMap;

 /// A helper class to do the following SCEV expression conversions.
@@ -165,13 +164,14 @@
 /// 2) "SOME_CONSTANT_VALUE smax %val" to "%val"
 class SCEVExprConverter {
 public:
+  Loop *CurLoop;
   ScalarEvolution &SE;
   SCEVConstValPairMap CheckSltMap;

   SCEVExprConverter(ScalarEvolution &SE) : SE(SE) {}

-  const SCEV *convertSCEV(const SCEV *Expr);
+  const SCEV *convertSCEV(const SCEV *Expr, bool AddRuntimeCheck);
 };

 class LoopIdiomRecognize {
@@ -349,7 +349,8 @@ } // end anonymous namespace

 /// Implementation of SCEVExprConverter.
-const SCEV *SCEVExprConverter::convertSCEV(const SCEV *Expr) {
+/// Tries to fold the SCEV with regard to the loop guards of CurLoop.
+const SCEV *SCEVExprConverter::convertSCEV(const SCEV *Expr,
+                                           bool AddRuntimeCheck) {
   switch (Expr->getSCEVType()) {
   case scConstant:
   case scUnknown:
@@ -358,98 +359,108 @@
   case scTruncate: {
     const SCEVTruncateExpr *Trunc = cast<SCEVTruncateExpr>(Expr);
     Type *Ty = Trunc->getType();
-    const SCEV *NewTrunc = convertSCEV(Trunc->getOperand());
+    const SCEV *NewTrunc = convertSCEV(Trunc->getOperand(), AddRuntimeCheck);
     return SE.getTruncateExpr(NewTrunc, Ty);
   }
   case scZeroExtend: {
     const SCEVZeroExtendExpr *Zext = cast<SCEVZeroExtendExpr>(Expr);
     Type *Ty = Zext->getType();
-    const SCEV *NewZext = convertSCEV(Zext->getOperand());
+    const SCEV *NewZext = convertSCEV(Zext->getOperand(), AddRuntimeCheck);
     return SE.getZeroExtendExpr(NewZext, Ty);
   }
   case scSignExtend: {
-    // Record the original SCEV sext expression, and
-    // convert it to zext.
+    // If the loop entry does not guard the expression to be non-negative and
+    // we may not add a runtime check, return the original SCEV. Otherwise
+    // convert the sext to a zext and, when the guard is missing, record the
+    // runtime check that justifies the conversion.
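+    // For example (illustrative only; the exact SCEVs depend on the input
+    // IR), with a loop-entry guard proving %m >= 0:
+    //   (sext i32 %m to i64)  ==>  (zext i32 %m to i64)   // no check needed
+    // Without such a guard and with AddRuntimeCheck set, the same conversion
+    // is made and "%m slt 0" is recorded in CheckSltMap, so the versioned
+    // loop can fall back to the original body when the check fires.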
     const SCEVSignExtendExpr *Sext = cast<SCEVSignExtendExpr>(Expr);
-    if (CheckSltMap[Sext] < 0)
-      CheckSltMap[Sext] = 0;
-    Type *Ty = Sext->getType();
-    const SCEV *NewZext = convertSCEV(Sext->getOperand());
-    return SE.getZeroExtendExpr(NewZext, Ty);
+    if (!SE.isLoopEntryGuardedByCond(CurLoop, ICmpInst::ICMP_SGE, Sext,
+                                     SE.getZero(Sext->getType()))) {
+      if (!AddRuntimeCheck)
+        return Sext;
+      if (CheckSltMap[Sext] < 0)
+        CheckSltMap[Sext] = 0;
+    }
+    const SCEV *NewZext = convertSCEV(Sext->getOperand(), AddRuntimeCheck);
+    return SE.getZeroExtendExpr(NewZext, Sext->getType());
   }
   case scAddExpr: {
     const SCEVAddExpr *Add = cast<SCEVAddExpr>(Expr);
-    const SCEV *NewAdd = convertSCEV(Add->getOperand(0));
+    const SCEV *NewAdd = convertSCEV(Add->getOperand(0), AddRuntimeCheck);
     for (int I = 1, E = Add->getNumOperands(); I != E; ++I) {
-      NewAdd = SE.getAddExpr(NewAdd, convertSCEV(Add->getOperand(I)));
+      NewAdd = SE.getAddExpr(
+          NewAdd, convertSCEV(Add->getOperand(I), AddRuntimeCheck));
     }
     return NewAdd;
   }
   case scMulExpr: {
     const SCEVMulExpr *Mul = cast<SCEVMulExpr>(Expr);
-    const SCEV *NewMul = convertSCEV(Mul->getOperand(0));
+    const SCEV *NewMul = convertSCEV(Mul->getOperand(0), AddRuntimeCheck);
     for (int I = 1, E = Mul->getNumOperands(); I != E; ++I) {
-      NewMul = SE.getMulExpr(NewMul, convertSCEV(Mul->getOperand(I)));
+      NewMul = SE.getMulExpr(
+          NewMul, convertSCEV(Mul->getOperand(I), AddRuntimeCheck));
     }
     return NewMul;
   }
   case scUDivExpr: {
     const SCEVUDivExpr *UDiv = cast<SCEVUDivExpr>(Expr);
-    const SCEV *NewLHS = convertSCEV(UDiv->getLHS());
-    const SCEV *NewRHS = convertSCEV(UDiv->getRHS());
+    const SCEV *NewLHS = convertSCEV(UDiv->getLHS(), AddRuntimeCheck);
+    const SCEV *NewRHS = convertSCEV(UDiv->getRHS(), AddRuntimeCheck);
     return SE.getUDivExpr(NewLHS, NewRHS);
   }
   case scAddRecExpr:
     assert(false && "Do not expect AddRec here!");
   case scUMaxExpr: {
     const SCEVUMaxExpr *UMax = cast<SCEVUMaxExpr>(Expr);
-    const SCEV *NewUMax = convertSCEV(UMax->getOperand(0));
+    const SCEV *NewUMax = convertSCEV(UMax->getOperand(0), AddRuntimeCheck);
     for (int I = 1, E = UMax->getNumOperands(); I != E; ++I) {
-      NewUMax = SE.getUMaxExpr(NewUMax, convertSCEV(UMax->getOperand(I)));
+      NewUMax = SE.getUMaxExpr(
+          NewUMax, convertSCEV(UMax->getOperand(I), AddRuntimeCheck));
     }
     return NewUMax;
  }
  case scSMaxExpr: {
+    // If the smax is not folded away by a loop-entry guard and we may not add
+    // a runtime check, return the original SCEV. Otherwise fold the constant
+    // operand away and, for every operand that is not covered by a guard,
+    // record the runtime check that justifies the fold.
     const SCEVSMaxExpr *SMax = cast<SCEVSMaxExpr>(Expr);
     const int NumOfOps = SMax->getNumOperands();
     bool Fold = false;
-    // If an operand is constant zero, it will be the first operand.
+    // If an operand is constant, it will be the first operand.
     const SCEV *SMaxOp0 = SMax->getOperand(0);
-    const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(SMaxOp0);
+    const SCEVConstant *Cst = dyn_cast<SCEVConstant>(SMaxOp0);

-    if (LHSC) {
-      // fold the constant with the other operands. there will be
-      // runtime-check to check our assumption for folding the smax is
-      // feasible.
+    if (Cst) {
+      // Check whether each operand is already guarded to be at least the
+      // constant; if not, either give up or record a runtime check.
       Fold = true;
       for (int I = 1, E = NumOfOps; I != E; ++I) {
         auto Ev = SMax->getOperand(I);
-        auto Cst = LHSC->getAPInt().roundToDouble();
-        if (CheckSltMap[Ev] < Cst)
-          CheckSltMap[Ev] = Cst;
+        if (!SE.isLoopEntryGuardedByCond(CurLoop, ICmpInst::ICMP_SGE, Ev,
+                                         Cst)) {
+          if (!AddRuntimeCheck)
+            return SMax;
+          int64_t CstValue = Cst->getAPInt().getSExtValue();
+          if (CheckSltMap[SMax] < CstValue)
+            CheckSltMap[SMax] = CstValue;
+        }
       }
     }
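+    // For example (illustrative): "(1 smax %len)" folds to "%len". With an
+    // entry guard proving %len >= 1 no check is needed; otherwise the pair
+    // ((1 smax %len), 1) is recorded in CheckSltMap so an "slt 1" runtime
+    // check can send execution through the original, unmodified loop.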
     const int StartIdx = Fold ? 1 : 0;
-    const SCEV *NewSMax = convertSCEV(SMax->getOperand(StartIdx));
+    const SCEV *NewSMax = convertSCEV(SMax->getOperand(StartIdx),
+                                      AddRuntimeCheck);
     for (int I = StartIdx + 1, E = NumOfOps; I != E; ++I) {
-      NewSMax = SE.getSMaxExpr(NewSMax, convertSCEV(SMax->getOperand(I)));
+      NewSMax = SE.getSMaxExpr(
+          NewSMax, convertSCEV(SMax->getOperand(I), AddRuntimeCheck));
     }
     return NewSMax;
   }
   case scUMinExpr: {
     const SCEVUMinExpr *UMin = cast<SCEVUMinExpr>(Expr);
-    const SCEV *NewUMin = convertSCEV(UMin->getOperand(0));
+    const SCEV *NewUMin = convertSCEV(UMin->getOperand(0), AddRuntimeCheck);
     for (int I = 1, E = UMin->getNumOperands(); I != E; ++I) {
-      NewUMin = SE.getUMinExpr(NewUMin, convertSCEV(UMin->getOperand(I)));
+      NewUMin = SE.getUMinExpr(
+          NewUMin, convertSCEV(UMin->getOperand(I), AddRuntimeCheck));
     }
     return NewUMin;
   }
   case scSMinExpr: {
     const SCEVSMinExpr *SMin = cast<SCEVSMinExpr>(Expr);
-    const SCEV *NewSMin = convertSCEV(SMin->getOperand(0));
+    const SCEV *NewSMin = convertSCEV(SMin->getOperand(0), AddRuntimeCheck);
     for (int I = 1, E = SMin->getNumOperands(); I != E; ++I) {
-      NewSMin = SE.getSMinExpr(NewSMin, convertSCEV(SMin->getOperand(I)));
+      NewSMin = SE.getSMinExpr(
+          NewSMin, convertSCEV(SMin->getOperand(I), AddRuntimeCheck));
     }
     return NewSMin;
   }
@@ -1196,10 +1207,11 @@
   // if they are equal. If they match, then we know that every byte is
   // touched in the loop. We only handle memset length and stride that
   // are invariant for the top level loop.
+  // To be conservative, we do not version loops whose pointer is outside
+  // address space zero.
   LLVM_DEBUG(dbgs() << "  memset size is non-constant\n");
-  if (LN == nullptr) {
-    LLVM_DEBUG(dbgs() << "  need to call LNIR for non-constant memset"
-                      << "optimization\n");
+  if (Pointer->getType()->getPointerAddressSpace() != 0) {
+    LLVM_DEBUG(dbgs() << "  pointer is not in address space zero\n");
     return false;
   }
   if (!SE->isLoopInvariant(MemsetSizeSCEV, TopLoop) ||
@@ -1218,23 +1230,37 @@
                     << "\n");

   if (PositiveStrideSCEV != MemsetSizeSCEV) {
-    // We will convert the SCEV expressions, and compare again.
-    // required conversion to SCEV will be saved inside Converter.
-    // if this function returns true, which means the optimization does happen,
-    // the pair will be added when we return to processLoopMemIntrinsic.
+    // If the original StrideSCEV and MemsetSizeSCEV do not match, fold the
+    // subexpressions that are covered by the guards at the loop entry, then
+    // compare again and proceed only if the folded expressions are equal.
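+    // For example (illustrative), with an entry guard proving %m >= 0:
+    //   MemsetSizeSCEV:     (4 * (sext i32 %m to i64))
+    //   PositiveStrideSCEV: (4 * (zext i32 %m to i64))
+    // both fold to (4 * (zext i32 %m to i64)), so the two sides match.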
     Converter.CheckSltMap.clear();
-    const SCEV *PositiveStrideSCEVConv =
-        Converter.convertSCEV(PositiveStrideSCEV);
-    const SCEV *MemsetSizeSCEVConv =
-        Converter.convertSCEV(MemsetSizeSCEV);
-    LLVM_DEBUG(dbgs() << "  Try to convert SCEV expression and compare again\n"
-                      << "    MemsetSCEVConv: " << *MemsetSizeSCEVConv << "\n"
-                      << "    PositiveStrideSCEVConv: "
-                      << *PositiveStrideSCEVConv << "\n");
-
-    if (PositiveStrideSCEVConv != MemsetSizeSCEVConv) {
-      LLVM_DEBUG(dbgs() << "  Converted SCEV still inequal, abort\n");
-      return false;
+    Converter.CurLoop = CurLoop;
+    const SCEV *FoldedPositiveStride =
+        Converter.convertSCEV(PositiveStrideSCEV, /*AddRuntimeCheck=*/false);
+    const SCEV *FoldedMemsetSize =
+        Converter.convertSCEV(MemsetSizeSCEV, /*AddRuntimeCheck=*/false);
+    LLVM_DEBUG(dbgs() << "  Try to fold SCEV expressions covered by the loop "
+                      << "guards\n"
+                      << "    FoldedMemsetSCEV: " << *FoldedMemsetSize << "\n"
+                      << "    FoldedPositiveStrideSCEV: "
+                      << *FoldedPositiveStride << "\n");
+
+    if (FoldedPositiveStride != FoldedMemsetSize) {
+      if (LN == nullptr || ForceNoLoopVersion) {
+        LLVM_DEBUG(dbgs() << "  unable to do loop versioning here, abort\n");
+        return false;
+      }
+      const SCEV *ConvertedPositiveStride =
+          Converter.convertSCEV(FoldedPositiveStride, /*AddRuntimeCheck=*/true);
+      const SCEV *ConvertedMemsetSize =
+          Converter.convertSCEV(FoldedMemsetSize, /*AddRuntimeCheck=*/true);
+      LLVM_DEBUG(dbgs() << "  Try to convert SCEV expressions, adding the "
+                        << "runtime checks the conversion requires\n"
+                        << "    ConvertedMemsetSCEV: " << *ConvertedMemsetSize
+                        << "\n"
+                        << "    ConvertedPositiveStrideSCEV: "
+                        << *ConvertedPositiveStride << "\n");
+      if (ConvertedPositiveStride != ConvertedMemsetSize) {
+        LLVM_DEBUG(dbgs() << "  Converted SCEVs are still unequal, abort\n");
+        return false;
+      }
     }
   }
 }
@@ -1253,7 +1279,7 @@

   // if we have successfully changed with processLoopStridedStore
   // add the required runtime-check information into the list.
-  if (Changed) {
+  if (Changed && isTopLoopVersioned()) {
     for (auto Pair : Converter.CheckSltMap) {
       auto Ev = Pair.first;
       auto Cst = Pair.second;
@@ -1494,16 +1520,18 @@
   // NumBytes = TripCount * StoreSize
   const SCEV *TripCountS = getTripCount(BECount, IntIdxTy, CurLoop, DL, SE);

-  // This check is possible only for LoopNestIdiomRecognize, since we are
-  // trying to version on the top-level loop.
-  // Give up if the store size is not constant and the trip count SCEV
-  // expression is variant to the top level loop. In this sense versioning is
-  // needed and compile option enforces not to.
-  if (LN != nullptr && !SE->isLoopInvariant(TripCountS, TopLoop)) {
-    const bool IsConstantSize = isa<SCEVConstant>(StoreSizeSCEV);
-    if (IsLoopMemset && !IsConstantSize && ForceNoLoopVersion) {
-      LLVM_DEBUG(dbgs() << "requires versioning but abort becuase "
-                        << "ForceNoLoopVersion is set to true\n");
+  // If the store size is not constant and runtime checks are needed for the
+  // optimization to proceed, then versioning is required.
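+  // The versioned structure is (illustrative):
+  //   if (<any recorded "slt" check fires>)   // e.g. %m slt 0
+  //     ... original loop nest ...
+  //   else
+  //     ... loop nest with the memset hoisted to the preheader ...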
+  const bool IsConstantSize = isa<SCEVConstant>(StoreSizeSCEV);
+  if (IsLoopMemset && !IsConstantSize && Converter.CheckSltMap.size()) {
+    if (LN == nullptr) {
+      LLVM_DEBUG(dbgs() << "requires versioning but running "
+                        << "LoopIdiomRecognize, abort (run "
+                        << "LoopNestIdiomRecognize, which supports "
+                        << "versioning, instead)\n");
+      return Changed;
+    }
+    if (!SE->isLoopInvariant(TripCountS, TopLoop) || ForceNoLoopVersion) {
+      LLVM_DEBUG(dbgs() << "abort because TripCount is not invariant to the "
+                        << "top loop or ForceNoLoopVersion is set\n");
       return Changed;
     }
   }
@@ -1540,18 +1568,16 @@
   // Here we check whether the top-level clone has been created yet, and
   // create it if it hasn't. The initial runtime check is set to false and
   // the conditions are updated after we process all the loops.
-  const bool IsConstantSize = isa<SCEVConstant>(StoreSizeSCEV);
-  if (LN != nullptr && IsLoopMemset && !IsConstantSize && !ForceNoLoopVersion) {
-    if (!isTopLoopVersioned() && Converter.CheckSltMap.size()) {
-      LLVM_DEBUG(dbgs() << "  Create versioning for top loop because SCEV folding is needed\n");
-      versionTopLoop();
-
-      // If current loop is the top loop, versioning would change the loop's
-      // preheader to RuntimeCheckBB, so we need to reset the insert point.
-      if (CurLoop == TopLoop) {
-        Preheader = CurLoop->getLoopPreheader();
-        Builder.SetInsertPoint(Preheader->getTerminator());
-      }
+  if (LN != nullptr && IsLoopMemset && !IsConstantSize && !ForceNoLoopVersion &&
+      !isTopLoopVersioned() && Converter.CheckSltMap.size()) {
+    LLVM_DEBUG(dbgs() << "  Create versioning for the top loop because a "
+                      << "runtime check for the SCEV conversion is needed\n");
+    versionTopLoop();
+
+    // If the current loop is the top loop, versioning changes the loop's
+    // preheader to RuntimeCheckBB, so we need to reset the insert point.
+    if (CurLoop == TopLoop) {
+      Preheader = CurLoop->getLoopPreheader();
+      Builder.SetInsertPoint(Preheader->getTerminator());
     }
   }

diff --git a/llvm/test/Transforms/LoopIdiom/memset-runtime.ll b/llvm/test/Transforms/LoopIdiom/memset-runtime-lir.ll
rename from llvm/test/Transforms/LoopIdiom/memset-runtime.ll
rename to llvm/test/Transforms/LoopIdiom/memset-runtime-lir.ll
--- a/llvm/test/Transforms/LoopIdiom/memset-runtime.ll
+++ b/llvm/test/Transforms/LoopIdiom/memset-runtime-lir.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -passes="function(loop(loop-nest-idiom,loop-deletion),simplifycfg)" < %s -S | FileCheck %s
+; RUN: opt -passes="function(loop(loop-idiom,loop-deletion),simplifycfg)" < %s -S | FileCheck %s
 ; The C code to generate this testcase:
 ; void test(int ar[][m], long n, long m)
 ; {
@@ -12,10 +12,7 @@
 ; The optimized IR should be similar to the following:
 ; void test(int ar[][m], long n, long m)
 ; {
-;   if (n < 0 || m < 0 || (n >> 32) != 0 || (4 * m >> 32) != 0)
-;     /* optimization result identical to LoopIdiomRecognize */
-;   else
-;     /* hoists memset to loop-preheader */
+;   memset(ar, 0, m * n * sizeof(int));
 ; }
 define void @test_simple(i32* nocapture %ar, i64 %n, i64 %m) {
 ; CHECK-LABEL: @test_simple(
@@ -65,8 +62,7 @@
 ; }
 define void @test_nested_do_while(i32 %n, i32 %m, i32 %o, i32* nocapture %ar){
 ; CHECK-LABEL: @test_nested_do_while(
-; CHECK-NEXT:  do.body.lver.check:
-; CHECK-NEXT:    [[AR2:%.*]] = bitcast i32* [[AR:%.*]] to i8*
+; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[CONV:%.*]] = sext i32 [[M:%.*]] to i64
 ; CHECK-NEXT:    [[CONV2:%.*]] = sext i32 [[O:%.*]] to i64
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i64 [[CONV2]], [[CONV]]
@@ -77,37 +73,18 @@
 ; CHECK-NEXT:    [[TMP1:%.*]] = mul i64 [[CONV2]], [[CONV]]
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul i64 [[SMAX]], [[CONV2]]
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
-; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[SMAX27]]
-; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i64 [[CONV]], 1
-; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
-; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i64 [[CONV2]], 0
-; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[DO_BODY_LVER_ORIG:%.*]], label [[DO_BODY_PH:%.*]]
-; CHECK:       do.body.lver.orig:
-; CHECK-NEXT:    [[I_0_LVER_ORIG:%.*]] = phi i64 [ [[INC11_LVER_ORIG:%.*]], [[DO_END_LVER_ORIG:%.*]] ], [ 0, [[DO_BODY_LVER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP1]], [[I_0_LVER_ORIG]]
-; CHECK-NEXT:    [[SCEVGEP_LVER_ORIG:%.*]] = getelementptr i32, i32* [[AR]], i64 [[TMP10]]
-; CHECK-NEXT:    [[SCEVGEP1_LVER_ORIG:%.*]] = bitcast i32* [[SCEVGEP_LVER_ORIG]] to i8*
-; CHECK-NEXT:    [[MUL3_LVER_ORIG:%.*]] = mul i64 [[MUL]], [[I_0_LVER_ORIG]]
-; CHECK-NEXT:    [[ADD_PTR_LVER_ORIG:%.*]] = getelementptr inbounds i32, i32* [[AR]], i64 [[MUL3_LVER_ORIG]]
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[SCEVGEP1_LVER_ORIG]], i8 0, i64 [[TMP3]], i1 false)
-; CHECK-NEXT:    br label [[DO_BODY1_LVER_ORIG:%.*]]
-; CHECK:       do.body1.lver.orig:
-; CHECK-NEXT:    [[J_0_LVER_ORIG:%.*]] = phi i64 [ 0, [[DO_BODY_LVER_ORIG]] ], [ [[INC_LVER_ORIG:%.*]], [[DO_BODY1_LVER_ORIG]] ]
-; CHECK-NEXT:    [[MUL5_LVER_ORIG:%.*]] = mul nsw i64 [[J_0_LVER_ORIG]], [[CONV2]]
-; CHECK-NEXT:    [[ADD_PTR6_LVER_ORIG:%.*]] = getelementptr inbounds i32, i32* [[ADD_PTR_LVER_ORIG]], i64 [[MUL5_LVER_ORIG]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[ADD_PTR6_LVER_ORIG]] to i8*
-; CHECK-NEXT:    [[INC_LVER_ORIG]] = add nuw nsw i64 [[J_0_LVER_ORIG]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT_LVER_ORIG:%.*]] = icmp eq i64 [[INC_LVER_ORIG]], [[SMAX]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT_LVER_ORIG]], label [[DO_END_LVER_ORIG]], label [[DO_BODY1_LVER_ORIG]]
-; CHECK:       do.end.lver.orig:
-; CHECK-NEXT:    [[INC11_LVER_ORIG]] = add nuw nsw i64 [[I_0_LVER_ORIG]], 1
-; CHECK-NEXT:    [[EXITCOND28_NOT_LVER_ORIG:%.*]] = icmp eq i64 [[INC11_LVER_ORIG]], [[SMAX27]]
-; CHECK-NEXT:    br i1 [[EXITCOND28_NOT_LVER_ORIG]], label [[DO_END16:%.*]], label [[DO_BODY_LVER_ORIG]]
-; CHECK:       do.body.ph:
-; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[AR2]], i8 0, i64 [[TMP5]], i1 false)
-; CHECK-NEXT:    br label [[DO_END16]]
+; CHECK-NEXT:    br label [[DO_BODY:%.*]]
+; CHECK:       do.body:
+; CHECK-NEXT:    [[I_0:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC11:%.*]], [[DO_BODY]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP1]], [[I_0]]
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i32, i32* [[AR:%.*]], i64 [[TMP4]]
+; CHECK-NEXT:    [[SCEVGEP1:%.*]] = bitcast i32* [[SCEVGEP]] to i8*
+; CHECK-NEXT:    [[MUL3:%.*]] = mul i64 [[MUL]], [[I_0]]
+; CHECK-NEXT:    [[ADD_PTR:%.*]] = getelementptr inbounds i32, i32* [[AR]], i64 [[MUL3]]
+; CHECK-NEXT:    call void @llvm.memset.p0i8.i64(i8* align 4 [[SCEVGEP1]], i8 0, i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    [[INC11]] = add nuw nsw i64 [[I_0]], 1
+; CHECK-NEXT:    [[EXITCOND28_NOT:%.*]] = icmp eq i64 [[INC11]], [[SMAX27]]
+; CHECK-NEXT:    br i1 [[EXITCOND28_NOT]], label [[DO_END16:%.*]], label [[DO_BODY]]
 ; CHECK:       do.end16:
 ; CHECK-NEXT:    ret void
 ;
@@ -151,7 +128,7 @@
 ; for (int i=0; i<n; i++)
diff --git a/llvm/test/Transforms/LoopIdiom/memset-runtime.ll b/llvm/test/Transforms/LoopIdiom/memset-runtime-lnir.ll
copy from llvm/test/Transforms/LoopIdiom/memset-runtime.ll
copy to llvm/test/Transforms/LoopIdiom/memset-runtime-lnir.ll
--- a/llvm/test/Transforms/LoopIdiom/memset-runtime.ll
+++ b/llvm/test/Transforms/LoopIdiom/memset-runtime-lnir.ll
@@ -12,10 +12,7 @@
 ; The optimized IR should be similar to the following:
 ; void test(int ar[][m], long n, long m)
 ; {
-;   if (n < 0 || m < 0 || (n >> 32) != 0 || (4 * m >> 32) != 0)
-;     /* optimization result identical to LoopIdiomRecognize */
-;   else
-;     /* hoists memset to loop-preheader */
+;   memset(ar, 0, m * n * sizeof(int));
 ; }
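+; Note (illustrative): under loop-nest-idiom the hoisted memset stays guarded
+; by runtime checks; the .lver.check block below ORs together the "slt"
+; conditions recorded during SCEV conversion and branches to the original
+; loop nest if any of them fires.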
 define void @test_simple(i32* nocapture %ar, i64 %n, i64 %m) {
 ; CHECK-LABEL: @test_simple(
@@ -79,15 +76,17 @@
 ; CHECK-NEXT:    [[TMP3:%.*]] = shl i64 [[TMP2]], 2
 ; CHECK-NEXT:    [[TMP4:%.*]] = mul i64 [[TMP2]], [[SMAX27]]
 ; CHECK-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 2
-; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i64 [[CONV]], 1
+; CHECK-NEXT:    [[TMP6:%.*]] = icmp slt i64 [[CONV]], 0
 ; CHECK-NEXT:    [[TMP7:%.*]] = or i1 false, [[TMP6]]
 ; CHECK-NEXT:    [[TMP8:%.*]] = icmp slt i64 [[CONV2]], 0
 ; CHECK-NEXT:    [[TMP9:%.*]] = or i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT:    br i1 [[TMP9]], label [[DO_BODY_LVER_ORIG:%.*]], label [[DO_BODY_PH:%.*]]
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp slt i64 [[SMAX]], 1
+; CHECK-NEXT:    [[TMP11:%.*]] = or i1 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    br i1 [[TMP11]], label [[DO_BODY_LVER_ORIG:%.*]], label [[DO_BODY_PH:%.*]]
 ; CHECK:       do.body.lver.orig:
 ; CHECK-NEXT:    [[I_0_LVER_ORIG:%.*]] = phi i64 [ [[INC11_LVER_ORIG:%.*]], [[DO_END_LVER_ORIG:%.*]] ], [ 0, [[DO_BODY_LVER_CHECK:%.*]] ]
-; CHECK-NEXT:    [[TMP10:%.*]] = mul i64 [[TMP1]], [[I_0_LVER_ORIG]]
-; CHECK-NEXT:    [[SCEVGEP_LVER_ORIG:%.*]] = getelementptr i32, i32* [[AR]], i64 [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = mul i64 [[TMP1]], [[I_0_LVER_ORIG]]
+; CHECK-NEXT:    [[SCEVGEP_LVER_ORIG:%.*]] = getelementptr i32, i32* [[AR]], i64 [[TMP12]]
 ; CHECK-NEXT:    [[SCEVGEP1_LVER_ORIG:%.*]] = bitcast i32* [[SCEVGEP_LVER_ORIG]] to i8*
 ; CHECK-NEXT:    [[MUL3_LVER_ORIG:%.*]] = mul i64 [[MUL]], [[I_0_LVER_ORIG]]
 ; CHECK-NEXT:    [[ADD_PTR_LVER_ORIG:%.*]] = getelementptr inbounds i32, i32* [[AR]], i64 [[MUL3_LVER_ORIG]]
@@ -97,7 +96,7 @@
 ; CHECK-NEXT:    [[J_0_LVER_ORIG:%.*]] = phi i64 [ 0, [[DO_BODY_LVER_ORIG]] ], [ [[INC_LVER_ORIG:%.*]], [[DO_BODY1_LVER_ORIG]] ]
 ; CHECK-NEXT:    [[MUL5_LVER_ORIG:%.*]] = mul nsw i64 [[J_0_LVER_ORIG]], [[CONV2]]
 ; CHECK-NEXT:    [[ADD_PTR6_LVER_ORIG:%.*]] = getelementptr inbounds i32, i32* [[ADD_PTR_LVER_ORIG]], i64 [[MUL5_LVER_ORIG]]
-; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[ADD_PTR6_LVER_ORIG]] to i8*
+; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[ADD_PTR6_LVER_ORIG]] to i8*
 ; CHECK-NEXT:    [[INC_LVER_ORIG]] = add nuw nsw i64 [[J_0_LVER_ORIG]], 1
 ; CHECK-NEXT:    [[EXITCOND_NOT_LVER_ORIG:%.*]] = icmp eq i64 [[INC_LVER_ORIG]], [[SMAX]]
 ; CHECK-NEXT:    br i1 [[EXITCOND_NOT_LVER_ORIG]], label [[DO_END_LVER_ORIG]], label [[DO_BODY1_LVER_ORIG]]
@@ -151,7 +150,7 @@
 ; for (int i=0; i<n; i++)