Index: llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
===================================================================
--- llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -51,6 +51,8 @@
 #include "llvm/Analysis/PostDominators.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
 #include "llvm/IR/Argument.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
@@ -109,6 +111,20 @@
 DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
               "Controls which MemoryDefs are eliminated.");
 
+// If enabled, this optimizes loop patterns like this:
+//   memset(P, 0, sizeof(int) * N);  <- removed
+//   for (int i = 0; i < N; ++i)
+//     P[i] = 1;
+static cl::opt<bool> EnableOptimizationAcrossLoops(
+    "enable-dse-optimize-across-loops", cl::init(true), cl::Hidden,
+    cl::desc("Enable elimination of redundant "
+             "loop-variant stores and memory intrinsics"));
+
+static cl::opt<unsigned> AcrossLoopStoreThreshold(
+    "dse-across-loop-store-threshold", cl::init(4), cl::Hidden,
+    cl::desc("The maximum number of stores across loops to optimize"));
+
 static cl::opt<bool>
 EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
   cl::init(true), cl::Hidden,
@@ -725,10 +741,33 @@
   return false;
 }
 
+struct ContinuousMemoryRange {
+  const SCEV *Start;
+  const SCEV *Length;
+
+  ContinuousMemoryRange(const SCEV *Start, const SCEV *Length)
+      : Start(Start), Length(Length) {}
+
+  static ContinuousMemoryRange createEmpty() {
+    return ContinuousMemoryRange(nullptr, nullptr);
+  }
+
+  bool isEmpty() { return Start == nullptr || Length == nullptr; }
+
+  bool operator==(const ContinuousMemoryRange &Other) const {
+    return Start == Other.Start && Length == Other.Length;
+  }
+
+  bool operator!=(const ContinuousMemoryRange &Other) const {
+    return !(*this == Other);
+  }
+};
+
 struct DSEState {
   Function &F;
   AliasAnalysis &AA;
   EarliestEscapeInfo EI;
+  ScalarEvolution *SE;
 
   /// The single BatchAA instance that is used to cache AA queries. It will
   /// not be invalidated over the whole run. This is safe, because:
@@ -768,6 +807,8 @@
   // analysis.
   SmallPtrSet<const Value *, 4> EphValues;
 
+  DenseMap<const Instruction *, ContinuousMemoryRange> MemRanges;
+
   /// Keep track of instructions (partly) overlapping with killing MemoryDefs
   /// per basic block.
   MapVector<BasicBlock *, InstOverlapIntervalsTy> IOLs;
@@ -785,11 +826,30 @@
   DSEState(const DSEState &) = delete;
   DSEState &operator=(const DSEState &) = delete;
 
+  void prepareStateForAcrossLoopOptimization() {
+    MemDefs.clear();
+    SkipStores.clear();
+    for (auto *L : LI) {
+      if (!canAddMoreStoresFromLoop()) {
+        break;
+      }
+      if (!L->isInnermost() || !L->isOutermost()) {
+        continue;
+      }
+
+      if (L->getNumBlocks() != 1) {
+        continue;
+      }
+
+      prepareLoopVariantStoresWithRanges(L);
+    }
+  }
+
   DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
-           PostDominatorTree &PDT, AssumptionCache &AC,
+           PostDominatorTree &PDT, ScalarEvolution *SE, AssumptionCache &AC,
            const TargetLibraryInfo &TLI, const LoopInfo &LI)
       : F(F), AA(AA), EI(DT, LI, EphValues), BatchAA(AA, &EI), MSSA(MSSA),
-        DT(DT), PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI) {
+        DT(DT), PDT(PDT), TLI(TLI), DL(F.getParent()->getDataLayout()), LI(LI),
+        SE(SE) {
     // Collect blocks with throwing instructions not modeled in MemorySSA and
     // alloc-like objects.
     unsigned PO = 0;
@@ -823,6 +883,106 @@
     CodeMetrics::collectEphemeralValues(&F, &AC, EphValues);
   }
 
+  bool canAddMoreStoresFromLoop() {
+    return MemDefs.size() <= AcrossLoopStoreThreshold;
+  }
+
+  void prepareLoopVariantStoresWithRanges(Loop *L) {
+    auto *BB = L->getLoopLatch();
+    if (ThrowingBlocks.count(BB))
+      return;
+
+    for (auto &I : *BB) {
+      if (I.mayThrow())
+        break;
+      if (auto *S = dyn_cast<StoreInst>(&I))
+        if (auto *MD = dyn_cast<MemoryDef>(MSSA.getMemoryAccess(S))) {
+          if (S->isVolatile() || S->isAtomic()) {
+            continue;
+          }
+
+          auto Range = computeMemoryRange(L, S);
+          if (!Range.isEmpty()) {
+            MemDefs.push_back(MD);
+            addMemRange(S, Range);
+            // Do not waste resources on analyzing this block further.
+            // One store is enough to eliminate memsets or stores that
+            // are killed by this loop.
+            return;
+          }
+        }
+    }
+  }
+
+  void addMemRange(Instruction *I, ContinuousMemoryRange &Range) {
+    if (Range.isEmpty())
+      return;
+    MemRanges.insert({I, Range});
+  }
+
+  ContinuousMemoryRange computeMemoryRange(const AnyMemIntrinsic *MemIntr) {
+    const auto *Length = SE->getSCEV(MemIntr->getLength());
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    auto *Start = SE->getSCEV(MemIntr->getDest());
+    return ContinuousMemoryRange(Start, Length);
+  }
+
+  // Compute the contiguous byte range written by an affine store across the
+  // whole loop: the range starts at the recurrence's base address and its
+  // length is (BackedgeTakenCount + 1) * StoreSize.
+  ContinuousMemoryRange computeMemoryRange(Loop *L, StoreInst *Store) {
+    auto *GEP = dyn_cast<GetElementPtrInst>(Store->getPointerOperand());
+    if (!GEP)
+      return ContinuousMemoryRange::createEmpty();
+
+    // The GEP must be in the same loop as the store that uses it.
+    if (!L->contains(GEP))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *Type = Store->getValueOperand()->getType();
+    if (!Type->isFloatingPointTy() && !Type->isIntegerTy())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *StoreRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(GEP));
+    if (!StoreRec)
+      return ContinuousMemoryRange::createEmpty();
+
+    if (!StoreRec->isAffine())
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *BTC = SE->getBackedgeTakenCount(L);
+    if (isa<SCEVCouldNotCompute>(BTC))
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Start = StoreRec->getOperand(0);
+    if (isa<SCEVCouldNotCompute>(Start))
+      return ContinuousMemoryRange::createEmpty();
+
+    auto *Count = SE->getAddExpr(BTC, SE->getOne(BTC->getType()));
+    auto SizeInBytes =
+        Store->getValueOperand()->getType()->getScalarSizeInBits() / 8;
+    auto *StoreSize = SE->getConstant(Count->getType(), SizeInBytes);
+
+    // The store must advance by exactly its own size on every iteration, so
+    // that the written bytes form a single contiguous range.
+    auto *Step = StoreRec->getOperand(1);
+    if (Step != StoreSize)
+      return ContinuousMemoryRange::createEmpty();
+
+    const auto *Length = SE->getMulExpr(Count, StoreSize);
+    if (isa<SCEVCouldNotCompute>(Length))
+      return ContinuousMemoryRange::createEmpty();
+    return ContinuousMemoryRange(Start, Length);
+  }
+
+  bool isOptimizableAcrossLoops(const Instruction *KillingI,
+                                const Instruction *DeadI) {
+    if (!EnableOptimizationAcrossLoops)
+      return false;
+
+    // Alias analysis can already handle two memory intrinsics, so skip that
+    // case here.
+    if (isa<AnyMemIntrinsic>(DeadI) && isa<AnyMemIntrinsic>(KillingI))
+      return false;
+    return true;
+  }
+
   /// Return 'OW_Complete' if a store to the 'KillingLoc' location (by \p
   /// KillingI instruction) completely overwrites a store to the 'DeadLoc'
   /// location (by \p DeadI instruction).
@@ -836,6 +996,10 @@
                               const MemoryLocation &KillingLoc,
                               const MemoryLocation &DeadLoc,
                               int64_t &KillingOff, int64_t &DeadOff) {
+
+    if (isOptimizableAcrossLoops(KillingI, DeadI))
+      if (isLoopVariantStoreOverwrite(KillingI, DeadI))
+        return OW_Complete;
     // AliasAnalysis does not always account for loops. Limit overwrite checks
     // to dependencies for which we can guarantee they are independent of any
     // loops they are in.
@@ -1327,9 +1491,11 @@
       // candidates for which we can guarantee they always store to the same
       // memory location and not located in different loops.
       if (!isGuaranteedLoopIndependent(CurrentI, KillingI, *CurrentLoc)) {
-        LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
-        CanOptimize = false;
-        continue;
+        if (!EnableOptimizationAcrossLoops || MemRanges.empty()) {
+          LLVM_DEBUG(dbgs() << "  ... not guaranteed loop independent\n");
+          CanOptimize = false;
+          continue;
+        }
       }
 
       if (IsMemTerm) {
@@ -1466,8 +1632,11 @@
       // store kills itself.
       if (MaybeDeadAccess == UseAccess &&
           !isGuaranteedLoopInvariant(MaybeDeadLoc.Ptr)) {
-        LLVM_DEBUG(dbgs() << "    ... found not loop invariant self access\n");
-        return None;
+        if (!EnableOptimizationAcrossLoops || !MemRanges.count(MaybeDeadI)) {
+          LLVM_DEBUG(dbgs()
+                     << "    ... found not loop invariant self access\n");
+          return None;
+        }
       }
       // Otherwise, for the KillingDef and MaybeDeadAccess we only have to check
       // if it reads the memory location.
@@ -1515,6 +1684,7 @@
     SmallPtrSet<BasicBlock *, 16> KillingBlocks;
     for (Instruction *KD : KillingDefs)
       KillingBlocks.insert(KD->getParent());
+
     assert(!KillingBlocks.empty() &&
            "Expected at least a single killing block");
@@ -1583,6 +1753,27 @@
     return {MaybeDeadAccess};
   }
 
+  bool isLoopVariantStoreOverwrite(const Instruction *KillingI,
+                                   const Instruction *DeadI) {
+    auto KillingRange = MemRanges.find(KillingI);
+    auto DeadRange = MemRanges.find(DeadI);
+
+    if (MemRanges.count(DeadI) && MemRanges.count(KillingI))
+      return DeadRange->second == KillingRange->second;
+
+    if (MemRanges.count(DeadI) && isa<AnyMemIntrinsic>(KillingI)) {
+      auto MemIntrRange = computeMemoryRange(cast<AnyMemIntrinsic>(KillingI));
+      if (!MemIntrRange.isEmpty())
+        return DeadRange->second == MemIntrRange;
+    } else if (MemRanges.count(KillingI) && isa<AnyMemIntrinsic>(DeadI)) {
+      auto MemIntrRange = computeMemoryRange(cast<AnyMemIntrinsic>(DeadI));
+      if (!MemIntrRange.isEmpty())
+        return KillingRange->second == MemIntrRange;
+    }
+
+    return false;
+  }
+
   // Delete dead memory defs
   void deleteDeadInstruction(Instruction *SI) {
     MemorySSAUpdater Updater(&MSSA);
@@ -1949,15 +2140,8 @@
   }
 };
 
-static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
-                                DominatorTree &DT, PostDominatorTree &PDT,
-                                AssumptionCache &AC,
-                                const TargetLibraryInfo &TLI,
-                                const LoopInfo &LI) {
+static bool runDSEOptimizationLoop(DSEState &State) {
   bool MadeChange = false;
-
-  MSSA.ensureOptimizedUses();
-  DSEState State(F, AA, MSSA, DT, PDT, AC, TLI, LI);
   // For each store:
   for (unsigned I = 0; I < State.MemDefs.size(); I++) {
     MemoryDef *KillingDef = State.MemDefs[I];
@@ -2066,10 +2250,10 @@
       // We are re-using tryToMergePartialOverlappingStores, which requires
       // DeadSI to dominate DeadSI.
       // TODO: implement tryToMergeParialOverlappingStores using MemorySSA.
-      if (DeadSI && KillingSI && DT.dominates(DeadSI, KillingSI)) {
+      if (DeadSI && KillingSI && State.DT.dominates(DeadSI, KillingSI)) {
        if (Constant *Merged = tryToMergePartialOverlappingStores(
                KillingSI, DeadSI, KillingOffset, DeadOffset, State.DL,
-               State.BatchAA, &DT)) {
+               State.BatchAA, &State.DT)) {
          // Update stored value of earlier store to merged constant.
          DeadSI->setOperand(0, Merged);
@@ -2117,13 +2301,39 @@
       continue;
     }
   }
+  return MadeChange;
+}
+
+static bool runAcrossLoopDSE(DSEState &State) {
+  if (!EnableOptimizationAcrossLoops) {
+    return false;
+  }
+  State.prepareStateForAcrossLoopOptimization();
+  return runDSEOptimizationLoop(State);
+}
+
+static bool eliminateDeadStores(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+                                DominatorTree &DT, PostDominatorTree &PDT,
+                                ScalarEvolution *SE,
+                                AssumptionCache &AC,
+                                const TargetLibraryInfo &TLI,
+                                const LoopInfo &LI) {
+
+  MSSA.ensureOptimizedUses();
+  DSEState State(F, AA, MSSA, DT, PDT, SE, AC, TLI, LI);
+  bool MadeChange = false;
+  MadeChange |= runDSEOptimizationLoop(State);
 
   if (EnablePartialOverwriteTracking)
     for (auto &KV : State.IOLs)
       MadeChange |= State.removePartiallyOverlappedStores(KV.second);
 
   MadeChange |= State.eliminateRedundantStoresOfExistingValues();
   MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+  // Optimize across loops at the end because there should be fewer stores
+  // to analyze by then. Also, stores that overwrite other stores in the
+  // same loop have already been removed.
+  MadeChange |= runAcrossLoopDSE(State);
   return MadeChange;
 }
 } // end anonymous namespace
@@ -2140,7 +2350,11 @@
   AssumptionCache &AC = AM.getResult<AssumptionAnalysis>(F);
   LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
-  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI);
+  ScalarEvolution *SE = nullptr;
+  if (EnableOptimizationAcrossLoops)
+    SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
+
+  bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, AC, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
   if (AreStatisticsEnabled())
@@ -2148,10 +2362,14 @@
       NumRemainingStores += isa<StoreInst>(&I);
 #endif
 
-  if (!Changed)
+  if (!Changed) {
     return PreservedAnalyses::all();
-
+  }
   PreservedAnalyses PA;
+  if (EnableOptimizationAcrossLoops) {
+    PA.preserve<ScalarEvolutionAnalysis>();
+  }
+
   PA.preserveSet<CFGAnalyses>();
   PA.preserve<MemorySSAAnalysis>();
   PA.preserve<LoopAnalysis>();
@@ -2183,8 +2401,11 @@
     AssumptionCache &AC =
         getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
-
-    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, AC, TLI, LI);
+    ScalarEvolution *SE = nullptr;
+    if (EnableOptimizationAcrossLoops) {
+      SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+    }
+    bool Changed = eliminateDeadStores(F, AA, MSSA, DT, PDT, SE, AC, TLI, LI);
 
 #ifdef LLVM_ENABLE_STATS
     if (AreStatisticsEnabled())
@@ -2197,6 +2418,10 @@
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
+    if (EnableOptimizationAcrossLoops) {
+      AU.addRequired<ScalarEvolutionWrapperPass>();
+      AU.addPreserved<ScalarEvolutionWrapperPass>();
+    }
     AU.addRequired<AAResultsWrapperPass>();
     AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<DominatorTreeWrapperPass>();
@@ -2227,6 +2452,7 @@
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
 INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
                     false)
Index: llvm/test/Transforms/DeadStoreElimination/loop-variant-store-complete-overwrite.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/DeadStoreElimination/loop-variant-store-complete-overwrite.ll
@@ -0,0 +1,109 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S --passes=dse < %s | FileCheck %s
+define dso_local void @init_vector(ptr %p) {
+; CHECK-LABEL: @init_vector(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[N_06:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[CONV:%.*]] = trunc i64 [[N_06]] to i32
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[N_06]]
+; CHECK-NEXT:    store i32 [[CONV]], ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INC]] = add nuw nsw i64 [[N_06]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], 4096
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[NRVO_SKIPDTOR:%.*]], label [[FOR_BODY]]
+; CHECK:       nrvo.skipdtor:
+; CHECK-NEXT:    ret void
+;
+entry:
+; 1 = MemoryDef(liveOnEntry)
+  call void @llvm.memset.p0.i64(ptr %p, i8 0, i64 16384, i1 false)
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+; 3 = MemoryPhi({entry,1},{for.body,2})
+  %n.06 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
+  %conv = trunc i64 %n.06 to i32
+  %arrayidx = getelementptr inbounds i32, ptr %p, i64 %n.06
+; 2 = MemoryDef(3)
+  store i32 %conv, ptr %arrayidx, align 4
+  %inc = add nuw nsw i64 %n.06, 1
+  %exitcond.not = icmp eq i64 %inc, 4096
+  br i1 %exitcond.not, label %nrvo.skipdtor, label %for.body
+
+nrvo.skipdtor:                                    ; preds = %for.body
+  ret void
+}
+
+define dso_local void @store_in_two_loops(ptr nocapture noundef writeonly %P, i32 noundef %N) {
+; CHECK-LABEL: @store_in_two_loops(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP17_NOT:%.*]] = icmp eq i32 [[N:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP17_NOT]], label [[FOR_COND_CLEANUP4:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.cond2.preheader:
+; CHECK-NEXT:    br i1 [[CMP17_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_BODY5_PREHEADER:%.*]]
+; CHECK:       for.body5.preheader:
+; CHECK-NEXT:    [[WIDE_TRIP_COUNT25:%.*]] = zext i32 [[N]] to i64
+; CHECK-NEXT:    br label [[FOR_BODY5:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[INDVARS_IV]]
+; CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label [[FOR_COND2_PREHEADER:%.*]], label [[FOR_BODY]]
+; CHECK:       for.cond.cleanup4:
+; CHECK-NEXT:    ret void
+; CHECK:       for.body5:
+; CHECK-NEXT:    [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[FOR_BODY5_PREHEADER]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[FOR_BODY5]] ]
+; CHECK-NEXT:    [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 [[INDVARS_IV22]]
+; CHECK-NEXT:    store i32 1, ptr [[ARRAYIDX7]], align 4
+; CHECK-NEXT:    [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1
+; CHECK-NEXT:    [[EXITCOND26_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], [[WIDE_TRIP_COUNT25]]
+; CHECK-NEXT:    br i1 [[EXITCOND26_NOT]], label [[FOR_COND_CLEANUP4]], label [[FOR_BODY5]]
+;
+entry:
+  %cmp17.not = icmp eq i32 %N, 0
+  br i1 %cmp17.not, label %for.cond.cleanup4, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %N to i64
+  br label %for.body
+
+for.cond2.preheader:                              ; preds = %for.body
+  br i1 %cmp17.not, label %for.cond.cleanup4, label %for.body5.preheader
+
+for.body5.preheader:                              ; preds = %for.cond2.preheader
+  %wide.trip.count25 = zext i32 %N to i64
+  br label %for.body5
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+; 5 = MemoryPhi({for.body.preheader,liveOnEntry},{for.body,1})
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, ptr %P, i64 %indvars.iv
+; 1 = MemoryDef(5)
+  store i32 1, ptr %arrayidx, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond2.preheader, label %for.body
+
+for.cond.cleanup4:                                ; preds = %for.body5, %for.cond2.preheader, %entry
+; 3 = MemoryPhi({entry,liveOnEntry},{for.cond2.preheader,1},{for.body5,2})
+  ret void
+
+for.body5:                                        ; preds = %for.body5, %for.body5.preheader
+; 4 = MemoryPhi({for.body5.preheader,1},{for.body5,2})
+  %indvars.iv22 = phi i64 [ 0, %for.body5.preheader ], [ %indvars.iv.next23, %for.body5 ]
+  %arrayidx7 = getelementptr inbounds i32, ptr %P, i64 %indvars.iv22
+; 2 = MemoryDef(4)
+  store i32 1, ptr %arrayidx7, align 4
+  %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1
+  %exitcond26.not = icmp eq i64 %indvars.iv.next23, %wide.trip.count25
+  br i1 %exitcond26.not, label %for.cond.cleanup4, label %for.body5
+}
+
+
+declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg)