diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
@@ -70,17 +70,19 @@
     : public AffineScalarReplacementBase<AffineScalarReplacement> {
   void runOnFunction() override;
 
-  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp);
-  void loadCSE(AffineReadOpInterface loadOp);
-
-  // A list of memref's that are potentially dead / could be eliminated.
-  SmallPtrSet<Value, 4> memrefsToErase;
-  // Load op's whose results were replaced by those forwarded from stores
-  // dominating stores or loads..
-  SmallVector<Operation *, 8> loadOpsToErase;
-
-  DominanceInfo *domInfo = nullptr;
-  PostDominanceInfo *postDomInfo = nullptr;
+  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp,
+                                   SmallVectorImpl<Operation *> &loadOpsToErase,
+                                   SmallPtrSetImpl<Value> &memrefsToErase,
+                                   DominanceInfo *domInfo,
+                                   PostDominanceInfo *postDominanceInfo);
+  void removeUnusedStore(AffineWriteOpInterface writeA,
+                         SmallVectorImpl<Operation *> &opsToErase,
+                         SmallPtrSetImpl<Value> &memrefsToErase,
+                         DominanceInfo *domInfo,
+                         PostDominanceInfo *postDominanceInfo);
+  void loadCSE(AffineReadOpInterface loadOp,
+               SmallVectorImpl<Operation *> &loadOpsToErase,
+               DominanceInfo *domInfo);
 };
 
 } // end anonymous namespace
 
@@ -92,32 +94,173 @@
   return std::make_unique<AffineScalarReplacement>();
 }
 
-// Check if the store may be reaching the load.
-static bool storeMayReachLoad(Operation *storeOp, Operation *loadOp,
-                              unsigned minSurroundingLoops) {
-  MemRefAccess srcAccess(storeOp);
-  MemRefAccess destAccess(loadOp);
-  FlatAffineConstraints dependenceConstraints;
-  unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
-  unsigned d;
-  // Dependences at loop depth <= minSurroundingLoops do NOT matter.
-  for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
-    DependenceResult result = checkMemrefAccessDependence(
-        srcAccess, destAccess, d, &dependenceConstraints,
-        /*dependenceComponents=*/nullptr);
-    if (hasDependence(result))
-      break;
-  }
-  if (d <= minSurroundingLoops)
-    return false;
+/// Returns true if no operation between `start` (non-inclusive) and `memOp`
+/// may have the memory effect `EffectType` on `memOp`.
+template <typename EffectType, typename T>
+bool hasNoInterveningEffect(Operation *start, T memOp) {
+
+  Value originalMemref = memOp.getMemRef();
+  bool isOriginalAllocation =
+      originalMemref.getDefiningOp<memref::AllocaOp>() ||
+      originalMemref.getDefiningOp<memref::AllocOp>();
+  bool legal = true;
+
+  // Check whether the effect on memOp can be caused by a given operation op.
+  std::function<void(Operation *)> check = [&](Operation *op) {
+    // If the effect has already been found, exit early.
+    if (!legal)
+      return;
+
+    if (auto memEffect = dyn_cast<MemoryEffectOpInterface>(op)) {
+      SmallVector<MemoryEffects::EffectInstance, 1> effects;
+      memEffect.getEffects(effects);
+
+      for (auto effect : effects) {
+        // If op causes EffectType on a potentially aliasing location for
+        // memOp, mark as illegal.
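+        // Aliasing is treated very conservatively: the effect is ignored
+        // only when both memOp's memref and the effect's value are results
+        // of (distinct) memref.alloc/memref.alloca ops; in every other case
+        // the two locations are assumed to potentially alias.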
+        if (isa<EffectType>(effect.getEffect())) {
+          if (isOriginalAllocation && effect.getValue() &&
+              (effect.getValue().getDefiningOp<memref::AllocaOp>() ||
+               effect.getValue().getDefiningOp<memref::AllocOp>()))
+            if (effect.getValue() != originalMemref)
+              continue;
+          legal = false;
+          return;
+        }
+      }
+    } else if (op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
+      // Recurse into the regions for this op and check whether the
+      // internal operations may have the effect.
+      for (auto &region : op->getRegions())
+        for (auto &block : region)
+          for (auto &op : block)
+            check(&op);
+    } else {
+      // Otherwise, conservatively assume a generic operation may have
+      // the effect.
+      legal = false;
+      return;
+    }
+  };
+
+  // Check all paths from ancestor op `parent` to the operation `to` for the
+  // effect. It is known that `to` must be contained within `parent`.
+  auto until = [&](Operation *parent, Operation *to) {
+    // TODO: check only the paths from `parent` to `to`.
+    // Currently we fall back and check the entire parent op.
+    assert(parent->isAncestor(to));
+    check(parent);
+  };
+
+  // Check all paths from operation `from` to operation `to` for the given
+  // memory effect.
+  std::function<void(Operation *, Operation *)> recur = [&](Operation *from,
+                                                            Operation *to) {
+    assert(from->getParentRegion()->isAncestor(to->getParentRegion()));
+
+    // If the operations are in different regions, recursively consider all
+    // paths from `from` to the parent of `to` and all paths from the parent
+    // of `to` to `to`.
+    if (from->getParentRegion() != to->getParentRegion()) {
+      recur(from, to->getParentOp());
+      until(to->getParentOp(), to);
+      return;
+    }
+
+    // Now, assuming that `from` and `to` exist in the same region, perform
+    // a CFG traversal to check all the relevant operations.
+
+    // Additional blocks to consider.
+    SmallVector<Block *, 2> todo;
+    {
+      // First consider the parent block of `from` and check all operations
+      // after `from`.
+      for (auto iter = ++from->getIterator(), end = from->getBlock()->end();
+           iter != end && &*iter != to; iter++) {
+        check(&*iter);
+      }
+
+      // If the block of `from` doesn't contain `to`, add the successors
+      // to the list of blocks to check.
+      if (to->getBlock() != from->getBlock())
+        for (auto succ : from->getBlock()->getSuccessors())
+          todo.push_back(succ);
+    }
+
+    SmallPtrSet<Block *, 2> done;
+    // Traverse the CFG until hitting `to`.
+    while (todo.size()) {
+      auto blk = todo.pop_back_val();
+      if (done.count(blk))
+        continue;
+      done.insert(blk);
+      for (auto &op : *blk) {
+        if (&op == to)
+          break;
+        check(&op);
+        if (&op == blk->getTerminator())
+          for (auto succ : blk->getSuccessors())
+            todo.push_back(succ);
+      }
+    }
+  };
+  recur(start, memOp.getOperation());
+  return legal;
+}
+
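+// For intuition about hasNoInterveningEffect: an operation that neither
+// implements MemoryEffectOpInterface nor has recursive side effects is
+// conservatively assumed to produce the queried effect. For example, in IR
+// like the following (illustrative, mirroring the external_no_forward_*
+// tests added below), the unregistered "memop" call blocks forwarding:
+//
+//   affine.store %cf, %A[%i] : memref<512xf32>
+//   "memop"(%A) : (memref<512xf32>) -> ()
+//   %v = affine.load %A[%i] : memref<512xf32>
+//
+// hasNoInterveningEffect<MemoryEffects::Write>(store, load) returns false.
+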
+// This attempts to remove stores which have no impact on the final result.
+// A store op writeA is eliminated if there exists a store op writeB such that
+// 1) writeA and writeB have mathematically equivalent affine access functions.
+// 2) writeB postdominates writeA.
+// 3) There is no potential read between writeA and writeB.
+void AffineScalarReplacement::removeUnusedStore(
+    AffineWriteOpInterface writeA, SmallVectorImpl<Operation *> &opsToErase,
+    SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo *domInfo,
+    PostDominanceInfo *postDominanceInfo) {
+
+  for (auto *user : writeA.getMemRef().getUsers()) {
+    // Only consider writing operations.
+    auto writeB = dyn_cast<AffineWriteOpInterface>(user);
+    if (!writeB)
+      continue;
 
-  return true;
+    // The operations must be distinct.
+    if (writeB == writeA)
+      continue;
+
+    // Both operations must lie in the same region.
+    if (writeB->getParentRegion() != writeA->getParentRegion())
+      continue;
+
+    // Both operations must write to the same memory.
+    MemRefAccess srcAccess(writeB);
+    MemRefAccess destAccess(writeA);
+
+    if (srcAccess != destAccess)
+      continue;
+
+    // writeB must postdominate writeA.
+    if (!postDominanceInfo->postDominates(writeB, writeA))
+      continue;
+
+    // There cannot be an operation which reads from memory between
+    // the two writes.
+    if (!hasNoInterveningEffect<MemoryEffects::Read>(writeA, writeB))
+      continue;
+
+    opsToErase.push_back(writeA);
+    break;
+  }
 }
 
 // This is a straightforward implementation not optimized for speed. Optimize
 // if needed.
-LogicalResult
-AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
+LogicalResult AffineScalarReplacement::forwardStoreToLoad(
+    AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase,
+    SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo *domInfo,
+    PostDominanceInfo *postDominanceInfo) {
   // First pass over the use list to get the minimum number of surrounding
   // loops common between the load op and the store op, with min taken across
   // all store ops.
@@ -140,10 +283,9 @@
   // forwarding candidates). Each forwarding candidate will be checked for a
   // post-dominance on these. 'fwdingCandidates' are a subset of depSrcStores.
   SmallVector<Operation *, 8> depSrcStores;
 
   for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
-      continue;
+    MemRefAccess srcAccess(storeOp);
+    MemRefAccess destAccess(loadOp);
 
     // Stores that *may* be reaching the load.
     depSrcStores.push_back(storeOp);
@@ -156,15 +298,15 @@
     //     store %A[%M]
     //     load %A[%N]
     // Use the AffineValueMap difference based memref access equality checking.
-    MemRefAccess srcAccess(storeOp);
-    MemRefAccess destAccess(loadOp);
     if (srcAccess != destAccess)
      continue;
-
     // 2. The store has to dominate the load op to be candidate.
     if (!domInfo->dominates(storeOp, loadOp))
      continue;
 
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp))
+      continue;
+
     // We now have a candidate for forwarding.
     fwdingCandidates.push_back(storeOp);
   }
@@ -177,13 +319,10 @@
   // traversals. Consider this for the future if needed.
   Operation *lastWriteStoreOp = nullptr;
   for (auto *storeOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores, [&](Operation *depStore) {
-          return postDomInfo->postDominates(storeOp, depStore);
-        })) {
-      lastWriteStoreOp = storeOp;
-      break;
-    }
+    // Candidates dominate the load, share its access function, and have no
+    // intervening write before it, so at most one candidate can exist.
+    assert(!lastWriteStoreOp);
+    lastWriteStoreOp = storeOp;
   }
+
   if (!lastWriteStoreOp)
     return failure();
 
@@ -199,6 +338,7 @@
   memrefsToErase.insert(loadOp.getMemRef());
   // Record this to erase later.
   loadOpsToErase.push_back(loadOp);
+
   return success();
 }
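With the change above, store-to-load forwarding requires that the store dominate the load, that the two affine access functions be identical, and that no operation on any path in between may write to the memref; operations that only read do not block it. A minimal sketch of the intended behavior (illustrative IR with %v, %A, %B standing for suitable values, not one of the tests in this patch):

    affine.store %v, %A[%i] : memref<16xf32>
    %r = affine.load %B[%i] : memref<16xf32>   // a read does not block forwarding
    %x = affine.load %A[%i] : memref<16xf32>   // uses of %x are replaced with %v
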
@@ -207,109 +347,95 @@
 // loadA will be replaced with loadB if:
 // 1) loadA and loadB have mathematically equivalent affine access functions.
 // 2) loadB dominates loadA.
-// 3) loadB postdominates all the store op's that have a dependence into loadA.
-void AffineScalarReplacement::loadCSE(AffineReadOpInterface loadOp) {
-  // The list of load op candidates for forwarding that satisfy conditions
-  // (1) and (2) above - they will be filtered later when checking (3).
-  SmallVector<Operation *, 8> fwdingCandidates;
-  SmallVector<Operation *, 8> storeOps;
-  unsigned minSurroundingLoops = getNestingDepth(loadOp);
-  MemRefAccess memRefAccess(loadOp);
-  // First pass over the use list to get 1) the minimum number of surrounding
-  // loops common between the load op and an load op candidate, with min taken
-  // across all load op candidates; 2) load op candidates; 3) store ops.
-  // We take min across all load op candidates instead of all load ops to make
-  // sure later dependence check is performed at loop depths that do matter.
-  for (auto *user : loadOp.getMemRef().getUsers()) {
-    if (auto storeOp = dyn_cast<AffineWriteOpInterface>(user)) {
-      storeOps.push_back(storeOp);
-    } else if (auto aLoadOp = dyn_cast<AffineReadOpInterface>(user)) {
-      MemRefAccess otherMemRefAccess(aLoadOp);
-      // No need to consider Load ops that have been replaced in previous store
-      // to load forwarding or loadCSE. If loadA or storeA can be forwarded to
-      // loadB, then loadA or storeA can be forwarded to loadC iff loadB can be
-      // forwarded to loadC.
-      // If loadB is visited before loadC and replace with loadA, we do not put
-      // loadB in candidates list, only loadA. If loadC is visited before loadB,
-      // loadC may be replaced with loadB, which will be replaced with loadA
-      // later.
-      if (aLoadOp != loadOp && !llvm::is_contained(loadOpsToErase, aLoadOp) &&
-          memRefAccess == otherMemRefAccess &&
-          domInfo->dominates(aLoadOp, loadOp)) {
-        fwdingCandidates.push_back(aLoadOp);
-        unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *aLoadOp);
-        minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
-      }
+// 3) There is no write between loadA and loadB.
+void AffineScalarReplacement::loadCSE(
+    AffineReadOpInterface loadA, SmallVectorImpl<Operation *> &loadOpsToErase,
+    DominanceInfo *domInfo) {
+  SmallVector<AffineReadOpInterface, 4> LoadOptions;
+  for (auto *user : loadA.getMemRef().getUsers()) {
+    auto loadB = dyn_cast<AffineReadOpInterface>(user);
+    if (!loadB || loadB == loadA)
+      continue;
+
+    MemRefAccess srcAccess(loadB);
+    MemRefAccess destAccess(loadA);
+
+    if (srcAccess != destAccess) {
+      continue;
     }
-  }
 
-  // No forwarding candidate.
-  if (fwdingCandidates.empty())
-    return;
+    // 2. loadB must dominate loadA to be a candidate.
+    if (!domInfo->dominates(loadB, loadA))
+      continue;
 
-  // Store ops that have a dependence into the load.
-  SmallVector<Operation *, 8> depSrcStores;
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(loadB.getOperation(),
+                                                      loadA))
+      continue;
 
-  for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
+    // Check if the two values have the same type. This is needed for affine
+    // vector loads.
+    if (loadB.getValue().getType() != loadA.getValue().getType())
       continue;
 
-    // Stores that *may* be reaching the load.
-    depSrcStores.push_back(storeOp);
+    LoadOptions.push_back(loadB);
   }
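+  // For example, given straight-line IR such as
+  //   %0 = affine.load %m[%i] : memref<10xf32>
+  //   %1 = affine.load %m[%i] : memref<10xf32>
+  // %0 is collected as a candidate for loadA == %1: it dominates %1, the
+  // accesses match, and no operation in between may write to %m.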
-  // 3. Of all the load op's that meet the above criteria, return the first load
-  // found that postdominates all 'depSrcStores' and has the same shape as the
-  // load to be replaced (if one exists). The shape check is needed for affine
-  // vector loads.
-  Operation *firstLoadOp = nullptr;
-  Value oldVal = loadOp.getValue();
-  for (auto *loadOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores,
-                     [&](Operation *depStore) {
-                       return postDomInfo->postDominates(loadOp, depStore);
-                     }) &&
-        cast<AffineReadOpInterface>(loadOp).getValue().getType() ==
-            oldVal.getType()) {
-      firstLoadOp = loadOp;
+  // Of the legal load candidates, use the one that dominates all others
+  // to minimize the subsequent need for loadCSE.
+  Value loadB = nullptr;
+  for (auto option : LoadOptions) {
+    if (llvm::all_of(LoadOptions, [&](AffineReadOpInterface depStore) {
+          return depStore == option ||
+                 domInfo->dominates(option.getOperation(),
+                                    depStore.getOperation());
+        })) {
+      loadB = option.getValue();
       break;
     }
   }
 
-  if (!firstLoadOp)
-    return;
-
-  // Perform the actual load to load forwarding.
-  Value loadVal = cast<AffineReadOpInterface>(firstLoadOp).getValue();
-  loadOp.getValue().replaceAllUsesWith(loadVal);
-  // Record this to erase later.
-  loadOpsToErase.push_back(loadOp);
+  if (loadB) {
+    loadA.getValue().replaceAllUsesWith(loadB);
+    // Record this to erase later.
+    loadOpsToErase.push_back(loadA);
+  }
 }
 
 void AffineScalarReplacement::runOnFunction() {
-  // Only supports single block functions at the moment.
   FuncOp f = getFunction();
-  if (!llvm::hasSingleElement(f)) {
-    markAllAnalysesPreserved();
-    return;
-  }
 
-  domInfo = &getAnalysis<DominanceInfo>();
-  postDomInfo = &getAnalysis<PostDominanceInfo>();
+  // Load op's whose results were replaced by those forwarded from stores.
+  SmallVector<Operation *, 8> opsToErase;
+
+  // A list of memref's that are potentially dead / could be eliminated.
+  SmallPtrSet<Value, 4> memrefsToErase;
 
-  loadOpsToErase.clear();
-  memrefsToErase.clear();
+  auto domInfo = &getAnalysis<DominanceInfo>();
+  auto postDominanceInfo = &getAnalysis<PostDominanceInfo>();
 
-  // Walk all load's and perform store to load forwarding and loadCSE.
+  // Walk all load's and perform store to load forwarding, falling back to
+  // loadCSE when forwarding is not possible.
   f.walk([&](AffineReadOpInterface loadOp) {
-    // Do store to load forwarding first, if no success, try loadCSE.
-    if (failed(forwardStoreToLoad(loadOp)))
-      loadCSE(loadOp);
+    if (failed(forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo,
+                                  postDominanceInfo))) {
+      loadCSE(loadOp, opsToErase, domInfo);
+    }
+  });
+
+  // Erase all load op's whose results were replaced with forwarded values.
+  for (auto *op : opsToErase)
+    op->erase();
+  opsToErase.clear();
+
+  // Walk all store's and eliminate those made redundant by a later store.
+  f.walk([&](AffineWriteOpInterface storeOp) {
+    removeUnusedStore(storeOp, opsToErase, memrefsToErase, domInfo,
+                      postDominanceInfo);
   });
 
-  // Erase all load op's whose results were replaced with store or load fwd'ed
-  // ones.
-  for (auto *loadOp : loadOpsToErase)
-    loadOp->erase();
+  // Erase all store op's which are unnecessary.
+  for (auto *op : opsToErase)
+    op->erase();
+  opsToErase.clear();
 
   // Check if the store fwd'ed memrefs are now left with only stores and can
   // thus be completely deleted. Note: the canonicalize pass should be able
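The scalrep.mlir updates below exercise the new behavior: an operation with unknown memory effects (the unregistered "memop" call) conservatively blocks forwarding, a memref.cast result is treated as possibly aliasing its source, and a store that is overwritten by a postdominating store with no interleaving read is removed. For the last case, roughly:

    affine.for %i = 0 to 16 {
      affine.store %cf1, %out[32*%i] : memref<512xf32>   // removed as redundant
      affine.store %cf2, %out[32*%i] : memref<512xf32>
    }

This is only a sketch of the expected rewrite; the authoritative expectations are the FileCheck lines in the tests.
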
diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir
--- a/mlir/test/Dialect/Affine/scalrep.mlir
+++ b/mlir/test/Dialect/Affine/scalrep.mlir
@@ -235,17 +235,17 @@
   // Due to this load, the memref isn't optimized away.
   %v3 = affine.load %m[%c1] : memref<10xf32>
   return %v3 : f32
-// CHECK:       %{{.*}} = memref.alloc() : memref<10xf32>
-// CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
-// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
-// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
-// CHECK-NEXT:      %{{.*}} = addf %{{.*}}, %{{.*}} : f32
-// CHECK-NEXT:      %{{.*}} = affine.apply [[$MAP4]](%{{.*}})
-// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
-// CHECK-NEXT:    }
-// CHECK-NEXT:  }
-// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
-// CHECK-NEXT:  return %{{.*}} : f32
+// TODO:       %{{.*}} = memref.alloc() : memref<10xf32>
+// TODO-NEXT:  affine.for %{{.*}} = 0 to 10 {
+// TODO-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// TODO-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
+// TODO-NEXT:      %{{.*}} = addf %{{.*}}, %{{.*}} : f32
+// TODO-NEXT:      %{{.*}} = affine.apply [[$MAP4]](%{{.*}})
+// TODO-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// TODO-NEXT:    }
+// TODO-NEXT:  }
+// TODO-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
+// TODO-NEXT:  return %{{.*}} : f32
 }
 
 // CHECK-LABEL: func @should_not_fwd
@@ -515,18 +515,139 @@
   return
 }
 
-// CHECK-LABEL: func @vector_load_affine_apply_store_load
+// TODO-LABEL: func @vector_load_affine_apply_store_load
 func @vector_load_affine_apply_store_load(%in : memref<512xf32>, %out : memref<512xf32>) {
   %cf1 = constant 1: index
   affine.for %i = 0 to 15 {
-    // CHECK: affine.vector_load
+    // TODO: affine.vector_load
     %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
     %idx = affine.apply affine_map<(d0) -> (d0 + 1)> (%i)
     affine.vector_store %ld0, %in[32*%idx] : memref<512xf32>, vector<32xf32>
-    // CHECK-NOT: affine.vector_load
+    // TODO-NOT: affine.vector_load
     %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
     %add = addf %ld0, %ld1 : vector<32xf32>
     affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
   }
   return
 }
+
+// CHECK-LABEL: func @external_no_forward_load
+// CHECK: affine.load
+// CHECK: affine.store
+// CHECK: affine.load
+// CHECK: affine.store
+
+func @external_no_forward_load(%in : memref<512xf32>, %out : memref<512xf32>) {
+  affine.for %i = 0 to 16 {
+    %ld0 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld0, %out[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: func @external_no_forward_store
+// CHECK: affine.store
+// CHECK: affine.load
+// CHECK: affine.store
+
+func @external_no_forward_store(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
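+// The memref.cast result %m2 below is not produced by an allocation, so the
+// store through it is conservatively assumed to alias %in and prevents
+// forwarding of the first store to the load.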
+// CHECK-LABEL: func @external_no_forward_cst
+// CHECK: affine.store
+// CHECK-NEXT: affine.store
+// CHECK-NEXT: affine.load
+// CHECK-NEXT: affine.store
+
+func @external_no_forward_cst(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  %cf2 = constant 2.0 : f32
+  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    affine.store %cf2, %m2[32*%i] : memref<?xf32>
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+// The second store may write the element read by the load (2 * %i0 and
+// %i0 + 1 overlap when %i0 = 1) on paths between the first store and the
+// load, so the first store cannot be forwarded to the load.
+func @overlap_no_fwd(%N : index) -> f32 {
+  %cf7 = constant 7.0 : f32
+  %cf9 = constant 9.0 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %m = memref.alloc() : memref<10xf32>
+  affine.for %i0 = 0 to 5 {
+    affine.store %cf7, %m[2 * %i0] : memref<10xf32>
+    affine.for %i1 = 0 to %N {
+      %v0 = affine.load %m[2 * %i0] : memref<10xf32>
+      %v1 = addf %v0, %v0 : f32
+      affine.store %cf9, %m[%i0 + 1] : memref<10xf32>
+    }
+  }
+  // Due to this load, the memref isn't optimized away.
+  %v3 = affine.load %m[%c1] : memref<10xf32>
+  return %v3 : f32
+
+// CHECK-LABEL: func @overlap_no_fwd
+// CHECK:       affine.for %{{.*}} = 0 to 5 {
+// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
+// CHECK-NEXT:      %{{.*}} = affine.load
+// CHECK-NEXT:      %{{.*}} = addf %{{.*}}, %{{.*}} : f32
+// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:  return %{{.*}} : f32
+}
+
+
+// CHECK-LABEL: func @redundant_store_elim
+// CHECK: affine.for
+// CHECK-NEXT: affine.store
+// CHECK-NEXT: }
+
+func @redundant_store_elim(%out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  %cf2 = constant 2.0 : f32
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %out[32*%i] : memref<512xf32>
+    affine.store %cf2, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+
+// CHECK-LABEL: func @redundant_store_elim_fail
+// CHECK: affine.for
+// CHECK-NEXT: affine.store
+// CHECK-NEXT: "test.use"
+// CHECK-NEXT: affine.store
+// CHECK-NEXT: }
+
+func @redundant_store_elim_fail(%out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  %cf2 = constant 2.0 : f32
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %out[32*%i] : memref<512xf32>
+    "test.use"(%out) : (memref<512xf32>) -> ()
+    affine.store %cf2, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
\ No newline at end of file