diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
@@ -33,27 +33,17 @@
 namespace {
 // The store to load forwarding and load CSE rely on three conditions:
 //
-// 1) store/load and load need to have mathematically equivalent affine access
-// functions (checked after full composition of load/store operands); this
-// implies that they access the same single memref element for all iterations of
-// the common surrounding loop,
+// 1) store/load providing a replacement value and load being replaced need to
+// have mathematically equivalent affine access functions (checked after full
+// composition of load/store operands); this implies that they access the same
+// single memref element for all iterations of the common surrounding loop,
 //
 // 2) the store/load op should dominate the load op,
 //
-// 3) among all op's that satisfy both (1) and (2), for store to load
-// forwarding, the one that does not dominate any store op that has a
-// dependence into the load, is provably the last writer to the particular
-// memref location being loaded at the load op, and its store value can be
-// forwarded to the load; for load CSE, any op that does not dominate any store
-// op that have a dependence into the load can be forwarded and the first one
-// found is chosen. Note that the only dependences that are to be considered are
-// those that are satisfied at the block* of the innermost common surrounding
-// loop of the <store/load, load> being considered.
-//
-// (* A dependence being satisfied at a block: a dependence that is satisfied by
-// virtue of the destination operation appearing textually / lexically after
-// the source operation within the body of a 'affine.for' operation; thus, a
-// dependence is always either satisfied by a loop or by a block).
+// 3) no operation that may write to memory read by the load being replaced can
+// occur after executing the instruction (load or store) providing the
+// replacement value and before the load being replaced (thus potentially
+// allowing overwriting the memory read by the load).
 //
 // The above conditions are simple to check, sufficient, and powerful for most
 // cases in practice - they are sufficient, but not necessary --- since they
@@ -70,16 +60,14 @@
     : public AffineScalarReplacementBase<AffineScalarReplacement> {
   void runOnFunction() override;
 
-  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp);
-  void loadCSE(AffineReadOpInterface loadOp);
-
-  // A list of memref's that are potentially dead / could be eliminated.
-  SmallPtrSet<Value, 4> memrefsToErase;
-  // Load ops whose results were replaced by those forwarded from stores
-  // dominating stores or loads..
-  SmallVector<Operation *, 8> loadOpsToErase;
+  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp,
+                                   SmallVectorImpl<Operation *> &loadOpsToErase,
+                                   SmallPtrSetImpl<Value> &memrefsToErase,
+                                   DominanceInfo &domInfo);
 
-  DominanceInfo *domInfo = nullptr;
+  void loadCSE(AffineReadOpInterface loadOp,
+               SmallVectorImpl<Operation *> &loadOpsToErase,
+               DominanceInfo &domInfo);
 };
 
 } // end anonymous namespace
@@ -91,61 +79,204 @@
   return std::make_unique<AffineScalarReplacement>();
 }
 
-// Check if the store may be reaching the load.
-static bool storeMayReachLoad(Operation *storeOp, Operation *loadOp,
-                              unsigned minSurroundingLoops) {
-  MemRefAccess srcAccess(storeOp);
-  MemRefAccess destAccess(loadOp);
-  FlatAffineConstraints dependenceConstraints;
-  unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
-  unsigned d;
-  // Dependences at loop depth <= minSurroundingLoops do NOT matter.
-  for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
-    DependenceResult result = checkMemrefAccessDependence(
-        srcAccess, destAccess, d, &dependenceConstraints,
-        /*dependenceComponents=*/nullptr);
-    if (hasDependence(result))
-      break;
-  }
-  if (d <= minSurroundingLoops)
-    return false;
+/// Ensure that all operations that could be executed after `start`
+/// (noninclusive) and prior to `memOp` (e.g. on a control flow/op path
+/// between the operations) do not have the potential memory effect
+/// `EffectType` on `memOp`. `memOp`  is an operation that reads or writes to
+/// a memref. For example, if `EffectType` is MemoryEffects::Write, this method
+/// will check if there is no write to the memory between `start` and `memOp`
+/// that would change the read within `memOp`.
+template <typename EffectType, typename T>
+bool hasNoInterveningEffect(Operation *start, T memOp) {
+
+  Value memref = memOp.getMemRef();
+  bool isOriginalAllocation = memref.getDefiningOp<memref::AllocaOp>() ||
+                              memref.getDefiningOp<memref::AllocOp>();
+
+  // A boolean representing whether an intervening operation could have impacted
+  // memOp.
+  bool hasSideEffect = false;
+
+  // Check whether the effect on memOp can be caused by a given operation op.
+  std::function<void(Operation *)> checkOperation = [&](Operation *op) {
+    // If the effect has alreay been found, early exit,
+    if (hasSideEffect)
+      return;
+
+    if (auto memEffect = dyn_cast<MemoryEffectOpInterface>(op)) {
+      SmallVector<MemoryEffects::EffectInstance, 1> effects;
+      memEffect.getEffects(effects);
+
+      bool opMayHaveEffect = false;
+      for (auto effect : effects) {
+        // If op causes EffectType on a potentially aliasing location for
+        // memOp, mark as having the effect.
+        if (isa<EffectType>(effect.getEffect())) {
+          if (isOriginalAllocation && effect.getValue() &&
+              (effect.getValue().getDefiningOp<memref::AllocaOp>() ||
+               effect.getValue().getDefiningOp<memref::AllocOp>())) {
+            if (effect.getValue() != memref)
+              continue;
+          }
+          opMayHaveEffect = true;
+          break;
+        }
+      }
 
-  return true;
+      if (!opMayHaveEffect)
+        return;
+
+      // If the side effect comes from an affine read or write, try to
+      // prove the side effecting `op` cannot reach `memOp`.
+      if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
+        MemRefAccess srcAccess(op);
+        MemRefAccess destAccess(memOp);
+        // Dependence analysis is only correct if both ops operate on the same
+        // memref.
+        if (srcAccess.memref == destAccess.memref) {
+          FlatAffineConstraints dependenceConstraints;
+
+          // Number of loops containing the start op and the ending operation.
+          unsigned minSurroundingLoops =
+              getNumCommonSurroundingLoops(*start, *memOp);
+
+          // Number of loops containing the operation `op` which has the
+          // potential memory side effect and can occur on a path between
+          // `start` and `memOp`.
+          unsigned nsLoops = getNumCommonSurroundingLoops(*op, *memOp);
+
+          // For ease, let's consider the case that `op` is a store and we're
+          // looking for other potential stores (e.g `op`) that overwrite memory
+          // after `start`, and before being read in `memOp`. In this case, we
+          // only need to consider other potential stores with depth >
+          // minSurrounding loops since `start` would overwrite any store with a
+          // smaller number of surrounding loops before.
+          unsigned d;
+          for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
+            DependenceResult result = checkMemrefAccessDependence(
+                srcAccess, destAccess, d, &dependenceConstraints,
+                /*dependenceComponents=*/nullptr);
+            if (hasDependence(result)) {
+              hasSideEffect = true;
+              return;
+            }
+          }
+
+          // No side effect was seen, simply return.
+          return;
+        }
+      }
+      hasSideEffect = true;
+      return;
+    }
+
+    if (op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
+      // Recurse into the regions for this op and check whether the internal
+      // operations may have the side effect `EffectType` on memOp.
+      for (Region &region : op->getRegions())
+        for (Block &block : region)
+          for (Operation &op : block)
+            checkOperation(&op);
+      return;
+    }
+
+    // Otherwise, conservatively assume generic operations have the effect
+    // on the operation
+    hasSideEffect = true;
+    return;
+  };
+
+  // Check all paths from ancestor op `parent` to the operation `to` for the
+  // effect. It is known that `to` must be contained within `parent`.
+  auto until = [&](Operation *parent, Operation *to) {
+    // TODO check only the paths from `parent` to `to`.
+    // Currently we fallback and check the entire parent op, rather than
+    // just the paths from the parent path, stopping after reaching `to`.
+    // This is conservatively correct, but could be made more aggressive.
+    assert(parent->isAncestor(to));
+    checkOperation(parent);
+  };
+
+  // Check for all paths from operation `from` to operation `untilOp` for the
+  // given memory effect.
+  std::function<void(Operation *, Operation *)> recur =
+      [&](Operation *from, Operation *untilOp) {
+        assert(
+            from->getParentRegion()->isAncestor(untilOp->getParentRegion()) &&
+            "Checking for side effect between two operations without a common "
+            "ancestor");
+
+        // If the operations are in different regions, recursively consider all
+        // path from `from` to the parent of `to` and all paths from the parent
+        // of `to` to `to`.
+        if (from->getParentRegion() != untilOp->getParentRegion()) {
+          recur(from, untilOp->getParentOp());
+          until(untilOp->getParentOp(), untilOp);
+          return;
+        }
+
+        // Now, assuming that `from` and `to` exist in the same region, perform
+        // a CFG traversal to check all the relevant operations.
+
+        // Additional blocks to consider.
+        SmallVector<Block *, 2> todoBlocks;
+        {
+          // First consider the parent block of `from` an check all operations
+          // after `from`.
+          for (auto iter = ++from->getIterator(), end = from->getBlock()->end();
+               iter != end && &*iter != untilOp; ++iter) {
+            checkOperation(&*iter);
+          }
+
+          // If the parent of `from` doesn't contain `to`, add the successors
+          // to the list of blocks to check.
+          if (untilOp->getBlock() != from->getBlock())
+            for (Block *succ : from->getBlock()->getSuccessors())
+              todoBlocks.push_back(succ);
+        }
+
+        SmallPtrSet<Block *, 4> done;
+        // Traverse the CFG until hitting `to`.
+        while (todoBlocks.size()) {
+          Block *blk = todoBlocks.pop_back_val();
+          if (done.count(blk))
+            continue;
+          done.insert(blk);
+          for (auto &op : *blk) {
+            if (&op == untilOp)
+              break;
+            checkOperation(&op);
+            if (&op == blk->getTerminator())
+              for (Block *succ : blk->getSuccessors())
+                todoBlocks.push_back(succ);
+          }
+        }
+      };
+  recur(start, memOp);
+  return !hasSideEffect;
 }
 
-// This is a straightforward implementation not optimized for speed. Optimize
-// if needed.
-LogicalResult
-AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
-  // First pass over the use list to get the minimum number of surrounding
-  // loops common between the load op and the store op, with min taken across
-  // all store ops.
-  SmallVector<Operation *, 8> storeOps;
-  unsigned minSurroundingLoops = getNestingDepth(loadOp);
+/// Attempt to eliminate loadOp by replacing it with a value stored into memory
+/// which the load is guaranteed to retrieve. This check involves three
+/// components: 1) The store and load must be on the same location 2) The store
+/// must dominate (and therefore must always occur prior to) the load 3) No
+/// other operations will overwrite the memory loaded between the given load
+/// and store.  If such a value exists, the replaced `loadOp` will be added to
+/// `loadOpsToErase` and its memref will be added to `memrefsToErase`.
+LogicalResult AffineScalarReplacement::forwardStoreToLoad(
+    AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase,
+    SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo &domInfo) {
+
+  // The store op candidate for forwarding that satisfies all conditions
+  // to replace the load, if any.
+  Operation *lastWriteStoreOp = nullptr;
+
   for (auto *user : loadOp.getMemRef().getUsers()) {
     auto storeOp = dyn_cast<AffineWriteOpInterface>(user);
     if (!storeOp)
       continue;
-    unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
-    minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
-    storeOps.push_back(storeOp);
-  }
-
-  // The list of store op candidates for forwarding that satisfy conditions
-  // (1) and (2) above - they will be filtered later when checking (3).
-  SmallVector<Operation *, 8> fwdingCandidates;
-
-  // Store ops that have a dependence into the load (even if they aren't
-  // forwarding candidates). Each forwarding candidate will be checked for a
-  // dominance on these. 'fwdingCandidates' are a subset of depSrcStores.
-  SmallVector<Operation *, 8> depSrcStores;
-
-  for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
-      continue;
-
-    // Stores that *may* be reaching the load.
-    depSrcStores.push_back(storeOp);
+    MemRefAccess srcAccess(storeOp);
+    MemRefAccess destAccess(loadOp);
 
     // 1. Check if the store and the load have mathematically equivalent
     // affine access functions; this implies that they statically refer to the
@@ -155,41 +286,24 @@
     //     store %A[%M]
     //     load %A[%N]
     // Use the AffineValueMap difference based memref access equality checking.
-    MemRefAccess srcAccess(storeOp);
-    MemRefAccess destAccess(loadOp);
     if (srcAccess != destAccess)
       continue;
 
     // 2. The store has to dominate the load op to be candidate.
-    if (!domInfo->dominates(storeOp, loadOp))
+    if (!domInfo.dominates(storeOp, loadOp))
+      continue;
+
+    // 3. Ensure there is no intermediate operation which could replace the
+    // value in memory.
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp))
       continue;
 
     // We now have a candidate for forwarding.
-    fwdingCandidates.push_back(storeOp);
+    assert(lastWriteStoreOp == nullptr &&
+           "multiple simulataneous replacement stores");
+    lastWriteStoreOp = storeOp;
   }
 
-  // 3. Of all the store ops that meet the above criteria, the store op
-  // that does not dominate any of the ops in 'depSrcStores' (if such exists)
-  // will not have any of those latter ops on its paths to `loadOp`. It would
-  // thus be the unique store providing the value to the load. This condition is
-  // however conservative for eg:
-  //
-  // for ... {
-  //    store
-  //    load
-  //    store
-  //    load
-  // }
-  //
-  Operation *lastWriteStoreOp = nullptr;
-  for (auto *storeOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores, [&](Operation *depStore) {
-          return !domInfo->properlyDominates(storeOp, depStore);
-        })) {
-      lastWriteStoreOp = storeOp;
-      break;
-    }
-  }
   if (!lastWriteStoreOp)
     return failure();
 
@@ -213,109 +327,85 @@
 // loadA will be be replaced with loadB if:
 // 1) loadA and loadB have mathematically equivalent affine access functions.
 // 2) loadB dominates loadA.
-// 3) loadB does not dominate any of the store ops that have a dependence into
-//    loadA.
-void AffineScalarReplacement::loadCSE(AffineReadOpInterface loadOp) {
-  // The list of load op candidates for forwarding that satisfy conditions
-  // (1) and (2) above - they will be filtered later when checking (3).
-  SmallVector<Operation *, 8> fwdingCandidates;
-  SmallVector<Operation *, 8> storeOps;
-  unsigned minSurroundingLoops = getNestingDepth(loadOp);
-  MemRefAccess memRefAccess(loadOp);
-  // First pass over the use list to get 1) the minimum number of surrounding
-  // loops common between the load op and an load op candidate, with min taken
-  // across all load op candidates; 2) load op candidates; 3) store ops.
-  // We take min across all load op candidates instead of all load ops to make
-  // sure later dependence check is performed at loop depths that do matter.
-  for (auto *user : loadOp.getMemRef().getUsers()) {
-    if (auto storeOp = dyn_cast<AffineWriteOpInterface>(user)) {
-      storeOps.push_back(storeOp);
-    } else if (auto aLoadOp = dyn_cast<AffineReadOpInterface>(user)) {
-      MemRefAccess otherMemRefAccess(aLoadOp);
-      // No need to consider Load ops that have been replaced in previous store
-      // to load forwarding or loadCSE. If loadA or storeA can be forwarded to
-      // loadB, then loadA or storeA can be forwarded to loadC iff loadB can be
-      // forwarded to loadC.
-      // If loadB is visited before loadC and replace with loadA, we do not put
-      // loadB in candidates list, only loadA. If loadC is visited before loadB,
-      // loadC may be replaced with loadB, which will be replaced with loadA
-      // later.
-      if (aLoadOp != loadOp && !llvm::is_contained(loadOpsToErase, aLoadOp) &&
-          memRefAccess == otherMemRefAccess &&
-          domInfo->dominates(aLoadOp, loadOp)) {
-        fwdingCandidates.push_back(aLoadOp);
-        unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *aLoadOp);
-        minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
-      }
+// 3) There is no write between loadA and loadB.
+void AffineScalarReplacement::loadCSE(
+    AffineReadOpInterface loadA, SmallVectorImpl<Operation *> &loadOpsToErase,
+    DominanceInfo &domInfo) {
+  SmallVector<AffineReadOpInterface, 4> loadCandidates;
+  for (auto *user : loadA.getMemRef().getUsers()) {
+    auto loadB = dyn_cast<AffineReadOpInterface>(user);
+    if (!loadB || loadB == loadA)
+      continue;
+
+    MemRefAccess srcAccess(loadB);
+    MemRefAccess destAccess(loadA);
+
+    // 1. The accesses have to be to the same location.
+    if (srcAccess != destAccess) {
+      continue;
     }
-  }
 
-  // No forwarding candidate.
-  if (fwdingCandidates.empty())
-    return;
+    // 2. The store has to dominate the load op to be candidate.
+    if (!domInfo.dominates(loadB, loadA))
+      continue;
 
-  // Store ops that have a dependence into the load.
-  SmallVector<Operation *, 8> depSrcStores;
+    // 3. There is no write between loadA and loadB.
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(loadB.getOperation(),
+                                                      loadA))
+      continue;
 
-  for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
+    // Check if two values have the same shape. This is needed for affine vector
+    // loads.
+    if (loadB.getValue().getType() != loadA.getValue().getType())
       continue;
 
-    // Stores that *may* be reaching the load.
-    depSrcStores.push_back(storeOp);
+    loadCandidates.push_back(loadB);
   }
 
-  // 3. Of all the load op's that meet the above criteria, return the first load
-  // found that does not dominate any op in 'depSrcStores' and has the same
-  // shape as the load to be replaced (if one exists). The shape check is needed
-  // for affine vector loads.
-  Operation *firstLoadOp = nullptr;
-  Value oldVal = loadOp.getValue();
-  for (auto *loadOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores,
-                     [&](Operation *depStore) {
-                       return !domInfo->properlyDominates(loadOp, depStore);
-                     }) &&
-        cast<AffineReadOpInterface>(loadOp).getValue().getType() ==
-            oldVal.getType()) {
-      firstLoadOp = loadOp;
+  // Of the legal load candidates, use the one that dominates all others
+  // to minimize the subsequent need to loadCSE
+  Value loadB;
+  for (AffineReadOpInterface option : loadCandidates) {
+    if (llvm::all_of(loadCandidates, [&](AffineReadOpInterface depStore) {
+          return depStore == option ||
+                 domInfo.dominates(option.getOperation(),
+                                   depStore.getOperation());
+        })) {
+      loadB = option.getValue();
       break;
     }
   }
-  if (!firstLoadOp)
-    return;
 
-  // Perform the actual load to load forwarding.
-  Value loadVal = cast<AffineReadOpInterface>(firstLoadOp).getValue();
-  loadOp.getValue().replaceAllUsesWith(loadVal);
-  // Record this to erase later.
-  loadOpsToErase.push_back(loadOp);
+  if (loadB) {
+    loadA.getValue().replaceAllUsesWith(loadB);
+    // Record this to erase later.
+    loadOpsToErase.push_back(loadA);
+  }
 }
 
 void AffineScalarReplacement::runOnFunction() {
   // Only supports single block functions at the moment.
   FuncOp f = getFunction();
-  if (!llvm::hasSingleElement(f)) {
-    markAllAnalysesPreserved();
-    return;
-  }
 
-  domInfo = &getAnalysis<DominanceInfo>();
+  // Load op's whose results were replaced by those forwarded from stores.
+  SmallVector<Operation *, 8> opsToErase;
+
+  // A list of memref's that are potentially dead / could be eliminated.
+  SmallPtrSet<Value, 4> memrefsToErase;
 
-  loadOpsToErase.clear();
-  memrefsToErase.clear();
+  auto &domInfo = getAnalysis<DominanceInfo>();
 
-  // Walk all load's and perform store to load forwarding and loadCSE.
+  // Walk all load's and perform store to load forwarding.
   f.walk([&](AffineReadOpInterface loadOp) {
-    // Do store to load forwarding first, if no success, try loadCSE.
-    if (failed(forwardStoreToLoad(loadOp)))
-      loadCSE(loadOp);
+    if (failed(
+            forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo))) {
+      loadCSE(loadOp, opsToErase, domInfo);
+    }
   });
 
-  // Erase all load op's whose results were replaced with store or load fwd'ed
-  // ones.
-  for (auto *loadOp : loadOpsToErase)
-    loadOp->erase();
+  // Erase all load op's whose results were replaced with store fwd'ed ones.
+  for (auto *op : opsToErase)
+    op->erase();
 
   // Check if the store fwd'ed memrefs are now left with only stores and can
   // thus be completely deleted. Note: the canonicalize pass should be able
diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir
--- a/mlir/test/Dialect/Affine/scalrep.mlir
+++ b/mlir/test/Dialect/Affine/scalrep.mlir
@@ -554,3 +554,91 @@
   }
   return
 }
+
+// CHECK-LABEL: func @external_no_forward_load
+
+func @external_no_forward_load(%in : memref<512xf32>, %out : memref<512xf32>) {
+  affine.for %i = 0 to 16 {
+    %ld0 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld0, %out[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+// CHECK:   affine.load
+// CHECK:   affine.store
+// CHECK:   affine.load
+// CHECK:   affine.store
+
+// CHECK-LABEL: func @external_no_forward_store
+
+func @external_no_forward_store(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+// CHECK:   affine.store
+// CHECK:   affine.load
+// CHECK:   affine.store
+
+// CHECK-LABEL: func @no_forward_cast
+
+func @no_forward_cast(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  %cf2 = constant 2.0 : f32
+  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    affine.store %cf2, %m2[32*%i] : memref<?xf32>
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+// CHECK:   affine.store
+// CHECK-NEXT:   affine.store
+// CHECK-NEXT:   affine.load
+// CHECK-NEXT:   affine.store
+
+// Although there is a dependence from the second store to the load, it is
+// satisfied by the outer surrounding loop, and does not prevent the first
+// store to be forwarded to the load.
+
+// CHECK-LABEL: func @overlap_no_fwd
+func @overlap_no_fwd(%N : index) -> f32 {
+  %cf7 = constant 7.0 : f32
+  %cf9 = constant 9.0 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %m = memref.alloc() : memref<10xf32>
+  affine.for %i0 = 0 to 5 {
+    affine.store %cf7, %m[2 * %i0] : memref<10xf32>
+    affine.for %i1 = 0 to %N {
+      %v0 = affine.load %m[2 * %i0] : memref<10xf32>
+      %v1 = addf %v0, %v0 : f32
+      affine.store %cf9, %m[%i0 + 1] : memref<10xf32>
+    }
+  }
+  // Due to this load, the memref isn't optimized away.
+  %v3 = affine.load %m[%c1] : memref<10xf32>
+  return %v3 : f32
+
+// CHECK:  affine.for %{{.*}} = 0 to 5 {
+// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
+// CHECK-NEXT:      %{{.*}} = affine.load
+// CHECK-NEXT:      %{{.*}} = addf %{{.*}}, %{{.*}} : f32
+// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:  return %{{.*}} : f32
+}
+