diff --git a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
--- a/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
+++ b/mlir/lib/Dialect/Affine/Transforms/AffineScalarReplacement.cpp
@@ -70,17 +70,15 @@
     : public AffineScalarReplacementBase<AffineScalarReplacement> {
   void runOnFunction() override;
 
-  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp);
-  void loadCSE(AffineReadOpInterface loadOp);
-
-  // A list of memref's that are potentially dead / could be eliminated.
-  SmallPtrSet<Value, 4> memrefsToErase;
-  // Load op's whose results were replaced by those forwarded from stores
-  // dominating stores or loads..
-  SmallVector<Operation *, 8> loadOpsToErase;
-
-  DominanceInfo *domInfo = nullptr;
-  PostDominanceInfo *postDomInfo = nullptr;
+  LogicalResult forwardStoreToLoad(AffineReadOpInterface loadOp,
+                                   SmallVectorImpl<Operation *> &loadOpsToErase,
+                                   SmallPtrSetImpl<Value> &memrefsToErase,
+                                   DominanceInfo &domInfo,
+                                   PostDominanceInfo &postDominanceInfo);
+
+  void loadCSE(AffineReadOpInterface loadOp,
+               SmallVectorImpl<Operation *> &loadOpsToErase,
+               DominanceInfo &domInfo);
 };
 
 } // end anonymous namespace
@@ -92,32 +90,169 @@
   return std::make_unique<AffineScalarReplacement>();
 }
 
-// Check if the store may be reaching the load.
-static bool storeMayReachLoad(Operation *storeOp, Operation *loadOp,
-                              unsigned minSurroundingLoops) {
-  MemRefAccess srcAccess(storeOp);
-  MemRefAccess destAccess(loadOp);
-  FlatAffineConstraints dependenceConstraints;
-  unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *storeOp);
-  unsigned d;
-  // Dependences at loop depth <= minSurroundingLoops do NOT matter.
-  for (d = nsLoops + 1; d > minSurroundingLoops; d--) {
-    DependenceResult result = checkMemrefAccessDependence(
-        srcAccess, destAccess, d, &dependenceConstraints,
-        /*dependenceComponents=*/nullptr);
-    if (hasDependence(result))
-      break;
-  }
-  if (d <= minSurroundingLoops)
-    return false;
+/// Ensure that all operations between `start` (noninclusive) and `memOp`
+/// do not have the potential memory effect `EffectType` on `memOp`. `memOp`
+/// is an operation that reads or writes to a memref. For example, if
+/// `EffectType` is MemoryEffects::Write, this method checks that there is no
+/// write to the memory between `start` and `memOp` that would change the read
+/// within `memOp`.
+template <typename EffectType, typename T>
+bool hasNoInterveningEffect(Operation *start, T memOp) {
+
+  Value memref = memOp.getMemRef();
+  bool isOriginalAllocation = memref.getDefiningOp<memref::AllocaOp>() ||
+                              memref.getDefiningOp<memref::AllocOp>();
+
+  // A boolean representing whether an intervening operation could have impacted
+  // memOp.
+  bool hasSideEffect = false;
+
+  // Check whether the effect on memOp can be caused by a given operation op.
+  std::function<void(Operation *)> checkOperation = [&](Operation *op) {
+    // If the effect has already been found, exit early.
+    if (hasSideEffect)
+      return;
+
+    if (auto memEffect = dyn_cast<MemoryEffectOpInterface>(op)) {
+      SmallVector<MemoryEffects::EffectInstance, 1> effects;
+      memEffect.getEffects(effects);
+
+      bool opMayHaveEffect = false;
+      for (auto effect : effects) {
+        // If op causes EffectType on a potentially aliasing location for
+        // memOp, mark as having the effect.
+        if (isa<EffectType>(effect.getEffect())) {
+          if (isOriginalAllocation && effect.getValue() &&
+              (effect.getValue().getDefiningOp<memref::AllocaOp>() ||
+               effect.getValue().getDefiningOp<memref::AllocOp>())) {
+            if (effect.getValue() != memref)
+              continue;
+          }
+          opMayHaveEffect = true;
+          break;
+        }
+      }
 
-  return true;
+      if (!opMayHaveEffect)
+        return;
+
+      // If the side effect comes from an affine read or write, try to
+      // prove the side effecting `op` cannot reach `memOp`.
+      if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op)) {
+        MemRefAccess srcAccess(op);
+        MemRefAccess destAccess(memOp);
+        // Dependence analysis is only correct if both ops operate on the same
+        // memref.
+        if (srcAccess.memref == destAccess.memref) {
+          FlatAffineConstraints dependenceConstraints;
+          unsigned nsLoops = getNumCommonSurroundingLoops(*op, *memOp);
+          unsigned d;
+          // Check for a dependence at each loop depth from 1 to nsLoops + 1.
+          for (d = 1; d <= nsLoops + 1; d++) {
+            DependenceResult result = checkMemrefAccessDependence(
+                srcAccess, destAccess, d, &dependenceConstraints,
+                /*dependenceComponents=*/nullptr);
+            if (hasDependence(result)) {
+              hasSideEffect = true;
+              return;
+            }
+          }
+          return;
+        }
+      }
+      hasSideEffect = true;
+      return;
+    } else if (op->hasTrait<OpTrait::HasRecursiveSideEffects>()) {
+      // Recurse into the regions for this op and check whether the internal
+      // operations may have the side effect `EffectType` on memOp.
+      for (Region &region : op->getRegions())
+        for (Block &block : region)
+          for (Operation &op : block)
+            checkOperation(&op);
+    } else {
+      // Otherwise, conservatively assume generic operations have the effect
+      // on the operation
+      hasSideEffect = true;
+      return;
+    }
+  };
+
+  // Check all paths from ancestor op `parent` to the operation `to` for the
+  // effect. It is known that `to` must be contained within `parent`.
+  auto until = [&](Operation *parent, Operation *to) {
+    // TODO check only the paths from `parent` to `to`.
+    // Currently we fall back and check the entire parent op.
+    assert(parent->isAncestor(to));
+    checkOperation(parent);
+  };
+
+  // Check for all paths from operation `from` to operation
+  // `untilOp` for the given memory effect.
+  std::function<void(Operation *, Operation *)> recur =
+      [&](Operation *from, Operation *untilOp) {
+        assert(
+            from->getParentRegion()->isAncestor(untilOp->getParentRegion()) &&
+            "Checking for side effect between two operations without a common "
+            "ancestor");
+
+        // If the operations are in different regions, recursively
+        // consider all paths from `from` to the parent of `untilOp` and
+        // all paths from the parent of `untilOp` to `untilOp`.
+        if (from->getParentRegion() != untilOp->getParentRegion()) {
+          recur(from, untilOp->getParentOp());
+          until(untilOp->getParentOp(), untilOp);
+          return;
+        }
+
+        // Now, assuming that from and to exist in the same region, perform
+        // a CFG traversal to check all the relevant operations.
+
+        // Additional blocks to consider.
+        SmallVector<Block *, 2> todoBlocks;
+        {
+          // First consider the parent block of `from` and check all operations
+          // after `from`.
+          for (auto iter = ++from->getIterator(), end = from->getBlock()->end();
+               iter != end && &*iter != untilOp; ++iter) {
+            checkOperation(&*iter);
+          }
+
+          // If the parent of `from` doesn't contain `to`, add the successors
+          // to the list of blocks to check.
+          if (untilOp->getBlock() != from->getBlock())
+            for (Block *succ : from->getBlock()->getSuccessors())
+              todoBlocks.push_back(succ);
+        }
+
+        SmallPtrSet<Block *, 4> done;
+        // Traverse the CFG until hitting `to`.
+        while (todoBlocks.size()) {
+          Block *blk = todoBlocks.pop_back_val();
+          if (done.count(blk))
+            continue;
+          done.insert(blk);
+          for (auto &op : *blk) {
+            if (&op == untilOp)
+              break;
+            checkOperation(&op);
+            if (&op == blk->getTerminator())
+              for (Block *succ : blk->getSuccessors())
+                todoBlocks.push_back(succ);
+          }
+        }
+      };
+  recur(start, memOp);
+  return !hasSideEffect;
 }
 
-// This is a straightforward implementation not optimized for speed. Optimize
-// if needed.
-LogicalResult
-AffineScalarReplacement::forwardStoreToLoad(AffineReadOpInterface loadOp) {
+// Attempt to eliminate loadOp by replacing it with a value stored
+// into memory which the load is guaranteed to retrieve.
+// LoadOp will be added to `loadOpsToErase` if it can be removed and its
+// memref to `memrefsToErase`.
+LogicalResult AffineScalarReplacement::forwardStoreToLoad(
+    AffineReadOpInterface loadOp, SmallVectorImpl<Operation *> &loadOpsToErase,
+    SmallPtrSetImpl<Value> &memrefsToErase, DominanceInfo &domInfo,
+    PostDominanceInfo &postDominanceInfo) {
   // First pass over the use list to get the minimum number of surrounding
   // loops common between the load op and the store op, with min taken across
   // all store ops.
@@ -140,10 +275,9 @@
   // forwarding candidates). Each forwarding candidate will be checked for a
   // post-dominance on these. 'fwdingCandidates' are a subset of depSrcStores.
   SmallVector<Operation *, 8> depSrcStores;
-
   for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
-      continue;
+    MemRefAccess srcAccess(storeOp);
+    MemRefAccess destAccess(loadOp);
 
     // Stores that *may* be reaching the load.
     depSrcStores.push_back(storeOp);
@@ -156,13 +290,14 @@
     //     store %A[%M]
     //     load %A[%N]
     // Use the AffineValueMap difference based memref access equality checking.
-    MemRefAccess srcAccess(storeOp);
-    MemRefAccess destAccess(loadOp);
     if (srcAccess != destAccess)
       continue;
 
     // 2. The store has to dominate the load op to be candidate.
-    if (!domInfo->dominates(storeOp, loadOp))
+    if (!domInfo.dominates(storeOp, loadOp))
+      continue;
+
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(storeOp, loadOp))
       continue;
 
     // We now have a candidate for forwarding.
@@ -175,17 +310,11 @@
   // memref loc.
   // Note: this can be implemented in a cleaner way with postdominator tree
   // traversals. Consider this for the future if needed.
-  Operation *lastWriteStoreOp = nullptr;
-  for (auto *storeOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores, [&](Operation *depStore) {
-          return postDomInfo->postDominates(storeOp, depStore);
-        })) {
-      lastWriteStoreOp = storeOp;
-      break;
-    }
-  }
-  if (!lastWriteStoreOp)
+  if (fwdingCandidates.size() == 0)
     return failure();
+  assert(fwdingCandidates.size() == 1 && "multiple dominating stores");
+
+  Operation *lastWriteStoreOp = fwdingCandidates.front();
 
   // Perform the actual store to load forwarding.
   Value storeVal =
@@ -207,109 +336,84 @@
 // loadA will be be replaced with loadB if:
 // 1) loadA and loadB have mathematically equivalent affine access functions.
 // 2) loadB dominates loadA.
-// 3) loadB postdominates all the store op's that have a dependence into loadA.
-void AffineScalarReplacement::loadCSE(AffineReadOpInterface loadOp) {
-  // The list of load op candidates for forwarding that satisfy conditions
-  // (1) and (2) above - they will be filtered later when checking (3).
-  SmallVector<Operation *, 8> fwdingCandidates;
-  SmallVector<Operation *, 8> storeOps;
-  unsigned minSurroundingLoops = getNestingDepth(loadOp);
-  MemRefAccess memRefAccess(loadOp);
-  // First pass over the use list to get 1) the minimum number of surrounding
-  // loops common between the load op and an load op candidate, with min taken
-  // across all load op candidates; 2) load op candidates; 3) store ops.
-  // We take min across all load op candidates instead of all load ops to make
-  // sure later dependence check is performed at loop depths that do matter.
-  for (auto *user : loadOp.getMemRef().getUsers()) {
-    if (auto storeOp = dyn_cast<AffineWriteOpInterface>(user)) {
-      storeOps.push_back(storeOp);
-    } else if (auto aLoadOp = dyn_cast<AffineReadOpInterface>(user)) {
-      MemRefAccess otherMemRefAccess(aLoadOp);
-      // No need to consider Load ops that have been replaced in previous store
-      // to load forwarding or loadCSE. If loadA or storeA can be forwarded to
-      // loadB, then loadA or storeA can be forwarded to loadC iff loadB can be
-      // forwarded to loadC.
-      // If loadB is visited before loadC and replace with loadA, we do not put
-      // loadB in candidates list, only loadA. If loadC is visited before loadB,
-      // loadC may be replaced with loadB, which will be replaced with loadA
-      // later.
-      if (aLoadOp != loadOp && !llvm::is_contained(loadOpsToErase, aLoadOp) &&
-          memRefAccess == otherMemRefAccess &&
-          domInfo->dominates(aLoadOp, loadOp)) {
-        fwdingCandidates.push_back(aLoadOp);
-        unsigned nsLoops = getNumCommonSurroundingLoops(*loadOp, *aLoadOp);
-        minSurroundingLoops = std::min(nsLoops, minSurroundingLoops);
-      }
+// 3) There is no write between loadA and loadB
+void AffineScalarReplacement::loadCSE(
+    AffineReadOpInterface loadA, SmallVectorImpl<Operation *> &loadOpsToErase,
+    DominanceInfo &domInfo) {
+  SmallVector<AffineReadOpInterface, 4> loadCandidates;
+  for (auto *user : loadA.getMemRef().getUsers()) {
+    auto loadB = dyn_cast<AffineReadOpInterface>(user);
+    if (!loadB || loadB == loadA)
+      continue;
+
+    MemRefAccess srcAccess(loadB);
+    MemRefAccess destAccess(loadA);
+
+    if (srcAccess != destAccess) {
+      continue;
     }
-  }
 
-  // No forwarding candidate.
-  if (fwdingCandidates.empty())
-    return;
+    // 2. loadB has to dominate loadA to be a candidate.
+    if (!domInfo.dominates(loadB, loadA))
+      continue;
 
-  // Store ops that have a dependence into the load.
-  SmallVector<Operation *, 8> depSrcStores;
+    if (!hasNoInterveningEffect<MemoryEffects::Write>(loadB.getOperation(),
+                                                      loadA))
+      continue;
 
-  for (auto *storeOp : storeOps) {
-    if (!storeMayReachLoad(storeOp, loadOp, minSurroundingLoops))
+    // Check if two values have the same shape. This is needed for affine vector
+    // loads.
+    if (loadB.getValue().getType() != loadA.getValue().getType())
       continue;
 
-    // Stores that *may* be reaching the load.
-    depSrcStores.push_back(storeOp);
+    loadCandidates.push_back(loadB);
   }
 
-  // 3. Of all the load op's that meet the above criteria, return the first load
-  // found that postdominates all 'depSrcStores' and has the same shape as the
-  // load to be replaced (if one exists). The shape check is needed for affine
-  // vector loads.
-  Operation *firstLoadOp = nullptr;
-  Value oldVal = loadOp.getValue();
-  for (auto *loadOp : fwdingCandidates) {
-    if (llvm::all_of(depSrcStores,
-                     [&](Operation *depStore) {
-                       return postDomInfo->postDominates(loadOp, depStore);
-                     }) &&
-        cast<AffineReadOpInterface>(loadOp).getValue().getType() ==
-            oldVal.getType()) {
-      firstLoadOp = loadOp;
+  // Of the legal load candidates, use the one that dominates all others
+  // to minimize the subsequent need to loadCSE
+  Value loadB;
+  for (AffineReadOpInterface option : loadCandidates) {
+    if (llvm::all_of(loadCandidates, [&](AffineReadOpInterface depStore) {
+          return depStore == option ||
+                 domInfo.dominates(option.getOperation(),
+                                   depStore.getOperation());
+        })) {
+      loadB = option.getValue();
       break;
     }
   }
-  if (!firstLoadOp)
-    return;
 
-  // Perform the actual load to load forwarding.
-  Value loadVal = cast<AffineReadOpInterface>(firstLoadOp).getValue();
-  loadOp.getValue().replaceAllUsesWith(loadVal);
-  // Record this to erase later.
-  loadOpsToErase.push_back(loadOp);
+  if (loadB) {
+    loadA.getValue().replaceAllUsesWith(loadB);
+    // Record this to erase later.
+    loadOpsToErase.push_back(loadA);
+  }
 }
 
 void AffineScalarReplacement::runOnFunction() {
   // Only supports single block functions at the moment.
   FuncOp f = getFunction();
-  if (!llvm::hasSingleElement(f)) {
-    markAllAnalysesPreserved();
-    return;
-  }
 
-  domInfo = &getAnalysis<DominanceInfo>();
-  postDomInfo = &getAnalysis<PostDominanceInfo>();
+  // Load ops whose results were replaced by store- or load-forwarded values.
+  SmallVector<Operation *, 8> opsToErase;
+
+  // A list of memref's that are potentially dead / could be eliminated.
+  SmallPtrSet<Value, 4> memrefsToErase;
 
-  loadOpsToErase.clear();
-  memrefsToErase.clear();
+  auto &domInfo = getAnalysis<DominanceInfo>();
+  auto &postDominanceInfo = getAnalysis<PostDominanceInfo>();
 
-  // Walk all load's and perform store to load forwarding and loadCSE.
+  // Walk all loads: try store to load forwarding first, then fall back to CSE.
   f.walk([&](AffineReadOpInterface loadOp) {
-    // Do store to load forwarding first, if no success, try loadCSE.
-    if (failed(forwardStoreToLoad(loadOp)))
-      loadCSE(loadOp);
+    if (failed(forwardStoreToLoad(loadOp, opsToErase, memrefsToErase, domInfo,
+                                  postDominanceInfo))) {
+      loadCSE(loadOp, opsToErase, domInfo);
+    }
   });
 
-  // Erase all load op's whose results were replaced with store or load fwd'ed
-  // ones.
-  for (auto *loadOp : loadOpsToErase)
-    loadOp->erase();
+  // Erase all loads whose results were replaced with store/load fwd'ed ones.
+  for (auto *op : opsToErase)
+    op->erase();
 
   // Check if the store fwd'ed memrefs are now left with only stores and can
   // thus be completely deleted. Note: the canonicalize pass should be able
diff --git a/mlir/test/Dialect/Affine/scalrep.mlir b/mlir/test/Dialect/Affine/scalrep.mlir
--- a/mlir/test/Dialect/Affine/scalrep.mlir
+++ b/mlir/test/Dialect/Affine/scalrep.mlir
@@ -235,6 +235,10 @@
   // Due to this load, the memref isn't optimized away.
   %v3 = affine.load %m[%c1] : memref<10xf32>
   return %v3 : f32
+// This test is currently disabled as the affine store to i0+1 is seen as
+// having a side effect that potentially conflicts with the load of i0.
+// More fine grained analysis of the side effecting behavior (and dependence
+// structure) is necessary for this to succeed.
 // CHECK:       %{{.*}} = memref.alloc() : memref<10xf32>
 // CHECK-NEXT:  affine.for %{{.*}} = 0 to 10 {
 // CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
@@ -515,18 +519,105 @@
   return
 }
 
-// CHECK-LABEL: func @vector_load_affine_apply_store_load
+// TODO-LABEL: func @vector_load_affine_apply_store_load
 func @vector_load_affine_apply_store_load(%in : memref<512xf32>, %out : memref<512xf32>) {
   %cf1 = constant 1: index
   affine.for %i = 0 to 15 {
-    // CHECK:       affine.vector_load
+    // TODO:       affine.vector_load
     %ld0 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
     %idx = affine.apply affine_map<(d0) -> (d0 + 1)> (%i)
     affine.vector_store %ld0, %in[32*%idx] : memref<512xf32>, vector<32xf32>
-    // CHECK-NOT:   affine.vector_load
+    // TODO-NOT:   affine.vector_load
     %ld1 = affine.vector_load %in[32*%i] : memref<512xf32>, vector<32xf32>
     %add = addf %ld0, %ld1 : vector<32xf32>
     affine.vector_store %ld1, %out[32*%i] : memref<512xf32>, vector<32xf32>
   }
   return
 }
+
+// CHECK-LABEL: func @external_no_forward_load
+// CHECK:   affine.load
+// CHECK:   affine.store
+// CHECK:   affine.load
+// CHECK:   affine.store
+
+func @external_no_forward_load(%in : memref<512xf32>, %out : memref<512xf32>) {
+  affine.for %i = 0 to 16 {
+    %ld0 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld0, %out[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: func @external_no_forward_store
+// CHECK:   affine.store
+// CHECK:   affine.load
+// CHECK:   affine.store
+
+func @external_no_forward_store(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    "memop"(%in, %out) : (memref<512xf32>, memref<512xf32>) -> ()
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+// CHECK-LABEL: func @external_no_forward_cst
+// CHECK:   affine.store
+// CHECK-NEXT:   affine.store
+// CHECK-NEXT:   affine.load
+// CHECK-NEXT:   affine.store
+
+func @external_no_forward_cst(%in : memref<512xf32>, %out : memref<512xf32>) {
+  %cf1 = constant 1.0 : f32
+  %cf2 = constant 2.0 : f32
+  %m2 = memref.cast %in : memref<512xf32> to memref<?xf32>
+  affine.for %i = 0 to 16 {
+    affine.store %cf1, %in[32*%i] : memref<512xf32>
+    affine.store %cf2, %m2[32*%i] : memref<?xf32>
+    %ld1 = affine.load %in[32*%i] : memref<512xf32>
+    affine.store %ld1, %out[32*%i] : memref<512xf32>
+  }
+  return
+}
+
+// The second store can write the element read by the load (for %i0 = 1, both
+// access %m[2]) on an earlier iteration of the inner loop, so the first store
+// must not be forwarded to the load.
+func @overlap_no_fwd(%N : index) -> f32 {
+  %cf7 = constant 7.0 : f32
+  %cf9 = constant 9.0 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %m = memref.alloc() : memref<10xf32>
+  affine.for %i0 = 0 to 5 {
+    affine.store %cf7, %m[2 * %i0] : memref<10xf32>
+    affine.for %i1 = 0 to %N {
+      %v0 = affine.load %m[2 * %i0] : memref<10xf32>
+      %v1 = addf %v0, %v0 : f32
+      affine.store %cf9, %m[%i0 + 1] : memref<10xf32>
+    }
+  }
+  // Due to this load, the memref isn't optimized away.
+  %v3 = affine.load %m[%c1] : memref<10xf32>
+  return %v3 : f32
+
+// CHECK-LABEL: func @overlap_no_fwd
+// CHECK:  affine.for %{{.*}} = 0 to 5 {
+// CHECK-NEXT:    affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    affine.for %{{.*}} = 0 to %{{.*}} {
+// CHECK-NEXT:      %{{.*}} = affine.load
+// CHECK-NEXT:      %{{.*}} = addf %{{.*}}, %{{.*}} : f32
+// CHECK-NEXT:      affine.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:    }
+// CHECK-NEXT:  }
+// CHECK-NEXT:  %{{.*}} = affine.load %{{.*}}[%{{.*}}] : memref<10xf32>
+// CHECK-NEXT:  return %{{.*}} : f32
+}
+