diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
--- a/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
+++ b/mlir/include/mlir/Dialect/Affine/Analysis/AffineAnalysis.h
@@ -53,8 +53,10 @@
     SmallVectorImpl<LoopReduction> *parallelReductions = nullptr);
 
 /// Returns true if `forOp' doesn't have memory dependences preventing
-/// parallelization. This function doesn't check iter_args and should be used
-/// only as a building block for full parallel-checking functions.
+/// parallelization. Memrefs that are allocated inside `forOp` do not impact its
+/// dependences and parallelism. This function does not check iter_args (for
+/// values other than memref types) and should be used only as a building block
+/// for complete parallelism-checking functions.
 bool isLoopMemoryParallel(AffineForOp forOp);
 
 /// Returns in `affineApplyOps`, the sequence of those AffineApplyOp
diff --git a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
--- a/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
+++ b/mlir/lib/Dialect/Affine/Analysis/AffineAnalysis.cpp
@@ -17,12 +17,10 @@
 #include "mlir/Dialect/Affine/Analysis/Utils.h"
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Affine/IR/AffineValueMap.h"
-#include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
 #include "mlir/IR/AffineExprVisitor.h"
 #include "mlir/IR/BuiltinOps.h"
 #include "mlir/IR/IntegerSet.h"
-#include "mlir/Support/MathExtras.h"
-#include "llvm/ADT/DenseMap.h"
+#include "mlir/Interfaces/ViewLikeInterface.h"
 #include "llvm/ADT/TypeSwitch.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
@@ -114,18 +112,51 @@
   return isLoopMemoryParallel(forOp);
 }
 
-/// Returns true if `forOp' doesn't have memory dependences preventing
-/// parallelization. This function doesn't check iter_args and should be used
-/// only as a building block for full parallel-checking functions.
+/// Returns true if `op` is an alloc-like op, i.e., one allocating memrefs.
+static bool isAllocLikeOp(Operation *op) {
+  auto memEffects = dyn_cast<MemoryEffectOpInterface>(op);
+  return memEffects && memEffects.hasEffect<MemoryEffects::Allocate>();
+}
+
+/// Returns true if `v` is allocated locally to `enclosingOp` -- i.e., it is
+/// allocated by an operation nested within `enclosingOp`.
+static bool isLocallyDefined(Value v, Operation *enclosingOp) {
+  Operation *defOp = v.getDefiningOp();
+  if (!defOp)
+    return false;
+
+  if (isAllocLikeOp(defOp) && enclosingOp->isProperAncestor(defOp))
+    return true;
+
+  // Aliasing ops.
+  auto viewOp = dyn_cast<ViewLikeOpInterface>(defOp);
+  return viewOp && isLocallyDefined(viewOp.getViewSource(), enclosingOp);
+}
+
 bool mlir::isLoopMemoryParallel(AffineForOp forOp) {
+  // Any memref-typed iteration arguments are treated as serializing.
+  if (llvm::any_of(forOp.getResultTypes(),
+                   [](Type type) { return type.isa<BaseMemRefType>(); }))
+    return false;
+
   // Collect all load and store ops in loop nest rooted at 'forOp'.
   SmallVector<Operation *, 8> loadAndStoreOps;
   auto walkResult = forOp.walk([&](Operation *op) -> WalkResult {
-    if (isa<AffineReadOpInterface, AffineWriteOpInterface>(op))
-      loadAndStoreOps.push_back(op);
-    else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
-             !MemoryEffectOpInterface::hasNoEffect(op))
+    if (auto readOp = dyn_cast<AffineReadOpInterface>(op)) {
+      // Memrefs that are allocated inside `forOp` need not be considered.
+      if (!isLocallyDefined(readOp.getMemRef(), forOp))
+        loadAndStoreOps.push_back(op);
+    } else if (auto writeOp = dyn_cast<AffineWriteOpInterface>(op)) {
+      // Filter out stores the same way as above.
+      if (!isLocallyDefined(writeOp.getMemRef(), forOp))
+        loadAndStoreOps.push_back(op);
+    } else if (!isa<AffineForOp, AffineYieldOp, AffineIfOp>(op) &&
+               !isAllocLikeOp(op) &&
+               !MemoryEffectOpInterface::hasNoEffect(op)) {
+      // Alloc-like ops inside `forOp` are fine (they don't impact parallelism)
+      // as long as they don't escape the loop (which has been checked above).
       return WalkResult::interrupt();
+    }
 
     return WalkResult::advance();
   });
diff --git a/mlir/test/Dialect/Affine/parallelize.mlir b/mlir/test/Dialect/Affine/parallelize.mlir
--- a/mlir/test/Dialect/Affine/parallelize.mlir
+++ b/mlir/test/Dialect/Affine/parallelize.mlir
@@ -269,3 +269,57 @@
   }
   return
 }
+
+// Test in the presence of locally allocated memrefs.
+
+// CHECK: func @local_alloc
+func @local_alloc() {
+  %cst = arith.constant 0.0 : f32
+  affine.for %i = 0 to 100 {
+    %m = memref.alloc() : memref<1xf32>
+    %ma = memref.alloca() : memref<1xf32>
+    affine.store %cst, %m[0] : memref<1xf32>
+  }
+  // CHECK: affine.parallel
+  return
+}
+
+// CHECK: func @local_alloc_cast
+func @local_alloc_cast() {
+  %cst = arith.constant 0.0 : f32
+  affine.for %i = 0 to 100 {
+    %m = memref.alloc() : memref<128xf32>
+    affine.for %j = 0 to 128 {
+      affine.store %cst, %m[%j] : memref<128xf32>
+    }
+    affine.for %j = 0 to 128 {
+      affine.store %cst, %m[0] : memref<128xf32>
+    }
+    %r = memref.reinterpret_cast %m to offset: [0], sizes: [8, 16],
+           strides: [16, 1] : memref<128xf32> to memref<8x16xf32>
+    affine.for %j = 0 to 8 {
+      affine.store %cst, %r[%j, %j] : memref<8x16xf32>
+    }
+  }
+  // CHECK:      affine.parallel
+  // CHECK:        affine.parallel
+  // CHECK:        }
+  // CHECK:        affine.for
+  // CHECK:        }
+  // CHECK:        affine.parallel
+  // CHECK:        }
+  // CHECK:      }
+
+  return
+}
+
+// CHECK-LABEL: @iter_arg_memrefs
+func @iter_arg_memrefs(%in: memref<10xf32>) {
+  %mi = memref.alloc() : memref<f32>
+  // Loop-carried memrefs are treated as serializing the loop.
+  // CHECK: affine.for
+  %mo = affine.for %i = 0 to 10 iter_args(%m_arg = %mi) -> (memref<f32>) {
+    affine.yield %m_arg : memref<f32>
+  }
+  return
+}
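
For illustration, a minimal sketch of what this change enables (hypothetical
names @scratchpad, %buf, %out; assuming the same -affine-parallelize pass that
parallelize.mlir exercises). Previously, the memref.alloc below made the walk
in isLoopMemoryParallel() interrupt, since it carries a
MemoryEffects::Allocate effect and is neither an affine load nor store, so the
loop was conservatively kept serial. With this patch, the alloc and all
accesses to the local buffer are filtered out, leaving only the store to
%out[%i], which carries no loop-carried dependence:

  func @scratchpad(%out: memref<100xf32>) {
    %cst = arith.constant 1.0 : f32
    affine.for %i = 0 to 100 {
      // Allocated inside the loop: ignored by the dependence check.
      %buf = memref.alloc() : memref<1xf32>
      affine.store %cst, %buf[0] : memref<1xf32>
      %v = affine.load %buf[0] : memref<1xf32>
      // Each iteration writes a distinct element of %out.
      affine.store %v, %out[%i] : memref<100xf32>
    }
    return
  }

Running mlir-opt with -affine-parallelize over such input should now rewrite
the affine.for into an affine.parallel, mirroring the @local_alloc test above.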