diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -24,6 +24,7 @@
 class FuncOp;
 class OpBuilder;
 class Value;
+struct MemRefRegion;
 
 namespace loop {
 class ForOp;
@@ -185,6 +186,36 @@
                                 Optional<Value> filterMemRef,
                                 DenseSet<Operation *> &copyNests);
 
+/// Output of generateCopyFromMemRefRegion; members are null/zero until filled.
+struct CopyGenerateResult {
+  /// Number of bytes used by alloc.
+  uint64_t sizeInBytes = 0;
+
+  /// The operation defining the newly created buffer.
+  Operation *alloc = nullptr;
+
+  /// Copy loop nest between `alloc` and the original memref; null if none.
+  Operation *copyNest = nullptr;
+};
+
+/// generateCopyFromMemRefRegion is similar to affineDataCopyGenerate, but with
+/// some simplifications:
+/// * The logic of "find relevant memrefs and their uses" is decoupled and left
+///   to the caller; this function focuses on generating the fast buffer and
+///   the associated copy loops/DMAs.
+/// * It processes a single memref, denoted by `memrefRegion`.
+/// * The prologue and epilogue always surround `insertion_point`.
+///
+/// Note that `insertion_point` is a single op for API convenience; a
+/// [begin, end) version can be added as needed. On success, `result` holds the
+/// buffer size in bytes, the buffer's defining op, and the copy nest (if any).
+/// Also note that certain options in `copyOptions` aren't looked at anymore,
+/// like slowMemorySpace.
+LogicalResult generateCopyFromMemRefRegion(const MemRefRegion &memrefRegion,
+                                           Operation *insertion_point,
+                                           const AffineCopyOptions &copyOptions,
+                                           CopyGenerateResult &result);
+
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
 /// parametric tile sizes that the outer loops have a fixed number of iterations
 /// as defined in `sizes`.
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1797,6 +1797,28 @@
                                 filterMemRef, copyNests);
 }
 
+LogicalResult mlir::generateCopyFromMemRefRegion(
+    const MemRefRegion &memrefRegion, Operation *insertion_point,
+    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
+  Block *block = insertion_point->getBlock();
+  auto begin = insertion_point->getIterator();
+  auto end = std::next(begin);
+  DenseMap<Value, Value> fastBufferMap;
+  DenseSet<Operation *> copyNests;
+  if (failed(generateCopy(memrefRegion, block, begin, end, block, begin, end,
+                          copyOptions, fastBufferMap, copyNests,
+                          &result.sizeInBytes, &begin, &end)))
+    return failure();
+  // Fail instead of dereferencing end() when the memref has no fast buffer.
+  auto bufferIt = fastBufferMap.find(memrefRegion.memref);
+  if (bufferIt == fastBufferMap.end())
+    return failure();
+  result.alloc = bufferIt->second.getDefiningOp();
+  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
+  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
+  return success();
+}
+
 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
 static void
 gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -7,6 +7,7 @@
 // '-test-affine-data-copy-memref-filter' passes the first memref found in an
 // affine.load op in the innermost loop as a filter.
 // RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='from-memref-region=1' | FileCheck %s --check-prefix=MEMREF_REGION
 
 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
@@ -198,3 +199,13 @@
 // FILTER-NEXT:     affine.for %{{.*}} = 0 to 1024 {
 //      FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
 //  FILTER-NOT: dealloc
+
+//      MEMREF_REGION: alloc() : memref<1024x1024xf32>
+//  MEMREF_REGION-NOT: alloc()
+//      MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+//      MEMREF_REGION:   affine.for %{{.*}} = 0 to 1024 {
+//      MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT:   affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT:     affine.for %{{.*}} = 0 to 1024 {
+//      MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
+//  MEMREF_REGION-NOT: dealloc
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/Passes.h"
+#include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopUtils.h"
@@ -37,6 +38,10 @@
       llvm::cl::desc(
           "Enable memref filter testing in affine data copy optimization"),
       llvm::cl::init(false)};
+  Option<bool> clTestGenerateCopyFromMemRefRegion{
+      *this, "from-memref-region",
+      llvm::cl::desc("Test copy generation for a single memref region"),
+      llvm::cl::init(false)};
 };
 
 } // end anonymous namespace
@@ -55,13 +60,13 @@
 
   auto loopNest = depthToLoops[0][0];
   auto innermostLoop = depthToLoops[innermostLoopIdx][0];
-  Optional<Value> memrefFilter;
-  if (clMemRefFilter) {
+  AffineLoadOp load;
+  if (clMemRefFilter || clTestGenerateCopyFromMemRefRegion) {
     // Gather MemRef filter. For simplicity, we use the first loaded memref
     // found in the innermost loop.
     for (auto &op : *innermostLoop.getBody()) {
-      if (auto load = dyn_cast<AffineLoadOp>(op)) {
-        memrefFilter = load.getMemRef();
+      if (auto ld = dyn_cast<AffineLoadOp>(op)) {
+        load = ld;
         break;
       }
     }
@@ -72,8 +77,15 @@
                                    /*fastMemorySpace=*/0,
                                    /*tagMemorySpace=*/0,
                                    /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
-  DenseSet<Operation *> copyNests;
-  affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+  if (clMemRefFilter) {
+    DenseSet<Operation *> copyNests;
+    affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
+  } else if (clTestGenerateCopyFromMemRefRegion) {
+    CopyGenerateResult result;
+    MemRefRegion region(loopNest.getLoc());
+    region.compute(load, 0);
+    generateCopyFromMemRefRegion(region, loopNest, copyOptions, result);
+  }
 }
 
 namespace mlir {