diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -24,6 +24,7 @@
 class FuncOp;
 class OpBuilder;
 class Value;
+struct MemRefRegion;
 
 namespace loop {
 class ForOp;
@@ -185,6 +186,40 @@
                                 Optional<Value> filterMemRef,
                                 DenseSet<Operation *> &copyNests);
 
+/// Result of generateCopyFromMemRefRegion.
+struct CopyGenerateResult {
+  // Number of bytes used by alloc.
+  uint64_t sizeInBytes = 0;
+
+  // The newly created buffer.
+  Operation *alloc = nullptr;
+
+  // Generated loop nest for copying data between `alloc` and the original
+  // memref; null when no copy nest was generated.
+  Operation *copyNest = nullptr;
+};
+
+/// generateCopyFromMemRefRegion is similar to affineDataCopyGenerate, but with
+/// some simplifications:
+/// * The logic of "find relevant memrefs and their uses" is de-coupled and
+///   pushed back to the users. It focuses on generating fast buffers and
+///   associated loops/DMAs.
+/// * It processes a single memref denoted by `memrefRegion`.
+/// * The prologue and epilogue always surround `insertion_point`.
+///
+/// Note that `insertion_point` is a single op for API convenience, and the
+/// [begin, end) version can be added as needed.
+///
+/// Also note that certain options in `copyOptions` aren't looked at anymore,
+/// like slowMemorySpace.
+///
+/// On success, `result` describes the generated buffer and copy nest. On
+/// failure, the contents of `result` should not be relied upon.
+LogicalResult generateCopyFromMemRefRegion(const MemRefRegion &memrefRegion,
+                                           Operation *insertion_point,
+                                           const AffineCopyOptions &copyOptions,
+                                           CopyGenerateResult &result);
+
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
 /// parametric tile sizes that the outer loops have a fixed number of iterations
 /// as defined in `sizes`.
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1797,6 +1797,37 @@
                                 filterMemRef, copyNests);
 }
 
+/// Generates the fast buffer and copy code for the single region
+/// `memrefRegion` around `insertion_point`, recording the outcome in `result`.
+/// Returns failure if copy generation fails or if no buffer was recorded for
+/// the region's memref.
+LogicalResult mlir::generateCopyFromMemRefRegion(
+    const MemRefRegion &memrefRegion, Operation *insertion_point,
+    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
+  // [begin, end) spans exactly `insertion_point`.
+  Block *block = insertion_point->getBlock();
+  auto begin = insertion_point->getIterator();
+  auto end = std::next(begin);
+  DenseMap<Value, Value> fastBufferMap;
+  DenseSet<Operation *> copyNests;
+
+  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
+                          copyOptions, fastBufferMap, copyNests,
+                          &result.sizeInBytes, &begin, &end);
+  if (failed(err))
+    return err;
+
+  // Do not assume generateCopy registered a buffer for this memref:
+  // dereferencing the end iterator would be undefined behavior.
+  auto bufferIt = fastBufferMap.find(memrefRegion.memref);
+  if (bufferIt == fastBufferMap.end())
+    return failure();
+  result.alloc = bufferIt->second.getDefiningOp();
+  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
+  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
+  return success();
+}
+
 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
 static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -7,6 +7,7 @@
 // '-test-affine-data-copy-memref-filter' passes the first memref found in an
 // affine.load op in the innermost loop as a filter.
// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='from-memref-region=1' | FileCheck %s --check-prefix=MEMREF_REGION
 
 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
@@ -198,3 +199,13 @@
 // FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
 // FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
 // FILTER-NOT: dealloc
+
+// MEMREF_REGION: alloc() : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: alloc()
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: dealloc
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/Passes.h"
+#include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopUtils.h"
@@ -37,6 +38,10 @@
       llvm::cl::desc(
           "Enable memref filter testing in affine data copy optimization"),
       llvm::cl::init(false)};
+  Option<bool> clTestGenerateCopyFromMemRefRegion{
+      *this, "from-memref-region",
+      llvm::cl::desc("Test copy generation for a single memref region"),
+      llvm::cl::init(false)};
 };
 
 } // end anonymous namespace
@@ -55,13 +60,13 @@
   auto loopNest = depthToLoops[0][0];
   auto innermostLoop = depthToLoops[innermostLoopIdx][0];
 
-  Optional<Value> memrefFilter;
-  if (clMemRefFilter) {
+  AffineLoadOp load;
+  if (clMemRefFilter || clTestGenerateCopyFromMemRefRegion) {
     // Gather MemRef filter. For simplicity, we use the first loaded memref
     // found in the innermost loop.
     for (auto &op : *innermostLoop.getBody()) {
-      if (auto load = dyn_cast<AffineLoadOp>(op)) {
-        memrefFilter = load.getMemRef();
+      if (auto ld = dyn_cast<AffineLoadOp>(op)) {
+        load = ld;
         break;
       }
     }
@@ -72,8 +77,23 @@
       /*fastMemorySpace=*/0,
       /*tagMemorySpace=*/0,
       /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
-  DenseSet<Operation *> copyNests;
-  affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+  if (clMemRefFilter) {
+    // When no load was found, pass an unset filter (as the code did before
+    // this change) instead of calling getMemRef() on a null `load`.
+    Optional<Value> memrefFilter;
+    if (load)
+      memrefFilter = load.getMemRef();
+    DenseSet<Operation *> copyNests;
+    affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+  } else if (clTestGenerateCopyFromMemRefRegion && load) {
+    // A region can only be computed from an actual load op, and copy
+    // generation is only attempted when that computation succeeds.
+    CopyGenerateResult result;
+    MemRefRegion region(loopNest.getLoc());
+    if (succeeded(region.compute(load, /*loopDepth=*/0)))
+      (void)generateCopyFromMemRefRegion(region, loopNest, copyOptions,
+                                         result);
+  }
 }
 
 namespace mlir {