diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -24,6 +24,7 @@
 class FuncOp;
 class OpBuilder;
 class Value;
+struct MemRefRegion;
 
 namespace loop {
 class ForOp;
@@ -185,6 +186,38 @@
                                 Optional<Value> filterMemRef,
                                 DenseSet<Operation *> &copyNests);
 
+/// Result for calling generateCopyForMemRegion.
+///
+/// Every field has a default member initializer, so a result never carries
+/// indeterminate values, even if generateCopyForMemRegion fails before filling
+/// it in.
+struct CopyGenerateResult {
+  // Number of bytes used by alloc.
+  uint64_t sizeInBytes = 0;
+
+  // The newly created buffer allocation.
+  Operation *alloc = nullptr;
+
+  // Generated loop nest for copying data between the allocated buffer and the
+  // original memref; null when no copy nest was generated.
+  Operation *copyNest = nullptr;
+};
+
+/// generateCopyForMemRegion is similar to affineDataCopyGenerate, but works
+/// with a single memref region. `memrefRegion` is supposed to contain analysis
+/// information within analyzedOp. The generated prologue and epilogue always
+/// surround `analyzedOp`.
+///
+/// Note that `analyzedOp` is a single op for API convenience, and the
+/// [begin, end) version can be added as needed.
+///
+/// Also note that certain options in `copyOptions` aren't looked at anymore,
+/// like slowMemorySpace.
+LogicalResult generateCopyForMemRegion(const MemRefRegion &memrefRegion,
+                                       Operation *analyzedOp,
+                                       const AffineCopyOptions &copyOptions,
+                                       CopyGenerateResult &result);
+
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
 /// parametric tile sizes that the outer loops have a fixed number of iterations
 /// as defined in `sizes`.
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1797,6 +1797,40 @@
                                 filterMemRef, copyNests);
 }
 
+/// Generates the copy for the single region `memrefRegion` around `analyzedOp`.
+/// On success, `result` holds the size in bytes reported by generateCopy, the
+/// op defining the fast buffer, and the copy nest (null if none was generated).
+/// Returns failure if generateCopy fails or if no fast buffer was recorded for
+/// the region's memref.
+LogicalResult mlir::generateCopyForMemRegion(
+    const MemRefRegion &memrefRegion, Operation *analyzedOp,
+    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
+  Block *block = analyzedOp->getBlock();
+  // The same single-op range [analyzedOp, next(analyzedOp)) is passed for both
+  // iterator ranges that generateCopy takes.
+  auto begin = analyzedOp->getIterator();
+  auto end = std::next(begin);
+  DenseMap<Value, Value> fastBufferMap;
+  DenseSet<Operation *> copyNests;
+
+  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
+                          copyOptions, fastBufferMap, copyNests,
+                          &result.sizeInBytes, &begin, &end);
+  if (failed(err))
+    return err;
+
+  // Guard the lookup: if no fast buffer was recorded for this memref,
+  // dereferencing the end iterator would be undefined behavior.
+  const auto fastBufferIt = fastBufferMap.find(memrefRegion.memref);
+  if (fastBufferIt == fastBufferMap.end())
+    return failure();
+
+  result.alloc = fastBufferIt->second.getDefiningOp();
+  assert(copyNests.size() <= 1 && "At most one copy nest is expected.");
+  result.copyNest = copyNests.empty() ? nullptr : *copyNests.begin();
+  return success();
+}
+
 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
 static void gatherLoopsInBlock(Block *block,
                                unsigned currLoopDepth,
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -6,7 +6,8 @@
 // affine data copy utility on the input loop nest.
 // '-test-affine-data-copy-memref-filter' passes the first memref found in an
 // affine.load op in the innermost loop as a filter.
-// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER +// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter' | FileCheck %s --check-prefix=FILTER +// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='for-memref-region' | FileCheck %s --check-prefix=MEMREF_REGION // -copy-skip-non-stride-loops forces the copies to be placed right inside the // tile space loops, avoiding the sensitivity of copy placement depth to memory @@ -140,6 +141,7 @@ // // CHECK-SMALL-LABEL: func @foo // FILTER-LABEL: func @foo +// MEMREF_REGION-LABEL: func @foo func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { affine.for %i = 0 to 1024 { affine.for %j = 0 to 1024 { @@ -198,3 +200,15 @@ // FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { // FILTER: dealloc %{{.*}} : memref<1024x1024xf32> // FILTER-NOT: dealloc + +// Check that only one memref is copied, because for-memref-region is enabled +// (and the first ever encountered load is analyzed). 
+// MEMREF_REGION: alloc() : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: alloc()
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// MEMREF_REGION: dealloc %{{.*}} : memref<1024x1024xf32>
+// MEMREF_REGION-NOT: dealloc
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/Passes.h"
+#include "mlir/Analysis/Utils.h"
 #include "mlir/Dialect/AffineOps/AffineOps.h"
 #include "mlir/Pass/Pass.h"
 #include "mlir/Transforms/LoopUtils.h"
@@ -37,6 +38,10 @@
       llvm::cl::desc(
           "Enable memref filter testing in affine data copy optimization"),
       llvm::cl::init(false)};
+  Option<bool> clTestGenerateCopyForMemRegion{
+      *this, "for-memref-region",
+      llvm::cl::desc("Test copy generation for a single memref region"),
+      llvm::cl::init(false)};
 };
 
 } // end anonymous namespace
@@ -55,13 +60,13 @@
   auto loopNest = depthToLoops[0][0];
   auto innermostLoop = depthToLoops[innermostLoopIdx][0];
 
-  Optional<Value> memrefFilter;
-  if (clMemRefFilter) {
+  AffineLoadOp load;
+  if (clMemRefFilter || clTestGenerateCopyForMemRegion) {
     // Gather MemRef filter. For simplicity, we use the first loaded memref
     // found in the innermost loop.
     for (auto &op : *innermostLoop.getBody()) {
-      if (auto load = dyn_cast<AffineLoadOp>(op)) {
-        memrefFilter = load.getMemRef();
+      if (auto ld = dyn_cast<AffineLoadOp>(op)) {
+        load = ld;
         break;
       }
     }
@@ -72,8 +77,18 @@
                                    /*fastMemorySpace=*/0,
                                    /*tagMemorySpace=*/0,
                                    /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
-  DenseSet<Operation *> copyNests;
-  affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+  // `load` stays default-constructed when the innermost loop contains no
+  // affine.load; guard both modes so they never run on it.
+  if (load && clMemRefFilter) {
+    DenseSet<Operation *> copyNests;
+    affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
+  } else if (load && clTestGenerateCopyForMemRegion) {
+    CopyGenerateResult result;
+    MemRefRegion region(loopNest.getLoc());
+    // Only generate the copy if the region for `load` could be computed.
+    if (succeeded(region.compute(load, /*loopDepth=*/0)))
+      generateCopyForMemRegion(region, loopNest, copyOptions, result);
+  }
 }
 
 namespace mlir {