diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h
--- a/mlir/include/mlir/Transforms/LoopUtils.h
+++ b/mlir/include/mlir/Transforms/LoopUtils.h
@@ -15,6 +15,7 @@
 #ifndef MLIR_TRANSFORMS_LOOP_UTILS_H
 #define MLIR_TRANSFORMS_LOOP_UTILS_H
 
+#include "mlir/Analysis/Utils.h"
 #include "mlir/IR/Block.h"
 #include "mlir/Support/LLVM.h"
 #include "mlir/Support/LogicalResult.h"
@@ -185,6 +186,28 @@
                                      Optional<Value> filterMemRef,
                                      DenseSet<Operation *> &copyNests);
 
+/// generateDataCopyAroundOp is similar to affineDataCopyGenerate, but with
+/// some simplifications:
+/// * The logic of "find the relevant memrefs and their uses" is decoupled and
+///   pushed back to the caller. This function focuses on generating the fast
+///   buffers and the associated copy loops/DMAs.
+/// * It handles a single memref per call.
+/// * The copy prologue and epilogue always surround `where`, rather than
+///   being placed at potentially arbitrary points.
+///
+/// Note that certain options in `copyOptions`, such as slowMemorySpace, are
+/// no longer looked at.
+struct CopyGenerateResult {
+  uint64_t sizeInBytes;
+  Operation *alloc;
+  Operation *copyNest;
+};
+
+LogicalResult generateDataCopyAroundOp(const MemRefRegion &memrefRegion,
+                                       Operation *where,
+                                       const AffineCopyOptions &copyOptions,
+                                       CopyGenerateResult &result);
+
 /// Tile a nest of standard for loops rooted at `rootForOp` by finding such
 /// parametric tile sizes that the outer loops have a fixed number of iterations
 /// as defined in `sizes`.
diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp
--- a/mlir/lib/Transforms/Utils/LoopUtils.cpp
+++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp
@@ -1797,6 +1797,33 @@
                                 filterMemRef, copyNests);
 }
 
+LogicalResult mlir::generateDataCopyAroundOp(
+    const MemRefRegion &memrefRegion, Operation *where,
+    const AffineCopyOptions &copyOptions, CopyGenerateResult &result) {
+  Block *block = where->getBlock();
+  auto begin = where->getIterator();
+  auto end = std::next(begin);
+  DenseMap<Value, Value> fastBufferMap;
+  DenseSet<Operation *> copyNests;
+
+  auto err = generateCopy(memrefRegion, block, begin, end, block, begin, end,
+                          copyOptions, fastBufferMap, copyNests,
+                          &result.sizeInBytes, &begin, &end);
+  if (failed(err)) {
+    return err;
+  }
+  result.alloc =
+      fastBufferMap.find(memrefRegion.memref)->second.getDefiningOp();
+  if (copyNests.empty()) {
+    result.copyNest = nullptr;
+  } else {
+    assert(copyNests.size() == 1 &&
+           "Multiple copy nests generated for a single memref");
+    result.copyNest = *copyNests.begin();
+  }
+  return success();
+}
+
 /// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'.
 static void gatherLoopsInBlock(Block *block, unsigned currLoopDepth,
diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir
--- a/mlir/test/Transforms/affine-data-copy.mlir
+++ b/mlir/test/Transforms/affine-data-copy.mlir
@@ -7,6 +7,7 @@
 // '-test-affine-data-copy-memref-filter' passes the first memref found in an
 // affine.load op in the innermost loop as a filter.
 // RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER
+// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='test-generate-data-copy-around-op=1' | FileCheck %s --check-prefix=COPY_AROUND_OP
 
 // -copy-skip-non-stride-loops forces the copies to be placed right inside the
 // tile space loops, avoiding the sensitivity of copy placement depth to memory
@@ -198,3 +199,13 @@
 // FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 {
 // FILTER: dealloc %{{.*}} : memref<1024x1024xf32>
 // FILTER-NOT: dealloc
+
+// COPY_AROUND_OP: alloc() : memref<1024x1024xf32>
+// COPY_AROUND_OP-NOT: alloc()
+// COPY_AROUND_OP: affine.for %{{.*}} = 0 to 1024 {
+// COPY_AROUND_OP: affine.for %{{.*}} = 0 to 1024 {
+// COPY_AROUND_OP: affine.for %{{.*}} = 0 to 1024 {
+// COPY_AROUND_OP-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// COPY_AROUND_OP-NEXT: affine.for %{{.*}} = 0 to 1024 {
+// COPY_AROUND_OP: dealloc %{{.*}} : memref<1024x1024xf32>
+// COPY_AROUND_OP-NOT: dealloc
diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
--- a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
+++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp
@@ -37,6 +37,10 @@
       llvm::cl::desc(
           "Enable memref filter testing in affine data copy optimization"),
       llvm::cl::init(false)};
+  Option<bool> clTestGenerateDataCopyAroundOp{
+      *this, "test-generate-data-copy-around-op",
+      llvm::cl::desc("Test copy generation through generateDataCopyAroundOp"),
+      llvm::cl::init(false)};
 };
 
 } // end anonymous namespace
@@ -55,13 +59,13 @@
   auto loopNest = depthToLoops[0][0];
   auto innermostLoop = depthToLoops[innermostLoopIdx][0];
 
-  Optional<Value> memrefFilter;
-  if (clMemRefFilter) {
+  AffineLoadOp load;
+  if (clMemRefFilter || clTestGenerateDataCopyAroundOp) {
     // Gather MemRef filter. For simplicity, we use the first loaded memref
     // found in the innermost loop.
     for (auto &op : *innermostLoop.getBody()) {
-      if (auto load = dyn_cast<AffineLoadOp>(op)) {
-        memrefFilter = load.getMemRef();
+      if (auto ld = dyn_cast<AffineLoadOp>(op)) {
+        load = ld;
         break;
       }
     }
@@ -72,8 +76,15 @@
                                    /*fastMemorySpace=*/0,
                                    /*tagMemorySpace=*/0,
                                    /*fastMemCapacityBytes=*/32 * 1024 * 1024UL};
-  DenseSet<Operation *> copyNests;
-  affineDataCopyGenerate(loopNest, copyOptions, memrefFilter, copyNests);
+  if (clMemRefFilter) {
+    DenseSet<Operation *> copyNests;
+    affineDataCopyGenerate(loopNest, copyOptions, load.getMemRef(), copyNests);
+  } else if (clTestGenerateDataCopyAroundOp) {
+    CopyGenerateResult result;
+    MemRefRegion region(loopNest.getLoc());
+    region.compute(load, /*loopDepth=*/0);
+    generateDataCopyAroundOp(region, loopNest, copyOptions, result);
+  }
 }
 
 namespace mlir {
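
For reviewers' convenience, here is a minimal sketch (not part of the patch) of how a caller other than the test pass might drive the new entry point. It mirrors the clTestGenerateDataCopyAroundOp path in TestAffineDataCopy.cpp; the helper name `copyForLoad` and its parameters are purely illustrative, and the snippet assumes the same headers as that file.

// Illustrative only: generate a fast buffer plus copy loops for the memref
// accessed by `loadOp`, placed right around the loop nest `forOp`.
static LogicalResult copyForLoad(AffineLoadOp loadOp, AffineForOp forOp,
                                 const AffineCopyOptions &copyOptions) {
  // Compute the region of the memref accessed by `loadOp` at loop depth 0,
  // i.e. its full footprint across the enclosing loops.
  MemRefRegion region(loadOp.getLoc());
  if (failed(region.compute(loadOp, /*loopDepth=*/0)))
    return failure();

  // Allocate the fast buffer and emit the copy-in/copy-out nest around
  // `forOp`; the outcome is reported through `result`.
  CopyGenerateResult result;
  if (failed(generateDataCopyAroundOp(region, forOp, copyOptions, result)))
    return failure();

  // result.alloc is the fast-buffer allocation, result.copyNest the generated
  // copy loop nest (nullptr if none was created), and result.sizeInBytes the
  // buffer size.
  return success();
}

Errors are propagated with LogicalResult, matching the rest of LoopUtils, so callers can decide whether a failed copy generation is fatal.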