diff --git a/mlir/include/mlir/Transforms/LoopUtils.h b/mlir/include/mlir/Transforms/LoopUtils.h --- a/mlir/include/mlir/Transforms/LoopUtils.h +++ b/mlir/include/mlir/Transforms/LoopUtils.h @@ -171,9 +171,11 @@ /// by its root affine.for. Since we generate alloc's and dealloc's for all fast /// buffers (before and after the range of operations resp. or at a hoisted /// position), all of the fast memory capacity is assumed to be available for -/// processing this block range. +/// processing this block range. When 'filterMemRef' is specified, copies are +/// only generated for the provided MemRef. uint64_t affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, + Optional<Value> filterMemRef, DenseSet<Operation *> &copyNests); /// Tile a nest of standard for loops rooted at `rootForOp` by finding such @@ -220,6 +222,11 @@ /// ``` void mapLoopToProcessorIds(loop::ForOp forOp, ArrayRef<Value> processorId, ArrayRef<Value> numProcessors); + +/// Gathers all AffineForOps in 'func' grouped by loop depth. +void gatherLoops(FuncOp func, + DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops); + } // end namespace mlir #endif // MLIR_TRANSFORMS_LOOP_UTILS_H diff --git a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp --- a/mlir/lib/Transforms/AffineDataCopyGeneration.cpp +++ b/mlir/lib/Transforms/AffineDataCopyGeneration.cpp @@ -179,7 +179,7 @@ if ((forOp = dyn_cast<AffineForOp>(&*it)) && copyNests.count(forOp) == 0) { // Perform the copying up unti this 'for' op first. affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/it, copyOptions, - copyNests); + /*filterMemRef=*/llvm::None, copyNests); // Returns true if the footprint is known to exceed capacity. auto exceedsCapacity = [&](AffineForOp forOp) { @@ -213,7 +213,7 @@ // consumed capacity. The footprint check above guarantees this inner // loop's footprint fits. 
affineDataCopyGenerate(/*begin=*/it, /*end=*/std::next(it), copyOptions, - copyNests); + /*filterMemRef=*/llvm::None, copyNests); } // Get to the next load or store op after 'forOp'. curBegin = std::find_if(std::next(it), block->end(), [&](Operation &op) { @@ -236,7 +236,7 @@ assert(!curBegin->isKnownTerminator() && "can't be a terminator"); // Exclude the affine terminator - hence, the std::prev. affineDataCopyGenerate(/*begin=*/curBegin, /*end=*/std::prev(block->end()), - copyOptions, copyNests); + copyOptions, /*filterMemRef=*/llvm::None, copyNests); } return success(); diff --git a/mlir/lib/Transforms/Utils/LoopUtils.cpp b/mlir/lib/Transforms/Utils/LoopUtils.cpp --- a/mlir/lib/Transforms/Utils/LoopUtils.cpp +++ b/mlir/lib/Transforms/Utils/LoopUtils.cpp @@ -1585,16 +1585,21 @@ return true; } -/// Generates copies for a contiguous sequence of operations in `block` in the -/// iterator range [`begin', `end'), where `end' can't be past the terminator of -/// the block (since additional operations are potentially inserted right before -/// `end'. Returns the total size of the fast buffers used. -// Since we generate alloc's and dealloc's for all fast buffers (before and -// after the range of operations resp.), all of the fast memory capacity is -// assumed to be available for processing this block range. +/// Performs explicit copying for the contiguous sequence of operations in the +/// block iterator range [`begin', `end'), where `end' can't be past the +/// terminator of the block (since additional operations are potentially +/// inserted right before `end`. Returns the total size of fast memory space +/// buffers used. `copyOptions` provides various parameters, and the output +/// argument `copyNests` is the set of all copy nests inserted, each represented +/// by its root affine.for. Since we generate alloc's and dealloc's for all fast +/// buffers (before and after the range of operations resp. 
or at a hoisted +/// position), all of the fast memory capacity is assumed to be available for +/// processing this block range. When 'filterMemRef' is specified, copies are +/// only generated for the provided MemRef. uint64_t mlir::affineDataCopyGenerate(Block::iterator begin, Block::iterator end, const AffineCopyOptions &copyOptions, + Optional<Value> filterMemRef, DenseSet<Operation *> &copyNests) { if (begin == end) return 0; @@ -1631,12 +1636,14 @@ block->walk(begin, end, [&](Operation *opInst) { // Gather regions to allocate to buffers in faster memory space. if (auto loadOp = dyn_cast<AffineLoadOp>(opInst)) { - if ((loadOp.getMemRefType().getMemorySpace() != + if ((filterMemRef.hasValue() && filterMemRef != loadOp.getMemRef()) || + (loadOp.getMemRefType().getMemorySpace() != copyOptions.slowMemorySpace)) return; } else if (auto storeOp = dyn_cast<AffineStoreOp>(opInst)) { - if (storeOp.getMemRefType().getMemorySpace() != - copyOptions.slowMemorySpace) + if ((filterMemRef.hasValue() && filterMemRef != storeOp.getMemRef()) || + storeOp.getMemRefType().getMemorySpace() != + copyOptions.slowMemorySpace) return; } else { // Neither load nor a store op. @@ -1776,3 +1783,24 @@ return totalCopyBuffersSizeInBytes; } + +/// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. +static void gatherLoopsInBlock( + Block *block, unsigned currLoopDepth, + DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) { + auto &loopsAtDepth = depthToLoops[currLoopDepth]; + for (auto &op : *block) { + if (auto forOp = dyn_cast<AffineForOp>(op)) { + loopsAtDepth.push_back(forOp); + gatherLoopsInBlock(forOp.getBody(), currLoopDepth + 1, depthToLoops); + } + } +} + +/// Gathers all AffineForOps in 'func' grouped by loop depth. 
+void mlir::gatherLoops( + FuncOp func, + DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) { + for (auto &block : func) + gatherLoopsInBlock(&block, /*currLoopDepth=*/0, depthToLoops); +} diff --git a/mlir/test/Transforms/affine-data-copy.mlir b/mlir/test/Transforms/affine-data-copy.mlir --- a/mlir/test/Transforms/affine-data-copy.mlir +++ b/mlir/test/Transforms/affine-data-copy.mlir @@ -2,6 +2,12 @@ // Small buffer size to trigger fine copies. // RUN: mlir-opt %s -affine-data-copy-generate -affine-data-copy-generate-dma=false -affine-data-copy-generate-fast-mem-space=0 -affine-data-copy-generate-fast-mem-capacity=1 | FileCheck --check-prefix=CHECK-SMALL %s +// Test affine data copy with a memref filter. We use a test pass that invokes +// affine data copy utility on the input loop nest. +// '-test-affine-data-copy-memref-filter' passes the first memref found in an +// affine.load op in the innermost loop as a filter. +// RUN: mlir-opt %s -split-input-file -test-affine-data-copy='memref-filter=1' | FileCheck %s --check-prefix=FILTER + // -copy-skip-non-stride-loops forces the copies to be placed right inside the // tile space loops, avoiding the sensitivity of copy placement depth to memory // footprint -- so that one could write a definite test case and not have to @@ -16,6 +22,7 @@ // CHECK-DAG: [[BUF_IDX_MAP:map[0-9]+]] = affine_map<(d0, d1, d2, d3) -> (-d0 + d2, -d1 + d3)> // CHECK-LABEL: func @matmul +// FILTER-LABEL: func @matmul func @matmul(%A: memref<4096x4096xf32>, %B: memref<4096x4096xf32>, %C: memref<4096x4096xf32>) -> memref<4096x4096xf32> { affine.for %i = 0 to 4096 step 128 { affine.for %j = 0 to 4096 step 128 { @@ -110,11 +117,29 @@ // CHECK: } // CHECK: } +// Check that only one memref is copied when memref filter is used. 
+ +// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER: alloc() : memref<128x4096xf32> +// FILTER-NOT: alloc() +// FILTER: affine.for %{{.*}} = 0 to 128 { +// FILTER: affine.for %{{.*}} = 0 to 4096 { +// FILTER: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 4096 step 128 { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER-NEXT: affine.for %{{.*}} = #map{{.*}}(%{{.*}}) to #map{{.*}}(%{{.*}}) { +// FILTER: dealloc %1 : memref<128x4096xf32> +// FILTER-NOT: dealloc %1 : memref<128x4096xf32> + +// ----- + // // This test case will lead to single element buffers. These are eventually // expected to be turned into registers via alloca and mem2reg. // -// CHECK-SMALL: func @foo +// CHECK-SMALL-LABEL: func @foo +// FILTER-LABEL: func @foo func @foo(%arg0: memref<1024x1024xf32>, %arg1: memref<1024x1024xf32>, %arg2: memref<1024x1024xf32>) -> memref<1024x1024xf32> { affine.for %i = 0 to 1024 { affine.for %j = 0 to 1024 { @@ -161,3 +186,15 @@ // CHECK-SMALL: } // CHECK-SMALL: } // CHECK-SMALL: return + +// Check that only one memref is copied when memref filter is used. 
+ +// FILTER: alloc() : memref<1024x1024xf32> +// FILTER-NOT: alloc() +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER: affine.for %{{.*}} = 0 to 1024 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { +// FILTER-NEXT: affine.for %{{.*}} = 0 to 1024 { +// FILTER: dealloc %{{.*}} : memref<1024x1024xf32> +// FILTER-NOT: dealloc diff --git a/mlir/test/Transforms/dma-generate.mlir b/mlir/test/Transforms/dma-generate.mlir --- a/mlir/test/Transforms/dma-generate.mlir +++ b/mlir/test/Transforms/dma-generate.mlir @@ -543,7 +543,7 @@ // CHECK: affine.dma_wait %{{.*}}[%{{.*}}], %{{.*}} : memref<1xi32> // CHECK: affine.for %{{.*}} = -// ---- +// ----- #map3 = affine_map<(d0) -> (d0)> #map12 = affine_map<(d0) -> (d0 + 3)> @@ -551,6 +551,7 @@ #map15 = affine_map<(d0, d1) -> ((d0 + d1 * 72) mod 2304 - (((d0 + d1 * 72) mod 2304) floordiv 1152) * 1151 - ((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) * 9 - (((((d0 + d1 * 72) mod 2304) mod 1152) mod 9) floordiv 3) * 3)> #map16 = affine_map<(d0, d1) -> (((((d0 + d1 * 72) mod 2304) mod 1152) floordiv 9) floordiv 8)> // Test for test case in b/128303048 #4. 
+// CHECK-LABEL: func @test_memref_bounds func @test_memref_bounds(%arg0: memref<4x4x16x1xvector<8x128xf32>>, %arg1: memref<144x9xvector<8x128xf32>>, %arg2: memref<2xvector<8x128xf32>>) -> (memref<144x9xvector<8x128xf32>>, memref<2xvector<8x128xf32>>) { %c0 = constant 0 : index affine.for %i8 = 0 to 9 step 3 { diff --git a/mlir/test/lib/Transforms/CMakeLists.txt b/mlir/test/lib/Transforms/CMakeLists.txt --- a/mlir/test/lib/Transforms/CMakeLists.txt +++ b/mlir/test/lib/Transforms/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(MLIRTestTransforms + TestAffineDataCopy.cpp TestAllReduceLowering.cpp TestCallGraph.cpp TestConstantFold.cpp diff --git a/mlir/test/lib/Transforms/TestAffineDataCopy.cpp b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp new file mode 100644 --- /dev/null +++ b/mlir/test/lib/Transforms/TestAffineDataCopy.cpp @@ -0,0 +1,86 @@ +//===- TestAffineDataCopy.cpp - Test affine data copy utility -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a pass to test affine data copy utility functions and +// options. 
+// +//===----------------------------------------------------------------------===// + +#include "mlir/Analysis/Passes.h" +#include "mlir/Dialect/AffineOps/AffineOps.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/LoopUtils.h" +#include "mlir/Transforms/Passes.h" + +#define PASS_NAME "test-affine-data-copy" + +using namespace mlir; + +static llvm::cl::OptionCategory clOptionsCategory(PASS_NAME " options"); + +namespace { + +struct TestAffineDataCopy : public FunctionPass<TestAffineDataCopy> { + TestAffineDataCopy() = default; + TestAffineDataCopy(const TestAffineDataCopy &pass){}; + + void runOnFunction() override; + +private: + Option<bool> clMemRefFilter{ + *this, "memref-filter", + llvm::cl::desc( + "Enable memref filter testing in affine data copy optimization"), + llvm::cl::init(false)}; +}; + +} // end anonymous namespace + +void TestAffineDataCopy::runOnFunction() { + // Gather all AffineForOps by loop depth. + DenseMap<unsigned, SmallVector<AffineForOp, 2>> depthToLoops; + gatherLoops(getFunction(), depthToLoops); + assert(depthToLoops.size() && "Loop nest not found"); + + // Only support tests with a single loop nest and a single innermost loop + // for now. + unsigned innermostLoopIdx = depthToLoops.size() - 2; + if (depthToLoops[0].size() != 1 || depthToLoops[innermostLoopIdx].size() != 1) + return; + + auto loopNest = depthToLoops[0][0]; + auto innermostLoop = depthToLoops[innermostLoopIdx][0]; + Optional<Value> memrefFilter; + if (clMemRefFilter) { + // Gather MemRef filter. For simplicity, we use the first loaded memref + // found in the innermost loop. 
+ for (auto &op : *innermostLoop.getBody()) { + if (auto load = dyn_cast<AffineLoadOp>(op)) { + memrefFilter = load.getMemRef(); + break; + } + } + } + + AffineCopyOptions copyOptions = {/*generateDma=*/false, + /*slowMemorySpace=*/0, + /*fastMemorySpace=*/0, + /*tagMemorySpace=*/0, + /*fastMemCapacityBytes=*/32 * 1024 * 1024UL}; + DenseSet<Operation *> copyNests; + affineDataCopyGenerate(loopNest.getBody()->begin(), + std::prev(loopNest.getBody()->end()), copyOptions, + memrefFilter, copyNests); +} + +namespace mlir { +void registerTestAffineDataCopyPass() { + PassRegistration<TestAffineDataCopy>( + PASS_NAME, "Tests affine data copy utility functions."); +} +} // namespace mlir diff --git a/mlir/test/lib/Transforms/TestLoopFusion.cpp b/mlir/test/lib/Transforms/TestLoopFusion.cpp --- a/mlir/test/lib/Transforms/TestLoopFusion.cpp +++ b/mlir/test/lib/Transforms/TestLoopFusion.cpp @@ -19,6 +19,7 @@ #include "mlir/IR/Builders.h" #include "mlir/Pass/Pass.h" #include "mlir/Transforms/LoopFusionUtils.h" +#include "mlir/Transforms/LoopUtils.h" #include "mlir/Transforms/Passes.h" #include "llvm/ADT/STLExtras.h" @@ -54,19 +55,6 @@ } // end anonymous namespace -// Gathers all AffineForOps in 'block' at 'currLoopDepth' in 'depthToLoops'. -static void -gatherLoops(Block *block, unsigned currLoopDepth, - DenseMap<unsigned, SmallVector<AffineForOp, 2>> &depthToLoops) { - auto &loopsAtDepth = depthToLoops[currLoopDepth]; - for (auto &op : *block) { - if (auto forOp = dyn_cast<AffineForOp>(op)) { - loopsAtDepth.push_back(forOp); - gatherLoops(forOp.getBody(), currLoopDepth + 1, depthToLoops); - } - } -} - // Run fusion dependence check on 'loops[i]' and 'loops[j]' at loop depths // in range ['loopDepth' + 1, 'maxLoopDepth']. // Emits a remark on 'loops[i]' if a fusion-preventing dependence exists. @@ -194,8 +182,7 @@ do { depthToLoops.clear(); // Gather all AffineForOps by loop depth. 
- for (auto &block : getFunction()) - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + gatherLoops(getFunction(), depthToLoops); // Try to fuse all combinations of src/dst loop nests in 'depthToLoops'. } while (iterateLoops(depthToLoops, testLoopFusionTransformation, @@ -204,8 +191,7 @@ } // Gather all AffineForOps by loop depth. - for (Block &block : getFunction()) - gatherLoops(&block, /*currLoopDepth=*/0, depthToLoops); + gatherLoops(getFunction(), depthToLoops); // Run tests on all combinations of src/dst loop nests in 'depthToLoops'. if (clTestDependenceCheck) diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -34,6 +34,7 @@ void registerPatternsTestPass(); void registerSimpleParametricTilingPass(); void registerSymbolTestPasses(); +void registerTestAffineDataCopyPass(); void registerTestAllReduceLoweringPass(); void registerTestCallGraphPass(); void registerTestConstantFold(); @@ -85,6 +86,7 @@ registerPatternsTestPass(); registerSimpleParametricTilingPass(); registerSymbolTestPasses(); + registerTestAffineDataCopyPass(); registerTestAllReduceLoweringPass(); registerTestCallGraphPass(); registerTestConstantFold();