diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -793,7 +793,15 @@
   backwardSlice.insert(padTensorOp);
   // Stack step 1. iteratively clone loops and push `packedTensor`.
   for (Operation *op : backwardSlice) {
-    if (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op)) {
+    // Specifically sit out in the subtensor(packedTensor) case: this is the
+    // piece we seek to replace.
+    if (auto subTensor = dyn_cast<SubTensorOp>(op))
+      if (bvm.lookupOrDefault(subTensor.source()) == packedTensor)
+        continue;
+    auto effects = dyn_cast<MemoryEffectOpInterface>(op);
+    bool hasNoEffects = !effects || effects.hasNoEffect();
+    if (hasNoEffects &&
+        (op->getNumRegions() == 0 || isa<linalg::PadTensorOp>(op))) {
       b.clone(*op, bvm);
       continue;
     }
@@ -808,8 +816,10 @@
         b.create<scf::ForOp>(loc, bvm.lookupOrDefault(forOp.lowerBound()),
                              bvm.lookupOrDefault(forOp.upperBound()),
                              bvm.lookupOrDefault(forOp.step()), packedTensor);
-
+    // Map the induction var, region args and results to the `clonedForOp`.
     bvm.map(forOp.getInductionVar(), clonedForOp.getInductionVar());
+    bvm.map(forOp.getRegionIterArgs(), clonedForOp.getRegionIterArgs());
+    bvm.map(forOp.getResults(), clonedForOp.getResults());
     assert(clonedForOp->getNumRegions() == 1);
     clonedLoopIvs.push_back(clonedForOp.getInductionVar());
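Note on the Hoisting.cpp change above: ops from the backward slice are re-created
with `b.clone(*op, bvm)`, which remaps every operand through `bvm`. Mapping the
region iter_args and the results of each original loop is what lets later clones
refer to the cloned loop nest rather than the original one. A minimal
illustration of the remapping behavior (standalone sketch, not part of the
patch; `oldIv`, `newIv`, `unrelated` are hypothetical values):

  // BlockAndValueMapping returns the mapped value if one was registered,
  // and falls back to the queried value otherwise.
  BlockAndValueMapping bvm;
  bvm.map(oldIv, newIv);                       // as done for the cloned loops
  Value mapped = bvm.lookupOrDefault(oldIv);   // == newIv
  Value same = bvm.lookupOrDefault(unrelated); // == unrelated (identity)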
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
--- a/mlir/test/Dialect/Linalg/hoist-padding.mlir
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -1,4 +1,13 @@
-// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+// Specific structural checks are performed on 2-level hoisting
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=2 -canonicalize | FileCheck %s
+
+// IR verification is performed on [0-6]-level hoisting
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=0 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=1 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=3 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=4 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=5 | FileCheck %s --check-prefix=VERIFIER-ONLY
+// RUN: mlir-opt %s -split-input-file -test-linalg-transform-patterns=test-hoist-padding=6 | FileCheck %s --check-prefix=VERIFIER-ONLY
 
 // CHECK-DAG: #[[$DIV3:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 3)>
 // CHECK-DAG: #[[$DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)>
@@ -14,6 +23,7 @@
 // CHECK-SAME:  %[[TA:[0-9a-z]+]]: tensor<?x?xf32>
 // CHECK-SAME:  %[[TB:[0-9a-z]+]]: tensor<?x?xf32>
 // CHECK-SAME:  %[[TC:[0-9a-z]+]]: tensor<?x?xf32>
+// VERIFIER-ONLY-LABEL: func @matmul_tensors
 func @matmul_tensors(
   %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
     -> tensor<?x?xf32>
@@ -140,6 +150,7 @@
 #map2 = affine_map<(d0, d1) -> (2, d0 - d1)>
 
 // CHECK-LABEL: func @dot
+// VERIFIER-ONLY-LABEL: func @dot
 func @dot(%arg0: tensor<?xf32>, %arg1: tensor<?xf32>,
           %arg2: tensor<f32>) -> tensor<f32>
@@ -217,3 +228,63 @@
   }
   return %4 : tensor<f32>
 }
+
+// -----
+
+// CHECK-LABEL: func @matmul_2d_tiling
+// VERIFIER-ONLY-LABEL: func @matmul_2d_tiling
+func @matmul_2d_tiling(%arg0: tensor<32x128xf32>, %arg1: tensor<128x64xf32>, %arg2: tensor<32x64xf32>) -> tensor<32x64xf32> {
+  %c128 = constant 128 : index
+  %c64 = constant 64 : index
+  %c32 = constant 32 : index
+  %c16 = constant 16 : index
+  %cst = constant 0.000000e+00 : f32
+  %c2 = constant 2 : index
+  %c4 = constant 4 : index
+  %c0 = constant 0 : index
+  %1 = scf.for %arg3 = %c0 to %c32 step %c16 iter_args(%arg4 = %arg2) -> (tensor<32x64xf32>) {
+    %2 = scf.for %arg5 = %c0 to %c64 step %c32 iter_args(%arg6 = %arg4) -> (tensor<32x64xf32>) {
+      %3 = scf.for %arg7 = %c0 to %c128 step %c32 iter_args(%arg8 = %arg6) -> (tensor<32x64xf32>) {
+        %4 = subtensor %arg0[%arg3, %arg7] [16, 32] [1, 1] : tensor<32x128xf32> to tensor<16x32xf32>
+        %5 = subtensor %arg1[%arg7, %arg5] [32, 32] [1, 1] : tensor<128x64xf32> to tensor<32x32xf32>
+        %6 = subtensor %arg8[%arg3, %arg5] [16, 32] [1, 1] : tensor<32x64xf32> to tensor<16x32xf32>
+        %7 = scf.for %arg9 = %c0 to %c16 step %c2 iter_args(%arg10 = %6) -> (tensor<16x32xf32>) {
+          %10 = scf.for %arg11 = %c0 to %c32 step %c4 iter_args(%arg12 = %arg10) -> (tensor<16x32xf32>) {
+            %11 = scf.for %arg13 = %c0 to %c32 step %c16 iter_args(%arg14 = %arg12) -> (tensor<16x32xf32>) {
+              %12 = subtensor %4[%arg9, %arg13] [2, 16] [1, 1] : tensor<16x32xf32> to tensor<2x16xf32>
+              %13 = tensor.cast %12 : tensor<2x16xf32> to tensor<?x?xf32>
+              %14 = subtensor %5[%arg13, %arg11] [16, 4] [1, 1] : tensor<32x32xf32> to tensor<16x4xf32>
+              %15 = tensor.cast %14 : tensor<16x4xf32> to tensor<?x?xf32>
+              %16 = subtensor %arg14[%arg9, %arg11] [2, 4] [1, 1] : tensor<16x32xf32> to tensor<2x4xf32>
+              %17 = tensor.cast %16 : tensor<2x4xf32> to tensor<?x?xf32>
+              %18 = linalg.pad_tensor %13 low[%c0, %c0] high[%c0, %c0] {
+              ^bb0(%arg15: index, %arg16: index):  // no predecessors
+                linalg.yield %cst : f32
+              } : tensor<?x?xf32> to tensor<2x16xf32>
+              %19 = linalg.pad_tensor %15 low[%c0, %c0] high[%c0, %c0] {
+              ^bb0(%arg15: index, %arg16: index):  // no predecessors
+                linalg.yield %cst : f32
+              } : tensor<?x?xf32> to tensor<16x4xf32>
+              %20 = linalg.pad_tensor %17 low[%c0, %c0] high[%c0, %c0] {
+              ^bb0(%arg15: index, %arg16: index):  // no predecessors
+                linalg.yield %cst : f32
+              } : tensor<?x?xf32> to tensor<2x4xf32>
+              %21 = linalg.matmul ins(%18, %19 : tensor<2x16xf32>, tensor<16x4xf32>) outs(%20 : tensor<2x4xf32>) -> tensor<2x4xf32>
+              %22 = tensor.cast %21 : tensor<2x4xf32> to tensor<?x?xf32>
+              %23 = subtensor_insert %22 into %arg14[%arg9, %arg11] [%c2, %c4] [1, 1] : tensor<?x?xf32> into tensor<16x32xf32>
+              scf.yield %23 : tensor<16x32xf32>
+            }
+            scf.yield %11 : tensor<16x32xf32>
+          }
+          scf.yield %10 : tensor<16x32xf32>
+        }
+        %8 = tensor.cast %7 : tensor<16x32xf32> to tensor<?x?xf32>
+        %9 = subtensor_insert %8 into %arg8[%arg3, %arg5] [%c16, %c32] [1, 1] : tensor<?x?xf32> into tensor<32x64xf32>
+        scf.yield %9 : tensor<32x64xf32>
+      }
+      scf.yield %3 : tensor<32x64xf32>
+    }
+    scf.yield %2 : tensor<32x64xf32>
+  }
+  return %1 : tensor<32x64xf32>
+}
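The RUN lines above exercise the same inputs at several hoisting depths; the
VERIFIER-ONLY invocations skip -canonicalize and only match the func labels, so
they effectively assert that hoisting at depths 0 through 6 produces
verifier-clean IR. To run one of them manually outside of lit (assuming a built
mlir-opt):

  mlir-opt mlir/test/Dialect/Linalg/hoist-padding.mlir -split-input-file \
      -test-linalg-transform-patterns=test-hoist-padding=3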
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -84,9 +84,9 @@
   Option<bool> testTileAndPadPattern{
       *this, "test-tile-and-pad-pattern",
       llvm::cl::desc("Test tile and pad pattern"), llvm::cl::init(false)};
-  Option<bool> testHoistPadding2Levels{*this, "test-hoist-padding-2-level",
-                                       llvm::cl::desc("Test hoist padding"),
-                                       llvm::cl::init(false)};
+  Option<int> testHoistPadding{*this, "test-hoist-padding",
+                               llvm::cl::desc("Test hoist padding"),
+                               llvm::cl::init(0)};
 };
 } // end anonymous namespace
@@ -571,9 +571,9 @@
     return applyAffineMinSCFCanonicalizationPatterns(getFunction());
   if (testTileAndPadPattern)
     return applyTileAndPadPattern(getFunction());
-  if (testHoistPadding2Levels) {
-    getFunction().walk([](linalg::PadTensorOp padTensorOp) {
-      (void)linalg::hoistPaddingOnTensors(padTensorOp, 2);
+  if (testHoistPadding) {
+    getFunction().walk([&](linalg::PadTensorOp padTensorOp) {
+      (void)linalg::hoistPaddingOnTensors(padTensorOp, testHoistPadding);
     });
   }
 }
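Note on the lambda above: the capture list changes from [] to [&] because the
callback now reads testHoistPadding, a member of the enclosing pass; naming a
member inside a lambda requires capturing `this`, which `[&]` does implicitly.
A reduced sketch with hypothetical names:

  struct SomePass {
    int level = 2;
    void run() {
      // `[&]` captures `this`, so `level` resolves inside the lambda;
      // with an empty capture list this would not compile.
      auto callback = [&](int) { return level; };
      (void)callback(0);
    }
  };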