diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Hoisting.h
@@ -11,8 +11,10 @@
 
 namespace mlir {
 class FuncOp;
+class LogicalResult;
 
 namespace linalg {
+class SimplePadOp;
 
 /// Hoist alloc/dealloc pairs and alloca op out of immediately enclosing
 /// scf::ForOp if both conditions are true:
@@ -40,6 +42,44 @@
 /// instead of buffers.
 void hoistRedundantVectorTransfersOnTensor(FuncOp func);
 
+/// Mechanically hoist padding operations on tensors by `nLoops` into a new,
+/// generally larger tensor. This achieves packing of multiple padding ops into
+/// a larger tensor. On success, `simplePadOp` is replaced by the cloned version
+/// in the packing loop so the caller can continue reasoning about the padding
+/// operation.
+///
+/// Example in pseudo-mlir:
+/// =======================
+///
+/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
+/// ```
+///    scf.for (%i, %j, %k)
+///      %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///      %0 = linalg.simple_pad %st0 pad %pad :
+///             tensor<?x?xf32> to tensor<4x8xf32>
+///      compute(%0)
+/// ```
+///
+/// IR resembling the following is produced:
+///
+/// ```
+///    scf.for (%i) {
+///      %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
+///      %packed = scf.for (%k) iter_args(%p : %packed_init)
+///        %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///        %0 = linalg.simple_pad %st0 pad %pad :
+///               tensor<?x?xf32> to tensor<4x8xf32>
+///        scf.yield %1: tensor<?x4x8xf32>
+///      } -> tensor<?x4x8xf32>
+///      scf.for (%j, %k) {
+///        %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
+///                 tensor<?x4x8xf32> to tensor<4x8xf32>
+///        compute(%st0)
+///      }
+///    }
+/// ```
+LogicalResult hoistPaddingOnTensors(SimplePadOp &simplePadOp, unsigned nLoops);
+
 } // namespace linalg
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
--- a/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
+++ b/mlir/include/mlir/Dialect/StandardOps/IR/Ops.td
@@ -3058,6 +3058,17 @@
     // Build a SubTensorOp with all dynamic entries and custom result type.
     OpBuilderDAG<(ins "RankedTensorType":$resultType, "Value":$source,
       "ValueRange":$offsets, "ValueRange":$sizes, "ValueRange":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a SubTensorOp with mixed static and dynamic entries and inferred
+    // result type.
+    OpBuilderDAG<(ins "Value":$source, "ArrayRef<OpFoldResult>":$offsets,
+      "ArrayRef<OpFoldResult>":$sizes, "ArrayRef<OpFoldResult>":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a SubTensorOp with mixed static and dynamic entries and custom
+    // result type. If the type passed is nullptr, it is inferred.
+    OpBuilderDAG<(ins "RankedTensorType":$resultType, "Value":$source,
+      "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+      "ArrayRef<OpFoldResult>":$strides,
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
   ];
 
@@ -3154,6 +3165,11 @@
     // Build a SubTensorInsertOp with all dynamic entries.
     OpBuilderDAG<(ins "Value":$source, "Value":$dest, "ValueRange":$offsets,
       "ValueRange":$sizes, "ValueRange":$strides,
+      CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>,
+    // Build a SubTensorInsertOp with mixed static and dynamic entries.
+    OpBuilderDAG<(ins "Value":$source, "Value":$dest,
+      "ArrayRef<OpFoldResult>":$offsets, "ArrayRef<OpFoldResult>":$sizes,
+      "ArrayRef<OpFoldResult>":$strides,
       CArg<"ArrayRef<NamedAttribute>", "{}">:$attrs)>
   ];
 
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Hoisting.cpp
@@ -334,3 +334,261 @@
     });
   }
 }
+
+/// Ensure prerequisites that guarantee pad op hoisting can occur.
+/// Return failure in the cases when we cannot perform hoisting; i.e. if either:
+///   1. There exists a use of `simplePadOp` that is not a linalg input operand.
+///   2. There isn't an enclosing `outermostEnclosingForOp` loop.
+///   3. There exists an op with a region that is dominated by
+///   `outermostEnclosingForOp` and that isn't a LoopLikeInterface or a
+///    LinalgOp.
+///   4. None of the enclosing loops has an IV used in computing the padding.
+///
+/// No IR is modified by this function, so a failure leaves the IR untouched.
+///
+/// While ensuring prerequisites:
+///   1. Fill the `backwardSlice` to contain the topologically sorted ops
+///   dominated by `outermostEnclosingForOp`.
+///   2. Fill the `packingLoops` to contain only the enclosing loops of
+///   `backwardSlice` whose IV is actually used in computing padding. Loops that
+///   remain in `backwardSlice` but that are not in `packingLoops` are
+///   dimensions of reuse.
+static LogicalResult
+hoistPaddingOnTensorsPrerequisites(linalg::SimplePadOp simplePadOp, int nLevels,
+                                   llvm::SetVector<Operation *> &backwardSlice,
+                                   llvm::SetVector<Operation *> &packingLoops) {
+  // Bail on any use that isn't an input of a Linalg op.
+  // Hoisting of inplace updates happens after vectorization.
+  for (OpOperand &use : simplePadOp.result().getUses()) {
+    auto linalgUser = dyn_cast<LinalgOp>(use.getOwner());
+    if (!linalgUser || !linalgUser.isInputTensor(&use))
+      return failure();
+  }
+
+  // Get at most nLevels of enclosing loops.
+  SmallVector<LoopLikeOpInterface> reverseEnclosingLoops;
+  Operation *outermostEnclosingForOp = nullptr,
+            *nextEnclosingForOp =
+                simplePadOp->getParentOfType<LoopLikeOpInterface>();
+  while (nLevels-- > 0 && nextEnclosingForOp) {
+    outermostEnclosingForOp = nextEnclosingForOp;
+    reverseEnclosingLoops.push_back(outermostEnclosingForOp);
+    nextEnclosingForOp =
+        nextEnclosingForOp->getParentOfType<LoopLikeOpInterface>();
+  }
+  if (!outermostEnclosingForOp)
+    return failure();
+
+  // Get the backwards slice from `simplePadOp` that is dominated by the
+  // outermost enclosing loop.
+  DominanceInfo domInfo(outermostEnclosingForOp);
+  getBackwardSlice(simplePadOp, &backwardSlice, [&](Operation *op) {
+    return domInfo.dominates(outermostEnclosingForOp, op);
+  });
+
+  // Bail on any op with a region that is not a LoopLikeInterface or a LinalgOp.
+  if (llvm::any_of(backwardSlice, [](Operation *op) {
+        return op->getNumRegions() > 0 && !isa<LoopLikeOpInterface>(op) &&
+               !isa<LinalgOp>(op);
+      }))
+    return failure();
+
+  // Filter out the loops whose induction variable is not used to compute the
+  // padded result. As a first approximation, just look for IVs that have no use
+  // in the backwardSlice.
+  // These are the dimensions of reuse that we can exploit to reduce the amount
+  // of work / memory.
+  // TODO: would this optimization compose better as a canonicalization?
+  for (LoopLikeOpInterface loop : reverseEnclosingLoops) {
+    auto forOp = dyn_cast<scf::ForOp>(loop.getOperation());
+    if (!forOp)
+      continue;
+    for (Operation *user : forOp.getInductionVar().getUsers()) {
+      if (backwardSlice.contains(user)) {
+        packingLoops.insert(forOp);
+        break;
+      }
+    }
+  }
+
+  // With no packing loop there is no dimension to pack along, and the caller
+  // would index into an empty list of cloned loops. Bail before touching IR.
+  if (packingLoops.empty())
+    return failure();
+
+  // Backward slice is a topologically sorted list of ops starting at
+  // `outermostEnclosingForOp`.
+  assert(outermostEnclosingForOp == backwardSlice.front());
+
+  return success();
+}
+
+static Value buildLoopTripCount(OpBuilder &b, Operation *op) {
+  MLIRContext *ctx = op->getContext();
+  AffineExpr lb, ub, step = getAffineSymbolExpr(0, ctx);
+  bindDims(ctx, lb, ub);
+  scf::ForOp forOp = cast<scf::ForOp>(op);
+  return b.create<AffineApplyOp>(
+      op->getLoc(), AffineMap::get(2, 1, {(ub - lb).ceilDiv(step)}, ctx),
+      ValueRange{forOp.lowerBound(), forOp.upperBound(), forOp.step()});
+}
+
+/// Mechanically hoist padding operations on tensors by at most `nLoops` into a
+/// new, generally larger tensor. This achieves packing of multiple padding ops
+/// into a larger tensor. On success, `simplePadOp` is replaced by the cloned
+/// version in the packing loop so the caller can continue reasoning about the
+/// padding operation.
+///
+/// Example in pseudo-mlir:
+/// =======================
+///
+/// If hoistPaddingOnTensors is called with `nLoops` = 2 on the following IR.
+/// ```
+///    scf.for (%i, %j, %k)
+///      %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///      %0 = linalg.simple_pad %st0 pad %pad :
+///             tensor<?x?xf32> to tensor<4x8xf32>
+///      compute(%0)
+/// ```
+///
+/// IR resembling the following is produced:
+///
+/// ```
+///    scf.for (%i) {
+///      %packed_init = linalg.init_tensor range(%j) : tensor<?x4x8xf32>
+///      %packed = scf.for (%k) iter_args(%p : %packed_init)
+///        %st0 = subtensor f(%i, %k) : ... to tensor<?x?xf32>
+///        %0 = linalg.simple_pad %st0 pad %pad :
+///               tensor<?x?xf32> to tensor<4x8xf32>
+///        scf.yield %1: tensor<?x4x8xf32>
+///      } -> tensor<?x4x8xf32>
+///      scf.for (%j, %k) {
+///        %st0 = subtensor %packed [%k, 0, 0][1, 4, 8][1, 1, 1] :
+///                 tensor<?x4x8xf32> to tensor<4x8xf32>
+///        compute(%st0)
+///      }
+///    }
+/// ```
+LogicalResult mlir::linalg::hoistPaddingOnTensors(SimplePadOp &simplePadOp,
+                                                  unsigned nLoops) {
+  llvm::SetVector<Operation *> backwardSlice, packingLoops;
+  if (failed(hoistPaddingOnTensorsPrerequisites(simplePadOp, nLoops,
+                                                backwardSlice, packingLoops)))
+    return failure();
+
+  // Update actual number of loops, which may be smaller.
+  nLoops = packingLoops.size();
+
+  Location loc = simplePadOp->getLoc();
+  RankedTensorType paddedTensorType = simplePadOp.getResultType();
+  unsigned paddedRank = paddedTensorType.getRank();
+
+  // Backward slice is a topologically sorted list of ops starting at
+  // `outermostEnclosingForOp`.
+  Operation *outermostEnclosingForOp = backwardSlice.front();
+  // IP just before the outermost loop considered that we hoist above.
+  OpBuilder b(outermostEnclosingForOp);
+
+  // Create the packed tensor<?x?x..?xpadded_shape> into which we amortize
+  // padding.
+  SmallVector<int64_t> packedShape(nLoops, ShapedType::kDynamicSize);
+  // TODO: go grab dims when necessary, for now SimplePadOp returns a static
+  // tensor.
+  llvm::append_range(packedShape, paddedTensorType.getShape());
+  auto packedTensorType =
+      RankedTensorType::get(packedShape, paddedTensorType.getElementType());
+  auto dynamicSizes = llvm::to_vector<4>(llvm::map_range(
+      packingLoops, [&](Operation *op) { return buildLoopTripCount(b, op); }));
+  Value packedTensor = b.create<linalg::InitTensorOp>(
+      loc, dynamicSizes, packedTensorType.getShape(),
+      packedTensorType.getElementType());
+
+  // Clone the operations involved in the backward slice, iteratively stepping
+  // into the loops that we encounter.
+  // The implementation proceeds in a stack-like fashion:
+  //   1. Iteratively clone and step into the loops, pushing the `packedTensor`
+  //      deeper in the stack.
+  //   2. Create a SubTensorInsert at the top of the stack.
+  //   3. Iteratively pop and yield the result of the SubTensorInsertOp across
+  //     the cloned loops.
+  SmallVector<Value> clonedLoopIvs;
+  clonedLoopIvs.reserve(nLoops);
+  BlockAndValueMapping bvm;
+  // Stack step 1. iteratively clone loops and push `packedTensor`.
+  // Insert `simplePadOp` into the backwardSlice so we clone it too.
+  backwardSlice.insert(simplePadOp);
+  for (Operation *op : backwardSlice) {
+    if (op->getNumRegions() == 0) {
+      b.clone(*op, bvm);
+      continue;
+    }
+    // TODO: support more cases as they appear.
+    auto forOp = dyn_cast<scf::ForOp>(op);
+    assert(forOp && "Expected scf::ForOp when hoisting pad ops");
+    // Unused loop, just skip it.
+    if (!packingLoops.contains(forOp))
+      continue;
+    auto clonedForOp =
+        b.create<scf::ForOp>(loc, forOp.lowerBound(), forOp.upperBound(),
+                             forOp.step(), packedTensor);
+    assert(clonedForOp->getNumRegions() == 1);
+    clonedLoopIvs.push_back(clonedForOp.getInductionVar());
+    b.setInsertionPointToStart(&clonedForOp->getRegion(0).front());
+    bvm.map(forOp.getInductionVar(), clonedLoopIvs.back());
+    packedTensor = clonedForOp.getRegionIterArgs().front();
+  }
+
+  // Stack step 2. create SubTensorInsertOp at the top of the stack.
+  // offsets = [clonedLoopIvs, 0 .. 0].
+  SmallVector<OpFoldResult> offsets(clonedLoopIvs.begin(), clonedLoopIvs.end());
+  offsets.append(paddedRank, b.getIndexAttr(0));
+  // sizes = [1 .. 1, paddedShape].
+  SmallVector<OpFoldResult> sizes(nLoops, b.getIndexAttr(1));
+  for (int64_t sz : paddedTensorType.getShape()) {
+    // TODO: go grab dims when necessary, for now SimplePadOp returns a static
+    // tensor.
+    assert(!ShapedType::isDynamic(sz) && "padded tensor needs static sizes");
+    sizes.push_back(b.getIndexAttr(sz));
+  }
+  // strides = [1 .. 1].
+  SmallVector<OpFoldResult> strides(nLoops + paddedRank, b.getIndexAttr(1));
+
+  Value inserted =
+      b.create<SubTensorInsertOp>(loc, bvm.lookup(simplePadOp.result()),
+                                  packedTensor, offsets, sizes, strides);
+
+  // Stack step 3. iteratively pop the stack and propagate the yield.
+  Value valueToYield = inserted;
+  for (Value iv : llvm::reverse(clonedLoopIvs)) {
+    auto forOp = scf::getForInductionVarOwner(iv);
+    b.setInsertionPointToEnd(&forOp.getRegion().front());
+    b.create<scf::YieldOp>(loc, valueToYield);
+    valueToYield = forOp.getResult(0);
+  }
+
+  // Now the packed tensor is ready, replace the original padding op by a
+  // 1x..x1 SubTensor [originalLoopIvs, 0 .. 0][1 .. 1, paddedShape][1 .. 1].
+  b.setInsertionPoint(simplePadOp);
+  SmallVector<Value> originalLoopIvs =
+      llvm::to_vector<4>(llvm::map_range(packingLoops, [](Operation *loop) {
+        return cast<scf::ForOp>(loop).getInductionVar();
+      }));
+  // offsets = [originalLoopIvs, 0 .. 0].
+  offsets.assign(originalLoopIvs.begin(), originalLoopIvs.end());
+  offsets.append(paddedRank, b.getIndexAttr(0));
+  // sizes = [1 .. 1, paddedShape] (defined above).
+  // strides = [1 .. 1] (defined above)
+  packedTensor =
+      scf::getForInductionVarOwner(clonedLoopIvs.front())->getResult(0);
+  simplePadOp.replaceAllUsesWith(
+      b.create<SubTensorOp>(loc, simplePadOp.getResultType(), packedTensor,
+                            offsets, sizes, strides)
+          ->getResult(0));
+
+  // Make the newly cloned `simplePadOp` available to the caller. The clone must
+  // be looked up before the original op is erased: `simplePadOp.result()` is
+  // invalid once the underlying operation has been destroyed.
+  Operation *originalPadOp = simplePadOp;
+  simplePadOp =
+      cast<SimplePadOp>(bvm.lookup(simplePadOp.result()).getDefiningOp());
+  originalPadOp->erase();
+
+  return success();
+}
diff --git a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
--- a/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
+++ b/mlir/lib/Dialect/StandardOps/IR/Ops.cpp
@@ -3505,6 +3505,69 @@
         staticStridesVector, offsets, sizes, strides, attrs);
 }
 
+/// Dispatch `ofr` into either `dynamicVec` if it is a Value or into `staticVec`
+/// otherwise. In the dynamic case, `sentinel` is appended to `staticVec` to
+/// represent the dynamic value `?`.
+static void unpackOpFoldResult(OpFoldResult ofr,
+                               SmallVectorImpl<Value> &dynamicVec,
+                               SmallVectorImpl<int64_t> &staticVec,
+                               int64_t sentinel) {
+  Value v = ofr.dyn_cast<Value>();
+  if (v) {
+    dynamicVec.push_back(v);
+    staticVec.push_back(sentinel);
+  } else {
+    APInt apInt = ofr.dyn_cast<Attribute>().cast<IntegerAttr>().getValue();
+    staticVec.push_back(apInt.getSExtValue());
+  }
+}
+
+static void unpackOpFoldResults(ArrayRef<OpFoldResult> ofrs,
+                                SmallVectorImpl<Value> &dynamicVec,
+                                SmallVectorImpl<int64_t> &staticVec,
+                                int64_t sentinel) {
+  for (auto ofr : ofrs)
+    unpackOpFoldResult(ofr, dynamicVec, staticVec, sentinel);
+}
+
+// Build a SubTensorOp with mixed static and dynamic entries and custom result
+// type. If the type passed is nullptr, it is inferred.
+void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
+                              RankedTensorType resultType, Value source,
+                              ArrayRef<OpFoldResult> offsets,
+                              ArrayRef<OpFoldResult> sizes,
+                              ArrayRef<OpFoldResult> strides,
+                              ArrayRef<NamedAttribute> attrs) {
+  SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+  SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+  unpackOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+                      ShapedType::kDynamicStrideOrOffset);
+  unpackOpFoldResults(sizes, dynamicSizes, staticSizes,
+                      ShapedType::kDynamicSize);
+  unpackOpFoldResults(strides, dynamicStrides, staticStrides,
+                      ShapedType::kDynamicStrideOrOffset);
+  auto sourceRankedTensorType = source.getType().cast<RankedTensorType>();
+  // Structuring implementation this way avoids duplication between builders.
+  if (!resultType) {
+    resultType =
+        SubTensorOp::inferResultType(sourceRankedTensorType, staticOffsets,
+                                     staticSizes, staticStrides)
+            .cast<RankedTensorType>();
+  }
+  build(b, result, resultType, source, staticOffsets, staticSizes,
+        staticStrides, dynamicOffsets, dynamicSizes, dynamicStrides, attrs);
+}
+
+// Build a SubTensorOp with mixed static and dynamic entries and inferred result
+// type.
+void mlir::SubTensorOp::build(OpBuilder &b, OperationState &result,
+                              Value source, ArrayRef<OpFoldResult> offsets,
+                              ArrayRef<OpFoldResult> sizes,
+                              ArrayRef<OpFoldResult> strides,
+                              ArrayRef<NamedAttribute> attrs) {
+  build(b, result, RankedTensorType(), source, offsets, sizes, strides, attrs);
+}
+
 /// Verifier for SubTensorOp.
 static LogicalResult verify(SubTensorOp op) {
   // Verify result type against inferred type.
@@ -3600,6 +3663,25 @@
         staticStridesVector, offsets, sizes, strides, attrs);
 }
 
+// Build a SubTensorInsertOp with mixed static and dynamic entries.
+void mlir::SubTensorInsertOp::build(OpBuilder &b, OperationState &result,
+                                    Value source, Value dest,
+                                    ArrayRef<OpFoldResult> offsets,
+                                    ArrayRef<OpFoldResult> sizes,
+                                    ArrayRef<OpFoldResult> strides,
+                                    ArrayRef<NamedAttribute> attrs) {
+  SmallVector<int64_t> staticOffsets, staticSizes, staticStrides;
+  SmallVector<Value> dynamicOffsets, dynamicSizes, dynamicStrides;
+  unpackOpFoldResults(offsets, dynamicOffsets, staticOffsets,
+                      ShapedType::kDynamicStrideOrOffset);
+  unpackOpFoldResults(sizes, dynamicSizes, staticSizes,
+                      ShapedType::kDynamicSize);
+  unpackOpFoldResults(strides, dynamicStrides, staticStrides,
+                      ShapedType::kDynamicStrideOrOffset);
+  build(b, result, source, dest, staticOffsets, staticSizes, staticStrides,
+        dynamicOffsets, dynamicSizes, dynamicStrides, attrs);
+}
+
 /// Verifier for SubViewOp.
 static LogicalResult verify(SubTensorInsertOp op) {
   if (op.getType() != op.dest().getType())
diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/hoist-padding.mlir
@@ -0,0 +1,85 @@
+// RUN: mlir-opt %s -test-linalg-transform-patterns=test-hoist-padding-2-level -canonicalize | FileCheck %s
+
+#map0 = affine_map<(d0)[s0] -> (2, -d0 + s0)>
+#map1 = affine_map<(d0)[s0] -> (4, -d0 + s0)>
+#map2 = affine_map<(d0)[s0] -> (3, -d0 + s0)>
+#map3 = affine_map<(d0, d1) -> (2, d0 - d1)>
+#map4 = affine_map<(d0, d1) -> (3, d0 - d1)>
+
+// CHECK-LABEL: func @matmul_tensors
+func @matmul_tensors(
+  %arg0: tensor<?x?xf32>, %arg1: tensor<?x?xf32>, %arg2: tensor<?x?xf32>)
+  -> tensor<?x?xf32>
+{
+  %c2 = constant 2 : index
+  %c3 = constant 3 : index
+  %c4 = constant 4 : index
+  %cst = constant 0.000000e+00 : f32
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+  %0 = dim %arg0, %c0 : tensor<?x?xf32>
+  %1 = dim %arg0, %c1 : tensor<?x?xf32>
+  %2 = dim %arg1, %c1 : tensor<?x?xf32>
+
+  //      CHECK: scf.for
+  //      CHECK:   linalg.init_tensor [%{{.*}}, 2, 4] : tensor<?x2x4xf32>
+  // 1-D loop
+  //      CHECK:   %[[A:.*]] = scf.for
+  //  CHECK-NOT:     scf.for
+  //      CHECK:     subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  //      CHECK:     linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<2x4xf32> pad f32
+  //      CHECK:     subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, 0, 0]
+  // CHECK-SAME:       [1, 2, 4] [1, 1, 1] : tensor<2x4xf32> into tensor<?x2x4xf32>
+  // 2-D loop
+  //      CHECK:   linalg.init_tensor [%{{.*}}, %{{.*}}, 4, 3] : tensor<?x?x4x3xf32>
+  //      CHECK:   %[[B:.*]] = scf.for
+  //      CHECK:     scf.for
+  //  CHECK-NOT:       scf.for
+  //      CHECK:       subtensor %{{.*}} [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  //      CHECK:       linalg.simple_pad %{{.*}} : tensor<?x?xf32> to tensor<4x3xf32> pad f32
+  //      CHECK:       subtensor_insert %{{.*}} into %{{.*}}[%{{.*}}, %{{.*}}, 0, 0]
+  // CHECK-SAME:         [1, 1, 4, 3] [1, 1, 1, 1] : tensor<4x3xf32> into tensor<?x?x4x3xf32>
+  // 2-D loop
+  //      CHECK:   scf.for %[[J:[0-9a-zA-Z]+]]
+  //      CHECK:     scf.for %[[K:[0-9a-zA-Z]+]]
+  //  CHECK-NOT:       scf.for
+  //      CHECK:       %[[stA:.*]] = subtensor %[[A]][%[[K]], 0, 0] [1, 2, 4] [1, 1, 1] :
+  // CHECK-SAME:         tensor<?x2x4xf32> to tensor<2x4xf32>
+  //      CHECK:       %[[stB:.*]] = subtensor %[[B]][%[[K]], %[[J]], 0, 0] [1, 1, 4, 3] [1, 1, 1, 1] :
+  // CHECK-SAME:         tensor<?x?x4x3xf32> to tensor<4x3xf32>
+  //      CHECK:       %[[stC:.*]] = linalg.simple_pad %{{.*}} pad %{{.*}} :
+  // CHECK-SAME:         tensor<?x?xf32> to tensor<2x3xf32> pad f32
+  //      CHECK:       linalg.matmul ins(%[[stA]], %[[stB]] : tensor<2x4xf32>, tensor<4x3xf32>)
+  // CHECK-SAME:         outs(%[[stC]] : tensor<2x3xf32>) -> tensor<2x3xf32>
+  %3 = scf.for %arg3 = %c0 to %0 step %c2 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
+    %4 = scf.for %arg5 = %c0 to %2 step %c3 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
+      %5 = scf.for %arg7 = %c0 to %1 step %c4 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
+        %6 = dim %arg0, %c0 : tensor<?x?xf32>
+        %7 = affine.min #map0(%arg3)[%6]
+        %8 = dim %arg0, %c1 : tensor<?x?xf32>
+        %9 = affine.min #map1(%arg7)[%8]
+        %10 = subtensor %arg0[%arg3, %arg7] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %11 = dim %arg1, %c0 : tensor<?x?xf32>
+        %12 = affine.min #map1(%arg7)[%11]
+        %13 = dim %arg1, %c1 : tensor<?x?xf32>
+        %14 = affine.min #map2(%arg5)[%13]
+        %15 = subtensor %arg1[%arg7, %arg5] [%12, %14] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %16 = dim %arg8, %c0 : tensor<?x?xf32>
+        %17 = affine.min #map3(%16, %arg3)
+        %18 = dim %arg8, %c1 : tensor<?x?xf32>
+        %19 = affine.min #map4(%18, %arg5)
+        %20 = subtensor %arg8[%arg3, %arg5] [%17, %19] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+        %21 = linalg.simple_pad %10 pad %cst : tensor<?x?xf32> to tensor<2x4xf32> pad f32
+        %22 = linalg.simple_pad %15 pad %cst : tensor<?x?xf32> to tensor<4x3xf32> pad f32
+        %23 = linalg.simple_pad %20 pad %cst : tensor<?x?xf32> to tensor<2x3xf32> pad f32
+        %24 = linalg.matmul ins(%21, %22 : tensor<2x4xf32>, tensor<4x3xf32>) outs(%23 : tensor<2x3xf32>) -> tensor<2x3xf32>
+        %25 = subtensor %24[0, 0] [%7, %14] [1, 1] : tensor<2x3xf32> to tensor<?x?xf32>
+        %26 = subtensor_insert %25 into %arg8[%arg3, %arg5] [%17, %19] [%c1, %c1] : tensor<?x?xf32> into tensor<?x?xf32>
+        scf.yield %26 : tensor<?x?xf32>
+      }
+      scf.yield %5 : tensor<?x?xf32>
+    }
+    scf.yield %4 : tensor<?x?xf32>
+  }
+  return %3 : tensor<?x?xf32>
+}
diff --git a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
--- a/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
+++ b/mlir/test/lib/Transforms/TestLinalgTransforms.cpp
@@ -13,6 +13,7 @@
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/Linalg/IR/LinalgOps.h"
+#include "mlir/Dialect/Linalg/Transforms/Hoisting.h"
 #include "mlir/Dialect/Linalg/Transforms/Transforms.h"
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/StandardOps/IR/Ops.h"
@@ -82,6 +83,9 @@
   Option<bool> testTileAndPadPattern{
       *this, "test-tile-and-pad-pattern",
       llvm::cl::desc("Test tile and pad pattern"), llvm::cl::init(false)};
+  Option<bool> testHoistPadding2Levels{*this, "test-hoist-padding-2-level",
+                                       llvm::cl::desc("Test hoist padding"),
+                                       llvm::cl::init(false)};
 };
 } // end anonymous namespace
 
@@ -546,6 +550,11 @@
     return applyAffineMinSCFCanonicalizationPatterns(getFunction());
   if (testTileAndPadPattern)
     return applyTileAndPadPattern(getFunction());
+  if (testHoistPadding2Levels) {
+    getFunction().walk([](linalg::SimplePadOp simplePadOp) {
+      (void)linalg::hoistPaddingOnTensors(simplePadOp, 2);
+    });
+  }
 }
 
 namespace mlir {