diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h --- a/mlir/include/mlir/Dialect/Linalg/Passes.h +++ b/mlir/include/mlir/Dialect/Linalg/Passes.h @@ -21,6 +21,9 @@ std::unique_ptr createLinalgFusionOfTensorOpsPass(); std::unique_ptr createFoldReshapeOpsByLinearizationPass(); +std::unique_ptr> +createLinalgTileAndFusePass(ArrayRef tileSizes = {}); + std::unique_ptr> createLinalgTilingPass(ArrayRef tileSizes = {}); diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -29,6 +29,19 @@ let dependentDialects = ["linalg::LinalgDialect", "AffineDialect"]; } +def LinalgTileAndFuseOps : FunctionPass<"linalg-tile-and-fuse-ops"> { + let summary = "Tile and fuse sequence of Linalg operations"; + let constructor = "mlir::createLinalgTileAndFusePass()"; + let dependentDialects = [ + "AffineDialect", "linalg::LinalgDialect", "scf::SCFDialect" + ]; + let options = [ + ListOption<"tileSizes", "linalg-tile-sizes", "int64_t", + "Set the tile sizes to use for all the ops", + "llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated"> + ]; +} + def LinalgFoldReshapeOpsByLinearization : Pass<"linalg-fold-reshape-ops-by-linearization"> { let summary = "Fold TensorReshapeOps with generic/indexed generic ops by " diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h --- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h +++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h @@ -39,9 +39,8 @@ struct TiledAndFusedLinalgOps { LinalgOp op; SmallVector fusedProducers; - SmallVector originalProducers; SmallVector fusedLoops; - SmallVector unfusedLoops; + SmallVector, 1> unfusedLoops; }; /// Populates patterns for vectorization of all ConvN-D ops. 
@@ -72,14 +71,12 @@ Optional tileLinalgOp(OpBuilder &b, LinalgOp op, const LinalgTilingOptions &options); -/// Tile and fuse the `op` with its producers. The tile and fuse proceeds in -/// three steps -/// - Find tile loops that are fusable with its producer tile loops (a.k.a. tile -/// + fuse loops). -/// - Tile just these loops of the consumer (root operation) and fuse with -/// the producer. -/// - Tile again the tiled consumer operation produced above to do rest of -/// the tiling specified by the `tilingOptions`. +/// Fuse a sequence of linalg operations (`ops`) using tile-and-fuse. This +/// proceeds as follows: +/// - Find outer parallel loops in these ops that can be fused. +/// - Tile fusable outer parallel loops of the last operation in the sequence. +/// - Fuse the remaining operations with the tiled operation. +/// - Tile the unfused loops in each of the fused operations if needed. /// /// For example, consider the sequence of matmul below /// @@ -110,32 +107,35 @@ /// ins(%2, %3 : memref<16x32xf32, #map0>, memref<32x32xf32, #map1>) /// outs(%0 : memref<16x32xf32, #map0>) /// scf.parallel (%arg6) = (%c0) to (%c32) step (%c8) { -/// scf.for %arg7 = %c0 to %c32 step %c4 { -/// %4 = subview %0[0, %arg7] [16, 4] [1, 1] -/// : memref<16x32xf32, #map0> to memref<16x4xf32, #map0> -/// %5 = subview %arg3[%arg7, %arg6] [4, 8] [1, 1] -/// : memref<32x32xf32> to memref<4x8xf32, #map0> -/// %6 = subview %1[0, %arg6] [16, 8] [1, 1] -/// : memref<16x32xf32, #map0> to memref<16x8xf32, #map0> -/// linalg.matmul -/// ins(%4, %5 : memref<16x4xf32, #map0>, memref<4x8xf32, #map0>) -/// outs(%6 : memref<16x8xf32, #map0>) +/// scf.for %arg7 = %c0 to %c32 step %c4 { +/// %4 = subview %0[0, %arg7] [16, 4] [1, 1] +/// : memref<16x32xf32, #map0> to memref<16x4xf32, #map0> +/// %5 = subview %arg3[%arg7, %arg6] [4, 8] [1, 1] +/// : memref<32x32xf32> to memref<4x8xf32, #map0> +/// %6 = subview %1[0, %arg6] [16, 8] [1, 1] +/// : memref<16x32xf32, #map0> to memref<16x8xf32, #map0> 
+/// linalg.matmul +/// ins(%4, %5 : memref<16x4xf32, #map0>, memref<4x8xf32, #map0>) +/// outs(%6 : memref<16x8xf32, #map0>) /// } /// scf.yield /// } /// scf.yield /// } /// -/// The following tiling options are handled differently in tile+fuse (compared -/// to tile only) +/// `tilingOptions` are used to tile the corresponding operation in `ops` (the +/// size of the former should be same as size of the latter). Based on how +/// tile+fuse is implemented, the fused loops are generated based on the last +/// operation in the sequence. For example, the tile sizes for the fused loops +/// are obtained from `tilingOptions.back()`. The following tiling options are +/// handled differently in tile+fuse (compared to tile only) /// - Interchange of the tiling loops is not supported right now. -/// - Distribution is only done for the tile+fuse loops. The tiled loops +/// - Distribution is only done for the fused loops. The tiled loops /// generated by the second tiling is not distributed. Optional -tileAndFuseLinalgOps(PatternRewriter &rewriter, LinalgOp op, +tileAndFuseLinalgOps(OpBuilder &builder, ArrayRef ops, const LinalgDependenceGraph &dependenceGraph, - const LinalgTilingOptions &tilingOptions, - const LinalgFusionOptions &fusionOptions); + ArrayRef tilingOptions); /// Interchanges the `iterator_types` and `iterator_maps` dimensions of `op`. /// This is an in-place transformation controlled by `interchangeVector`. 
diff --git a/mlir/test/Dialect/Linalg/fusion-pattern.mlir b/mlir/test/Dialect/Linalg/fusion-pattern.mlir --- a/mlir/test/Dialect/Linalg/fusion-pattern.mlir +++ b/mlir/test/Dialect/Linalg/fusion-pattern.mlir @@ -47,7 +47,9 @@ // CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N_2]]] // CHECK: %[[SV3:.+]] = subview %[[ARG2]][%[[IV0]], %[[IV1]]] // CHECK-SAME: [%[[TILE_M_2]], %[[TILE_N_2]]] -// CHECK: linalg.fill(%[[SV3]], %[[CST]]) +// CHECK: %[[SV3_2:.+]] = subview %[[ARG2]][%[[IV0]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N]]] +// CHECK: linalg.fill(%[[SV3_2]], %[[CST]]) // CHECK-SAME: __internal_linalg_transform__ = "after_basic_fusion_producer" // CHECK: scf.for %[[IV2:.+]] = %[[C0]] to %[[K]] step %[[C16]] { // CHECK: %[[TILE_K:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[K]]] @@ -109,9 +111,12 @@ // CHECK: %[[TILE_N_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[N_2]]] // CHECK: %[[SV2:.+]] = subview %[[ARG3]][0, %[[IV0]]] // CHECK-SAME: [%[[M]], %[[TILE_N_2]]] +// CHECK: %[[K_2:.+]] = dim %[[ARG1]], %[[C0]] // CHECK: %[[SV3:.+]] = subview %[[ARG1]][0, %[[IV0]]] -// CHECK-SAME: [%[[K]], %[[TILE_N]]] -// CHECK: linalg.copy(%[[SV3]], %[[SV1]]) +// CHECK-SAME: [%[[K_2]], %[[TILE_N]]] +// CHECK: %[[SV3_2:.+]] = subview %[[ARG2]][0, %[[IV0]]] +// CHECK-SAME: [%[[K_2]], %[[TILE_N]]] +// CHECK: linalg.copy(%[[SV3]], %[[SV3_2]]) // CHECK-SAME: __internal_linalg_transform__ = "after_rhs_fusion_producer" // CHECK-NOT: linalg.fill // CHECK-DAG: %[[M_2:.+]] = dim %[[ARG0]], %[[C0]] @@ -186,11 +191,16 @@ // CHECK: %[[N:.+]] = dim %[[ARG3]], %[[C1]] // CHECK: %[[SV2:.+]] = subview %[[ARG3]][%[[IV0]], 0] // CHECK-SAME: [%[[TILE_M_2]], %[[N]]] +// CHECK: %[[SV2_2:.+]] = subview %[[ARG3]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[N]]] +// CHECK: %[[K_2:.+]] = dim %[[ARG0]], %[[C1]] // CHECK: %[[SV3:.+]] = subview %[[ARG0]][%[[IV0]], 0] -// CHECK-SAME: [%[[TILE_M]], %[[K]]] -// CHECK: linalg.copy(%[[SV3]], %[[SV1]]) +// CHECK-SAME: [%[[TILE_M]], %[[K_2]]] +// 
CHECK: %[[SV1_2:.+]] = subview %[[ARG1]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[K_2]]] +// CHECK: linalg.copy(%[[SV3]], %[[SV1_2]]) // CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_producer" -// CHECK: linalg.fill(%[[SV2]], %[[CST]]) +// CHECK: linalg.fill(%[[SV2_2]], %[[CST]]) // CHECK-SAME: __internal_linalg_transform__ = "after_two_operand_fusion_producer" // CHECK-DAG: %[[N_2:.+]] = dim %[[ARG2]], %[[C1]] // CHECK: scf.parallel (%[[IV1:.+]]) = @@ -253,23 +263,26 @@ // CHECK: scf.parallel (%[[IV0:.+]]) = // CHECK-SAME: (%[[C0]]) to (%[[M]]) step (%[[C32]]) { // CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]] -// CHECK: %[[K2:.+]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[K:.+]] = dim %[[ARG2]], %[[C1]] // CHECK: %[[SV1:.+]] = subview %[[ARG2]][%[[IV0]], 0] -// CHECK-SAME: [%[[TILE_M]], %[[K2]]] +// CHECK-SAME: [%[[TILE_M]], %[[K]]] // CHECK: %[[M_2:.+]] = dim %[[ARG4]], %[[C0]] // CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M_2]]] // CHECK: %[[N:.+]] = dim %[[ARG4]], %[[C1]] // CHECK: %[[SV2:.+]] = subview %[[ARG4]][%[[IV0]], 0] // CHECK-SAME: [%[[TILE_M_2]], %[[N]]] -// CHECK: %[[K1:.+]] = dim %[[ARG0]], %[[C1]] +// CHECK: %[[N_2:.+]] = dim %[[ARG1]], %[[C1]] +// CHECK: %[[K_2:.+]] = dim %[[ARG0]], %[[C1]] // CHECK: %[[SV3:.+]] = subview %[[ARG0]][%[[IV0]], 0] -// CHECK-SAME: [%[[TILE_M]], %[[K1]]] -// CHECK: %[[SV4:.+]] = subview %[[ARG1]][0, 0] [%[[K1]], %[[K2]]] +// CHECK-SAME: [%[[TILE_M]], %[[K_2]]] +// CHECK: %[[SV4:.+]] = subview %[[ARG1]][0, 0] [%[[K_2]], %[[N_2]]] +// CHECK: %[[SV5:.+]] = subview %[[ARG2]][%[[IV0]], 0] +// CHECK: [%[[TILE_M]], %[[N_2]]] // CHECK: linalg.matmul // CHECK-SAME: __internal_linalg_transform__ = "after_lhs_fusion_producer" // CHECK-SAME: ins(%[[SV3]], %[[SV4]] // CHECK-SAME: : memref, memref) -// CHECK-SAME: outs(%[[SV1]] : memref) +// CHECK-SAME: outs(%[[SV5]] : memref) // CHECK-DAG: %[[N_2:.+]] = dim %[[ARG3]], %[[C1]] // CHECK: scf.parallel (%[[IV1:.+]]) = // 
CHECK-SAME: (%[[C0]]) to (%[[N_2]]) step (%[[C64]]) { diff --git a/mlir/test/Dialect/Linalg/fusion-sequence.mlir b/mlir/test/Dialect/Linalg/fusion-sequence.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/fusion-sequence.mlir @@ -0,0 +1,207 @@ +// RUN: mlir-opt -pass-pipeline="linalg-tile-and-fuse-ops{linalg-tile-sizes=16,32,64},canonicalize,cse" -split-input-file %s | FileCheck %s + +module { + func @three_op_fusion(%arg0: memref, %arg1: memref, + %arg2: memref, %arg3 : memref) { + %cst = constant 0.000000e+00 : f32 + %c0 = constant 0 : index + %c1 = constant 1 : index + %d0 = dim %arg0, %c0 : memref + %d1 = dim %arg1, %c1 : memref + %0 = alloc(%d0, %d1) : memref + linalg.fill(%0, %cst) : memref, f32 + linalg.matmul ins(%arg0, %arg1 : memref, memref) + outs(%0 : memref) + linalg.generic + {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1) -> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%0, %arg2 : memref, memref) + outs(%arg3 : memref) { + ^bb0(%arg4 : f32, %arg5 : f32, %arg6 : f32) : + %5 = addf %arg4, %arg5 : f32 + linalg.yield %5 : f32 + } + return + } +} + +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (d0 + s0)> +// CHECK: func @three_op_fusion +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: memref +// CHECK: %[[TEMP:.+]] = alloc(%{{.*}}, %{{.*}}) : memref +// CHECK: scf.parallel (%[[IV0:.+]], %[[IV1:.+]]) = {{.*}} { +// CHECK-DAG: %[[SV_TEMP:.+]] = subview %[[TEMP]][%[[IV0]], %[[IV1]]] +// CHECK-DAG: %[[SV_ARG2:.+]] = subview %[[ARG2]][%[[IV1]]] +// CHECK-DAG: %[[SV_ARG3:.+]] = subview %[[ARG3]][%[[IV0]], %[[IV1]]] +// CHECK-DAG: %[[SV_ARG0:.+]] = subview %[[ARG0]][%[[IV0]], 0] +// CHECK-DAG: %[[SV_ARG1:.+]] = subview %[[ARG1]][0, %[[IV1]]] +// 
CHECK: linalg.fill(%[[SV_TEMP]], %{{.+}}) +// CHECK: scf.for %[[IV2:.+]] = %{{.+}} to %{{.+}} step %{{.+}} { +// CHECK: %[[SV_ARG0_K:.+]] = subview %[[SV_ARG0]][0, %[[IV2]]] +// CHECK: %[[SV_ARG1_K:.+]] = subview %[[SV_ARG1]][%[[IV2]], 0] +// CHECK: linalg.matmul +// CHECK-SAME: ins(%[[SV_ARG0_K]], %[[SV_ARG1_K]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV_TEMP]] : memref) +// CHECK: } +// CHECK: linalg.generic +// CHECK-SAME: ins(%[[SV_TEMP]], %[[SV_ARG2]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV_ARG3]] : memref) +// CHECK: scf.yield +// CHECK: } + +// ----- + +module { + func @sequence_of_matmul(%arg0: memref, %arg1: memref, + %arg2: memref, %arg3: memref, + %arg4: memref) { + %cst = constant 0.000000e+00 : f32 + %c0 = constant 0 : index + %c1 = constant 1 : index + %m = dim %arg0, %c0 : memref + %n1 = dim %arg1, %c1 : memref + %n2 = dim %arg2, %c1 : memref + %n3 = dim %arg3, %c1 : memref + %0 = alloc(%m, %n1) : memref + %1 = alloc(%m, %n2) : memref + linalg.fill(%0, %cst) : memref, f32 + linalg.matmul ins(%arg0, %arg1 : memref, memref) + outs(%0 : memref) + linalg.fill(%1, %cst) : memref, f32 + linalg.matmul ins(%0, %arg2 : memref, memref) + outs(%1 : memref) + linalg.fill(%arg4, %cst) : memref, f32 + linalg.matmul ins(%1, %arg3 : memref, memref) + outs(%arg4 : memref) + return + } +} + +// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0)[s0] -> (16, -d0 + s0)> +// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1)[s0, s1] -> (d0 * s1 + s0 + d1)> +// CHECK-DAG: #[[MAP2:.+]] = affine_map<(d0)[s0] -> (32, -d0 + s0)> +// CHECK-DAG: #[[MAP3:.+]] = affine_map<(d0)[s0] -> (64, -d0 + s0)> +// CHECK: func @sequence_of_matmul +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG2:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG3:[a-zA-Z0-9_]+]]: memref +// CHECK-SAME: %[[ARG4:[a-zA-Z0-9_]+]]: memref +// CHECK-DAG: %[[C0:.+]] = constant 0 : index +// CHECK-DAG: %[[C1:.+]] = constant 1 
: index +// CHECK-DAG: %[[C16:.+]] = constant 16 : index +// CHECK-DAG: %[[C32:.+]] = constant 32 : index +// CHECK-DAG: %[[C64:.+]] = constant 64 : index +// CHECK-DAG: %[[M:.+]] = dim %[[ARG0]], %[[C0]] +// CHECK-DAG: %[[N1:.+]] = dim %[[ARG1]], %[[C1]] +// CHECK-DAG: %[[N2:.+]] = dim %[[ARG2]], %[[C1]] +// CHECK: %[[ALLOC1:.+]] = alloc(%[[M]], %[[N1]]) +// CHECK: %[[ALLOC2:.+]] = alloc(%[[M]], %[[N2]]) +// CHECK: scf.parallel (%[[IV0:.+]]) = (%[[C0]]) to (%[[M]]) +// CHECK-SAME: step (%[[C16]]) { +// CHECK: %[[TILE_M:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M]]] +// CHECK: %[[SV_ALLOC2:.+]] = subview %[[ALLOC2]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[N2]]] +// CHECK: %[[M_2:.+]] = dim %[[ARG4]], %[[C0]] +// CHECK: %[[TILE_M_2:.+]] = affine.min #[[MAP0]](%[[IV0]])[%[[M_2]]] +// CHECK: %[[N3:.+]] = dim %[[ARG4]], %[[C1]] +// CHECK: %[[SV_ARG4:.+]] = subview %[[ARG4]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M_2]], %[[N3]]] +// CHECK: %[[SV_ARG4_2:.+]] = subview %[[ARG4]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[N3]]] +// CHECK: %[[SV_ALLOC1:.+]] = subview %[[ALLOC1]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M]], %[[N1]]] +// CHECK: %[[SV_ARG2:.+]] = subview %[[ARG2]][0, 0] [%[[N1]], %[[N2]]] +// CHECK: %[[N0:.+]] = dim %[[ARG0]], %[[C1]] +// CHECK: %[[SV_ARG0:.+]] = subview %[[ARG0]][%[[IV0]], 0] +// CHECK-SAME: [%[[TILE_M:.+]], %[[N0]]] +// CHECK: %[[SV_ARG1:.+]] = subview %[[ARG1]][0, 0] [%[[N0]], %[[N1]]] + +// CHECK: scf.parallel (%[[IV1:.+]]) = (%[[C0]]) to (%[[N1]]) +// CHECK-SAME: step (%[[C32]]) { +// CHECK: %[[TILE_N1:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N1]]] +// CHECK: %[[SV_SV_ALLOC1:.+]] = subview %[[SV_ALLOC1]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N1]]] +// CHECK: linalg.fill(%[[SV_SV_ALLOC1]], %{{.+}}) +// CHECK: } + +// CHECK: scf.parallel (%[[IV1:.+]]) = (%[[C0]]) to (%[[N1]]) +// CHECK-SAME: step (%[[C32]]) { +// CHECK: scf.for %[[IV2:.+]] = %[[C0]] to %[[N0]] step %[[C64]] { +// CHECK: %[[TILE_N0:.+]] = affine.min 
#[[MAP3]](%[[IV2]])[%[[N0]]] +// CHECK: %[[SV_SV_ARG0:.+]] = subview %[[SV_ARG0]][0, %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N0]]] +// CHECK: %[[TILE_N1:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N1]]] +// CHECK: %[[SV_SV_ARG1:.+]] = subview %[[SV_ARG1]][%[[IV2]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_N0]], %[[TILE_N1]]] +// CHECK: %[[SV_SV_ALLOC1:.+]] = subview %[[SV_ALLOC1]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N1]]] +// CHECK: linalg.matmul ins(%[[SV_SV_ARG0]], %[[SV_SV_ARG1]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV_SV_ALLOC1]] : memref) +// CHECK: } +// CHECK: scf.yield +// CHECK: } + +// CHECK: scf.parallel (%[[IV1:.+]]) = (%[[C0]]) to (%[[N2]]) +// CHECK-SAME: step (%[[C32]]) { +// CHECK: %[[TILE_N2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N2]]] +// CHECK: %[[SV_SV_ALLOC2:.+]] = subview %[[SV_ALLOC2]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N2]]] +// CHECK: linalg.fill(%[[SV_SV_ALLOC2]], %{{.+}}) +// CHECK: } +// CHECK: scf.parallel (%[[IV1:.+]]) = (%[[C0]]) to (%[[N2]]) +// CHECK-SAME: step (%[[C32]]) { +// CHECK: scf.for %[[IV2:.+]] = %[[C0]] to %[[N1]] step %[[C64]] { +// CHECK: %[[TILE_N1:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[N1]]] +// CHECK: %[[SV_SV_ALLOC1:.+]] = subview %[[SV_ALLOC1]][0, %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N1]]] +// CHECK: %[[TILE_N2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N2]]] +// CHECK: %[[SV_SV_ARG2:.+]] = subview %[[SV_ARG2]][%[[IV2]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_N1]], %[[TILE_N2]]] +// CHECK: %[[SV_SV_ALLOC2:.+]] = subview %[[SV_ALLOC2]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N2]]] +// CHECK: linalg.matmul ins(%[[SV_SV_ALLOC1]], %[[SV_SV_ARG2]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV_SV_ALLOC2]] : memref) +// CHECK: } +// CHECK: scf.yield +// CHECK: } + +// CHECK: scf.parallel (%[[IV1:.+]]) = (%[[C0]]) to (%[[N3]]) +// CHECK-SAME: step (%[[C32]]) { +// CHECK: %[[TILE_N3:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N3]]] +// CHECK: 
%[[SV_SV_ARG4:.+]] = subview %[[SV_ARG4_2]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N3]]] +// CHECK: linalg.fill(%[[SV_SV_ARG4]], %{{.+}}) +// CHECK: } +// CHECK: %[[N3_2:.+]] = dim %[[ARG3]], %[[C1]] +// CHECK: scf.parallel (%[[IV1:.+]]) = (%[[C0]]) to (%[[N3_2]]) +// CHECK-SAME: step (%[[C32]]) { +// CHECK: scf.for %[[IV2:.+]] = %[[C0]] to %[[N2]] step %[[C64]] { +// CHECK: %[[TILE_N2:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[N2]]] +// CHECK: %[[SV_SV_ALLOC2:.+]] = subview %[[SV_ALLOC2]][0, %[[IV2]]] +// CHECK-SAME: [%[[TILE_M]], %[[TILE_N2]]] +// CHECK: %[[N2_2:.+]] = dim %[[ARG3]], %[[C0]] +// CHECK: %[[TILE_N2:.+]] = affine.min #[[MAP3]](%[[IV2]])[%[[N2_2]]] +// CHECK: %[[TILE_N3:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N3_2]]] +// CHECK: %[[SV_ARG3:.+]] = subview %[[ARG3]][%[[IV2]], %[[IV1]]] +// CHECK-SAME: [%[[TILE_N2]], %[[TILE_N3]]] +// CHECK: %[[TILE_N3_2:.+]] = affine.min #[[MAP2]](%[[IV1]])[%[[N3]]] +// CHECK: %[[SV_SV_ARG4:.+]] = subview %[[SV_ARG4]][0, %[[IV1]]] +// CHECK-SAME: [%[[TILE_M_2]], %[[TILE_N3_2]]] +// CHECK: linalg.matmul ins(%[[SV_SV_ALLOC2]], %[[SV_ARG3]] +// CHECK-SAME: : memref, memref) +// CHECK-SAME: outs(%[[SV_SV_ARG4]] : memref) +// CHECK: } +// CHECK: scf.yield +// CHECK: } +