diff --git a/mlir/test/Dialect/Linalg/codegen-strategy.mlir b/mlir/test/Dialect/Linalg/codegen-strategy.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/codegen-strategy.mlir +++ /dev/null @@ -1,92 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=2,4,8 vectorize vectorize-contraction-to=matrixintrinsics unroll-vector-transfers=true" -split-input-file | FileCheck %s --check-prefix=CHECK-INTRINSIC -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 promote promote-full-tile-pad register-tile-sizes=2,4,8 vectorize vectorize-contraction-to=outerproduct split-transfers=true unroll-vector-transfers=false" -split-input-file | FileCheck %s --check-prefix=CHECK-OUTER -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 tile-interchange=1,2,0 generalize iterator-interchange=0,2,1" -split-input-file | FileCheck %s --check-prefix=CHECK-INTERCHANGE -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=1,1,0 hoist-paddings=3,3,0" -split-input-file | FileCheck %s --check-prefix=CHECK-PAD -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul tile-sizes=16,32,64 fuse pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 vectorize" -split-input-file | FileCheck %s --check-prefix=CHECK-FUSE -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-func=conv anchor-op=linalg.conv_2d_nhwc_hwcf tile-sizes=1,1,8,32,1,1,8 fuse pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 decompose vectorize vectorize-padding" -split-input-file | FileCheck %s --check-prefix=CHECK-DECOMP - -// CHECK-INTRINSIC: func @matmul( -// CHECK-OUTER: func @matmul( -func.func @matmul(%arg0: memref<72x72xf32>, 
%arg1: memref<72x72xf32>, %arg2: memref<72x72xf32>) { - - // Check the matrix intrinsic lowering is triggered. - // CHECK-INTRINSIC: vector.matrix_multiply - // CHECK-INTRINSIC-SAME: {lhs_columns = 8 : i32, lhs_rows = 2 : i32, rhs_columns = 4 : i32} - // CHECK-INTRINSIC-SAME: (vector<16xf32>, vector<32xf32>) -> vector<8xf32> - - // Check the outer product lowering is triggered. - // CHECK-OUTER: vector.outerproduct {{.*}} : vector<2xf32>, vector<4xf32> - linalg.matmul ins(%arg0, %arg1: memref<72x72xf32>, memref<72x72xf32>) outs(%arg2: memref<72x72xf32>) - func.return -} - -// ----- - -// CHECK-INTERCHANGE: func @matmul( -func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> { - // CHECK-INTERCHANGE-DAG: %[[C16:.*]] = arith.constant 16 - // CHECK-INTERCHANGE-DAG: %[[C32:.*]] = arith.constant 32 - // CHECK-INTERCHANGE-DAG: %[[C64:.*]] = arith.constant 64 - - // Check the tile loops are interchanged. - // CHECK-INTERCHANGE: scf.for {{.*}} step %[[C32]] - // CHECK-INTERCHANGE: scf.for {{.*}} step %[[C64]] - // CHECK-INTERCHANGE: scf.for {{.*}} step %[[C16]] - - // Check the operation has been generalized and interchanged. - // CHECK-INTERCHANGE: linalg.generic - // CHECK-INTERCHANGE-SAME: iterator_types = ["parallel", "reduction", "parallel"] - %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32> - func.return %0 : tensor<72x72xf32> -} - -// ----- - -// CHECK-PAD-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 72, 16)> - -// CHECK-PAD: func @matmul( -func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> { - - // Check the padding of the input operands has been hoisted out of the tile loop nest. - // CHECK-PAD-COUNT=2: tensor.pad %{{.*}} nofold - // CHECK-PAD: scf.for - // Check CSE eliminates the duplicate min operations introduced by tiling. 
- // CHECK-PAD: affine.min #[[MAP0]] - // CHECK-PAD-NOT: affine.min #[[MAP0]] - // CHECK-PAD-COUNT=2: scf.for - // CHECK-PAD: linalg.matmul - %0 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%arg2: tensor<72x72xf32>) -> tensor<72x72xf32> - func.return %0 : tensor<72x72xf32> -} - -// ----- - -// CHECK-FUSE: func @matmul( -func.func @matmul(%arg0: tensor<72x72xf32>, %arg1: tensor<72x72xf32>, %arg2: tensor<72x72xf32>) -> tensor<72x72xf32> { - - // Check the padding and vectorization applies to the fill operation due to the empty anchor op string. - // CHECK-FUSE: %[[CST:.*]] = arith.constant dense<0.000000e+00> - // CHECK-FUSE: vector.transfer_write %[[CST]] - %cst = arith.constant 0.0 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<72x72xf32>) -> tensor<72x72xf32> - - // Check the matmul is padded and vectorized despite the empty anchor op string. - // CHECK-FUSE: vector.outerproduct - %1 = linalg.matmul ins(%arg0, %arg1: tensor<72x72xf32>, tensor<72x72xf32>) outs(%0: tensor<72x72xf32>) -> tensor<72x72xf32> - func.return %1 : tensor<72x72xf32> -} - -// ----- - -// CHECK-DECOMP: func @conv( -func.func @conv(%arg0: tensor<8x18x17x32xf32>, %arg1: tensor<3x3x32x64xf32>, %arg2: tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32> - - // Check the conv is padded by a rank-reducing vector transfer op pair. 
- // CHECK-DECOMP: vector.transfer_read {{.*}}: tensor<1x1x?x8xf32>, vector<1x8x8xf32> - // CHECK-DECOMP: vector.outerproduct - // CHECK-DECOMP: vector.transfer_write {{.*}}: vector<1x8x32xf32>, tensor<1x1x?x32xf32> - %1 = linalg.conv_2d_nhwc_hwcf {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} ins(%arg0, %arg1 : tensor<8x18x17x32xf32>, tensor<3x3x32x64xf32>) outs(%0 : tensor<8x16x15x64xf32>) -> tensor<8x16x15x64xf32> - func.return %1 : tensor<8x16x15x64xf32> -} diff --git a/mlir/test/Dialect/Linalg/decompose-convolution.mlir b/mlir/test/Dialect/Linalg/decompose-convolution.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/decompose-convolution.mlir +++ /dev/null @@ -1,94 +0,0 @@ -// RUN: mlir-opt -test-linalg-codegen-strategy="decompose" -split-input-file %s | FileCheck %s - -// CHECK-LABEL: func @conv2d_nhwc_4x1x2x8_tensor -// CHECK-SAME: (%[[INPUT:.+]]: tensor<4x1x6x3xf32>, %[[FILTER:.+]]: tensor<1x2x3x8xf32>, %[[INIT:.+]]: tensor<4x1x2x8xf32>) -func.func @conv2d_nhwc_4x1x2x8_tensor(%input: tensor<4x1x6x3xf32>, %filter: tensor<1x2x3x8xf32>, %init: tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> { - %0 = linalg.conv_2d_nhwc_hwcf - {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<[3, 2]> : tensor<2xi64>} - ins(%input, %filter : tensor<4x1x6x3xf32>, tensor<1x2x3x8xf32>) - outs(%init : tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> - return %0 : tensor<4x1x2x8xf32> -} - -// CHECK: %[[INPUT_1D:.+]] = tensor.extract_slice %[[INPUT]] -// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [4, 1, 6, 3] [1, 1, 1, 1] : tensor<4x1x6x3xf32> to tensor<4x6x3xf32> -// CHECK: %[[FILTER_1D:.+]] = tensor.extract_slice %[[FILTER]] -// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [1, 2, 3, 8] [1, 1, 1, 1] : tensor<1x2x3x8xf32> to tensor<2x3x8xf32> -// CHECK: %[[INIT_1D:.+]] = tensor.extract_slice %[[INIT]] -// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [4, 1, 2, 8] [1, 1, 1, 1] : tensor<4x1x2x8xf32> to tensor<4x2x8xf32> -// CHECK: %[[CONV_1D:.+]] = linalg.conv_1d_nwc_wcf -// 
CHECK-SAME: dilations = dense<3> : vector<1xi64> -// CHECK-SAME: strides = dense<2> : vector<1xi64> -// CHECK-SAME: ins(%[[INPUT_1D]], %[[FILTER_1D]] : tensor<4x6x3xf32>, tensor<2x3x8xf32>) -// CHECK-SAME: outs(%[[INIT_1D]] : tensor<4x2x8xf32>) -// CHECK: %[[CONV_2D:.+]] = tensor.insert_slice %[[CONV_1D]] into %[[INIT]] -// CHECK-SAME{LITERAL}: [0, 0, 0, 0] [4, 1, 2, 8] [1, 1, 1, 1] : tensor<4x2x8xf32> into tensor<4x1x2x8xf32> -// CHECK: return %[[CONV_2D]] -// ----- - -// CHECK-LABEL: func @conv2d_nhwc_qxqx1xq_tensor -// CHECK-SAME: (%[[INPUT:.+]]: tensor, %[[FILTER:.+]]: tensor, %[[INIT:.+]]: tensor) -func.func @conv2d_nhwc_qxqx1xq_tensor(%input: tensor, %filter: tensor, %init: tensor) -> tensor { - %0 = linalg.conv_2d_nhwc_hwcf - {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<[3, 2]> : tensor<2xi64>} - ins(%input, %filter : tensor, tensor) - outs(%init : tensor) -> tensor - return %0 : tensor -} - -// CHECK: %[[INPUT_1D:.+]] = tensor.extract_slice %[[INPUT]] -// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : -// CHECK-SAME: tensor to tensor -// CHECK: %[[FILTER_1D:.+]] = tensor.extract_slice %[[FILTER]] -// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, 1, %{{.*}}, %{{.*}}] [1, 1, 1, 1] : -// CHECK-SAME: tensor to tensor -// CHECK: %[[INIT_1D:.+]] = tensor.extract_slice %[[INIT]] -// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : -// CHECK-SAME: tensor to tensor -// CHECK: %[[CONV_1D:.+]] = linalg.conv_1d_nwc_wcf -// CHECK-SAME: dilations = dense<2> : vector<1xi64> -// CHECK-SAME: strides = dense<3> : vector<1xi64> -// CHECK-SAME: ins(%[[INPUT_1D]], %[[FILTER_1D]] : tensor, tensor) -// CHECK-SAME: outs(%[[INIT_1D]] : tensor) -// CHECK: %[[CONV_2D:.+]] = tensor.insert_slice %[[CONV_1D]] into %[[INIT]] -// CHECK-SAME: [0, 0, 0, 0] [%{{.*}}, %{{.*}}, 1, %{{.*}}] [1, 1, 1, 1] : -// CHECK-SAME: tensor into tensor -// CHECK: return %[[CONV_2D]] - -// ----- - -// Do not convert convolution ops whose window dimensions are 
not ones. - -// CHECK-LABEL: func @conv2d_nhwc_4x1x2x8_tensor -func.func @conv2d_nhwc_4x1x2x8_tensor(%input: tensor<4x3x5x3xf32>, %filter: tensor<2x2x3x8xf32>, %init: tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> { - // CHECK: linalg.conv_2d_nhwc_hwcf - %0 = linalg.conv_2d_nhwc_hwcf - {dilations = dense<[2, 3]> : tensor<2xi64>, strides = dense<1> : tensor<2xi64>} - ins(%input, %filter : tensor<4x3x5x3xf32>, tensor<2x2x3x8xf32>) - outs(%init : tensor<4x1x2x8xf32>) -> tensor<4x1x2x8xf32> - return %0 : tensor<4x1x2x8xf32> -} - -// ----- - -// CHECK-LABEL: func @depthwise_conv_2d_nhwc_hwc_tensor -func.func @depthwise_conv_2d_nhwc_hwc_tensor(%input: tensor<1x1x113x96xf32>, %filter: tensor<1x3x96xf32>, %out: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32> { - // CHECK: linalg.depthwise_conv_1d_nwc_wc - %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} - ins(%input, %filter: tensor<1x1x113x96xf32>, tensor<1x3x96xf32>) - outs(%out: tensor<1x1x56x96xf32>) -> tensor<1x1x56x96xf32> - return %0: tensor<1x1x56x96xf32> -} - -// ----- - -// Do not convert convolution ops whose window dimensions are not ones. 
- -// CHECK-LABEL: func @depthwise_conv_2d_nhwc_hwc_tensor -func.func @depthwise_conv_2d_nhwc_hwc_tensor(%input: tensor<1x113x113x96xf32>, %filter: tensor<3x3x96xf32>, %out: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32> { - // CHECK: linalg.depthwise_conv_2d_nhwc_hwc - %0 = linalg.depthwise_conv_2d_nhwc_hwc {dilations = dense<1> : vector<2xi64>, strides = dense<2> : vector<2xi64>} - ins(%input, %filter: tensor<1x113x113x96xf32>, tensor<3x3x96xf32>) - outs(%out: tensor<1x56x56x96xf32>) -> tensor<1x56x56x96xf32> - return %0: tensor<1x56x56x96xf32> -} diff --git a/mlir/test/Dialect/Linalg/hoist-padding.mlir b/mlir/test/Dialect/Linalg/hoist-padding.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/hoist-padding.mlir +++ /dev/null @@ -1,480 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATVEC -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matvec pad hoist-paddings=1,1,0 transpose-paddings=[1,0],[0],[0] run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=TRANSP -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad hoist-paddings=1,2,1 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL - -// MATVEC-DAG: #[[DIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)> - -// MATVEC: static_size_divisible -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -func.func @static_size_divisible(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>) -> tensor<24xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c4 = arith.constant 4 : index - - // Pack the vector tiles for all values of IV (IVx4). 
- // MATVEC: = linalg.init_tensor [3, 4] - // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = - // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DIV4]](%[[PIV0]]) - // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [4] - // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]] - // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]] - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { - %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32> - - // Index the packed vector. - // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DIV4]](%[[IV0]]) - // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] - %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = tensor.pad %2 nofold low[%c0] high[%c0] { - ^bb0(%arg5: index): - tensor.yield %cst : f32 - } : tensor<4xf32> to tensor<4xf32> - - // Check matvec uses the packed input vector. - // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T4]] - %4 = linalg.matvec ins(%1, %3 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %4 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -// MATVEC-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 12, 5)> -// MATVEC-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 5)> -// MATVEC-DAG: #[[DIV5:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 5)> -#map0 = affine_map<(d0) -> (5, -d0 + 12)> -#map1 = affine_map<(d0) -> (-d0 + 5)> - -// MATVEC: static_size_not_divisible -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -func.func @static_size_not_divisible(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>) -> tensor<24xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c5 = arith.constant 5 : index - - // Pack the vector tiles for all values of IV (IVx5). 
- // MATVEC: = linalg.init_tensor [3, 5] - // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = - // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DIV5]](%[[PIV0]]) - // MATVEC: %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]]) - // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]] - // MATVEC: %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]]) - // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]] - // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]] - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { - %1 = affine.min #map0(%arg3) - %2 = tensor.extract_slice %arg0[0, %arg3] [24, %1] [1, 1] : tensor<24x12xf32> to tensor<24x?xf32> - - // Index the packed vector. - // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DIV5]](%[[IV0]]) - // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] - %3 = tensor.extract_slice %arg1[%arg3] [%1] [1] : tensor<12xf32> to tensor - %4 = affine.apply #map1(%1) - %5 = tensor.pad %2 low[%c0, %c0] high[%c0, %4] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : f32 - } : tensor<24x?xf32> to tensor<24x5xf32> - %6 = tensor.pad %3 low[%c0] high[%4] { - ^bb0(%arg5: index): - tensor.yield %cst : f32 - } : tensor to tensor<5xf32> - - // Check matvec uses the packed input vector. 
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T4]] - %7 = linalg.matvec ins(%5, %6 : tensor<24x5xf32>, tensor<5xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %7 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -// MATVEC-DAG: #[[SDIV4:[0-9a-z]+]] = affine_map<()[s0] -> (s0 ceildiv 4)> -// MATVEC-DAG: #[[DDIV4:[0-9a-z]+]] = affine_map<(d0) -> (d0 ceildiv 4)> -// MATVEC-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -// MATVEC-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 4)> -#map0 = affine_map<(d0)[s0] -> (4, -d0 + s0)> -#map1 = affine_map<(d0) -> (-d0 + 4)> - -// MATVEC: dynamic_size -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor -func.func @dynamic_size(%arg0: tensor<24x?xf32>, - %arg1: tensor, - %arg2: tensor<24xf32>) -> tensor<24xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - - // MATVEC: %[[D0:.*]] = tensor.dim - %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32> - - // Pack the vector tiles for all values of IV (IVx4). - // MATVEC: %[[PS0:.*]] = affine.apply #[[SDIV4]]()[%[[D0]]] - // MATVEC: = linalg.init_tensor [%[[PS0]], 4] - // MATVEC: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = - // MATVEC: %[[PIDX0:.*]] = affine.apply #[[DDIV4]](%[[PIV0]]) - // MATVEC: %[[TS0:.*]] = affine.min #[[MAP0]](%[[PIV0]])[%[[D0]]] - // MATVEC: %[[T1:.*]] = tensor.extract_slice %[[ARG1]][%[[PIV0]]] [%[[TS0]]] - // MATVEC: %[[HPD0:.*]] = affine.apply #[[MAP1]](%[[TS0]]) - // MATVEC: %[[T2:.*]] = tensor.pad %[[T1]]{{.*}}high[%[[HPD0]] - // MATVEC: %[[T3:.*]] = tensor.insert_slice %[[T1:.*]]{{.*}}[%[[PIDX0]] - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { - %2 = affine.min #map0(%arg3)[%0] - %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32> - - // Index the packed vector. 
- // MATVEC-DAG: %[[IDX0:.*]] = affine.apply #[[DDIV4]](%[[IV0]]) - // MATVEC-DAG: %[[T4:.*]] = tensor.extract_slice %[[T0]][%[[IDX0]] - %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor to tensor - %5 = affine.apply #map1(%2) - %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : f32 - } : tensor<24x?xf32> to tensor<24x4xf32> - %7 = tensor.pad %4 nofold low[%c0] high[%5] { - ^bb0(%arg5: index): - tensor.yield %cst : f32 - } : tensor to tensor<4xf32> - - // Check matvec uses the packed input vector. - // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T4]] - %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %8 : tensor<24xf32> - } - return %1 : tensor<24xf32> -} - -// ----- - -// MATVEC: non_constant_padding -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -func.func @non_constant_padding(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>) -> tensor<24xf32> { - %c4 = arith.constant 4 : index - %c12 = arith.constant 12 : index - %c0 = arith.constant 0 : index - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { - %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32> - - // Check the non constant padding is not hoisted. - // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] - // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]] - %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = tensor.pad %2 nofold low[%c0] high[%c0] { - ^bb0(%arg5: index): - %5 = arith.index_cast %arg3 : index to i32 - %6 = arith.sitofp %5 : i32 to f32 - tensor.yield %6 : f32 - } : tensor<4xf32> to tensor<4xf32> - - // Check matvec uses the padded input vector. 
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]] - %4 = linalg.matvec ins(%1, %3 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %4 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -// MATVEC: non_constant_op_padding -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -func.func @non_constant_op_padding(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>) -> tensor<24xf32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c4 = arith.constant 4 : index - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg3 = %c0 to %c12 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { - %1 = tensor.extract_slice %arg0[0, %arg3] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32> - - // Check the non constant op padding is not hoisted. - // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] - // MATVEC: %[[V0:.*]] = tensor.extract %[[ARG1]][%[[IV0]] - // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]] - // MATVEC: tensor.yield %[[V0]] - %2 = tensor.extract_slice %arg1[%arg3] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = tensor.extract %arg1[%arg3] : tensor<12xf32> - %4 = tensor.pad %2 nofold low[%c0] high[%c0] { - ^bb0(%arg5: index): - tensor.yield %3 : f32 - } : tensor<4xf32> to tensor<4xf32> - - // Check matvec uses the padded input vector. 
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]] - %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %5 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -// MATVEC: non_index_operand -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -// MATVEC-SAME: %[[ARG3:[0-9a-zA-Z]*]]: i32 -func.func @non_index_operand(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>, - %arg3: i32) -> tensor<24xf32> { - %c4 = arith.constant 4 : index - %c12 = arith.constant 12 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) { - %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32> - - // Check the index_cast prevents hoisting due to its non index operand. - // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] - // MATVEC: %[[IDX0:.*]] = arith.index_cast %[[ARG3]] - // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]] - %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = arith.index_cast %arg3 : i32 to index - %4 = tensor.pad %2 nofold low[%3] high[%3] { - ^bb0(%arg6: index): - tensor.yield %cst : f32 - } : tensor<4xf32> to tensor<4xf32> - - // Check matvec uses the padded input vector. 
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]] - %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %5 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -// MATVEC: memory_effect -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -// MATVEC-SAME: %[[ARG3:[0-9a-zA-Z]*]]: memref -func.func @memory_effect(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>, - %arg3: memref) -> tensor<24xf32> { - %c4 = arith.constant 4 : index - %c12 = arith.constant 12 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) { - %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32> - - // Check the load prevents hoisting due to its memory effect. - // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] - // MATVEC: %[[IDX0:.*]] = memref.load %[[ARG3]] - // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]] - %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = memref.load %arg3[%c0] : memref - %4 = tensor.pad %2 nofold low[%3] high[%3] { - ^bb0(%arg6: index): - tensor.yield %cst : f32 - } : tensor<4xf32> to tensor<4xf32> - - // Check matvec uses the padded input vector. 
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]] - %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %5 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -// MATVEC: index_result_loop -// MATVEC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12xf32> -// MATVEC-SAME: %[[ARG3:[0-9a-zA-Z]*]]: index -func.func @index_result_loop(%arg0: tensor<24x12xf32>, - %arg1: tensor<12xf32>, - %arg2: tensor<24xf32>, - %arg3: index) -> tensor<24xf32> { - %c4 = arith.constant 4 : index - %c12 = arith.constant 12 : index - %c0 = arith.constant 0 : index - %cst = arith.constant 0.000000e+00 : f32 - - // MATVEC: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg4 = %c0 to %c12 step %c4 iter_args(%arg5 = %arg2) -> (tensor<24xf32>) { - %1 = tensor.extract_slice %arg0[0, %arg4] [24, 4] [1, 1] : tensor<24x12xf32> to tensor<24x4xf32> - - // Check the unexpected operation with a region prevents hoisting. - // MATVEC: %[[T0:.*]] = tensor.extract_slice %[[ARG1]][%[[IV0]] - // MATVEC: %[[IDX0:.*]] = scf.for {{.*}} step %[[ARG3]] - // MATVEC: %[[T1:.*]] = tensor.pad %[[T0]]{{.*}}%[[IDX0]] - %2 = tensor.extract_slice %arg1[%arg4] [4] [1] : tensor<12xf32> to tensor<4xf32> - %3 = scf.for %arg6 = %c0 to %c12 step %arg3 iter_args(%arg7 = %c0) -> (index) { - %6 = arith.addi %arg3, %arg7 : index - scf.yield %6 : index - } - %4 = tensor.pad %2 nofold low[%3] high[%3] { - ^bb0(%arg6: index): - tensor.yield %cst : f32 - } : tensor<4xf32> to tensor<4xf32> - - // Check matvec uses the padded input vector. 
- // MATVEC: = linalg.matvec ins(%{{.*}}, %[[T1]] - %5 = linalg.matvec ins(%1, %4 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg5 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %5 : tensor<24xf32> - } - return %0 : tensor<24xf32> -} - -// ----- - -#map0 = affine_map<(d0) -> (-d0 + 12, 5)> -#map1 = affine_map<(d0) -> (-d0 + 5)> - -// MATMUL: tile_and_fuse -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<12x6xf32> -// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<6x24xf32> -func.func @tile_and_fuse(%arg0: tensor<12x6xf32>, - %arg1: tensor<6x24xf32>, - %arg2: tensor<12x24xf32>) -> tensor<12x24xf32> { - %c6 = arith.constant 6 : index - %c3 = arith.constant 3 : index - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c5 = arith.constant 5 : index - %cst = arith.constant 0.000000e+00 : f32 - - // Check the second input operand is hoisted by two loop nests. - // MATMUL: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = - // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // MATMUL: %[[T2:.*]] = tensor.pad %[[T1]] - - // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %0 = scf.for %arg3 = %c0 to %c12 step %c5 iter_args(%arg4 = %arg2) -> (tensor<12x24xf32>) { - %1 = affine.min #map0(%arg3) - - // Check the extract_slice op introduced by the double tiling does not prevent the hoisting. - %2 = tensor.extract_slice %arg4[%arg3, 0] [%1, 24] [1, 1] : tensor<12x24xf32> to tensor - %3 = affine.apply #map1(%1) - - // Check the fused and padded fill op does not prevent hoisting. - %4 = tensor.pad %2 nofold low[%c0, %c0] high[%3, %c0] { - ^bb0(%arg5: index, %arg6: index): - tensor.yield %cst : f32 - } : tensor to tensor<5x24xf32> - %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<5x24xf32>) -> tensor<5x24xf32> - %6 = tensor.extract_slice %5[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor - - // Check the first input operand is hoisted by one loop nest. 
- // MATMUL: %[[T3:.*]] = scf.for %[[PIV1:[0-9a-z]+]] = - // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL: %[[T5:.*]] = tensor.pad %[[T4]] - - // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] = - %7 = scf.for %arg5 = %c0 to %c6 step %c3 iter_args(%arg6 = %6) -> (tensor) { - - // Index the packed operands. - // MATMUL-DAG: %[[T6:.*]] = tensor.extract_slice %[[T3]] - // MATMUL-DAG: %[[T7:.*]] = tensor.extract_slice %[[T0]] - %9 = tensor.extract_slice %arg0[%arg3, %arg5] [%1, 3] [1, 1] : tensor<12x6xf32> to tensor - %10 = tensor.extract_slice %arg1[%arg5, 0] [3, 24] [1, 1] : tensor<6x24xf32> to tensor<3x24xf32> - %11 = tensor.extract_slice %arg6[0, 0] [%1, 24] [1, 1] : tensor to tensor - %12 = tensor.pad %9 nofold low[%c0, %c0] high[%3, %c0] { - ^bb0(%arg7: index, %arg8: index): - tensor.yield %cst : f32 - } : tensor to tensor<5x3xf32> - %13 = tensor.pad %10 nofold low[%c0, %c0] high[%c0, %c0] { - ^bb0(%arg7: index, %arg8: index): - tensor.yield %cst : f32 - } : tensor<3x24xf32> to tensor<3x24xf32> - - // Check the output padding is not hoisted. - // MATMUL: %[[T8:.*]] = tensor.pad - %14 = tensor.pad %11 nofold low[%c0, %c0] high[%3, %c0] { - ^bb0(%arg7: index, %arg8: index): - tensor.yield %cst : f32 - } : tensor to tensor<5x24xf32> - - // Check matmul uses the padded operands. 
- // MATMUL: = linalg.matmul ins(%[[T6]], %[[T7]] {{.*}} outs(%[[T8]] - %15 = linalg.matmul ins(%12, %13 : tensor<5x3xf32>, tensor<3x24xf32>) outs(%14 : tensor<5x24xf32>) -> tensor<5x24xf32> - %16 = tensor.extract_slice %15[0, 0] [%1, 24] [1, 1] : tensor<5x24xf32> to tensor - %17 = tensor.insert_slice %16 into %arg6[0, 0] [%1, 24] [1, 1] : tensor into tensor - scf.yield %17 : tensor - } - %8 = tensor.insert_slice %7 into %arg4[%arg3, 0] [%1, 24] [1, 1] : tensor into tensor<12x24xf32> - scf.yield %8 : tensor<12x24xf32> - } - return %0 : tensor<12x24xf32> -} - -// ----- - -#map0 = affine_map<(d0)[s0] -> (-d0 + s0, 4)> -#map1 = affine_map<(d0) -> (-d0 + 4)> - -// TRANSP: transpose -// TRANSP-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x?xf32> -func.func @transpose(%arg0: tensor<24x?xf32>, - %arg1: tensor, - %arg2: tensor<24xf32>) -> tensor<24xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c4 = arith.constant 4 : index - %0 = tensor.dim %arg0, %c1 : tensor<24x?xf32> - - // Transpose the padded matrix. - // TRANSP: %[[T0:.*]] = scf.for %[[PIV0:[0-9a-z]+]] = {{.*}}iter_args(%[[T1:.*]] = - // TRANSP: %[[T2:.*]] = tensor.pad - // TRANSP: %[[T3:.*]] = tensor.extract_slice %[[T1]] - // TRANSP: %[[T4:.*]] = linalg.generic - // TRANSP-SAME: ins(%[[T2]] : tensor<24x4xf32> - // TRANSP-SAME: outs(%[[T3]] : tensor<4x24xf32> - // TRANSP: %[[T5:.*]] = tensor.insert_slice %[[T4]] into %[[T1]] - // TRANSP: scf.yield %[[T5:.*]] - - // TRANSP: scf.for %[[IV0:[0-9a-zA-Z]*]] = - %1 = scf.for %arg3 = %c0 to %0 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24xf32>) { - %2 = affine.min #map0(%arg3)[%0] - %3 = tensor.extract_slice %arg0[0, %arg3] [24, %2] [1, 1] : tensor<24x?xf32> to tensor<24x?xf32> - - // Index the packed vector and transpose back. 
- // TRANSP: %[[T6:.*]] = tensor.extract_slice %[[T0]] - // TRANSP: %[[T7:.*]] = linalg.init_tensor - // TRANSP: %[[T8:.*]] = linalg.generic - // TRANSP-SAME: ins(%[[T6]] : tensor<4x24xf32> - // TRANSP-SAME: outs(%[[T7]] : tensor<24x4xf32> - %4 = tensor.extract_slice %arg1[%arg3] [%2] [1] : tensor to tensor - %5 = affine.apply #map1(%2) - %6 = tensor.pad %3 low[%c0, %c0] high[%c0, %5] { - ^bb0(%arg5: index, %arg6: index): // no predecessors - tensor.yield %cst : f32 - } : tensor<24x?xf32> to tensor<24x4xf32> - %7 = tensor.pad %4 nofold low[%c0] high[%5] { - ^bb0(%arg5: index): // no predecessors - tensor.yield %cst : f32 - } : tensor to tensor<4xf32> - - // Check matvec uses the packed input vector. - // TRANSP: = linalg.matvec ins(%[[T8]] - %8 = linalg.matvec ins(%6, %7 : tensor<24x4xf32>, tensor<4xf32>) outs(%arg4 : tensor<24xf32>) -> tensor<24xf32> - scf.yield %8 : tensor<24xf32> - } - return %1 : tensor<24xf32> -} diff --git a/mlir/test/Dialect/Linalg/interchange.mlir b/mlir/test/Dialect/Linalg/interchange.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/interchange.mlir +++ /dev/null @@ -1,51 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="iterator-interchange=4,0,3,1,2" | FileCheck %s -// RUN: mlir-opt %s -test-linalg-codegen-strategy="iterator-interchange=4,0,3,1,2" -test-linalg-codegen-strategy="iterator-interchange=1,3,4,2,0" | FileCheck --check-prefix=CANCEL-OUT %s - -#map0 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> -#map1 = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)> - -func.func @interchange_generic_op(%arg0 : memref<1x2x3x4x5xindex>, %arg1 : memref<1x2x4xindex>) { - linalg.generic { - indexing_maps = [#map0, #map1], - iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"]} - ins(%arg0 : memref<1x2x3x4x5xindex>) - outs(%arg1 : memref<1x2x4xindex>) { - ^bb0(%arg2 : index, %arg3 : index) : - %0 = linalg.index 0 : index - %1 = linalg.index 1 : index - %2 = linalg.index 4 : index - %3 = 
arith.subi %0, %1 : index - %4 = arith.addi %3, %2 : index - %5 = arith.addi %4, %arg2 : index - linalg.yield %5 : index - } - return -} - -// CHECK-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d4, d2, d0)> -// CHECK-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d1, d3, d2)> -// CHECK: func @interchange_generic_op -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]] -// CHECK-SAME: iterator_types = ["reduction", "parallel", "parallel", "parallel", "reduction"] -// CHECK-DAG: %[[IDX0:.+]] = linalg.index 1 : index -// CHECK-DAG: %[[IDX1:.+]] = linalg.index 3 : index -// CHECK-DAG: %[[IDX4:.+]] = linalg.index 0 : index -// CHECK: %[[T0:.+]] = arith.subi %[[IDX0]], %[[IDX1]] : index -// CHECK: %[[T1:.+]] = arith.addi %[[T0]], %[[IDX4]] : index -// CHECK: %[[T2:.+]] = arith.addi %[[T1]], %{{.*}} : index - -// CANCEL-OUT-DAG: #[[MAP0:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d2, d3, d4)> -// CANCEL-OUT-DAG: #[[MAP1:.+]] = affine_map<(d0, d1, d2, d3, d4) -> (d0, d1, d3)> -// CANCEL-OUT: func @interchange_generic_op -// CANCEL-OUT: linalg.generic -// CANCEL-OUT-SAME: indexing_maps = [#[[MAP0]], #[[MAP1]]] -// CANCEL-OUT-SAME: iterator_types = ["parallel", "parallel", "reduction", "parallel", "reduction"] -// CANCEL-OUT-DAG: %[[IDX0:.+]] = linalg.index 0 : index -// CANCEL-OUT-DAG: %[[IDX1:.+]] = linalg.index 1 : index -// CANCEL-OUT-DAG: %[[IDX4:.+]] = linalg.index 4 : index -// CANCEL-OUT: %[[T0:.+]] = arith.subi %[[IDX0]], %[[IDX1]] : index -// CANCEL-OUT: %[[T1:.+]] = arith.addi %[[T0]], %[[IDX4]] : index -// CANCEL-OUT: %[[T2:.+]] = arith.addi %[[T1]], %{{.*}} : index - - diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/pad.mlir +++ /dev/null @@ -1,600 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 
pack-paddings=1,1,0 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=MATMUL -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,1.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad padding-values=0.:f32,0.:f32 pack-paddings=0,1 padding-dimensions=0,1,2 run-enable-pass=false" -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1,2 pack-paddings=0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=FILL-MATMUL -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32 pack-paddings=1,1,0 padding-dimensions=0,1,2 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=0,1 pack-paddings=1,1,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=PARTIAL -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.depthwise_conv_2d_nhwc_hwc pad padding-values=0.:f32,0.:f32,0.:f32 padding-dimensions=1,2 pack-paddings=1,0,1 run-enable-pass=false" -cse -split-input-file | FileCheck %s --check-prefix=DEPTHWISE_CONV_2D - -// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 12, 7)> -// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)> -#map = affine_map<()[s0] -> (-s0 + 12, 7)> - -// MATMUL: static_sizes_output_divisible -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> -// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32> -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> -// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index -// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index -// 
MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index -func.func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> { - // MATMUL-DAG: %[[CST:.*]] = arith.constant 0. - // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index - - // MATMUL: %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]] - %0 = affine.min #map()[%iv2] - - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG2]] - %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> - %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor - %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> - - // Check statically sized matmul inputs with partially divisible sizes are padded. - // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]] - // MATMUL: %[[T3:.*]] = tensor.pad %[[T0]] nofold - // MATMUL-SAME: [%[[C0]], %[[C0]]] - // MATMUL-SAME: [%[[C0]], %[[V0]] - // MATMUL: tensor.yield %[[CST]] - // MATMUL: %[[T4:.*]] = tensor.pad %[[T1]] nofold - - // Check the statically sized matmul output with fully divisible sizes is not padded. 
- // MATMUL: %[[T5:.*]] = linalg.matmul - // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>) - // MATMUL-SAME: outs(%[[T2]] : tensor<4x5xf32>) - // MATMUL: %[[T6:.*]] = tensor.insert_slice %[[T5]] - %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> - %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> - func.return %5 : tensor<24x25xf32> -} - -// ----- - -// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 25, 7)> -// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)> -#map = affine_map<()[s0] -> (-s0 + 25, 7)> - -// MATMUL: static_sizes_input_divisible -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> -// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index -// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index -// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index -func.func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> { - // MATMUL-DAG: %[[CST:.*]] = arith.constant 0. - // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index - - %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32> - - // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]] - %4 = affine.min #map()[%iv1] - %5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32> - - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]] - %6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32> - - // Check the statically sized matmul output with partially divisible sizes is padded. 
- // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]] - // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] low - // MATMUL-SAME: [%[[C0]], %[[C0]]] - // MATMUL-SAME: [%[[C0]], %[[V0]] - // MATMUL: tensor.yield %[[CST]] - - // MATMUL: %[[T2:.*]] = linalg.matmul - // MATMUL-SAME: outs(%[[T1]] : tensor<4x7xf32>) - // MATMUL: %[[T3:.*]] = tensor.extract_slice %[[T2]] - // MATMUL: %[[T4:.*]] = tensor.insert_slice %[[T3]] - %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32> - %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32> - - // MATMUL: return %[[T4]] - func.return %8 : tensor<24x25xf32> -} - -// ----- - -// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 5)> -// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 7)> -// MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (-s0 + s1, 6)> -// MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)> -// MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)> - -#map0 = affine_map<()[s0, s1] -> (-s0 + s1, 5)> -#map1 = affine_map<()[s0, s1] -> (-s0 + s1, 6)> -#map2 = affine_map<()[s0, s1] -> (-s0 + s1, 7)> - -// MATMUL: dynamic_sizes -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor -// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor -// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index -// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index -// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index -func.func @dynamic_sizes(%arg0: tensor, - %arg1: tensor, - %arg2: tensor, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor { - // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index - // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - - // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]] - // MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]] - // MATMUL-DAG: 
%[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]] - %0 = tensor.dim %arg0, %c0 : tensor - %1 = tensor.dim %arg0, %c1 : tensor - %2 = tensor.dim %arg1, %c1 : tensor - - // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]] - // MATMUL: %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]] - // MATMUL: %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]] - %6 = affine.min #map0()[%iv0, %0] - %7 = affine.min #map1()[%iv2, %1] - %8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor to tensor - %9 = affine.min #map2()[%iv1, %2] - %10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor to tensor - %11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor to tensor - - // Check all matmul operands are padded. - // MATMUL: %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]] - // MATMUL: %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]] - // MATMUL: %[[T3:.*]] = tensor.pad %{{.*}} nofold - // MATMUL-SAME: [%[[C0]], %[[C0]]] - // MATMUL-SAME: [%[[V0]], %[[V1]] - // MATMUL: %[[T4:.*]] = tensor.pad %{{.*}} nofold - // MATMUL: %[[T5:.*]] = tensor.pad %{{.*}} low - - // Check the dynamic matmul has been erased. - // MATMUL-NOT: = linalg.matmul {{.*}} tensor - - // Check all padded matmul operands are statically sized. 
- // MATMUL: %[[T6:.*]] = linalg.matmul - // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>) - // MATMUL-SAME: outs(%[[T5]] : tensor<5x7xf32>) - // MATMUL: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]] - // MATMUL: %[[T8:.*]] = tensor.insert_slice %[[T7]] - %12 = linalg.matmul ins(%8, %10 : tensor, tensor) outs(%11 : tensor) -> tensor - %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor into tensor - - // MATMUL: return %[[T8]] - func.return %13 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// FILL-MATMUL: pad_multiple -// FILL-MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32> -func.func @pad_multiple(%arg0: tensor<64x64xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 0.0 : f32 - %size = affine.min #map0()[%iv0] - - // FILL-MATMUL: %[[T0:.*]] = tensor.extract_slice - %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - - // Check the two operations are padded by the same pad tensor operation. - // FILL-MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] - // FILL-MATMUL: %[[T2:.*]] = linalg.fill {{.*}} outs(%[[T1]] - // FILL-MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]] - // FILL-MATMUL: = tensor.extract_slice %[[T3]] - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor - %2 = linalg.matmul ins(%0, %0 : tensor, tensor) outs(%1 : tensor) -> tensor - func.return %2 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// MATMUL: pad_chain -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32> -func.func @pad_chain(%arg0: tensor<64x64xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 0.0 : f32 - %size = affine.min #map0()[%iv0] - %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - - // Check the matmul at the end of the use-def chain is padded. 
- // MATMUL: %[[T0:.*]] = linalg.fill - // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] - // MATMUL: %[[T2:.*]] = linalg.matmul {{.*}} outs(%[[T1]] - // MATMUL: = tensor.extract_slice %[[T2]] - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor) -> tensor - %2 = linalg.matmul ins(%0, %0 : tensor, tensor) outs(%1 : tensor) -> tensor - func.return %2 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// MATMUL: compose_padding -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32> -func.func @compose_padding(%arg0: tensor<64x64xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 0.0 : f32 - - // MATMUL: %[[SIZE:.*]] = affine.min - %size = affine.min #map0()[%iv0] - - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL-SAME: [0, 0] - // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]] - // MATMUL: %[[T1:.*]] = tensor.pad %[[T0]] - // MATMUL: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]] - // MATMUL: %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]] - %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor to tensor<64x64xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32> - %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<64x64xf32>) -> tensor<64x64xf32> - %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - - // Check there are no additional pad tensor operations. - // MATMUL-NOT: tensor.pad - - // Check the matmul directly uses the result of the fill operation. 
- // MATMUL: %[[T4:.*]] = linalg.matmul ins(%[[T3]] - // MATMUL: %[[T5:.*]] = tensor.extract_slice %[[T4]] - // MATMUL-SAME: [0, 0] - // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]] - %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor - - // MATMUL: return %[[T5]] - func.return %5 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// MATMUL: different_padding_values -func.func @different_padding_values(%arg0: tensor<64x64xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 42.0 : f32 - %size = affine.min #map0()[%iv0] - %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor to tensor<64x64xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32> - %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - - // Different padding values prevent composing the paddings (42.0 vs. 0.0). - // MATMUL: = linalg.fill - // MATMUL: = tensor.pad - // MATMUL: = linalg.matmul - %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor - func.return %5 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// MATMUL: different_padding_dynamic_sizes -func.func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 0.0 : f32 - %size = affine.min #map0()[%iv0] - %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor - %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor to tensor<64x64xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32> - %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - - // Different dynamic sizes prevent composing the paddings (%iv0 vs %size). 
- // MATMUL: = linalg.fill - // MATMUL: = tensor.pad - // MATMUL: = linalg.matmul - %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor - func.return %5 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// MATMUL: different_padding_dynamic_rank -func.func @different_padding_dynamic_rank(%arg0: tensor<64x64x1xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 0.0 : f32 - %size = affine.min #map0()[%iv0] - %0 = tensor.extract_slice %arg0[0, 0, 0] [%size, %size, 1] [1, 1, 1] : tensor<64x64x1xf32> to tensor - %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor to tensor<64x64xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<64x64xf32>) -> tensor<64x64xf32> - %3 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor - - // Different dynamic ranks prevent composing the paddings ([%size, %size, 1] vs [%size, %size]). - // MATMUL: = linalg.fill - // MATMUL: = tensor.pad - // MATMUL: = linalg.matmul - %4 = linalg.matmul ins(%3, %3 : tensor, tensor) outs(%3 : tensor) -> tensor - func.return %4 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// MATMUL: different_padding_static_sizes -func.func @different_padding_static_sizes(%arg0: tensor<62x62xf32>, - %iv0 : index) -> tensor { - %cst = arith.constant 0.0 : f32 - %size = affine.min #map0()[%iv0] - %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor - %1 = tensor.pad %0 low[0, 0] high[%iv0, %iv0] { - ^bb0(%arg3: index, %arg4: index): - tensor.yield %cst : f32 - } : tensor to tensor<62x62xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<62x62xf32>) -> tensor<62x62xf32> - %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor - - // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0). 
- // MATMUL: = linalg.fill - // MATMUL: = tensor.pad - // MATMUL: = linalg.matmul - %5 = linalg.matmul ins(%4, %4 : tensor, tensor) outs(%4 : tensor) -> tensor - func.return %5 : tensor -} - -// ----- - -#map0 = affine_map<()[s0] -> (7, s0)> - -// FILL: scalar_operand -// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32 -// FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32> -func.func @scalar_operand(%arg0: f32, - %arg1: tensor<24x12xf32>, - %iv0 : index) -> tensor<24x12xf32> { - %0 = affine.min #map0()[%iv0] - - // FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]] - // FILL: %[[T1:.*]] = tensor.pad %[[T0]] nofold - %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> - - // Check only the fill output operand is padded. - // FILL: %[[T6:.*]] = linalg.fill ins(%[[ARG0]]{{.*}}outs(%[[T1]] - %2 = linalg.fill ins(%arg0 : f32) outs(%1 : tensor<4x?xf32>) -> tensor<4x?xf32> - %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32> - func.return %3 : tensor<24x12xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (7, s0)> - -// MATMUL: static_extract_slice_missing -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>, -func.func @static_extract_slice_missing(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<4x5xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<4x5xf32> { - %0 = affine.min #map0()[%iv2] - %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> - %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor - - // Check the matmul inputs are padded despite the missing slice for the static output. 
- // MATMUL: %[[T0:.*]] = tensor.pad - // MATMUL: %[[T1:.*]] = tensor.pad - // MATMUL: = linalg.matmul ins(%[[T0]], %[[T1]] - // MATMUL-SAME: outs(%[[ARG2]] - %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32> - func.return %3 : tensor<4x5xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (7, s0)> - -// MATMUL: dynamic_extract_slice_missing -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>, -// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>, -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>, -func.func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> { - %0 = affine.min #map0()[%iv2] - - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]] - // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG2]] - %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor - %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> - - // Check the matmul is not padded due to the missing slice for the dynamic input. 
- // MATMUL: = linalg.matmul ins(%[[ARG0]], %[[T0]] - // MATMUL-SAME: outs(%[[T1]] - %4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> - %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> - func.return %5 : tensor<24x25xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (7, s0)> - -// INPUTS-ONLY: static_input_padding_only -// INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>, -func.func @static_input_padding_only(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> { - %0 = affine.min #map0()[%iv2] - %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> - %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor - - // INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG2]] - %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32> - - // Check the matmul inputs are padded despite the failure to compute a padding value for the static output. 
- // INPUTS-ONLY: %[[T1:.*]] = tensor.pad - // INPUTS-ONLY: %[[T2:.*]] = tensor.pad - // INPUTS-ONLY: = linalg.matmul ins(%[[T1]], %[[T2]] - // INPUTS-ONLY-SAME: outs(%[[T0]] - %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32> - %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32> - func.return %5 : tensor<24x25xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (7, s0)> - -// INPUTS-ONLY: dynamic_input_padding_only -// INPUTS-ONLY-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>, -// INPUTS-ONLY-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>, -// INPUTS-ONLY-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>, -func.func @dynamic_input_padding_only(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> { - %0 = affine.min #map0()[%iv2] - - // INPUTS-ONLY: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // INPUTS-ONLY: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // INPUTS-ONLY: %[[T2:.*]] = tensor.extract_slice %[[ARG2]] - %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32> - %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor - %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32> - - // Check the matmul is not padded due to the failure to compute a padding value for the dynamic output. 
- // INPUTS-ONLY: = linalg.matmul ins(%[[T0]], %[[T1]] - // INPUTS-ONLY-SAME: outs(%[[T2]] - %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor) outs(%3 : tensor<4x?xf32>) -> tensor<4x?xf32> - %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32> - func.return %5 : tensor<24x25xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (64, s0)> - -// FILL: rank_reducing -// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x64x1x64xf32> -func.func @rank_reducing(%arg0: tensor<1x64x1x64xf32>, - %iv0 : index) -> tensor<1x?x?xf32> { - // FILL: %[[CST:.*]] = arith.constant 1. - %cst = arith.constant 0.0 : f32 - %size = affine.min #map0()[%iv0] - %0 = tensor.extract_slice %arg0[0, 0, 0, 0] [1, %size, 1, %size] [1, 1, 1, 1] : tensor<1x64x1x64xf32> to tensor<1x?x?xf32> - - // Check the fill is padded despite the rank-reducing slice operation. - // FILL: %[[T0:.*]] = tensor.pad - // FILL: tensor.yield %[[CST]] - // FILL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]] - // FILL-SAME: tensor<1x64x64xf32> - // FILL: = tensor.extract_slice %[[T1]] - %1 = linalg.fill ins(%cst : f32) outs(%0 : tensor<1x?x?xf32>) -> tensor<1x?x?xf32> - func.return %1 : tensor<1x?x?xf32> -} - -// ----- - -#map0 = affine_map<()[s0] -> (7, s0)> - -// PARTIAL: padding_the_output_dims_only -func.func @padding_the_output_dims_only(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> { - // PARTIAL-DAG: %[[C0:.*]] = arith.constant 0 : index - // PARTIAL-DAG: %[[TS:.*]] = affine.apply - %0 = affine.min #map0()[%iv2] - - // Check only the output dimensions of the matmul are padded. 
- // PARTIAL: %[[T0:.*]] = tensor.pad - // PARTIAL-SAME: [%[[TS]], %[[C0]] - // PARTIAL: %[[T1:.*]] = tensor.pad - // PARTIAL-SAME: [%[[C0]], %[[TS]] - // PARTIAL: %[[T2:.*]] = tensor.pad - // PARTIAL-SAME: [%[[TS]], %[[TS]] - %1 = tensor.extract_slice %arg0[%iv0, %iv2] [%0, %0] [1, 1] : tensor<24x12xf32> to tensor - %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, %0] [1, 1] : tensor<12x25xf32> to tensor - %3 = tensor.extract_slice %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor<24x25xf32> to tensor - - // PARTIAL: = linalg.matmul ins(%[[T0]], %[[T1]] - // PARTIAL-SAME: outs(%[[T2]] - %4 = linalg.matmul ins(%1, %2 : tensor, tensor) outs(%3 : tensor) -> tensor - %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [%0, %0] [1, 1] : tensor into tensor<24x25xf32> - func.return %5 : tensor<24x25xf32> -} - -// ----- - -// DEPTHWISE_CONV_2D-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (4, -s0 + 11)> -// DEPTHWISE_CONV_2D-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2)> -// DEPTHWISE_CONV_2D-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * 2 + 1)> -// DEPTHWISE_CONV_2D-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (s0 * -2 + 8)> -// DEPTHWISE_CONV_2D-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 4)> - -#map0 = affine_map<()[s0] -> (4, -s0 + 11)> -#map1 = affine_map<()[s0] -> (s0 * 2)> -#map2 = affine_map<()[s0] -> (s0 * 2 + 1)> - -// DEPTHWISE_CONV_2D: depthwise_conv_2d_padding -// DEPTHWISE_CONV_2D-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<1x23x3x16xf32> -// DEPTHWISE_CONV_2D-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<3x3x16xf32> -// DEPTHWISE_CONV_2D-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<1x13x1x16xf32> -// DEPTHWISE_CONV_2D-SAME: %[[IV0:[0-9a-zA-Z]*]]: index -func.func @depthwise_conv_2d_padding(%arg0: tensor<1x23x3x16xf32>, - %arg1: tensor<3x3x16xf32>, - %arg2: tensor<1x13x1x16xf32>, - %iv0: index) -> tensor<1x?x1x16xf32> { - // DEPTHWISE_CONV_2D-DAG: %[[CST:.*]] = arith.constant 0. 
- // DEPTHWISE_CONV_2D-DAG: %[[C0:.*]] = arith.constant 0 : index - // DEPTHWISE_CONV_2D-DAG: %[[T0:.*]] = affine.min #[[MAP0]]()[%[[IV0]]] - %0 = affine.min #map0()[%iv0] - %1 = affine.apply #map1()[%iv0] - %2 = affine.apply #map2()[%0] - - // DEPTHWISE_CONV_2D: %[[T3:.*]] = tensor.extract_slice %[[ARG0]] - // DEPTHWISE_CONV_2D: %[[T4:.*]] = tensor.extract_slice %[[ARG2]] - %3 = tensor.extract_slice %arg0[0, %1, 0, 0] [1, %2, 3, 16] [1, 1, 1, 1] : tensor<1x23x3x16xf32> to tensor<1x?x3x16xf32> - %4 = tensor.extract_slice %arg2[0, %iv0, 0, 0] [1, %0, 1, 16] [1, 1, 1, 1] : tensor<1x13x1x16xf32> to tensor<1x?x1x16xf32> - - // Check the padding on the input. - // DEPTHWISE_CONV_2D: %[[T5:.*]] = affine.apply #[[MAP3]]()[%[[T0]]] - // DEPTHWISE_CONV_2D: %[[T6:.*]] = tensor.pad %[[T3]] - // DEPTHWISE_CONV_2D-SAME: low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] - // DEPTHWISE_CONV_2D-SAME: high[%[[C0]], %[[T5]], %[[C0]], %[[C0]]] - // DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32 - - // Check the padding on the output. - // DEPTHWISE_CONV_2D: %[[T7:.*]] = affine.apply #[[MAP4]]()[%[[T0]]] - // DEPTHWISE_CONV_2D: %[[T8:.*]] = tensor.pad %[[T4]] - // DEPTHWISE_CONV_2D-SAME: low[%[[C0]], %[[C0]], %[[C0]], %[[C0]]] - // DEPTHWISE_CONV_2D-SAME: high[%[[C0]], %[[T7]], %[[C0]], %[[C0]]] - // DEPTHWISE_CONV_2D: tensor.yield %[[CST]] : f32 - - // DEPTHWISE_CONV_2D: %[[T9:.*]] = linalg.depthwise_conv_2d_nhwc_hwc - // DEPTHWISE_CONV_2D-SAME: ins(%[[T6]], %[[ARG1]] : tensor<1x9x3x16xf32>, tensor<3x3x16xf32>) - // DEPTHWISE_CONV_2D-SAME: outs(%[[T8]] : tensor<1x4x1x16xf32>) - %5 = linalg.depthwise_conv_2d_nhwc_hwc - {dilations = dense<1> : tensor<2xi64>, strides = dense<2> : tensor<2xi64>} - ins(%3, %arg1 : tensor<1x?x3x16xf32>, tensor<3x3x16xf32>) - outs(%4 : tensor<1x?x1x16xf32>) -> tensor<1x?x1x16xf32> - - // Check the extract_slice to crop the padded output before return. 
- // DEPTHWISE_CONV_2D: %[[T10:.*]] = tensor.extract_slice %[[T9]][0, 0, 0, 0] - // DEPTHWISE_CONV_2D-SAME: [1, %[[T0]], 1, 16] - // DEPTHWISE_CONV_2D: return %[[T10]] : tensor<1x?x1x16xf32> - return %5 : tensor<1x?x1x16xf32> -} diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/tile-and-fuse-no-fuse.mlir +++ /dev/null @@ -1,40 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=0,0,0 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=MATMUL %s -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.elemwise_unary fuse tile-sizes=32,32,0 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=UNARY %s - -// MATMUL-LABEL: @tile_sizes_zero( -func.func @tile_sizes_zero(%arg0 : tensor, %arg1 : tensor) -> tensor { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %cst = arith.constant 0.0 : f32 - %d0 = tensor.dim %arg0, %c0 : tensor - %d1 = tensor.dim %arg1, %c1 : tensor - %init = linalg.init_tensor [%d0, %d1] : tensor - - // MATMUL-NOT: scf.for - // MATMUL: linalg.fill - %fill = linalg.fill ins(%cst : f32) outs(%init : tensor) -> tensor - - // MATMUL-NOT: scf.for - // MATMUL: linalg.matmul - %result = linalg.matmul ins(%arg0, %arg1 : tensor, tensor) - outs(%fill : tensor) -> tensor - func.return %result : tensor -} - -// ----- - -// UNARY-LABEL: @shape_only( -func.func @shape_only(%arg0 : tensor, %arg1 : tensor) -> tensor { - %cst = arith.constant 0.0 : f32 - - // UNARY: linalg.fill - %0 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor) -> tensor - - // UNARY: scf.for - // UNARY: scf.for - // UNARY-NOT: linalg.fill - // UNARY: linalg.elemwise_unary - %1 = linalg.elemwise_unary {fun = #linalg.unary_fn} - ins(%arg0 : tensor) outs(%0 : tensor) -> tensor - func.return %1 : tensor -} diff --git 
a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir +++ /dev/null @@ -1,323 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s - -// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)> -// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)> -// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 24, d0)> -// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)> - -// MATMUL: fuse_input -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> -func.func @fuse_input(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<24x12xf32>) -> tensor<24x12xf32> - - // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] = - // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] = - // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]]) - // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] = - // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]]) - - // Tile both input operand dimensions. 
- // MATMUL: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]]) - // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]]) - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL-SAME: %[[IV1]], %[[IV2]] - // MATMUL-SAME: %[[UB1]], %[[UB2]] - // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]] - // MATMUL: %{{.*}} = linalg.matmul ins(%[[T1]] - %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> - func.return %1 : tensor<24x25xf32> -} - -// ----- - -// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)> -// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 25, 4)> - -// MATMUL: fuse_output -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> -func.func @fuse_output(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { - // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index - // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 : index - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> - - // Update the iteration argument of the outermost tile loop. - // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]] - // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]] - // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]]) - // MATMUL: %[[TS0:.*]] = affine.min #[[MAP1]](%[[IV0]]) - - // Tile the both output operand dimensions. 
- // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]] - // MATMUL-SAME: %[[IV1]], %[[IV0]] - // MATMUL-SAME: %[[TS1]], %[[TS0]] - // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]] - // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]] - - // Check there is an extract/insert slice pair for the output operand. - // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG5]], %[[C0]] - // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG5]], %[[C1]] - // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG5]] - // MATMUL-SAME: 0, 0 - // MATMUL-SAME: %[[D0]], %[[D1]] - // MATMUL: %[[T3:.*]] = linalg.matmul {{.*}} outs(%[[T2]] - // MATMUL: %{{.*}} = tensor.insert_slice %[[T3]] into %[[ARG5]] - // MATMUL-SAME: 0, 0 - // MATMUL-SAME: %[[D0]], %[[D1]] - %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32> - func.return %1 : tensor<24x25xf32> -} - -// ----- - -// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 25, 4)> -// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)> -// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 25, d0)> -// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)> -#map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -#map1 = affine_map<(d0, d1, d2) -> (d0, d2)> - -// MATMUL: fuse_reduction -// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32> -// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32> -func.func @fuse_reduction(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) { - ^bb0(%arg4: f32, 
%arg5: f32): - %2 = arith.addf %arg4, %arg5 : f32 - linalg.yield %2 : f32 - } -> tensor<12x25xf32> - - // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] = - // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] = - // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]]) - // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] = - // MATMUL: %[[TS2:.*]] = affine.min #[[MAP1]](%[[IV2]]) - // MATMUL: %[[UB2:.*]] = affine.min #[[MAP3]](%[[TS2]], %[[IV2]]) - // MATMUL: %[[UB0:.*]] = affine.min #[[MAP2]](%[[TS0]], %[[IV0]]) - - // Tile only the parallel dimensions but not the reduction dimension. - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]] - // MATMUL-SAME: %[[IV2]], 0, %[[IV0]] - // MATMUL-SAME: %[[UB2]], 7, %[[UB0]] - // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // MATMUL-SAME: %[[IV2]], %[[IV0]] - // MATMUL-SAME: %[[UB2]], %[[UB0]] - // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]] - // MATMUL: %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]] - %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> - func.return %1 : tensor<24x25xf32> -} - -// ----- - -#map0 = affine_map<(d0, d1) -> (d1, d0)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> - -// MATMUL: fuse_transposed -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> -// MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32> -func.func @fuse_transposed(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>, - %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) { - ^bb0(%arg4: f32, %arg5: f32): - %2 = arith.addf %arg4, %arg5 : f32 - linalg.yield %2 : f32 - } -> tensor<24x12xf32> - - 
// MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] = - // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] = - // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] = - - // Swap the input operand slice offsets due to the transposed indexing map. - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG3]] - // MATMUL-SAME: %[[IV2]], %[[IV1]] - // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL-SAME: %[[IV1]], %[[IV2]] - // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]] - // MATMUL: %{{.*}} = linalg.matmul ins(%[[T2]] - %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> - func.return %1 : tensor<24x25xf32> -} - -// ----- - -// MATMUL: fuse_input_and_output -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32> -// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32> -func.func @fuse_input_and_output(%arg0: tensor<24x12xf32>, - %arg1: tensor<12x25xf32>, - %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<24x12xf32>) -> tensor<24x12xf32> - %1 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32> - - // Fuse both producers to the appropriate tile loops. 
- // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]] - // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG4:.*]] = %[[ARG3]] - // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG4]] - // MATMUL-SAME: %[[IV1]], %[[IV0]] - // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]] - // MATMUL: scf.for %[[IV2:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[T1]] - // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL-SAME: %[[IV1]], %[[IV2]] - // MATMUL: %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]] - // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG5]] - // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]] - %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32> - func.return %2 : tensor<24x25xf32> -} - -// ----- - -// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)> -#map0 = affine_map<(d0, d1) -> (d1, d0)> - -// MATMUL: fuse_indexed -// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32> -func.func @fuse_indexed(%arg0: tensor<24x12xi32>, - %arg1: tensor<12x25xi32>, - %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %0 = linalg.generic {indexing_maps = [#map0], iterator_types = ["parallel", "parallel"]} outs(%arg1 : tensor<12x25xi32>) { - ^bb0(%arg3: i32): - %6 = linalg.index 0 : index - %7 = linalg.index 1 : index - %8 = arith.addi %6, %7 : index - %9 = arith.index_cast %8 : index to i32 - linalg.yield %9 : i32 - } -> tensor<12x25xi32> - - // MATMUL: scf.for %[[IV0:[0-9a-zA-Z]*]] = - // MATMUL: scf.for %[[IV1:[0-9a-zA-Z]*]] = - // MATMUL: scf.for %[[IV2:[0-9a-zA-Z]*]] = - - // Shift the indexes by the slice offsets and swap the offsets due to the transposed indexing map. 
- // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // MATMUL-SAME: %[[IV2]], %[[IV0]] - // MATMUL: linalg.generic {{.*}} outs(%[[T1]] - // MATMUL: %[[IDX0:.*]] = linalg.index 0 - // MATMUL: %[[IDX0_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX0]], %[[IV0]]) - // MATMUL: %[[IDX1:.*]] = linalg.index 1 - // MATMUL: %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]]) - // MATMUL: %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]] - %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xi32>, tensor<12x25xi32>) outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32> - func.return %1 : tensor<24x25xi32> -} - -// ----- - -#map0 = affine_map<(d0, d1) -> (d0, d1)> -#map1 = affine_map<(d0, d1) -> (d0)> - -// GENERIC: fuse_outermost_reduction -// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32> -// GENERIC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32> -func.func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>, - %arg1: tensor<10xf32>) -> tensor<10xf32> { - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32> - - // Cannot fuse the output fill since the reduction loop is the outermost loop. - // GENERIC: %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG1]] - %1 = linalg.fill ins(%cst : f32) outs(%arg1 : tensor<10xf32>) -> tensor<10xf32> - - // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[T0]] - // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG3:.*]] = %[[ARG2]] - - // MATMUL the input fill has been fused. 
- // GENERIC: %[[T1:.*]] = tensor.extract_slice %[[ARG0]] - // GENERIC-SAME: %[[IV1]], %[[IV0]] - // GENERIC: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]] - // GENERIC: %[[T3:.*]] = tensor.extract_slice %[[ARG3]] - // GENERIC-SAME: %[[IV1]] - // GENERIC: linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]] - %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) { - ^bb0(%arg2: f32, %arg3: f32): - %3 = arith.addf %arg2, %arg3 : f32 - linalg.yield %3 : f32 - } -> tensor<10xf32> - func.return %2 : tensor<10xf32> -} - -// ----- - -// GENERIC-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)> -// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (-d0 - d1 + 17, 8)> -// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (-d1 - d2 + 17, d0)> -#map0 = affine_map<(d0, d1) -> (d0, d0 + d1)> -#map1 = affine_map<(d0, d1) -> (d0, d1)> - -// GENERIC: fuse_non_rectangular -// GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32> -func.func @fuse_non_rectangular(%arg0: tensor<10x17xf32>, - %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> { - - // GENERIC-DAG: %[[C0:.*]] = arith.constant 0 : index - // GENERIC-DAG: %[[C4:.*]] = arith.constant 4 : index - // GENERIC-DAG: %[[C5:.*]] = arith.constant 5 : index - // GENERIC-DAG: %[[C8:.*]] = arith.constant 8 : index - // GENERIC-DAG: %[[C10:.*]] = arith.constant 10 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32> - - // GENERIC: scf.for %[[IV0:[0-9a-zA-Z]*]] = %[[C0]] to %[[C8]] step %[[C4]] - // GENERIC: scf.for %[[IV1:[0-9a-zA-Z]*]] = %[[C0]] to %[[C10]] step %[[C5]] - - // Compute producer on a hyper rectangular bounding box. 
Along the second dimenson, - // the offset is set to the sum of the induction variables, and the upper bound - // to either 8 (tile size) or 17 (sum of max indices (9+7) then + 1) minus the - // induction variables. - // GENERIC-DAG: %[[SUM:.*]] = affine.apply #[[MAP0]](%[[IV1]], %[[IV0]] - // GENERIC-DAG: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]], %[[IV0]] - // GENERIC-DAG: %[[UB1:.*]] = affine.min #[[MAP2]](%[[TS1]], %[[IV1]], %[[IV0]] - // GENERIC: %[[T0:.*]] = tensor.extract_slice %[[ARG0]] - // GENERIC-SAME: %[[IV1]], %[[SUM]] - // GENERIC-SAME: , %[[UB1]] - // GENERIC: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]] - %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) { - ^bb0(%arg2: f32, %arg3: f32): - %2 = arith.addf %arg2, %arg3 : f32 - linalg.yield %2 : f32 - } -> tensor<10x8xf32> - func.return %1 : tensor<10x8xf32> -} diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir deleted file mode 100644 --- a/mlir/test/Dialect/Linalg/tile-and-fuse-sequence-on-tensors.mlir +++ /dev/null @@ -1,84 +0,0 @@ -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.conv_2d fuse tile-sizes=4,4,0,0 tile-interchange=0,1,2,3 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=CONV %s -// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=4,4,0 tile-interchange=0,1,2 run-enable-pass=false" -split-input-file | FileCheck --check-prefix=MATMUL %s - -// CONV: fuse_conv_chain -// CONV-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<2x2xf32> -// CONV-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<11x11xf32> -// CONV-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<10x10xf32> -// CONV-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<9x9xf32> -// CONV-SAME: %[[ARG4:[0-9a-zA-Z]*]]: tensor<8x8xf32> -func.func @fuse_conv_chain(%arg0: tensor<2x2xf32>, - %arg1: 
tensor<11x11xf32>, - %arg2: tensor<10x10xf32>, - %arg3: tensor<9x9xf32>, - %arg4: tensor<8x8xf32>) -> tensor<8x8xf32> { - %cst = arith.constant 1.0 : f32 - - // Do not tile the filter fill since the filter dimensions are not tiled. - // CONV: %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG0]] - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<2x2xf32>) -> tensor<2x2xf32> - - // Fuse all other operations. - // CONV: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG5:.*]] = %[[ARG4]] - // CONV: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG6:.*]] = %[[ARG5]] - - // CONV: %[[T1:.*]] = tensor.extract_slice %[[ARG1]] - // CONV-SAME: %[[IV0]], %[[IV1]] - // CONV: %[[T2:.*]] = tensor.extract_slice %[[ARG2]] - // CONV-SAME: %[[IV0]], %[[IV1]] - // CONV: %[[T3:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T2]] - // CONV: %[[T4:.*]] = linalg.conv_2d ins(%[[T1]], %[[T0]] : {{.*}} outs(%[[T3]] - %1 = linalg.fill ins(%cst : f32) outs(%arg2 : tensor<10x10xf32>) -> tensor<10x10xf32> - %2 = linalg.conv_2d ins(%arg1, %0 : tensor<11x11xf32>, tensor<2x2xf32>) outs(%1 : tensor<10x10xf32>) -> tensor<10x10xf32> - - // CONV: %[[T5:.*]] = tensor.extract_slice %[[ARG3]] - // CONV-SAME: %[[IV0]], %[[IV1]] - // CONV: %[[T6:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T5]] - // CONV: %[[T7:.*]] = linalg.conv_2d ins(%[[T4]], %[[T0]] : {{.*}} outs(%[[T6]] - %3 = linalg.fill ins(%cst : f32) outs(%arg3 : tensor<9x9xf32>) -> tensor<9x9xf32> - %4 = linalg.conv_2d ins(%2, %0 : tensor<10x10xf32>, tensor<2x2xf32>) outs(%3 : tensor<9x9xf32>) -> tensor<9x9xf32> - - // Use the argument passed in by iteration argument. 
- // CONV: %[[T8:.*]] = tensor.extract_slice %[[ARG6]] - // CONV-SAME: %[[IV0]], %[[IV1]] - // CONV: %[[T9:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T8]] - // CONV: %[[T5:.*]] = linalg.conv_2d ins(%[[T7]], %[[T0]] {{.*}} outs(%[[T9]] - %5 = linalg.fill ins(%cst : f32) outs(%arg4 : tensor<8x8xf32>) -> tensor<8x8xf32> - %6 = linalg.conv_2d ins(%4, %0 : tensor<9x9xf32>, tensor<2x2xf32>) outs(%5 : tensor<8x8xf32>) -> tensor<8x8xf32> - return %6 : tensor<8x8xf32> -} - -// ----- - -// MATMUL: fuse_matmul_chain -// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<8x8xf32> -func.func @fuse_matmul_chain(%arg0: tensor<8x8xf32>) -> tensor<8x8xf32> { - %c0 = arith.constant 0 : index - %c12 = arith.constant 12 : index - %c25 = arith.constant 25 : index - %c24 = arith.constant 24 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - - // Do not tile rhs fill of the producer matmul since none of its loop dimension is tiled. - // MATMUL: %[[T0:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[ARG0]] - %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<8x8xf32>) -> tensor<8x8xf32> - - // MATMUL: scf.for %[[IV0:.*]] = {{.*}} iter_args(%[[ARG1:.*]] = %[[ARG0]] - // MATMUL: scf.for %[[IV1:.*]] = {{.*}} iter_args(%[[ARG2:.*]] = %[[ARG1]] - - // Only the outermost loop of the producer matmul is tiled. - // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG0]] - // MATMUL-SAME: %[[IV0]], 0 - // MATMUL: %[[T2:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T1]] - // MATMUL: %[[T3:.*]] = linalg.matmul ins(%[[T2]], %[[T0]] {{.*}} - %1 = linalg.matmul ins(%0, %0 : tensor<8x8xf32>, tensor<8x8xf32>) outs(%0 : tensor<8x8xf32>) -> tensor<8x8xf32> - - // Use the argument passed in by iteration argument. 
- // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG2]] - // MATMUL-SAME: %[[IV0]], %[[IV1]] - // MATMUL: %[[T5:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T4]] - // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T5]] - %2 = linalg.matmul ins(%1, %0 : tensor<8x8xf32>, tensor<8x8xf32>) outs(%0 : tensor<8x8xf32>) -> tensor<8x8xf32> - return %2 : tensor<8x8xf32> -} diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir deleted file mode 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/benchmark_matmul.mlir +++ /dev/null @@ -1,113 +0,0 @@ -// RUN: export M=24 && export K=64 && export N=192 && export ITERS=10 && \ -// RUN: cat %s | sed 's@${M}@'"$M"'@g'| sed 's@${K}@'"$K"'@g' | sed 's@${N}@'"$N"'@g'| sed 's@${ITERS}@'"$ITERS"'@g'| \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.matmul register-tile-sizes=12,32,16 vectorize" | \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=linalg.fill register-tile-sizes=4,32 vectorize" | \ -// RUN: mlir-opt -test-linalg-codegen-strategy="anchor-func=matmul anchor-op=memref.copy register-tile-sizes=4,32 vectorize" | \ - -// RUN: mlir-opt -pass-pipeline="func.func(canonicalize,convert-vector-to-scf,lower-affine,convert-linalg-to-loops)" | \ -// RUN: mlir-opt -pass-pipeline="func.func(canonicalize,convert-scf-to-cf),convert-vector-to-llvm,convert-memref-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts" | \ -// RUN: mlir-cpu-runner -O3 -e main -entry-point-result=void \ -// Activate to dump assembly -// R_UN: -dump-object-file -object-filename=/tmp/a.o \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_runner_utils%shlibext \ -// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ -// Use tee to both print to stderr and FileCheck -// RUN: tee -a /dev/stderr | FileCheck %s - - -!elem_type_a = f32 -!elem_type_b = f32 
-!elem_type_c = f32 -!row_major_A = memref<${M}x${K}x!elem_type_a> -!row_major_B = memref<${K}x${N}x!elem_type_b> -!row_major_C = memref<${M}x${N}x!elem_type_c> - -func.func @matmul(%a: !row_major_A, %b: !row_major_B, %c: !row_major_C) -// TODO: activate manually for now. -// attributes { passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} -{ - linalg.matmul ins(%a, %b : !row_major_A, !row_major_B) - outs(%c: !row_major_C) - return -} - -func.func @print_perf(%iters: index, %total_time: f64) { - %c2 = arith.constant 2 : index - %cM = arith.constant ${M} : index - %cN = arith.constant ${N} : index - %cK = arith.constant ${K} : index - - %mn = arith.muli %cM, %cN : index - %mnk = arith.muli %mn, %cK : index - - // 2*M*N*K. - %flops_per_iter = arith.muli %c2, %mnk : index - %flops = arith.muli %iters, %flops_per_iter : index - %flops_i64 = arith.index_cast %flops : index to i64 - %flops_f = arith.sitofp %flops_i64 : i64 to f64 - %flops_per_s = arith.divf %flops_f, %total_time : f64 - vector.print %flops_per_s : f64 - - return -} - -func.func @main() { - %v0 = arith.constant 0.0 : !elem_type_a - %v1 = arith.constant 1.0 : !elem_type_a - - %A = memref.alloc() : !row_major_A - %B = memref.alloc() : !row_major_B - %C = memref.alloc() : !row_major_C - - linalg.fill ins(%v1 : !elem_type_a) outs(%A : !row_major_A) - linalg.fill ins(%v1 : !elem_type_b) outs(%B : !row_major_B) - linalg.fill ins(%v0 : !elem_type_c) outs(%C : !row_major_C) - - %c0 = arith.constant 0: index - %c1 = arith.constant 1: index - %iters = arith.constant ${ITERS}: index - - /// Run and dump performance for matmul. 
- /// Preheating run: - scf.for %arg0 = %c0 to %iters step %c1 { - %z = arith.constant 0.0 : !elem_type_c - linalg.fill ins(%z : !elem_type_c) outs(%C : !row_major_C) - func.call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> () - } - %t_start_matmul = call @rtclock() : () -> f64 - scf.for %arg0 = %c0 to %iters step %c1 { - // linalg.matmul writes %C in place, need to reset it to zero every time. - // This is accounts for about 10-15% perf hit on small sizes. - // Once linalg on tensors is ready, fusing fill at the register level will - // be easy. - %z = arith.constant 0.0 : !elem_type_c - linalg.fill ins(%z : !elem_type_c) outs(%C : !row_major_C) - func.call @matmul(%A, %B, %C) : (!row_major_A, !row_major_B, !row_major_C) -> () - } - %t_end_matmul = call @rtclock() : () -> f64 - %tmatmul = arith.subf %t_end_matmul, %t_start_matmul: f64 - call @print_perf(%iters, %tmatmul) : (index, f64) -> () - - // CHECK: {{^0$}} - %C_ref = memref.alloc() : !row_major_C - linalg.fill ins(%v0 : !elem_type_c) outs(%C_ref : !row_major_C) - linalg.matmul ins(%A, %B : !row_major_A, !row_major_B) - outs(%C_ref: !row_major_C) - %act = memref.cast %C : !row_major_C to memref<*xf32> - %exp = memref.cast %C_ref : !row_major_C to memref<*xf32> - %errors = call @verifyMemRefF32(%act, %exp) : (memref<*xf32>, memref<*xf32>) -> i64 - vector.print %errors : i64 - memref.dealloc %C_ref : !row_major_C - - memref.dealloc %A : !row_major_A - memref.dealloc %B : !row_major_B - memref.dealloc %C : !row_major_C - - return -} - -func.func private @rtclock() -> f64 -func.func private @verifyMemRefF32(memref<*xf32>, memref<*xf32>) -> i64 attributes { llvm.emit_c_interface } - -// TODO: init with random, run and check output. 
-// func private @fill_random_f32(memref<*xf32>) diff --git a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt --- a/mlir/test/lib/Dialect/Linalg/CMakeLists.txt +++ b/mlir/test/lib/Dialect/Linalg/CMakeLists.txt @@ -1,6 +1,5 @@ # Exclude tests from libMLIR.so add_mlir_library(MLIRLinalgTestPasses - TestLinalgCodegenStrategy.cpp TestLinalgElementwiseFusion.cpp TestLinalgFusionTransforms.cpp TestLinalgHoisting.cpp diff --git a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp b/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp deleted file mode 100644 --- a/mlir/test/lib/Dialect/Linalg/TestLinalgCodegenStrategy.cpp +++ /dev/null @@ -1,294 +0,0 @@ -//===- TestLinalgCodegenStrategy.cpp - Test Linalg codegen strategy -------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements logic for testing the Linalg codegen strategy. 
-// -//===----------------------------------------------------------------------===// - -#include - -#include "mlir/Dialect/Affine/IR/AffineOps.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/Linalg/IR/Linalg.h" -#include "mlir/Dialect/Linalg/Transforms/CodegenStrategy.h" -#include "mlir/Dialect/Linalg/Utils/Utils.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/IR/PatternMatch.h" -#include "mlir/Parser/Parser.h" -#include "mlir/Pass/Pass.h" - -#include "llvm/ADT/SetVector.h" - -using namespace mlir; -using namespace mlir::linalg; - -namespace { -struct TestLinalgCodegenStrategy - : public PassWrapper> { - MLIR_DEFINE_EXPLICIT_INTERNAL_INLINE_TYPE_ID(TestLinalgCodegenStrategy) - - StringRef getArgument() const final { return "test-linalg-codegen-strategy"; } - StringRef getDescription() const final { - return "Test Linalg Codegen Strategy."; - } - TestLinalgCodegenStrategy() = default; - TestLinalgCodegenStrategy(const TestLinalgCodegenStrategy &pass) - : PassWrapper(pass) {} - - void getDependentDialects(DialectRegistry ®istry) const override { - // clang-format off - registry.insert(); - // clang-format on - } - - template - void applyStrategyToNamedLinalgOp(); - - void runOnOperation() override; - - void runStrategy(const LinalgTilingAndFusionOptions &tilingAndFusionOptions, - const LinalgTilingOptions &tilingOptions, - const LinalgTilingOptions ®isterTilingOptions, - LinalgPaddingOptions paddingOptions, - vector::VectorContractLowering vectorContractLowering, - vector::VectorTransferSplit vectorTransferSplit); - - Option fuse{ - *this, "fuse", - llvm::cl::desc("Fuse the producers after tiling the root op."), - llvm::cl::init(false)}; - ListOption tileSizes{*this, "tile-sizes", - llvm::cl::desc("Specifies the tile sizes.")}; - ListOption tileInterchange{ - *this, "tile-interchange", - llvm::cl::desc("Specifies the tile interchange.")}; - - Option promote{ - *this, "promote", - 
llvm::cl::desc("Promote the tile into a small aligned memory buffer."), - llvm::cl::init(false)}; - Option promoteFullTile{ - *this, "promote-full-tile-pad", - llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."), - llvm::cl::init(false)}; - ListOption registerTileSizes{ - *this, "register-tile-sizes", - llvm::cl::desc( - "Specifies the size of the register tile that will be used " - " to vectorize")}; - Option registerPromote{ - *this, "register-promote", - llvm::cl::desc( - "Promote the register tile into a small aligned memory buffer."), - llvm::cl::init(false)}; - Option registerPromoteFullTile{ - *this, "register-promote-full-tile-pad", - llvm::cl::desc("Pad the small aligned memory buffer to the tile sizes."), - llvm::cl::init(false)}; - Option pad{*this, "pad", llvm::cl::desc("Pad the operands."), - llvm::cl::init(false)}; - ListOption paddingValues{ - *this, "padding-values", - llvm::cl::desc("Operand padding values parsed by the attribute parser.")}; - ListOption paddingDimensions{ - *this, "padding-dimensions", - llvm::cl::desc("Operation iterator dimensions to pad.")}; - ListOption packPaddings{*this, "pack-paddings", - llvm::cl::desc("Operand packing flags.")}; - ListOption hoistPaddings{*this, "hoist-paddings", - llvm::cl::desc("Operand hoisting depths.")}; - ListOption> transposePaddings{ - *this, "transpose-paddings", - llvm::cl::desc( - "Transpose paddings. Specify a operand dimension interchange " - "using the following format:\n" - "-transpose-paddings=[1,0,2],[0,1],[0,1]\n" - "It defines the interchange [1, 0, 2] for operand one and " - "the interchange [0, 1] (no transpose) for the remaining operands." 
- "All interchange vectors have to be permuations matching the " - "operand rank.")}; - Option generalize{*this, "generalize", - llvm::cl::desc("Generalize named operations."), - llvm::cl::init(false)}; - ListOption iteratorInterchange{ - *this, "iterator-interchange", - llvm::cl::desc("Specifies the iterator interchange.")}; - Option decompose{ - *this, "decompose", - llvm::cl::desc("Decompose convolutions to lower dimensional ones."), - llvm::cl::init(false)}; - Option vectorize{ - *this, "vectorize", - llvm::cl::desc("Rewrite the linalg op as a vector operation."), - llvm::cl::init(false)}; - Option vectorizePadding{ - *this, "vectorize-padding", - llvm::cl::desc("Rewrite pad tensor ops as vector operations."), - llvm::cl::init(false)}; - Option splitVectorTransfersTo{ - *this, "split-transfers", - llvm::cl::desc( - "Split vector transfers between slow (masked) and fast " - "(unmasked) variants. Possible options are:\n" - "\tnone: keep unsplit vector.transfer and pay the full price\n" - "\tmemref.copy: use linalg.fill + memref.copy for the slow path\n" - "\tvector-transfers: use extra small unmasked vector.transfer for" - " the slow path\n"), - llvm::cl::init("none")}; - Option vectorizeContractionTo{ - *this, "vectorize-contraction-to", - llvm::cl::desc("the type of vector op to use for linalg contractions"), - llvm::cl::init("outerproduct")}; - Option unrollVectorTransfers{ - *this, "unroll-vector-transfers", - llvm::cl::desc("Enable full unrolling of vector.transfer operations"), - llvm::cl::init(false)}; - Option runEnablePass{ - *this, "run-enable-pass", - llvm::cl::desc("Run the enable pass between transformations"), - llvm::cl::init(true)}; - Option anchorOpName{ - *this, "anchor-op", - llvm::cl::desc( - "Which single linalg op is the anchor for the codegen strategy to " - "latch on:\n" - "\tlinalg.matmul: anchor on linalg.matmul\n" - "\tlinalg.matmul_column_major: anchor on linalg.matmul_column_major\n" - "\tmemref.copy: anchor on memref.copy\n" - 
"\tlinalg.fill: anchor on linalg.fill\n"), - llvm::cl::init("")}; - Option anchorFuncOpName{ - *this, "anchor-func", - llvm::cl::desc( - "Which single func op is the anchor for the codegen strategy to " - "latch on."), - llvm::cl::init("")}; -}; - -void TestLinalgCodegenStrategy::runStrategy( - const LinalgTilingAndFusionOptions &tilingAndFusionOptions, - const LinalgTilingOptions &tilingOptions, - const LinalgTilingOptions ®isterTilingOptions, - LinalgPaddingOptions paddingOptions, - vector::VectorContractLowering vectorContractLowering, - vector::VectorTransferSplit vectorTransferSplit) { - std::string anchorOpNameOrWildcard = fuse ? "" : anchorOpName.getValue(); - CodegenStrategy strategy; - strategy - .tileAndFuseIf(fuse && !tileSizes.empty(), anchorOpName, - tilingAndFusionOptions) - .tileIf(!fuse && !tileSizes.empty(), anchorOpName, tilingOptions) - .promoteIf(!fuse && promote, anchorOpName, - LinalgPromotionOptions() - .setAlignment(16) - .setUseFullTileBuffersByDefault(promoteFullTile)) - .tileIf(!fuse && !registerTileSizes.empty(), anchorOpName, - registerTilingOptions) - .promoteIf(!fuse && registerPromote, anchorOpName, - LinalgPromotionOptions() - .setAlignment(16) - .setUseFullTileBuffersByDefault(registerPromoteFullTile)) - .padIf(pad, anchorOpNameOrWildcard, std::move(paddingOptions)) - .decomposeIf(decompose) - .generalizeIf(generalize, anchorOpNameOrWildcard) - .interchangeIf(!iteratorInterchange.empty(), iteratorInterchange) - .vectorizeIf(vectorize, anchorOpNameOrWildcard, nullptr, vectorizePadding) - .vectorLowering( - LinalgVectorLoweringOptions() - .setVectorTransformsOptions( - vector::VectorTransformsOptions() - .setVectorTransformsOptions(vectorContractLowering) - .setVectorTransferSplit(vectorTransferSplit)) - .setVectorTransferToSCFOptions( - VectorTransferToSCFOptions().enableFullUnroll( - unrollVectorTransfers)) - .enableTransferPartialRewrite() - .enableContractionLowering() - .enableTransferToSCFConversion()); - // Created a nested 
OpPassManager and run. - func::FuncOp funcOp = getOperation(); - OpPassManager dynamicPM("func.func"); - strategy.configurePassPipeline(dynamicPM, funcOp.getContext(), runEnablePass); - if (failed(runPipeline(dynamicPM, funcOp))) - return signalPassFailure(); -} -} // namespace - -/// Apply transformations specified as patterns. -void TestLinalgCodegenStrategy::runOnOperation() { - if (!anchorFuncOpName.empty() && anchorFuncOpName != getOperation().getName()) - return; - - LinalgTilingAndFusionOptions tilingAndFusionOptions; - tilingAndFusionOptions.tileSizes = {tileSizes.begin(), tileSizes.end()}; - tilingAndFusionOptions.tileInterchange = {tileInterchange.begin(), - tileInterchange.end()}; - - LinalgTilingOptions tilingOptions; - if (!tileSizes.empty()) - tilingOptions = tilingOptions.setTileSizes(tileSizes); - if (!tileInterchange.empty()) - tilingOptions = tilingOptions.setInterchange( - SmallVector(tileInterchange.begin(), tileInterchange.end())); - - LinalgTilingOptions registerTilingOptions; - if (!registerTileSizes.empty()) - registerTilingOptions = - registerTilingOptions.setTileSizes(registerTileSizes); - - // Parse the padding values. - SmallVector paddingValueAttributes; - for (const std::string &paddingValue : paddingValues) { - paddingValueAttributes.push_back( - parseAttribute(paddingValue, &getContext())); - } - - // Parse the transpose vectors. 
- LinalgPaddingOptions paddingOptions; - paddingOptions.setPaddingValues(paddingValueAttributes); - paddingOptions.setPaddingDimensions( - SmallVector{paddingDimensions.begin(), paddingDimensions.end()}); - paddingOptions.setPackPaddings( - SmallVector{packPaddings.begin(), packPaddings.end()}); - paddingOptions.setHoistPaddings( - SmallVector{hoistPaddings.begin(), hoistPaddings.end()}); - paddingOptions.setTransposePaddings(transposePaddings); - - vector::VectorContractLowering vectorContractLowering = - llvm::StringSwitch<vector::VectorContractLowering>( - vectorizeContractionTo.getValue()) - .Case("matrixintrinsics", vector::VectorContractLowering::Matmul) - .Case("dot", vector::VectorContractLowering::Dot) - .Case("outerproduct", vector::VectorContractLowering::OuterProduct) - .Default(vector::VectorContractLowering::OuterProduct); - vector::VectorTransferSplit vectorTransferSplit = - llvm::StringSwitch<vector::VectorTransferSplit>( - splitVectorTransfersTo.getValue()) - .Case("none", vector::VectorTransferSplit::None) - .Case("memref-copy", vector::VectorTransferSplit::LinalgCopy) - .Case("vector-transfers", vector::VectorTransferSplit::VectorTransfer) - .Default(vector::VectorTransferSplit::None); - - runStrategy(tilingAndFusionOptions, tilingOptions, registerTilingOptions, - paddingOptions, vectorContractLowering, vectorTransferSplit); -} - -namespace mlir { -namespace test { -void registerTestLinalgCodegenStrategy() { - PassRegistration<TestLinalgCodegenStrategy>(); -} -} // namespace test -} // namespace mlir diff --git a/mlir/tools/mlir-opt/mlir-opt.cpp b/mlir/tools/mlir-opt/mlir-opt.cpp --- a/mlir/tools/mlir-opt/mlir-opt.cpp +++ b/mlir/tools/mlir-opt/mlir-opt.cpp @@ -86,7 +86,6 @@ void registerTestGenericIRVisitorsInterruptPass(); void registerTestInterfaces(); void registerTestLastModifiedPass(); -void registerTestLinalgCodegenStrategy(); void registerTestLinalgElementwiseFusion(); void registerTestLinalgFusionTransforms(); void registerTestLinalgTensorFusionTransforms(); @@ -185,7 +184,6 @@
mlir::test::registerTestGenericIRVisitorsPass(); mlir::test::registerTestInterfaces(); mlir::test::registerTestLastModifiedPass(); - mlir::test::registerTestLinalgCodegenStrategy(); mlir::test::registerTestLinalgElementwiseFusion(); mlir::test::registerTestLinalgFusionTransforms(); mlir::test::registerTestLinalgTensorFusionTransforms();