diff --git a/mlir/test/Dialect/Linalg/pad.mlir b/mlir/test/Dialect/Linalg/pad.mlir
--- a/mlir/test/Dialect/Linalg/pad.mlir
+++ b/mlir/test/Dialect/Linalg/pad.mlir
@@ -1,236 +1,183 @@
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-FILL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=MATMUL
+// RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.fill pad pack-paddings=1,1,0 run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=FILL
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul pad pack-paddings=1,1,0 pad-inputs-only run-enable-pass=false" -cse -canonicalize -split-input-file | FileCheck %s --check-prefix=INPUTS-ONLY
 
-// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 12)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 7)>
-#map = affine_map<(d0) -> (7, -d0 + 12)>
-
-// CHECK: static_sizes_output_divisible
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
+// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 12)>
+// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
+#map = affine_map<()[s0] -> (7, -s0 + 12)>
+
+// MATMUL: static_sizes_output_divisible
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
+// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
+// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
 func @static_sizes_output_divisible(%arg0: tensor<24x12xf32>,
                                     %arg1: tensor<12x25xf32>,
-                                    %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C7:.*]] = arith.constant 7
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c7 = arith.constant 7 : index
-  %c5 = arith.constant 5 : index
-  %c4 = arith.constant 4 : index
-
-  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c24 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
-
-    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %1 = scf.for %arg5 = %c0 to %c25 step %c5 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
-
-      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
-      %2 = scf.for %arg7 = %c0 to %c12 step %c7 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
-
-        // CHECK: %[[TS2:.*]] = affine.min #[[MAP0]](%[[IV2]])
-        %3 = affine.min #map(%arg7)
-
-        // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-        // CHECK: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
-        // CHECK: %[[T2:.*]] = tensor.extract_slice %[[ARG4]]
-        %4 = tensor.extract_slice %arg0[%arg3, %arg7] [4, %3] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-        %5 = tensor.extract_slice %arg1[%arg7, %arg5] [%3, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
-        %6 = tensor.extract_slice %arg8[%arg3, %arg5] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
-
-        // Check statically sized matmul inputs with partially divisible sizes are padded.
-        // CHECK: %[[V0:.*]] = affine.apply #[[MAP1]](%[[TS2]])
-        // CHECK: %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
-        // CHECK-SAME: [%[[C0]], %[[C0]]]
-        // CHECK-SAME: [%[[C0]], %[[V0]]
-        // CHECK: %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold
-
-        // Check the statically sized matmul output with fully divisible sizes is not padded.
-        // CHECK: %[[T5:.*]] = linalg.matmul
-        // CHECK-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
-        // CHECK-SAME: outs(%[[T2]] : tensor<4x5xf32>)
-        // CHECK: %[[T6:.*]] = tensor.insert_slice %[[T5]]
-        %7 = linalg.matmul ins(%4, %5 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%6 : tensor<4x5xf32>) -> tensor<4x5xf32>
-        %8 = tensor.insert_slice %7 into %arg8[%arg3, %arg5] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
-
-        // CHECK: scf.yield %[[T6]]
-        scf.yield %8 : tensor<24x25xf32>
-      }
-      scf.yield %2 : tensor<24x25xf32>
-    }
-    scf.yield %1 : tensor<24x25xf32>
-  }
-  return %0 : tensor<24x25xf32>
+                                    %arg2: tensor<24x25xf32>,
+                                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
+
+  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP0]]()[%[[IV2]]]
+  %0 = affine.min #map()[%iv2]
+
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG1]]
+  // MATMUL: %[[T2:.*]] = tensor.extract_slice %[[ARG2]]
+  %1 = tensor.extract_slice %arg0[%iv0, %iv2] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
+  %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
+  %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
+
+  // Check statically sized matmul inputs with partially divisible sizes are padded.
+  // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS2]]]
+  // MATMUL: %[[T3:.*]] = linalg.pad_tensor %[[T0]] nofold
+  // MATMUL-SAME: [%[[C0]], %[[C0]]]
+  // MATMUL-SAME: [%[[C0]], %[[V0]]
+  // MATMUL: %[[T4:.*]] = linalg.pad_tensor %[[T1]] nofold
+
+  // Check the statically sized matmul output with fully divisible sizes is not padded.
+  // MATMUL: %[[T5:.*]] = linalg.matmul
+  // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<4x7xf32>, tensor<7x5xf32>)
+  // MATMUL-SAME: outs(%[[T2]] : tensor<4x5xf32>)
+  // MATMUL: %[[T6:.*]] = tensor.insert_slice %[[T5]]
+  %4 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
+  %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
+  return %5 : tensor<24x25xf32>
 }
 
 // -----
 
-// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0) -> (7, -d0 + 25)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 7)>
-#map = affine_map<(d0) -> (7, -d0 + 25)>
+// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0] -> (7, -s0 + 25)>
+// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 7)>
+#map = affine_map<()[s0] -> (7, -s0 + 25)>
 
-// CHECK: static_sizes_input_divisible
+// MATMUL: static_sizes_input_divisible
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
+// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
 func @static_sizes_input_divisible(%arg0: tensor<24x12xf32>,
                                    %arg1: tensor<12x25xf32>,
-                                   %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C7:.*]] = arith.constant 7
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c25 = arith.constant 25 : index
-  %c24 = arith.constant 24 : index
-  %c6 = arith.constant 6 : index
-  %c7 = arith.constant 7 : index
-  %c4 = arith.constant 4 : index
-
-  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg3 = %c0 to %c24 step %c4 iter_args(%arg4 = %arg2) -> (tensor<24x25xf32>) {
-
-    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %1 = scf.for %arg5 = %c0 to %c25 step %c7 iter_args(%arg6 = %arg4) -> (tensor<24x25xf32>) {
-
-      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
-      %2 = scf.for %arg7 = %c0 to %c12 step %c6 iter_args(%arg8 = %arg6) -> (tensor<24x25xf32>) {
-        %3 = tensor.extract_slice %arg0[%arg3, %arg7] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
-
-        // CHECK: %[[TS1:.*]] = affine.min #[[MAP0]](%[[IV1]])
-        %4 = affine.min #map(%arg5)
-        %5 = tensor.extract_slice %arg1[%arg7, %arg5] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
-
-        // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG4]]
-        %6 = tensor.extract_slice %arg8[%arg3, %arg5] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
-
-        // Check the statically sized matmul output with partially divisible sizes is padded.
-        // CHECK: %[[V0:.*]] = affine.apply #[[MAP1]](%[[TS1]])
-        // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
-        // CHECK-SAME: [%[[C0]], %[[C0]]]
-        // CHECK-SAME: [%[[C0]], %[[V0]]
-
-        // CHECK: %[[T2:.*]] = linalg.matmul
-        // CHECK-SAME: outs(%[[T1]] : tensor<4x7xf32>)
-        // CHECK: %[[T3:.*]] = tensor.extract_slice %[[T2]]
-        // CHECK: %[[T4:.*]] = tensor.insert_slice %[[T3]]
-        %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
-        %8 = tensor.insert_slice %7 into %arg8[%arg3, %arg5] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
-
-        // CHECK: scf.yield %[[T4]]
-        scf.yield %8 : tensor<24x25xf32>
-      }
-      scf.yield %2 : tensor<24x25xf32>
-    }
-    scf.yield %1 : tensor<24x25xf32>
-  }
-  return %0 : tensor<24x25xf32>
+                                   %arg2: tensor<24x25xf32>,
+                                   %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
+  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
+
+  %3 = tensor.extract_slice %arg0[%iv0, %iv2] [4, 6] [1, 1] : tensor<24x12xf32> to tensor<4x6xf32>
+
+  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP0]]()[%[[IV1]]]
+  %4 = affine.min #map()[%iv1]
+  %5 = tensor.extract_slice %arg1[%iv2, %iv1] [6, %4] [1, 1] : tensor<12x25xf32> to tensor<6x?xf32>
+
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
+  %6 = tensor.extract_slice %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<24x25xf32> to tensor<4x?xf32>
+
+  // Check the statically sized matmul output with partially divisible sizes is padded.
+  // MATMUL: %[[V0:.*]] = affine.apply #[[MAP1]]()[%[[TS1]]]
+  // MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] low
+  // MATMUL-SAME: [%[[C0]], %[[C0]]]
+  // MATMUL-SAME: [%[[C0]], %[[V0]]
+
+  // MATMUL: %[[T2:.*]] = linalg.matmul
+  // MATMUL-SAME: outs(%[[T1]] : tensor<4x7xf32>)
+  // MATMUL: %[[T3:.*]] = tensor.extract_slice %[[T2]]
+  // MATMUL: %[[T4:.*]] = tensor.insert_slice %[[T3]]
+  %7 = linalg.matmul ins(%3, %5 : tensor<4x6xf32>, tensor<6x?xf32>) outs(%6 : tensor<4x?xf32>) -> tensor<4x?xf32>
+  %8 = tensor.insert_slice %7 into %arg2[%iv0, %iv1] [4, %4] [1, 1] : tensor<4x?xf32> into tensor<24x25xf32>
+
+  // MATMUL: return %[[T4]]
+  return %8 : tensor<24x25xf32>
 }
 
 // -----
 
-// CHECK-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<(d0)[s0] -> (5, -d0 + s0)>
-// CHECK-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<(d0)[s0] -> (7, -d0 + s0)>
-// CHECK-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<(d0)[s0] -> (6, -d0 + s0)>
-// CHECK-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 5)>
-// CHECK-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<(d0) -> (-d0 + 6)>
-
-#map0 = affine_map<(d0)[s0] -> (5, -d0 + s0)>
-#map1 = affine_map<(d0)[s0] -> (6, -d0 + s0)>
-#map2 = affine_map<(d0)[s0] -> (7, -d0 + s0)>
-
-// CHECK: dynamic_sizes
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
-// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-DAG: #[[MAP0:[0-9a-z]+]] = affine_map<()[s0, s1] -> (5, -s0 + s1)>
+// MATMUL-DAG: #[[MAP1:[0-9a-z]+]] = affine_map<()[s0, s1] -> (7, -s0 + s1)>
+// MATMUL-DAG: #[[MAP2:[0-9a-z]+]] = affine_map<()[s0, s1] -> (6, -s0 + s1)>
+// MATMUL-DAG: #[[MAP3:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 5)>
+// MATMUL-DAG: #[[MAP4:[0-9a-z]+]] = affine_map<()[s0] -> (-s0 + 6)>
+
+#map0 = affine_map<()[s0, s1] -> (5, -s0 + s1)>
+#map1 = affine_map<()[s0, s1] -> (6, -s0 + s1)>
+#map2 = affine_map<()[s0, s1] -> (7, -s0 + s1)>
+
+// MATMUL: dynamic_sizes
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<?x?xf32>
+// MATMUL-SAME: %[[IV0:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV1:[0-9a-zA-Z]*]]: index
+// MATMUL-SAME: %[[IV2:[0-9a-zA-Z]*]]: index
 func @dynamic_sizes(%arg0: tensor<?x?xf32>,
                     %arg1: tensor<?x?xf32>,
-                    %arg2: tensor<?x?xf32>) -> tensor<?x?xf32> {
-  // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index
-  // CHECK-DAG: %[[C1:.*]] = arith.constant 1
-  // CHECK-DAG: %[[C5:.*]] = arith.constant 5
-  // CHECK-DAG: %[[C6:.*]] = arith.constant 6
+                    %arg2: tensor<?x?xf32>,
+                    %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<?x?xf32> {
+  // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
+  // MATMUL-DAG: %[[C1:.*]] = arith.constant 1
   %c1 = arith.constant 1 : index
   %c0 = arith.constant 0 : index
-  %c6 = arith.constant 6 : index
-  %c7 = arith.constant 7 : index
-  %c5 = arith.constant 5 : index
-  // CHECK-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
-  // CHECK-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
-  // CHECK-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
+  // MATMUL-DAG: %[[D0:.*]] = tensor.dim %[[ARG0]], %[[C0]]
+  // MATMUL-DAG: %[[D2:.*]] = tensor.dim %[[ARG0]], %[[C1]]
+  // MATMUL-DAG: %[[D1:.*]] = tensor.dim %[[ARG1]], %[[C1]]
   %0 = tensor.dim %arg0, %c0 : tensor<?x?xf32>
   %1 = tensor.dim %arg0, %c1 : tensor<?x?xf32>
   %2 = tensor.dim %arg1, %c1 : tensor<?x?xf32>
-  // CHECK: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %3 = scf.for %arg3 = %c0 to %0 step %c5 iter_args(%arg4 = %arg2) -> (tensor<?x?xf32>) {
-
-    // CHECK: scf.for %[[IV1:[0-9a-zA-Z]*]] =
-    %4 = scf.for %arg5 = %c0 to %2 step %c7 iter_args(%arg6 = %arg4) -> (tensor<?x?xf32>) {
-
-      // CHECK: scf.for %[[IV2:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG4:.*]] =
-      %5 = scf.for %arg7 = %c0 to %1 step %c6 iter_args(%arg8 = %arg6) -> (tensor<?x?xf32>) {
-
-        // CHECK: %[[TS0:.*]] = affine.min #[[MAP0]](%[[IV0]])[%[[D0]]]
-        // CHECK: %[[TS2:.*]] = affine.min #[[MAP2]](%[[IV2]])[%[[D2]]]
-        // CHECK: %[[TS1:.*]] = affine.min #[[MAP1]](%[[IV1]])[%[[D1]]]
-        %6 = affine.min #map0(%arg3)[%0]
-        %7 = affine.min #map1(%arg7)[%1]
-        %8 = tensor.extract_slice %arg0[%arg3, %arg7] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-        %9 = affine.min #map2(%arg5)[%2]
-        %10 = tensor.extract_slice %arg1[%arg7, %arg5] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-        %11 = tensor.extract_slice %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-
-        // Check all matmul operands are padded.
-        // CHECK: %[[V0:.*]] = affine.apply #[[MAP3]](%[[TS0]])
-        // CHECK: %[[V1:.*]] = affine.apply #[[MAP4]](%[[TS2]])
-        // CHECK: %[[T3:.*]] = linalg.pad_tensor %{{.*}} nofold
-        // CHECK-SAME: [%[[C0]], %[[C0]]]
-        // CHECK-SAME: [%[[V0]], %[[V1]]
-        // CHECK: %[[T4:.*]] = linalg.pad_tensor %{{.*}} nofold
-        // CHECK: %[[T5:.*]] = linalg.pad_tensor %{{.*}} low
-
-        // Check the dynamic matmul has been erased.
-        // CHECK-NOT: = linalg.matmul {{.*}} tensor<?x?xf32>
-
-        // Check all padded matmul operands are statically sized.
-        // CHECK: %[[T6:.*]] = linalg.matmul
-        // CHECK-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
-        // CHECK-SAME: outs(%[[T5]] : tensor<5x7xf32>)
-        // CHECK: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
-        // CHECK: %[[T8:.*]] = tensor.insert_slice %[[T7]]
-        %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
-        %13 = tensor.insert_slice %12 into %arg8[%arg3, %arg5] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-        // CHECK: scf.yield %[[T8]]
-        scf.yield %13 : tensor<?x?xf32>
-      }
-      scf.yield %5 : tensor<?x?xf32>
-    }
-    scf.yield %4 : tensor<?x?xf32>
-  }
-  return %3 : tensor<?x?xf32>
+  // MATMUL: %[[TS0:.*]] = affine.min #[[MAP0]]()[%[[IV0]], %[[D0]]]
+  // MATMUL: %[[TS2:.*]] = affine.min #[[MAP2]]()[%[[IV2]], %[[D2]]]
+  // MATMUL: %[[TS1:.*]] = affine.min #[[MAP1]]()[%[[IV1]], %[[D1]]]
+  %6 = affine.min #map0()[%iv0, %0]
+  %7 = affine.min #map1()[%iv2, %1]
+  %8 = tensor.extract_slice %arg0[%iv0, %iv2] [%6, %7] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %9 = affine.min #map2()[%iv1, %2]
+  %10 = tensor.extract_slice %arg1[%iv2, %iv1] [%7, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %11 = tensor.extract_slice %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+
+  // Check all matmul operands are padded.
+  // MATMUL: %[[V0:.*]] = affine.apply #[[MAP3]]()[%[[TS0]]]
+  // MATMUL: %[[V1:.*]] = affine.apply #[[MAP4]]()[%[[TS2]]]
+  // MATMUL: %[[T3:.*]] = linalg.pad_tensor %{{.*}} nofold
+  // MATMUL-SAME: [%[[C0]], %[[C0]]]
+  // MATMUL-SAME: [%[[V0]], %[[V1]]
+  // MATMUL: %[[T4:.*]] = linalg.pad_tensor %{{.*}} nofold
+  // MATMUL: %[[T5:.*]] = linalg.pad_tensor %{{.*}} low
+
+  // Check the dynamic matmul has been erased.
+  // MATMUL-NOT: = linalg.matmul {{.*}} tensor<?x?xf32>
+
+  // Check all padded matmul operands are statically sized.
+  // MATMUL: %[[T6:.*]] = linalg.matmul
+  // MATMUL-SAME: ins(%[[T3]], %[[T4]] : tensor<5x6xf32>, tensor<6x7xf32>)
+  // MATMUL-SAME: outs(%[[T5]] : tensor<5x7xf32>)
+  // MATMUL: %[[T7:.*]] = tensor.extract_slice %[[T6]][0, 0] [%[[TS0]], %[[TS1]]]
+  // MATMUL: %[[T8:.*]] = tensor.insert_slice %[[T7]]
+  %12 = linalg.matmul ins(%8, %10 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%11 : tensor<?x?xf32>) -> tensor<?x?xf32>
+  %13 = tensor.insert_slice %12 into %arg2[%iv0, %iv1] [%6, %9] [1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  // MATMUL: return %[[T8]]
+  return %13 : tensor<?x?xf32>
 }
 
 // -----
 
-#map0 = affine_map<(d0) -> (64, d0)>
+#map0 = affine_map<()[s0] -> (64, s0)>
 
-// CHECK: compose_padding
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
+// MATMUL: compose_padding
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<64x64xf32>
 func @compose_padding(%arg0: tensor<64x64xf32>,
                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
-  // CHECK: %[[SIZE:.*]] = affine.min
-  %size = affine.min #map0(%iv0)
+  // MATMUL: %[[SIZE:.*]] = affine.min
+  %size = affine.min #map0()[%iv0]
 
-  // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
-  // CHECK-SAME: [0, 0]
-  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
-  // CHECK: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
-  // CHECK: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
-  // CHECK: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG0]]
+  // MATMUL-SAME: [0, 0]
+  // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
+  // MATMUL: %[[T1:.*]] = linalg.pad_tensor %[[T0]]
+  // MATMUL: %[[T2:.*]] = linalg.fill(%{{.*}}, %[[T1]]
+  // MATMUL: %[[T3:.*]] = linalg.fill(%{{.*}}, %[[T2]]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
@@ -241,28 +188,28 @@
   %4 = tensor.extract_slice %3[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
 
   // Check there are no additional pad tensor operations.
-  // CHECK-NOT: linalg.pad_tensor
+  // MATMUL-NOT: linalg.pad_tensor
 
   // Check the matmul directly uses the result of the fill operation.
-  // CHECK: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
-  // CHECK: %[[T5:.*]] = tensor.extract_slice %[[T4]]
-  // CHECK-SAME: [0, 0]
-  // CHECK-SAME: [%[[SIZE]], %[[SIZE]]]
+  // MATMUL: %[[T4:.*]] = linalg.matmul ins(%[[T3]]
+  // MATMUL: %[[T5:.*]] = tensor.extract_slice %[[T4]]
+  // MATMUL-SAME: [0, 0]
+  // MATMUL-SAME: [%[[SIZE]], %[[SIZE]]]
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
 
-  // CHECK: return %[[T5]]
+  // MATMUL: return %[[T5]]
   return %5 : tensor<?x?xf32>
 }
 
 // -----
 
-#map0 = affine_map<(d0) -> (64, d0)>
+#map0 = affine_map<()[s0] -> (64, s0)>
 
-// CHECK: different_padding_values
+// MATMUL: different_padding_values
 func @different_padding_values(%arg0: tensor<64x64xf32>,
                                %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 42.0 : f32
-  %size = affine.min #map0(%iv0)
+  %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
@@ -272,22 +219,22 @@
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
 
   // Different padding values prevent composing the paddings (42.0 vs. 0.0).
-  // CHECK: = linalg.fill
-  // CHECK: = linalg.pad_tensor
-  // CHECK: = linalg.matmul
+  // MATMUL: = linalg.fill
+  // MATMUL: = linalg.pad_tensor
+  // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }
 
 // -----
 
-#map0 = affine_map<(d0) -> (64, d0)>
+#map0 = affine_map<()[s0] -> (64, s0)>
 
-// CHECK: different_padding_dynamic_sizes
+// MATMUL: different_padding_dynamic_sizes
 func @different_padding_dynamic_sizes(%arg0: tensor<64x64xf32>,
                                       %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0(%iv0)
+  %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%iv0, %iv0] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
@@ -297,22 +244,22 @@
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<64x64xf32> to tensor<?x?xf32>
 
   // Different dynamic sizes prevent composing the paddings (%iv0 vs %size).
-  // CHECK: = linalg.fill
-  // CHECK: = linalg.pad_tensor
-  // CHECK: = linalg.matmul
+  // MATMUL: = linalg.fill
+  // MATMUL: = linalg.pad_tensor
+  // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }
 
 // -----
 
-#map0 = affine_map<(d0) -> (64, d0)>
+#map0 = affine_map<()[s0] -> (64, s0)>
 
-// CHECK: different_padding_static_sizes
+// MATMUL: different_padding_static_sizes
 func @different_padding_static_sizes(%arg0: tensor<62x62xf32>,
                                      %iv0 : index) -> tensor<?x?xf32> {
   %cst = arith.constant 0.0 : f32
-  %size = affine.min #map0(%iv0)
+  %size = affine.min #map0()[%iv0]
   %0 = tensor.extract_slice %arg0[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
   %1 = linalg.pad_tensor %0 low[0, 0] high[%iv0, %iv0] {
     ^bb0(%arg3: index, %arg4: index):  // no predecessors
@@ -322,55 +269,42 @@
   %4 = tensor.extract_slice %2[0, 0] [%size, %size] [1, 1] : tensor<62x62xf32> to tensor<?x?xf32>
 
   // Different static sizes prevent composing the paddings (62 vs 64 derived from #map0).
-  // CHECK: = linalg.fill
-  // CHECK: = linalg.pad_tensor
-  // CHECK: = linalg.matmul
+  // MATMUL: = linalg.fill
+  // MATMUL: = linalg.pad_tensor
+  // MATMUL: = linalg.matmul
   %5 = linalg.matmul ins(%4, %4 : tensor<?x?xf32>, tensor<?x?xf32>) outs(%4 : tensor<?x?xf32>) -> tensor<?x?xf32>
   return %5 : tensor<?x?xf32>
 }
 
 // -----
 
-#map = affine_map<(d0) -> (7, -d0 + 12)>
+#map0 = affine_map<()[s0] -> (7, s0)>
 
-// CHECK-FILL: scalar_operand
-// CHECK-FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
-// CHECK-FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
-func @scalar_operand(%arg0: f32, %arg1: tensor<24x12xf32>) -> tensor<24x12xf32> {
-  %c0 = arith.constant 0 : index
-  %c12 = arith.constant 12 : index
-  %c24 = arith.constant 24 : index
-  %c7 = arith.constant 7 : index
-  %c4 = arith.constant 4 : index
-
-  // CHECK-FILL: scf.for %[[IV0:[0-9a-zA-Z]*]] =
-  %0 = scf.for %arg2 = %c0 to %c24 step %c4 iter_args(%arg3 = %arg1) -> (tensor<24x12xf32>) {
-
-    // CHECK-FILL: scf.for %[[IV1:[0-9a-zA-Z]*]] = {{.*}} iter_args(%[[ARG2:.*]] =
-    %1 = scf.for %arg4 = %c0 to %c12 step %c7 iter_args(%arg5 = %arg3) -> (tensor<24x12xf32>) {
-      %2 = affine.min #map(%arg4)
-
-      // CHECK-FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG2]]
-      // CHECK-FILL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
-      %3 = tensor.extract_slice %arg5[%arg2, %arg4] [4, %2] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
-
-      // Check only the fill output operand is padded.
-      // CHECK-FILL: %[[T6:.*]] = linalg.fill(%[[ARG0]], %[[T1]]
-      %4 = linalg.fill(%arg0, %3) : f32, tensor<4x?xf32> -> tensor<4x?xf32>
-      %5 = tensor.insert_slice %4 into %arg5[%arg2, %arg4] [4, %2] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
-      scf.yield %5 : tensor<24x12xf32>
-    }
-    scf.yield %1 : tensor<24x12xf32>
-  }
-  return %0 : tensor<24x12xf32>
+// FILL: scalar_operand
+// FILL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: f32
+// FILL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<24x12xf32>
+func @scalar_operand(%arg0: f32,
+                     %arg1: tensor<24x12xf32>,
+                     %iv0 : index) -> tensor<24x12xf32> {
+  %0 = affine.min #map0()[%iv0]
+
+  // FILL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
+  // FILL: %[[T1:.*]] = linalg.pad_tensor %[[T0]] nofold
+  %1 = tensor.extract_slice %arg1[0, 0] [4, %0] [1, 1] : tensor<24x12xf32> to tensor<4x?xf32>
+
+  // Check only the fill output operand is padded.
+  // FILL: %[[T6:.*]] = linalg.fill(%[[ARG0]], %[[T1]]
+  %2 = linalg.fill(%arg0, %1) : f32, tensor<4x?xf32> -> tensor<4x?xf32>
+  %3 = tensor.insert_slice %2 into %arg1[0, 0] [4, %0] [1, 1] : tensor<4x?xf32> into tensor<24x12xf32>
+  return %3 : tensor<24x12xf32>
 }
 
 // -----
 
 #map0 = affine_map<()[s0] -> (7, s0)>
 
-// CHECK: static_extract_slice_missing
-// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
+// MATMUL: static_extract_slice_missing
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<4x5xf32>,
 func @static_extract_slice_missing(%arg0: tensor<24x12xf32>,
                                    %arg1: tensor<12x25xf32>,
                                    %arg2: tensor<4x5xf32>,
@@ -380,10 +314,10 @@
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
 
   // Check the matmul inputs are padded despite the missing slice for the static output.
-  // CHECK: %[[T0:.*]] = linalg.pad_tensor
-  // CHECK: %[[T1:.*]] = linalg.pad_tensor
-  // CHECK: = linalg.matmul ins(%[[T0]], %[[T1]]
-  // CHECK-SAME: outs(%[[ARG2]]
+  // MATMUL: %[[T0:.*]] = linalg.pad_tensor
+  // MATMUL: %[[T1:.*]] = linalg.pad_tensor
+  // MATMUL: = linalg.matmul ins(%[[T0]], %[[T1]]
+  // MATMUL-SAME: outs(%[[ARG2]]
   %3 = linalg.matmul ins(%1, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%arg2 : tensor<4x5xf32>) -> tensor<4x5xf32>
   return %3 : tensor<4x5xf32>
 }
@@ -392,24 +326,24 @@
 #map0 = affine_map<()[s0] -> (7, s0)>
 
-// CHECK: dynamic_extract_slice_missing
-// CHECK-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
-// CHECK-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
-// CHECK-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
+// MATMUL: dynamic_extract_slice_missing
+// MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<4x?xf32>,
+// MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>,
+// MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>,
 func @dynamic_extract_slice_missing(%arg0: tensor<4x?xf32>,
                                     %arg1: tensor<12x25xf32>,
                                     %arg2: tensor<24x25xf32>,
                                     %iv0 : index, %iv1 : index, %iv2 : index) -> tensor<24x25xf32> {
   %0 = affine.min #map0()[%iv2]
 
-  // CHECK: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
-  // CHECK: %[[T1:.*]] = tensor.extract_slice %[[ARG2]]
+  // MATMUL: %[[T0:.*]] = tensor.extract_slice %[[ARG1]]
+  // MATMUL: %[[T1:.*]] = tensor.extract_slice %[[ARG2]]
   %2 = tensor.extract_slice %arg1[%iv2, %iv1] [%0, 5] [1, 1] : tensor<12x25xf32> to tensor<?x5xf32>
   %3 = tensor.extract_slice %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<24x25xf32> to tensor<4x5xf32>
 
   // Check the matmul is not padded due to the missing slice for the dynamic input.
-  // CHECK: = linalg.matmul ins(%[[ARG0]], %[[T0]]
-  // CHECK-SAME: outs(%[[T1]]
+  // MATMUL: = linalg.matmul ins(%[[ARG0]], %[[T0]]
+  // MATMUL-SAME: outs(%[[T1]]
   %4 = linalg.matmul ins(%arg0, %2 : tensor<4x?xf32>, tensor<?x5xf32>) outs(%3 : tensor<4x5xf32>) -> tensor<4x5xf32>
   %5 = tensor.insert_slice %4 into %arg2[%iv0, %iv1] [4, 5] [1, 1] : tensor<4x5xf32> into tensor<24x25xf32>
   return %5 : tensor<24x25xf32>