diff --git a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
--- a/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Linalg/Utils/Utils.cpp
@@ -821,9 +821,9 @@
   Value maxIndex = applyMapToValues(builder, loc, m, maxIndices).front();
   Value d = makeComposedAffineApply(builder, loc, plusOneMap, {maxIndex});
 
-  // Compute min(size, dim - offset) to avoid out-of-bounds accesses.
+  // Compute min(dim - offset, size) to avoid out-of-bounds accesses.
   AffineMap minMap = AffineMap::inferFromExprList(
-                         {ArrayRef<AffineExpr>{dim0, dim1 - dim2}})
+                         {ArrayRef<AffineExpr>{dim1 - dim2, dim0}})
                          .front();
   SmallVector<Value> operands{size, d, offset};
   fullyComposeAffineMapAndOperands(&minMap, &operands);
diff --git a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
--- a/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
+++ b/mlir/test/Dialect/Linalg/tile-and-fuse-on-tensors.mlir
@@ -1,16 +1,16 @@
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.matmul fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=MATMUL %s
 // RUN: mlir-opt %s -test-linalg-codegen-strategy="anchor-op=linalg.generic fuse tile-sizes=5,4,7 tile-interchange=1,0,2 run-enable-pass=false" -cse -split-input-file | FileCheck --check-prefix=GENERIC %s
 
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
-// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 24)>
-// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
+// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
+// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
+// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 24, d0)>
+// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
 
 // MATMUL: fuse_input
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 func.func @fuse_input(%arg0: tensor<24x12xf32>,
-                 %arg1: tensor<12x25xf32>,
-                 %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+                      %arg1: tensor<12x25xf32>,
+                      %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -34,19 +34,19 @@
 // MATMUL: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%[[T1]]
   %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
 
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (5, -d0 + 24)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
+// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 24, 5)>
+// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
 
 // MATMUL: fuse_output
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 func.func @fuse_output(%arg0: tensor<24x12xf32>,
-                  %arg1: tensor<12x25xf32>,
-                  %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+                       %arg1: tensor<12x25xf32>,
+                       %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
 // MATMUL-DAG: %[[C0:.*]] = arith.constant 0 : index
 // MATMUL-DAG: %[[C1:.*]] = arith.constant 1 : index
   %c0 = arith.constant 0 : index
@@ -81,15 +81,15 @@
 // MATMUL-SAME: 0, 0
 // MATMUL-SAME: %[[D0]], %[[D1]]
   %1 = linalg.matmul ins(%arg0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%0 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
 
-// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (4, -d0 + 25)>
-// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (7, -d0 + 12)>
-// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 25)>
-// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (d0, -d1 + 12)>
+// MATMUL-DAG: #[[MAP0:.*]] = affine_map<(d0) -> (-d0 + 25, 4)>
+// MATMUL-DAG: #[[MAP1:.*]] = affine_map<(d0) -> (-d0 + 12, 7)>
+// MATMUL-DAG: #[[MAP2:.*]] = affine_map<(d0, d1) -> (-d1 + 25, d0)>
+// MATMUL-DAG: #[[MAP3:.*]] = affine_map<(d0, d1) -> (-d1 + 12, d0)>
 
 #map0 = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
 #map1 = affine_map<(d0, d1, d2) -> (d0, d2)>
@@ -97,16 +97,16 @@
 // MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xf32>
 // MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x7x25xf32>
 func.func @fuse_reduction(%arg0: tensor<24x12xf32>,
-                     %arg1: tensor<12x25xf32>,
-                     %arg2: tensor<24x25xf32>,
-                     %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
+                          %arg1: tensor<12x25xf32>,
+                          %arg2: tensor<24x25xf32>,
+                          %arg3: tensor<12x7x25xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction", "parallel"]} ins(%arg3 : tensor<12x7x25xf32>) outs(%arg1 : tensor<12x25xf32>) {
-  ^bb0(%arg4: f32, %arg5: f32):
+  ^bb0(%arg4: f32, %arg5: f32):
     %2 = arith.addf %arg4, %arg5 : f32
     linalg.yield %2 : f32
   } -> tensor<12x25xf32>
@@ -129,7 +129,7 @@
 // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%{{.*}}, %[[T2]]
   %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
@@ -141,16 +141,16 @@
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME: %[[ARG3:[0-9a-zA-Z]*]]: tensor<12x24xf32>
 func.func @fuse_transposed(%arg0: tensor<24x12xf32>,
-                      %arg1: tensor<12x25xf32>,
-                      %arg2: tensor<24x25xf32>,
-                      %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
+                           %arg1: tensor<12x25xf32>,
+                           %arg2: tensor<24x25xf32>,
+                           %arg3: tensor<12x24xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %0 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%arg3 : tensor<12x24xf32>) outs(%arg0 : tensor<24x12xf32>) {
-  ^bb0(%arg4: f32, %arg5: f32):
+  ^bb0(%arg4: f32, %arg5: f32):
     %2 = arith.addf %arg4, %arg5 : f32
     linalg.yield %2 : f32
   } -> tensor<24x12xf32>
@@ -167,7 +167,7 @@
 // MATMUL: %[[T2:.*]] = linalg.generic {{.*}} ins(%[[T0]] {{.*}} outs(%[[T1]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%[[T2]]
   %1 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%arg2 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %1 : tensor<24x25xf32>
+  func.return %1 : tensor<24x25xf32>
 }
 
 // -----
@@ -176,8 +176,8 @@
 // MATMUL-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<24x12xf32>
 // MATMUL-SAME: %[[ARG2:[0-9a-zA-Z]*]]: tensor<24x25xf32>
 func.func @fuse_input_and_output(%arg0: tensor<24x12xf32>,
-                            %arg1: tensor<12x25xf32>,
-                            %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
+                                 %arg1: tensor<12x25xf32>,
+                                 %arg2: tensor<24x25xf32>) -> tensor<24x25xf32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
   %c25 = arith.constant 25 : index
@@ -200,7 +200,7 @@
 // MATMUL: %[[T4:.*]] = tensor.extract_slice %[[ARG5]]
 // MATMUL: %{{.*}} = linalg.matmul ins(%[[T3]], {{.*}} outs(%[[T4]]
   %2 = linalg.matmul ins(%0, %arg1 : tensor<24x12xf32>, tensor<12x25xf32>) outs(%1 : tensor<24x25xf32>) -> tensor<24x25xf32>
-  return %2 : tensor<24x25xf32>
+  func.return %2 : tensor<24x25xf32>
 }
 
 // -----
@@ -211,15 +211,15 @@
 // MATMUL: fuse_indexed
 // MATMUL-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<12x25xi32>
 func.func @fuse_indexed(%arg0: tensor<24x12xi32>,
-                   %arg1: tensor<12x25xi32>,
-                   %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
+                        %arg1: tensor<12x25xi32>,
+                        %arg2: tensor<24x25xi32>) -> tensor<24x25xi32> {
   %c0 = arith.constant 0 : index
   %c12 = arith.constant 12 : index
  %c25 = arith.constant 25 : index
   %c24 = arith.constant 24 : index
   %c4 = arith.constant 4 : index
   %0 = linalg.generic {indexing_maps = [#map0], iterator_types = ["parallel", "parallel"]} outs(%arg1 : tensor<12x25xi32>) {
-  ^bb0(%arg3: i32):
+  ^bb0(%arg3: i32):
     %6 = linalg.index 0 : index
     %7 = linalg.index 1 : index
     %8 = arith.addi %6, %7 : index
@@ -241,7 +241,7 @@
 // MATMUL: %[[IDX1_SHIFTED:.*]] = affine.apply #[[MAP0]](%[[IDX1]], %[[IV2]])
 // MATMUL: %{{.*}} = arith.addi %[[IDX0_SHIFTED]], %[[IDX1_SHIFTED]]
   %1 = linalg.matmul ins(%arg0, %0 : tensor<24x12xi32>, tensor<12x25xi32>) outs(%arg2 : tensor<24x25xi32>) -> tensor<24x25xi32>
-  return %1 : tensor<24x25xi32>
+  func.return %1 : tensor<24x25xi32>
 }
 
 // -----
@@ -252,8 +252,8 @@
 // GENERIC: fuse_outermost_reduction
 // GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
 // GENERIC-SAME: %[[ARG1:[0-9a-zA-Z]*]]: tensor<10xf32>
-func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
-                               %arg1: tensor<10xf32>) -> tensor<10xf32> {
+func.func @fuse_outermost_reduction(%arg0: tensor<10x17xf32>,
+                                    %arg1: tensor<10xf32>) -> tensor<10xf32> {
   %cst = arith.constant 0.000000e+00 : f32
   %0 = linalg.fill ins(%cst : f32) outs(%arg0 : tensor<10x17xf32>) -> tensor<10x17xf32>
 
@@ -272,25 +272,25 @@
 // GENERIC-SAME: %[[IV1]]
 // GENERIC: linalg.generic {{.*}} ins(%[[T2]] {{.*}} outs(%[[T3]]
   %2 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "reduction"]} ins(%0 : tensor<10x17xf32>) outs(%1 : tensor<10xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
+  ^bb0(%arg2: f32, %arg3: f32):
     %3 = arith.addf %arg2, %arg3 : f32
     linalg.yield %3 : f32
   } -> tensor<10xf32>
-  return %2 : tensor<10xf32>
+  func.return %2 : tensor<10xf32>
 }
 
 // -----
 
 // GENERIC-DAG: #[[MAP0:.*]] = affine_map<(d0, d1) -> (d0 + d1)>
-// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (8, -d0 - d1 + 17)>
-// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (d0, -d1 - d2 + 17)>
+// GENERIC-DAG: #[[MAP1:.*]] = affine_map<(d0, d1) -> (-d0 - d1 + 17, 8)>
+// GENERIC-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2) -> (-d1 - d2 + 17, d0)>
 
 #map0 = affine_map<(d0, d1) -> (d0, d0 + d1)>
 #map1 = affine_map<(d0, d1) -> (d0, d1)>
 
 // GENERIC: fuse_non_rectangular
 // GENERIC-SAME: %[[ARG0:[0-9a-zA-Z]*]]: tensor<10x17xf32>
-func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
-                           %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
+func.func @fuse_non_rectangular(%arg0: tensor<10x17xf32>,
+                                %arg1: tensor<10x8xf32>) -> tensor<10x8xf32> {
 // GENERIC-DAG: %[[C0:.*]] = arith.constant 0 : index
 // GENERIC-DAG: %[[C4:.*]] = arith.constant 4 : index
@@ -315,9 +315,9 @@
 // GENERIC-SAME: , %[[UB1]]
 // GENERIC: %[[T1:.*]] = linalg.fill ins(%{{.*}}{{.*}}outs(%[[T0]]
   %1 = linalg.generic {indexing_maps = [#map0, #map1], iterator_types = ["parallel", "parallel"]} ins(%0 : tensor<10x17xf32>) outs(%arg1 : tensor<10x8xf32>) {
-  ^bb0(%arg2: f32, %arg3: f32):
+  ^bb0(%arg2: f32, %arg3: f32):
     %2 = arith.addf %arg2, %arg3 : f32
     linalg.yield %2 : f32
   } -> tensor<10x8xf32>
-  return %1 : tensor<10x8xf32>
+  func.return %1 : tensor<10x8xf32>
 }