diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -585,83 +585,12 @@ }; } // namespace -namespace { -/// Convert `extract_slice` operations to rank-reduced versions. -struct RankReducedExtractSliceOp - : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(tensor::ExtractSliceOp sliceOp, - PatternRewriter &rewriter) const override { - RankedTensorType resultType = sliceOp.getType(); - SmallVector offsets = sliceOp.getMixedOffsets(); - SmallVector sizes = sliceOp.getMixedSizes(); - SmallVector strides = sliceOp.getMixedStrides(); - auto reassociation = getReassociationMapForFoldingUnitDims(sizes); - if (!reassociation || - reassociation->size() == static_cast(resultType.getRank())) - return failure(); - auto rankReducedType = - tensor::ExtractSliceOp::inferCanonicalRankReducedResultType( - reassociation->size(), sliceOp.getSourceType(), offsets, sizes, - strides) - .cast(); - - Location loc = sliceOp.getLoc(); - Value newSlice = rewriter.create( - loc, rankReducedType, sliceOp.getSource(), offsets, sizes, strides); - rewriter.replaceOpWithNewOp( - sliceOp, resultType, newSlice, *reassociation); - return success(); - } -}; - -/// Convert `insert_slice` operations to rank-reduced versions. -/// This patterns works with both InsertSliceOp and ParallelInsertSliceOp. -template -struct RankReducedInsertSliceOp : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(InsertOpTy insertSliceOp, - PatternRewriter &rewriter) const override { - RankedTensorType sourceType = insertSliceOp.getSourceType(); - SmallVector offsets = insertSliceOp.getMixedOffsets(); - SmallVector sizes = insertSliceOp.getMixedSizes(); - SmallVector strides = insertSliceOp.getMixedStrides(); - auto reassociation = getReassociationMapForFoldingUnitDims(sizes); - if (!reassociation || - reassociation->size() == static_cast(sourceType.getRank())) - return failure(); - Location loc = insertSliceOp.getLoc(); - tensor::CollapseShapeOp reshapedSource; - { - OpBuilder::InsertionGuard g(rewriter); - // The only difference between InsertSliceOp and ParallelInsertSliceOp is - // the insertion point is just before the ParallelCombiningOp in the - // parallel case. - if (std::is_same::value) - rewriter.setInsertionPoint(insertSliceOp->getParentOp()); - reshapedSource = rewriter.create( - loc, insertSliceOp.getSource(), *reassociation); - } - rewriter.replaceOpWithNewOp( - insertSliceOp, reshapedSource, insertSliceOp.getDest(), - insertSliceOp.getMixedOffsets(), insertSliceOp.getMixedSizes(), - insertSliceOp.getMixedStrides()); - return success(); - } -}; -} // namespace - /// Patterns that are used to canonicalize the use of unit-extent dims for /// broadcasting. void mlir::linalg::populateFoldUnitExtentDimsPatterns( RewritePatternSet &patterns) { auto *context = patterns.getContext(); - patterns.add, - RankReducedInsertSliceOp>( + patterns.add( context); linalg::FillOp::getCanonicalizationPatterns(patterns, context); tensor::CollapseShapeOp::getCanonicalizationPatterns(patterns, context); diff --git a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir --- a/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir +++ b/mlir/test/Dialect/Linalg/drop-unit-extent-dims.mlir @@ -1,212 +1,6 @@ // RUN: mlir-opt %s -split-input-file -pass-pipeline="builtin.module(func.func(linalg-fold-unit-extent-dims))" | FileCheck %s -#accesses = [ - affine_map<(i, j, k, l, m) -> (i, k, m)>, - affine_map<(i, j, k, l, m) -> ()>, - affine_map<(i, j, k, l, m) -> (i, k, j, l, m)> -] - -#trait = { - iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"], - indexing_maps = #accesses, - library_call = "some_external_func" -} - -func.func @drop_one_trip_loops(%arg0 : tensor, %arg1 : f32, %shape: tensor) -> tensor { - %0 = linalg.generic #trait - ins(%arg0, %arg1 : tensor, f32) - outs(%shape : tensor) { - ^bb0(%arg2 : f32, %arg3 : f32, %arg4 : f32) : - linalg.yield %arg3 : f32 - } -> tensor - return %0 : tensor -} -// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1, d2) -> (d0, d2)> -// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1, d2) -> ()> -// CHECK-DAG: #[[$MAP3:.*]] = affine_map<(d0, d1, d2) -> (d0, d1, d2)> -// CHECK-LABEL: func @drop_one_trip_loops -// CHECK: tensor.collapse_shape %{{.*}} {{\[}}[0, 1], [2]] -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP1]], #[[$MAP2]], #[[$MAP3]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel"] -// CHECK: tensor.expand_shape %{{.*}} {{\[}}[0, 1], [2, 3], [4]] - -// ----- - -#accesses = [ - affine_map<(i, j, k, l, m) -> (i, k, m)>, - affine_map<(i, j, k, l, m) -> (i, k, j, l, m)> -] - -#trait = { - iterator_types = ["parallel", "parallel", "parallel", "parallel", "parallel"], - indexing_maps = #accesses, - library_call = "some_external_func" -} - -func.func @drop_one_trip_loops_indexed - (%arg0 : tensor, %shape: tensor) -> tensor -{ - %0 = linalg.generic #trait - ins(%arg0 : tensor) - outs(%shape: tensor) { - ^bb0(%arg6 : i32, %arg7 : i32) : - %idx0 = linalg.index 0 : index - %idx1 = linalg.index 1 : index - %idx2 = linalg.index 2 : index - %idx3 = linalg.index 3 : index - %idx4 = linalg.index 4 : index - %1 = arith.addi %idx0, %idx1 : index - %2 = arith.subi %1, %idx2 : index - %3 = arith.subi %2, %idx3 : index - %4 = arith.addi %3, %idx4 : index - %5 = arith.index_cast %4 : index to i32 - %6 = arith.addi %5, %arg6 : i32 - linalg.yield %6 : i32 - } -> tensor - return %0 : tensor -} -// The subtractions disappear the access map of the output tensor maps its unit -// dimensions 1 and 3 to the index dimensions 2 and 3. -// CHECK-LABEL: func @drop_one_trip_loops_indexed -// CHECK: linalg.generic -// CHECK: ^{{.+}}( -// CHECK-SAME: %[[ARG4:[a-zA-Z0-9]+]]: i32, %{{.*}}: i32) -// CHECK: %[[IDX0:.+]] = linalg.index 0 : index -// CHECK: %[[IDX1:.+]] = linalg.index 1 : index -// CHECK: %[[IDX2:.+]] = linalg.index 2 : index -// CHECK: %[[T3:.+]] = arith.addi %[[IDX0]], %[[IDX1]] -// CHECK: %[[T4:.+]] = arith.addi %[[T3]], %[[IDX2]] -// CHECK: %[[T5:.+]] = arith.index_cast %[[T4]] : index to i32 -// CHECK: %[[T6:.+]] = arith.addi %[[T5]], %[[ARG4]] : i32 -// CHECK: linalg.yield %[[T6]] : i32 - -// ----- - -#map0 = affine_map<(i, j) -> (i, j)> -#access = [#map0, #map0] -#trait = { - iterator_types = ["parallel", "parallel"], - indexing_maps = #access, - library_call = "some_external_func" -} - -func.func @drop_all_loops(%arg0 : tensor<1x1xf32>) -> tensor<1x1xf32> -{ - %0 = linalg.generic #trait - ins(%arg0 : tensor<1x1xf32>) - outs(%arg0 : tensor<1x1xf32>) { - ^bb0(%arg1: f32, %arg2: f32) : - linalg.yield %arg1 : f32 - } -> tensor<1x1xf32> - return %0 : tensor<1x1xf32> -} -// CHECK: #[[$MAP0:.*]] = affine_map<() -> ()> -// CHECK-LABEL: func @drop_all_loops -// CHECK: tensor.collapse_shape %{{.*}} [] -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP0]]] -// CHECK-SAME: iterator_types = [] - -// ----- - -#map0 = affine_map<(i, j) -> (i, j)> -#access = [#map0, #map0] -#trait = { - iterator_types = ["parallel", "parallel"], - indexing_maps = #access, - library_call = "some_external_func" -} - -func.func @drop_all_loops_indexed - (%arg0 : tensor<1x1xi32>) -> tensor<1x1xi32>{ - %0 = linalg.generic #trait - ins(%arg0 : tensor<1x1xi32>) - outs(%arg0 : tensor<1x1xi32>) { - ^bb0(%arg3: i32, %arg4: i32) : - %idx0 = linalg.index 0 : index - %idx1 = linalg.index 1 : index - %1 = arith.addi %idx0, %idx1 : index - %2 = arith.index_cast %1 : index to i32 - %3 = arith.addi %2, %arg3 : i32 - linalg.yield %3 : i32 - } -> tensor<1x1xi32> - return %0 : tensor<1x1xi32> -} - -// CHECK-LABEL: func @drop_all_loops_indexed -// CHECK: linalg.generic -// CHECK: ^{{.+}}(%[[ARG1:.+]]: i32, %[[ARG2:.+]]: i32) -// CHECK: linalg.yield %[[ARG1]] : i32 - -// ----- - -#accesses = [ - affine_map<(d0) -> (0, d0)>, - affine_map<(d0) -> (d0)> -] - -#trait = { - indexing_maps = #accesses, - iterator_types = ["parallel"], - library_call = "some_external_fn" -} - -func.func @leading_dim_1_canonicalization(%arg0: tensor<1x5xf32>, %shape: tensor<5xf32>) -> tensor<5xf32> { - %0 = linalg.generic #trait - ins(%arg0 : tensor<1x5xf32>) - outs(%shape : tensor<5xf32>) { - ^bb0(%arg2: f32, %arg3: f32): - linalg.yield %arg2 : f32 - } -> tensor<5xf32> - return %0 : tensor<5xf32> -} -// CHECK: #[[$MAP1:.*]] = affine_map<(d0) -> (d0)> - -// CHECK-LABEL: func @leading_dim_1_canonicalization -// CHECK: tensor.collapse_shape %{{.*}} {{\[}}[0, 1]] -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP1]], #[[$MAP1]]] -// CHECK-SAME: iterator_types = ["parallel"] - -// ----- - -#accesses = [ - affine_map<(d0, d1) -> (0, d1)>, - affine_map<(d0, d1) -> (d0, 0)>, - affine_map<(d0, d1) -> (d0, d1)> -] - -#trait = { - indexing_maps = #accesses, - iterator_types = ["parallel", "parallel"], - library_call = "some_external_fn" -} - -func.func @broadcast_test(%arg0 : tensor<5xf32>, %arg1 : tensor<5xf32>, %shape : tensor<5x5xf32>) -> tensor<5x5xf32> -{ - %0 = tensor.expand_shape %arg0 [[0, 1]] : tensor<5xf32> into tensor<1x5xf32> - %1 = tensor.expand_shape %arg1 [[0, 1]] : tensor<5xf32> into tensor<5x1xf32> - %2 = linalg.generic #trait - ins(%0, %1 : tensor<1x5xf32>, tensor<5x1xf32>) - outs(%shape : tensor<5x5xf32>) { - ^bb0(%arg3: f32, %arg4: f32, %arg5: f32): - %3 = arith.addf %arg3, %arg4 : f32 - linalg.yield %3 : f32 - } -> tensor<5x5xf32> - return %2 : tensor<5x5xf32> -} -// CHECK-DAG: #[[$MAP0:.*]] = affine_map<(d0, d1) -> (d1)> -// CHECK-DAG: #[[$MAP1:.*]] = affine_map<(d0, d1) -> (d0)> -// CHECK-DAG: #[[$MAP2:.*]] = affine_map<(d0, d1) -> (d0, d1)> -// CHECK-LABEL: func @broadcast_test -// CHECK-NOT: linalg.tensor_{{.*}}shape -// CHECK: linalg.generic -// CHECK-SAME: indexing_maps = [#[[$MAP0]], #[[$MAP1]], #[[$MAP2]]] -// CHECK-SAME: iterator_types = ["parallel", "parallel"] -// CHECK-NOT: linalg.tensor_{{.*}}shape -// ----- #accesses = [ affine_map<(d0, d1) -> (0, 0)>, @@ -296,34 +90,6 @@ // CHECK: %[[GENERIC_RESHAPE:.+]] = tensor.expand_shape %[[GENERIC]] [] : tensor into tensor<1xf32> // CHECK: return %[[GENERIC_RESHAPE:.+]] : tensor<1xf32> - -// ----- - -func.func @fold_slice( - %arg0 : tensor<1x?x?x1x?x1x1xf32>, %arg1 : tensor<1x?x?x?x?x1x1xf32>, - %arg2 : index, %arg3 : index, %arg4 : index, %arg5 : index, - %arg6 : index, %arg7 : index) -> (tensor<1x?x?x1x?x1x1xf32>, tensor<1x?x?x1x?x1x1xf32>) { - %0 = tensor.extract_slice %arg0[0, %arg2, %arg3, 0, %arg4, 0, 0] - [1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] : - tensor<1x?x?x1x?x1x1xf32> to tensor<1x?x?x1x?x1x1xf32> - %1 = tensor.extract_slice %arg1[%arg2, 0, %arg3, 0, 0, %arg4, 0] - [1, %arg5, %arg6, 1, %arg7, 1, 1] [1, 1, 1, 1, 1, 1, 1] : - tensor<1x?x?x?x?x1x1xf32> to tensor<1x?x?x1x?x1x1xf32> - return %0, %1 : tensor<1x?x?x1x?x1x1xf32>, tensor<1x?x?x1x?x1x1xf32> -} -// CHECK: func @fold_slice -// CHECK-SAME: %[[ARG0:.+]]: tensor<1x?x?x1x?x1x1xf32> -// CHECK-SAME: %[[ARG1:.+]]: tensor<1x?x?x?x?x1x1xf32> -// CHECK: %[[SLICE1:.+]] = tensor.extract_slice %[[ARG0]] -// CHECK-SAME: to tensor -// CHECK: %[[RESULT1:.+]] = tensor.expand_shape %[[SLICE1]] -// CHECK-SAME: [0, 1], [2], [3, 4, 5, 6] -// CHECK: %[[SLICE2:.+]] = tensor.extract_slice %[[ARG1]] -// CHECK-SAME: to tensor -// CHECK: %[[RESULT2:.+]] = tensor.expand_shape %[[SLICE2]] -// CHECK-SAME: [0, 1], [2], [3, 4, 5, 6] -// CHECK: return %[[RESULT1]], %[[RESULT2]] - // ----- func.func @unit_dim_for_reduction(%arg0: tensor<1x?x1x?xf32>) -> tensor<1x?xf32> { @@ -430,30 +196,6 @@ // ----- -func.func @slice_unit_dims(%arg0: tensor<1x3xf32>) -> tensor<1x1xf32> { - %0 = tensor.extract_slice %arg0[0, 2] [1, 1] [1, 1] : tensor<1x3xf32> to tensor<1x1xf32> - return %0 : tensor<1x1xf32> -} -// CHECK-LABEL: func @slice_unit_dims -// CHECK: %[[SLICE:.+]] = tensor.extract_slice -// CHECK-SAME: tensor<1x3xf32> to tensor -// CHECK: %[[RESULT:.+]] = tensor.expand_shape %[[SLICE]] [] -// CHECK: return %[[RESULT]] - -// ----- - -func.func @insert_slice_unit_dims(%arg0: tensor<1x3xf32>, %arg1: tensor<1x1xf32>) -> tensor<1x3xf32> { - %0 = tensor.insert_slice %arg1 into %arg0[0, 2] [1, 1] [1, 1] : tensor<1x1xf32> into tensor<1x3xf32> - return %0 : tensor<1x3xf32> -} -// CHECK-LABEL: func @insert_slice_unit_dims -// CHECK: %[[RESHAPE:.+]] = tensor.collapse_shape %{{.+}} [] -// CHECK: %[[RESULT:.+]] = tensor.insert_slice %[[RESHAPE]] -// CHECK-SAME: tensor into tensor<1x3xf32> -// CHECK: return %[[RESULT]] - -// ----- - #accesses = [ affine_map<(i, j, k, l, m) -> (i, k, m)>, affine_map<(i, j, k, l, m) -> ()>, @@ -828,26 +570,6 @@ // ----- -func.func @reduce_dispatch_0() -> tensor<4x2xf32> { - %c2 = arith.constant 2 : index - %c4 = arith.constant 4 : index - %cst = arith.constant 0.000000e+00 : f32 - %0 = tensor.empty() : tensor<4x2xf32> - %res = scf.foreach_thread (%arg0, %arg1) in (%c4, %c2) shared_outs(%o = %0) -> (tensor<4x2xf32>) { - %1 = tensor.empty() : tensor<1x1xf32> - %2 = linalg.fill ins(%cst : f32) outs(%1 : tensor<1x1xf32>) -> tensor<1x1xf32> - scf.foreach_thread.perform_concurrently { - // CHECK: tensor.parallel_insert_slice %{{[0-9a-z]*}} into %{{[0-9a-z]*}} - // CHECK-SAME: [%{{.*}}, %{{.*}}] [1, 1] [1, 1] : tensor into tensor<4x2xf32> - tensor.parallel_insert_slice %2 into %o[%arg0, %arg1] [1, 1] [1, 1] : - tensor<1x1xf32> into tensor<4x2xf32> - } - } - return %res: tensor<4x2xf32> -} - -// ----- - #map0 = affine_map<(i, j) -> (i, j)> #access = [#map0, #map0] #trait = {