diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -53,7 +53,11 @@
            "Use stack allocations for memrefs (for testing purposes only)">,
     Option<"analysisFuzzerSeed", "analysis-fuzzer-seed", "unsigned",
            /*default=*/"0",
-           "Analyze ops in random order with a given seed (fuzzer)">
+           "Analyze ops in random order with a given seed (fuzzer)">,
+    Option<"initTensorElimination", "init-tensor-elimination", "bool",
+           /*default=*/"false",
+           "(Experimental) Try to eliminate init_tensor operations that are "
+           "anchored at an insert_slice op">
   ];
   let constructor = "mlir::createLinalgComprehensiveModuleBufferizePass()";
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
@@ -92,8 +92,10 @@
   options->printConflicts = printConflicts;
 
   // Enable InitTensorOp elimination.
-  options->addPostAnalysisStep<
-      linalg_ext::InsertSliceAnchoredInitTensorEliminationStep>();
+  if (initTensorElimination) {
+    options->addPostAnalysisStep<
+        linalg_ext::InsertSliceAnchoredInitTensorEliminationStep>();
+  }
 
   if (!allowReturnMemref)
     options->addPostAnalysisStep();
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-init-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-init-tensor-elimination.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-init-tensor-elimination.mlir
@@ -0,0 +1,55 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="test-analysis-only allow-return-memref init-tensor-elimination" -split-input-file | FileCheck %s
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// InitTensorOp elimination
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: func @buffer_forwarding_conflict
+func @buffer_forwarding_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]
+  // Instead of allocating, share buffer with some inplace bufferization?
+  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
+
+  // CHECK: linalg.fill
+  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
+  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]
+  %2 = tensor.insert_slice %1 into %arg0[0] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
+  %3 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-SAME: __equivalent_func_args__ = [-1, 0]
+  return %2, %3 : tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @buffer_forwarding_no_conflict
+func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
+  // Instead of allocating, share buffer with some inplace bufferization?
+  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
+
+  // CHECK: linalg.fill
+  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
+  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
+  %2 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-SAME: __equivalent_func_args__ = [0, 0]
+  return %2, %2 : tensor<?xf32>, tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -1249,60 +1249,6 @@
 
 // -----
 
-//===----------------------------------------------------------------------===//
-// InitTensorOp elimination
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func @buffer_forwarding_conflict
-func @buffer_forwarding_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]
-  // Instead of allocating, share buffer with some inplace bufferization?
-  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
-
-  // CHECK: linalg.fill
-  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
-  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]
-  %2 = tensor.insert_slice %1 into %arg0[0] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
-  %3 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: return
-  // CHECK-SAME: __equivalent_func_args__ = [-1, 0]
-  return %2, %3 : tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @buffer_forwarding_no_conflict
-func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
-  // Instead of allocating, share buffer with some inplace bufferization?
-  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
-
-  // CHECK: linalg.fill
-  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
-  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
-  %2 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: return
-  // CHECK-SAME: __equivalent_func_args__ = [0, 0]
-  return %2, %2 : tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
 //===----------------------------------------------------------------------===//
 // scf.if cases
 //===----------------------------------------------------------------------===//
@@ -1764,3 +1710,26 @@
   }
   return %1: tensor
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// InitTensorOp elimination would produce SSA violations for the example below.
+//===----------------------------------------------------------------------===//
+
+func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index, %arg2: tensor<8x18x32xf32>)
+  -> tensor<?x1x6x8xf32> {
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %c8 = arith.constant 8 : index
+  %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
+  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
+  %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
+  %3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
+    %4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
+    %5 = tensor.insert_slice %2 into %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] :
+      tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
+    scf.yield %5 : tensor<?x1x6x8xf32>
+  }
+  return %3 : tensor<?x1x6x8xf32>
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
@@ -0,0 +1,64 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-memref init-tensor-elimination" -split-input-file | FileCheck %s
+
+// -----
+
+// CHECK: func @buffer_forwarding_conflict(
+// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
+func @buffer_forwarding_conflict(
+    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
+    %sz: index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %f0 = arith.constant 0.0: f32
+  // Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
+  // CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
+  // This allocs the whole dim to allow for a full clone of t.
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
+
+  // init_tensor itself does not alloc but forwards to the **second**
+  // insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
+  // extract_slice.
+  // CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
+  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
+  %a = linalg.init_tensor[%sz] : tensor<?xf32>
+
+  // CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref<?xf32>
+  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: linalg.copy(%[[FUNC_ARG]], %[[ALLOC]]) : memref<?xf32>, memref<?xf32>
+  // CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1]
+  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]])
+  %r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]])
+  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
+
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK: func @buffer_forwarding_no_conflict(
+// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
+func @buffer_forwarding_no_conflict(
+    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
+    %sz: index)
+  -> (tensor<?xf32>)
+{
+  %f0 = arith.constant 0.0: f32
+
+  // init_tensor itself does not alloc but forwards to the insert_slice.
+  // InitTensorOp replaces the init_tensor with an inplace extract_slice.
+  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
+  %a = linalg.init_tensor[%sz] : tensor<?xf32>
+
+  // CHECK: linalg.fill({{.*}}, %[[T_SUBVIEW]])
+  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // Self-copy canonicalizes away later.
+  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
+
+  return %r1: tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -868,69 +868,6 @@
 
 // -----
 
-// CHECK: func @buffer_forwarding_conflict(
-// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
-func @buffer_forwarding_conflict(
-    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
-    %sz: index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %f0 = arith.constant 0.0: f32
-  // Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
-  // CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
-  // This allocs the whole dim to allow for a full clone of t.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
-
-  // init_tensor itself does not alloc but forwards to the **second**
-  // insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
-  // extract_slice.
-  // CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
-  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
-  %a = linalg.init_tensor[%sz] : tensor<?xf32>
-
-  // CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref<?xf32>
-  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: linalg.copy(%[[FUNC_ARG]], %[[ALLOC]]) : memref<?xf32>, memref<?xf32>
-  // CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1]
-  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]])
-  %r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]])
-  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
-
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK: func @buffer_forwarding_no_conflict(
-// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
-func @buffer_forwarding_no_conflict(
-    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
-    %sz: index)
-  -> (tensor<?xf32>)
-{
-  %f0 = arith.constant 0.0: f32
-
-  // init_tensor itself does not alloc but forwards to the insert_slice.
-  // InitTensorOp replaces the init_tensor with an inplace extract_slice.
-  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
-  %a = linalg.init_tensor[%sz] : tensor<?xf32>
-
-  // CHECK: linalg.fill({{.*}}, %[[T_SUBVIEW]])
-  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // Self-copy canonicalizes away later.
-  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
-
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
 // CHECK-LABEL: func @scf_if_inplace(
 // CHECK-SAME: %[[cond:.*]]: i1, %[[t1:.*]]: memref, %[[v:.*]]: vector
 func @scf_if_inplace(%cond: i1,