diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -202,6 +202,20 @@ /// For debugging only. Should be used together with `testAnalysisOnly`. bool printConflicts = false; + /// If set to `true`, `getAliasingOpOperand` will return the corresponding + /// "out"/"dest" OpOperand for every op that has the notion of an "out"/"dest" + /// operand. I.e., the aliasing OpOperand of the i-th tensor OpResult is + /// usually the i-th "out" tensor OpOperand. This is in line with + /// destination-passing style and the default behavior. Op interface + /// implementations must follow this contract to avoid surprising behavior. + /// + /// If set to `false`, BufferizableOpInterface implementations can try to be + /// smart and choose to alias with "in" operands or other operands. E.g., the + /// result of a `linalg.generic` op could bufferize in-place with an "in" + /// OpOperand if the corresponding "out" operand is not used within the + /// computation. Whether this pays off or not can be very input IR-specific. + bool alwaysAliasingWithDest = true; + /// Buffer alignment for new memory allocations. 
unsigned int bufferAlignment = 128; diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td --- a/mlir/include/mlir/Dialect/Linalg/Passes.td +++ b/mlir/include/mlir/Dialect/Linalg/Passes.td @@ -49,6 +49,10 @@ Option<"allowUnknownOps", "allow-unknown-ops", "bool", /*default=*/"false", "Allows unknown (not bufferizable) ops in the input IR.">, + Option<"alwaysAliasingWithDest", "always-aliasing-with-dest", "bool", + /*default=*/"true", + "Tensor OpResult cannot bufferize inplace OpOperands other than " + "out or dest OpOperands (if the op has a notion of such operands)">, Option<"useAlloca", "use-alloca", "bool", /*default=*/"false", "Use stack allocations for memrefs (for testing purposes only)">, diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp --- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/LinalgInterfaceImpl.cpp @@ -165,8 +165,7 @@ bool bufferizesToMemoryWrite(Operation *op, OpOperand &opOperand, const BufferizationState &state) const { - // Operand is written to if it has an aliasing OpResult. For more details, - // see `computeAliasingPairs`. + // Operand is written to if it has an aliasing OpResult. auto bufferizableOp = cast(op); return !bufferizableOp.getAliasingOpResult(opOperand, state).empty(); } @@ -176,6 +175,12 @@ const BufferizationState &state) const { auto genericOp = cast(op); + // By default, the i-th OpResult may alias with the i-th "out" tensor. + if (state.getOptions().alwaysAliasingWithDest) + return {genericOp.getOutputOperand(opResult.getResultNumber())}; + + // We can try to be smart and alias in-place with an "in" tensor if the + // corresponding "out" tensor is not used in the computation. // Aliasing OpOperand/OpResult pairs are computed by `computeAliasingPairs`. 
DenseMap pairs = computeAliasingPairs(genericOp); for (OpOperand *opOperand : genericOp.getInputAndOutputOperands()) @@ -189,6 +194,14 @@ const BufferizationState &state) const { auto genericOp = cast(op); + // By default, the i-th "out" tensor may alias with the i-th OpResult. + if (state.getOptions().alwaysAliasingWithDest) { + if (genericOp.isOutputTensor(&opOperand)) + return {genericOp.getTiedOpResult(&opOperand)}; + return {}; + } + + // We can try to be smart. See comment in `getAliasingOpOperand`. // Aliasing OpOperand/OpResult pairs are computed by `computeAliasingPairs`. DenseMap pairs = computeAliasingPairs(genericOp); if (!pairs.count(&opOperand)) diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp @@ -97,6 +97,7 @@ opt.fullyDynamicLayoutMaps = fullyDynamicLayoutMaps; opt.printConflicts = printConflicts; opt.testAnalysisOnly = testAnalysisOnly; + opt.alwaysAliasingWithDest = alwaysAliasingWithDest; if (initTensorElimination) { opt.addPostAnalysisStep( linalg_ext::insertSliceAnchoredInitTensorEliminationStep); diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-aliasing-in.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-aliasing-in.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-aliasing-in.mlir @@ -0,0 +1,75 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-memref always-aliasing-with-dest=0" -split-input-file | FileCheck %s + +// CHECK-LABEL: func @linalg_op_bufferizes_inplace_with_input +// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref +func @linalg_op_bufferizes_inplace_with_input( + %t1: tensor {linalg.inplaceable = true}, + %t2: tensor {linalg.inplaceable = false}, + %t3: tensor 
{linalg.inplaceable = false}, + %s1: index, %s2: index, %cst: f32) -> tensor { + // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t1]] : {{.*}}) + %r = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1)-> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%t1, %t2 : tensor, tensor) + outs(%t3 : tensor) { + ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : + %add = arith.addf %arg0, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @linalg_op_bufferizes_out_of_place_with_input +// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref +func @linalg_op_bufferizes_out_of_place_with_input( + %t1: tensor {linalg.inplaceable = false}, + %t2: tensor {linalg.inplaceable = false}, + %t3: tensor {linalg.inplaceable = false}, + %s1: index, %s2: index, %cst: f32) -> tensor { + // CHECK: %[[alloc:.*]] = memref.alloc + // CHECK: memref.copy %[[t1]], %[[alloc]] + // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[alloc]] : {{.*}}) + %r = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1)-> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%t1, %t2 : tensor, tensor) + outs(%t3 : tensor) { + ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : + %add = arith.addf %arg0, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor + // CHECK: return %[[alloc]] + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @linalg_op_output_cannot_alias_with_input +// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref +func @linalg_op_output_cannot_alias_with_input( + %t1: tensor {linalg.inplaceable = true}, + %t2: tensor {linalg.inplaceable = false}, + %t3: tensor {linalg.inplaceable = true}, + %s1: index, %s2: index, %cst: f32) -> tensor { + // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : 
{{.*}}) outs(%[[t3]] : {{.*}}) + %r = linalg.generic { + indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, + affine_map<(d0, d1) -> (d1)>, + affine_map<(d0, d1)-> (d0, d1)>], + iterator_types = ["parallel", "parallel"]} + ins(%t1, %t2 : tensor, tensor) + outs(%t3 : tensor) { + ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : + %add = arith.addf %arg0, %arg1 : f32 + linalg.yield %add : f32 + } -> tensor + return %r : tensor +} + diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-aliasing-in.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-aliasing-in.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-aliasing-in.mlir @@ -0,0 +1,83 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="test-analysis-only allow-return-memref always-aliasing-with-dest=0" -split-input-file | FileCheck %s + +// This is a test case for alwaysAliasingWithDest = 0. In that case, an OpResult +// may bufferize in-place with an "in" OpOperand or any non-"out" OpOperand. + + +#accesses = [ + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)> +] +#trait = { + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +// CHECK-LABEL: func @linalg_op_same_out_tensors( +func @linalg_op_same_out_tensors( + %t1: tensor {linalg.inplaceable = true}, +// CHECK-SAME: bufferization.access = "read-write" + %t2: tensor {linalg.inplaceable = true}) +// CHECK-SAME: bufferization.access = "write" + -> (tensor, tensor){ + + // %1 and %2 are not used in the computation, so the two OpResults do not + // necessarily have to bufferize in-place with the two "out" OpOperands. They + // bufferize in-place with the first and second OpOperand (one of which is an + // "in" OpOperand). 
+ // CHECK: linalg.generic + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"] + %o:2 = linalg.generic #trait ins(%t1 : tensor) + outs (%t2, %t2 : tensor, tensor) { + ^bb(%0: f32, %1: f32, %2 : f32) : + linalg.yield %0, %0 : f32, f32 + } -> (tensor, tensor) + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0, 1] + return %o#0, %o#1 : tensor, tensor +} + +// ----- + +#accesses = [ + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)>, + affine_map<(i) -> (i)> +] +#trait = { + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +// CHECK-LABEL: func @linalg_op_same_out_tensors_2( +func @linalg_op_same_out_tensors_2( + %t1: tensor {linalg.inplaceable = true}, +// CHECK-SAME: bufferization.access = "read-write" + %t2: tensor {linalg.inplaceable = true}) +// CHECK-SAME: bufferization.access = "write" + -> (tensor, tensor, tensor){ + + // %1, %2 and %3 are not used in the computation, so the three OpResults do + // not necessarily have to bufferize in-place with the three "out" OpOperands. + // They bufferize in-place with the first, second and third OpOperand (one of + // which is an "in" OpOperand). + // In contrast to the previous test case, two of the chosen OpOperands are the + // same (aliasing) SSA value, which is why one of them must bufferize + // out-of-place. 
+ // CHECK: linalg.generic + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true", "false"] + %o:3 = linalg.generic #trait + ins(%t1 : tensor) + outs (%t2, %t2, %t2 : tensor, tensor, tensor) { + ^bb(%0: f32, %1: f32, %2 : f32, %3 : f32) : + linalg.yield %0, %0, %0 : f32, f32, f32 + } -> (tensor, tensor, tensor) + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0, 1, -1] + return %o#0, %o#1, %o#2 : tensor, tensor, tensor +} + diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir @@ -990,13 +990,13 @@ // CHECK-LABEL: func @linalg_op_same_out_tensors( func @linalg_op_same_out_tensors( %t1: tensor {linalg.inplaceable = true}, -// CHECK-SAME: bufferization.access = "read-write" +// CHECK-SAME: bufferization.access = "read" %t2: tensor {linalg.inplaceable = true}) // CHECK-SAME: bufferization.access = "write" -> (tensor, tensor){ // CHECK: linalg.generic - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true"] + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false"] %o:2 = linalg.generic #trait ins(%t1 : tensor) outs (%t2, %t2 : tensor, tensor) { ^bb(%0: f32, %1: f32, %2 : f32) : @@ -1004,7 +1004,7 @@ } -> (tensor, tensor) // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0, 1] + // CHECK-SAME: __equivalent_func_args__ = [1, -1] return %o#0, %o#1 : tensor, tensor } @@ -1024,13 +1024,13 @@ // CHECK-LABEL: func @linalg_op_same_out_tensors_2( func @linalg_op_same_out_tensors_2( %t1: tensor {linalg.inplaceable = true}, -// CHECK-SAME: bufferization.access = "read-write" +// CHECK-SAME: bufferization.access = "read" %t2: tensor {linalg.inplaceable = true}) // CHECK-SAME: bufferization.access = "write" -> (tensor, tensor, tensor){ // CHECK: linalg.generic - 
// CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "true", "false"] + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "false", "false"] %o:3 = linalg.generic #trait ins(%t1 : tensor) outs (%t2, %t2, %t2 : tensor, tensor, tensor) { @@ -1039,7 +1039,7 @@ } -> (tensor, tensor, tensor) // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0, 1, -1] + // CHECK-SAME: __equivalent_func_args__ = [1, -1, -1] return %o#0, %o#1, %o#2 : tensor, tensor, tensor } diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -1176,63 +1176,12 @@ // CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref func @linalg_op_bufferizes_inplace_with_input( %t1: tensor {linalg.inplaceable = true}, - %t2: tensor {linalg.inplaceable = false}, - %t3: tensor {linalg.inplaceable = false}, - %s1: index, %s2: index, %cst: f32) -> tensor { - // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t1]] : {{.*}}) - %r = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d1)>, - affine_map<(d0, d1)-> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%t1, %t2 : tensor, tensor) - outs(%t3 : tensor) { - ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : - %add = arith.addf %arg0, %arg1 : f32 - linalg.yield %add : f32 - } -> tensor - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @linalg_op_bufferizes_out_of_place_with_input -// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref -func @linalg_op_bufferizes_out_of_place_with_input( - %t1: tensor {linalg.inplaceable = false}, - %t2: tensor {linalg.inplaceable = false}, - %t3: tensor {linalg.inplaceable = false}, - %s1: index, %s2: index, %cst: f32) -> tensor { - // CHECK: %[[alloc:.*]] = memref.alloc - 
// CHECK: memref.copy %[[t1]], %[[alloc]] - // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[alloc]] : {{.*}}) - %r = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, - affine_map<(d0, d1) -> (d1)>, - affine_map<(d0, d1)-> (d0, d1)>], - iterator_types = ["parallel", "parallel"]} - ins(%t1, %t2 : tensor, tensor) - outs(%t3 : tensor) { - ^bb0(%arg0 : f32, %arg1 : f32, %arg2 : f32) : - %add = arith.addf %arg0, %arg1 : f32 - linalg.yield %add : f32 - } -> tensor - // CHECK: return %[[alloc]] - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @linalg_op_output_cannot_alias_with_input -// CHECK-SAME: %[[t1:.*]]: memref, %[[t2:.*]]: memref, %[[t3:.*]]: memref -func @linalg_op_output_cannot_alias_with_input( - %t1: tensor {linalg.inplaceable = true}, - %t2: tensor {linalg.inplaceable = false}, + %t2: tensor {linalg.inplaceable = true}, %t3: tensor {linalg.inplaceable = true}, %s1: index, %s2: index, %cst: f32) -> tensor { // CHECK: linalg.generic {{.*}} ins(%[[t1]], %[[t2]] : {{.*}}) outs(%[[t3]] : {{.*}}) %r = linalg.generic { - indexing_maps = [affine_map<(d0, d1) -> (d1, d0)>, + indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d1)>, affine_map<(d0, d1)-> (d0, d1)>], iterator_types = ["parallel", "parallel"]}