diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/BufferizationInterfaceImpl.cpp b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/BufferizationInterfaceImpl.cpp
--- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/BufferizationInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/BufferizationInterfaceImpl.cpp
@@ -25,14 +25,39 @@
 // TODO: These ops should implement BufferizableOpInterface directly when moved
 // to the Bufferization dialect.
 
-// TODO: These implementations are conservative and will likely have to be
-// loosened for partial bufferization.
-
 /// ToMemrefOp casts a tensor into a memref. The resulting memref is the memory
-/// location of the incoming tensor once it will be bufferized. In the anlysis,
-/// the incoming tensor is assumed to bufferize to a memory read and to an
-/// inplace memory write, since it is unknown what will happen to the resulting
-/// memref.
+/// location of the incoming tensor once it has been bufferized.
+///
+/// ToMemrefOps are generated during partial bufferization when passing the
+/// result of a not-yet-bufferized op into a bufferized op.
+///
+/// As an example, consider the following IR:
+///
+/// %t1 = "writing_op"(%t0) : tensor<?xf32>
+/// %t2 = "another_writing_op"(%t1) : tensor<?xf32>
+///
+/// In this example, the first op is unknown to the bufferization. Therefore,
+/// the analysis conservatively considers its result as not writable,
+/// preventing the second op from bufferizing inplace.
+///
+/// %t1 = "writing_op"(%t0) : tensor<?xf32>
+/// %t2 = "another_writing_op"(%t1) { inplace = [false] } : tensor<?xf32>
+///
+/// During bufferization, the first op is left as is, but the second op uses
+/// its result, wrapped in a ToMemrefOp and copied into a newly allocated
+/// buffer.
+///
+/// %t1 = "writing_op"(%t0) : tensor<?xf32>
+/// %t1_memref = bufferization.to_memref %t1
+/// %t1_copy = memref.alloc
+/// memref.copy %t1_memref, %t1_copy
+/// "another_writing_op"(%t1_copy)
+///
+/// From an analysis perspective, ToMemrefOp operands bufferize to a memory
+/// read. They do not bufferize to a memory write. Since the analysis cannot
+/// analyze memref usage, it is unknown how the resulting memref is used. We
+/// can, however, be sure that the memref is not written to inplace, because
+/// buffer copies are inserted together with ToMemrefOps, as can be seen from
+/// the example above.
 struct ToMemrefOpInterface
     : public BufferizableOpInterface::ExternalModel<ToMemrefOpInterface,
                                                     bufferization::ToMemrefOp> {
@@ -52,13 +77,24 @@
   LogicalResult bufferize(Operation *op, OpBuilder &b,
                           BufferizationState &state) const {
+    // TODO: Need to insert memref.cast in certain situations?
+    auto toMemrefOp = cast<bufferization::ToMemrefOp>(op);
+    if (!state.isMapped(toMemrefOp.tensor()))
+      return success();
+
+    Value memref = state.lookupBuffer(toMemrefOp.tensor());
+    // Do not replace ToMemrefOps that were just created.
+    if (toMemrefOp.getResult() != memref)
+      toMemrefOp.replaceAllUsesWith(memref);
+
     return success();
   }
 };
 
-/// ToTensorOp conceptually loads a tensor from a memory location. Such ops do
-/// not lower any further, and they should have disappeared by the time the
-/// input is fully bufferized.
+/// ToTensorOp conceptually loads a tensor from a memory location. ToTensorOps
+/// are inserted at bufferization boundaries during partial bufferization. They
+/// do not lower any further, and should have disappeared by the time the input
+/// is fully bufferized.
 ///
 /// The analysis has no information about the memref that is loaded from by the
 /// ToTensorOp. We have to assume that the loaded tensor may after bufferization
@@ -66,6 +102,38 @@
 /// ToMemrefOp have no aliasing OpOperand/OpResult pairs, this cannot be encoded
 /// directly in the analysis. However, declaring ToTensorOp results as not
 /// writable also enforces a buffer copy and has the same effect.
+///
+/// OpOperands (of other ops) that are ToTensorOp results never bufferize
+/// inplace. This leads to additional buffer copies when gradually bufferizing
+/// IR compared to a one-shot bufferization.
+///
+/// As an example, consider the following IR:
+///
+/// %t1 = "writing_op"(%t0) : tensor<?xf32>
+/// %t2 = "another_writing_op"(%t1) : tensor<?xf32>
+///
+/// When the entire IR is bufferized in one shot, both ops bufferize inplace in
+/// the absence of conflicts:
+///
+/// %t1 = "writing_op"(%t0) { inplace = [true] } : tensor<?xf32>
+/// %t2 = "another_writing_op"(%t1) { inplace = [true] } : tensor<?xf32>
+///
+/// Let's assume that after the first partial bufferization, the IR is in the
+/// following state:
+///
+/// "memref_writing_op"(%m) : (memref<?xf32>) -> ()
+/// %m_tensor = bufferization.to_tensor %m
+/// %t2 = "another_writing_op"(%m_tensor) : tensor<?xf32>
+///
+/// In the above example, another_writing_op cannot bufferize inplace because
+/// ToTensorOp results are not writable.
+///
+/// Note: ToTensorOps are always inserted right before their use, so they are
+/// guaranteed to load the most recent buffer contents.
+///
+/// It is up to other passes to eliminate superfluous buffer copies.
+/// Alternatively, such copies are not introduced in the first place when the
+/// entire IR is bufferized in one shot.
 struct ToTensorOpInterface
     : public BufferizableOpInterface::ExternalModel<ToTensorOpInterface,
                                                     bufferization::ToTensorOp> {
@@ -82,7 +150,20 @@
   }
 
   bool isWritable(Operation *op, Value value,
                   BufferizationState &state) const {
-    // It is unknown whether the MemRef operand is writable or not.
+    // ToTensorOps are generated at the partial bufferization boundary. The RaW
+    // analysis cannot see through ToTensorOps/ToMemrefOps, so buffer copies
+    // must be inserted whenever a ToTensorOp result is written to. Otherwise,
+    // two conflicting writing ops could bufferize inplace in such a way that
+    // the analysis would not consider them conflicting. E.g.:
+    //
+    // %t1 = "writing_op"(%t0) : tensor<?xf32>
+    // %t1_memref = bufferization.to_memref %t1
+    // %t1_tensor = bufferization.to_tensor %t1_memref
+    // %t2 = "another_writing_op"(%t1_tensor) : tensor<?xf32>
+    //
+    // In the above example, writing_op and another_writing_op could bufferize
+    // inplace from a RaW analysis perspective, but isWritable = false forces a
+    // buffer copy.
     return false;
   }
 };
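Taken together, the two interfaces above describe what happens at a partial bufferization boundary: the memref side of an unknown op is bridged with a ToMemrefOp plus a buffer copy, and the tensor side with a non-writable ToTensorOp. The sketch below is a hand-written illustration of that flow, not one of the patch's tests; "test.dummy_op", the ?xf32 types, and the SSA names are illustrative placeholders.

// Input IR: a bufferizable write, an op unknown to the bufferization, and a
// second bufferizable write that consumes the unknown op's result.
func @partial_boundary(%t0: tensor<?xf32> {linalg.inplaceable = true},
                       %i: index, %f: f32, %v: vector<5xf32>) -> tensor<?xf32> {
  %a = tensor.insert %f into %t0[%i] : tensor<?xf32>
  %b = "test.dummy_op"(%a) : (tensor<?xf32>) -> tensor<?xf32>
  %c = vector.transfer_write %v, %b[%i] : vector<5xf32>, tensor<?xf32>
  return %c : tensor<?xf32>
}

// Roughly the result of partial bufferization (%m0 stands for the bufferized
// form of %t0): the known ops are rewritten on memrefs, the unknown op is
// bridged with to_tensor/to_memref, and a copy is allocated because the
// unknown op's result is not writable.
//
//   memref.store %f, %m0[%i] : memref<?xf32>
//   %a = bufferization.to_tensor %m0 : memref<?xf32>
//   %b = "test.dummy_op"(%a) : (tensor<?xf32>) -> tensor<?xf32>
//   %b_memref = bufferization.to_memref %b : memref<?xf32>
//   %alloc = memref.alloc(...) : memref<?xf32>
//   memref.copy %b_memref, %alloc
//   vector.transfer_write %v, %alloc[%i] : vector<5xf32>, memref<?xf32>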
diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.cpp
--- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ComprehensiveBufferize.cpp
@@ -229,12 +229,6 @@
 /// Return true if opOperand has been decided to bufferize in-place.
 static bool isInplaceMemoryWrite(OpOperand &opOperand,
                                  const BufferizationAliasInfo &aliasInfo) {
-  // The analysis does not know what happens to the result of a ToMemrefOp, so
-  // we assume that it is written to.
-  // TODO: This is a conservative implementation. This rule will have to be
-  // relaxed for partial bufferization.
-  if (isa<bufferization::ToMemrefOp>(opOperand.getOwner()))
-    return true;
   // OpOperands without an aliasing OpResult do not write.
   OpResult opResult = getAliasingOpResult(opOperand);
   if (!opResult)
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
@@ -167,23 +167,3 @@
   }
   return %r: tensor<4xi32>
 }
-
-// -----
-
-func @to_memref_op_is_writing(
-    %t1: tensor<?xf32> {linalg.inplaceable = true}, %idx1: index,
-    %idx2: index, %idx3: index, %v1: vector<5xf32>) -> (vector<5xf32>, vector<5xf32>) {
-  // This is a RaW conflict because to_memref is an inplace write and %t1 is
-  // read further down. This will likely have to change with partial
-  // bufferization.
-
-  // expected-error @+1 {{input IR has RaW conflict}}
-  %0 = bufferization.to_memref %t1 : memref<?xf32>
-
-  // Read from both.
-  %cst = arith.constant 0.0 : f32
-  %r1 = vector.transfer_read %t1[%idx3], %cst : tensor<?xf32>, vector<5xf32>
-  %r2 = vector.transfer_read %0[%idx3], %cst : memref<?xf32>, vector<5xf32>
-
-  return %r1, %r2 : vector<5xf32>, vector<5xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-partial.mlir
@@ -1,13 +1,22 @@
 // RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s
 
-// TODO: Bufferize result IR of bufferization.
-// TODO: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s
+// Bufferize the result IR of a first bufferization run.
+// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -linalg-comprehensive-module-bufferize="allow-return-memref allow-unknown-ops" -split-input-file | FileCheck %s
 
 // Run fuzzer with different seeds.
 // RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
 // RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null
 // RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-module-bufferize="test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-tensor-bufferize -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-TENSOR
+// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-vector-bufferize -canonicalize -split-input-file | FileCheck %s --check-prefix=CHECK-VECTOR
+
+// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-tensor-bufferize -canonicalize | \
+// RUN:   mlir-opt -linalg-comprehensive-vector-bufferize -canonicalize | FileCheck %s --check-prefix=CHECK-TENSOR-VECTOR
+
+// RUN: mlir-opt %s -allow-unregistered-dialect -linalg-comprehensive-vector-bufferize -canonicalize | \
+// RUN:   mlir-opt -linalg-comprehensive-tensor-bufferize -canonicalize | FileCheck %s --check-prefix=CHECK-VECTOR-TENSOR
+
 // CHECK-LABEL: func @use_of_unknown_op_1(
 // CHECK-SAME: %[[m1:.*]]: memref {linalg.inplaceable = true})
@@ -95,15 +104,23 @@
 // CHECK-LABEL: func @use_of_bufferizable_op_in_unbufferizable_op
 // CHECK-SAME: %[[m1:.*]]: memref
 func @use_of_bufferizable_op_in_unbufferizable_op(
     %t1: tensor<?xf32>, %o: index, %s: index) -> (tensor<?xf32>, tensor<?xf32>) {
   // CHECK: %[[subview:.*]] = memref.subview %[[m1]]
+  // CHECK-TENSOR: %[[m1_memref:.*]] = bufferization.to_memref %[[m1]]
+  // CHECK-TENSOR: %[[subview:.*]] = memref.subview %[[m1_memref]]
   %0 = tensor.extract_slice %t1[%o][%s][1] : tensor<?xf32> to tensor<?xf32>
   // CHECK: %[[subview_tensor:.*]] = bufferization.to_tensor %[[subview]]
   // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[subview_tensor]])
+  // CHECK-TENSOR: %[[subview_tensor:.*]] = bufferization.to_tensor %[[subview]]
+  // CHECK-TENSOR: %[[dummy:.*]] = "test.dummy_op"(%[[subview_tensor]])
   %1 = "test.dummy_op"(%0) : (tensor<?xf32>) -> tensor<?xf32>
   // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]]
   // CHECK: return %[[subview]], %[[dummy_memref]]
+  // CHECK-TENSOR: %[[subview_tensor:.*]] = bufferization.to_tensor %[[subview]]
+  // CHECK-TENSOR: return %[[subview_tensor]], %[[dummy]]
   return %0, %1 : tensor<?xf32>, tensor<?xf32>
 }
@@ -148,3 +165,113 @@
   // CHECK: return %[[alloc]]
   return %1 : tensor<?xf32>
 }
+
+// -----
+
+// CHECK-LABEL: func @write_to_same_tensor_1
+// CHECK-SAME: %[[t1:.*]]: memref
+func @write_to_same_tensor_1(
+    %t1: tensor<?xf32> {linalg.inplaceable = true}, %idx1: index,
+    %idx2: index, %idx3: index, %v1: vector<5xf32>) -> (tensor<?xf32>, tensor<?xf32>) {
+  // CHECK: %[[t1_tensor:.*]] = bufferization.to_tensor %[[t1]]
+  // Note: bufferization.to_tensor ops are not writable. So dummy_op will not
+  // bufferize inplace.
+  // CHECK: %[[dummy:.*]] = "test.dummy_op"(%[[t1_tensor]])
+  // CHECK: %[[dummy_memref:.*]] = bufferization.to_memref %[[dummy]]
+  %0 = "test.dummy_op"(%t1) : (tensor<?xf32>) -> tensor<?xf32>
+
+  // CHECK: vector.transfer_write %{{.*}}, %[[t1]]
+  %r1 = vector.transfer_write %v1, %t1[%idx3] : vector<5xf32>, tensor<?xf32>
+
+  // %0 is not writable. So create a copy here.
+  // CHECK: %[[alloc:.*]] = memref.alloc
+  // CHECK: %[[subview:.*]] = memref.subview %[[dummy_memref]]
+  // CHECK: linalg.copy(%[[subview]], %[[alloc]])
+  %e = tensor.extract_slice %0[%idx1][%idx2][1] : tensor<?xf32> to tensor<?xf32>
+
+  // CHECK: vector.transfer_write %{{.*}}, %[[alloc]]
+  %r2 = vector.transfer_write %v1, %e[%idx3] : vector<5xf32>, tensor<?xf32>
+
+  // CHECK: return %[[alloc]]
+  return %r1, %r2 : tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// TODO: Check insertion point of to_tensor and to_memref ops.
+
+// CHECK-TENSOR-LABEL: func @writing_op_sequence
+// CHECK-TENSOR-SAME: %[[t1:.*]]: tensor
+// CHECK-TENSOR-NOT: tensor.insert
+
+// CHECK-VECTOR-LABEL: func @writing_op_sequence
+// CHECK-VECTOR-SAME: %[[t1:.*]]: tensor
+// CHECK-VECTOR-NOT: vector.transfer_write
+
+// CHECK-TENSOR-VECTOR-LABEL: func @writing_op_sequence
+// CHECK-TENSOR-VECTOR-SAME: %[[t1:.*]]: tensor
+// CHECK-TENSOR-VECTOR-NOT: tensor.insert
+// CHECK-TENSOR-VECTOR-NOT: vector.transfer_write
+
+// CHECK-VECTOR-TENSOR-LABEL: func @writing_op_sequence
+// CHECK-VECTOR-TENSOR-SAME: %[[t1:.*]]: tensor
+func @writing_op_sequence(%t1: tensor<?xf32>, %s: f32,
+                          %i: index, %v: vector<5xf32>) -> tensor<?xf32> {
+  // CHECK-TENSOR: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]]
+  // CHECK-TENSOR: %[[alloc:.*]] = memref.alloc
+  // CHECK-TENSOR: %[[casted:.*]] = memref.cast %[[alloc]]
+  // CHECK-TENSOR: memref.copy %[[t1_memref]], %[[casted]]
+  // CHECK-TENSOR: memref.store %{{.*}}, %[[alloc]]
+
+  // CHECK-VECTOR: %[[write1:.*]] = tensor.insert %{{.*}} into %[[t1]]
+
+  // CHECK-TENSOR-VECTOR: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]]
+  // CHECK-TENSOR-VECTOR: %[[alloc:.*]] = memref.alloc
+  // CHECK-TENSOR-VECTOR: %[[casted:.*]] = memref.cast %[[alloc]]
+  // CHECK-TENSOR-VECTOR: %[[alloc2:.*]] = memref.alloc
+  // CHECK-TENSOR-VECTOR: %[[casted2:.*]] = memref.cast %[[alloc2]]
+  // CHECK-TENSOR-VECTOR: memref.copy %[[t1_memref]], %[[casted]]
+  // CHECK-TENSOR-VECTOR: memref.store %{{.*}}, %[[alloc]]
+
+  // CHECK-VECTOR-TENSOR: %[[t1_memref:.*]] = bufferization.to_memref %[[t1]]
+  // CHECK-VECTOR-TENSOR: %[[alloc:.*]] = memref.alloc
+  // CHECK-VECTOR-TENSOR: %[[casted:.*]] = memref.cast %[[alloc]]
+  // CHECK-VECTOR-TENSOR: memref.copy %[[t1_memref]], %[[casted]]
+  // CHECK-VECTOR-TENSOR: memref.store %{{.*}}, %[[alloc]]
+  %write1 = tensor.insert %s into %t1[%i] : tensor<?xf32>
+
+  // CHECK-TENSOR: %[[t1_tensor:.*]] = bufferization.to_tensor %[[casted]]
+  // CHECK-TENSOR: %[[write2:.*]] = vector.transfer_write %{{.*}}, %[[t1_tensor]]
+
+  // CHECK-VECTOR: %[[write1_memref:.*]] = bufferization.to_memref %[[write1]]
+  // CHECK-VECTOR: %[[alloc:.*]] = memref.alloc
+  // CHECK-VECTOR: %[[casted:.*]] = memref.cast %[[alloc]]
+  // CHECK-VECTOR: memref.copy %[[write1_memref]], %[[casted]]
+  // CHECK-VECTOR: vector.transfer_write %{{.*}}, %[[alloc]]
+
+  // CHECK-TENSOR-VECTOR: memref.copy %[[casted]], %[[casted2]]
+  // CHECK-TENSOR-VECTOR: vector.transfer_write %{{.*}}, %[[alloc2]]
+
+  // CHECK-VECTOR-TENSOR: %[[alloc2:.*]] = memref.alloc
+  // CHECK-VECTOR-TENSOR: %[[casted2:.*]] = memref.cast %[[alloc2]]
+  // CHECK-VECTOR-TENSOR: memref.copy %[[casted]], %[[casted2]]
+  // CHECK-VECTOR-TENSOR: vector.transfer_write %{{.*}}, %[[alloc2]]
+  %write2 = vector.transfer_write %v, %write1[%i] : vector<5xf32>, tensor<?xf32>
+
+  // CHECK-TENSOR: return %[[write2]]
+
+  // CHECK-VECTOR: %[[write2_tensor:.*]] = bufferization.to_tensor %[[casted]]
+  // CHECK-VECTOR: return %[[write2_tensor]]
+
+  // CHECK-TENSOR-VECTOR: %[[write2_tensor:.*]] = bufferization.to_tensor %[[casted2]]
+  // CHECK-TENSOR-VECTOR: return %[[write2_tensor]]
+
+  // CHECK-VECTOR-TENSOR: %[[write2_tensor:.*]] = bufferization.to_tensor %[[casted2]]
+  // CHECK-VECTOR-TENSOR: return %[[write2_tensor]]
+  return %write2 : tensor<?xf32>
+}
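The CHECK-TENSOR-VECTOR and CHECK-VECTOR-TENSOR pipelines above introduce additional alloc/copy pairs at the boundary between the two partial bufferization runs. For contrast, the following hand-written sketch (not a FileCheck'd test from this patch; the linalg.init_tensor-based input and the expected output are illustrative assumptions) shows a similar write sequence bufferized in a single run, where no to_tensor/to_memref boundary exists between the writes and no extra copy is needed:

// Input: two consecutive writes into a freshly allocated tensor.
func @writing_op_sequence_one_shot(%s: f32, %i: index, %v: vector<5xf32>,
                                   %sz: index) -> tensor<?xf32> {
  %0 = linalg.init_tensor [%sz] : tensor<?xf32>
  %w1 = tensor.insert %s into %0[%i] : tensor<?xf32>
  %w2 = vector.transfer_write %v, %w1[%i] : vector<5xf32>, tensor<?xf32>
  return %w2 : tensor<?xf32>
}

// Roughly the result of a single -linalg-comprehensive-module-bufferize run
// (with allow-return-memref): one allocation, both writes inplace, no copy.
//
//   %m = memref.alloc(%sz) : memref<?xf32>
//   memref.store %s, %m[%i] : memref<?xf32>
//   vector.transfer_write %v, %m[%i] : vector<5xf32>, memref<?xf32>
//   return %m : memref<?xf32>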