diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -36,9 +36,8 @@
      DeclareOpInterfaceMethods<TransformOpInterface>]> {
   let description = [{
     This transform materializes an allocation for the targeted tensor value. It
-    replaces all original uses of the target with the newly allocated buffer,
-    wrapped in a `bufferization.to_tensor` op. It returns a handle to the result
-    of the `to_tensor` op.
+    replaces all original uses of the target with the result of the newly
+    created `bufferization.alloc_tensor` op and returns a handle to its result.
 
     Example:
     ```
@@ -49,19 +48,13 @@
     Is rewritten to:
     ```
     %0 = "some_op"() : () -> (tensor<10xf32>)
-    %1 = memref.alloc() : memref<10xf32>
-    memref.tensor_store %0, %1 : memref<10xf32>
-    %2 = bufferization.to_tensor %1 restrict writable : memref<10xf32>
-    "some_use"(%2) : (tensor<10xf32>) -> ()
+    %1 = bufferization.alloc_tensor() copy(%0) : tensor<10xf32>
+    "some_use"(%1) : (tensor<10xf32>) -> ()
    ```
 
-    This transform has optimized lowerings for certain targets that are results
-    of non-DPS ops. For such targets, not only a buffer allocation is emitted
-    but also the defining op is bufferized. This is to avoid a second
-    allocation for the missing destination of the non-DPS op (when subsequently
-    running a bufferization pass/transform). Currently supported ops with
-    optimized lowerings:
-    - tensor.pad
+    If the target is an OpResult, the entire defining op is moved into the
+    region of the `bufferization.alloc_tensor` op. Only ops with a single
+    result are supported.
 
     An optional memory space attribute can be specified for the materialized
     buffer allocation.
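To make the OpResult path concrete: for a `tensor.pad` target, the rewrite takes the shape below. This is a hand-written sketch distilled from the updated test expectations at the end of this patch, not patch output; `%size0`/`%size1` stand for the reified result sizes, and `%t`, `%l2`, `%h1`, `%h2`, `%c50` are assumed to be defined elsewhere.

```
// Before: the target value %0 is the result of a tensor.pad.
%0 = tensor.pad %t low[5, %l2] high[%h1, %h2] {
^bb0(%i: index, %j: index):
  tensor.yield %c50 : index
} : tensor<?x10xindex> to tensor<?x?xindex>

// After: the pad is moved into the region of the new alloc_tensor and
// its result is yielded; all other uses of %0 are redirected to %1.
%1 = bufferization.alloc_tensor(%size0, %size1) init {
  %padded = tensor.pad %t low[5, %l2] high[%h1, %h2] {
  ^bb0(%i: index, %j: index):
    tensor.yield %c50 : index
  } : tensor<?x10xindex> to tensor<?x?xindex>
  bufferization.yield %padded
} : tensor<?x?xindex>
```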
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -44,34 +44,15 @@
 //===----------------------------------------------------------------------===//
 
 using LinalgLoops = SmallVector<Operation *>;
 
-/// Materialize a buffer allocation for the given tensor.pad op and lower the
-/// op to linalg.fill/linalg.generic + memref.tensor_store. E.g.:
-///
-/// %0 = tensor.pad low[%l] high[%h] %t ...
-///
-/// is lowered to:
-///
-/// %alloc = memref.alloc
-/// linalg.fill ... outs(%alloc)
-/// %subview = memref.subview %alloc [%l] [...] [1]
-/// memref.tensor_store %t, %subview
-/// %0 = bufferization.to_tensor %alloc restrict writable
-///
-/// In addition to rewriting the IR as shown above, the result of the
-/// bufferization.to_tensor op is returned.
-Value bufferizeToAllocation(RewriterBase &rewriter, tensor::PadOp padOp,
-                            Attribute memorySpace = {});
-
 /// Materialize a buffer allocation for the given tensor value. E.g.:
 ///
-/// %alloc = memref.alloc
-/// memref.tensor_store %value, %alloc
-/// %0 = bufferization.to_tensor %alloc restrict writable
+/// %0 = bufferization.alloc_tensor() copy(%t) : tensor<5xf32>
 ///
-/// In case `value` is a tensor.pad result, the corresponding overload is used
-/// internally to produce a better bufferization.
-Value bufferizeToAllocation(RewriterBase &rewriter, Value value,
-                            Attribute memorySpace = {});
+/// If `value` is an OpResult, the defining op is moved into the region of
+/// the generated bufferization.alloc_tensor. If the tensor type has dynamic
+/// dimensions, the op must implement the ReifyRankedShapedTypeOpInterface.
+FailureOr<Value> bufferizeToAllocation(RewriterBase &rewriter, Value value,
+                                       Attribute memorySpace = {});
 
 void populatePadTensorTilingPatterns(RewritePatternSet &patterns,
                                      const LinalgTilingOptions &options);
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
--- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
+++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp
@@ -219,10 +219,15 @@
   Attribute memorySpace =
       getMemorySpace().has_value() ? getMemorySpace().value() : Attribute();
   IRRewriter rewriter(getContext());
-  auto transformed = llvm::to_vector(
-      llvm::map_range(state.getPayloadValues(getTarget()), [&](Value v) {
-        return linalg::bufferizeToAllocation(rewriter, v, memorySpace);
-      }));
+  SmallVector<Value> transformed;
+  for (Value v : state.getPayloadValues(getTarget())) {
+    FailureOr<Value> alloc =
+        linalg::bufferizeToAllocation(rewriter, v, memorySpace);
+    if (failed(alloc))
+      return mlir::emitSilenceableFailure(v.getLoc())
+             << "unable to create allocation for value";
+    transformed.push_back(*alloc);
+  }
   results.setValues(getTransformed().cast<OpResult>(), transformed);
   return DiagnosedSilenceableFailure::success();
 }
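For reference, the op is driven from a transform script like the one below, assembled from the tests in this patch; the silenceable failure added above is what fires when `bufferizeToAllocation` returns failure (e.g. for a defining op with more than one result).

```
transform.sequence failures(propagate) {
^bb1(%arg1: !pdl.operation):
  // Match the payload op whose result should be materialized into a
  // new allocation.
  %0 = transform.structured.match ops{["tensor.pad"]} in %arg1
      : (!pdl.operation) -> !pdl.operation
  // Target the op's first result value.
  %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
  // Materialize the allocation, optionally in a specific memory space.
  %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
}
```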
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertToDestinationStyle.cpp
@@ -117,90 +117,6 @@
   return genericOp;
 }
 
-static SmallVector<Value> reifyOrComputeDynamicSizes(OpBuilder &b,
-                                                     Value value) {
-  auto tensorType = value.getType().cast<RankedTensorType>();
-  if (tensorType.hasStaticShape())
-    return {};
-
-  // Try to reify dynamic sizes.
-  if (auto reifiableOp =
-          value.getDefiningOp<ReifyRankedShapedTypeOpInterface>()) {
-    ReifiedRankedShapedTypeDims reifiedShape;
-    if (succeeded(reifiableOp.reifyResultShapes(b, reifiedShape))) {
-      SmallVector<Value> dynSizes;
-      for (int64_t i = 0; i < tensorType.getRank(); ++i) {
-        if (tensorType.isDynamicDim(i))
-          dynSizes.push_back(
-              reifiedShape[value.cast<OpResult>().getResultNumber()][i]);
-      }
-      return dynSizes;
-    }
-  }
-
-  // Create tensor.dim ops.
-  SmallVector<Value> dynSizes;
-  for (int64_t i = 0; i < tensorType.getRank(); ++i) {
-    if (tensorType.isDynamicDim(i))
-      dynSizes.push_back(
-          b.create<tensor::DimOp>(value.getLoc(), value,
-                                  b.create<arith::ConstantIndexOp>(value.getLoc(), i)));
-  }
-  return dynSizes;
-}
-
-static Value createAllocationForTensor(RewriterBase &rewriter, Location loc,
-                                       Value value,
-                                       Attribute memorySpace = {}) {
-  OpBuilder::InsertionGuard g(rewriter);
-  auto tensorType = value.getType().cast<RankedTensorType>();
-
-  // Create buffer allocation.
-  auto memrefType = bufferization::getMemRefTypeWithStaticIdentityLayout(
-                        tensorType, memorySpace)
-                        .cast<MemRefType>();
-  SmallVector<Value> dynamicSizes = reifyOrComputeDynamicSizes(rewriter, value);
-  Value alloc = rewriter.create<memref::AllocOp>(loc, memrefType, dynamicSizes);
-
-  // Place deallocation at the end of the block.
-  rewriter.setInsertionPoint(rewriter.getInsertionBlock()->getTerminator());
-  rewriter.create<memref::DeallocOp>(loc, alloc);
-
-  return alloc;
-}
-
-Value linalg::bufferizeToAllocation(RewriterBase &rewriter, PadOp padOp,
-                                    Attribute memorySpace) {
-  OpBuilder::InsertionGuard g(rewriter);
-  rewriter.setInsertionPoint(padOp);
-  Location loc = padOp.getLoc();
-
-  // Create buffer allocation.
-  Value alloc =
-      createAllocationForTensor(rewriter, loc, padOp.getResult(), memorySpace);
-  rewriter.setInsertionPointAfter(alloc.getDefiningOp());
-
-  // Create linalg.fill or linalg.generic.
-  Operation *fillOp = movePaddingToFillOrGenericOp(rewriter, loc, padOp, alloc);
-  rewriter.setInsertionPointAfter(fillOp);
-
-  // Create memref.tensor_store.
-  SmallVector<OpFoldResult> sizes =
-      getMixedSizes(rewriter, loc, padOp.getSource());
-  SmallVector<OpFoldResult> strides(padOp.getResultType().getRank(),
-                                    rewriter.getIndexAttr(1));
-  Value subview = rewriter.create<memref::SubViewOp>(
-      loc, alloc, /*offsets=*/padOp.getMixedLowPad(), sizes, strides);
-  rewriter.create<memref::TensorStoreOp>(loc, padOp.getSource(), subview);
-
-  // Create bufferization.to_tensor with "restrict" and "writable". The returned
-  // tensor is a new buffer allocation, so it does not alias with any buffer.
-  Value toTensorOp = rewriter.create<bufferization::ToTensorOp>(
-      loc, alloc, /*restrict=*/true, /*writable=*/true);
-  rewriter.replaceOp(padOp, toTensorOp);
-  return toTensorOp;
-}
-
 /// Lower tensor.from_elements to a sequence of chained tensor.insert.
 FailureOr<Operation *> mlir::linalg::rewriteInDestinationPassingStyle(
     RewriterBase &rewriter, tensor::FromElementsOp fromElementsOp) {
@@ -329,41 +245,69 @@
   return insertSliceOp.getOperation();
 }
 
-Value linalg::bufferizeToAllocation(RewriterBase &rewriter, Value value,
-                                    Attribute memorySpace) {
-  // Call specialized overload for certain ops.
-  if (auto padOp = value.getDefiningOp<PadOp>())
-    return bufferizeToAllocation(rewriter, padOp, memorySpace);
+FailureOr<Value> linalg::bufferizeToAllocation(RewriterBase &rewriter,
+                                               Value value,
+                                               Attribute memorySpace) {
+  OpBuilder::InsertionGuard g(rewriter);
+  auto tensorType = value.getType().cast<RankedTensorType>();
+  SmallVector<Value> dynSizes;
 
-  // Collect all uses.
-  SmallVector<OpOperand *> uses = llvm::to_vector(
-      llvm::map_range(value.getUses(), [](OpOperand &use) { return &use; }));
+  // Collect uses of the value.
+  SmallVector<OpOperand *> uses;
+  for (OpOperand &use : value.getUses())
+    uses.push_back(&use);
+  auto replaceAllUsesWith = [&](Value replacement) {
+    for (OpOperand *use : uses) {
+      rewriter.updateRootInPlace(use->getOwner(),
+                                 [&]() { use->set(replacement); });
+    }
+  };
 
-  OpBuilder::InsertionGuard g(rewriter);
   if (auto bbArg = value.dyn_cast<BlockArgument>()) {
+    // BlockArgument: Create a new allocation that copies the value.
     rewriter.setInsertionPointToStart(bbArg.getOwner());
-  } else {
-    rewriter.setInsertionPointAfter(value.getDefiningOp());
+
+    // Create tensor.dim ops for all dynamic dimensions.
+    for (int64_t i = 0; i < tensorType.getRank(); ++i) {
+      if (tensorType.isDynamicDim(i))
+        dynSizes.push_back(rewriter.create<tensor::DimOp>(
+            value.getLoc(), value,
+            rewriter.create<arith::ConstantIndexOp>(value.getLoc(), i)));
+    }
+
+    auto allocTensorOp = rewriter.create<bufferization::AllocTensorOp>(
+        value.getLoc(), tensorType, dynSizes, /*sizeHint=*/Value(), memorySpace,
+        /*copy=*/bbArg);
+    replaceAllUsesWith(allocTensorOp.getResult());
+    return allocTensorOp.getResult();
   }
-  Location loc = value.getLoc();
-
-  // Create buffer allocation.
-  Value alloc = createAllocationForTensor(rewriter, loc, value, memorySpace);
-
-  // Create memref.tensor_store.
-  rewriter.setInsertionPointAfter(alloc.getDefiningOp());
-  rewriter.create<memref::TensorStoreOp>(loc, value, alloc);
-
-  // Create bufferization.to_tensor with "restrict" and "writable". The returned
-  // tensor is a new buffer allocation, so it does not alias with any buffer.
-  Value toTensorOp = rewriter.create<bufferization::ToTensorOp>(
-      loc, alloc, /*restrict=*/true, /*writable=*/true);
-  for (OpOperand *use : uses) {
-    rewriter.updateRootInPlace(use->getOwner(),
-                               [&]() { use->set(toTensorOp); });
+
+  // OpResult: Move the defining op into the region of the alloc_tensor.
+  Operation *op = value.getDefiningOp();
+  // TODO: Ops with more than one result are not supported.
+  if (op->getNumResults() != 1)
+    return failure();
+  rewriter.setInsertionPointAfter(op);
+  if (!tensorType.hasStaticShape()) {
+    // Try to reify dynamic sizes.
+    if (auto reifiableOp = cast<ReifyRankedShapedTypeOpInterface>(op)) {
+      ReifiedRankedShapedTypeDims reifiedShape;
+      if (failed(reifiableOp.reifyResultShapes(rewriter, reifiedShape)))
+        return failure();
+      for (int64_t i = 0; i < tensorType.getRank(); ++i) {
+        if (tensorType.isDynamicDim(i))
+          dynSizes.push_back(
+              reifiedShape[value.cast<OpResult>().getResultNumber()][i]);
+      }
+    }
   }
-  return toTensorOp;
+  auto allocTensorOp = rewriter.create<bufferization::AllocTensorOp>(
+      value.getLoc(), tensorType, dynSizes, memorySpace);
+  op->moveBefore(allocTensorOp.getTerminator());
+  replaceAllUsesWith(allocTensorOp.getResult());
+  allocTensorOp.getTerminator().getTensorMutable().assign(value);
+  return allocTensorOp.getResult();
 }
 
 namespace {
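The BlockArgument branch above produces a copying `alloc_tensor` rather than an `init` region, with dynamic sizes obtained from `tensor.dim`. Sketched in IR, mirroring the `materialization_of_bbarg` test below (the `tensor<?x10xindex>` type and `%idx` follow that test):

```
// %t is a block argument; all of its other uses are redirected to
// %alloc, which copies %t into a fresh allocation in memory space 4.
%c0 = arith.constant 0 : index
%dim = tensor.dim %t, %c0 : tensor<?x10xindex>
%alloc = bufferization.alloc_tensor(%dim) copy(%t)
    {memory_space = 4 : i64} : tensor<?x10xindex>
%r = tensor.extract %alloc[%idx, %idx] : tensor<?x10xindex>
```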
diff --git a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
--- a/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
+++ b/mlir/test/Dialect/Linalg/transform-op-bufferize-to-allocation.mlir
@@ -1,6 +1,6 @@
 // RUN: mlir-opt -split-input-file \
 // RUN:     -test-transform-dialect-interpreter -canonicalize \
-// RUN:     -allow-unregistered-dialect -split-input-file %s | FileCheck %s
+// RUN:     -split-input-file %s | FileCheck %s
 
 // CHECK: #[[$map:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 5)>
 // CHECK: #[[$map1:.+]] = affine_map<()[s0, s1] -> (s0 + s1 + 10)>
@@ -11,40 +11,11 @@
 // CHECK-DAG: %[[dim0:.*]] = tensor.dim %[[t]], %[[c0]]
 // CHECK-DAG: %[[size0:.*]] = affine.apply #[[$map]]()[%[[h1]], %[[dim0]]]
 // CHECK-DAG: %[[size1:.*]] = affine.apply #[[$map1]]()[%[[l2]], %[[h2]]]
-// CHECK: %[[alloc:.*]] = memref.alloc(%[[size0]], %[[size1]]) : memref<?x?xindex>
-// CHECK: linalg.fill ins(%[[c50]] : index) outs(%[[alloc]] : memref<?x?xindex>)
-// CHECK: %[[dim0:.*]] = tensor.dim %[[t]], %[[c0]]
-// CHECK: %[[subview:.*]] = memref.subview %[[alloc]][5, %[[l2]]] [%[[dim0]], 10] [1, 1]
-// CHECK: memref.tensor_store %[[t]], %[[subview]]
-// CHECK: %[[r:.*]] = bufferization.to_tensor %[[alloc]] restrict writable : memref<?x?xindex>
-// CHECK: memref.dealloc %[[alloc]]
-// CHECK: return %[[r]]
-func.func @tensor_pad_constant(%t: tensor<?x10xindex>, %l2: index, %h1: index,
-                               %h2: index) -> tensor<?x?xindex> {
-  %0 = tensor.pad %t low[5, %l2] high[%h1, %h2] {
-  ^bb0(%arg0: index, %arg1: index):
-    %c = arith.constant 50 : index
-    tensor.yield %c : index
-  } : tensor<?x10xindex> to tensor<?x?xindex>
-  return %0 : tensor<?x?xindex>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
-  %2 = transform.structured.bufferize_to_allocation %1
-}
-
-// -----
-
-// CHECK-LABEL: func @tensor_pad_constant(
-// CHECK-SAME:     %[[t:.*]]: tensor<?x10xindex>
-// CHECK:   %[[src:.*]] = bufferization.to_memref %[[t]]
-// CHECK:   %[[alloc:.*]] = memref.alloc
-// CHECK:   %[[subview:.*]] = memref.subview %[[alloc]]
-// CHECK:   memref.copy %[[src]], %[[subview]]
-// CHECK:   bufferization.to_tensor %[[alloc]] restrict writable
+// CHECK: %[[alloc:.*]] = bufferization.alloc_tensor(%[[size0]], %[[size1]]) init {
+// CHECK:   %[[padded:.*]] = tensor.pad %[[t]]
+// CHECK:   bufferization.yield %[[padded]]
+// CHECK: } : tensor<?x?xindex>
+// CHECK: return %[[alloc]]
 func.func @tensor_pad_constant(%t: tensor<?x10xindex>, %l2: index, %h1: index,
                                %h2: index) -> tensor<?x?xindex> {
   %0 = tensor.pad %t low[5, %l2] high[%h1, %h2] {
@@ -60,8 +31,6 @@
   %0 = transform.structured.match ops{["tensor.pad"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
   %2 = transform.structured.bufferize_to_allocation %1
-  // Make sure that One-Shot Bufferize can bufferize the rest.
-  transform.bufferization.one_shot_bufferize %arg1
 }
 
 // -----
@@ -70,32 +39,8 @@
 // CHECK-LABEL: func @materialization_of_bbarg(
 // CHECK-SAME:     %[[t:.*]]: tensor<?x10xindex>
 // CHECK: %[[c0:.*]] = arith.constant 0 : index
 // CHECK: %[[dim:.*]] = tensor.dim %[[t]], %[[c0]]
-// CHECK: %[[alloc:.*]] = memref.alloc(%[[dim]]) : memref<?x10xindex>
-// CHECK: memref.tensor_store %[[t]], %[[alloc]]
-// CHECK: %[[alloc_t:.*]] = bufferization.to_tensor %[[alloc]] restrict writable
-// CHECK: %[[r:.*]] = tensor.extract %[[alloc_t]]
-// CHECK: memref.dealloc %[[alloc]]
-// CHECK: return %[[r]]
-func.func @materialization_of_bbarg(%t: tensor<?x10xindex>, %idx: index) -> index {
-  %r = tensor.extract %t[%idx, %idx] : tensor<?x10xindex>
-  return %r : index
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["tensor.extract"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1 = test_produce_value_handle_to_argument_of_parent_block %0, 0 : (!pdl.operation) -> !transform.any_value
-  %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
-}
-
-// -----
-
-// CHECK-LABEL: func @materialization_of_bbarg(
-// CHECK-SAME:     %[[t:.*]]: tensor<?x10xindex>
-// CHECK:   %[[m:.*]] = bufferization.to_memref %[[t]]
-// CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}}) : memref<?x10xindex, 4>
-// CHECK:   memref.copy %[[m]], %[[alloc]]
-// CHECK:   %[[r:.*]] = memref.load %[[alloc]]
+// CHECK: %[[alloc:.*]] = bufferization.alloc_tensor(%[[dim]]) copy(%[[t]]) {memory_space = 4 : i64} : tensor<?x10xindex>
+// CHECK: %[[r:.*]] = tensor.extract %[[alloc]]
 // CHECK: return %[[r]]
 func.func @materialization_of_bbarg(%t: tensor<?x10xindex>, %idx: index) -> index {
   %r = tensor.extract %t[%idx, %idx] : tensor<?x10xindex>
@@ -107,28 +52,4 @@
   %0 = transform.structured.match ops{["tensor.extract"]} in %arg1 : (!pdl.operation) -> !pdl.operation
   %1 = test_produce_value_handle_to_argument_of_parent_block %0, 0 : (!pdl.operation) -> !transform.any_value
   %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
-  // Make sure that One-Shot Bufferize can bufferize the rest.
-  transform.bufferization.one_shot_bufferize %arg1
 }
-
-// -----
-
-// CHECK-LABEL: func @materialization_of_opresult(
-// CHECK:   %[[t:.*]] = "dummy.some_op"
-// CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}}) : memref<?xindex, 4>
-// CHECK:   memref.tensor_store %[[t]], %[[alloc]]
-// CHECK:   %[[r:.*]] = bufferization.to_tensor %[[alloc]]
-// CHECK:   return %[[r]]
-func.func @materialization_of_opresult(%idx: index) -> tensor<?xindex> {
-  %t = "dummy.some_op"() : () -> (tensor<?xindex>)
-  return %t : tensor<?xindex>
-}
-
-transform.sequence failures(propagate) {
-^bb1(%arg1: !pdl.operation):
-  %0 = transform.structured.match ops{["dummy.some_op"]} in %arg1 : (!pdl.operation) -> !pdl.operation
-  %1 = transform.get_result %0[0] : (!pdl.operation) -> !transform.any_value
-  %2 = transform.structured.bufferize_to_allocation %1 {memory_space = 4}
-}
-
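As a closing note on the dynamic-size path: in the surviving `tensor_pad_constant` test, the sizes feeding the allocation come from reifying the pad's result shape (per dimension: source extent plus low and high padding), which matches the affine maps checked at the top of the test file. A sketch of the reified computation, with `%t`, `%l2`, `%h1`, `%h2` as in that test:

```
%c0 = arith.constant 0 : index
%dim0 = tensor.dim %t, %c0 : tensor<?x10xindex>
// dim 0: dynamic extent %dim0, low pad 5, high pad %h1.
%size0 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 5)>()[%h1, %dim0]
// dim 1: static extent 10, low pad %l2, high pad %h2.
%size1 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 + 10)>()[%l2, %h2]
// %size0 and %size1 then feed bufferization.alloc_tensor as shown above.
```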