diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp --- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp @@ -115,6 +115,7 @@ #include "mlir/Dialect/Utils/StaticValueUtils.h" #include "mlir/Dialect/Vector/VectorOps.h" #include "mlir/IR/AsmState.h" +#include "mlir/IR/Matchers.h" #include "mlir/IR/Operation.h" #include "mlir/Pass/Pass.h" #include "mlir/Pass/PassManager.h" @@ -139,6 +140,9 @@ #define DBGS() (llvm::dbgs() << '[' << DEBUG_TYPE << "] ") #define LDBG(X) LLVM_DEBUG(DBGS() << X) +// TODO: from some HW description. +static constexpr int64_t kBufferAlignments = 128; + // Forward declarations. static std::string printOperationInfo(Operation *, bool prefix = true); static std::string printValueInfo(Value, bool prefix = true); @@ -1404,6 +1408,21 @@ // Bufferization-specific scoped alloc/dealloc insertion support. //===----------------------------------------------------------------------===// +template +Operation *getFirstParentOfType(Value v) { + Operation *parent; + if (auto bbArg = v.dyn_cast()) + parent = bbArg.getOwner()->getParentOp(); + else + parent = v.getDefiningOp()->getParentOp(); + while (parent) { + if (isa(parent)) + return parent; + parent = parent->getParentOp(); + } + return nullptr; +} + /// Create an Allocop/DeAllocOp pair, where the AllocOp is after /// `shapedValue.getDefiningOp` (or at the top of the block in case of a /// bbArg) and the DeallocOp is at the end of the block. @@ -1438,8 +1457,27 @@ if (dim.value() == ShapedType::kDynamicSize) dynShape.push_back(createOrFoldDimOp(b, loc, shapedValue, dim.index())); - Value allocated = b.create(loc, allocMemRefType, dynShape); - aliasInfo.createAliasInfoEntry(allocated); + // If the buffer is statically shaped, try to hoist it to the first enclosing + // parallel region. 
+ // TODO: this concept of parallel region and threadlocal needs interfaces. + // TODO: also hoist in the dynamic case. For now this relies on subsequent + // calls to LICM and buffer hoisting which will most likely not succeed. + // TODO: when packing, allocate a static bounding box which will enable more + // hoisting. + Value allocated; + { // Guarded insertion point to potentially hoist the AllocOp. + OpBuilder::InsertionGuard g(b); + if (dynShape.empty()) { + Operation *parent = + getFirstParentOfType(shapedValue); + if (parent) + b.setInsertionPointToStart(&(parent->getRegion(0).front())); + } + allocated = b.create( + loc, allocMemRefType, dynShape, b.getI64IntegerAttr(kBufferAlignments)); + aliasInfo.createAliasInfoEntry(allocated); + } Value casted = allocated; if (memRefType != allocMemRefType) { casted = b.create(loc, memRefType, allocated); @@ -1468,6 +1506,7 @@ BufferizationAliasInfo &aliasInfo) { // Take a guard before anything else. OpBuilder::InsertionGuard g(b); + b.setInsertionPointAfter(op); // TODO: provide the proper interface to iterate on OpResults and get the // matching OpOperands. @@ -1490,7 +1529,6 @@ Value dimTensor = bvm.lookupOrDefault(output); Value alloc = createNewAllocDeallocPairForShapedValue(b, loc, dimTensor, aliasInfo); - b.setInsertionPointAfter(alloc.getDefiningOp()); resultBuffers.push_back(alloc); // Additionally, if the output buffer is used, clone its value for now. @@ -1777,11 +1815,18 @@ if (getInPlace(opResult) != InPlaceSpec::True) { resultBuffer = createNewAllocDeallocPairForShapedValue(b, loc, operand, aliasInfo); - // If the tensor comes from `linalg::InitTensorOp`, the value is - // unitialized and we do not need to copy. + // If the tensor comes from either: + // - linalg.init_tensor + // - tensor.cast(linalg.init_tensor()) + // Then the value is uninitialized and we do not need to copy. This is a + // pragmatic simplification of "matching bbArg does not bufferize to a + // read". 
// TODO: "matching bbArg does not bufferize to a read" is a more general // check. - if (!operand.getDefiningOp()) + auto p1 = m_Op(); + auto p2 = m_Op(m_Op()); + Operation *op = operand.getDefiningOp(); + if (!op || (!p1.match(op) && !p2.match(op))) b.create(forOp.getLoc(), operandBuffer, resultBuffer); } BlockArgument bbArg = forOp.getRegionIterArgForOpOperand(opOperand); @@ -1862,6 +1907,10 @@ static LogicalResult bufferize(OpBuilder &b, TiledLoopOp tiledLoopOp, BlockAndValueMapping &bvm, BufferizationAliasInfo &aliasInfo) { + // Take a guard before anything else. + OpBuilder::InsertionGuard g(b); + b.setInsertionPoint(tiledLoopOp); + // Allocate output buffers if needed, forward output tensor args to the // terminator. Operation *yieldOp = tiledLoopOp.getBody()->getTerminator(); @@ -1904,14 +1953,19 @@ auto loc = tiledLoopOp.getLoc(); Value alloc = createNewAllocDeallocPairForShapedValue( b, loc, oldOutputTensor, aliasInfo); - // If the tensor comes from `linalg::InitTensorOp`, the value is - // unitialized and we do not need to copy. + // If the tensor comes from either: + // - linalg.init_tensor + // - tensor.cast(linalg.init_tensor()) + // Then the value is uninitialized and we do not need to copy. This is a + // pragmatic simplification of "matching bbArg does not bufferize to a + // read". // TODO: "matching bbArg does not bufferize to a read" is a more general // check. - if (!oldOutputTensor.getDefiningOp()) { - b.setInsertionPointAfter(alloc.getDefiningOp()); + auto p1 = m_Op(); + auto p2 = m_Op(m_Op()); + Operation *op = oldOutputTensor.getDefiningOp(); + if (!op || (!p1.match(op) && !p2.match(op))) b.create(loc, outputBuffer, alloc); - } outputBuffer = alloc; } // Insert mapping and aliasing info. @@ -2013,11 +2067,9 @@ // If not inplaceable, alloc. 
Value alloc; auto inPlace = getInPlace(extractSliceOp->getResult(0)); - if (inPlace != InPlaceSpec::True) { + if (inPlace != InPlaceSpec::True) alloc = createNewAllocDeallocPairForShapedValue( b, loc, extractSliceOp.result(), aliasInfo); - b.setInsertionPointAfter(alloc.getDefiningOp()); - } // Bufferize to subview. auto subviewMemRefType = @@ -2062,9 +2114,10 @@ // cloning the whole tensor on every single iteration and is a symptom // of a catastrophically bad scheduling decision. // TODO: be very loud about it or even consider failing the pass. + // Alloc a copy for `insertSliceOp.dest()`, it will become the result + // buffer. Value newDstMemref = createNewAllocDeallocPairForShapedValue( - b, loc, insertSliceOp.result(), aliasInfo); - b.setInsertionPointAfter(newDstMemref.getDefiningOp()); + b, loc, insertSliceOp.dest(), aliasInfo); b.create(insertSliceOp.getLoc(), dstMemref, newDstMemref); dstMemref = newDstMemref; } @@ -2130,10 +2183,11 @@ // If transfer_write is not inPlace, allocate a new buffer. Value newInputBuffer; if (inPlace != InPlaceSpec::True) { + // Alloc a copy for `writeOp.source()`, it will become the result buffer. newInputBuffer = createNewAllocDeallocPairForShapedValue( - b, loc, writeOp.result(), aliasInfo); - b.setInsertionPointAfter(newInputBuffer.getDefiningOp()); - map(bvm, writeOp.result(), newInputBuffer); + b, loc, writeOp.source(), aliasInfo); + Value v = lookup(bvm, writeOp.source()); + b.create(loc, v, newInputBuffer); } else { // InPlace write will result in memref.tensor_load(x) which must // canonicalize away with one of it uses. 
diff --git a/mlir/test/Dialect/Linalg/comprehensive-foo.mlir b/mlir/test/Dialect/Linalg/comprehensive-foo.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/comprehensive-foo.mlir @@ -0,0 +1,67 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -debug -verify-each=1 + +func @matmul( + %A: tensor<128x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, + %B: tensor<256x192xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, + %C: tensor<128x192xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) + -> tensor<128x192xf32> { + %c0 = constant 0 : index + %c256 = constant 256 : index + %c32 = constant 32 : index + %cst = constant 0.000000e+00 : f32 + %c128 = constant 128 : index + %c192 = constant 192 : index + %c8 = constant 8 : index + %c16 = constant 16 : index + + // CHECK: scf.for %[[I:.*]] = + %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) { + %1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] : + tensor<128x256xf32> to tensor<8x256xf32> + + // CHECK: scf.for %[[J:.*]] = + %2 = scf.for %arg5 = %c0 to %c192 step %c16 iter_args(%arg6 = %arg4) -> (tensor<128x192xf32>) { + %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] : + tensor<256x192xf32> to tensor<256x16xf32> + + // %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc. + // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8x16xf32> + // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1] + // TODO: %4 is never read but just overwritten, this copy can be elided. + // CHECK: linalg.copy(%[[T]], %[[ALLOC]]) + %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] : + tensor<128x192xf32> to tensor<8x16xf32> + + // linalg.fill is inplace. 
+ // CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]) : f32, memref<8x16xf32> + %5 = linalg.fill(%cst, %4) : f32, tensor<8x16xf32> -> tensor<8x16xf32> + + // CHECK: scf.for %[[K:.*]] = + %6 = scf.for %arg7 = %c0 to %c256 step %c32 iter_args(%arg8 = %5) -> (tensor<8x16xf32>) { + %8 = tensor.extract_slice %1[0, %arg7] [8, 32] [1, 1] : + tensor<8x256xf32> to tensor<8x32xf32> + %9 = tensor.extract_slice %3[%arg7, 0] [32, 16] [1, 1] : + tensor<256x16xf32> to tensor<32x16xf32> + + // linalg.matmul is inplace as well as the enclosing scf.for. + // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]] + %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>) + outs(%arg8 : tensor<8x16xf32>) + -> tensor<8x16xf32> + scf.yield %10 : tensor<8x16xf32> + } + + // insert_slice is inplace but its source comes from an equivalent buffer + // that is not in place. So we must insert a copy of the small buffer into + // the bigger buffer. + // CHECK: linalg.copy(%[[ALLOC]], %[[T]]) + %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] : + tensor<8x16xf32> into tensor<128x192xf32> + + // CHECK: memref.dealloc %[[ALLOC]] + scf.yield %7 : tensor<128x192xf32> + } + scf.yield %2 : tensor<128x192xf32> + } + return %0 : tensor<128x192xf32> +} diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir @@ -1,704 +1,75 @@ -// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -//===----------------------------------------------------------------------===// -// Simple cases -//===----------------------------------------------------------------------===// +#map0 = affine_map<(d0) -> (64, -d0 + 518)> +#map1 = affine_map<(d0) 
-> (d0 ceildiv 64)> +#map2 = affine_map<(d0, d1) -> (d1, d0)> +#map3 = affine_map<(d0, d1, d2) -> (d0, d2)> +#map4 = affine_map<(d0, d1, d2) -> (d1, d2)> +#map5 = affine_map<(d0, d1, d2) -> (d0, d1)> -// ----- - -// CHECK-LABEL: func @extract_slice_fun -func @extract_slice_fun(%A : tensor, %B : tensor {linalg.inplaceable = true}) - -> (tensor<4xf32>, tensor<8xf32>) -{ - // tensor.extract_slice is not used in a write, it is not compelled to - // bufferize out of place. Let callers decide whether they want to create - // aliasing subviews at all call sites or whether they allocate. - // This is true irrespective of whether the function argument is inplaceable. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.extract_slice %B[0][8][1] : tensor to tensor<8xf32> - - return %r0, %r1: tensor<4xf32>, tensor<8xf32> -} - -// ----- - -// CHECK-LABEL: func @insert_slice_fun -func @insert_slice_fun( - %A : tensor, - %B : tensor {linalg.inplaceable = true}, - %C : tensor<4xf32>) - -> (tensor, tensor) -{ - // must bufferize out of place. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor - - // bufferizes inplace. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor - - return %r0, %r1: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @conflict_on_B -func @conflict_on_B( - %A : tensor<4x4xf32> {linalg.inplaceable = true}, - %B : tensor<4x4xf32> {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>) -{ - // matmul output operand interferes with input operand. 
- // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %C = linalg.matmul ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // matmul output operand interferes with input operand. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %D = linalg.matmul ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // matmul output operand does not interferes with input operand. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32> -} - -//===----------------------------------------------------------------------===// -// Length-1 producer-consumer cases. -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @extract_slice_extract_slice -func @extract_slice_extract_slice( - %A : tensor {linalg.inplaceable = true}, %B : tensor) - -> (tensor<2xf32>, tensor<2xf32>) -{ - // tensor.extract_slice is not used in a write, it is not compelled to - // bufferize out of place. Let callers decide whether they want to create - // aliasing subviews at all call sites or whether they allocate. - // This is true irrespective of whether the function argument is inplaceable. 
- // CHECK: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r2 = tensor.extract_slice %B[0][4][1] : tensor to tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32> - - return %r1, %r3: tensor<2xf32>, tensor<2xf32> -} - -// ----- - -// CHECK-LABEL: func @insert_slice_insert_slice -func @insert_slice_insert_slice( - %A : tensor {linalg.inplaceable = true}, - %A2 : tensor<4xf32> {linalg.inplaceable = true}, - %A3 : tensor<2xf32> {linalg.inplaceable = true}, - %B : tensor, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>) - -> (tensor, tensor) -{ - // CHECK: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor - - // CHECK: {__inplace_results_attr__ = ["false"]} - %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32> - - // CHECK: {__inplace_results_attr__ = ["false"]} - %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor - - return %r1, %r3: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice -func @extract_slice_nonmatching_insert_slice( - %A : tensor {linalg.inplaceable = true}, - %B : tensor, %idx: index) - -> (tensor, tensor) -{ - // %r1 bufferizes inplace because %A is inplaceable. - // %r0 is an overlapping tensor.extract_slice that does not match, it must be - // out of place. 
- // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // %r1 can bufferize inplace fine. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor - - // %r3 does bufferizes inplace because %B is not inplaceable. - // %r0 is an overlapping tensor.extract_slice that does not match, but does - // not alias with the buffer coming from %r3 so it can actually bufferize - // inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r2 = tensor.extract_slice %B[0][4][1] : tensor to tensor<4xf32> - - // %r3 cannot bufferize inplace since %B is not inplaceable. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor - - return %r1, %r3: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @extract_slice_matching_insert_slice -func @extract_slice_matching_insert_slice( - %A : tensor {linalg.inplaceable = true}, - %B : tensor) - -> (tensor, tensor) -{ - // %r1 bufferizes inplace because %A is inplaceable. - // %r0 is a tensor.extract_slice that matches, it can also be bufferized - // inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r0 = tensor.extract_slice %A[0][4][1] : tensor to tensor<4xf32> - - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor - - // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized - // inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %r2 = tensor.extract_slice %B[0][4][1] : tensor to tensor<4xf32> - - // tensor.insert_slice cannot bufferize inplace. 
- // This should have been captured by a canonicalization pattern and it would - // be unproductive to have special logic in bufferization to encode matching - // insert_slice(extract_slice(A), A). - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor - - return %r1, %r3: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @extract_slice_linalg_readonly_use -func @extract_slice_linalg_readonly_use( - %A : tensor, - %B : tensor<4x4xf32>, - %C : tensor<4x4xf32> {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>) -{ - // tensor.extract_slice is only used as a read, no interference irrespective - // of user's inplace status. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // matmul output operand is not inplaceable at the function boundary. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %D = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%B: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // matmul output operand is inplaceable at the function boundary. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%C: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> -} - -// ----- - -// CHECK-LABEL: func @extract_slice_to_linalg_write_use -func @extract_slice_to_linalg_write_use( - %A : tensor<4x4xf32>, - %B : tensor, - %C : tensor {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>) -{ - // Step 4. %sB forward propagates to a write in %D but it is not inplace. - // So this is only ever read and can bufferize inplace. 
- // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 3. %sB has a read interference in %E, it does not bufferize inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %D = linalg.matmul ins(%B, %C: tensor, tensor) - outs(%sB: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // Step 2. %sC forward propagates to an inplace write in %E. - // %sC backward propagates to %C which is inplaceable. - // As a consequence this is bufferized inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 1. %sC backprops to the tensor.extract_slice producer which is not - // considered an interference. This bufferizes inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%sC: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> -} - -//===----------------------------------------------------------------------===// -// Transitive cases -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @extract_slice_to_linalg_write_use -func @extract_slice_to_linalg_write_use( - %A : tensor<4x4xf32>, - %B : tensor, - %C : tensor {linalg.inplaceable = true}) - -> (tensor<4x4xf32>, tensor<4x4xf32>) -{ - // Step 4. %sB forward propagates to an inplace write in %D. - // %sB backward propagates to %B which is not inplaceable. - // As a consequence this is bufferized out of place. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 3. 
%sB backprops to the tensor.extract_slice producer which is not - // considered an interference. This bufferizes inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %D = linalg.matmul ins(%B, %C: tensor, tensor) - outs(%sB: tensor<4x4xf32>) - -> tensor<4x4xf32> - - // Step 2. %sC forward propagates to an inplace write in %E. - // %sC backward propagates to %C which is inplaceable. - // As a consequence this is bufferized inplace. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - - // Step 1. %sC backprops to the tensor.extract_slice producer which is not - // considered an interference. This bufferizes inplace. - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %E = linalg.matmul ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>) - outs(%sC: tensor<4x4xf32>) - -> tensor<4x4xf32> - - return %D, %E: tensor<4x4xf32>, tensor<4x4xf32> -} - -// ----- - -// CHECK-LABEL: func @nested_extract_slice_and_insert -func @nested_extract_slice_and_insert( - %A : tensor, - %B : tensor {linalg.inplaceable = true}, - %C : tensor {linalg.inplaceable = true}, - %idx : index) - -> (tensor, tensor, tensor) -{ - %f0 = constant 0.0 : f32 - - // 2-level matching tensor.extract_slice / tensor.insert_slice into non - // inplaceable %A. - // - %rA is not inplaceable because %A is not inplaceable at function boundary. - // - once %rA is deemed not inplaceable, nothing prevent %rsA to be inplaceable - // - this propagates to %FA and %ssA being inplaceable. - // - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not - // inplaceable and so %sA is not inplaceable. 
- // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor to tensor - %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32> - %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor - %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor into tensor - - // 3-level matching tensor.extract_slice / tensor.insert_slice into - // inplaceable %B. - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.extract_slice - // Atm, this 2nd tensor.extract_slice fails to bufferize inplace because - // clobbering analysis conservatively test for equivalent buffers. - // TODO: This is currently too restrictive and misses clobberings. - // When available, use container-containee analysis. 
- // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor to tensor - %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor to tensor<4x?xf32> - %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32> - %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32> - %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32> - %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor - %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor into tensor - - // 2-level matching tensor.extract_slice / tensor.insert_slice into - // inplaceable %C with a twist. - // Throw a wrench in the system: %rsC production sizes do not match %ssC. - // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // The tensor.insert_slice that would be candidate for matching does not actually - // match. That tensor.insert_slice can still be bufferized inplace nonetheless - // but this tensor.extract_slice, which bufferizes to an inplace write, cannot. 
- // CHECK-NEXT: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK-NEXT: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor to tensor - %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor to tensor<4x4xf32> - %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32> - %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor - %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor into tensor - - return %rA, %rB, %rC: tensor, tensor, tensor -} - -//===----------------------------------------------------------------------===// -// Simple loop cases -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @scf_for_yield_only -func @scf_for_yield_only(%A : tensor, - %B : tensor {linalg.inplaceable = true}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK: scf.for - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false"]} - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - scf.yield %t : tensor - } - - // CHECK: scf.for - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["true"]} - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor) { - scf.yield %t : tensor - } - - return %r0, %r1: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @scf_for_with_tensor.insert_slice -func @scf_for_with_tensor.insert_slice(%A : tensor, - %B : tensor {linalg.inplaceable = true}, - %C : tensor<4xf32>, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK: scf.for - // scf.for bbArgs are always inplaceable seen from ops 
inside the body: - // 1. Either the matching tensor is not inplaceable and an alloc occurs - // which makes bbArg inplaceable. - // 2. Or it is already inplaceable and so is bbArg. - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]} - %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) - -> (tensor, tensor) - { - %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor - %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor - scf.yield %ttA, %ttB : tensor, tensor - } - - return %r0#0, %r0#1: tensor, tensor -} - -// ----- - -func private @some_use(tensor) -> () - -// CHECK-LABEL: func @scf_for_deps -func @scf_for_deps(%A : tensor {linalg.inplaceable = true}, - %B : tensor {linalg.inplaceable = true}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // %r0 must be out of place because one use of %t in the subsequent production - // of %r1 is read. - // CHECK: scf.for - // CHECK-NEXT: call - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false"]} - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - call @some_use(%t) : (tensor) -> () - scf.yield %t : tensor - } - - // %r1 bufferizes inplace fine. - // CHECK: scf.for - // CHECK-NEXT: call - // CHECK-NEXT: scf.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["true"]} - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - call @some_use(%t) : (tensor) -> () - scf.yield %t : tensor - } - - // %r2 must be out of place because one use of %t in the subsequent production - // of %r3 is read. 
- // CHECK: linalg.tiled_loop - // CHECK-NEXT: call - // CHECK-NEXT: linalg.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["false"]} - %r2 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step) - ins() - outs(%t = %B: tensor) { - call @some_use(%t) : (tensor) -> () - linalg.yield %t : tensor - } - - // %r3 bufferizes inplace fine. - // CHECK: linalg.tiled_loop - // CHECK-NEXT: call - // CHECK-NEXT: linalg.yield - // CHECK-NEXT: {__inplace_results_attr__ = ["true"]} - %r3 = linalg.tiled_loop (%i) = (%lb) to (%ub) step (%step) - ins() - outs(%t = %B: tensor) { - call @some_use(%t) : (tensor) -> () - linalg.yield %t : tensor - } - - return %r1, %r3: tensor, tensor -} - -// ----- - -//===----------------------------------------------------------------------===// -// Cross function boundary cases. -//===----------------------------------------------------------------------===// - -func private @foo(tensor<64xf32>) - -// CHECK-LABEL: dependence_through_call -func @dependence_through_call(%I : tensor<64xf32> {linalg.inplaceable = true}) { - %f1 = constant 1.000000e+00 : f32 - %f2 = constant 2.000000e+00 : f32 - - // 2. %B already bufferizes inplace, %A would alias and have a different - // value. The calls to `foo` are determined to read conservatively, so %A - // cannot bufferize inplace. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %A = linalg.fill(%f1, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - // 1. Bufferizes inplace: no alias to %A is yet possible. 
- // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %B = linalg.fill(%f2, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - call @foo(%A) : (tensor<64xf32>) -> () - call @foo(%B) : (tensor<64xf32>) -> () - - return -} - -// ----- - -func private @foo(tensor<64xf32>) - -func private @bar(%A : tensor<64xf32>) { - call @foo(%A) : (tensor<64xf32>) -> () - return -} - -func @read_dependence_through_scf_and_call( - %I : tensor<64xf32> {linalg.inplaceable = true}, - %I2 : tensor<64xf32> {linalg.inplaceable = true}) { +func @matmul_on_tensors(%arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, %arg2: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) -> tensor<518x518xf32> attributes {passthrough = [["target-cpu", "skylake-avx512"], ["prefer-vector-width", "512"]]} { %c0 = constant 0 : index - %c1 = constant 1 : index - %c10 = constant 10 : index - %f1 = constant 1.000000e+00 : f32 - %f2 = constant 2.000000e+00 : f32 - - // 5. %B bufferizes inplace, %A would alias and have a different value. - // The calls to `foo` are determined to read conservatively, so %A cannot - // bufferize inplace. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %A = linalg.fill(%f1, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - // 4. Bufferizes inplace: no alias to %A is yet possible. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %B = linalg.fill(%f2, %I) : f32, tensor<64xf32> -> tensor<64xf32> - - // 3. Does not read or write, bufferizes inplace. 
- // CHECK: scf.for - // CHECK: {__inplace_results_attr__ = ["true", "true"]} - %r:2 = scf.for %i = %c0 to %c10 step %c1 iter_args(%0 = %A, %1 = %B) - -> (tensor<64xf32>, tensor<64xf32>) - { - scf.yield %0, %1 : tensor<64xf32>, tensor<64xf32> + %c518 = constant 518 : index + %c64 = constant 64 : index + %cst = constant 0.000000e+00 : f32 + %c16 = constant 16 : index + %0 = linalg.fill(%cst, %arg2) : f32, tensor<518x518xf32> -> tensor<518x518xf32> + %1 = linalg.init_tensor [9, 64, 64] : tensor<9x64x64xf32> + %2 = tensor.cast %1 : tensor<9x64x64xf32> to tensor + %3 = scf.for %arg3 = %c0 to %c518 step %c64 iter_args(%arg4 = %0) -> (tensor<518x518xf32>) { + %4 = affine.min #map0(%arg3) + %5 = scf.for %arg5 = %c0 to %c518 step %c64 iter_args(%arg6 = %arg4) -> (tensor<518x518xf32>) { + %6 = affine.min #map0(%arg5) + %7 = scf.for %arg7 = %c0 to %c518 step %c64 iter_args(%arg8 = %2) -> (tensor) { + %10 = affine.apply #map1(%arg7) + %11 = affine.min #map0(%arg7) + %12 = tensor.extract_slice %arg1[%arg7, %arg5] [%11, %6] [1, 1] : tensor<518x518xf32> to tensor + %13 = vector.transfer_read %12[%c0, %c0], %cst : tensor, vector<64x64xf32> + %14 = vector.transfer_write %13, %arg8[%10, %c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor + scf.yield %14 : tensor + } + %8 = scf.for %arg7 = %c0 to %c518 step %c64 iter_args(%arg8 = %2) -> (tensor) { + %10 = affine.apply #map1(%arg7) + %11 = affine.min #map0(%arg7) + %12 = tensor.extract_slice %arg0[%arg3, %arg7] [%4, %11] [1, 1] : tensor<518x518xf32> to tensor + %13 = vector.transfer_read %12[%c0, %c0], %cst : tensor, vector<64x64xf32> + %14 = vector.transfer_write %13, %arg8[%10, %c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor + scf.yield %14 : tensor + } + %9 = scf.for %arg7 = %c0 to %c518 step %c64 iter_args(%arg8 = %arg6) -> (tensor<518x518xf32>) { + %10 = tensor.extract_slice %arg8[%arg3, %arg5] [%4, %6] [1, 1] : tensor<518x518xf32> to tensor + %11 = affine.apply #map1(%arg7) + %12 = 
tensor.extract_slice %8[%11, 0, 0] [1, 64, 64] [1, 1, 1] : tensor to tensor<64x64xf32> + %13 = tensor.extract_slice %7[%11, 0, 0] [1, 64, 64] [1, 1, 1] : tensor to tensor<64x64xf32> + %14 = linalg.init_tensor [64, 64] : tensor<64x64xf32> + %15 = linalg.fill(%cst, %14) : f32, tensor<64x64xf32> -> tensor<64x64xf32> + %16 = vector.transfer_read %10[%c0, %c0], %cst : tensor, vector<64x64xf32> + %17 = vector.transfer_write %16, %15[%c0, %c0] {in_bounds = [true, true]} : vector<64x64xf32>, tensor<64x64xf32> + %18 = scf.for %arg9 = %c0 to %c64 step %c16 iter_args(%arg10 = %17) -> (tensor<64x64xf32>) { + %21 = scf.for %arg11 = %c0 to %c64 step %c16 iter_args(%arg12 = %arg10) -> (tensor<64x64xf32>) { + %22 = tensor.extract_slice %arg12[%arg9, %arg11] [16, 16] [1, 1] : tensor<64x64xf32> to tensor<16x16xf32> + %23 = vector.transfer_read %22[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32> + %24 = scf.for %arg13 = %c0 to %c64 step %c16 iter_args(%arg14 = %23) -> (vector<16x16xf32>) { + %27 = tensor.extract_slice %12[%arg9, %arg13] [16, 16] [1, 1] : tensor<64x64xf32> to tensor<16x16xf32> + %28 = tensor.extract_slice %13[%arg13, %arg11] [16, 16] [1, 1] : tensor<64x64xf32> to tensor<16x16xf32> + %29 = vector.transfer_read %27[%c0, %c0], %cst {in_bounds = [true, true]} : tensor<16x16xf32>, vector<16x16xf32> + %30 = vector.transfer_read %28[%c0, %c0], %cst {in_bounds = [true, true], permutation_map = #map2} : tensor<16x16xf32>, vector<16x16xf32> + %31 = vector.contract {indexing_maps = [#map3, #map4, #map5], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind} %29, %30, %arg14 : vector<16x16xf32>, vector<16x16xf32> into vector<16x16xf32> + scf.yield %31 : vector<16x16xf32> + } + %25 = vector.transfer_write %24, %22[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, tensor<16x16xf32> + %26 = tensor.insert_slice %25 into %arg12[%arg9, %arg11] [16, 16] [1, 1] : tensor<16x16xf32> into tensor<64x64xf32> + scf.yield %26 : 
tensor<64x64xf32> + } + scf.yield %21 : tensor<64x64xf32> + } + %19 = tensor.extract_slice %18[0, 0] [%4, %6] [1, 1] : tensor<64x64xf32> to tensor + %20 = tensor.insert_slice %19 into %arg8[%arg3, %arg5] [%4, %6] [1, 1] : tensor into tensor<518x518xf32> + scf.yield %20 : tensor<518x518xf32> + } + scf.yield %9 : tensor<518x518xf32> + } + scf.yield %5 : tensor<518x518xf32> } - call @foo(%r#0) : (tensor<64xf32>) -> () - call @foo(%r#1) : (tensor<64xf32>) -> () - - // 2. %B2 already bufferizes inplace, %A2 would alias and have a different - // value. The calls to `foo` are determined to read conservatively, so %A2 - // cannot bufferize inplace. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %A2 = linalg.fill(%f1, %I2) : f32, tensor<64xf32> -> tensor<64xf32> - - // 1. Bufferizes inplace: no alias to %A2 is yet possible. - // CHECK: fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %B2 = linalg.fill(%f2, %I2) : f32, tensor<64xf32> -> tensor<64xf32> - - call @bar(%A2) : (tensor<64xf32>) -> () - call @bar(%B2) : (tensor<64xf32>) -> () - return -} - -//===----------------------------------------------------------------------===// -// Transitive cases through extract_slice. 
-//===----------------------------------------------------------------------===// - -builtin.func @matmul_on_tensors( - %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = true}) - -> tensor<256x256xf32> -{ - %c0 = constant 0 : index - %cst_0 = constant 0.000000e+00 : f32 - %cst_1 = constant 1.000000e+00 : f32 - - %7 = linalg.init_tensor [256, 256] : tensor<256x256xf32> - - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["false"]} - %8 = linalg.fill(%cst_0, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - %11 = linalg.fill(%cst_1, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sA = tensor.extract_slice %8[0, 0][256, 16][1, 1]: tensor<256x256xf32> to tensor<256x16xf32> - %sB = tensor.extract_slice %11[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32> - %r = linalg.matmul - ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>) - outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> - - return %r : tensor<256x256xf32> -} - -// ----- - -builtin.func @matmul_on_tensors( - %arg0: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg1: tensor<518x518xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = false}, - %arg2: tensor<256x256xf32> {linalg.buffer_layout = affine_map<(d0, d1) -> (d0, d1)>, linalg.inplaceable = 
true}) - -> tensor<256x256xf32> -{ - %c0 = constant 0 : index - %cst_0 = constant 0.000000e+00 : f32 - %cst_1 = constant 1.000000e+00 : f32 - - %7 = linalg.init_tensor [256, 256] : tensor<256x256xf32> - - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_results_attr__ = ["false"] - %8 = linalg.fill(%cst_0, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - %9 = vector.transfer_read %arg0[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32> - %10 = vector.transfer_write %9, %8[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32> - - // CHECK: linalg.fill - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_results_attr__ = ["false"] - %11 = linalg.fill(%cst_1, %7) : f32, tensor<256x256xf32> -> tensor<256x256xf32> - %12 = vector.transfer_read %arg1[%c0, %c0], %cst_0 {in_bounds = [false, true]} : tensor<518x518xf32>, vector<256x256xf32> - %13 = vector.transfer_write %12, %11[%c0, %c0] {in_bounds = [true, true]} : vector<256x256xf32>, tensor<256x256xf32> - - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - // CHECK: linalg.matmul - // CHECK-SAME: {__inplace_results_attr__ = ["true"]} - %sA = tensor.extract_slice %10[0, 0][256, 16][1, 1]: tensor<256x256xf32> to tensor<256x16xf32> - %sB = tensor.extract_slice %13[0, 0][16, 256][1, 1]: tensor<256x256xf32> to tensor<16x256xf32> - %r = linalg.matmul - ins(%sA, %sB : tensor<256x16xf32>, tensor<16x256xf32>) - outs(%arg2 : tensor<256x256xf32>) -> tensor<256x256xf32> - - return %r : tensor<256x256xf32> + return %3 : tensor<518x518xf32> } diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- 
a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -57,7 +57,7 @@ %f0 = constant 0.0 : f32 // CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref - // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref + // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) {alignment = 128 : i64} : memref // CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref %r = linalg.fill(%f0, %A) : f32, tensor -> tensor @@ -133,6 +133,7 @@ /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc. // CHECK: %[[ALLOC:.*]] = memref.alloc + // CHECK: linalg.copy({{.*}}, %[[ALLOC]]) // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]] %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor @@ -161,22 +162,24 @@ %t1 : tensor<4xf32> {linalg.inplaceable = true}) -> (tensor, tensor, tensor, tensor) { - // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. + // Hoisted allocs. + // CHECK: %[[REALLOC_A1:.*]] = memref.alloc + // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc // CHECK: %[[REALLOC_A0:.*]] = memref.alloc + + // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. // CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]] // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]] // CHECK: linalg.copy(%[[t0]], %[[SV_A0]]) %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor // Alloc and copy the whole result tensor. Copy the tensor.extract_slice. - // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc // CHECK: linalg.copy(%[[A0]] // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]] // CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]]) %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice. 
- // CHECK: %[[REALLOC_A1:.*]] = memref.alloc // CHECK: linalg.copy(%[[A1]] // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]] // CHECK: linalg.copy(%[[t0]], %[[SV_A1]]) @@ -255,7 +258,7 @@ func @insert_slice_fun_not_inplace(%A : tensor, %t : tensor<4xf32>) -> tensor { - // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref + // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) {alignment = 128 : i64} : memref // CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref to memref<4xf32> // CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32> @@ -285,7 +288,7 @@ // fill would interfere with %r0 that is also being returned. // So we need to bufferize it out of place and make a new alloc. - // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref + // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) {alignment = 128 : i64} : memref // CHECK: linalg.fill(%{{.*}}, %[[ALLOC]] %r1 = linalg.fill(%f0, %A) : f32, tensor -> tensor @@ -489,9 +492,9 @@ %v1 = constant 1.0 : f32 %v2 = constant 2.0 : f32 - // CHECK-NEXT: %[[A:.*]] = memref.alloc() : memref<64xf32> - // CHECK-NEXT: %[[B:.*]] = memref.alloc() : memref<64xf32> - // CHECK-NEXT: %[[C:.*]] = memref.alloc() : memref + // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref + // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> + // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> %A = linalg.init_tensor [64] : tensor<64xf32> %B = linalg.init_tensor [64] : tensor<64xf32> %C = linalg.init_tensor [] : tensor @@ -686,6 +689,9 @@ %c8 = constant 8 : index %c16 = constant 16 : index + // Hoisted alloc. 
+ // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32> + // CHECK: scf.for %[[I:.*]] = %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) { %1 = tensor.extract_slice %A[%arg3, 0] [8, 256] [1, 1] : @@ -697,7 +703,6 @@ tensor<256x192xf32> to tensor<256x16xf32> // %4 does not match an insert_slice, it cannot be bufferized inplace and needs to alloc. - // CHECK: %[[ALLOC:.*]] = memref.alloc() : memref<8x16xf32> // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1] // TODO: %4 is never read but just overwritten, this copy can be elided. // CHECK: linalg.copy(%[[T]], %[[ALLOC]])