diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.h b/mlir/include/mlir/Dialect/Linalg/Passes.h
--- a/mlir/include/mlir/Dialect/Linalg/Passes.h
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.h
@@ -56,18 +56,13 @@
 /// Placeholder for now, this is NYI.
 std::unique_ptr<OperationPass<FuncOp>> createConvertLinalgToAffineLoopsPass();
 
-/// Create a pass that bufferizes the body of a FuncOp and tries to reuse the
-/// buffers for those arguments that:
-/// a) have been annotated 'inplaceable' and
-/// b) whose buffer uses would be free of memory hazards.
-std::unique_ptr<FunctionPass> createLinalgComprehensiveFuncBufferizePass();
-
 /// This pass implements a cross-dialect bufferization approach and performs an
 /// analysis to determine which op operands and results may be bufferized in the
 /// same buffers. The analysis is performed on topologically sorted CallOp and
 /// FuncOp within a module. It provides analyses and bufferization across
-/// function boundaries. Within a single function body, the bufferization used
-/// is that provided by `LinalgComprehensiveFuncBufferizePass`.
+/// function boundaries. Within a function boundary, the analysis is performed
+/// on SSA use-def chains starting from function operands that are annotated
+/// with the 'inplaceable' attribute.
 std::unique_ptr<Pass> createLinalgComprehensiveModuleBufferizePass();
 
 /// Create a pass to convert Linalg operations which work on tensors to use
diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -22,26 +22,6 @@
   let dependentDialects = ["linalg::LinalgDialect", "memref::MemRefDialect"];
 }
 
-def LinalgComprehensiveFuncBufferize :
-    FunctionPass<"linalg-comprehensive-func-bufferize"> {
-  let summary = "Bufferize (tensor into memref) the body of a FuncOp and try "
-                "to reuse the buffers for those arguments that "
-                "a) have been annotated 'inplaceable' and "
-                "b) whose buffer uses would be free of memory hazards";
-  let description = [{
-    This pass implements a cross-dialect bufferization approach and performs an
-    analysis to determine which op operands and results may be bufferized in the
-    same buffers. The analysis is performed on SSA use-def chains starting from
-    function operands that are annotated with the 'inplaceable' attribute.
-  }];
-  let options = [
-    Option<"testAnalysisOnly", "test-analysis-only", "bool",
-           /*default=*/"false",
-           "Only runs inplaceability analysis (for testing purposes only)">
-  ];
-  let constructor = "mlir::createLinalgComprehensiveFuncBufferizePass()";
-}
-
 def LinalgComprehensiveModuleBufferize :
     Pass<"linalg-comprehensive-module-bufferize", "ModuleOp"> {
   let summary = "Bufferize (tensor into memref) for a Module.";
@@ -50,8 +30,9 @@
     analysis to determine which op operands and results may be bufferized in the
     same buffers. The analysis is performed on topologically sorted CallOp and
     FuncOp within a module. It provides analyses and bufferization across
-    function boundaries. Within a single function body, the bufferization used
-    is that provided by `-linalg-comprehensive-func-bufferize`.
+    function boundaries. Within a function boundary, the analysis is performed
+    on SSA use-def chains starting from function operands that are annotated
+    with the 'inplaceable' attribute.
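+
+    As a minimal illustrative sketch (mirroring the `fill_inplace` test below,
+    not prescribing exact output), an argument annotated 'inplaceable' may have
+    its buffer reused by a write that has no conflicting later read:
+
+    ```mlir
+    func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true})
+        -> tensor<?xf32> {
+      %f0 = constant 0.0 : f32
+      // No later read of the original contents of %A: the fill may write
+      // into the buffer of %A instead of allocating a new one.
+      %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+      return %r : tensor<?xf32>
+    }
+    ```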
   }];
   let options = [
     Option<"testAnalysisOnly", "test-analysis-only", "bool",
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferize.cpp
@@ -979,10 +979,10 @@
 /// Apply `fun` to all the members of the equivalence class of `v`.
 void BufferizationAliasInfo::applyOnEquivalenceClass(
     Value v, function_ref<void(Value)> fun) const {
-  for (auto it = equivalentInfo.findLeader(v),
-            eit = equivalentInfo.member_end();
-       it != eit; ++it) {
-    fun(v);
+  auto leaderIt = equivalentInfo.findLeader(v);
+  for (auto mit = leaderIt, meit = equivalentInfo.member_end(); mit != meit;
+       ++mit) {
+    fun(mit->v);
   }
 }
 
@@ -1485,9 +1485,8 @@
         getEquivalentEnclosingFuncBBArg(returnVal, aliasInfo)) {
       Value oldRes = callOp->getResult(returnOperand.getOperandNumber());
       int64_t idx = bbArg.getArgNumber();
-      Value buffer = bvm.lookupOrNull(callOp->getOperand(idx));
-      if (!buffer)
-        return callOp->emitError() << "operand #" << idx << " not bufferized";
+      Value buffer = lookup(bvm, callOp->getOperand(idx));
+      assert(buffer && "expected bufferized value");
       // Add CallOp operand/result equivalence: this is interprocedural info.
       aliasInfo.insertNewBufferEquivalence(oldRes, buffer);
       map(bvm, oldRes, buffer);
@@ -1504,11 +1503,11 @@
       continue;
     }
 
-    // TODO: Need to hoist above function boundary and add to
-    // `hoistedArgumentTypes`.
-    if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo))
-      return allocOp->emitError()
-             << " needs hoist across function boundary\n";
+    // TODO: Need to hoist above function boundary.
+    if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo)) {
+      hoistedArguments.push_back(allocOp->getResult(0));
+      continue;
+    }
 
     // Other cases legitimately need to return a tensor, this is currently not
     // supported. For instance, if hoisting across function boundary has
@@ -1518,13 +1517,14 @@
     int64_t returnIdx = returnOperand.getOperandNumber();
     return returnOp->emitError()
-           << " bufferize result #" << returnIdx << "\n";
+           << "buffer result #" << returnIdx << " not produced by an alloc\n";
    }
  }
 
   // 2. Compute bufferized FunctionType.
   SmallVector<Type> argumentTypes{callOp->getOperandTypes()};
-  llvm::append_range(argumentTypes, ValueRange{hoistedArguments}.getTypes());
+  ValueRange hoistedArgs{hoistedArguments};
+  llvm::append_range(argumentTypes, hoistedArgs.getTypes());
   // Get the bufferized FunctionType for funcOp or construct it if not yet
   // available.
   FunctionType bufferizedFuncType = getOrCreateBufferizedFunctionType(
@@ -1543,8 +1543,8 @@
     // Tensor operands are guaranteed to have been bufferized.
     int64_t idx = opOperand.getOperandNumber();
-    Value buffer = bvm.lookupOrNull(tensorOperand);
-    assert(buffer && " missing buffer for operand");
+    Value buffer = lookup(bvm, tensorOperand);
+    assert(buffer && "expected bufferized value");
 
     // Caller / callee type mismatch is handled with a CastOp.
     auto memRefType = bufferizedFuncType.getInput(idx);
@@ -1592,7 +1592,7 @@
           ? rankedMemRefType.getAffineMaps()
           : ArrayRef<AffineMap>{};
   Type memRefType = getContiguousOrUnrankedMemRefType(
-      castOp.getResult().getType(), {}, memorySpace);
+      castOp.getResult().getType(), affineMaps, memorySpace);
   Value res = b.create<memref::CastOp>(castOp.getLoc(), memRefType,
                                        lookup(bvm, castOp.source()));
   aliasInfo.insertNewBufferEquivalence(res, castOp.getResult());
@@ -2176,64 +2176,21 @@
   return failure(result.wasInterrupted());
 }
 
-namespace {
-struct LinalgComprehensiveFuncBufferize
-    : public LinalgComprehensiveFuncBufferizeBase<
-          LinalgComprehensiveFuncBufferize> {
-  void runOnFunction() override;
-
-  void getDependentDialects(DialectRegistry &registry) const override {
-    registry.insert();
-  }
-};
-} // end namespace
-
-void LinalgComprehensiveFuncBufferize::runOnFunction() {
-  auto funcOp = getFunction();
-
-  // Analysis phase.
-  DominanceInfo domInfo(funcOp);
-  BufferizationAliasInfo aliasInfo(funcOp);
-  // If the analysis fails, just return. This is expected to reset the IR and no
-  // single OpResult should be marked inPlace.
-  if (failed(inPlaceAnalysisFuncOpBody(funcOp, aliasInfo, domInfo))) {
-    signalPassFailure();
-    return;
-  }
-
-  if (testAnalysisOnly)
-    return;
-
-  // Bufferization phase.
-  BlockAndValueMapping bvm;
-  DenseMap<FuncOp, FunctionType> bufferizedFunctionTypes;
-  if (failed(bufferizeFuncOpInternals(funcOp, bvm, aliasInfo,
-                                      bufferizedFunctionTypes)))
-    signalPassFailure();
-
-  // Post-pass cleanup of inplaceable attributes.
-  funcOp.walk([&](Operation *op) { op->removeAttr(kInPlaceResultsAttrName); });
-}
-
-std::unique_ptr<FunctionPass>
-mlir::createLinalgComprehensiveFuncBufferizePass() {
-  return std::make_unique<LinalgComprehensiveFuncBufferize>();
-}
-
 //===----------------------------------------------------------------------===//
 // Bufferization entry-point for modules.
 //===----------------------------------------------------------------------===//
 
-/// Return the op with Allocate MemoryEffect if `v` is equivalent to an such
+/// Return the op with Allocate MemoryEffect if `v` is equivalent to such
 /// an op. Return null otherwise.
 static Operation *getEquivalentAlloc(Value value,
                                      const BufferizationAliasInfo &aliasInfo) {
-  Operation *res;
+  Operation *res = nullptr;
   aliasInfo.applyOnEquivalenceClass(value, [&](Value v) {
     if (!res)
       if (auto interface =
              dyn_cast_or_null<MemoryEffectOpInterface>(v.getDefiningOp()))
        if (auto effect =
-               interface.getEffectOnValue<MemoryEffects::Allocate>(value))
+               interface.getEffectOnValue<MemoryEffects::Allocate>(v))
          res = v.getDefiningOp();
   });
   return res;
 }
@@ -2249,9 +2206,12 @@
   if (!funcOp)
     funcOp = op->getParentOfType<FuncOp>();
   assert(funcOp && "expected non-null FuncOp");
-  for (BlockArgument bbArg : funcOp.getArguments())
+  for (BlockArgument bbArg : funcOp.getArguments()) {
+    if (!bbArg.getType().isa<RankedTensorType>())
+      continue;
     if (aliasInfo.areEquivalentBufferizedValues(v, bbArg))
       return bbArg;
+  }
   return nullptr;
 }
 
@@ -2292,9 +2252,6 @@
   // externally).
   // -> Figure out a better layering.
   TypeRange resultTypes;
-  FunctionType bufferizedFuncType =
-      getOrCreateBufferizedFunctionType(funcOp, funcOp.getType().getInputs(),
-                                        resultTypes, bufferizedFunctionTypes);
 
   // Corner case: Bodiless FuncOp
   // ============================
@@ -2305,6 +2262,9 @@
     if (llvm::any_of(funcOp.getType().getResults(), isaTensor))
       return funcOp->emitError() << "cannot bufferize bodiless function that "
                                  << "returns a tensor";
+    FunctionType bufferizedFuncType =
+        getOrCreateBufferizedFunctionType(funcOp, funcOp.getType().getInputs(),
+                                          TypeRange{}, bufferizedFunctionTypes);
     funcOp.setType(bufferizedFuncType);
     LLVM_DEBUG(DBGS() << "End bufferizeFuncOpBoundary no fun body: " << funcOp);
     return success();
@@ -2323,16 +2283,29 @@
     Value returnVal = returnOperand.get();
     if (getEquivalentEnclosingFuncBBArg(returnVal, aliasInfo))
       continue;
-    // TODO: Need to hoist above function boundary. If this is not possible due
-    // to data-depedent sizes, we need a better type than memref.
-    if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo))
-      return allocOp->emitError() << " needs hoist across function boundary\n";
+
+    // TODO: Need to hoist above function boundary.
+    if (Operation *allocOp = getEquivalentAlloc(returnVal, aliasInfo)) {
+      returnValues.push_back(allocOp->getResult(0));
+      continue;
+    }
+
+    // Other cases legitimately need to return a tensor, this is currently not
+    // supported. For instance, if hoisting across function boundary has
+    // failed, it may be due to e.g. data-dependent sizes. In such a case, we
+    // would need a better type than memref.
     int64_t returnIdx = returnOperand.getOperandNumber();
-    return returnOp->emitError() << " bufferize result #" << returnIdx << "\n";
+    return returnOp->emitError()
+           << "buffer result #" << returnIdx << " not produced by an alloc\n";
   }
 
   // 2. Rewrite the terminator without the inPlace bufferizable values.
-  OpBuilder(returnOp).create<ReturnOp>(returnOp.getLoc(), returnValues);
+  ValueRange retValues{returnValues};
+  FunctionType bufferizedFuncType = getOrCreateBufferizedFunctionType(
+      funcOp, funcOp.getType().getInputs(), retValues.getTypes(),
+      bufferizedFunctionTypes);
+  OpBuilder b(returnOp);
+  b.create<ReturnOp>(returnOp.getLoc(), returnValues);
   returnOp->erase();
 
   // 3. Rewrite the bbArgs.
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
deleted file mode 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis-invalid.mlir
+++ /dev/null
@@ -1,26 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize=test-analysis-only -split-input-file -verify-diagnostics
-
-// -----
-
-func @scf_for(%A : tensor<?xf32>,
-              %B : tensor<?xf32> {linalg.inplaceable = true},
-              %C : tensor<4xf32>,
-              %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // Throw a wrench in the system by swapping yielded values: this results in
-    // a ping-pong of values at each iteration on which we currently want to fail.
-
-    // expected-error @+1 {{Yield operand #1 does not bufferize to an equivalent buffer}}
-    scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
-  }
-
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
deleted file mode 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize-analysis.mlir
+++ /dev/null
@@ -1,474 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize=test-analysis-only -split-input-file | FileCheck %s
-
-//===----------------------------------------------------------------------===//
-// Simple cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_fun
-func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
-  -> (tensor<4xf32>, tensor<8xf32>)
-{
-  // tensor.extract_slice is not used in a write, it is not compelled to
-  // bufferize out of place. Let callers decide whether they want to create
-  // aliasing subviews at all call sites or whether they allocate.
-  // This is true irrespective of whether the function argument is inplaceable.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
-
-  return %r0, %r1: tensor<4xf32>, tensor<8xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(
-    %A : tensor<?xf32>,
-    %B : tensor<?xf32> {linalg.inplaceable = true},
-    %C : tensor<4xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // must bufferize out of place.
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // bufferizes inplace.
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @conflict_on_B
-func @conflict_on_B(
-    %A : tensor<4x4xf32> {linalg.inplaceable = true},
-    %B : tensor<4x4xf32> {linalg.inplaceable = true})
-  -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // matmul output operand interferes with input operand.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %C = linalg.matmul  ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // matmul output operand interferes with input operand.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %D = linalg.matmul  ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // matmul output operand does not interfere with input operand.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Length-1 producer-consumer cases.
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_extract_slice
-func @extract_slice_extract_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
-  -> (tensor<2xf32>, tensor<2xf32>)
-{
-  // tensor.extract_slice is not used in a write, it is not compelled to
-  // bufferize out of place. Let callers decide whether they want to create
-  // aliasing subviews at all call sites or whether they allocate.
-  // This is true irrespective of whether the function argument is inplaceable.
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
-
-  return %r1, %r3: tensor<2xf32>, tensor<2xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_insert_slice
-func @insert_slice_insert_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %A2 : tensor<4xf32> {linalg.inplaceable = true},
-    %A3 : tensor<2xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32>, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["false"]}
-  %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
-
-  // CHECK: {__inplace_results_attr__ = ["false"]}
-  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
-func @extract_slice_nonmatching_insert_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32>, %idx: index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // %r1 bufferizes inplace because %A is inplaceable.
-  // %r0 is an overlapping tensor.extract_slice that does not match, it must be
-  // out of place.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // %r1 can bufferize inplace fine.
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // %r3 does not bufferize inplace because %B is not inplaceable.
-  // %r2 is an overlapping tensor.extract_slice that does not match, but does
-  // not alias with the buffer coming from %r3 so it can actually bufferize
-  // inplace.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // %r3 cannot bufferize inplace since %B is not inplaceable.
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_matching_insert_slice
-func @extract_slice_matching_insert_slice(
-    %A : tensor<?xf32> {linalg.inplaceable = true},
-    %B : tensor<?xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // %r1 bufferizes inplace because %A is inplaceable.
-  // %r0 is a tensor.extract_slice that matches, it can also be bufferized
-  // inplace.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized
-  // inplace.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // tensor.insert_slice cannot bufferize inplace.
-  // This should have been captured by a canonicalization pattern and it would
-  // be unproductive to have special logic in bufferization to encode matching
-  // insert_slice(extract_slice(A), A).
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_linalg_readonly_use
-func @extract_slice_linalg_readonly_use(
-    %A : tensor<?x?xf32>,
-    %B : tensor<4x4xf32>,
-    %C : tensor<4x4xf32> {linalg.inplaceable = true})
-  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // tensor.extract_slice is only used as a read, no interference irrespective
-  // of user's inplace status.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // matmul output operand is not inplaceable at the function boundary.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %D = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%B: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // matmul output operand is inplaceable at the function boundary.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%C: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_to_linalg_write_use
-func @extract_slice_to_linalg_write_use(
-    %A : tensor<4x4xf32>,
-    %B : tensor<?x?xf32>,
-    %C : tensor<?x?xf32> {linalg.inplaceable = true})
-  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // Step 3. %sB forward propagates to a write in %D but it is not inplace.
-  // So this is only ever read and can bufferize inplace.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 2. %sB has a read interference in %E, it does not bufferize inplace.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%sB: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // Step 4. %sC forward propagates to an inplace write in %E.
-  // %sC backward propagates to %C which is inplaceable.
-  // As a consequence this is bufferized inplace.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 1. %sC backprops to the tensor.extract_slice producer which is not
-  // considered an interference. This bufferizes inplace.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%sC: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Transitive cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_to_linalg_write_use
-func @extract_slice_to_linalg_write_use(
-    %A : tensor<4x4xf32>,
-    %B : tensor<?x?xf32>,
-    %C : tensor<?x?xf32> {linalg.inplaceable = true})
-  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
-{
-  // Step 4. %sB forward propagates to an inplace write in %D.
-  // %sB backward propagates to %B which is not inplaceable.
-  // As a consequence this is bufferized out of place.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 1. %sB backprops to the tensor.extract_slice producer which is not
-  // considered an interference. This bufferizes inplace.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%sB: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  // Step 3. %sC forward propagates to an inplace write in %E.
-  // %sC backward propagates to %C which is inplaceable.
-  // As a consequence this is bufferized inplace.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-
-  // Step 1. %sC backprops to the tensor.extract_slice producer which is not
-  // considered an interference. This bufferizes inplace.
-  // CHECK: linalg.matmul
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
-                     outs(%sC: tensor<4x4xf32>)
-    -> tensor<4x4xf32>
-
-  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @nested_extract_slice_and_insert
-func @nested_extract_slice_and_insert(
-    %A : tensor<?x?xf32>,
-    %B : tensor<?x?xf32> {linalg.inplaceable = true},
-    %C : tensor<?x?xf32> {linalg.inplaceable = true},
-    %idx : index)
-  ->  (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
-{
-  %f0 = constant 0.0 : f32
-
-  // 2-level matching tensor.extract_slice / tensor.insert_slice into non
-  // inplaceable %A.
-  //   - %rA is not inplaceable because %A is not inplaceable at function boundary.
-  //   - once %rA is deemed not inplaceable, nothing prevents %rsA from being inplaceable
-  //   - this propagates to %FA and %ssA being inplaceable.
-  //   - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
-  //     inplaceable and so %sA is not inplaceable.
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-  %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
-  %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
-  %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  // 3-level matching tensor.extract_slice / tensor.insert_slice into
-  // inplaceable %B.
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.extract_slice
-  // Atm, this 2nd tensor.extract_slice fails to bufferize inplace because
-  // clobbering analysis conservatively tests for equivalent buffers.
-  // TODO: This is currently too restrictive and misses clobberings.
-  // When available, use container-containee analysis.
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
-  %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
-  %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
-  %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
-  %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
-  %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  // 2-level matching tensor.extract_slice / tensor.insert_slice into
-  // inplaceable %C with a twist.
-  // Throw a wrench in the system: %rsC production sizes do not match %ssC.
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // The tensor.insert_slice that would be candidate for matching does not actually
-  // match. That tensor.insert_slice can still be bufferized inplace nonetheless
-  // but this tensor.extract_slice, which bufferizes to an inplace write, cannot.
-  // CHECK-NEXT: tensor.extract_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
-  // CHECK-NEXT: fill
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  %sC = tensor.extract_slice %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
-  %ssC = tensor.extract_slice %sC[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
-  %FC = linalg.fill(%f0, %ssC) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
-  %rsC = tensor.insert_slice %FC into %sC[0, 0][12345, 67890][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
-  %rC = tensor.insert_slice %rsC into %C[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
-
-  return %rA, %rB, %rC: tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Simple loop cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @scf_for_yield_only
-func @scf_for_yield_only(%A : tensor<?xf32>,
-                         %B : tensor<?xf32> {linalg.inplaceable = true},
-                         %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: scf.for
-  // CHECK-NEXT: scf.yield
-  // CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
-  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  // CHECK: scf.for
-  // CHECK-NEXT: scf.yield
-  // CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
-  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
-func @scf_for_with_tensor.insert_slice(%A : tensor<?xf32>,
-                                       %B : tensor<?xf32> {linalg.inplaceable = true},
-                                       %C : tensor<4xf32>,
-                                       %lb : index, %ub : index, %step : index)
  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: scf.for
-  // scf.for bbArgs are always inplaceable seen from ops inside the body:
-  //   1. Either the matching tensor is not inplaceable and an alloc occurs
-  //      which makes bbArg inplaceable.
-  //   2. Or it is already inplaceable and so is bbArg.
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: tensor.insert_slice
-  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
-  // CHECK-NEXT: scf.yield
-  // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]}
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
-  }
-
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
-
diff --git a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
deleted file mode 100644
--- a/mlir/test/Dialect/Linalg/comprehensive-func-bufferize.mlir
+++ /dev/null
@@ -1,353 +0,0 @@
-// RUN: mlir-opt %s -linalg-comprehensive-func-bufferize -split-input-file | FileCheck %s
-
-// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-// CHECK-LABEL: func @fill_inplace(
-// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: tensor<?xf32> {linalg.inplaceable = true})
-func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
-  // CHECK: %[[I:.*]] = memref.buffer_cast %[[A]] : memref<?xf32, #[[$map_2d_dyn]]>
-
-  // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
-  %f0 = constant 0.0 : f32
-
-  /// Inplaceable, no alloc
-  // CHECK-NOT: alloc
-  // CHECK: linalg.fill(%[[F0]], %[[I]]) : f32, memref<?xf32, #[[$map_2d_dyn]]>
-  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: %[[R:.*]] = memref.tensor_load %[[I]] : memref<?xf32, #[[$map_2d_dyn]]>
-  // CHECK: return %[[R]] : tensor<?xf32>
-  return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
-
-/// No linalg.inplaceable flag, must allocate.
-// CHECK-LABEL: func @not_inplace(
-// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: tensor<?xf32>)
-func @not_inplace(%A : tensor<?xf32>) -> tensor<?xf32> {
-  // CHECK: %[[I:.*]] = memref.buffer_cast %[[A]] : memref<?xf32, #[[$map_2d_dyn]]>
-
-  // CHECK: %[[D0:.*]] = memref.dim %[[I]], {{.*}} : memref<?xf32, #[[$map_2d_dyn]]>
-  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
-  // CHECK: %[[I2:.*]] = memref.cast %[[ALLOC]] : memref<?xf32> to memref<?xf32, #[[$map_2d_dyn]]>
-
-  // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
-  %f0 = constant 0.0 : f32
-
-  // CHECK: linalg.fill(%[[F0]], %[[I2]]) : f32, memref<?xf32, #[[$map_2d_dyn]]>
-  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: dealloc %[[ALLOC]] : memref<?xf32>
-  // CHECK: %[[R:.*]] = memref.tensor_load %[[I2]] : memref<?xf32, #[[$map_2d_dyn]]>
-  // CHECK: return %[[R]] : tensor<?xf32>
-  return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @not_inplace
-// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: tensor<?x?xf32>
-func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
-  %f0 = constant 0.0 : f32
-
-  // CHECK: %[[BUFFER_CAST:.*]] = memref.buffer_cast %[[A]] : memref<?x?xf32
-
-  /// The first op must allocate: it writes while %A is still read after.
-  // CHECK: alloc
-  %f = linalg.fill(%f0, %A) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
-
-  /// The second op has no interfering reads and can reuse.
-  // CHECK-NOT: alloc
-  // CHECK: linalg.matmul{{.*}}outs(%[[BUFFER_CAST]]
-  %r = linalg.matmul  ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%A: tensor<?x?xf32>)
-    -> tensor<?x?xf32>
-  return %r: tensor<?x?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @not_inplace
-func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
-  /// Within op multiple uses of %A, must alloc.
-  // CHECK: alloc
-  %r = linalg.matmul  ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
-                     outs(%A: tensor<?x?xf32>)
-    -> tensor<?x?xf32>
-  return %r: tensor<?x?xf32>
-}
-// -----
-
-// CHECK-LABEL: func @vec_inplace
-func @vec_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
-    -> tensor<?xf32>
-{
-  %c0 = constant 0 : index
-  // CHECK-NOT: alloc
-  %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
-  return %r: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @vec_not_inplace
-func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
-    -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %c0 = constant 0 : index
-  %c1 = constant 1 : index
-
-  // CHECK: %[[BUFFER_CAST:.*]] = memref.buffer_cast {{.*}} : memref<?xf32, {{.*}}>
-
-  /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc
-  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
-  %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
-
-  /// The second vector.transfer has no interfering reads and can reuse the buffer.
-  // CHECK-NOT: alloc
-  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[BUFFER_CAST]]
-  %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A0 : tensor<?xf32>, %A1 : tensor<?xf32> {linalg.inplaceable = true},
-                       %t0 : tensor<4xf32>, %t1 : tensor<4xf32> {linalg.inplaceable = true})
-  -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: %[[BUFFER_CAST_A0:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK: %[[BUFFER_CAST_A1:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK: %[[BUFFER_CAST_t0:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-  // CHECK: %[[BUFFER_CAST_t1:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
-  // CHECK: %[[REALLOC_A0:.*]] = memref.alloc
-  // CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
-  // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
-  // CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A0]])
-  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Alloc and copy the whole result tensor. Copy the tensor.extract_slice.
-  // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
-  // CHECK: linalg.copy(%[[BUFFER_CAST_A0]]
-  // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
-  // CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A0_2]])
-  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Still alloc the large tensor because %A1 is read after. Copy the tensor.extract_slice.
-  // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
-  // CHECK: linalg.copy(%[[BUFFER_CAST_A1]]
-  // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
-  // CHECK: linalg.copy(%[[BUFFER_CAST_t0]], %[[SV_A1]])
-  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // Do not realloc the large tensor. Copy the tensor.extract_slice.
-  // CHECK-NOT: alloc
-  // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[BUFFER_CAST_A1]]
-  // CHECK: linalg.copy(%[[BUFFER_CAST_t1]], %[[SV_A1_2]])
-  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
-  -> tensor<?xf32>
-{
-  %f0 = constant 0.0 : f32
-
-  // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
-  // CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  /// Overwrite BUFFER_CAST_A inplace.
-  // CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
-  %r1 = linalg.fill(%f0, %r0) : f32, tensor<?xf32> -> tensor<?xf32>
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun
-func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
-  -> tensor<?xf32>
-{
-  %f0 = constant 0.0 : f32
-
-  // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-
-  // CHECK: linalg.fill({{.*}}, %[[BUFFER_CAST_A]]
-  %r0 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK-NOT: alloc
-  // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]]
-  /// Overwrite BUFFER_CAST_A inplace by copying into the subview.
-  // CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]])
-  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
-  -> tensor<?xf32>
-{
-  // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-  // CHECK: %[[ALLOC:.*]] = memref.alloc{{.*}} : memref<?xf32>
-  // CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC]]) : memref<?xf32, #map>, memref<?xf32>
-  // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
-  // CHECK: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
-  // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-  return %r0: tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @insert_slice_fun_not_inplace
-func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %f0 = constant 0.0 : f32
-
-  // CHECK-DAG: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK-DAG: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast {{.*}} : memref<4xf32
-  // CHECK-DAG: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
-  // CHECK-DAG: linalg.copy(%[[BUFFER_CAST_B]], %[[SV]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
-  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-  // fill would interfere with %r0 that is also being returned.
-  // So we need to bufferize it out of place and make a new alloc.
-  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
-  // CHECK-DAG: %[[ALLOC_CAST_DYNAMIC:.*]] = memref.cast %[[ALLOC]] : memref<?xf32> to memref<?xf32, {{.*}}>
-  %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK-DAG: %[[RET_A:.*]] = memref.tensor_load %[[BUFFER_CAST_A]] : memref<?xf32, {{.*}}>
-  // CHECK-DAG: %[[RET_B:.*]] = memref.tensor_load %[[ALLOC_CAST_DYNAMIC]] : memref<?xf32, {{.*}}>
-  // CHECK: return %[[RET_A]], %[[RET_B]] : tensor<?xf32>, tensor<?xf32>
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @extract_slice_fun
-func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
-  ->  tensor<4xf32>
-{
-  // This bufferizes to a pattern that the cross-function boundary pass needs to
-  // convert into a new memref argument at all call sites; this may be either:
-  //   - an externally created aliasing subview (if we want to allow aliasing
-  //     function arguments).
-  //   - a new alloc + copy (more expensive but does not create new function
-  //     argument aliasing).
-  // CHECK-NOT: alloc
-  // CHECK-NOT: copy
-  // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast {{.*}} : memref<?xf32
-  // CHECK: %[[SV:.*]] = memref.subview %[[BUFFER_CAST_A]][0] [4] [1]
-  // CHECK: %[[RES:.*]] = memref.tensor_load %[[SV]]
-  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
-
-  // CHECK: return %[[RES]]
-  return %r0: tensor<4xf32>
-}
-
-//===----------------------------------------------------------------------===//
-// Simple loop cases
-//===----------------------------------------------------------------------===//
-
-// -----
-
-// CHECK-LABEL: func @scf_for_yield_only
-func @scf_for_yield_only(%A : tensor<?xf32>,
-                         %B : tensor<?xf32> {linalg.inplaceable = true},
-                         %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
-  // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast
-  // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast
-  // CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC_FOR_A]])
-
-  // The first scf.for remains but just turns into dead code.
-  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  // The second scf.for remains but just turns into dead code.
-  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
-    scf.yield %t : tensor<?xf32>
-  }
-
-  // Cross function call alloc/dealloc pattern must be hoisted out.
-  // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
-  // CHECK: %[[rA:.*]] = memref.tensor_load %[[ALLOC_FOR_A]]
-  // Returning tensor_load of the buffer cast makes the %r1 loop dead.
-  // CHECK: %[[rB:.*]] = memref.tensor_load %[[BUFFER_CAST_B:.*]]
-  // CHECK: return %[[rA]], %[[rB]] : tensor<?xf32>, tensor<?xf32>
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
-func @scf_for_with_tensor.insert_slice(
-    %A : tensor<?xf32>,
-    %B : tensor<?xf32> {linalg.inplaceable = true},
-    %C : tensor<4xf32>,
-    %lb : index, %ub : index, %step : index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
-  // CHECK: %[[BUFFER_CAST_A:.*]] = memref.buffer_cast
-  // CHECK: %[[BUFFER_CAST_B:.*]] = memref.buffer_cast
-  // CHECK: %[[BUFFER_CAST_C:.*]] = memref.buffer_cast
-  // CHECK: linalg.copy(%[[BUFFER_CAST_A]], %[[ALLOC_FOR_A]])
-
-  // CHECK: scf.for {{.*}} iter_args(%[[bbA:.*]] = %{{.*}}, %[[bbB:.*]] = %{{.*}})
-  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
-      -> (tensor<?xf32>, tensor<?xf32>)
-  {
-    // CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1]
-    // %ttA bufferizes to direct copy of %BUFFER_CAST_C into %svA
-    // CHECK: linalg.copy(%[[BUFFER_CAST_C]], %[[svA]])
-    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // %ttB bufferizes to direct copy of %BUFFER_CAST_C into %BUFFER_CAST_B
-    // CHECK: %[[svB:.*]] = memref.subview %[[BUFFER_CAST_B]][0] [4] [1]
-    // CHECK: linalg.copy(%[[BUFFER_CAST_C]], %[[svB]])
-    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
-
-    // Yielding bbA and bbB will canonicalize away into oblivion.
-    // CHECK: scf.yield %[[bbA]], %[[bbB]] : tensor<?xf32>, tensor<?xf32>
-    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
-  }
-
-  // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
-  // CHECK: %[[rA:.*]] = memref.tensor_load %[[ALLOC_FOR_A]] : memref<?xf32>
-  // CHECK: %[[rB:.*]] = memref.tensor_load %[[BUFFER_CAST_B]] : memref<?xf32, {{.*}}>
-  // CHECK: return %[[rA]], %[[rB]] : tensor<?xf32>, tensor<?xf32>
-  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
-}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -1,5 +1,483 @@
 // RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=test-analysis-only -split-input-file | FileCheck %s
 
+//===----------------------------------------------------------------------===//
+// Simple cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_fun
+func @extract_slice_fun(%A : tensor<?xf32>, %B : tensor<?xf32> {linalg.inplaceable = true})
+  -> (tensor<4xf32>, tensor<8xf32>)
+{
+  // tensor.extract_slice is not used in a write, it is not compelled to
+  // bufferize out of place. Let callers decide whether they want to create
+  // aliasing subviews at all call sites or whether they allocate.
+  // This is true irrespective of whether the function argument is inplaceable.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.extract_slice %B[0][8][1] : tensor<?xf32> to tensor<8xf32>
+
+  return %r0, %r1: tensor<4xf32>, tensor<8xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_fun
+func @insert_slice_fun(
+    %A : tensor<?xf32>,
+    %B : tensor<?xf32> {linalg.inplaceable = true},
+    %C : tensor<4xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // must bufferize out of place.
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r0 = tensor.insert_slice %C into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // bufferizes inplace.
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %C into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @conflict_on_B
+func @conflict_on_B(
+    %A : tensor<4x4xf32> {linalg.inplaceable = true},
+    %B : tensor<4x4xf32> {linalg.inplaceable = true})
+  -> (tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // matmul output operand interferes with input operand.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %C = linalg.matmul  ins(%A, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // matmul output operand interferes with input operand.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %D = linalg.matmul  ins(%B, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // matmul output operand does not interfere with input operand.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %C, %D, %E: tensor<4x4xf32>, tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Length-1 producer-consumer cases.
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_extract_slice
+func @extract_slice_extract_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true}, %B : tensor<?xf32>)
+  -> (tensor<2xf32>, tensor<2xf32>)
+{
+  // tensor.extract_slice is not used in a write, it is not compelled to
+  // bufferize out of place. Let callers decide whether they want to create
+  // aliasing subviews at all call sites or whether they allocate.
+  // This is true irrespective of whether the function argument is inplaceable.
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.extract_slice %r0[0][2][1] : tensor<4xf32> to tensor<2xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r3 = tensor.extract_slice %r2[0][2][1] : tensor<4xf32> to tensor<2xf32>
+
+  return %r1, %r3: tensor<2xf32>, tensor<2xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @insert_slice_insert_slice
+func @insert_slice_insert_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %A2 : tensor<4xf32> {linalg.inplaceable = true},
+    %A3 : tensor<2xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32>, %B2 : tensor<4xf32>, %B3 : tensor<2xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.insert_slice %A3 into %A2[0][2][1] : tensor<2xf32> into tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  %r2 = tensor.insert_slice %B3 into %B2[0][2][1] : tensor<2xf32> into tensor<4xf32>
+
+  // CHECK: {__inplace_results_attr__ = ["false"]}
+  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_nonmatching_insert_slice
+func @extract_slice_nonmatching_insert_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32>, %idx: index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // %r1 bufferizes inplace because %A is inplaceable.
+  // %r0 is an overlapping tensor.extract_slice that does not match, it must be
+  // out of place.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // %r1 can bufferize inplace fine.
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %r0 into %A[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // %r3 does not bufferize inplace because %B is not inplaceable.
+  // %r2 is an overlapping tensor.extract_slice that does not match, but does
+  // not alias with the buffer coming from %r3 so it can actually bufferize
+  // inplace.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // %r3 cannot bufferize inplace since %B is not inplaceable.
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r3 = tensor.insert_slice %r2 into %B[%idx][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_matching_insert_slice
+func @extract_slice_matching_insert_slice(
+    %A : tensor<?xf32> {linalg.inplaceable = true},
+    %B : tensor<?xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // %r1 bufferizes inplace because %A is inplaceable.
+  // %r0 is a tensor.extract_slice that matches, it can also be bufferized
+  // inplace.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r1 = tensor.insert_slice %r0 into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // %r2 is a tensor.extract_slice that matches %r3, it can be bufferized
+  // inplace.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %r2 = tensor.extract_slice %B[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // tensor.insert_slice cannot bufferize inplace.
+  // This should have been captured by a canonicalization pattern and it would
+  // be unproductive to have special logic in bufferization to encode matching
+  // insert_slice(extract_slice(A), A).
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %r3 = tensor.insert_slice %r2 into %B[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  return %r1, %r3: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_linalg_readonly_use
+func @extract_slice_linalg_readonly_use(
+    %A : tensor<?x?xf32>,
+    %B : tensor<4x4xf32>,
+    %C : tensor<4x4xf32> {linalg.inplaceable = true})
+  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // tensor.extract_slice is only used as a read, no interference irrespective
+  // of user's inplace status.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sA = tensor.extract_slice %A[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // matmul output operand is not inplaceable at the function boundary.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %D = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%B: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // matmul output operand is inplaceable at the function boundary.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%sA, %B: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%C: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_to_linalg_write_use
+func @extract_slice_to_linalg_write_use(
+    %A : tensor<4x4xf32>,
+    %B : tensor<?x?xf32>,
+    %C : tensor<?x?xf32> {linalg.inplaceable = true})
+  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // Step 3. %sB forward propagates to a write in %D but it is not inplace.
+  // So this is only ever read and can bufferize inplace.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 2. %sB has a read interference in %E, it does not bufferize inplace.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%sB: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // Step 4. %sC forward propagates to an inplace write in %E.
+  // %sC backward propagates to %C which is inplaceable.
+  // As a consequence this is bufferized inplace.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 1. %sC backprops to the tensor.extract_slice producer which is not
+  // considered an interference. This bufferizes inplace.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%A, %sB: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%sC: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+//===----------------------------------------------------------------------===//
+// Transitive cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @extract_slice_to_linalg_write_use
+func @extract_slice_to_linalg_write_use(
+    %A : tensor<4x4xf32>,
+    %B : tensor<?x?xf32>,
+    %C : tensor<?x?xf32> {linalg.inplaceable = true})
+  ->  (tensor<4x4xf32>, tensor<4x4xf32>)
+{
+  // Step 4. %sB forward propagates to an inplace write in %D.
+  // %sB backward propagates to %B which is not inplaceable.
+  // As a consequence this is bufferized out of place.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %sB = tensor.extract_slice %B[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 1. %sB backprops to the tensor.extract_slice producer which is not
+  // considered an interference. This bufferizes inplace.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %D = linalg.matmul  ins(%B, %C: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%sB: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  // Step 3. %sC forward propagates to an inplace write in %E.
+  // %sC backward propagates to %C which is inplaceable.
+  // As a consequence this is bufferized inplace.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sC = tensor.extract_slice %C[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+
+  // Step 1. %sC backprops to the tensor.extract_slice producer which is not
+  // considered an interference. This bufferizes inplace.
+  // CHECK: linalg.matmul
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %E = linalg.matmul  ins(%A, %A: tensor<4x4xf32>, tensor<4x4xf32>)
+                     outs(%sC: tensor<4x4xf32>)
+    -> tensor<4x4xf32>
+
+  return %D, %E: tensor<4x4xf32>, tensor<4x4xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @nested_extract_slice_and_insert
+func @nested_extract_slice_and_insert(
+    %A : tensor<?x?xf32>,
+    %B : tensor<?x?xf32> {linalg.inplaceable = true},
+    %C : tensor<?x?xf32> {linalg.inplaceable = true},
+    %idx : index)
+  ->  (tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>)
+{
+  %f0 = constant 0.0 : f32
+
+  // 2-level matching tensor.extract_slice / tensor.insert_slice into non
+  // inplaceable %A.
+  //   - %rA is not inplaceable because %A is not inplaceable at function boundary.
+  //   - once %rA is deemed not inplaceable, nothing prevents %rsA from being inplaceable
+  //   - this propagates to %FA and %ssA being inplaceable.
+  //   - %sA would then bufferize to an inplace write (i.e. %FA) but %A is not
+  //     inplaceable and so %sA is not inplaceable.
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  %sA = tensor.extract_slice %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %ssA = tensor.extract_slice %sA[0, 0][4, 4][1, 1] : tensor<?x?xf32> to tensor<4x4xf32>
+  %FA = linalg.fill(%f0, %ssA) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+  %rsA = tensor.insert_slice %FA into %sA[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<?x?xf32>
+  %rA = tensor.insert_slice %rsA into %A[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  // 3-level matching tensor.extract_slice / tensor.insert_slice into
+  // inplaceable %B.
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.extract_slice
+  // Atm, this 2nd tensor.extract_slice fails to bufferize inplace because
+  // clobbering analysis conservatively tests for equivalent buffers.
+  // TODO: This is currently too restrictive and misses clobberings.
+  // When available, use container-containee analysis.
+  // CHECK-SAME: {__inplace_results_attr__ = ["false"]}
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: fill
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  %sB = tensor.extract_slice %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> to tensor<?x?xf32>
+  %ssB = tensor.extract_slice %sB[0, 0][4, %idx][1, 1] : tensor<?x?xf32> to tensor<4x?xf32>
+  %sssB = tensor.extract_slice %ssB[0, 0][4, 4][1, 1] : tensor<4x?xf32> to tensor<4x4xf32>
+  %FB = linalg.fill(%f0, %sssB) : f32, tensor<4x4xf32> -> tensor<4x4xf32>
+  %rssB = tensor.insert_slice %FB into %ssB[0, 0][4, 4][1, 1] : tensor<4x4xf32> into tensor<4x?xf32>
+  %rsB = tensor.insert_slice %rssB into %sB[0, 0][4, %idx][1, 1] : tensor<4x?xf32> into tensor<?x?xf32>
+  %rB = tensor.insert_slice %rsB into %B[0, 0][%idx, %idx][1, 1] : tensor<?x?xf32> into tensor<?x?xf32>
+
+  // 2-level matching tensor.extract_slice / tensor.insert_slice into
+  // inplaceable %C with a twist.
+  // Throw a wrench in the system: %rsC production sizes do not match %ssC.
+  // CHECK-NEXT: tensor.extract_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // The tensor.insert_slice that would be candidate for matching does not actually
+  // match. That tensor.insert_slice can still be bufferized inplace nonetheless
+  // but this tensor.extract_slice, which bufferizes to an inplace write, cannot.
+
+//===----------------------------------------------------------------------===//
+// Simple loop cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-LABEL: func @scf_for_yield_only
+func @scf_for_yield_only(%A : tensor<?xf32>,
+                         %B : tensor<?xf32> {linalg.inplaceable = true},
+                         %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK: scf.for
+  // CHECK-NEXT: scf.yield
+  // CHECK-NEXT: {__inplace_results_attr__ = ["false"]}
+  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  // CHECK: scf.for
+  // CHECK-NEXT: scf.yield
+  // CHECK-NEXT: {__inplace_results_attr__ = ["true"]}
+  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
+func @scf_for_with_tensor.insert_slice(%A : tensor<?xf32>,
+                                       %B : tensor<?xf32> {linalg.inplaceable = true},
+                                       %C : tensor<4xf32>,
+                                       %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK: scf.for
+  // scf.for bbArgs are always inplaceable as seen from ops inside the body:
+  //   1. Either the matching tensor is not inplaceable and an alloc occurs,
+  //      which makes the bbArg inplaceable.
+  //   2. Or it is already inplaceable and so is the bbArg.
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: tensor.insert_slice
+  // CHECK-SAME: {__inplace_results_attr__ = ["true"]}
+  // CHECK-NEXT: scf.yield
+  // CHECK-NEXT: {__inplace_results_attr__ = ["false", "true"]}
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+  }
+
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
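The loop tests hinge on one invariant: each scf.yield operand must bufferize into the same buffer as its matching iter_args bbArg, otherwise values would ping-pong between buffers across iterations (the invalid test below exercises exactly that failure). A sketch of an equivalence-preserving loop; the function and shapes are illustrative:

```mlir
func @equivalence_preserving_loop(
    %T : tensor<?xf32> {linalg.inplaceable = true}, %C : tensor<4xf32>,
    %lb : index, %ub : index, %step : index) -> tensor<?xf32>
{
  %r = scf.for %i = %lb to %ub step %step iter_args(%t = %T) -> (tensor<?xf32>) {
    // %u bufferizes into the buffer of %t, so yielding it keeps the bbArg and
    // the yield operand in the same buffer equivalence class.
    %u = tensor.insert_slice %C into %t[0][4][1] : tensor<4xf32> into tensor<?xf32>
    scf.yield %u : tensor<?xf32>
  }
  return %r : tensor<?xf32>
}
```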
+//===----------------------------------------------------------------------===//
+// Cross function boundary cases.
+//===----------------------------------------------------------------------===//
+
 func private @foo(tensor<64xf32>)

 // CHECK-LABEL: dependence_through_call
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-invalid.mlir
@@ -44,3 +44,44 @@
   call @foo() : () -> ()
   return
 }
+
+// -----
+
+func @scf_for(%A : tensor<?xf32>,
+              %B : tensor<?xf32> {linalg.inplaceable = true},
+              %C : tensor<4xf32>,
+              %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // Throw a wrench in the system by swapping the yielded values: this
+    // results in a ping-pong of values at each iteration, on which we
+    // currently want to fail.
+
+    // expected-error @+1 {{Yield operand #1 does not bufferize to an equivalent buffer}}
+    scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
+  }
+
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+func @extract_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true})
+  -> tensor<4xf32>
+{
+  // This bufferizes to a pattern that the cross-function boundary pass needs
+  // to convert into a new memref argument at all call sites; this may be
+  // either:
+  //   - an externally created aliasing subview (if we want to allow aliasing
+  //     function arguments), or
+  //   - a new alloc + copy (more expensive, but does not create new function
+  //     argument aliasing).
+  %r0 = tensor.extract_slice %A[0][4][1] : tensor<?xf32> to tensor<4xf32>
+
+  // expected-error @+1 {{buffer result #0 not produced by an alloc}}
+  return %r0: tensor<4xf32>
+}
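For contrast with the two failure cases above, here is an illustrative function, not part of the test suite, that does satisfy the boundary rule: the returned tensor is equivalent to an inplaceable argument, so the result folds onto the argument's buffer and no new memref argument or alloc hoisting is needed at call sites:

```mlir
func @return_equivalent_to_arg(%A : tensor<?xf32> {linalg.inplaceable = true})
  -> tensor<?xf32>
{
  %f0 = constant 0.0 : f32
  // %r and %A end up in the same buffer equivalence class.
  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
  return %r : tensor<?xf32>
}
```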
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -1,5 +1,355 @@
 // RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file | FileCheck %s

+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @fill_inplace(
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @fill_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}) -> tensor<?xf32> {
+  // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
+  %f0 = constant 0.0 : f32
+
+  /// Inplaceable, no alloc.
+  // CHECK-NOT: alloc
+  // CHECK: linalg.fill(%[[F0]], %[[A]]) : f32, memref<?xf32, #[[$map_1d_dyn]]>
+  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+/// No linalg.inplaceable flag, must allocate.
+// CHECK-LABEL: func @not_inplace(
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>) -> memref<?xf32> {
+func @not_inplace(%A : tensor<?xf32>) -> tensor<?xf32> {
+  // CHECK: %[[F0:.*]] = constant 0.000000e+00 : f32
+  %f0 = constant 0.0 : f32
+
+  // CHECK: %[[D0:.*]] = memref.dim %[[A]], {{.*}} : memref<?xf32, #[[$map_1d_dyn]]>
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[D0]]) : memref<?xf32>
+  // CHECK: linalg.fill(%[[F0]], %[[ALLOC]]) : f32, memref<?xf32>
+  %r = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: dealloc %[[ALLOC]] : memref<?xf32>
+  // CHECK: return %[[ALLOC]] : memref<?xf32>
+  return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_2d_dyn:.*]] = affine_map<(d0, d1)[s0, s1, s2] -> (d0 * s1 + s0 + d1 * s2)>
+
+// CHECK-LABEL: func @not_inplace
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?x?xf32, #[[$map_2d_dyn]]>) {
+func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
+  %f0 = constant 0.0 : f32
+
+  /// Cross-op multiple uses of %A, the first op which has interfering reads must alloc.
+  // CHECK: %[[ALLOC:.*]] = memref.alloc
+  // CHECK: linalg.fill({{.*}}, %[[ALLOC]]
+  %f = linalg.fill(%f0, %A) : f32, tensor<?x?xf32> -> tensor<?x?xf32>
+
+  /// The second op has no interfering reads and can reuse.
+  // CHECK-NOT: alloc
+  // CHECK: linalg.matmul ins(%[[ALLOC]], %[[ALLOC]]{{.*}}) outs(%[[A]]
+  %r = linalg.matmul ins(%f, %f: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%A: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r: tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @not_inplace
+func @not_inplace(%A : tensor<?x?xf32> {linalg.inplaceable = true}) -> tensor<?x?xf32> {
+  /// Within-op multiple uses of %A, must alloc.
+  // CHECK: alloc
+  %r = linalg.matmul ins(%A, %A: tensor<?x?xf32>, tensor<?x?xf32>)
+                     outs(%A: tensor<?x?xf32>)
+    -> tensor<?x?xf32>
+  return %r: tensor<?x?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @vec_inplace
+func @vec_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
+  -> tensor<?xf32>
+{
+  %c0 = constant 0 : index
+
+  // CHECK-NOT: alloc
+  %r = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @vec_not_inplace
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @vec_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %vec : vector<4xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %c0 = constant 0 : index
+  %c1 = constant 1 : index
+
+  /// Cross-op multiple uses of %A, the first vector.transfer which has interfering reads must alloc.
+  // CHECK: %[[ALLOC:.*]] = memref.alloc
+  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[ALLOC]]
+  %r0 = vector.transfer_write %vec, %A[%c0] : vector<4xf32>, tensor<?xf32>
+
+  /// The second vector.transfer has no interfering reads and can reuse the buffer.
+  // CHECK-NOT: alloc
+  // CHECK-NEXT: vector.transfer_write {{.*}}, %[[A]]
+  %r1 = vector.transfer_write %vec, %A[%c1] : vector<4xf32>, tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
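As a reading aid for the vector tests: when the analysis proves a transfer inplace, the op is simply retargeted at the memref form of its tensor operand; only conflicting writes pay for an allocation. A minimal sketch of the inplace buffer form, with the layout map and names assumed for illustration:

```mlir
#map = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>

func @inplace_transfer_write(%A : memref<?xf32, #map>, %vec : vector<4xf32>) {
  %c0 = constant 0 : index
  // The inplace write lands directly in the buffer of %A; an out-of-place
  // write would instead target a fresh memref.alloc.
  vector.transfer_write %vec, %A[%c0] : vector<4xf32>, memref<?xf32, #map>
  return
}
```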
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME:   %[[A0:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME:   %[[A1:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME:   %[[t0:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>,
+// CHECK-SAME:   %[[t1:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A0 : tensor<?xf32>,
+                       %A1 : tensor<?xf32> {linalg.inplaceable = true},
+                       %t0 : tensor<4xf32>,
+                       %t1 : tensor<4xf32> {linalg.inplaceable = true})
+  -> (tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>)
+{
+  // Alloc and copy the whole result tensor. Copy the inserted slice.
+  // CHECK: %[[REALLOC_A0:.*]] = memref.alloc
+  // CHECK: linalg.copy(%[[A0]], %[[REALLOC_A0]]
+  // CHECK: %[[SV_A0:.*]] = memref.subview %[[REALLOC_A0]]
+  // CHECK: linalg.copy(%[[t0]], %[[SV_A0]])
+  %r0 = tensor.insert_slice %t0 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Alloc and copy the whole result tensor. Copy the inserted slice.
+  // CHECK: %[[REALLOC_A0_2:.*]] = memref.alloc
+  // CHECK: linalg.copy(%[[A0]]
+  // CHECK: %[[SV_A0_2:.*]] = memref.subview %[[REALLOC_A0_2]]
+  // CHECK: linalg.copy(%[[t1]], %[[SV_A0_2]])
+  %r1 = tensor.insert_slice %t1 into %A0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Still alloc the large tensor because %A1 is read again afterwards. Copy
+  // the inserted slice.
+  // CHECK: %[[REALLOC_A1:.*]] = memref.alloc
+  // CHECK: linalg.copy(%[[A1]]
+  // CHECK: %[[SV_A1:.*]] = memref.subview %[[REALLOC_A1]]
+  // CHECK: linalg.copy(%[[t0]], %[[SV_A1]])
+  %r2 = tensor.insert_slice %t0 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // Do not realloc the large tensor. Copy the inserted slice.
+  // CHECK-NOT: alloc
+  // CHECK: %[[SV_A1_2:.*]] = memref.subview %[[A1]]
+  // CHECK: linalg.copy(%[[t1]], %[[SV_A1_2]])
+  %r3 = tensor.insert_slice %t1 into %A1[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: return %[[REALLOC_A0]], %[[REALLOC_A0_2]], %[[REALLOC_A1]] :
+  // CHECK-SAME: memref<?xf32>, memref<?xf32>, memref<?xf32>
+  return %r0, %r1, %r2, %r3: tensor<?xf32>, tensor<?xf32>, tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+  -> tensor<?xf32>
+{
+  %f0 = constant 0.0 : f32
+
+  // CHECK-NOT: alloc
+  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+  // CHECK: linalg.copy(%[[t]], %[[SV_A]])
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  /// Overwrite %A inplace.
+  // CHECK: linalg.fill({{.*}}, %[[A]]
+  %r1 = linalg.fill(%f0, %r0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r1: tensor<?xf32>
+}
+
+// -----
+
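The subview + copy pattern checked above is the general bufferized shape of tensor.insert_slice: take a subview of the destination buffer and copy the small buffer into it. Schematically, under assumed illustrative names and layout:

```mlir
#map = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>

func @insert_slice_shape(%dst : memref<?xf32, #map>, %small : memref<4xf32, #map>) {
  // Buffer form of: %r = tensor.insert_slice %small into %dst[0][4][1],
  // when the destination buffer may be reused (no realloc needed).
  %sv = memref.subview %dst[0] [4] [1]
      : memref<?xf32, #map> to memref<4xf32, #map>
  linalg.copy(%small, %sv) : memref<4xf32, #map>, memref<4xf32, #map>
  return
}
```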
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+  -> tensor<?xf32>
+{
+  %f0 = constant 0.0 : f32
+
+  // CHECK: linalg.fill({{.*}}, %[[A]]
+  %r0 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK-NOT: alloc
+  // CHECK: %[[SV_A:.*]] = memref.subview %[[A]]
+  /// Overwrite %A inplace by copying into the subview.
+  // CHECK: linalg.copy(%[[t]], %[[SV_A]])
+  %r1 = tensor.insert_slice %t into %r0[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-NOT: tensor
+  return %r1: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun_not_inplace(%A : tensor<?xf32>, %t : tensor<4xf32>)
  -> tensor<?xf32>
+{
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%{{.*}}) : memref<?xf32>
+  // CHECK: linalg.copy(%[[A]], %[[ALLOC]]) : memref<?xf32, #map>, memref<?xf32>
+  // CHECK: %[[SV:.*]] = memref.subview %[[ALLOC]][0] [4] [1] : memref<?xf32> to memref<4xf32>
+  // CHECK: linalg.copy(%[[t]], %[[SV]]) : memref<4xf32, #map>, memref<4xf32>
+  // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // CHECK: return %{{.*}} : memref<?xf32>
+  return %r0: tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @insert_slice_fun_not_inplace
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @insert_slice_fun_not_inplace(%A : tensor<?xf32> {linalg.inplaceable = true}, %t : tensor<4xf32>)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %f0 = constant 0.0 : f32
+
+  // tensor.insert_slice is bufferized first; %A is inplaceable so we can make
+  // this inplace.
+  // CHECK-DAG: %[[SV_A:.*]] = memref.subview %[[A]][0] [4] [1] : memref<?xf32, {{.*}}> to memref<4xf32, {{.*}}>
+  // CHECK-DAG: linalg.copy(%[[t]], %[[SV_A]]) : memref<4xf32, {{.*}}>, memref<4xf32, {{.*}}>
+  %r0 = tensor.insert_slice %t into %A[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+  // The fill would interfere with %r0, which is also being returned, so we
+  // need to bufferize it out of place and make a new alloc.
+  // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc({{.*}}) : memref<?xf32>
+  // CHECK: linalg.fill(%{{.*}}, %[[ALLOC]]
+  %r1 = linalg.fill(%f0, %A) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: memref.dealloc %[[ALLOC]] : memref<?xf32>
+  // CHECK: return %[[ALLOC]] : memref<?xf32>
+  return %r1, %r0: tensor<?xf32>, tensor<?xf32>
+}
+
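The second test above is the interference rule in action: %r0 aliases the buffer of %A, and the returned fill result must not clobber it, so the fill is redirected to a fresh allocation. Schematically, with illustrative names:

```mlir
func @out_of_place_fill(%d0 : index, %f0 : f32) -> memref<?xf32> {
  // If the fill reused the argument's buffer, the inplace insert_slice result
  // that is also returned would be overwritten. Hence a fresh buffer:
  %alloc = memref.alloc(%d0) : memref<?xf32>
  linalg.fill(%f0, %alloc) : f32, memref<?xf32>
  return %alloc : memref<?xf32>
}
```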
+//===----------------------------------------------------------------------===//
+// Simple loop cases
+//===----------------------------------------------------------------------===//
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @scf_for_yield_only
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[t:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+func @scf_for_yield_only(%A : tensor<?xf32>,
+                         %B : tensor<?xf32> {linalg.inplaceable = true},
+                         %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
+  // CHECK: linalg.copy(%[[A]], %[[ALLOC_FOR_A]])
+
+  // The first scf.for remains but just turns into dead code.
+  %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  // The second scf.for remains but just turns into dead code.
+  %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor<?xf32>) {
+    scf.yield %t : tensor<?xf32>
+  }
+
+  // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
+  // CHECK: return %[[ALLOC_FOR_A]] : memref<?xf32>
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
+
+// CHECK-LABEL: func @scf_for_with_tensor.insert_slice
+// CHECK-SAME:   %[[A:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[B:[a-zA-Z0-9]*]]: memref<?xf32, #[[$map_1d_dyn]]>
+// CHECK-SAME:   %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]>
+func @scf_for_with_tensor.insert_slice(
+    %A : tensor<?xf32>,
+    %B : tensor<?xf32> {linalg.inplaceable = true},
+    %C : tensor<4xf32>,
+    %lb : index, %ub : index, %step : index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc
+  // CHECK: linalg.copy(%[[A]], %[[ALLOC_FOR_A]])
+
+  // CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1]
+  // CHECK: %[[svB:.*]] = memref.subview %[[B]][0] [4] [1]
+
+  // CHECK: scf.for {{.*}}
+  // CHECK-NOT: iter_args
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor<?xf32>, tensor<?xf32>)
+  {
+    // %ttA bufferizes to a direct copy of %C into %svA.
+    // CHECK: linalg.copy(%[[C]], %[[svA]])
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // %ttB bufferizes to a direct copy of %C into %svB.
+    // CHECK: linalg.copy(%[[C]], %[[svB]])
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor<?xf32>
+
+    // CHECK-NOT: scf.yield
+    scf.yield %ttA, %ttB : tensor<?xf32>, tensor<?xf32>
+  }
+
+  // CHECK: memref.dealloc %[[ALLOC_FOR_A]] : memref<?xf32>
+  // CHECK: return %[[ALLOC_FOR_A]] : memref<?xf32>
+  return %r0#0, %r0#1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// Cross function boundary cases.
+//===----------------------------------------------------------------------===//
+
 // CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>

 // CHECK: func private @some_external_func(memref<?xf32, #[[$DYN_1D_MAP]]>)