diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -77,6 +77,11 @@
   /// Set the inPlace bufferization spec to false.
   void bufferizeOutOfPlace(OpOperand &operand);
 
+  /// Return true if `v1` and `v2` may bufferize to aliasing buffers.
+  bool areAliasingBufferizedValues(Value v1, Value v2) const {
+    return aliasInfo.isEquivalent(v1, v2);
+  }
+
   /// Return true if `v1` and `v2` bufferize to equivalent buffers.
   bool areEquivalentBufferizedValues(Value v1, Value v2) const {
     return equivalentInfo.isEquivalent(v1, v2);
diff --git a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
--- a/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
+++ b/mlir/lib/Dialect/Linalg/ComprehensiveBufferize/ModuleBufferization.cpp
@@ -19,8 +19,8 @@
 // gathered through PostAnalysisStepFns and stored in
 // `ModuleAnalysisState`.
 //
-// * `equivalentFuncOpBBArgsAnalysis` determines the equivalent bbArg for each
-//   tensor return value (if any).
+// * `aliasingFuncOpBBArgsAnalysis` determines the equivalent/aliasing bbArgs
+//   for each tensor return value (if any).
 // * `funcOpBbArgReadWriteAnalysis` determines whether or not a tensor bbArg is
 //   read/written.
 //
@@ -93,16 +94,31 @@
 /// Extra analysis state that is required for bufferization of function
 /// boundaries.
 struct ModuleAnalysisState : public DialectAnalysisState {
+  // Note: Function arguments and/or function return values may disappear
+  // during bufferization. Functions and their CallOps are analyzed and
+  // bufferized separately. To ensure that a CallOp analysis/bufferization can
+  // access an already bufferized function's analysis results, we store
+  // bbArg/return value indices instead of BlockArgument/OpOperand pointers.
+
   /// A set of block argument indices.
   using BbArgIndexSet = DenseSet<int64_t>;
 
   /// A mapping of indices to indices.
   using IndexMapping = DenseMap<int64_t, int64_t>;
 
+  /// A mapping of indices to a list of indices.
+  using IndexToIndexListMapping = DenseMap<int64_t, SmallVector<int64_t>>;
+
   /// A mapping of ReturnOp OpOperand indices to equivalent FuncOp BBArg
   /// indices.
   DenseMap<FuncOp, IndexMapping> equivalentFuncArgs;
 
+  /// A mapping of ReturnOp OpOperand indices to aliasing FuncOp BBArg indices.
+  DenseMap<FuncOp, IndexToIndexListMapping> aliasingFuncArgs;
+
+  /// A mapping of FuncOp BBArg indices to aliasing ReturnOp OpOperand indices.
+  DenseMap<FuncOp, IndexToIndexListMapping> aliasingReturnVals;
+
   /// A set of all read BlockArguments of FuncOps.
   DenseMap<FuncOp, BbArgIndexSet> readBbArgs;
@@ -124,13 +140,21 @@
   void startFunctionAnalysis(FuncOp funcOp) {
     analyzedFuncOps[funcOp] = FuncOpAnalysisState::InProgress;
     auto createdEquiv = equivalentFuncArgs.try_emplace(funcOp, IndexMapping());
+    auto createdAliasingOperands =
+        aliasingFuncArgs.try_emplace(funcOp, IndexToIndexListMapping());
+    auto createdAliasingResults =
+        aliasingReturnVals.try_emplace(funcOp, IndexToIndexListMapping());
     auto createdRead = readBbArgs.try_emplace(funcOp, BbArgIndexSet());
     auto createdWritten = writtenBbArgs.try_emplace(funcOp, BbArgIndexSet());
     (void)createdEquiv;
+    (void)createdAliasingOperands;
+    (void)createdAliasingResults;
     (void)createdRead;
     (void)createdWritten;
 #ifndef NDEBUG
     assert(createdEquiv.second && "equivalence info exists already");
+    assert(createdAliasingOperands.second && "aliasing info exists already");
+    assert(createdAliasingResults.second && "aliasing info exists already");
     assert(createdRead.second && "bbarg access info exists already");
     assert(createdWritten.second && "bbarg access info exists already");
 #endif // NDEBUG
@@ -201,12 +225,12 @@
   op->setAttr(kEquivalentArgsAttr, b.getI64ArrayAttr(equivBbArgs));
 }
 
-/// Store function BlockArguments that are equivalent to a returned value in
-/// ModuleAnalysisState.
+/// Store function BlockArguments that are equivalent to/aliasing a returned
+/// value in ModuleAnalysisState.
 static LogicalResult
-equivalentFuncOpBBArgsAnalysis(Operation *op, AnalysisState &state,
-                               BufferizationAliasInfo &aliasInfo,
-                               SmallVector<Operation *> &newOps) {
+aliasingFuncOpBBArgsAnalysis(Operation *op, AnalysisState &state,
+                             BufferizationAliasInfo &aliasInfo,
+                             SmallVector<Operation *> &newOps) {
   ModuleAnalysisState &moduleState = getModuleAnalysisState(state);
 
   // Support only single return-terminated block in the function.
@@ -217,14 +241,20 @@
   for (OpOperand &returnVal : returnOp->getOpOperands())
     if (returnVal.get().getType().isa<TensorType>())
       for (BlockArgument bbArg : funcOp.getArguments())
-        if (bbArg.getType().isa<TensorType>())
+        if (bbArg.getType().isa<TensorType>()) {
+          int64_t returnIdx = returnVal.getOperandNumber();
+          int64_t bbArgIdx = bbArg.getArgNumber();
           if (aliasInfo.areEquivalentBufferizedValues(returnVal.get(), bbArg)) {
-            moduleState
-                .equivalentFuncArgs[funcOp][returnVal.getOperandNumber()] =
-                bbArg.getArgNumber();
+            moduleState.equivalentFuncArgs[funcOp][returnIdx] = bbArgIdx;
             if (state.getOptions().testAnalysisOnly)
               annotateEquivalentReturnBbArg(returnVal, bbArg);
           }
+          if (aliasInfo.areAliasingBufferizedValues(returnVal.get(), bbArg)) {
+            moduleState.aliasingFuncArgs[funcOp][returnIdx].push_back(bbArgIdx);
+            moduleState.aliasingReturnVals[funcOp][bbArgIdx].push_back(
+                returnIdx);
+          }
+        }
 
   return success();
 }
@@ -364,7 +394,8 @@
 }
 
 /// Gather equivalence info of CallOps.
-/// Note: This only adds new equivalence info if `funcOp` was already analyzed.
+/// Note: This only adds new equivalence info if the called function was
+/// already analyzed.
 // TODO: This does not handle cyclic function call graphs etc.
 static void equivalenceAnalysis(FuncOp funcOp,
                                 BufferizationAliasInfo &aliasInfo,
@@ -750,15 +781,23 @@
   FuncOp funcOp = getCalledFunction(callOp);
   assert(funcOp && "expected CallOp to a FuncOp");
   const ModuleAnalysisState &moduleState = getModuleAnalysisState(state);
+
+  if (getFuncOpAnalysisState(state, funcOp) !=
+      FuncOpAnalysisState::Analyzed) {
+    // FuncOp not analyzed yet. Any OpResult may be aliasing.
+    SmallVector<OpResult> result;
+    for (OpResult opResult : op->getOpResults())
+      if (opResult.getType().isa<TensorType>())
+        result.push_back(opResult);
+    return result;
+  }
+
+  // Get aliasing results from state.
+  auto aliasingReturnVals =
+      moduleState.aliasingReturnVals.lookup(funcOp).lookup(
+          opOperand.getOperandNumber());
   SmallVector<OpResult> result;
-  for (int64_t resultIdx = 0; resultIdx < callOp->getNumResults();
-       ++resultIdx)
-    if (Optional<int64_t> maybeArgNumber =
-            getEquivalentFuncArgIdx(funcOp, moduleState, resultIdx))
-      if (*maybeArgNumber == opOperand.getOperandNumber())
-        result.push_back(callOp->getOpResult(resultIdx));
-
+  for (int64_t resultIdx : aliasingReturnVals)
+    result.push_back(callOp->getOpResult(resultIdx));
   return result;
 }
 
@@ -769,17 +808,23 @@
   FuncOp funcOp = getCalledFunction(callOp);
   assert(funcOp && "expected CallOp to a FuncOp");
   const ModuleAnalysisState &moduleState = getModuleAnalysisState(state);
+  if (getFuncOpAnalysisState(state, funcOp) !=
+      FuncOpAnalysisState::Analyzed) {
+    // FuncOp not analyzed yet. Any OpOperand may be aliasing.
+    SmallVector<OpOperand *> result;
+    for (OpOperand &opOperand : op->getOpOperands())
+      if (opOperand.get().getType().isa<TensorType>())
+        result.push_back(&opOperand);
+    return result;
+  }
 
-  // TODO: We should be looking for aliasing block arguments here. The current
-  // condition is actually stronger than neccesary. Once we check for aliasing
-  // block arguments, we may be multiple.
-  if (Optional<int64_t> maybeArgNumber = getEquivalentFuncArgIdx(
-          funcOp, moduleState, opResult.getResultNumber()))
-    return {&op->getOpOperand(*maybeArgNumber)};
-
-  // Note: Returning a non-equivalent tensor from a FuncOp is currently not
-  // supported an will fail bufferization.
-  return {};
+  // Get aliasing bbArgs from state.
+  auto aliasingFuncArgs = moduleState.aliasingFuncArgs.lookup(funcOp).lookup(
+      opResult.getResultNumber());
+  SmallVector<OpOperand *> result;
+  for (int64_t bbArgIdx : aliasingFuncArgs)
+    result.push_back(&callOp->getOpOperand(bbArgIdx));
+  return result;
 }
 
 BufferRelation bufferRelation(Operation *op, OpResult opResult,
@@ -799,6 +844,8 @@
   assert(funcOp && "expected CallOp to a FuncOp");
   const ModuleAnalysisState &moduleState =
      getModuleAnalysisState(state.getAnalysisState());
+  const OneShotBufferizationOptions &options =
+      static_cast<const OneShotBufferizationOptions &>(state.getOptions());
 
   // Result types of the bufferized CallOp.
   SmallVector<Type> resultTypes;
@@ -850,8 +897,16 @@
       continue;
     }
 
-    return callOp->emitError(
-        "call to FuncOp that returns non-equivalent tensors not supported");
+    if (!options.allowReturnAllocs)
+      return callOp->emitError(
+          "call to FuncOp that returns non-equivalent tensors not supported");
+
+    // Returning a memref. This memref is not equivalent to any bbArg. It is
+    // likely a newly allocated buffer. We may want to hoist such allocations
+    // to the call site in the future.
+    retValMapping[returnValIdx] = resultTypes.size();
+    resultTypes.push_back(
+        funcOp.getFunctionType().getResult(resultTypes.size()));
   }
 
   // 2. Compute bufferized FunctionType.
@@ -859,7 +914,7 @@
   // Get the bufferized FunctionType for funcOp or construct it if not yet
   // available.
   FunctionType bufferizedFuncType = getBufferizedFunctionType(
-      funcOp.getContext(), argumentTypes, resultTypes, state.getOptions());
+      funcOp.getContext(), argumentTypes, resultTypes, options);
 
   // 3. Rewrite tensor operands as memrefs based on `bufferizedFuncType`.
   for (OpOperand &opOperand : callOp->getOpOperands()) {
@@ -1021,7 +1076,7 @@
     return failure();
 
   // Collect bbArg/return value information after the analysis.
- options.addPostAnalysisStep(equivalentFuncOpBBArgsAnalysis); + options.addPostAnalysisStep(aliasingFuncOpBBArgsAnalysis); options.addPostAnalysisStep(funcOpBbArgReadWriteAnalysis); // Analyze ops. diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -406,84 +406,6 @@ // Cross function boundary cases. //===----------------------------------------------------------------------===// -// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]> -// CHECK: func private @some_external_func(memref<4xi32, #[[$DYN_1D_MAP]]>) -func private @some_external_func(tensor<4xi32>) - -// CHECK: func @main() -func @main() { -// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> - %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> - -// CHECK-DAG: %[[alloc:.*]] = memref.alloc -// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> -// CHECK-DAG: memref.copy %[[A]], %[[alloc]] -// CHECK: call @some_external_func(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> () - call @some_external_func(%A) : (tensor<4xi32>) -> () - -// CHECK: memref.dealloc %[[alloc]] - return -} - -// ----- - -// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]> -// CHECK: func private @some_external_func_within_scf_execute(memref<4xi32, #[[$DYN_1D_MAP]]>) -func private @some_external_func_within_scf_execute(tensor<4xi32>) - -// CHECK: func @main() -func @main() { -// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> - %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> - -// CHECK-DAG: %[[alloc:.*]] = memref.alloc -// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> -// CHECK-DAG: memref.copy %[[A]], %[[alloc]] -// CHECK: call @some_external_func_within_scf_execute(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> () - scf.execute_region { - call @some_external_func_within_scf_execute(%A) : (tensor<4xi32>) -> () - scf.yield - } - -// CHECK: memref.dealloc %[[alloc]] - return -} - -// ----- - -// CHECK: func private @external_func_with_return_val(memref<4xi32, #{{.*}}>) -> f32 -func private @external_func_with_return_val(tensor<4xi32>) -> f32 - -// ----- - -// CHECK-LABEL: func @execute_region_test( -// CHECK-SAME: %[[m1:.*]]: memref {linalg.inplaceable = "true"}) - -> (f32, tensor, f32) -{ - %f1 = arith.constant 0.0 : f32 - %f2 = arith.constant 1.0 : f32 - %idx = arith.constant 7 : index - - // scf.execute_region is canonicalized away after bufferization. So just the - // memref.store is left over. 
- - // CHECK: memref.store %{{.*}}, %[[m1]][%{{.*}}] - %0, %1, %2 = scf.execute_region -> (f32, tensor, f32) { - %t2 = tensor.insert %f2 into %t1[%idx] : tensor - scf.yield %f1, %t2, %f2 : f32, tensor, f32 - } - - // CHECK: return %{{.*}}, %{{.*}} : f32, f32 - return %0, %1, %2 : f32, tensor, f32 -} - -// ----- - // CHECK-LABEL: func @execute_region_with_conflict( // CHECK-SAME: %[[m1:.*]]: memref {linalg.inplaceable = "true"}) @@ -513,193 +435,6 @@ // ----- -// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK: func private @some_external_func(memref) -func private @some_external_func(tensor) - -// CHECK: func @scf_for_with_tensor_insert_slice( -// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]> -func @scf_for_with_tensor_insert_slice( - %A : tensor, %B : tensor, %C : tensor<4xf32>, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK-NEXT: scf.for - %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) - -> (tensor, tensor) - { - // CHECK-NEXT: %[[SVA:.*]] = memref.subview %[[A]] - // CHECK-NEXT: memref.copy %[[C]], %[[SVA]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]> - %ttA = tensor.insert_slice %C into %tA[%i][4][1] : tensor<4xf32> into tensor - - // CHECK-NEXT: %[[SVB:.*]] = memref.subview %[[B]] - // CHECK-NEXT: memref.copy %[[C]], %[[SVB]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]> - %ttB = tensor.insert_slice %C into %tB[%i][4][1] : tensor<4xf32> into tensor - - // scf.yield is empty and is elided - // CHECK-NOT: scf.yield - scf.yield %ttA, %ttB : tensor, tensor - } - - // Swaparoo requires bufferizing the whole function to figure out who's who. - return %r0#1, %r0#0: tensor, tensor -} - -// CHECK: func @bar( -// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]> -func @bar( - %A : tensor {linalg.inplaceable = true}, - %B : tensor {linalg.inplaceable = true}, - %C : tensor<4xf32> {linalg.inplaceable = true}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ -// CHECK-DAG: call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]] - %r0:2 = call @scf_for_with_tensor_insert_slice(%A, %B, %C, %lb, %ub, %step) : - (tensor, tensor, tensor<4xf32>, index, index, index) - -> (tensor, tensor) - - // %r0#0 requires a copy because we have no idea what the function is doing. 
-// CHECK-DAG: %[[alloc:.*]] = memref.alloc -// CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]] -// CHECK: memref.copy %[[B]], %[[alloc]] -// CHECK-NEXT: call @some_external_func(%[[casted]]) : (memref) -> () - call @some_external_func(%r0#0) : (tensor) -> () - -// CHECK: return - return %r0#0, %r0#1: tensor, tensor -} - -// ----- - -// CHECK-DAG: #[[$DYN_0D_MAP:.*]] = affine_map<()[s0] -> (s0)> -// CHECK-DAG: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK: func @init_and_dot( -// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]> -// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]> -// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref -func @init_and_dot(%a: tensor<64xf32>, %b: tensor<64xf32>, %c: tensor) -> tensor { - // CHECK-NEXT: %[[C0:.*]] = arith.constant 0{{.*}} : f32 - %v0 = arith.constant 0.0 : f32 - - // CHECK-NEXT: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref) - %d = linalg.fill ins(%v0 : f32) outs(%c : tensor) -> tensor - - // CHECK-NEXT: linalg.dot ins(%[[A]], %[[B]] : memref<64xf32, #[[$DYN_1D_MAP]]>, memref<64xf32, #[[$DYN_1D_MAP]]>) outs(%[[C]] : memref) - %e = linalg.dot ins(%a, %b : tensor<64xf32>,tensor<64xf32>) - outs(%d: tensor) -> tensor - - // CHECK-NEXT: return - return %e : tensor -} - -// CHECK: func @main() -func @main() { - // CHECK-DAG: %[[C0:.*]] = arith.constant 0{{.*}} : f32 - // CHECK-DAG: %[[C1:.*]] = arith.constant 1{{.*}} : f32 - // CHECK-DAG: %[[C2:.*]] = arith.constant 2{{.*}} : f32 - %v0 = arith.constant 0.0 : f32 - %v1 = arith.constant 1.0 : f32 - %v2 = arith.constant 2.0 : f32 - - // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> - // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> - // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref - // CHECK-DAG: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> - // CHECK-DAG: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> - // CHECK-DAG: %[[cC:.*]] = memref.cast %[[C]] : memref to memref - %A = linalg.init_tensor [64] : tensor<64xf32> - %B = linalg.init_tensor [64] : tensor<64xf32> - %C = linalg.init_tensor [] : tensor - - // CHECK-DAG: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>) - // CHECK-DAG: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>) - // CHECK-DAG: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref) - %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32> - %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32> - %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor) -> tensor - - // CHECK-NEXT: call @init_and_dot(%[[cA]], %[[cB]], %[[cC]]) - %res = call @init_and_dot(%AA, %BB, %CC) : - (tensor<64xf32>, tensor<64xf32>, tensor) -> tensor - - // CHECK-NEXT: %[[dC:.*]] = memref.cast %[[C]] : memref to memref<*xf32> - %res2 = tensor.cast %res: tensor to tensor<*xf32> - - // CHECK-NEXT: call @print_memref_f32(%[[dC]]) : (memref<*xf32>) -> () - call @print_memref_f32(%res2) : (tensor<*xf32>) -> () - - // CHECK-DAG: memref.dealloc %[[A]] : memref<64xf32> - // CHECK-DAG: memref.dealloc %[[B]] : memref<64xf32> - // CHECK-DAG: memref.dealloc %[[C]] : memref - // CHECK-NEXT: return - return -} - -// CHECK: func private @print_memref_f32(memref<*xf32>) -func private @print_memref_f32(tensor<*xf32>) - -// ----- - -// CHECK: #[[$DYNAMIC:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK: func private 
@external_func(memref) -func private @external_func(tensor) - -// CHECK: func @callee( -// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref -// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref -// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref -func @callee(%A : tensor {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>}, - %B : tensor, - %C : tensor) { -// CHECK-NEXT: %[[CASTED:.*]] = memref.cast %[[A]] : memref to memref -// CHECK-NEXT: call @external_func(%[[CASTED]]) : (memref) -> () - call @external_func(%A) : (tensor) -> () - -// CHECK-NEXT: call @external_func(%[[B]]) : (memref) -> () - call @external_func(%B) : (tensor) -> () - -// CHECK-NEXT: call @external_func(%[[C]]) : (memref) -> () - call @external_func(%C) : (tensor) -> () - - return -} - -// CHECK: func @entry( -// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref -// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref -// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref -func @entry(%A : tensor {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false}, - %B : tensor {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false}, - %C : tensor {linalg.inplaceable = false}) { -// Note: `callee` does not write to its bbArg directly, but `external_func` -// does. Inside `callee`, the writes via `external_func` do not cause a -// conflict. However, inside `entry`, the writes do cause a conflict because -// %A, %B and %C are not inplaceable. This test case shows that this kind of -// conflict detection has a "transitive" nature. -// CHECK: %[[ALLOC_C:.*]] = memref.alloc -// CHECK: %[[CASTED_C:.*]] = memref.cast %[[ALLOC_C]] -// CHECK: %[[ALLOC_B:.*]] = memref.alloc -// CHECK: %[[CASTED_B:.*]] = memref.cast %[[ALLOC_B]] -// CHECK: %[[ALLOC_A:.*]] = memref.alloc -// CHECK: memref.copy %[[A]], %[[ALLOC_A]] -// CHECK: memref.copy %[[B]], %[[ALLOC_B]] -// CHECK: memref.copy %[[C]], %[[ALLOC_C]] -// CHECK: %[[CASTED_A:.*]] = memref.cast %[[ALLOC_A]] -// CHECK-NEXT: call @callee(%[[CASTED_A]], %[[CASTED_B]], %[[CASTED_C]]) - call @callee(%A, %B, %C) : (tensor, tensor, tensor) -> () - return -} - -// ----- - // CHECK: func @matmul( // CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32> // CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32> @@ -900,115 +635,6 @@ // ----- -// CHECK-LABEL: func @inner_func( -// CHECK-SAME: %[[arg0:.*]]: memref) -> tensor { - %f = arith.constant 1.0 : f32 - %c0 = arith.constant 0 : index - // CHECK: memref.store %{{.*}}, %[[arg0]] - %0 = tensor.insert %f into %t[%c0] : tensor - return %0 : tensor -} - -// CHECK-LABEL: func @equivalent_func_arg( -// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = true}, - %c0: index, %c10: index, %c1: index) -> tensor { - // CHECK-NOT: copy - %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor) { - // CHECK: call @inner_func(%[[arg0]]) - %3 = call @inner_func(%t1) : (tensor) -> tensor - scf.yield %3 : tensor - } - return %1: tensor -} - -// ----- - -// CHECK-LABEL: func @inner_func_2( -// CHECK-SAME: %[[arg0:.*]]: memref) -> tensor { - %f = arith.constant 1.0 : f32 - %c0 = arith.constant 0 : index - // CHECK: memref.store %{{.*}}, %[[arg0]] - %0 = tensor.insert %f into %t[%c0] : tensor - return %0 : tensor -} - -// CHECK-LABEL: func @equivalent_func_arg_2( -// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = true}, - %c0: index, %c10: index, %c1: index) -> tensor { - %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor) { - // CHECK: %[[alloc:.*]] = memref.alloc - // CHECK: %[[casted:.*]] = memref.cast %[[alloc]] - // CHECK: 
memref.copy %[[arg0]], %[[alloc]] - // CHECK: call @inner_func_2(%[[casted]]) - %3 = call @inner_func_2(%t1) : (tensor) -> tensor - scf.yield %t1 : tensor - } - return %1: tensor -} - -// ----- - -// CHECK-LABEL: func @inner_func( -// CHECK-SAME: %[[arg0:.*]]: memref) -> (tensor, f32) { - // CHECK-NOT: copy - %f = arith.constant 1.0 : f32 - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - // CHECK: memref.store %{{.*}}, %[[arg0]] - %0 = tensor.insert %f into %t[%c0] : tensor - // CHECK: %[[load:.*]] = memref.load %[[arg0]] - %1 = tensor.extract %0[%c1] : tensor - // CHECK: return %[[load]] : f32 - return %0, %1 : tensor, f32 -} - -// CHECK-LABEL: func @call_func_with_non_tensor_return( -// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = true}) -> (f32, tensor) { - // CHECK-NOT: copy - // CHECK: %[[call:.*]] = call @inner_func(%[[arg0]]) - %0, %1 = call @inner_func(%t0) : (tensor) -> (tensor, f32) - // CHECK: return %[[call]] : f32 - return %1, %0 : f32, tensor -} - -// ----- - -// CHECK-LABEL: func @func_without_tensor_args -func @func_without_tensor_args(%v : vector<10xf32>) -> () { - // CHECK: %[[alloc:.*]] = memref.alloc() - %0 = linalg.init_tensor[10] : tensor<10xf32> - - %c0 = arith.constant 0 : index - // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] - %1 = vector.transfer_write %v, %0[%c0] : vector<10xf32>, tensor<10xf32> - - %cst = arith.constant 0.0 : f32 - // CHECK: vector.transfer_read %[[alloc]] - %r = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<11xf32> - - vector.print %r : vector<11xf32> - return -} - -// ----- - -// CHECK-LABEL: func private @private_func -func private @private_func(tensor) -> () - -// CHECK-LABEL: func @empty_func() -func @empty_func() -> () { - return -} - -// ----- - func @gather_like( %arg0 : tensor {linalg.inplaceable = false}, %arg1 : tensor {linalg.inplaceable = false}, @@ -1328,3 +954,4 @@ // CHECK: return %[[r0]], %[[r1]] return %f0, %f1: f32, f32 } + diff --git a/mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/one-shot-module-bufferize-allow-return-allocs.mlir @@ -0,0 +1,64 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize=allow-return-allocs -split-input-file | FileCheck %s + +// Run fuzzer with different seeds. +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null + +// Test bufferization using memref types that have no layout map. +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs fully-dynamic-layout-maps=0" -split-input-file -o /dev/null + +// Make sure that the returned buffer is not deallocated. +// TODO: Such buffers currently leak. We need buffer hoisting / ref counting for +// this in the future. 
+ +// CHECK-LABEL: func @create_tensor() -> memref<10xf32> { +// CHECK: %[[alloc:.*]] = memref.alloc +// CHECK: return %[[alloc]] +func @create_tensor() -> tensor<10xf32> { + %0 = linalg.init_tensor [10] : tensor<10xf32> + return %0 : tensor<10xf32> +} + +// CHECK: func @caller( +// CHECK: %[[call:.*]] = call @create_tensor() : () -> memref<10xf32> +// CHECK: %[[extracted:.*]] = memref.load %[[call]] +// CHECK: return %[[extracted]] +func @caller(%idx: index) -> f32 { + %0 = call @create_tensor() : () -> (tensor<10xf32>) + %1 = tensor.extract %0[%idx] : tensor<10xf32> + return %1 : f32 +} + +// ----- + +// return_slice returns an aliasing tensor. In main, %t is overwritten (but not +// read). This is a conflict because %0 is aliasing with %t. An alloc + copy is +// needed. + +// CHECK-LABEL: func @return_slice( +// CHECK-NOT: alloc +// CHECK-NOT: copy +// CHECK: memref.subview +func @return_slice(%t: tensor, %sz: index) -> (tensor) { + %0 = tensor.extract_slice %t[4][%sz][1] : tensor to tensor + return %0 : tensor +} + +// CHECK-LABEL: func @main( +// CHECK-SAME: %[[t:.*]]: memref, %sz: index, %idx: index) -> (f32, f32) { + %cst = arith.constant 1.0 : f32 + %0 = call @return_slice(%t, %sz) : (tensor, index) -> (tensor) + %filled = linalg.fill ins(%cst : f32) outs(%t : tensor) -> tensor + %r1 = tensor.extract %0[%idx] : tensor + %r2 = tensor.extract %filled[%idx] : tensor + return %r1, %r2 : f32, f32 +} diff --git a/mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/Linalg/one-shot-module-bufferize.mlir @@ -0,0 +1,551 @@ +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize -split-input-file | FileCheck %s + +// Run fuzzer with different seeds. +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59" -split-input-file -o /dev/null +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91" -split-input-file -o /dev/null + +// Test bufferization using memref types that have no layout map. +// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-allocs fully-dynamic-layout-maps=0" -split-input-file -o /dev/null + +// Bufferization of bodiless function with no tensor return value. + +// CHECK-LABEL: func private @private_func +func private @private_func(tensor) -> () + +// CHECK-LABEL: func @empty_func() +func @empty_func() -> () { + return +} + +// ----- + +// A bodiless function that returns something that is not a tensor. + +// CHECK: func private @external_func_with_return_val(memref<4xi32, #{{.*}}>) -> f32 +func private @external_func_with_return_val(tensor<4xi32>) -> f32 + +// ----- + +// CHECK-LABEL: func private @private_func +func private @private_func(tensor) -> (f32) + +// private_func may modify the buffer arg, but that's OK because %t is writable. +// No alloc/copy should be inserted. + +// CHECK-LABEL: func @main( +// CHECK-SAME: %[[t:.*]]: memref {linalg.inplaceable = true}) -> (f32) { + %0 = call @private_func(%t) : (tensor) -> (f32) + return %0 : f32 +} + +// ----- + +// CHECK-LABEL: func private @private_func +func private @private_func(tensor) -> (f32) + +// private_func may modify the buffer arg, %t is not writable. A copy is needed. 
+ +// CHECK-LABEL: func @main( +// CHECK-SAME: %[[t:.*]]: memref {linalg.inplaceable = false}) -> (f32) { + %0 = call @private_func(%t) : (tensor) -> (f32) + return %0 : f32 +} + +// ----- + +// Test bufferization of a function without tensor args. + +// CHECK-LABEL: func @func_without_tensor_args +func @func_without_tensor_args(%v : vector<10xf32>) -> () { + // CHECK: %[[alloc:.*]] = memref.alloc() + %0 = linalg.init_tensor[10] : tensor<10xf32> + + %c0 = arith.constant 0 : index + // CHECK: vector.transfer_write %{{.*}}, %[[alloc]] + %1 = vector.transfer_write %v, %0[%c0] : vector<10xf32>, tensor<10xf32> + + %cst = arith.constant 0.0 : f32 + // CHECK: vector.transfer_read %[[alloc]] + %r = vector.transfer_read %1[%c0], %cst : tensor<10xf32>, vector<11xf32> + + vector.print %r : vector<11xf32> + return +} + +// ----- + +// Bufferization of a function that is reading and writing. %t0 is writable, so +// no copy should be inserted. + +// CHECK-LABEL: func @inner_func( +// CHECK-SAME: %[[arg0:.*]]: memref) -> (tensor, f32) { + // CHECK-NOT: copy + %f = arith.constant 1.0 : f32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + // CHECK: memref.store %{{.*}}, %[[arg0]] + %0 = tensor.insert %f into %t[%c0] : tensor + // CHECK: %[[load:.*]] = memref.load %[[arg0]] + %1 = tensor.extract %0[%c1] : tensor + // CHECK: return %[[load]] : f32 + return %0, %1 : tensor, f32 +} + +// CHECK-LABEL: func @call_func_with_non_tensor_return( +// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = true}) -> (f32, tensor) { + // CHECK-NOT: alloc + // CHECK-NOT: copy + // CHECK: %[[call:.*]] = call @inner_func(%[[arg0]]) + %0, %1 = call @inner_func(%t0) : (tensor) -> (tensor, f32) + // CHECK: return %[[call]] : f32 + return %1, %0 : f32, tensor +} + +// ----- + +// Bufferization of a function that is reading and writing. %t0 is not writable, +// so a copy is needed. + +// CHECK-LABEL: func @inner_func( +// CHECK-SAME: %[[arg0:.*]]: memref) -> (tensor, f32) { + // CHECK-NOT: copy + %f = arith.constant 1.0 : f32 + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + // CHECK: memref.store %{{.*}}, %[[arg0]] + %0 = tensor.insert %f into %t[%c0] : tensor + // CHECK: %[[load:.*]] = memref.load %[[arg0]] + %1 = tensor.extract %0[%c1] : tensor + // CHECK: return %[[load]] : f32 + return %0, %1 : tensor, f32 +} + +// CHECK-LABEL: func @call_func_with_non_tensor_return( +// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = false}) -> (f32, tensor) { + // CHECK: %[[alloc:.*]] = memref.alloc + // CHECK-DAG: memref.copy %[[arg0]], %[[alloc]] + // CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]] + // CHECK: %[[call:.*]] = call @inner_func(%[[casted]]) + %0, %1 = call @inner_func(%t0) : (tensor) -> (tensor, f32) + + // Note: The tensor return value has folded away. + // CHECK: return %[[call]] : f32 + return %1, %0 : f32, tensor +} + +// ----- + +// A chain of function calls. The last function f0 is potentially writing to the +// buffer. This becomes a problem when bufferizing main and a copy must be +// inserted then. (No copies in the other functions.) 
+ +// CHECK-LABEL: func private @f0( +func private @f0(tensor) -> (f32) + +// CHECK-LABEL: func @f1( +// CHECK-SAME: %[[t1:.*]]: memref) -> (f32) { + %0 = call @f0(%t) : (tensor) -> (f32) + return %0 : f32 +} + +// CHECK-LABEL: func @f2( +// CHECK-SAME: %[[t2:.*]]: memref) -> (f32) { + %0 = call @f1(%t) : (tensor) -> (f32) + return %0 : f32 +} + +// CHECK-LABEL: func @main( +// CHECK-SAME: %[[t3:.*]]: memref {linalg.inplaceable = false}) -> (f32) { + %0 = call @f2(%t) : (tensor) -> (f32) + return %0 : f32 +} + +// ----- + +// This function does not read, just write. We need an alloc, but no copy. + +// CHECK-LABEL: func @does_not_read( +// CHECK-NOT: alloc +// CHECK-NOT: copy +func @does_not_read(%t: tensor) -> tensor { + %f0 = arith.constant 0.0 : f32 + %r = linalg.fill ins(%f0 : f32) outs(%t : tensor) -> tensor + return %r : tensor +} + +// CHECK-LABEL: func @main( +// CHECK-SAME: %[[t:.*]]: memref {linalg.inplaceable = false}) -> f32 { + %0 = call @does_not_read(%t) : (tensor) -> (tensor) + %idx = arith.constant 4 : index + %r = tensor.extract %0[%idx] : tensor + return %r : f32 +} + +// ----- + +// Alloc and copy must be inserted because the arith.constant is read-only. + +// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]> +// CHECK: func private @some_external_func(memref<4xi32, #[[$DYN_1D_MAP]]>) +func private @some_external_func(tensor<4xi32>) + +// CHECK: func @main() +func @main() { +// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> + %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + +// CHECK-DAG: %[[alloc:.*]] = memref.alloc +// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> +// CHECK-DAG: memref.copy %[[A]], %[[alloc]] +// CHECK: call @some_external_func(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> () + call @some_external_func(%A) : (tensor<4xi32>) -> () + +// CHECK: memref.dealloc %[[alloc]] + return +} + +// ----- + +// Alloc and copy must be inserted because the arith.constant is read-only. The +// function call is inside of an scf.execute_region. + +// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK: memref.global "private" constant @__constant_4xi32 : memref<4xi32> = dense<[1, 2, 3, 4]> +// CHECK: func private @some_external_func_within_scf_execute(memref<4xi32, #[[$DYN_1D_MAP]]>) +func private @some_external_func_within_scf_execute(tensor<4xi32>) + +// CHECK: func @main() +func @main() { +// CHECK-DAG: %[[A:.*]] = memref.get_global @__constant_4xi32 : memref<4xi32> + %A = arith.constant dense<[1, 2, 3, 4]> : tensor<4xi32> + +// Note: The scf.execute_region canonicalizes away. + +// CHECK-DAG: %[[alloc:.*]] = memref.alloc +// CHECK-DAG: %[[B:.*]] = memref.cast %[[alloc]] : memref<4xi32> to memref<4xi32, #[[$DYN_1D_MAP]]> +// CHECK-DAG: memref.copy %[[A]], %[[alloc]] +// CHECK: call @some_external_func_within_scf_execute(%[[B]]) : (memref<4xi32, #[[$DYN_1D_MAP]]>) -> () + scf.execute_region { + call @some_external_func_within_scf_execute(%A) : (tensor<4xi32>) -> () + scf.yield + } + +// CHECK: memref.dealloc %[[alloc]] + return +} + +// ----- + +// A write inside an scf.execute_region. An equivalent tensor is yielded. 
+ +// CHECK-LABEL: func @execute_region_test( +// CHECK-SAME: %[[m1:.*]]: memref) + -> (f32, tensor, f32) +{ + %f1 = arith.constant 0.0 : f32 + %f2 = arith.constant 1.0 : f32 + %idx = arith.constant 7 : index + + // scf.execute_region is canonicalized away after bufferization. So just the + // memref.store is left over. + + // CHECK-NOT: alloc + // CHECK-NOT: copy + // CHECK: memref.store %{{.*}}, %[[m1]][%{{.*}}] + %0, %1, %2 = scf.execute_region -> (f32, tensor, f32) { + %t2 = tensor.insert %f2 into %t1[%idx] : tensor + scf.yield %f1, %t2, %f2 : f32, tensor, f32 + } + + // CHECK: return %{{.*}}, %{{.*}} : f32, f32 + return %0, %1, %2 : f32, tensor, f32 +} + +// ----- + +// CHECK: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK: func private @some_external_func(memref) +func private @some_external_func(tensor) + +// CHECK: func @scf_for_with_tensor_insert_slice( +// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]> +func @scf_for_with_tensor_insert_slice( + %A : tensor, %B : tensor, %C : tensor<4xf32>, + %lb : index, %ub : index, %step : index) + -> (tensor, tensor) +{ + // CHECK-NEXT: scf.for + %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) + -> (tensor, tensor) + { + // CHECK-NEXT: %[[SVA:.*]] = memref.subview %[[A]] + // CHECK-NEXT: memref.copy %[[C]], %[[SVA]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]> + %ttA = tensor.insert_slice %C into %tA[%i][4][1] : tensor<4xf32> into tensor + + // CHECK-NEXT: %[[SVB:.*]] = memref.subview %[[B]] + // CHECK-NEXT: memref.copy %[[C]], %[[SVB]] : memref<4xf32, #[[$DYN_1D_MAP]]> to memref<4xf32, #[[$DYN_1D_MAP]]> + %ttB = tensor.insert_slice %C into %tB[%i][4][1] : tensor<4xf32> into tensor + + // scf.yield is empty and is elided + // CHECK-NOT: scf.yield + scf.yield %ttA, %ttB : tensor, tensor + } + + // Swaparoo requires bufferizing the whole function to figure out who's who. + return %r0#1, %r0#0: tensor, tensor +} + +// CHECK: func @bar( +// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$DYN_1D_MAP]]> +func @bar( + %A : tensor {linalg.inplaceable = true}, + %B : tensor {linalg.inplaceable = true}, + %C : tensor<4xf32> {linalg.inplaceable = true}, + %lb : index, %ub : index, %step : index) + -> (tensor, tensor) +{ +// CHECK-DAG: call @scf_for_with_tensor_insert_slice(%[[A]], %[[B]], %[[C]] + %r0:2 = call @scf_for_with_tensor_insert_slice(%A, %B, %C, %lb, %ub, %step) : + (tensor, tensor, tensor<4xf32>, index, index, index) + -> (tensor, tensor) + + // %r0#0 requires a copy because we have no idea what the function is doing. 
+// CHECK-DAG: %[[alloc:.*]] = memref.alloc +// CHECK-DAG: %[[casted:.*]] = memref.cast %[[alloc]] +// CHECK: memref.copy %[[B]], %[[alloc]] +// CHECK-NEXT: call @some_external_func(%[[casted]]) : (memref) -> () + call @some_external_func(%r0#0) : (tensor) -> () + +// CHECK: return + return %r0#0, %r0#1: tensor, tensor +} + +// ----- + +// CHECK-DAG: #[[$DYN_0D_MAP:.*]] = affine_map<()[s0] -> (s0)> +// CHECK-DAG: #[[$DYN_1D_MAP:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK: func @init_and_dot( +// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]> +// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref<64xf32, #[[$DYN_1D_MAP]]> +// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref +func @init_and_dot(%a: tensor<64xf32>, %b: tensor<64xf32>, %c: tensor) -> tensor { + // CHECK-NEXT: %[[C0:.*]] = arith.constant 0{{.*}} : f32 + %v0 = arith.constant 0.0 : f32 + + // CHECK-NEXT: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref) + %d = linalg.fill ins(%v0 : f32) outs(%c : tensor) -> tensor + + // CHECK-NEXT: linalg.dot ins(%[[A]], %[[B]] : memref<64xf32, #[[$DYN_1D_MAP]]>, memref<64xf32, #[[$DYN_1D_MAP]]>) outs(%[[C]] : memref) + %e = linalg.dot ins(%a, %b : tensor<64xf32>,tensor<64xf32>) + outs(%d: tensor) -> tensor + + // CHECK-NEXT: return + return %e : tensor +} + +// CHECK: func @main() +func @main() { + // CHECK-DAG: %[[C0:.*]] = arith.constant 0{{.*}} : f32 + // CHECK-DAG: %[[C1:.*]] = arith.constant 1{{.*}} : f32 + // CHECK-DAG: %[[C2:.*]] = arith.constant 2{{.*}} : f32 + %v0 = arith.constant 0.0 : f32 + %v1 = arith.constant 1.0 : f32 + %v2 = arith.constant 2.0 : f32 + + // CHECK-NEXT: %[[A:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> + // CHECK-NEXT: %[[B:.*]] = memref.alloc() {alignment = 128 : i64} : memref<64xf32> + // CHECK-NEXT: %[[C:.*]] = memref.alloc() {alignment = 128 : i64} : memref + // CHECK-DAG: %[[cA:.*]] = memref.cast %[[A]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> + // CHECK-DAG: %[[cB:.*]] = memref.cast %[[B]] : memref<64xf32> to memref<64xf32, #[[$DYN_1D_MAP]]> + // CHECK-DAG: %[[cC:.*]] = memref.cast %[[C]] : memref to memref + %A = linalg.init_tensor [64] : tensor<64xf32> + %B = linalg.init_tensor [64] : tensor<64xf32> + %C = linalg.init_tensor [] : tensor + + // CHECK-DAG: linalg.fill ins(%[[C1]] : f32) outs(%[[A]] : memref<64xf32>) + // CHECK-DAG: linalg.fill ins(%[[C2]] : f32) outs(%[[B]] : memref<64xf32>) + // CHECK-DAG: linalg.fill ins(%[[C0]] : f32) outs(%[[C]] : memref) + %AA = linalg.fill ins(%v1 : f32) outs(%A : tensor<64xf32>) -> tensor<64xf32> + %BB = linalg.fill ins(%v2 : f32) outs(%B : tensor<64xf32>) -> tensor<64xf32> + %CC = linalg.fill ins(%v0 : f32) outs(%C : tensor) -> tensor + + // CHECK-NEXT: call @init_and_dot(%[[cA]], %[[cB]], %[[cC]]) + %res = call @init_and_dot(%AA, %BB, %CC) : + (tensor<64xf32>, tensor<64xf32>, tensor) -> tensor + + // CHECK-NEXT: %[[dC:.*]] = memref.cast %[[C]] : memref to memref<*xf32> + %res2 = tensor.cast %res: tensor to tensor<*xf32> + + // CHECK-NEXT: call @print_memref_f32(%[[dC]]) : (memref<*xf32>) -> () + call @print_memref_f32(%res2) : (tensor<*xf32>) -> () + + // CHECK-DAG: memref.dealloc %[[A]] : memref<64xf32> + // CHECK-DAG: memref.dealloc %[[B]] : memref<64xf32> + // CHECK-DAG: memref.dealloc %[[C]] : memref + // CHECK-NEXT: return + return +} + +// CHECK: func private @print_memref_f32(memref<*xf32>) +func private @print_memref_f32(tensor<*xf32>) + +// ----- + +// CHECK: #[[$DYNAMIC:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK: func private 
@external_func(memref) +func private @external_func(tensor) + +// CHECK: func @callee( +// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref +// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref +// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref +func @callee(%A : tensor {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>}, + %B : tensor, + %C : tensor) { +// CHECK-NEXT: %[[CASTED:.*]] = memref.cast %[[A]] : memref to memref +// CHECK-NEXT: call @external_func(%[[CASTED]]) : (memref) -> () + call @external_func(%A) : (tensor) -> () + +// CHECK-NEXT: call @external_func(%[[B]]) : (memref) -> () + call @external_func(%B) : (tensor) -> () + +// CHECK-NEXT: call @external_func(%[[C]]) : (memref) -> () + call @external_func(%C) : (tensor) -> () + + return +} + +// CHECK: func @entry( +// CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref +// CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref +// CHECK-SAME: %[[C:[0-9a-zA-Z]*]]: memref +func @entry(%A : tensor {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false}, + %B : tensor {linalg.buffer_layout = affine_map<(i)[s0, s1] -> (i)>, linalg.inplaceable = false}, + %C : tensor {linalg.inplaceable = false}) { +// Note: `callee` does not write to its bbArg directly, but `external_func` +// does. Inside `callee`, the writes via `external_func` do not cause a +// conflict. However, inside `entry`, the writes do cause a conflict because +// %A, %B and %C are not inplaceable. This test case shows that this kind of +// conflict detection has a "transitive" nature. +// CHECK: %[[ALLOC_C:.*]] = memref.alloc +// CHECK: %[[CASTED_C:.*]] = memref.cast %[[ALLOC_C]] +// CHECK: %[[ALLOC_B:.*]] = memref.alloc +// CHECK: %[[CASTED_B:.*]] = memref.cast %[[ALLOC_B]] +// CHECK: %[[ALLOC_A:.*]] = memref.alloc +// CHECK: memref.copy %[[A]], %[[ALLOC_A]] +// CHECK: memref.copy %[[B]], %[[ALLOC_B]] +// CHECK: memref.copy %[[C]], %[[ALLOC_C]] +// CHECK: %[[CASTED_A:.*]] = memref.cast %[[ALLOC_A]] +// CHECK-NEXT: call @callee(%[[CASTED_A]], %[[CASTED_B]], %[[CASTED_C]]) + call @callee(%A, %B, %C) : (tensor, tensor, tensor) -> () + return +} + +// ----- + +// No alloc or copy inside of the loop. + +// CHECK-LABEL: func @inner_func( +// CHECK-SAME: %[[arg0:.*]]: memref) -> tensor { + %f = arith.constant 1.0 : f32 + %c0 = arith.constant 0 : index + // CHECK: memref.store %{{.*}}, %[[arg0]] + %0 = tensor.insert %f into %t[%c0] : tensor + return %0 : tensor +} + +// CHECK-LABEL: func @equivalent_func_arg( +// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = true}, + %c0: index, %c10: index, %c1: index) -> tensor { + // CHECK-NOT: alloc + // CHECK-NOT: copy + %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor) { + // CHECK: call @inner_func(%[[arg0]]) + %3 = call @inner_func(%t1) : (tensor) -> tensor + scf.yield %3 : tensor + } + return %1: tensor +} + +// ----- + +// inner_func_2 modifies the bbArg, but the loop yields the original value. A +// buffer copy must be inserted inside the loop. 
+ +// CHECK-LABEL: func @inner_func_2( +// CHECK-SAME: %[[arg0:.*]]: memref) -> tensor { + %f = arith.constant 1.0 : f32 + %c0 = arith.constant 0 : index + // CHECK: memref.store %{{.*}}, %[[arg0]] + %0 = tensor.insert %f into %t[%c0] : tensor + return %0 : tensor +} + +// CHECK-LABEL: func @equivalent_func_arg_2( +// CHECK-SAME: %[[arg0:.*]]: memref {linalg.inplaceable = true}, + %c0: index, %c10: index, %c1: index) -> tensor { + // CHECK: scf.for {{.*}} { + %1 = scf.for %iv = %c0 to %c10 step %c1 iter_args(%t1 = %t0) -> (tensor) { + // CHECK: %[[alloc:.*]] = memref.alloc + // CHECK: %[[casted:.*]] = memref.cast %[[alloc]] + // CHECK: memref.copy %[[arg0]], %[[alloc]] + // CHECK: call @inner_func_2(%[[casted]]) + // CHECK: memref.dealloc %[[alloc]] + // CHECK-NOT: scf.yield + %3 = call @inner_func_2(%t1) : (tensor) -> tensor + scf.yield %t1 : tensor + } + return %1: tensor +}
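
The ModuleAnalysisState note near the top of the patch (indices instead of BlockArgument/OpOperand pointers, plus the two-way aliasingFuncArgs/aliasingReturnVals maps) is the core bookkeeping change. The following is a minimal, self-contained sketch of that idea in plain C++ using STL containers; it is illustrative only, uses no MLIR APIs, and all names in it (FuncAliasInfo, addAlias, aliasingResultsOfOperand) are hypothetical.

// Illustrative model of the per-function aliasing bookkeeping: return-value
// indices and bbArg indices are cross-referenced so that a call site can still
// be bufferized after the callee's signature has already been rewritten.
#include <cstdint>
#include <iostream>
#include <map>
#include <vector>

struct FuncAliasInfo {
  // Return value index -> equivalent bbArg index (mirrors equivalentFuncArgs).
  std::map<int64_t, int64_t> equivalentFuncArgs;
  // Return value index -> aliasing bbArg indices (mirrors aliasingFuncArgs).
  std::map<int64_t, std::vector<int64_t>> aliasingFuncArgs;
  // bbArg index -> aliasing return value indices (mirrors aliasingReturnVals).
  std::map<int64_t, std::vector<int64_t>> aliasingReturnVals;

  // Record that return value `retIdx` may alias bbArg `argIdx`. Equivalence is
  // the stronger property and is recorded separately.
  void addAlias(int64_t retIdx, int64_t argIdx, bool equivalent) {
    aliasingFuncArgs[retIdx].push_back(argIdx);
    aliasingReturnVals[argIdx].push_back(retIdx);
    if (equivalent)
      equivalentFuncArgs[retIdx] = argIdx;
  }

  // Query used when bufferizing a call site: which callee results may alias
  // the operand that is passed as bbArg `argIdx`?
  std::vector<int64_t> aliasingResultsOfOperand(int64_t argIdx) const {
    auto it = aliasingReturnVals.find(argIdx);
    return it == aliasingReturnVals.end() ? std::vector<int64_t>() : it->second;
  }
};

int main() {
  FuncAliasInfo info;
  // Analogous to @return_slice in the new test: result #0 is a slice of
  // bbArg #0, so it is aliasing but not equivalent.
  info.addAlias(/*retIdx=*/0, /*argIdx=*/0, /*equivalent=*/false);
  for (int64_t res : info.aliasingResultsOfOperand(0))
    std::cout << "operand #0 may alias result #" << res << "\n";
  return 0;
}

Queried this way, a call site only needs the callee's stored indices, never live pointers into the callee's region, which is why the analysis results remain usable after the callee itself has been bufferized.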