diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td @@ -315,7 +315,7 @@ /*methodName=*/"isNotConflicting", /*args=*/(ins "OpOperand *":$uRead, "OpOperand *":$uWrite, - "const AnalysisState &":$state), + "AnalysisState &":$state), /*methodBody=*/"", /*defaultImplementation=*/[{ return false; diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h @@ -26,6 +26,11 @@ /// Specifies whether returning newly allocated memrefs should be allowed. /// Otherwise, a pass failure is triggered. bool allowReturnAllocs = false; + + /// Specifies whether buffers should be privatized inside of loop bodies if + /// privatization can avoid a buffer copy. + /// See SCF ForOpInterface::isNotConflicting for more details. + bool privatizeBuffersInLoops = false; }; /// The BufferizationAliasInfo class maintains a list of buffer aliases and @@ -270,6 +275,15 @@ return static_cast(iter->second.get()); } + /// Returns the extension of the specified type if it exists already. + /// Otherwise, creates the extension and then returns it. + template + Ty &getOrCreateExtension(Args &&...args) { + if (Ty *ext = getExtension()) + return *ext; + return addExtension(std::forward(args)...); + } + private: /// `aliasInfo` keeps track of aliasing and equivalent values. Only internal /// functions and `runOneShotBufferize` may access this object. 
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td --- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td @@ -309,6 +309,9 @@ /*default=*/"false", "Test only: Annotate IR with RaW conflicts. Requires " "test-analysis-only.">, + Option<"privatizeBuffersInLoops", "privatize-buffers-in-loops", "bool", + /*default=*/"false", + "Privatize buffers in loops to avoid out-of-place init_args.">, Option<"unknownTypeConversion", "unknown-type-conversion", "std::string", /*default=*/"\"fully-dynamic-layout-map\"", "Controls layout maps for non-inferrable memref types.">, diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp @@ -200,6 +200,7 @@ if (mustInferMemorySpace) opt.defaultMemorySpace = None; opt.printConflicts = printConflicts; + opt.privatizeBuffersInLoops = privatizeBuffersInLoops; opt.testAnalysisOnly = testAnalysisOnly; opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries; diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp @@ -26,81 +26,8 @@ using namespace mlir; using namespace mlir::bufferization; -/// Resolve all operands that are also used inside of repetitive regions of the -/// same op. Such cases are not fully supported by One-Shot Bufferize. -/// -/// E.g.: -/// %r = scf.for ... iter_args(%t = %tensor) -> tensor { -/// "some_use"(%tensor) -/// ... 
-/// } -/// -/// Is converted to: -/// %tensor_copy = bufferization.alloc_tensor copy(%tensor) -/// %r = scf.for ... iter_args(%t = %tensor) -> tensor { -/// "some_use"(%tensor_copy) -/// ... -/// } -static void -resolveUsesInRepetitiveRegions(Operation *op, - const BufferizationOptions &options) { - IRRewriter rewriter(op->getContext()); - AnalysisState state(options); - - // Look for repetitive ops (loops). - op->walk([&](BufferizableOpInterface bufferizableOp) { - // Skip filtered ops. - if (!options.isOpAllowed(bufferizableOp.getOperation())) - return WalkResult::advance(); - - // Find all operands that are also used inside of a repetitive region of - // this op. - for (OpOperand &opOperand : bufferizableOp->getOpOperands()) { - Value operand = opOperand.get(); - // Skip non-tensor operands. - if (!operand.getType().isa()) - continue; - // Skip operands that do not bufferize to memory writes. - if (!bufferizableOp.bufferizesToMemoryWrite(opOperand, state)) - continue; - - // Gather all uses inside repetitive regions. - SmallVector usesInsideRegion; - for (OpOperand &use : operand.getUses()) { - Operation *owner = use.getOwner(); - if (!bufferizableOp->isProperAncestor(owner)) - continue; - for (Region &r : bufferizableOp->getRegions()) { - if (r.findAncestorOpInRegion(*owner) && - bufferizableOp.isRepetitiveRegion(r.getRegionNumber())) { - usesInsideRegion.push_back(&use); - break; - } - } - } - // Nothing to do if the operand is not used inside a repetitive region. - if (usesInsideRegion.empty()) - continue; - - // Insert a tensor copy and replace all uses inside of repetitive regions. 
- rewriter.setInsertionPoint(bufferizableOp); - auto tensorCopy = rewriter.create( - bufferizableOp->getLoc(), operand.getType().cast(), - /*dynamicSizes=*/ValueRange(), - /*copy=*/operand, /*memory_space=*/IntegerAttr()); - for (OpOperand *use : usesInsideRegion) - use->set(tensorCopy); - } - - return WalkResult::advance(); - }); -} - LogicalResult mlir::bufferization::insertTensorCopies( Operation *op, const OneShotBufferizationOptions &options) { - // Preprocessing: Resolve currently unsupported bufferization cases. - resolveUsesInRepetitiveRegions(op, options); - OneShotAnalysisState state(op, options); // Run normal One-Shot Bufferize analysis or One-Shot Module Bufferize // analysis depending on whether function boundary bufferization is enabled or diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp @@ -25,8 +25,159 @@ namespace mlir { namespace scf { + namespace { +/// Attribute marker to specify op operands that are privatized. +static constexpr StringLiteral kPrivatizedOperandsAttrName = + "__privatized_operands_attr__"; + +/// Return the number of getParentOp() hops needed to reach `parent` from `op`. +static unsigned getDistanceToParent(Operation *op, Operation *parent) { + unsigned distance = 0; + while (op != parent) { + op = op->getParentOp(); + assert(op && "expected parent to be an ancestor of op"); + ++distance; + } + return distance; +} + +/// Mark the OpOperand as privatized within the given scope. Example: +/// tensor.insert %f into %t[%c0] +/// { __privatized_operands_attr__ = [[], [3], []]} +/// The second OpOperand (%t) is privatized within the scope of the third +/// parent op of the tensor.insert op. 
+static void setPrivatizedOpOperand(OpOperand &opOperand, Operation *scope) { + Operation *op = opOperand.getOwner(); + OpBuilder builder(op); + auto attr = op->getAttr(kPrivatizedOperandsAttrName); + SmallVector operandsVec; + if (attr) { + // Add to the existing attribute. + for (Attribute a : attr.cast()) + operandsVec.push_back(a); + } else { + // Create a new attribute. + operandsVec.append(op->getNumOperands(), builder.getArrayAttr({})); + } + SmallVector scopes = llvm::to_vector(llvm::map_range( + operandsVec[opOperand.getOperandNumber()].cast(), + [](Attribute a) { return a.cast().getInt(); })); + scopes.push_back(getDistanceToParent(op, scope)); + operandsVec[opOperand.getOperandNumber()] = builder.getI64ArrayAttr(scopes); + op->setAttr(kPrivatizedOperandsAttrName, builder.getArrayAttr(operandsVec)); +} +} // namespace + +class SCFAnalysisState : public OneShotAnalysisState::Extension { +public: + SCFAnalysisState(OneShotAnalysisState &state) + : OneShotAnalysisState::Extension(state) {} + + /// Mark a value as privatized within the given scope. + void privatizeValue(Value value, Operation *scope) { +#ifndef NDEBUG + Operation *definingOp = getOwnerOfValue(value); + for (Region &r : scope->getRegions()) + assert(!r.findAncestorOpInRegion(*definingOp) && + "cannot privatize value that is defined within the scope"); +#endif // NDEBUG + tentativelyPrivatizedValues[scope].insert(value); + } + + /// Materialize all value privatizations. E.g.: + /// + /// %r = scf.for ... iter_args(%0 = %t) -> tensor { + /// %read = tensor.extract %t[%idx] { privatized = [[1], []] } + /// ... + /// } + /// + /// Is rewritten to: + /// + /// %t_copy = bufferization.alloc_tensor() copy(%t) : tensor + /// %r = scf.for ... iter_args(%0 = %t) -> tensor { + /// %read = tensor.extract %t_copy[%idx] + /// ... + /// } + /// + /// Note: privatized = [[1], []] means that the 0-th OpOperand is privatized + /// within all regions of the parent of the tensor.extract op. 
([2] would + /// refer to the parent's parent etc.) + void materializePrivatizations(RewriterBase &rewriter, + Operation *scope) const { + OpBuilder::InsertionGuard g(rewriter); + rewriter.setInsertionPoint(scope); + + // Return if no values are privatized within the given scope. + auto it = privatizedValues.find(scope); + if (it == privatizedValues.end()) + return; + + for (Value value : it->second) { + auto tensorCopy = rewriter.create( + scope->getLoc(), value.getType().cast(), + /*dynamicSizes=*/ValueRange(), + /*copy=*/value, /*memory_space=*/IntegerAttr()); + + // Update all uses within scope with tensorCopy. + SmallVector uses = llvm::to_vector(llvm::map_range( + value.getUses(), [](OpOperand &use) { return &use; })); + for (OpOperand *use : uses) { + if (scope->isProperAncestor(use->getOwner())) { + rewriter.updateRootInPlace(use->getOwner(), + [&]() { use->set(tensorCopy); }); + } + } + } + } + +protected: + void notifyBufferizeInPlace(OpOperand &operand) override { + // Commit all tentative value privatizations. + for (auto &it : tentativelyPrivatizedValues) { + Operation *scope = it.first; + for (Value v : it.second) { + if (!privatizedValues[scope].insert(v).second) + // Continue if the value is already in the set. + continue; + + // Add attributes for debugging and test cases. + if (getAnalysisState().getOptions().testAnalysisOnly) + for (OpOperand &use : v.getUses()) + if (scope->isProperAncestor(use.getOwner())) + setPrivatizedOpOperand(use, scope); + } + } + tentativelyPrivatizedValues.clear(); + } + + void notifyBufferizeOutOfPlace(OpOperand &operand) override { + // The tentative value privatizations (if any) could not prevent + // out-of-place bufferizations, so we can drop them. + tentativelyPrivatizedValues.clear(); + } + +private: + /// Value privatization is a way to define custom out-of-place bufferization + /// rules in One-Shot Analysis via BufferizableOpInterface::isNotConflicting. 
+ /// A value privatization is a (Operation *, Value) tuple, where the operation + /// signifies the scope in which the SSA value should be privatized. We + /// maintain a set of values because multiple SSA values can be privatized + /// within a certain scope. + using PrivatizationMapping = DenseMap>; + + /// All privatized values and their scope. + PrivatizationMapping privatizedValues; + + /// Tentatively privatized values are value privatizations that are added + /// during the analysis of an OpOperand. They are either committed or dropped + /// at the end of the analysis, depending on whether the privatization proved + /// useful (in-place bufferization) or useless (out-of-place bufferization). + PrivatizationMapping tentativelyPrivatizedValues; +}; + +namespace { /// Helper function for loop bufferization. Cast the given buffer to the given /// memref type. static Value castBuffer(OpBuilder &b, Value buffer, Type type) { @@ -527,6 +678,11 @@ if (failed(bufferizableOp.resolveTensorOpOperandConflicts(rewriter, state))) return failure(); + if (isa(state)) + if (auto *scfState = static_cast(state) + .getExtension()) + scfState->materializePrivatizations(rewriter, op); + if (!state.getOptions().enforceAliasingInvariants) return success(); @@ -684,6 +840,93 @@ return success(); } + bool isNotConflicting(Operation *op, OpOperand *uRead, + OpOperand *uConflictingWrite, + AnalysisState &state) const { + auto &oneShotState = static_cast(state); + if (!oneShotState.getOptions().privatizeBuffersInLoops) + return false; + + // Try to privatize values inside loop bodies to avoid out-of-place + // bufferizations of init_args. E.g.: + // + // %t = ... + // scf.for ... iter_args(%0 = %t) -> tensor { + // "read"(%t) + // ... 
+ // %1 = "read_and_write"(%0) + // scf.yield %1 + // } + // + // In the above example, the iter_arg operand of the scf.for loop has to + // bufferize out-of-place: + // * conflicting write: init_arg operand of scf.for + // * read: "read"(%t) + // + // Intuitively, the init_arg cannot bufferize in-place because buffer(%t) is + // read within the loop body. Therefore, it must not be modified by the + // scf.for operation. + // + // Instead of bufferizing the init_arg out-of-place, all uses of %t can be + // privatized inside of the loop body: + // + // %t = ... + // %t_copy = bufferization.alloc_tensor() copy(%t) : tensor + // scf.for ... iter_args(%0 = %t) -> tensor { + // "read"(%t_copy) + // ... + // %1 = "read_and_write"(%0) + // scf.yield %1 + // } + // + // Note that in the absence of other conflicts, all loop iterations share + // the same copy %t_copy. In case of a conflict within the loop, every loop + // iteration gets its own copy of %t via the regular conflict resolution + // mechanism. E.g.: + // + // %t = ... + // scf.for ... iter_args(%0 = %t) -> tensor { + // %2 = "read_and_write"(%t) + // ... + // %1 = "read_and_write"(%0) + // scf.yield %1 + // } + // + // Two tensor copies are inserted in the above example: + // + // %t = ... + // %t_copy = bufferization.alloc_tensor() copy(%t) : tensor + // scf.for ... iter_args(%0 = %t) -> tensor { + // %t_copy2 = bufferization.alloc_tensor() copy(%t_copy) : tensor + // %2 = "read_and_write"(%t_copy2) + // ... + // %1 = "read_and_write"(%0) + // scf.yield %1 + // } + auto &scfState = oneShotState.getOrCreateExtension(); + + // Check if the conflicting write is an init_arg. + auto forOp = cast(op); + if (llvm::find(forOp.getInitArgs(), uConflictingWrite->get()) == + forOp.getInitArgs().end()) + return false; + + // Check if the read is inside of the scf.for op. 
+ if (!forOp.getLoopBody().findAncestorOpInRegion(*uRead->getOwner())) + return false; + + // If the read value is defined inside of the loop body, there must be some + // other op in the loop body that puts it in the same alias set as the + // init_arg. That value will be privatized, so we can ignore this conflict. + if (forOp.getLoopBody().findAncestorOpInRegion( + *getOwnerOfValue(uRead->get()))) + return true; + + // Instead of bufferizing the init_arg operand out-of-place, all uses of + // the same value inside of the loop body can be privatized. + scfState.privatizeValue(uRead->get(), op); + return true; + } }; /// Bufferization of scf.while. Replace with a new scf.while that operates on @@ -1092,6 +1335,20 @@ return true; } + LogicalResult resolveConflicts(Operation *op, RewriterBase &rewriter, + const AnalysisState &state) const { + auto bufferizableOp = cast(op); + if (failed(bufferizableOp.resolveTensorOpOperandConflicts(rewriter, state))) + return failure(); + + if (isa(state)) + if (auto *scfState = static_cast(state) + .getExtension()) + scfState->materializePrivatizations(rewriter, op); + + return success(); + } + LogicalResult bufferize(Operation *op, RewriterBase &rewriter, const BufferizationOptions &options) const { OpBuilder::InsertionGuard guard(rewriter); @@ -1170,6 +1427,42 @@ return false; return true; } + + bool isNotConflicting(Operation *op, OpOperand *uRead, + OpOperand *uConflictingWrite, + AnalysisState &state) const { + auto &oneShotState = static_cast(state); + if (!oneShotState.getOptions().privatizeBuffersInLoops) + return false; + + // Try to privatize values inside loop bodies to avoid out-of-place + // bufferizations of shared output operands. See ForOpInterface for a + // detailed explanation. + + auto &scfState = oneShotState.getOrCreateExtension(); + + // Check if the conflicting write is a shared output. 
+ auto foreachThreadOp = cast(op); + if (llvm::find(foreachThreadOp.getOutputs(), uConflictingWrite->get()) == + foreachThreadOp.getOutputs().end()) + return false; + + // Check if the read is inside of the foreach_thread op. + if (!foreachThreadOp.getBody()->findAncestorOpInBlock(*uRead->getOwner())) + return false; + + // If the read value is defined inside of the loop body, there must be some + // other op in the loop body that puts it in the same alias set as the + // shared output. That value will be privatized, so ignore this conflict. + if (foreachThreadOp.getBody()->findAncestorOpInBlock( + *getOwnerOfValue(uRead->get()))) + return true; + + // Instead of bufferizing the shared output out-of-place, all uses of + // the same value inside of the loop body can be privatized. + scfState.privatizeValue(uRead->get(), op); + return true; + } }; /// Nothing to do for PerformConcurrentlyOp. diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp --- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp +++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp @@ -759,7 +759,7 @@ bool isNotConflicting(Operation *op, OpOperand *uRead, OpOperand *uConflictingWrite, - const AnalysisState &state) const { + AnalysisState &state) const { return isNotConflictingInsertSliceLikeOp( op, uRead, uConflictingWrite, state); } @@ -1032,7 +1032,7 @@ bool isNotConflicting(Operation *op, OpOperand *uRead, OpOperand *uConflictingWrite, - const AnalysisState &state) const { + AnalysisState &state) const { return isNotConflictingInsertSliceLikeOp( op, uRead, uConflictingWrite, state); } diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir --- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir @@ -161,8 +161,7 @@ %c16 = arith.constant 16 : index // 
Hoisted alloc. - // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<128x192xf32> - // CHECK: memref.copy %[[C]], %[[ALLOC]] + // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32> // CHECK: scf.for %[[I:.*]] = %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) { @@ -174,14 +173,11 @@ %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] : tensor<256x192xf32> to tensor<256x16xf32> - // C was already replaced with a copy by preprocessing, so no copy is - // needed here. - // CHECK: %[[C_SLICE:.*]] = memref.subview %[[ALLOC]] + // Bufferizes out-of-place and is hoisted. %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] : tensor<128x192xf32> to tensor<8x16xf32> - // linalg.fill is inplace. - // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[C_SLICE]] + // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]] %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32> // CHECK: scf.for %[[K:.*]] = @@ -192,7 +188,7 @@ tensor<256x16xf32> to tensor<32x16xf32> // linalg.matmul is inplace as well as the enclosing scf.for. - // CHECK: linalg.matmul ins({{.*}} outs(%[[C_SLICE]] + // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]] %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>) outs(%arg8 : tensor<8x16xf32>) -> tensor<8x16xf32> @@ -202,8 +198,8 @@ // insert_slice is inplace but its source comes from an equivalent buffer // that is not in place. So we must insert a copy of the small buffer into // the bigger buffer. 
- // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1] - // CHECK: memref.copy %[[C_SLICE]], %[[T]] + // CHECK: %[[C_SLICE:.*]] = memref.subview %[[C]] + // CHECK: memref.copy %[[ALLOC]], %[[C_SLICE]] %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] : tensor<8x16xf32> into tensor<128x192xf32> diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-privatization-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization-analysis.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization-analysis.mlir @@ -0,0 +1,223 @@ +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries privatize-buffers-in-loops test-analysis-only" -split-input-file | FileCheck %s --check-prefix=CHECK-PRIVATIZATION + +// CHECK-LABEL: func @privatize_value( +// CHECK-PRIVATIZATION-LABEL: func @privatize_value( +func.func @privatize_value(%sz: index, %src: tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // All uses of %src inside the loop body are privatized. 
+ + // CHECK: scf.for {{.*}} { + // CHECK-PRIVATIZATION: scf.for {{.*}} { + %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor { + %pos = "dummy_op"() : () -> (index) + // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"], __privatized_operands_attr__ = [{{\[}}1], []]} + %read = tensor.extract %src[%pos] : tensor + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %s = tensor.insert %read into %t[%iv] : tensor + // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]} + // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]} + scf.yield %s : tensor + } + + // Without privatization: scf.for init_arg bufferizes out-of-place. + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + // With privatization: scf.for init_arg bufferizes in-place. + // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @privatize_value_via_alias( +// CHECK-PRIVATIZATION-LABEL: func @privatize_value_via_alias( +func.func @privatize_value_via_alias(%sz: index, %src: tensor) + -> tensor +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // All uses of %src inside the loop body are privatized. + + // CHECK: scf.for {{.*}} { + // CHECK-PRIVATIZATION: scf.for {{.*}} { + %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor { + // Create an alias of %src. 
+ %pos2 = "dummy_op"() : () -> (index) + %sz2 = "dummy_op"() : () -> (index) + // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none", "none"]} + // CHECK-PRIVATIZATION: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none", "none"], __privatized_operands_attr__ = [{{\[}}1], [], []]} + %alias = tensor.extract_slice %src[%pos2][%sz2][1] + : tensor to tensor + + %pos = "dummy_op"() : () -> (index) + // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + %read = tensor.extract %alias[%pos] : tensor + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %s = tensor.insert %read into %t[%iv] : tensor + // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]} + // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]} + scf.yield %s : tensor + } + + // Without privatization: scf.for init_arg bufferizes out-of-place. + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + // With privatization: scf.for init_arg bufferizes in-place. + // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @privatize_value_of_alias( +// CHECK-PRIVATIZATION-LABEL: func @privatize_value_of_alias( +func.func @privatize_value_of_alias(%sz: index, %src: tensor) + -> tensor +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // Create an alias of %src. + %pos2 = "dummy_op"() : () -> (index) + %sz2 = "dummy_op"() : () -> (index) + + // Without privatization: tensor.extract_slice bufferizes out-of-place. 
+ // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["false", "none", "none"]} + // With privatization: tensor.extract_slice bufferizes in-place. + // CHECK-PRIVATIZATION: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none", "none"]} + %alias = tensor.extract_slice %src[%pos2][%sz2][1] + : tensor to tensor + + // All uses of %alias (and its aliases) inside the loop body are privatized. + + // CHECK: scf.for {{.*}} { + // CHECK-PRIVATIZATION: scf.for {{.*}} { + %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %alias) -> tensor { + %pos = "dummy_op"() : () -> (index) + // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"], __privatized_operands_attr__ = [{{\[}}1], []]} + %read = tensor.extract %src[%pos] : tensor + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %s = tensor.insert %read into %t[%iv] : tensor + // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]} + // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]} + scf.yield %s : tensor + } + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @raw_conflict_on_privatized_value( +// CHECK-PRIVATIZATION-LABEL: func @raw_conflict_on_privatized_value( +func.func @raw_conflict_on_privatized_value(%sz: index, %src: tensor) + -> tensor +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // All uses of %src inside the loop body are privatized. 
+ + // CHECK: scf.for {{.*}} { + // CHECK-PRIVATIZATION: scf.for {{.*}} { + %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor { + %pos = "dummy_op"() : () -> (index) + %pos2 = "dummy_op"() : () -> (index) + %f = "dummy_op"() : () -> (f32) + + // Through privatization, all uses of %src inside of loop are replaced with + // a copy that is created just before entering the loop. This is not good + // enough yet, because that buffer copy is written here. Each loop iteration + // gets its own copy. + + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"], __privatized_operands_attr__ = [{{\[}}], [1], []]} + %write = tensor.insert %f into %src[%pos2] : tensor + // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + %read = tensor.extract %write[%pos] : tensor + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %s = tensor.insert %read into %t[%iv] : tensor + // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]} + // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]} + scf.yield %s : tensor + } + + // Without privatization: scf.for init_arg bufferizes out-of-place. + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + // With privatization: scf.for init_arg bufferizes in-place. 
+ // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @nested_loops( +// CHECK-PRIVATIZATION-LABEL: func @nested_loops( +func.func @nested_loops(%sz: index, %src: tensor) + -> tensor +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // All uses of %src inside the loop body are privatized. + + // CHECK: scf.for {{.*}} { + // CHECK-PRIVATIZATION: scf.for {{.*}} { + %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor { + + // The analysis attempts a second privatization of %src within the scope of + // this loop, but it cannot prevent out-of-place bufferization of the + // init_arg, so this privatization is aborted. + // CHECK: scf.for {{.*}} { + // CHECK-PRIVATIZATION: scf.for {{.*}} { + %r2 = scf.for %iv2 = %c0 to %sz step %c1 iter_args(%t2 = %src) -> tensor { + %pos2 = "dummy_op"() : () -> (index) + // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"], __privatized_operands_attr__ = [{{\[}}2], []]} + %read2 = tensor.extract %src[%pos2] : tensor + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %s = tensor.insert %read2 into %t2[%iv2] : tensor + // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]} + // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]} + scf.yield %s : tensor + } + + // There is no benefit of privatization (for the second loop) here. 
+ // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "false"], __privatized_operands_attr__ = [{{\[}}], [], [], [1]]} + + %pos = "dummy_op"() : () -> (index) + // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]} + %read = tensor.extract %r2[%pos] : tensor + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %s = tensor.insert %read into %t[%iv] : tensor + // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]} + // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]} + scf.yield %s : tensor + } + + // Without privatization: scf.for init_arg bufferizes out-of-place. + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + // With privatization: scf.for init_arg bufferizes in-place. 
+ // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + + return %r : tensor +} diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-privatization.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization.mlir @@ -0,0 +1,44 @@ +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries privatize-buffers-in-loops" -split-input-file | FileCheck %s --check-prefix=CHECK-PRIVATIZATION + +// CHECK-LABEL: func @privatize_value( +// CHECK-SAME: %[[sz:.*]]: index, %[[src:.*]]: memref +// CHECK-PRIVATIZATION-LABEL: func @privatize_value( +// CHECK-PRIVATIZATION-SAME: %[[sz:.*]]: index, %[[src:.*]]: memref +func.func @privatize_value(%sz: index, %src: tensor) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + + // A buffer copy is needed somewhere in this test case. + + // Without privatization: scf.for init_arg bufferizes out-of-place. No special + // handling is needed for the loop body. + + // With privatization: scf.for init_arg bufferizes in-place. All uses of %src + // in the loop body are replaced with a buffer copy (created before the loop). + // I.e., the scope of privatization is the scf.for loop. 
+ + // CHECK: %[[src_copy:.*]] = memref.alloc + // CHECK: memref.copy %[[src]], %[[src_copy]] + // CHECK-PRIVATIZATION: %[[src_copy:.*]] = memref.alloc + // CHECK-PRIVATIZATION: memref.copy %[[src]], %[[src_copy]] + + // CHECK: scf.for {{.*}} { + %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor { + %pos = "dummy_op"() : () -> (index) + // CHECK: %[[loaded:.*]] = memref.load %[[src]] + // CHECK-PRIVATIZATION: %[[loaded:.*]] = memref.load %[[src_copy]] + %read = tensor.extract %src[%pos] : tensor + // CHECK: memref.store %[[loaded]], %[[src_copy]] + // CHECK-PRIVATIZATION: memref.store %[[loaded]], %[[src]] + %s = tensor.insert %read into %t[%iv] : tensor + // CHECK-NOT: scf.yield + scf.yield %s : tensor + } + + // CHECK-NOT: memref.dealloc + // CHECK: return %[[src_copy]] + // CHECK-PRIVATIZATION: memref.dealloc %[[src_copy]] + // CHECK-PRIVATIZATION: return %[[src]] + return %r : tensor +} diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir @@ -1,5 +1,8 @@ // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -drop-equivalent-buffer-results -buffer-deallocation -split-input-file | FileCheck %s +// Test with loop privatization. +// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries privatize-buffers-in-loops" -drop-equivalent-buffer-results -buffer-deallocation -split-input-file | FileCheck %s --check-prefix=CHECK-PRIVATIZATION + // Run fuzzer with different seeds. 
// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null @@ -256,17 +259,33 @@ // CHECK-SAME: %[[t:.*]]: memref, %lb : index, %ub : index, %step : index) -> tensor { %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor { @@ -649,20 +668,30 @@ // CHECK-LABEL: func @scf_foreach_private_var( // CHECK-SAME: %[[t:.*]]: memref<10xf32 +// CHECK-PRIVATIZATION-LABEL: func @scf_foreach_private_var( +// CHECK-PRIVATIZATION-SAME: %[[t:.*]]: memref<10xf32 func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 { %c2 = arith.constant 2 : index %c5 = arith.constant 5 : index - // A copy is inserted for the uses of %t in the loop. + // Without privatization: The shared_outs operand bufferizes out-of-place. // CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32> // CHECK: memref.copy %[[t]], %[[t_copy]] + // With privatization: The shared_outs operand bufferizes in-place. + // CHECK-PRIVATIZATION: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32> + // CHECK-PRIVATIZATION: memref.copy %[[t]], %[[t_copy]] + // CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) { + // CHECK-PRIVATIZATION: scf.foreach_thread (%{{.*}}) in (%{{.*}}) { // Load from the copy and store into the shared output. 
- // CHECK: %[[subview:.*]] = memref.subview %[[t]] - // CHECK: memref.load %[[t_copy]] + // CHECK: %[[subview:.*]] = memref.subview %[[t_copy]] + // CHECK: memref.load %[[t]] // CHECK: memref.store %{{.*}}, %[[subview]] + // CHECK-PRIVATIZATION: %[[subview:.*]] = memref.subview %[[t]] + // CHECK-PRIVATIZATION: memref.load %[[t_copy]] + // CHECK-PRIVATIZATION: memref.store %{{.*}}, %[[subview]] %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> { %offset = arith.muli %c5, %tid : index %slice = tensor.extract_slice %o[%offset] [5] [1] @@ -681,9 +710,9 @@ // ----- -// CHECK-LABEL: func.func @scf_foreach_privatized_but_not_copied( +// CHECK-LABEL: func.func @scf_foreach_inplace( // CHECK-SAME: %[[t0:.*]]: memref<10xf32, {{.*}}>, %[[t1:.*]]: memref<10xf32 -func.func @scf_foreach_privatized_but_not_copied( +func.func @scf_foreach_inplace( %t0: tensor<10xf32>, %t1: tensor<10xf32>) -> f32 { %c2 = arith.constant 2 : index %c5 = arith.constant 5 : index @@ -696,7 +725,6 @@ %slice = tensor.extract_slice %o[%offset] [5] [1] : tensor<10xf32> to tensor<5xf32> - // %t1 is never written in here, so no copy is needed // CHECK: memref.load %[[t1]] %r2 = tensor.extract %t1[%tid] : tensor<10xf32> %i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32> @@ -801,20 +829,25 @@ tensor.yield %cst : f32 } : tensor - // A copy is inserted because %t is used inside the loop. + // A copy is inserted because %t is used inside the loop. The iter_args + // operand bufferizes out-of-place. 
+ // CHECK: %[[generate_copy:.*]] = memref.alloc // CHECK: memref.copy %[[generate]], %[[generate_copy]] // CHECK: scf.for %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor { %iv_sub = arith.subi %iv, %c1 : index - // CHECK: memref.subview %[[generate_copy]] + // CHECK: memref.subview %[[generate]] %ll = tensor.extract_slice %0[%iv_sub][%sz][1] : tensor to tensor %l = tensor.extract %ll[%c0] : tensor %double = arith.mulf %cst, %l : f32 - // CHECK: memref.store %{{.*}}, %[[generate]] + // CHECK: memref.store %{{.*}}, %[[generate_copy]] %s = tensor.insert %double into %t[%iv] : tensor scf.yield %s : tensor } + + // CHECK: memref.dealloc %[[generate]] + // CHECK: return %[[generate_copy]] return %r : tensor }