diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -575,6 +575,11 @@ Region *getEnclosingRepetitiveRegion(Block *block, const BufferizationOptions &options); +/// Assuming that the given region is repetitive, find the next enclosing +/// repetitive region. +Region *getNextEnclosingRepetitiveRegion(Region *region, + const BufferizationOptions &options); + namespace detail { /// This is the default implementation of /// BufferizableOpInterface::getAliasingOpOperands. Should not be called from diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp --- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp +++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp @@ -41,6 +41,15 @@ using namespace mlir; using namespace bufferization; +static bool isRepetitiveRegion(Region *region, + const BufferizationOptions &options) { + Operation *op = region->getParentOp(); + if (auto bufferizableOp = options.dynCastBufferizableOp(op)) + if (bufferizableOp.isRepetitiveRegion(region->getRegionNumber())) + return true; + return false; +} + Region *bufferization::getEnclosingRepetitiveRegion( Operation *op, const BufferizationOptions &options) { if (!op->getBlock()) @@ -52,11 +61,9 @@ Value value, const BufferizationOptions &options) { Region *region = value.getParentRegion(); while (region) { - Operation *op = region->getParentOp(); - if (auto bufferizableOp = options.dynCastBufferizableOp(op)) - if (bufferizableOp.isRepetitiveRegion(region->getRegionNumber())) - return region; - region = op->getParentRegion(); + if (isRepetitiveRegion(region, options)) + return region; + region = region->getParentRegion(); } return nullptr; } @@ -67,13 +74,22 @@ Operation *op = nullptr; do { op = region->getParentOp(); - if (auto bufferizableOp = options.dynCastBufferizableOp(op)) - if (bufferizableOp.isRepetitiveRegion(region->getRegionNumber())) - return region; + if (isRepetitiveRegion(region, options)) + return region; } while ((region = op->getParentRegion())); return nullptr; } +Region *bufferization::getNextEnclosingRepetitiveRegion( + Region *region, const BufferizationOptions &options) { + assert(isRepetitiveRegion(region, options) && "expected repetitive region"); + while ((region = region->getParentRegion())) { + if (isRepetitiveRegion(region, options)) + break; + } + return region; +} + Operation *bufferization::getOwnerOfValue(Value value) { if (auto opResult = value.dyn_cast()) return opResult.getDefiningOp(); diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp --- a/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OneShotAnalysis.cpp @@ -346,25 +346,27 @@ return false; } -/// Return `true` if op dominance can be used to rule out read-after-write -/// conflicts wrt. the given reads and writes. +/// Return `true` if op dominance can be used to rule out a read-after-write +/// conflicts based on the ordering of ops. /// -/// Op dominance can often be used to rule out potential conflicts such as -/// "read" happens before "write". E.g., the following IR is not a RaW conflict -/// because the the read happens *before* the write. +/// Generalized op dominance can often be used to rule out potential conflicts +/// due to "read happens before write". E.g., the following IR is not a RaW +/// conflict because the read happens *before* the write. /// -/// %0 = ... : tensor -/// "reading_op"(%0) : tensor -/// %1 = "writing_op"(%0) : tensor -> tensor +/// Example 1: +/// %0 = ... : tensor // DEF +/// "reading_op"(%0) : tensor // READ +/// %1 = "writing_op"(%0) : tensor -> tensor // WRITE /// /// This is no longer true inside loops (or repetitive regions). In such cases, /// there may not be a meaningful `happensBefore` relationship because ops /// could be executed multiple times. E.g.: /// -/// %0 = ... : tensor +/// Example 2: +/// %0 = ... : tensor // DEF /// scf.for ... { -/// "reading_op"(%0) : tensor -/// %1 = "writing_op"(%0) : tensor -> tensor +/// "reading_op"(%0) : tensor // READ +/// %1 = "writing_op"(%0) : tensor -> tensor // WRITE /// ... /// } /// @@ -374,92 +376,78 @@ /// execution of writing_op. This is problematic because the tensor %0 they /// operate on (i.e., the "definition") is defined outside of the loop. /// -/// Counter example: +/// On a high-level, there is a potential RaW in a program if there exists a +/// possible program execution such that there is a sequence of DEF, followed +/// by WRITE, followed by READ. Each additional DEF resets the sequence. /// +/// E.g.: +/// No conflict: DEF, WRITE, DEF, READ +/// Potential conflict: DEF, READ, WRITE, READ, WRITE +/// +/// Example 1 has no conflict: DEF, READ, WRITE +/// Example 2 has a potential conflict: DEF, (READ, WRITE)* +// +/// Example 3: /// scf.for ... { /// %0 = ... : tensor /// "reading_op"(%0) : tensor /// %1 = "writing_op"(%0) : tensor -> tensor /// ... /// } +/// This has no conflict: (DEF, READ, WRITE)* /// -/// In this example, the definition %0 is in the same repetitive region as -/// "writing_op", so op dominance can be used to compute the `happensBefore` -/// relationship. -/// -/// Whether op dominance can be used or not is decided as follows: Find the -/// closest enclosing repetitive region of all buffer writes wrt. the given -/// tensor reads and writes. (The given sets of reads and writes contain the -/// entire alias set.) In case of a read, we look at the op that defines the -/// read value. In case of a write, we look at the op that is writing. If all of -/// those ops are in the same closest enclosing repetitive region (nullptr in -/// case of "no repetitive region" found at all), then op dominance can be used. -/// Otherwise, it cannot be used. -/// -/// Example: The common enclosing repetitive region is the scf.for loop. -/// Op dominance can be used. +/// Example 4: +/// %0 = ... : tensor /// scf.for ... { -/// %0 = tensor.generate -/// "read"(%0) +/// scf.for ... { "reading_op"(%0) } +/// %1 = "writing_op"(%0) /// } +/// This has a potential conflict: DEF, ((READ)*, WRITE)* /// -/// Example: The common enclosing repetitive region is nullptr: There is no -/// repetitive region around the tensor.generate. Op dominance can be -/// used. -/// %0 = tensor.generate -/// scf.for ... { "read"(%0) } +/// Example 5: +/// %0 = ... : tensor +/// scf.for ... { %1 = "writing_op"(%0) } +/// scf.for ... { "reading_op"(%0) } +/// This has a potential conflict: DEF, WRITE*, READ* /// -/// Example: The common enclosing repetitive regions of tensor.generate and -/// "write" differ. Op dominance cannot be used. -/// %0 = tensor.generate -/// scf.for ... { -/// "read"(%0) -/// "write"(%0) -/// } +/// The following rules are used to rule out RaW conflicts via ordering of ops: /// -/// Example: The common enclosing repetitive regions of tensor.generate and -/// "write" differ, but there is no read of %0, so op dominance can be -/// used. -/// %0 = tensor.generate -/// scf.for ... { -/// "write"(%0) -/// } +/// 1. If the closest enclosing repetitive region of DEF is a proper ancestor of +/// a repetitive region that enclosing both READ and WRITE, we cannot rule +/// out RaW conflict due to the ordering of ops. +/// 2. Otherwise: There are no loops that interfere with our analysis; for +/// analysis purposes, we can assume that there are no loops/repetitive +/// regions. I.e., we can rule out a RaW conflict if READ happensBefore WRITE +/// or WRITE happensBefore DEF. (Checked in `hasReadAfterWriteInterference`.) /// -/// Note: iter_args of loops are not aliases of their respective block -/// arguments, so op domanice can be used when analyzing ops that operate -/// on them. -bool canUseOpDominance(const DenseSet &usesRead, - const DenseSet &usesWrite, +bool canUseOpDominance(OpOperand *uRead, OpOperand *uWrite, + const SetVector &definitions, const AnalysisState &state) { const BufferizationOptions &options = state.getOptions(); - std::optional commonEnclosingRegion; + for (Value def : definitions) { + Region *rRead = getEnclosingRepetitiveRegion(uRead->getOwner(), options); + Region *rDef = getEnclosingRepetitiveRegion(def, options); - // In case of a write, take the region in which the write takes place. - for (OpOperand *uWrite : usesWrite) { - Region *r = getEnclosingRepetitiveRegion(uWrite->getOwner(), options); - if (!commonEnclosingRegion.has_value()) { - commonEnclosingRegion = r; + // READ and DEF are in the same repetitive region. `happensBefore` can be + // used to rule out RaW conflicts due to op ordering. + if (rRead == rDef) continue; - } - if (*commonEnclosingRegion != r) - return false; - } - // In case of a read, take the region which the read value is defined. - for (OpOperand *uRead : usesRead) { - // Optimization: Skip reads of values that have no defined contents. - if (!state.bufferizesToMemoryWrite(uRead->get())) - continue; - Region *r = getEnclosingRepetitiveRegion(uRead->get(), options); - if (!commonEnclosingRegion.has_value()) { - commonEnclosingRegion = r; - continue; + // Find the enclosing repetitive region of READ that is closest to DEF but + // not the repetitive region of DEF itself. + while (true) { + Region *nextRegion = getNextEnclosingRepetitiveRegion(rRead, options); + if (nextRegion == rDef) + break; + assert(nextRegion && "expected to find another repetitive region"); + rRead = nextRegion; } - if (*commonEnclosingRegion != r) + + // We cannot use op dominance if WRITE is inside the same repetitive region. + if (rRead->getParentOp()->isAncestor(uWrite->getOwner())) return false; } - - return commonEnclosingRegion.has_value(); + return true; } /// Annotate IR with details about the detected RaW conflict. @@ -507,10 +495,6 @@ AnalysisState &state, const BufferizationAliasInfo &aliasInfo) { const BufferizationOptions &options = state.getOptions(); - // Check if op dominance can be used to rule out read-after-write conflicts. - bool useDominance = canUseOpDominance(usesRead, usesWrite, state); - LLVM_DEBUG(llvm::dbgs() << "\n- useDominance = " << useDominance << "\n"); - for (OpOperand *uRead : usesRead) { Operation *readingOp = uRead->getOwner(); LLVM_DEBUG(llvm::dbgs() << "\n- check conflict:\n"); @@ -542,6 +526,12 @@ << uConflictingWrite->getOperandNumber() << " of " << *uConflictingWrite->getOwner() << "\n"); + // Check if op dominance can be used to rule out read-after-write + // conflicts. + bool useDominance = + canUseOpDominance(uRead, uConflictingWrite, definitions, state); + LLVM_DEBUG(llvm::dbgs() << "\n- useDominance = " << useDominance << "\n"); + // Throughout this loop, check for multiple requirements that have to be // met for uConflictingWrite to be an actual conflict. Operation *conflictingWritingOp = uConflictingWrite->getOwner(); diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir --- a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir @@ -697,3 +697,104 @@ return %2, %7 : tensor<4xf32>, tensor<4xf32> } + +// ----- + +// CHECK-LABEL: func @read_of_bbarg_in_repetitive_region( +func.func @read_of_bbarg_in_repetitive_region( + %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) { + // CHECK: scf.for + scf.for %iv = %a to %b step %c { + // Must bufferize out-of-place because definition of read is in a different + // repetitive region. + // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["false"]} + %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32> + %3 = tensor.extract %2[%a] : tensor<4xf32> + vector.print %3 : f32 + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %4 = tensor.insert %cst into %2[%a] : tensor<4xf32> + %5 = tensor.extract %4[%a] : tensor<4xf32> + vector.print %5 : f32 + } + return +} + +// ----- + +// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_write( +func.func @read_definition_in_same_repetitive_region_as_write( + %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) { + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]} + %1 = tensor.insert %cst into %t[%a] : tensor<10xf32> + // CHECK: scf.for + scf.for %iv = %a to %b step %c { + // Can bufferize in-place. + // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]} + %2 = tensor.extract_slice %1[0][4][1] : tensor<10xf32> to tensor<4xf32> + %3 = tensor.extract %2[%a] : tensor<4xf32> + vector.print %3 : f32 + } + return +} + +// ----- + +// CHECK-LABEL: func @read_definition_in_same_repetitive_region_as_conflicting_write( +func.func @read_definition_in_same_repetitive_region_as_conflicting_write( + %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) { + // Cannot bufferize in-place according to normal op dominance rules. + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]} + %1 = tensor.insert %cst into %t[%a] : tensor<10xf32> + // CHECK: scf.for + scf.for %iv = %a to %b step %c { + // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]} + %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32> + %3 = tensor.extract %2[%a] : tensor<4xf32> + vector.print %3 : f32 + } + return +} + +// ----- + +// CHECK: func @write_value_in_repetitive_region( +func.func @write_value_in_repetitive_region( + %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) { + %0 = tensor.extract %t[%a] : tensor<10xf32> + vector.print %0 : f32 + + scf.for %iv = %a to %b step %c { + // No further read of %0, so this can bufferize in-place. + // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]} + %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32> + // CHECK: linalg.fill {__inplace_operands_attr__ = ["none", "true"]} + %filled = linalg.fill ins(%cst : f32) outs(%2 : tensor<4xf32>) -> tensor<4xf32> + %3 = tensor.extract %filled[%a] : tensor<4xf32> + vector.print %3 : f32 + } + return +} + +// ----- + +// CHECK-LABEL: func @nesting_op_repetitive_regions( +func.func @nesting_op_repetitive_regions( + %t: tensor<10xf32>, %a: index, %b: index, %c: index, %cst: f32) { + // Cannot bufferize in-place according to normal op dominance rules. + // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]} + %1 = tensor.insert %cst into %t[%a] : tensor<10xf32> + // CHECK: scf.for + scf.for %iv1 = %a to %b step %c { + // CHECK: scf.for + scf.for %iv2 = %a to %b step %c { + // CHECK: scf.for + scf.for %iv3 = %a to %b step %c { + // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true"]} + %2 = tensor.extract_slice %t[0][4][1] : tensor<10xf32> to tensor<4xf32> + %3 = tensor.extract %2[%a] : tensor<4xf32> + vector.print %3 : f32 + } + } + } + return +}