diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.td
@@ -315,7 +315,7 @@
         /*methodName=*/"isNotConflicting",
         /*args=*/(ins "OpOperand *":$uRead,
                       "OpOperand *":$uWrite,
-                      "const AnalysisState &":$state),
+                      "AnalysisState &":$state),
         /*methodBody=*/"",
         /*defaultImplementation=*/[{
           return false;
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/OneShotAnalysis.h
@@ -26,6 +26,11 @@
   /// Specifies whether returning newly allocated memrefs should be allowed.
   /// Otherwise, a pass failure is triggered.
   bool allowReturnAllocs = false;
+
+  /// Specifies whether buffers should be privatized inside of loop bodies if
+  /// privatization can avoid a buffer copy.
+  /// See SCF ForOpInterface::isNotConflicting for more details.
+  bool privatizeBuffersInLoops = false;
 };
 
 /// The BufferizationAliasInfo class maintains a list of buffer aliases and
@@ -270,6 +275,15 @@
     return static_cast<Ty *>(iter->second.get());
   }
 
+  /// Returns the extension of the specified type if it exists already.
+  /// Otherwise, creates the extension and then returns it.
+  template <typename Ty, typename... Args>
+  Ty &getOrCreateExtension(Args &&...args) {
+    if (Ty *ext = getExtension<Ty>())
+      return *ext;
+    return addExtension<Ty>(std::forward<Args>(args)...);
+  }
+
 private:
   /// `aliasInfo` keeps track of aliasing and equivalent values. Only internal
   /// functions and `runOneShotBufferize` may access this object.
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -309,6 +309,9 @@
             /*default=*/"false",
            "Test only: Annotate IR with RaW conflicts. Requires "
            "test-analysis-only.">,
+    Option<"privatizeBuffersInLoops", "privatize-buffers-in-loops", "bool",
+           /*default=*/"false",
+           "Privatize buffers in loops to avoid out-of-place init_args.">,
     Option<"unknownTypeConversion", "unknown-type-conversion", "std::string",
            /*default=*/"\"fully-dynamic-layout-map\"",
            "Controls layout maps for non-inferrable memref types.">,
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -200,6 +200,7 @@
       if (mustInferMemorySpace)
         opt.defaultMemorySpace = None;
       opt.printConflicts = printConflicts;
+      opt.privatizeBuffersInLoops = privatizeBuffersInLoops;
       opt.testAnalysisOnly = testAnalysisOnly;
       opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
--- a/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/TensorCopyInsertion.cpp
@@ -26,81 +26,8 @@
 using namespace mlir;
 using namespace mlir::bufferization;
 
-/// Resolve all operands that are also used inside of repetitive regions of the
-/// same op. Such cases are not fully supported by One-Shot Bufferize.
-///
-/// E.g.:
-/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
-///   "some_use"(%tensor)
-///   ...
-/// }
-///
-/// Is converted to:
-/// %tensor_copy = bufferization.alloc_tensor copy(%tensor)
-/// %r = scf.for ... iter_args(%t = %tensor) -> tensor<?xf32> {
-///   "some_use"(%tensor_copy)
-///   ...
-/// }
-static void
-resolveUsesInRepetitiveRegions(Operation *op,
-                               const BufferizationOptions &options) {
-  IRRewriter rewriter(op->getContext());
-  AnalysisState state(options);
-
-  // Look for repetitive ops (loops).
-  op->walk([&](BufferizableOpInterface bufferizableOp) {
-    // Skip filtered ops.
-    if (!options.isOpAllowed(bufferizableOp.getOperation()))
-      return WalkResult::advance();
-
-    // Find all operands that are also used inside of a repetitive region of
-    // this op.
-    for (OpOperand &opOperand : bufferizableOp->getOpOperands()) {
-      Value operand = opOperand.get();
-      // Skip non-tensor operands.
-      if (!operand.getType().isa<TensorType>())
-        continue;
-      // Skip operands that do not bufferize to memory writes.
-      if (!bufferizableOp.bufferizesToMemoryWrite(opOperand, state))
-        continue;
-
-      // Gather all uses inside repetitive regions.
-      SmallVector<OpOperand *> usesInsideRegion;
-      for (OpOperand &use : operand.getUses()) {
-        Operation *owner = use.getOwner();
-        if (!bufferizableOp->isProperAncestor(owner))
-          continue;
-        for (Region &r : bufferizableOp->getRegions()) {
-          if (r.findAncestorOpInRegion(*owner) &&
-              bufferizableOp.isRepetitiveRegion(r.getRegionNumber())) {
-            usesInsideRegion.push_back(&use);
-            break;
-          }
-        }
-      }
-      // Nothing to do if the operand is not used inside a repetitive region.
-      if (usesInsideRegion.empty())
-        continue;
-
-      // Insert a tensor copy and replace all uses inside of repetitive regions.
-      rewriter.setInsertionPoint(bufferizableOp);
-      auto tensorCopy = rewriter.create<AllocTensorOp>(
-          bufferizableOp->getLoc(), operand.getType().cast<TensorType>(),
-          /*dynamicSizes=*/ValueRange(),
-          /*copy=*/operand, /*memory_space=*/IntegerAttr());
-      for (OpOperand *use : usesInsideRegion)
-        use->set(tensorCopy);
-    }
-
-    return WalkResult::advance();
-  });
-}
-
 LogicalResult mlir::bufferization::insertTensorCopies(
     Operation *op, const OneShotBufferizationOptions &options) {
-  // Preprocessing: Resolve currently unsupported bufferization cases.
-  resolveUsesInRepetitiveRegions(op, options);
-
   OneShotAnalysisState state(op, options);
   // Run normal One-Shot Bufferize analysis or One-Shot Module Bufferize
   // analysis depending on whether function boundary bufferization is enabled or
diff --git a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
--- a/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -25,8 +25,159 @@
 
 namespace mlir {
 namespace scf {
+
 namespace {
+/// Attribute marker to specify op operands that are privatized.
+static constexpr StringLiteral kPrivatizedOperandsAttrName =
+    "__privatized_operands_attr__";
+
+/// Return the number of getParentOp() hops from `op` up to ancestor `parent`.
+static unsigned getDistanceToParent(Operation *op, Operation *parent) {
+  unsigned distance = 0;
+  while (op != parent) {
+    op = op->getParentOp();
+    assert(op && "expected parent to be an ancestor of op");
+    ++distance;
+  }
+  return distance;
+}
+
+/// Mark the OpOperand as privatized within the given scope. Example:
+/// tensor.insert %f into %t[%c0]
+///     { __privatized_operands_attr__ = [[], [3], []]}
+/// The second OpOperand (%t) is privatized within the scope of the third
+/// parent op of the tensor.insert op.
+static void setPrivatizedOpOperand(OpOperand &opOperand, Operation *scope) {
+  Operation *op = opOperand.getOwner();
+  OpBuilder builder(op);
+  auto attr = op->getAttr(kPrivatizedOperandsAttrName);
+  SmallVector<Attribute> operandsVec;
+  if (attr) {
+    // Add to the existing attribute.
+    for (Attribute a : attr.cast<ArrayAttr>())
+      operandsVec.push_back(a);
+  } else {
+    // Create a new attribute.
+    operandsVec.append(op->getNumOperands(), builder.getArrayAttr({}));
+  }
 
+  SmallVector<int64_t> scopes = llvm::to_vector(llvm::map_range(
+      operandsVec[opOperand.getOperandNumber()].cast<ArrayAttr>(),
+      [](Attribute a) { return a.cast<IntegerAttr>().getInt(); }));
+  scopes.push_back(getDistanceToParent(op, scope));
+  operandsVec[opOperand.getOperandNumber()] = builder.getI64ArrayAttr(scopes);
+  op->setAttr(kPrivatizedOperandsAttrName, builder.getArrayAttr(operandsVec));
+}
+} // namespace
+
+class SCFAnalysisState : public OneShotAnalysisState::Extension {
+public:
+  SCFAnalysisState(OneShotAnalysisState &state)
+      : OneShotAnalysisState::Extension(state) {}
+
+  /// Mark a value as privatized within the given scope.
+  void privatizeValue(Value value, Operation *scope) {
+#ifndef NDEBUG
+    Operation *definingOp = getOwnerOfValue(value);
+    for (Region &r : scope->getRegions())
+      assert(!r.findAncestorOpInRegion(*definingOp) &&
+             "cannot privatize value that is defined within the scope");
+#endif // NDEBUG
+    tentativelyPrivatizedValues[scope].insert(value);
+  }
+
+  /// Materialize all value privatizations. E.g.:
+  ///
+  /// %r = scf.for ... iter_args(%0 = %t) -> tensor<?xf32> {
+  ///   %read = tensor.extract %t[%idx] { privatized = [[1], []] }
+  ///   ...
+  /// }
+  ///
+  /// Is rewritten to:
+  ///
+  /// %t_copy = bufferization.alloc_tensor() copy(%t) : tensor<?xf32>
+  /// %r = scf.for ... iter_args(%0 = %t) -> tensor<?xf32> {
+  ///   %read = tensor.extract %t_copy[%idx]
+  ///   ...
+  /// }
+  ///
+  /// Note: privatized = [[1], []] means that the 0-th OpOperand is privatized
+  /// within all regions of the parent of the tensor.extract op. ([2] would
+  /// refer to the parent's parent etc.)
+  void materializePrivatizations(RewriterBase &rewriter,
+                                 Operation *scope) const {
+    OpBuilder::InsertionGuard g(rewriter);
+    rewriter.setInsertionPoint(scope);
+
+    // Return if no values are privatized within the given scope.
+    auto it = privatizedValues.find(scope);
+    if (it == privatizedValues.end())
+      return;
+
+    for (Value value : it->second) {
+      auto tensorCopy = rewriter.create<AllocTensorOp>(
+          scope->getLoc(), value.getType().cast<TensorType>(),
+          /*dynamicSizes=*/ValueRange(),
+          /*copy=*/value, /*memory_space=*/IntegerAttr());
+
+      // Update all uses within scope with tensorCopy.
+      SmallVector<OpOperand *> uses = llvm::to_vector(llvm::map_range(
+          value.getUses(), [](OpOperand &use) { return &use; }));
+      for (OpOperand *use : uses) {
+        if (scope->isProperAncestor(use->getOwner())) {
+          rewriter.updateRootInPlace(use->getOwner(),
+                                     [&]() { use->set(tensorCopy); });
+        }
+      }
+    }
+  }
+
+protected:
+  void notifyBufferizeInPlace(OpOperand &operand) override {
+    // Commit all tentative value privatizations.
+    for (auto &it : tentativelyPrivatizedValues) {
+      Operation *scope = it.first;
+      for (Value v : it.second) {
+        if (!privatizedValues[scope].insert(v).second)
+          // Continue if the value is already in the set.
+          continue;
+
+        // Add attributes for debugging and test cases.
+        if (getAnalysisState().getOptions().testAnalysisOnly)
+          for (OpOperand &use : v.getUses())
+            if (scope->isProperAncestor(use.getOwner()))
+              setPrivatizedOpOperand(use, scope);
+      }
+    }
+    tentativelyPrivatizedValues.clear();
+  }
+
+  void notifyBufferizeOutOfPlace(OpOperand &operand) override {
+    // The tentative value privatizations (if any) could not prevent
+    // out-of-place bufferizations, so we can drop them.
+    tentativelyPrivatizedValues.clear();
+  }
+
+private:
+  /// Value privatization is a way to define custom out-of-place bufferization
+  /// rules in One-Shot Analysis via BufferizableOpInterface::isNotConflicting.
+  /// A value privatization is an (Operation *, Value) tuple, where the
+  /// operation signifies the scope in which the SSA value should be privatized.
+  /// We maintain a set of values because multiple SSA values can be privatized
+  /// in a certain scope.
+  using PrivatizationMapping = DenseMap<Operation *, DenseSet<Value>>;
+
+  /// All privatized values and their scope.
+  PrivatizationMapping privatizedValues;
+
+  /// Tentatively privatized values are value privatizations that are added
+  /// during the analysis of an OpOperand. They are either committed or dropped
+  /// at the end of the analysis, depending on whether the privatization proved
+  /// useful (in-place bufferization) or useless (out-of-place bufferization).
+  PrivatizationMapping tentativelyPrivatizedValues;
+};
+
+namespace {
 /// Helper function for loop bufferization. Cast the given buffer to the given
 /// memref type.
 static Value castBuffer(OpBuilder &b, Value buffer, Type type) {
@@ -527,6 +678,11 @@
     if (failed(bufferizableOp.resolveTensorOpOperandConflicts(rewriter, state)))
       return failure();
 
+    if (isa<OneShotAnalysisState>(state))
+      if (auto *scfState = static_cast<const OneShotAnalysisState &>(state)
+                               .getExtension<SCFAnalysisState>())
+        scfState->materializePrivatizations(rewriter, op);
+
     if (!state.getOptions().enforceAliasingInvariants)
       return success();
 
@@ -684,6 +840,93 @@
 
     return success();
   }
+  bool isNotConflicting(Operation *op, OpOperand *uRead,
+                        OpOperand *uConflictingWrite,
+                        AnalysisState &state) const {
+    auto &oneShotState = static_cast<OneShotAnalysisState &>(state);
+    if (!oneShotState.getOptions().privatizeBuffersInLoops)
+      return false;
+
+    // Try to privatize values inside loop bodies to avoid out-of-place
+    // bufferizations of init_args. E.g.:
+    //
+    // %t = ...
+    // scf.for ... iter_args(%0 = %t) -> tensor<?xf32> {
+    //   "read"(%t)
+    //   ...
+    //   %1 = "read_and_write"(%0)
+    //   scf.yield %1
+    // }
+    //
+    // In the above example, the init_arg operand of the scf.for loop has to
+    // bufferize out-of-place:
+    // * conflicting write: init_arg operand of scf.for
+    // * read: "read"(%t)
+    //
+    // Intuitively, the init_arg cannot bufferize in-place because buffer(%t) is
+    // read within the loop body. Therefore, it must not be modified by the
+    // scf.for operation.
+    //
+    // Instead of bufferizing the init_arg out-of-place, all uses of %t can be
+    // privatized inside of the loop body:
+    //
+    // %t = ...
+    // %t_copy = bufferization.alloc_tensor() copy(%t) : tensor<?xf32>
+    // scf.for ... iter_args(%0 = %t) -> tensor<?xf32> {
+    //   "read"(%t_copy)
+    //   ...
+    //   %1 = "read_and_write"(%0)
+    //   scf.yield %1
+    // }
+    //
+    // Note that in the absence of other conflicts, all loop iterations share
+    // the same copy %t_copy. In case of a conflict within the loop, every loop
+    // iteration gets its own copy of %t via the regular conflict resolution
+    // mechanism. E.g.:
+    //
+    // %t = ...
+    // scf.for ... iter_args(%0 = %t) -> tensor<?xf32> {
+    //   %2 = "read_and_write"(%t)
+    //   ...
+    //   %1 = "read_and_write"(%0)
+    //   scf.yield %1
+    // }
+    //
+    // Two tensor copies are inserted in the above example:
+    //
+    // %t = ...
+    // %t_copy = bufferization.alloc_tensor() copy(%t) : tensor<?xf32>
+    // scf.for ... iter_args(%0 = %t) -> tensor<?xf32> {
+    //   %t_copy2 = bufferization.alloc_tensor() copy(%t_copy) : tensor<?xf32>
+    //   %2 = "read_and_write"(%t_copy2)
+    //   ...
+    //   %1 = "read_and_write"(%0)
+    //   scf.yield %1
+    // }
+    auto &scfState = oneShotState.getOrCreateExtension<SCFAnalysisState>();
+
+    // Check if the conflicting write is an init_arg.
+    auto forOp = cast<scf::ForOp>(op);
+    if (llvm::find(forOp.getInitArgs(), uConflictingWrite->get()) ==
+        forOp.getInitArgs().end())
+      return false;
+
+    // Check if the read is inside of the scf.for op.
+    if (!forOp.getLoopBody().findAncestorOpInRegion(*uRead->getOwner()))
+      return false;
+
+    // If the read value is defined inside of the loop body, there must be some
+    // other op in the loop body that puts it in the same alias set as the
+    // init_arg. That value will be privatized, so we can ignore this conflict.
+    if (forOp.getLoopBody().findAncestorOpInRegion(
+            *getOwnerOfValue(uRead->get())))
+      return true;
+
+    // Instead of bufferizing the init_arg operand out-of-place, all uses of
+    // the same value inside of the loop body can be privatized.
+    scfState.privatizeValue(uRead->get(), op);
+    return true;
+  }
 };
 
 /// Bufferization of scf.while. Replace with a new scf.while that operates on
@@ -1092,6 +1335,20 @@
     return true;
   }
 
+  LogicalResult resolveConflicts(Operation *op, RewriterBase &rewriter,
+                                 const AnalysisState &state) const {
+    auto bufferizableOp = cast<BufferizableOpInterface>(op);
+    if (failed(bufferizableOp.resolveTensorOpOperandConflicts(rewriter, state)))
+      return failure();
+
+    if (isa<OneShotAnalysisState>(state))
+      if (auto *scfState = static_cast<const OneShotAnalysisState &>(state)
+                               .getExtension<SCFAnalysisState>())
+        scfState->materializePrivatizations(rewriter, op);
+
+    return success();
+  }
+
   LogicalResult bufferize(Operation *op, RewriterBase &rewriter,
                           const BufferizationOptions &options) const {
     OpBuilder::InsertionGuard guard(rewriter);
@@ -1170,6 +1427,42 @@
       return false;
     return true;
   }
+
+  bool isNotConflicting(Operation *op, OpOperand *uRead,
+                        OpOperand *uConflictingWrite,
+                        AnalysisState &state) const {
+    auto &oneShotState = static_cast<OneShotAnalysisState &>(state);
+    if (!oneShotState.getOptions().privatizeBuffersInLoops)
+      return false;
+
+    // Try to privatize values inside loop bodies to avoid out-of-place
+    // bufferizations of shared output operands. See ForOpInterface for a
+    // detailed explanation.
+
+    auto &scfState = oneShotState.getOrCreateExtension<SCFAnalysisState>();
+
+    // Check if the conflicting write is a shared output.
+    auto foreachThreadOp = cast<ForeachThreadOp>(op);
+    if (llvm::find(foreachThreadOp.getOutputs(), uConflictingWrite->get()) ==
+        foreachThreadOp.getOutputs().end())
+      return false;
+
+    // Check if the read is inside of the foreach_thread op.
+    if (!foreachThreadOp.getBody()->findAncestorOpInBlock(*uRead->getOwner()))
+      return false;
+
+    // If the read value is defined inside of the loop body, some other op in
+    // the loop body must put it in the same alias set as the shared output.
+    // That value will be privatized, so we can ignore this conflict.
+    if (foreachThreadOp.getBody()->findAncestorOpInBlock(
+            *getOwnerOfValue(uRead->get())))
+      return true;
+
+    // Instead of bufferizing the shared output operand out-of-place, all uses
+    // of the same value inside of the loop body can be privatized.
+    scfState.privatizeValue(uRead->get(), op);
+    return true;
+  }
 };
 
 /// Nothing to do for PerformConcurrentlyOp.
diff --git a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
--- a/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
+++ b/mlir/lib/Dialect/Tensor/Transforms/BufferizableOpInterfaceImpl.cpp
@@ -759,7 +759,7 @@
 
   bool isNotConflicting(Operation *op, OpOperand *uRead,
                         OpOperand *uConflictingWrite,
-                        const AnalysisState &state) const {
+                        AnalysisState &state) const {
     return isNotConflictingInsertSliceLikeOp<tensor::InsertSliceOp>(
         op, uRead, uConflictingWrite, state);
   }
@@ -1032,7 +1032,7 @@
 
   bool isNotConflicting(Operation *op, OpOperand *uRead,
                         OpOperand *uConflictingWrite,
-                        const AnalysisState &state) const {
+                        AnalysisState &state) const {
     return isNotConflictingInsertSliceLikeOp<tensor::ParallelInsertSliceOp>(
         op, uRead, uConflictingWrite, state);
   }
diff --git a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/one-shot-bufferize.mlir
@@ -161,8 +161,7 @@
   %c16 = arith.constant 16 : index
 
   // Hoisted alloc.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<128x192xf32>
-  // CHECK: memref.copy %[[C]], %[[ALLOC]]
+  // CHECK: %[[ALLOC:.*]] = memref.alloc() {alignment = 128 : i64} : memref<8x16xf32>
 
   // CHECK: scf.for %[[I:.*]] =
   %0 = scf.for %arg3 = %c0 to %c128 step %c8 iter_args(%arg4 = %C) -> (tensor<128x192xf32>) {
@@ -174,14 +173,11 @@
       %3 = tensor.extract_slice %B[0, %arg5] [256, 16] [1, 1] :
         tensor<256x192xf32> to tensor<256x16xf32>
 
-      // C was already replaced with a copy by preprocessing, so no copy is
-      // needed here.
-      // CHECK: %[[C_SLICE:.*]] = memref.subview %[[ALLOC]]
+      // Bufferizes out-of-place and is hoisted.
       %4 = tensor.extract_slice %C[%arg3, %arg5] [8, 16] [1, 1] :
         tensor<128x192xf32> to tensor<8x16xf32>
 
-      // linalg.fill is inplace.
-      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[C_SLICE]]
+      // CHECK: linalg.fill ins(%{{.*}} : f32) outs(%[[ALLOC]]
       %5 = linalg.fill ins(%cst : f32) outs(%4 : tensor<8x16xf32>) -> tensor<8x16xf32>
 
       // CHECK: scf.for %[[K:.*]] =
@@ -192,7 +188,7 @@
           tensor<256x16xf32> to tensor<32x16xf32>
 
         // linalg.matmul is inplace as well as the enclosing scf.for.
-        // CHECK: linalg.matmul ins({{.*}} outs(%[[C_SLICE]]
+        // CHECK: linalg.matmul ins({{.*}} outs(%[[ALLOC]]
         %10 = linalg.matmul ins(%8, %9 : tensor<8x32xf32>, tensor<32x16xf32>)
                            outs(%arg8 : tensor<8x16xf32>)
           -> tensor<8x16xf32>
@@ -202,8 +198,8 @@
       // insert_slice is inplace but its source comes from an equivalent buffer
       // that is not in place. So we must insert a copy of the small buffer into
       // the bigger buffer.
-      // CHECK: %[[T:.*]] = memref.subview %[[C]][%[[I]], %[[J]]] [8, 16] [1, 1]
-      // CHECK: memref.copy %[[C_SLICE]], %[[T]]
+      // CHECK: %[[C_SLICE:.*]] = memref.subview %[[C]]
+      // CHECK: memref.copy %[[ALLOC]], %[[C_SLICE]]
       %7 = tensor.insert_slice %6 into %arg6[%arg3, %arg5] [8, 16] [1, 1] :
         tensor<8x16xf32> into tensor<128x192xf32>
 
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-privatization-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization-analysis.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization-analysis.mlir
@@ -0,0 +1,223 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries test-analysis-only" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries privatize-buffers-in-loops test-analysis-only" -split-input-file | FileCheck %s --check-prefix=CHECK-PRIVATIZATION
+
+// CHECK-LABEL: func @privatize_value(
+// CHECK-PRIVATIZATION-LABEL: func @privatize_value(
+func.func @privatize_value(%sz: index, %src: tensor<?xf32>) -> tensor<?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // All uses of %src inside the loop body are privatized.
+
+  // CHECK: scf.for {{.*}} {
+  // CHECK-PRIVATIZATION: scf.for {{.*}} {
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor<?xf32> {
+    %pos = "dummy_op"() : () -> (index)
+    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"], __privatized_operands_attr__ = [{{\[}}1], []]}
+    %read = tensor.extract %src[%pos] : tensor<?xf32>
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    %s = tensor.insert %read into %t[%iv] : tensor<?xf32>
+    // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]}
+    // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]}
+    scf.yield %s : tensor<?xf32>
+  }
+
+  // Without privatization: scf.for init_arg bufferizes out-of-place.
+  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
+  // With privatization: scf.for init_arg bufferizes in-place.
+  // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
+
+  return %r : tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @privatize_value_via_alias(
+// CHECK-PRIVATIZATION-LABEL: func @privatize_value_via_alias(
+func.func @privatize_value_via_alias(%sz: index, %src: tensor<?xf32>)
+    -> tensor<?xf32>
+{
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // All uses of %src inside the loop body are privatized.
+
+  // CHECK: scf.for {{.*}} {
+  // CHECK-PRIVATIZATION: scf.for {{.*}} {
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor<?xf32> {
+    // Create an alias of %src.
+    %pos2 = "dummy_op"() : () -> (index)
+    %sz2 = "dummy_op"() : () -> (index)
+    // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none", "none"]}
+    // CHECK-PRIVATIZATION: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none", "none"], __privatized_operands_attr__ = [{{\[}}1], [], []]}
+    %alias = tensor.extract_slice %src[%pos2][%sz2][1]
+        : tensor<?xf32> to tensor<?xf32>
+
+    %pos = "dummy_op"() : () -> (index)
+    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %read = tensor.extract %alias[%pos] : tensor<?xf32>
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    %s = tensor.insert %read into %t[%iv] : tensor<?xf32>
+    // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]}
+    // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]}
+    scf.yield %s : tensor<?xf32>
+  }
+
+  // Without privatization: scf.for init_arg bufferizes out-of-place.
+  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
+  // With privatization: scf.for init_arg bufferizes in-place.
+  // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
+
+  return %r : tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @privatize_value_of_alias(
+// CHECK-PRIVATIZATION-LABEL: func @privatize_value_of_alias(
+func.func @privatize_value_of_alias(%sz: index, %src: tensor<?xf32>)
+    -> tensor<?xf32>
+{
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // Create an alias of %src.
+  %pos2 = "dummy_op"() : () -> (index)
+  %sz2 = "dummy_op"() : () -> (index)
+
+  // Without privatization: tensor.extract_slice bufferizes out-of-place.
+  // CHECK: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["false", "none", "none"]}
+  // With privatization: tensor.extract_slice bufferizes in-place.
+  // CHECK-PRIVATIZATION: tensor.extract_slice {{.*}} {__inplace_operands_attr__ = ["true", "none", "none"]}
+  %alias = tensor.extract_slice %src[%pos2][%sz2][1]
+      : tensor<?xf32> to tensor<?xf32>
+
+  // All uses of %alias (and its aliases) inside the loop body are privatized.
+
+  // CHECK: scf.for {{.*}} {
+  // CHECK-PRIVATIZATION: scf.for {{.*}} {
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %alias) -> tensor<?xf32> {
+    %pos = "dummy_op"() : () -> (index)
+    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"], __privatized_operands_attr__ = [{{\[}}1], []]}
+    %read = tensor.extract %src[%pos] : tensor<?xf32>
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    %s = tensor.insert %read into %t[%iv] : tensor<?xf32>
+    // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]}
+    // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]}
+    scf.yield %s : tensor<?xf32>
+  }
+  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
+  // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
+
+  return %r : tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @raw_conflict_on_privatized_value(
+// CHECK-PRIVATIZATION-LABEL: func @raw_conflict_on_privatized_value(
+func.func @raw_conflict_on_privatized_value(%sz: index, %src: tensor<?xf32>)
+    -> tensor<?xf32>
+{
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // All uses of %src inside the loop body are privatized.
+
+  // CHECK: scf.for {{.*}} {
+  // CHECK-PRIVATIZATION: scf.for {{.*}} {
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor<?xf32> {
+    %pos = "dummy_op"() : () -> (index)
+    %pos2 = "dummy_op"() : () -> (index)
+    %f = "dummy_op"() : () -> (f32)
+
+    // Through privatization, all uses of %src inside of loop are replaced with
+    // a copy that is created just before entering the loop. This is not good
+    // enough yet, because that buffer copy is written here. Each loop iteration
+    // gets its own copy.
+
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"]}
+    // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "false", "none"], __privatized_operands_attr__ = [{{\[}}], [1], []]}
+    %write = tensor.insert %f into %src[%pos2] : tensor<?xf32>
+    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %read = tensor.extract %write[%pos] : tensor<?xf32>
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    %s = tensor.insert %read into %t[%iv] : tensor<?xf32>
+    // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]}
+    // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]}
+    scf.yield %s : tensor<?xf32>
+  }
+
+  // Without privatization: scf.for init_arg bufferizes out-of-place.
+  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
+  // With privatization: scf.for init_arg bufferizes in-place.
+  // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
+
+  return %r : tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @nested_loops(
+// CHECK-PRIVATIZATION-LABEL: func @nested_loops(
+func.func @nested_loops(%sz: index, %src: tensor<?xf32>)
+    -> tensor<?xf32>
+{
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // All uses of %src inside the loop body are privatized.
+
+  // CHECK: scf.for {{.*}} {
+  // CHECK-PRIVATIZATION: scf.for {{.*}} {
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor<?xf32> {
+
+    // The analysis attempts a second privatization of %src within the scope of
+    // this loop, but it cannot prevent out-of-place bufferization of the
+    // init_arg, so this privatization is aborted.
+    // CHECK: scf.for {{.*}} {
+    // CHECK-PRIVATIZATION: scf.for {{.*}} {
+    %r2 = scf.for %iv2 = %c0 to %sz step %c1 iter_args(%t2 = %src) -> tensor<?xf32> {
+      %pos2 = "dummy_op"() : () -> (index)
+      // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+      // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"], __privatized_operands_attr__ = [{{\[}}2], []]}
+      %read2 = tensor.extract %src[%pos2] : tensor<?xf32>
+      // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+      // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+      %s = tensor.insert %read2 into %t2[%iv2] : tensor<?xf32>
+      // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]}
+      // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]}
+      scf.yield %s : tensor<?xf32>
+    }
+
+    // There is no benefit of privatization (for the second loop) here.
+    // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
+    // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "false"], __privatized_operands_attr__ = [{{\[}}], [], [], [1]]}
+
+    %pos = "dummy_op"() : () -> (index)
+    // CHECK: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.extract {{.*}} {__inplace_operands_attr__ = ["true", "none"]}
+    %read = tensor.extract %r2[%pos] : tensor<?xf32>
+    // CHECK: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    // CHECK-PRIVATIZATION: tensor.insert {{.*}} {__inplace_operands_attr__ = ["none", "true", "none"]}
+    %s = tensor.insert %read into %t[%iv] : tensor<?xf32>
+    // CHECK: scf.yield {__inplace_operands_attr__ = ["true"]}
+    // CHECK-PRIVATIZATION: scf.yield {__inplace_operands_attr__ = ["true"]}
+    scf.yield %s : tensor<?xf32>
+  }
+
+  // Without privatization: scf.for init_arg bufferizes out-of-place.
+  // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]}
+  // With privatization: scf.for init_arg bufferizes in-place.
+  // CHECK-PRIVATIZATION: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]}
+
+  return %r : tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-privatization.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize-privatization.mlir
@@ -0,0 +1,44 @@
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries privatize-buffers-in-loops" -split-input-file | FileCheck %s --check-prefix=CHECK-PRIVATIZATION
+
+// CHECK-LABEL: func @privatize_value(
+//  CHECK-SAME:     %[[sz:.*]]: index, %[[src:.*]]: memref
+// CHECK-PRIVATIZATION-LABEL: func @privatize_value(
+//  CHECK-PRIVATIZATION-SAME:     %[[sz:.*]]: index, %[[src:.*]]: memref
+func.func @privatize_value(%sz: index, %src: tensor<?xf32>) -> tensor<?xf32> {
+  %c0 = arith.constant 0 : index
+  %c1 = arith.constant 1 : index
+
+  // %src is the scf.for init_arg and is also read inside the loop body, while
+  // the loop writes to the iter_arg %t. A buffer copy is needed somewhere.
+  //
+  // Without privatization: scf.for init_arg bufferizes out-of-place. The loop
+  // body reads %src directly and writes to the copy.
+  // With privatization: scf.for init_arg bufferizes in-place. All uses of %src
+  // in the loop body are replaced with a buffer copy (created before the loop).
+  // I.e., the scope of privatization is the scf.for loop.
+
+  // CHECK: %[[src_copy:.*]] = memref.alloc
+  // CHECK: memref.copy %[[src]], %[[src_copy]]
+  // CHECK-PRIVATIZATION: %[[src_copy:.*]] = memref.alloc
+  // CHECK-PRIVATIZATION: memref.copy %[[src]], %[[src_copy]]
+
+  // CHECK: scf.for {{.*}} {
+  %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %src) -> tensor<?xf32> {
+    %pos = "dummy_op"() : () -> (index)
+    // CHECK: %[[loaded:.*]] = memref.load %[[src]]
+    // CHECK-PRIVATIZATION: %[[loaded:.*]] = memref.load %[[src_copy]]
+    %read = tensor.extract %src[%pos] : tensor<?xf32>
+    // CHECK: memref.store %[[loaded]], %[[src_copy]]
+    // CHECK-PRIVATIZATION: memref.store %[[loaded]], %[[src]]
+    %s = tensor.insert %read into %t[%iv] : tensor<?xf32>
+    // CHECK-NOT: scf.yield
+    scf.yield %s : tensor<?xf32>
+  }
+
+  // CHECK-NOT: memref.dealloc
+  // CHECK: return %[[src_copy]]
+  // CHECK-PRIVATIZATION: memref.dealloc %[[src_copy]]
+  // CHECK-PRIVATIZATION: return %[[src]]
+  return %r : tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -1,5 +1,8 @@
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -drop-equivalent-buffer-results -buffer-deallocation -split-input-file | FileCheck %s
 
+// Test with loop privatization.
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries privatize-buffers-in-loops" -drop-equivalent-buffer-results -buffer-deallocation -split-input-file | FileCheck %s --check-prefix=CHECK-PRIVATIZATION
+
 // Run fuzzer with different seeds.
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null
@@ -256,17 +259,33 @@
 //  CHECK-SAME:     %[[t:.*]]: memref<?xf32
 //       CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}})
 //       CHECK:   memref.copy %[[t]], %[[alloc]]
-//       CHECK:   %[[cloned:.*]] = bufferization.clone %[[t]]
+//       CHECK:   %[[cloned:.*]] = bufferization.clone %[[alloc]]
+//       CHECK:   memref.dealloc %[[alloc]]
 //       CHECK:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[cloned]])
 //   CHECK-DAG:     memref.dealloc %[[iter]]
 //   CHECK-DAG:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
-//       CHECK:     memref.copy %[[alloc]], %[[alloc2]]
-//       CHECK:     %[[alloc2_casted:.*]] = memref.cast %[[alloc2]]
-//       CHECK:     %[[cloned2:.*]] = bufferization.clone %[[alloc2_casted]]
+//       CHECK:     memref.copy %[[t]], %[[alloc2]]
+//       CHECK:     %[[cloned2:.*]] = bufferization.clone %[[alloc2]]
 //       CHECK:     memref.dealloc %[[alloc2]]
 //       CHECK:     scf.yield %[[cloned2]]
-//       CHECK:   memref.dealloc %[[alloc]]
 //       CHECK:   return %[[for]]
+
+// CHECK-PRIVATIZATION-LABEL: func @scf_for_yield_non_equivalent(
+//  CHECK-PRIVATIZATION-SAME:     %[[t:.*]]: memref<?xf32
+//       CHECK-PRIVATIZATION:   %[[alloc:.*]] = memref.alloc(%{{.*}})
+//       CHECK-PRIVATIZATION:   memref.copy %[[t]], %[[alloc]]
+//       CHECK-PRIVATIZATION:   %[[cloned:.*]] = bufferization.clone %[[t]]
+//       CHECK-PRIVATIZATION:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[cloned]])
+//   CHECK-PRIVATIZATION-DAG:     memref.dealloc %[[iter]]
+//   CHECK-PRIVATIZATION-DAG:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
+//       CHECK-PRIVATIZATION:     memref.copy %[[alloc]], %[[alloc2]]
+//       CHECK-PRIVATIZATION:     %[[alloc2_casted:.*]] = memref.cast %[[alloc2]]
+//       CHECK-PRIVATIZATION:     %[[cloned2:.*]] = bufferization.clone %[[alloc2_casted]]
+//       CHECK-PRIVATIZATION:     memref.dealloc %[[alloc2]]
+//       CHECK-PRIVATIZATION:     scf.yield %[[cloned2]]
+//       CHECK-PRIVATIZATION:   memref.dealloc %[[alloc]]
+//       CHECK-PRIVATIZATION:   return %[[for]]
+
 func.func @scf_for_yield_non_equivalent(
     %t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
   %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor<?xf32> {
@@ -649,20 +668,30 @@
 
 // CHECK-LABEL: func @scf_foreach_private_var(
 //  CHECK-SAME:     %[[t:.*]]: memref<10xf32
+// CHECK-PRIVATIZATION-LABEL: func @scf_foreach_private_var(
+//  CHECK-PRIVATIZATION-SAME:     %[[t:.*]]: memref<10xf32
 func.func @scf_foreach_private_var(%t: tensor<10xf32>) -> f32 {
   %c2 = arith.constant 2 : index
   %c5 = arith.constant 5 : index
 
-  // A copy is inserted for the uses of %t in the loop.
+  // Without privatization: The shared_outs operand bufferizes out-of-place.
   // CHECK: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32>
   // CHECK: memref.copy %[[t]], %[[t_copy]]
 
+  // With privatization: The shared_outs operand bufferizes in-place.
+  // CHECK-PRIVATIZATION: %[[t_copy:.*]] = memref.alloc() {{.*}} : memref<10xf32>
+  // CHECK-PRIVATIZATION: memref.copy %[[t]], %[[t_copy]]
+
   // CHECK: scf.foreach_thread (%{{.*}}) in (%{{.*}}) {
+  // CHECK-PRIVATIZATION: scf.foreach_thread (%{{.*}}) in (%{{.*}}) {
 
   // Load from the copy and store into the shared output.
-  // CHECK:   %[[subview:.*]] = memref.subview %[[t]]
-  // CHECK:   memref.load %[[t_copy]]
+  // CHECK:   %[[subview:.*]] = memref.subview %[[t_copy]]
+  // CHECK:   memref.load %[[t]]
   // CHECK:   memref.store %{{.*}}, %[[subview]]
+  // CHECK-PRIVATIZATION:   %[[subview:.*]] = memref.subview %[[t]]
+  // CHECK-PRIVATIZATION:   memref.load %[[t_copy]]
+  // CHECK-PRIVATIZATION:   memref.store %{{.*}}, %[[subview]]
   %0 = scf.foreach_thread (%tid) in (%c2) shared_outs(%o = %t) -> tensor<10xf32> {
     %offset = arith.muli %c5, %tid : index
     %slice = tensor.extract_slice %o[%offset] [5] [1]
@@ -681,9 +710,9 @@
 
 // -----
 
-// CHECK-LABEL: func.func @scf_foreach_privatized_but_not_copied(
+// CHECK-LABEL: func.func @scf_foreach_inplace(
 //  CHECK-SAME:     %[[t0:.*]]: memref<10xf32, {{.*}}>, %[[t1:.*]]: memref<10xf32
-func.func @scf_foreach_privatized_but_not_copied(
+func.func @scf_foreach_inplace(
     %t0: tensor<10xf32>, %t1: tensor<10xf32>) -> f32 {
   %c2 = arith.constant 2 : index
   %c5 = arith.constant 5 : index
@@ -696,7 +725,6 @@
     %slice = tensor.extract_slice %o[%offset] [5] [1]
         : tensor<10xf32> to tensor<5xf32>
 
-    // %t1 is never written in here, so no copy is needed
     // CHECK: memref.load %[[t1]]
     %r2 = tensor.extract %t1[%tid] : tensor<10xf32>
     %i = tensor.insert %r2 into %slice[%c2] : tensor<5xf32>
@@ -801,20 +829,25 @@
     tensor.yield %cst : f32
   } : tensor<?xf32>
 
-  // A copy is inserted because %t is used inside the loop.
+  // A copy is inserted because %0 is read inside the loop. The iter_args
+  // operand bufferizes out-of-place.
+
   // CHECK: %[[generate_copy:.*]] = memref.alloc
   // CHECK: memref.copy %[[generate]], %[[generate_copy]]
   // CHECK: scf.for
   %r = scf.for %iv = %c0 to %sz step %c1 iter_args(%t = %0) -> tensor<?xf32> {
     %iv_sub = arith.subi %iv, %c1 : index
-    // CHECK: memref.subview %[[generate_copy]]
+    // CHECK: memref.subview %[[generate]]
     %ll = tensor.extract_slice %0[%iv_sub][%sz][1] : tensor<?xf32> to tensor<?xf32>
     %l = tensor.extract %ll[%c0] : tensor<?xf32>
     %double = arith.mulf %cst, %l : f32
-    // CHECK: memref.store %{{.*}}, %[[generate]]
+    // CHECK: memref.store %{{.*}}, %[[generate_copy]]
     %s = tensor.insert %double into %t[%iv] : tensor<?xf32>
     scf.yield %s : tensor<?xf32>
   }
+
+  // CHECK: memref.dealloc %[[generate]]
+  // CHECK: return %[[generate_copy]]
   return %r : tensor<?xf32>
 }