diff --git a/mlir/docs/Bufferization.md b/mlir/docs/Bufferization.md
--- a/mlir/docs/Bufferization.md
+++ b/mlir/docs/Bufferization.md
@@ -270,9 +270,11 @@
 in an efficient way. For this reason, One-Shot Bufferize must be explicitly
 configured with `allow-return-allocs` to support such IR.
 
-When running with `allow-return-allocs`, One-Shot Bufferize resolves yields of
-newly allocated buffers with copies. E.g., the `scf.if` example above would
-bufferize to IR similar to the following:
+When running with `allow-return-allocs`, One-Shot Bufferize may introduce
+allocations that cannot be deallocated by One-Shot Bufferize yet. For that
+reason, `-buffer-deallocation` must be run after One-Shot Bufferize. This buffer
+deallocation pass resolves yields of newly allocated buffers with copies. E.g.,
+the `scf.if` example above would bufferize to IR similar to the following:
 
 ```mlir
 %0 = scf.if %c -> (memref<?xf32>) {
@@ -291,15 +293,10 @@
 must be deallocated at some point after the `scf.if` (unless the `%0` is
 returned/yielded from its block).
 
-One-Shot Bufferize internally utilizes functionality from the
-[Buffer Deallocation](https://mlir.llvm.org/docs/BufferDeallocationInternals/)
-pass to deallocate yielded buffers. Therefore, ops with regions must implement
-the `RegionBranchOpInterface` when `allow-return-allocs`.
-
-Note: Buffer allocations that are returned from a function are not deallocated.
-It is the caller's responsibility to deallocate the buffer. In the future, this
-could be automated with allocation hoisting (across function boundaries) or
-reference counting.
+Note: Buffer allocations that are returned from a function are not deallocated,
+not even with `-buffer-deallocation`. It is the caller's responsibility to
+deallocate the buffer. In the future, this could be automated with allocation
+hoisting (across function boundaries) or reference counting.
 
 One-Shot Bufferize can be configured to leak all memory and not generate any
 buffer deallocations with `create-deallocs=0`. This can be useful for
diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
--- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
+++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h
@@ -582,15 +582,6 @@
 getMemRefTypeWithStaticIdentityLayout(TensorType tensorType,
                                       Attribute memorySpace = {});
 
-/// Create alloc/dealloc ops as specified in the bufferization options. If
-/// `onlyLeakingAlloc`, only those buffer allocations are processed for which no
-/// buffer deallocation can be created. `changed` is set to `true` if the IR was
-/// modified.
-LogicalResult createAllocDeallocOps(Operation *op,
-                                    const BufferizationOptions &options,
-                                    bool onlyLeakingAllocs = false,
-                                    bool *changed = nullptr);
-
 } // namespace bufferization
 } // namespace mlir
 
diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -179,9 +179,11 @@
     allocated buffers from a block can lead to bad performance because
     additional buffer copies are often needed to make sure that every buffer
     allocation is also deallocated again. By default, such IR is rejected by
-    One-Shot Bufferize. Such IR can be allowed with `allow-return-allocs`.
-    Note that new buffer allocations that are returned from a function can
-    currently not be deallocated and leak.
+    One-Shot Bufferize. Such IR can be allowed with `allow-return-allocs`. In
+    that case, the `-buffer-deallocation` pass should be run after One-Shot
+    Bufferize. Note that new buffer allocations that are returned from a
+    function can currently not be deallocated by `-buffer-deallocation` and
+    leak.
 
     One-Shot Bufferize will by default reject IR that contains non-bufferizable
     op, i.e., ops that do not implemement BufferizableOpInterface. Such IR can
@@ -223,10 +225,7 @@
       argument, the return value disappears. Instead, the buffer of the tensor
       argument is modified in-place.
     * Returning non-equivalent tensors is forbidden by default and must be
-      explicitly activated with `allow-return-allocs`. If such a tensor happens
-      to bufferize to a new memory allocation, this buffer will never be
-      deallocated and leak. Such IR needs special handling, e.g., allocation
-      hoisting or reference counting.
+      explicitly activated with `allow-return-allocs`.
     * Non-equivalent returned tensors of fully static size can be promoted to
       function arguments with `promote-buffer-results-to-out-params`. In that
       case, buffers for such tensors are allocated at each call site. Instead of
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -38,12 +38,6 @@
 constexpr const ::llvm::StringLiteral
     bufferization::BufferizableOpInterface::kInplaceableAttrName;
 
-/// Attribute name used to mark allocs that are created by the bufferization.
-static const char *kBufferAllocationAttr = "bufferization.allocation";
-
-/// Attribute name used to mark allocs that should not be deallocated.
-static const char *kSkipDeallocAttr = "bufferization.skip_dealloc";
-
 //===----------------------------------------------------------------------===//
 // OpFilter
 //===----------------------------------------------------------------------===//
@@ -456,15 +450,6 @@
   return allocMemRefType;
 }
 
-static Value createBufferAllocation(OpBuilder &b, Location loc, MemRefType type,
-                                    ValueRange dynShape, bool skipDealloc) {
-  auto allocaOp = b.create<memref::AllocaOp>(loc, type, dynShape);
-  allocaOp->setAttr(kBufferAllocationAttr, b.getUnitAttr());
-  if (skipDealloc)
-    allocaOp->setAttr(kSkipDeallocAttr, b.getUnitAttr());
-  return allocaOp.getResult();
-}
-
 /// Create an allocation after `shapedValue.getDefiningOp` (or at the top of the
 /// block in case of a bbArg).
 FailureOr<Value> BufferizationState::createAlloc(OpBuilder &b, Location loc,
@@ -479,21 +464,32 @@
   MemRefType allocMemRefType =
       getAllocationTypeAndShape(b, loc, shapedValue, dynShape);
 
+  // Create the buffer allocation.
+  FailureOr<Value> buffer =
+      getOptions().createAlloc(b, loc, allocMemRefType, dynShape);
+  if (failed(buffer))
+    return failure();
+
   // Should be the buffer be deallocated again or should we let it leak?
-  bool skipDealloc;
   if (dealloc) {
-    skipDealloc = !dealloc.getValue();
+    if (!dealloc.getValue())
+      return *buffer;
   } else {
     assert(shapedValue.getType().isa<TensorType>() &&
            "must specify `dealloc` if non-tensor value is passed");
     // Buffer should be not be deallocated if deallocs are generally deactivated
     // or if the tensor is yielded from a block.
-    skipDealloc = !getOptions().createDeallocs ||
-                  getAnalysisState().isTensorYielded(shapedValue);
+    if (!getOptions().createDeallocs ||
+        getAnalysisState().isTensorYielded(shapedValue))
+      return *buffer;
   }
 
-  // Create the buffer allocation.
-  return createBufferAllocation(b, loc, allocMemRefType, dynShape, skipDealloc);
+  // Create buffer deallocation.
+  b.setInsertionPoint(b.getInsertionBlock()->getTerminator());
+  if (failed(getOptions().createDealloc(b, loc, *buffer)))
+    return failure();
+
+  return *buffer;
 }
 
 /// Create a memory copy between two memref buffers.
@@ -506,53 +502,6 @@
   return success();
 }
 
-LogicalResult
-bufferization::createAllocDeallocOps(Operation *op,
-                                     const BufferizationOptions &options,
-                                     bool onlyLeakingAllocs, bool *changed) {
-  IRRewriter rewriter(op->getContext());
-  if (changed)
-    *changed = false;
-
-  // Bufferization creates memref.alloca ops. After bufferization, these must be
-  // rewritten to alloc/dealloc ops as specified in the bufferization options.
-  WalkResult status = op->walk([&](memref::AllocaOp allocaOp) {
-    // Ignore memref.alloca ops that were not created by the bufferization.
-    if (!allocaOp->hasAttr(kBufferAllocationAttr))
-      return WalkResult::skip();
-    // If `onlyLeakingAllocs`, process only ops that are marked as
-    // "skip dealloc".
-    bool skipDealloc = allocaOp->hasAttr(kSkipDeallocAttr);
-    if (onlyLeakingAllocs && !skipDealloc)
-      return WalkResult::skip();
-
-    // Create alloc.
-    Block *block = allocaOp->getBlock();
-    rewriter.setInsertionPoint(allocaOp);
-    FailureOr<Value> alloc =
-        options.createAlloc(rewriter, allocaOp->getLoc(), allocaOp.getType(),
-                            allocaOp.dynamicSizes());
-    if (failed(alloc))
-      return WalkResult::interrupt();
-    rewriter.replaceOp(allocaOp, *alloc);
-    if (changed)
-      *changed = true;
-
-    // Stop here if the buffer should not be deallocated.
-    if (skipDealloc)
-      return WalkResult::advance();
-
-    // Create dealloc.
-    rewriter.setInsertionPoint(block->getTerminator());
-    if (failed(options.createDealloc(rewriter, alloc->getLoc(), *alloc)))
-      return WalkResult::interrupt();
-
-    return WalkResult::advance();
-  });
-
-  return success(!status.wasInterrupted());
-}
-
 //===----------------------------------------------------------------------===//
 // Bufferization-specific BlockAndValueMapping support with debugging.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -294,31 +294,12 @@
 LogicalResult
 bufferization::finalizeBuffers(Operation *op,
                                const BufferizationOptions &options) {
-  // Create allocation ops for "leaking buffers", i.e., buffer allocations that
-  // escape block boundaries. If there are no leaking allocs, `hasLeakingAllocs`
-  // is set to `false`.
-  bool hasLeakingAllocs = false;
-  if (failed(createAllocDeallocOps(op, options, /*onlyLeakingAllocs=*/true,
-                                   &hasLeakingAllocs)))
-    return failure();
-
   // Promote returned buffers to "out" parameters.
   // TODO: Pass options to support custom dealloc ops.
   if (options.promoteBufferResultsToOutParams && isa<ModuleOp>(op) &&
       failed(promoteBufferResultsToOutParams(cast<ModuleOp>(op))))
     return failure();
 
-  // Create deallocation ops for all "leaking buffers" and all buffer
-  // allocations that were added during the above promotion process.
-  // TODO: Pass options to support custom dealloc ops.
-  if (hasLeakingAllocs && options.createDeallocs &&
-      failed(deallocateBuffers(op)))
-    return failure();
-
-  // Deallocate all remaining buffers at the end of their parent blocks.
-  if (failed(createAllocDeallocOps(op, options)))
-    return failure();
-
   return success();
 }
 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-bufferize-allow-return-allocs.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs allow-unknown-ops" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs allow-unknown-ops" -buffer-deallocation -canonicalize -split-input-file | FileCheck %s
 
 // Run fuzzer with different seeds.
 // RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23" -split-input-file -o /dev/null
diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-out-params.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-out-params.mlir
--- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-out-params.mlir
+++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-out-params.mlir
@@ -1,6 +1,6 @@
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries allow-return-allocs promote-buffer-results-to-out-params function-boundary-type-conversion=fully-dynamic-layout-map" -split-input-file | FileCheck %s
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries allow-return-allocs promote-buffer-results-to-out-params function-boundary-type-conversion=identity-layout-map" -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT
-// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries allow-return-allocs function-boundary-type-conversion=infer-layout-map" -split-input-file | FileCheck %s --check-prefix=CHECK-BASELINE
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries allow-return-allocs promote-buffer-results-to-out-params function-boundary-type-conversion=fully-dynamic-layout-map" -buffer-deallocation -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries allow-return-allocs promote-buffer-results-to-out-params function-boundary-type-conversion=identity-layout-map" -buffer-deallocation -split-input-file | FileCheck %s --check-prefix=CHECK-NO-LAYOUT
+// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries allow-return-allocs function-boundary-type-conversion=infer-layout-map" -buffer-deallocation -split-input-file | FileCheck %s --check-prefix=CHECK-BASELINE
 
 // Note: function-boundary-type-conversion=infer-layout-map with
 // promote-buffer-results-to-out-params is an unsupported combination.
diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
--- a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
+++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -buffer-deallocation -split-input-file | FileCheck %s
 
 // Run fuzzer with different seeds.
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null
@@ -6,7 +6,7 @@
 // RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null
 
 // Test bufferization using memref types that have no layout map.
-// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -split-input-file -o /dev/null
+// RUN: mlir-opt %s -allow-unregistered-dialect -one-shot-bufferize="allow-return-allocs unknown-type-conversion=identity-layout-map function-boundary-type-conversion=identity-layout-map bufferize-function-boundaries" -buffer-deallocation -split-input-file -o /dev/null
 
 // CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)>
 
@@ -213,10 +213,8 @@
 
 // CHECK-LABEL: func @scf_execute_region_yield_non_equivalent(
 //       CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}})
-//       CHECK:   %[[clone:.*]] = bufferization.clone %[[alloc]]
+//       CHECK:   %[[r:.*]] = memref.load %[[alloc]][%{{.*}}]
 //       CHECK:   memref.dealloc %[[alloc]]
-//       CHECK:   %[[r:.*]] = memref.load %[[clone]][%{{.*}}]
-//       CHECK:   memref.dealloc %[[clone]]
 //       CHECK:   return %[[r]]
 func.func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32 {
   %r = scf.execute_region -> (tensor<?xf32>) {
@@ -236,11 +234,15 @@
 // CHECK-LABEL: func @scf_for_yield_non_equivalent(
 //  CHECK-SAME:     %[[t:.*]]: memref<?xf32
 //       CHECK:   %[[alloc:.*]] = memref.alloc(%{{.*}})
-//       CHECK:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[alloc]])
-//       CHECK:     memref.dealloc %[[iter]]
-//       CHECK:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
+//       CHECK:   %[[cloned:.*]] = bufferization.clone %[[alloc]]
+//       CHECK:   memref.dealloc %[[alloc]]
+//       CHECK:   %[[for:.*]] = scf.for {{.*}} iter_args(%[[iter:.*]] = %[[cloned]])
+//   CHECK-DAG:     memref.dealloc %[[iter]]
+//   CHECK-DAG:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
 //       CHECK:     memref.copy %[[t]], %[[alloc2]]
-//       CHECK:     scf.yield %[[alloc2]]
+//       CHECK:     %[[cloned2:.*]] = bufferization.clone %[[alloc2]]
+//       CHECK:     memref.dealloc %[[alloc2]]
+//       CHECK:     scf.yield %[[cloned2]]
 //       CHECK:   return %[[for]]
 func.func @scf_for_yield_non_equivalent(
     %t: tensor<?xf32>, %lb : index, %ub : index, %step : index) -> tensor<?xf32> {
@@ -269,7 +271,9 @@
 //       CHECK:     memref.copy %[[alloc2]], %[[alloc3]]
 //       CHECK:     memref.dealloc %[[alloc2]]
 //       CHECK:     %[[casted3:.*]] = memref.cast %[[alloc3]]
-//       CHECK:     scf.yield %[[casted3]]
+//       CHECK:     %[[cloned3:.*]] = bufferization.clone %[[casted3]]
+//       CHECK:     memref.dealloc %[[alloc3]]
+//       CHECK:     scf.yield %[[cloned3]]
 //       CHECK:   return %[[for]]
 func.func @scf_for_yield_allocation(%t: tensor<?xf32>, %lb : index, %ub : index,
                                %step : index) -> tensor<?xf32> {
@@ -309,12 +313,16 @@
 //       CHECK:     %[[alloc2:.*]] = memref.alloc(%{{.*}})
 //       CHECK:     memref.copy %[[iter2]], %[[alloc2]]
 //       CHECK:     memref.dealloc %[[iter2]]
+//       CHECK:     %[[casted2:.*]] = memref.cast %[[alloc2]]
 //       CHECK:     %[[alloc1:.*]] = memref.alloc(%{{.*}})
 //       CHECK:     memref.copy %[[iter1]], %[[alloc1]]
 //       CHECK:     memref.dealloc %[[iter1]]
 //       CHECK:     %[[casted1:.*]] = memref.cast %[[alloc1]]
-//       CHECK:     %[[casted2:.*]] = memref.cast %[[alloc2]]
-//       CHECK:     scf.yield %[[casted2]], %[[casted1]]
+//       CHECK:     %[[cloned1:.*]] = bufferization.clone %[[casted1]]
+//       CHECK:     memref.dealloc %[[alloc1]]
+//       CHECK:     %[[cloned2:.*]] = bufferization.clone %[[casted2]]
+//       CHECK:     memref.dealloc %[[alloc2]]
+//       CHECK:     scf.yield %[[cloned2]], %[[cloned1]]
     // Yield tensors in different order.
     scf.yield %ttB, %ttA : tensor<?xf32>, tensor<?xf32>
   }
@@ -376,12 +384,16 @@
     // CHECK: %[[a1:.*]] = memref.alloc() {{.*}} : memref<5xi1>
     // CHECK: memref.copy %[[w1]], %[[a1]]
     // CHECK: memref.dealloc %[[w1]]
+    // CHECK: %[[casted1:.*]] = memref.cast %[[a1]]
     // CHECK: %[[a0:.*]] = memref.alloc() {{.*}} : memref<5xi1>
     // CHECK: memref.copy %[[w0]], %[[a0]]
     // CHECK: memref.dealloc %[[w0]]
     // CHECK: %[[casted0:.*]] = memref.cast %[[a0]]
-    // CHECK: %[[casted1:.*]] = memref.cast %[[a1]]
-    // CHECK: scf.condition(%[[condition]]) %[[casted1]], %[[casted0]]
+    // CHECK: %[[cloned0:.*]] = bufferization.clone %[[casted0]]
+    // CHECK: memref.dealloc %[[a0]]
+    // CHECK: %[[cloned1:.*]] = bufferization.clone %[[casted1]]
+    // CHECK: memref.dealloc %[[a1]]
+    // CHECK: scf.condition(%[[condition]]) %[[cloned1]], %[[cloned0]]
     %condition = tensor.extract %w0[%idx] : tensor<5xi1>
     scf.condition(%condition) %w1, %w0 : tensor<5xi1>, tensor<5xi1>
   } do {
@@ -389,7 +401,11 @@
     // CHECK: } do {
     // CHECK: ^bb0(%[[b0:.*]]: memref<5xi1, #{{.*}}>, %[[b1:.*]]: memref<5xi1, #{{.*}}):
     // CHECK: memref.store %{{.*}}, %[[b0]]
-    // CHECK: scf.yield %[[b0]], %[[b1]]
+    // CHECK: %[[cloned2:.*]] = bufferization.clone %[[b1]]
+    // CHECK: memref.dealloc %[[b1]]
+    // CHECK: %[[cloned3:.*]] = bufferization.clone %[[b0]]
+    // CHECK: memref.dealloc %[[b0]]
+    // CHECK: scf.yield %[[cloned3]], %[[cloned2]]
     // CHECK: }
     %pos = "dummy.some_op"() : () -> (index)
     %val = "dummy.another_op"() : () -> (i1)
@@ -421,12 +437,16 @@
     // CHECK: %[[a1:.*]] = memref.alloc() {{.*}} : memref<5xi1>
     // CHECK: memref.copy %[[w1]], %[[a1]]
     // CHECK: memref.dealloc %[[w1]]
+    // CHECK: %[[casted1:.*]] = memref.cast %[[a1]]
     // CHECK: %[[a0:.*]] = memref.alloc() {{.*}} : memref<5xi1>
     // CHECK: memref.copy %[[w0]], %[[a0]]
     // CHECK: memref.dealloc %[[w0]]
     // CHECK: %[[casted0:.*]] = memref.cast %[[a0]]
-    // CHECK: %[[casted1:.*]] = memref.cast %[[a1]]
-    // CHECK: scf.condition(%[[condition]]) %[[casted1]], %[[casted0]]
+    // CHECK: %[[cloned0:.*]] = bufferization.clone %[[casted0]]
+    // CHECK: memref.dealloc %[[a0]]
+    // CHECK: %[[cloned1:.*]] = bufferization.clone %[[casted1]]
+    // CHECK: memref.dealloc %[[a1]]
+    // CHECK: scf.condition(%[[condition]]) %[[cloned1]], %[[cloned0]]
     %condition = tensor.extract %w0[%idx] : tensor<5xi1>
     scf.condition(%condition) %w1, %w0 : tensor<5xi1>, tensor<5xi1>
   } do {
@@ -437,11 +457,15 @@
     // CHECK: %[[a3:.*]] = memref.alloc() {{.*}} : memref<5xi1>
     // CHECK: memref.copy %[[b1]], %[[a3]]
     // CHECK: memref.dealloc %[[b1]]
+    // CHECK: %[[casted3:.*]] = memref.cast %[[a3]]
     // CHECK: %[[a2:.*]] = memref.alloc() {{.*}} : memref<5xi1>
     // CHECK: memref.copy %[[b0]], %[[a2]]
     // CHECK: %[[casted2:.*]] = memref.cast %[[a2]]
-    // CHECK: %[[casted3:.*]] = memref.cast %[[a3]]
-    // CHECK: scf.yield %[[casted3]], %[[casted2]]
+    // CHECK: %[[cloned2:.*]] = bufferization.clone %[[casted2]]
+    // CHECK: memref.dealloc %[[a2]]
+    // CHECK: %[[cloned3:.*]] = bufferization.clone %[[casted3]]
+    // CHECK: memref.dealloc %[[a3]]
+    // CHECK: scf.yield %[[cloned3]], %[[cloned2]]
     // CHECK: }
     %pos = "dummy.some_op"() : () -> (index)
     %val = "dummy.another_op"() : () -> (i1)
@@ -460,8 +484,8 @@
 //       CHECK:   %[[alloc2:.*]] = memref.alloc() {{.*}} : memref<5xi1>
 //       CHECK:   %[[clone:.*]] = bufferization.clone %[[arg1]]
 //       CHECK:   scf.while (%[[arg3:.*]] = %[[clone]]) : (memref<5xi1, #{{.*}}) -> () {
-//       CHECK:     memref.dealloc %[[arg3]]
-//       CHECK:     %[[load:.*]] = memref.load %[[arg0]]
+//   CHECK-DAG:     memref.dealloc %[[arg3]]
+//   CHECK-DAG:     %[[load:.*]] = memref.load %[[arg0]]
 //       CHECK:     scf.condition(%[[load]])
 //       CHECK:   } do {
 //       CHECK:     memref.copy %[[arg0]], %[[alloc2]]
@@ -469,7 +493,9 @@
 //       CHECK:     %[[alloc1:.*]] = memref.alloc() {{.*}} : memref<5xi1>
 //       CHECK:     memref.copy %[[alloc2]], %[[alloc1]]
 //       CHECK:     %[[casted:.*]] = memref.cast %[[alloc1]] : memref<5xi1> to memref<5xi1, #{{.*}}>
-//       CHECK:     scf.yield %[[casted]]
+//       CHECK:     %[[cloned:.*]] = bufferization.clone %[[casted]]
+//       CHECK:     memref.dealloc %[[alloc1]]
+//       CHECK:     scf.yield %[[cloned]]
 //       CHECK:   }
 //   CHECK-DAG:   memref.dealloc %[[alloc2]]
 func.func @scf_while_iter_arg_result_mismatch(%arg0: tensor<5xi1>,