diff --git a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir --- a/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/one-shot-module-bufferize-analysis.mlir @@ -559,121 +559,6 @@ return %rA, %rB, %rC: tensor, tensor, tensor } -//===----------------------------------------------------------------------===// -// Simple loop cases -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-LABEL: func @scf_for_yield_only -func.func @scf_for_yield_only( - %A : tensor {bufferization.writable = false}, - %B : tensor {bufferization.writable = true}, - %lb : index, - %ub : index, - %step : index) - -> (tensor, tensor) -{ - // CHECK: scf.for - // CHECK-NEXT: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - scf.yield %t : tensor - } - - // CHECK: scf.for - // CHECK-NEXT: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor) { - scf.yield %t : tensor - } - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [-1, 1] - return %r0, %r1: tensor, tensor -} - -// ----- - -// CHECK-LABEL: func @scf_for_with_tensor.insert_slice -func.func @scf_for_with_tensor.insert_slice( - %A : tensor {bufferization.writable = false}, - %B : tensor {bufferization.writable = true}, - %C : tensor<4xf32> {bufferization.writable = false}, - %lb : index, - %ub : index, - %step : index) - -> (tensor, tensor) -{ - // CHECK: scf.for - // scf.for bbArgs are always inplaceable seen from ops inside the body: - // 1. Either the matching tensor is not inplaceable and an alloc occurs - // which makes bbArg inplaceable. - // 2. Or it is already inplaceable and so is bbArg. - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]} - // CHECK-NEXT: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]} - // CHECK-NEXT: scf.yield {__inplace_operands_attr__ = ["true", "true"]} - // CHECK-NEXT: } {__inplace_operands_attr__ = ["none", "none", "none", "false", "true"]} - %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) - -> (tensor, tensor) - { - %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor - %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor - scf.yield %ttA, %ttB : tensor, tensor - } - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [-1, 1] - return %r0#0, %r0#1: tensor, tensor -} - -// ----- - -func.func private @some_use(tensor) -> () - -// CHECK-LABEL: func @scf_for_deps -func.func @scf_for_deps( - %A : tensor {bufferization.writable = true}, - %B : tensor {bufferization.writable = true}, - %lb : index, - %ub : index, - %step : index) - -> (tensor) -{ - // %r0 must be out of place because one use of %t in the subsequent production - // of %r1 is read. 
- // CHECK: scf.for - // CHECK-NEXT: call - // CHECK-SAME: {__inplace_operands_attr__ = ["false"]} - // CHECK-NEXT: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - func.call @some_use(%t) : (tensor) -> () - scf.yield %t : tensor - } - - // %r1 bufferizes inplace fine. - // CHECK: scf.for - // CHECK-NEXT: call - // CHECK-SAME: {__inplace_operands_attr__ = ["false"]} - // CHECK-NEXT: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - func.call @some_use(%t) : (tensor) -> () - scf.yield %t : tensor - } - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0] - return %r1: tensor -} - // ----- //===----------------------------------------------------------------------===// @@ -1148,465 +1033,6 @@ // ----- -#accesses = [ - affine_map<(i) -> (i)> -] -#trait = { - indexing_maps = #accesses, - iterator_types = ["parallel"] -} - -// CHECK-LABEL: func @reading_scf_for -func.func @reading_scf_for(%t1: tensor {bufferization.writable = true}, - %s: index, %v: vector<5xf32>) -> (tensor, vector<5xf32>) { - - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %cst = arith.constant 0.0 : f32 - - // Write to %t1. - // CHECK: vector.transfer_write - // CHECK-SAME: __inplace_operands_attr__ = ["none", "false", "none"] - %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor - - // Read the old value of %t1 inside the loop via an alias. - // CHECK: scf.for {{.*}} { - %r, %v3 = scf.for %i = %c0 to %s step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor, vector<5xf32>) { - // CHECK: tensor.extract_slice - // CHECK-SAME: __inplace_operands_attr__ = ["true", "none", "none"] - %e = tensor.extract_slice %t2[%s][%s][1] : tensor to tensor - - // Read from %t1 via alias %e. - %v2 = vector.transfer_read %e[%s], %cst : tensor, vector<5xf32> - scf.yield %t2, %v2 : tensor, vector<5xf32> - } - // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true", "none"]} - - // Use %t3 in some way without reading it, so that it does not get DCE'd. - // CHECK: linalg.generic - // CHECK-SAME: __inplace_operands_attr__ = ["true"] - %o = linalg.generic #trait outs (%t3 : tensor) { - ^bb(%0: f32) : - linalg.yield %cst : f32 - } -> (tensor) - - return %o, %v3 : tensor, vector<5xf32> -} - -// ----- - -#accesses = [ - affine_map<(i) -> (i)> -] -#trait = { - indexing_maps = #accesses, - iterator_types = ["parallel"] -} - -// CHECK-LABEL: func @non_reading_scf_for -func.func @non_reading_scf_for(%t1: tensor {bufferization.writable = true}, - %s: index, %v: vector<5xf32>) -> (tensor, vector<5xf32>) { - - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %cst = arith.constant 0.0 : f32 - - // Write to %t1. - // CHECK: vector.transfer_write - // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"] - %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor - - // This loop does not read from %t1. It only writes to it. - // CHECK: scf.for - %r, %v3 = scf.for %i = %c0 to %s step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor, vector<5xf32>) { - // Write to %t1 via %t2. (Overwrite %t3.) 
- // CHECK: linalg.generic - // CHECK-SAME: __inplace_operands_attr__ = ["true"] - %o2 = linalg.generic #trait outs (%t2 : tensor) { - ^bb(%0: f32) : - linalg.yield %cst : f32 - } -> (tensor) - - // Read overwritten value. This is not a read of %t1. - %v2 = vector.transfer_read %o2[%s], %cst : tensor, vector<5xf32> - scf.yield %o2, %v2 : tensor, vector<5xf32> - } - - // Use %t3 in some way without reading it, so that it does not get DCE'd. - // CHECK: linalg.generic - // CHECK-SAME: __inplace_operands_attr__ = ["true"] - %o = linalg.generic #trait outs (%t3 : tensor) { - ^bb(%0: f32) : - linalg.yield %cst : f32 - } -> (tensor) - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0, -1] - return %o, %v3 : tensor, vector<5xf32> -} - -// ----- - -//===----------------------------------------------------------------------===// -// scf.if cases -//===----------------------------------------------------------------------===// - -// This example passes analysis, but it fails when bufferizing. -// CHECK-LABEL: func @scf_if_inplace1 -func.func @scf_if_inplace1(%t1: tensor {bufferization.writable = true}, - %t2: tensor {bufferization.writable = true}, - %cond: i1) -> tensor { - %r = scf.if %cond -> (tensor) { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t1 : tensor - } else { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t2 : tensor - } - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_inplace2 -func.func @scf_if_inplace2(%t1: tensor {bufferization.writable = true}, - %v: vector<5xf32>, %idx: index, - %cond: i1) -> tensor { - %r = scf.if %cond -> (tensor) { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t1 : tensor - } else { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor - scf.yield %t2 : tensor - } - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0] - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_inplace3 -func.func @scf_if_inplace3(%t1: tensor {bufferization.writable = true}, - %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index, - %cond: i1) -> tensor { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] - %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor - %r = scf.if %cond -> (tensor) { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t2 = vector.transfer_write %v1, %e[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t2 : tensor - } else { - // Writing the same tensor through an alias. This is OK. 
- // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t3 : tensor - } - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_in_place4 -func.func @scf_if_in_place4(%t1: tensor {bufferization.writable = true}, - %v: vector<5xf32>, %idx: index, - %cond: i1, %cond2: i1) -> (tensor, vector<10xf32>) { - %cst = arith.constant 0.0 : f32 - %r = scf.if %cond -> (tensor) { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t1 : tensor - } else { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t2 : tensor - } - %r_alias = scf.if %cond2 -> (tensor) { - // Reading %r is OK. No conflict. - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %r : tensor - } else { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %r : tensor - } - %v2 = vector.transfer_read %r_alias[%idx], %cst : tensor, vector<10xf32> - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0, -1] - return %r_alias, %v2 : tensor, vector<10xf32> -} - -// ----- - -// CHECK-LABEL: func @scf_if_inplace5 -func.func @scf_if_inplace5(%t1: tensor {bufferization.writable = true}, - %idx: index, %cond: i1) -> tensor { - %r = scf.if %cond -> (tensor) { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] - %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %e : tensor - } else { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] - %f = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %f : tensor - } - - // Inserting into an equivalent tensor at the same offset. This bufferizes - // inplace. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"] - %r2 = tensor.insert_slice %r into %t1[%idx][%idx][1] : tensor into tensor - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0] - return %r2 : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_inplace6 -func.func @scf_if_inplace6(%t1: tensor {bufferization.writable = true}, - %v1: vector<5xf32>, %v2: vector<5xf32>, - %v3: vector<5xf32>, %idx: index, - %cond: i1, %cond2: i1) -> tensor { - // Test nested scf.if ops. 
- %r = scf.if %cond -> (tensor) { - %t2 = scf.if %cond2 -> (tensor) { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t3 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t3 : tensor - } else { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t4 = vector.transfer_write %v3, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t4 : tensor - } - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t2 : tensor - } else { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t3 : tensor - } - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0] - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_inplace7 -func.func @scf_if_inplace7(%t1: tensor {bufferization.writable = true}, - %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index, - %idx2: index, %cond: i1) -> (tensor, vector<5xf32>) { - %cst = arith.constant 0.0 : f32 - %r, %v_r2 = scf.if %cond -> (tensor, vector<5xf32>) { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] - %t2 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]} - scf.yield %t2, %v1 : tensor, vector<5xf32> - } else { - // Writing the same tensor through an alias. - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"] - %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor - // Read the original value of %t1. This requires the write in this branch - // to be out-of-place. But the write in the other branch can still be - // inplace. - %v_r = vector.transfer_read %t1[%idx2], %cst : tensor, vector<5xf32> - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]} - scf.yield %t3, %v_r : tensor, vector<5xf32> - } - return %r, %v_r2 : tensor, vector<5xf32> -} - -// ----- - -// CHECK-LABEL: func @scf_if_out_of_place1a -func.func @scf_if_out_of_place1a(%t1: tensor {bufferization.writable = true}, - %idx: index, %idx2: index, - %cond: i1) -> tensor { - %r = scf.if %cond -> (tensor) { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] - %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %e : tensor - } else { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t1 : tensor - } - - // Reading from and writing to the same tensor via different args. This is a - // conflict. 
- // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none", "none"] - %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor into tensor - return %r2 : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_out_of_place1b -func.func @scf_if_out_of_place1b(%t1: tensor {bufferization.writable = true}, - %idx: index, %idx2: index, %idx3: index, - %cond: i1) -> tensor { - %r = scf.if %cond -> (tensor) { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] - %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %e : tensor - } else { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] - %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %f : tensor - } - - // Reading from and writing to the same tensor via different args. This is a - // conflict. In contrast to scf_if_out_of_place1a, the fact that %r aliases - // with %t1 is only detected when analyzing the tensor.extract_slices. That's - // why the tensor.insert_slice is inplace and the two extract_slices are - // out-of-place. - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"] - %r2 = tensor.insert_slice %r into %t1[%idx3][%idx3][1] : tensor into tensor - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0] - return %r2 : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_out_of_place1c -func.func @scf_if_out_of_place1c(%t1: tensor {bufferization.writable = true}, - %idx: index, %idx2: index, %cond: i1) -> tensor { - %r = scf.if %cond -> (tensor) { - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] - %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %e : tensor - } else { - // TODO: This one could bufferize inplace, but the analysis is too restrictive. - // CHECK: tensor.extract_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] - %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor to tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %f : tensor - } - - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"] - %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor into tensor - - // CHECK: return - // CHECK-SAME: __equivalent_func_args__ = [0] - return %r2 : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_out_of_place2 -func.func @scf_if_out_of_place2(%t1: tensor {bufferization.writable = true}, - %v: vector<5xf32>, %idx: index, - %cond: i1) -> (tensor, vector<10xf32>) { - %cst = arith.constant 0.0 : f32 - %r = scf.if %cond -> (tensor) { - scf.yield %t1 : tensor - } else { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"] - %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t2 : tensor - } - - // Read the old value of %t1. Forces the transfer_write to bufferize - // out-of-place. 
- %v2 = vector.transfer_read %t1[%idx], %cst : tensor, vector<10xf32> - return %r, %v2 : tensor, vector<10xf32> -} - -// ----- - -// CHECK-LABEL: func @scf_if_out_of_place3 -func.func @scf_if_out_of_place3(%t1: tensor {bufferization.writable = true}, - %v: vector<5xf32>, %idx: index, - %cond: i1, %cond2: i1) -> (tensor, vector<10xf32>) { - %cst = arith.constant 0.0 : f32 - %r = scf.if %cond -> (tensor) { - scf.yield %t1 : tensor - } else { - // CHECK: vector.transfer_write - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"] - %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t2 : tensor - } - %t1_alias = scf.if %cond2 -> (tensor) { - // scf.yield bufferizes to a read. That is a conflict in this example. - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t1 : tensor - } else { - // CHECK: scf.yield - // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} - scf.yield %t1 : tensor - } - %v2 = vector.transfer_read %t1_alias[%idx], %cst : tensor, vector<10xf32> - return %r, %v2 : tensor, vector<10xf32> -} - -// ----- - // CHECK-LABEL: func @some_use func.func @some_use(%A : tensor {bufferization.writable = true}, %v : vector<5xf32>) -> (tensor) { @@ -1817,30 +1243,3 @@ return %r0 : tensor } - -// ----- - -// CHECK-LABEL: func @write_to_same_tensor_in_loop_in_place( -func.func @write_to_same_tensor_in_loop_in_place( - %A : tensor {linalg.inplaceable = true}, - %lb : index, %ub : index, %step : index, %sz: index) - -> (tensor) -{ - // CHECK: scf.for {{.*}} { - %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - %B = linalg.init_tensor [%sz] : tensor - %i2 = arith.index_cast %i : index to i32 - %i3 = arith.sitofp %i2 : i32 to f32 - // The tensor.insert is in-place because the %B is defined inside the loop. - // CHECK: tensor.insert - // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]} - %B2 = tensor.insert %i3 into %B[%i] : tensor - // CHECK: tensor.insert_slice - // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]} - %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor into tensor - scf.yield %A2 : tensor - } - // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} - - return %r0 : tensor -} diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir --- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir +++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir @@ -286,142 +286,12 @@ return %r0: tensor } -//===----------------------------------------------------------------------===// -// Simple loop cases -//===----------------------------------------------------------------------===// - -// ----- - -// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK-LABEL: func @scf_for_yield_only( -// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref, -// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: ) -> memref { -func.func @scf_for_yield_only( - %A : tensor {bufferization.writable = false}, - %B : tensor {bufferization.writable = true}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc - // CHECK: memref.copy %[[A]], %[[ALLOC_FOR_A]] - - // The first scf.for remains but just turns into dead code. 
- %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { - scf.yield %t : tensor - } - - // The second scf.for remains but just turns into dead code. - %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor) { - scf.yield %t : tensor - } - - // CHECK: return %[[ALLOC_FOR_A]] : memref - // CHECK-NOT: dealloc - return %r0, %r1: tensor, tensor -} - -// ----- - -// Ensure that the function bufferizes without error. This tests pre-order -// traversal of scf.for loops during bufferization. No need to check the IR, -// just want to make sure that it does not crash. - -// CHECK-LABEL: func @nested_scf_for -func.func @nested_scf_for(%A : tensor {bufferization.writable = true}, - %v : vector<5xf32>) -> tensor { - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c10 = arith.constant 10 : index - %r1 = scf.for %i = %c0 to %c10 step %c1 iter_args(%B = %A) -> tensor { - %r2 = scf.for %j = %c0 to %c10 step %c1 iter_args(%C = %B) -> tensor { - %w = vector.transfer_write %v, %C[%c0] : vector<5xf32>, tensor - scf.yield %w : tensor - } - scf.yield %r2 : tensor - } - return %r1 : tensor -} - -// ----- - -// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> - -// CHECK-LABEL: func @scf_for_with_tensor.insert_slice -// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref -// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]> -func.func @scf_for_with_tensor.insert_slice( - %A : tensor {bufferization.writable = false}, - %B : tensor {bufferization.writable = true}, - %C : tensor<4xf32> {bufferization.writable = false}, - %lb : index, %ub : index, %step : index) - -> (tensor, tensor) -{ - // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc - // CHECK: memref.copy %[[A]], %[[ALLOC_FOR_A]] - - // CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1] - // CHECK: %[[svB:.*]] = memref.subview %[[B]][0] [4] [1] - - // CHECK: scf.for {{.*}} - // CHECK-NOT: iter_args - %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) - -> (tensor, tensor) - { - // %ttA bufferizes to direct copy of %BUFFER_CAST_C into %svA - // CHECK: memref.copy %[[C]], %[[svA]] - %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor - - // %ttB bufferizes to direct copy of %BUFFER_CAST_C into %BUFFER_CAST_B - // CHECK: memref.copy %[[C]], %[[svB]] - %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor - - // CHECK-NOT: scf.yield - scf.yield %ttA, %ttB : tensor, tensor - } - - // CHECK: return %[[ALLOC_FOR_A]] : memref - return %r0#0, %r0#1: tensor, tensor -} - // ----- //===----------------------------------------------------------------------===// // Cross function boundary cases. //===----------------------------------------------------------------------===// -// CHECK-LABEL: func @execute_region_with_conflict( -// CHECK-SAME: %[[m1:.*]]: memref {bufferization.writable = "true"}) - -> (f32, tensor, f32) -{ - %f1 = arith.constant 0.0 : f32 - %idx = arith.constant 7 : index - - // scf.execute_region is canonicalized away after bufferization. So just the - // memref.store is left over. 
- - // CHECK: %[[alloc:.*]] = memref.alloc - // CHECK: memref.copy %[[m1]], %[[alloc]] - // CHECK: memref.store %{{.*}}, %[[alloc]][%{{.*}}] - %0, %1, %2 = scf.execute_region -> (f32, tensor, f32) { - %t2 = tensor.insert %f1 into %t1[%idx] : tensor - scf.yield %f1, %t2, %f1 : f32, tensor, f32 - } - - // CHECK: %[[casted:.*]] = memref.cast %[[alloc]] - // CHECK: %[[load:.*]] = memref.load %[[m1]] - %3 = tensor.extract %t1[%idx] : tensor - - // CHECK: return %{{.*}}, %[[casted]], %[[load]] : f32, memref, f32 - return %0, %1, %3 : f32, tensor, f32 -} - -// ----- - // CHECK: func @matmul( // CHECK-SAME: %[[A:[0-9a-zA-Z]*]]: memref<128x256xf32> // CHECK-SAME: %[[B:[0-9a-zA-Z]*]]: memref<256x192xf32> @@ -536,80 +406,6 @@ return %rA : tensor } -// ----- - -// CHECK-LABEL: func @scf_if_inplace( -// CHECK-SAME: %[[cond:.*]]: i1, %[[t1:.*]]: memref, %[[v:.*]]: vector -func.func @scf_if_inplace(%cond: i1, - %t1: tensor {bufferization.writable = true}, - %v: vector<5xf32>, %idx: index) -> tensor { - - // CHECK: scf.if %[[cond]] { - // CHECK-NEXT: } else { - // CHECK-NEXT: vector.transfer_write %[[v]], %[[t1]] - // CHECK-NEXT: } - // CHECK-NEXT: return - %r = scf.if %cond -> (tensor) { - scf.yield %t1 : tensor - } else { - %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor - scf.yield %t2 : tensor - } - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_inside_scf_for -// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index -// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index -// CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index -// CHECK: scf.for %{{.*}} = %[[c0]] to %[[c10]] step %[[c1]] { -// CHECK: scf.if %{{.*}} { -// CHECK: } else { -// CHECK: vector.transfer_write -// CHECK: } -// CHECK: } -func.func @scf_if_inside_scf_for( - %t1: tensor {bufferization.writable = true}, - %v: vector<5xf32>, %idx: index, - %cond: i1) - -> tensor -{ - %c0 = arith.constant 0 : index - %c1 = arith.constant 1 : index - %c10 = arith.constant 10 : index - %r = scf.for %iv = %c0 to %c10 step %c1 iter_args(%bb = %t1) -> (tensor) { - %r2 = scf.if %cond -> (tensor) { - scf.yield %bb : tensor - } else { - %t2 = vector.transfer_write %v, %bb[%idx] : vector<5xf32>, tensor - scf.yield %t2 : tensor - } - scf.yield %r2 : tensor - } - return %r : tensor -} - -// ----- - -// CHECK-LABEL: func @scf_if_non_equiv_yields( -// CHECK-SAME: %[[cond:.*]]: i1, %[[A:.*]]: memref<{{.*}}>, %[[B:.*]]: memref<{{.*}}>) -> memref<{{.*}}> -func.func @scf_if_non_equiv_yields( - %b : i1, - %A : tensor<4xf32> {bufferization.writable = false}, - %B : tensor<4xf32> {bufferization.writable = false}) - -> tensor<4xf32> -{ - // CHECK: %[[r:.*]] = arith.select %[[cond]], %[[A]], %[[B]] - %r = scf.if %b -> (tensor<4xf32>) { - scf.yield %A : tensor<4xf32> - } else { - scf.yield %B : tensor<4xf32> - } - // CHECK: return %[[r]] - return %r: tensor<4xf32> -} // ----- @@ -823,126 +619,3 @@ } return %5: tensor } - -// ----- - -// Note: This bufferization is inefficient, but it bufferizes correctly. 
- -// CHECK-LABEL: func @scf_execute_region_yield_non_equivalent( -// CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}}) -// CHECK: %[[clone:.*]] = bufferization.clone %[[alloc]] -// CHECK: memref.dealloc %[[alloc]] -// CHECK: %[[r:.*]] = memref.load %[[clone]][%{{.*}}] -// CHECK: memref.dealloc %[[clone]] -// CHECK: return %[[r]] -func.func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32 { - %r = scf.execute_region -> (tensor) { - %t2 = linalg.init_tensor [%i] : tensor - scf.yield %t2 : tensor - } - %f = tensor.extract %r[%j] : tensor - return %f : f32 -} - -// ----- - -// Note: This bufferizes to inefficient code, but bufferization should not see -// such IR in the first place. The iter_arg would canonicalize away. This test -// case is just to ensure that the bufferization generates correct code. - -// CHECK-LABEL: func @scf_for_yield_non_equivalent( -// CHECK-SAME: %[[t:.*]]: memref, %lb : index, %ub : index, %step : index) -> tensor { - %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor { - scf.yield %t : tensor - } - - return %r : tensor -} - -// ----- - -// Note: This bufferizes to inefficient code, but bufferization should not see -// such IR in the first place. The iter_arg would canonicalize away. This test -// case is just to ensure that the bufferization generates correct code. - -// CHECK-LABEL: func @scf_for_yield_allocation( -// CHECK-SAME: %[[t:.*]]: memref, %lb : index, %ub : index, - %step : index) -> tensor { - %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor { - %t2 = linalg.init_tensor [%i] : tensor - scf.yield %t2 : tensor - } - - return %r : tensor -} - -// ----- - -// TODO: The scf.yield could bufferize to 1 alloc and 2 copies (instead of -// 2 allocs and 2 copies). - -// CHECK-LABEL: func @scf_for_swapping_yields( -// CHECK-SAME: %[[A:.*]]: memref, %[[B:.*]]: memref -func.func @scf_for_swapping_yields( - %A : tensor, %B : tensor {bufferization.writable = true}, - %C : tensor<4xf32>, %lb : index, %ub : index, %step : index) - -> (f32, f32) -{ -// CHECK-DAG: %[[clone1:.*]] = bufferization.clone %[[A]] -// CHECK-DAG: %[[clone2:.*]] = bufferization.clone %[[B]] -// CHECK: %[[for:.*]]:2 = scf.for {{.*}} iter_args(%[[iter1:.*]] = %[[clone1]], %[[iter2:.*]] = %[[clone2]]) - %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) - -> (tensor, tensor) - { -// CHECK: %[[sv1:.*]] = memref.subview %[[iter1]] -// CHECK: memref.copy %{{.*}}, %[[sv1]] - %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor -// CHECK: %[[sv2:.*]] = memref.subview %[[iter2]] -// CHECK: memref.copy %{{.*}}, %[[sv2]] - %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor - -// CHECK: %[[alloc2:.*]] = memref.alloc(%{{.*}}) -// CHECK: memref.copy %[[iter2]], %[[alloc2]] -// CHECK: memref.dealloc %[[iter2]] -// CHECK: %[[alloc1:.*]] = memref.alloc(%{{.*}}) -// CHECK: memref.copy %[[iter1]], %[[alloc1]] -// CHECK: memref.dealloc %[[iter1]] -// CHECK: %[[casted1:.*]] = memref.cast %[[alloc1]] -// CHECK: %[[casted2:.*]] = memref.cast %[[alloc2]] -// CHECK: scf.yield %[[casted2]], %[[casted1]] - // Yield tensors in different order. 
- scf.yield %ttB, %ttA : tensor, tensor - } - -// CHECK: %[[r0:.*]] = memref.load %[[for]]#0 -// CHECK: memref.dealloc %[[for]]#0 -// CHECK: %[[r1:.*]] = memref.load %[[for]]#1 -// CHECK: memref.dealloc %[[for]]#1 - %f0 = tensor.extract %r0#0[%step] : tensor - %f1 = tensor.extract %r0#1[%step] : tensor -// CHECK: return %[[r0]], %[[r1]] - return %f0, %f1: f32, f32 -} - diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SCF/one-shot-bufferize-analysis.mlir @@ -0,0 +1,601 @@ +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs" -split-input-file | FileCheck %s + +// Run fuzzer with different seeds. +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs analysis-fuzzer-seed=23" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs analysis-fuzzer-seed=59" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="bufferize-function-boundaries test-analysis-only allow-return-allocs analysis-fuzzer-seed=91" -split-input-file -o /dev/null + +// CHECK-LABEL: func @scf_for_yield_only +func.func @scf_for_yield_only( + %A : tensor {bufferization.writable = false}, + %B : tensor {bufferization.writable = true}, + %lb : index, + %ub : index, + %step : index) + -> (tensor, tensor) +{ + // CHECK: scf.for + // CHECK-NEXT: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { + scf.yield %t : tensor + } + + // CHECK: scf.for + // CHECK-NEXT: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor) { + scf.yield %t : tensor + } + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [-1, 1] + return %r0, %r1: tensor, tensor +} + +// ----- + +// CHECK-LABEL: func @scf_for_with_tensor.insert_slice +func.func @scf_for_with_tensor.insert_slice( + %A : tensor {bufferization.writable = false}, + %B : tensor {bufferization.writable = true}, + %C : tensor<4xf32> {bufferization.writable = false}, + %lb : index, + %ub : index, + %step : index) + -> (tensor, tensor) +{ + // CHECK: scf.for + // scf.for bbArgs are always inplaceable seen from ops inside the body: + // 1. Either the matching tensor is not inplaceable and an alloc occurs + // which makes bbArg inplaceable. + // 2. Or it is already inplaceable and so is bbArg. 
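+ // As a rough sketch (illustrative SSA names, not checked here), case 1
+ // bufferizes to something like
+ //   %allocA = memref.alloc(...)
+ //   memref.copy %A, %allocA
+ // and the loop then writes into a subview of %allocA, while case 2 writes
+ // directly into a subview of %B. The companion one-shot-bufferize.mlir test
+ // added in this patch checks that output in detail.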
+ // CHECK-NEXT: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]} + // CHECK-NEXT: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true"]} + // CHECK-NEXT: scf.yield {__inplace_operands_attr__ = ["true", "true"]} + // CHECK-NEXT: } {__inplace_operands_attr__ = ["none", "none", "none", "false", "true"]} + %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) + -> (tensor, tensor) + { + %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor + %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor + scf.yield %ttA, %ttB : tensor, tensor + } + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [-1, 1] + return %r0#0, %r0#1: tensor, tensor +} + +// ----- + +func.func private @some_use(tensor) -> () + +// CHECK-LABEL: func @scf_for_deps +func.func @scf_for_deps( + %A : tensor {bufferization.writable = true}, + %B : tensor {bufferization.writable = true}, + %lb : index, + %ub : index, + %step : index) + -> (tensor) +{ + // %r0 must be out of place because one use of %t in the subsequent production + // of %r1 is read. + // CHECK: scf.for + // CHECK-NEXT: call + // CHECK-SAME: {__inplace_operands_attr__ = ["false"]} + // CHECK-NEXT: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "false"]} + %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { + func.call @some_use(%t) : (tensor) -> () + scf.yield %t : tensor + } + + // %r1 bufferizes inplace fine. + // CHECK: scf.for + // CHECK-NEXT: call + // CHECK-SAME: {__inplace_operands_attr__ = ["false"]} + // CHECK-NEXT: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { + func.call @some_use(%t) : (tensor) -> () + scf.yield %t : tensor + } + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0] + return %r1: tensor +} + +// ----- + +#accesses = [ + affine_map<(i) -> (i)> +] +#trait = { + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +// CHECK-LABEL: func @reading_scf_for +func.func @reading_scf_for(%t1: tensor {bufferization.writable = true}, + %s: index, %v: vector<5xf32>) -> (tensor, vector<5xf32>) { + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.0 : f32 + + // Write to %t1. + // CHECK: vector.transfer_write + // CHECK-SAME: __inplace_operands_attr__ = ["none", "false", "none"] + %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor + + // Read the old value of %t1 inside the loop via an alias. + // CHECK: scf.for {{.*}} { + %r, %v3 = scf.for %i = %c0 to %s step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor, vector<5xf32>) { + // CHECK: tensor.extract_slice + // CHECK-SAME: __inplace_operands_attr__ = ["true", "none", "none"] + %e = tensor.extract_slice %t2[%s][%s][1] : tensor to tensor + + // Read from %t1 via alias %e. + %v2 = vector.transfer_read %e[%s], %cst : tensor, vector<5xf32> + scf.yield %t2, %v2 : tensor, vector<5xf32> + } + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true", "none"]} + + // Use %t3 in some way without reading it, so that it does not get DCE'd. 
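+ // The payload below ignores its block argument and yields a constant, so
+ // the outs operand is written but never read.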
+ // CHECK: linalg.generic + // CHECK-SAME: __inplace_operands_attr__ = ["true"] + %o = linalg.generic #trait outs (%t3 : tensor) { + ^bb(%0: f32) : + linalg.yield %cst : f32 + } -> (tensor) + + return %o, %v3 : tensor, vector<5xf32> +} + +// ----- + +#accesses = [ + affine_map<(i) -> (i)> +] +#trait = { + indexing_maps = #accesses, + iterator_types = ["parallel"] +} + +// CHECK-LABEL: func @non_reading_scf_for +func.func @non_reading_scf_for(%t1: tensor {bufferization.writable = true}, + %s: index, %v: vector<5xf32>) -> (tensor, vector<5xf32>) { + + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %cst = arith.constant 0.0 : f32 + + // Write to %t1. + // CHECK: vector.transfer_write + // CHECK-SAME: __inplace_operands_attr__ = ["none", "true", "none"] + %t3 = vector.transfer_write %v, %t1[%s] : vector<5xf32>, tensor + + // This loop does not read from %t1. It only writes to it. + // CHECK: scf.for + %r, %v3 = scf.for %i = %c0 to %s step %c1 iter_args(%t2 = %t1, %v0 = %v) -> (tensor, vector<5xf32>) { + // Write to %t1 via %t2. (Overwrite %t3.) + // CHECK: linalg.generic + // CHECK-SAME: __inplace_operands_attr__ = ["true"] + %o2 = linalg.generic #trait outs (%t2 : tensor) { + ^bb(%0: f32) : + linalg.yield %cst : f32 + } -> (tensor) + + // Read overwritten value. This is not a read of %t1. + %v2 = vector.transfer_read %o2[%s], %cst : tensor, vector<5xf32> + scf.yield %o2, %v2 : tensor, vector<5xf32> + } + + // Use %t3 in some way without reading it, so that it does not get DCE'd. + // CHECK: linalg.generic + // CHECK-SAME: __inplace_operands_attr__ = ["true"] + %o = linalg.generic #trait outs (%t3 : tensor) { + ^bb(%0: f32) : + linalg.yield %cst : f32 + } -> (tensor) + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0, -1] + return %o, %v3 : tensor, vector<5xf32> +} + +// ----- + +//===----------------------------------------------------------------------===// +// scf.if cases +//===----------------------------------------------------------------------===// + +// This example passes analysis, but it fails when bufferizing. 
+// CHECK-LABEL: func @scf_if_inplace1 +func.func @scf_if_inplace1(%t1: tensor {bufferization.writable = true}, + %t2: tensor {bufferization.writable = true}, + %cond: i1) -> tensor { + %r = scf.if %cond -> (tensor) { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t1 : tensor + } else { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t2 : tensor + } + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_inplace2 +func.func @scf_if_inplace2(%t1: tensor {bufferization.writable = true}, + %v: vector<5xf32>, %idx: index, + %cond: i1) -> tensor { + %r = scf.if %cond -> (tensor) { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t1 : tensor + } else { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor + scf.yield %t2 : tensor + } + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0] + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_inplace3 +func.func @scf_if_inplace3(%t1: tensor {bufferization.writable = true}, + %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index, + %cond: i1) -> tensor { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] + %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor + %r = scf.if %cond -> (tensor) { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t2 = vector.transfer_write %v1, %e[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t2 : tensor + } else { + // Writing the same tensor through an alias. This is OK. + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t3 : tensor + } + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_in_place4 +func.func @scf_if_in_place4(%t1: tensor {bufferization.writable = true}, + %v: vector<5xf32>, %idx: index, + %cond: i1, %cond2: i1) -> (tensor, vector<10xf32>) { + %cst = arith.constant 0.0 : f32 + %r = scf.if %cond -> (tensor) { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t1 : tensor + } else { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t2 : tensor + } + %r_alias = scf.if %cond2 -> (tensor) { + // Reading %r is OK. No conflict. 
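+ // %r is not written again after this point, so forwarding it through both
+ // branches cannot conflict with the earlier in-place write.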
+ // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %r : tensor + } else { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %r : tensor + } + %v2 = vector.transfer_read %r_alias[%idx], %cst : tensor, vector<10xf32> + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0, -1] + return %r_alias, %v2 : tensor, vector<10xf32> +} + +// ----- + +// CHECK-LABEL: func @scf_if_inplace5 +func.func @scf_if_inplace5(%t1: tensor {bufferization.writable = true}, + %idx: index, %cond: i1) -> tensor { + %r = scf.if %cond -> (tensor) { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] + %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %e : tensor + } else { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] + %f = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %f : tensor + } + + // Inserting into an equivalent tensor at the same offset. This bufferizes + // inplace. + // CHECK: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"] + %r2 = tensor.insert_slice %r into %t1[%idx][%idx][1] : tensor into tensor + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0] + return %r2 : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_inplace6 +func.func @scf_if_inplace6(%t1: tensor {bufferization.writable = true}, + %v1: vector<5xf32>, %v2: vector<5xf32>, + %v3: vector<5xf32>, %idx: index, + %cond: i1, %cond2: i1) -> tensor { + // Test nested scf.if ops. 
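+ // All three vector.transfer_writes below write into %t1, and the original
+ // value of %t1 is never read afterwards, so every write can bufferize in
+ // place.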
+ %r = scf.if %cond -> (tensor) { + %t2 = scf.if %cond2 -> (tensor) { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t3 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t3 : tensor + } else { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t4 = vector.transfer_write %v3, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t4 : tensor + } + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t2 : tensor + } else { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t3 : tensor + } + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0] + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_inplace7 +func.func @scf_if_inplace7(%t1: tensor {bufferization.writable = true}, + %v1: vector<5xf32>, %v2: vector<5xf32>, %idx: index, + %idx2: index, %cond: i1) -> (tensor, vector<5xf32>) { + %cst = arith.constant 0.0 : f32 + %r, %v_r2 = scf.if %cond -> (tensor, vector<5xf32>) { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"] + %t2 = vector.transfer_write %v1, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]} + scf.yield %t2, %v1 : tensor, vector<5xf32> + } else { + // Writing the same tensor through an alias. + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"] + %t3 = vector.transfer_write %v2, %t1[%idx] : vector<5xf32>, tensor + // Read the original value of %t1. This requires the write in this branch + // to be out-of-place. But the write in the other branch can still be + // inplace. + %v_r = vector.transfer_read %t1[%idx2], %cst : tensor, vector<5xf32> + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]} + scf.yield %t3, %v_r : tensor, vector<5xf32> + } + return %r, %v_r2 : tensor, vector<5xf32> +} + +// ----- + +// CHECK-LABEL: func @scf_if_out_of_place1a +func.func @scf_if_out_of_place1a(%t1: tensor {bufferization.writable = true}, + %idx: index, %idx2: index, + %cond: i1) -> tensor { + %r = scf.if %cond -> (tensor) { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none", "none"] + %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %e : tensor + } else { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t1 : tensor + } + + // Reading from and writing to the same tensor via different args. This is a + // conflict. 
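+ // Because one branch yields %t1 itself, %r may alias %t1; writing the slice
+ // into %t1 in place could clobber data still read through %r, so the
+ // destination operand is bufferized out of place ("false" below).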
+ // CHECK: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none", "none"] + %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor into tensor + return %r2 : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_out_of_place1b +func.func @scf_if_out_of_place1b(%t1: tensor {bufferization.writable = true}, + %idx: index, %idx2: index, %idx3: index, + %cond: i1) -> tensor { + %r = scf.if %cond -> (tensor) { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] + %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %e : tensor + } else { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] + %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %f : tensor + } + + // Reading from and writing to the same tensor via different args. This is a + // conflict. In contrast to scf_if_out_of_place1a, the fact that %r aliases + // with %t1 is only detected when analyzing the tensor.extract_slices. That's + // why the tensor.insert_slice is inplace and the two extract_slices are + // out-of-place. + // CHECK: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"] + %r2 = tensor.insert_slice %r into %t1[%idx3][%idx3][1] : tensor into tensor + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0] + return %r2 : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_out_of_place1c +func.func @scf_if_out_of_place1c(%t1: tensor {bufferization.writable = true}, + %idx: index, %idx2: index, %cond: i1) -> tensor { + %r = scf.if %cond -> (tensor) { + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] + %e = tensor.extract_slice %t1[%idx][%idx][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %e : tensor + } else { + // TODO: This one could bufferize inplace, but the analysis is too restrictive. + // CHECK: tensor.extract_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none", "none"] + %f = tensor.extract_slice %t1[%idx2][%idx2][1] : tensor to tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %f : tensor + } + + // CHECK: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"] + %r2 = tensor.insert_slice %r into %t1[%idx2][%idx2][1] : tensor into tensor + + // CHECK: return + // CHECK-SAME: __equivalent_func_args__ = [0] + return %r2 : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_out_of_place2 +func.func @scf_if_out_of_place2(%t1: tensor {bufferization.writable = true}, + %v: vector<5xf32>, %idx: index, + %cond: i1) -> (tensor, vector<10xf32>) { + %cst = arith.constant 0.0 : f32 + %r = scf.if %cond -> (tensor) { + scf.yield %t1 : tensor + } else { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"] + %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t2 : tensor + } + + // Read the old value of %t1. Forces the transfer_write to bufferize + // out-of-place. 
+ %v2 = vector.transfer_read %t1[%idx], %cst : tensor, vector<10xf32> + return %r, %v2 : tensor, vector<10xf32> +} + +// ----- + +// CHECK-LABEL: func @scf_if_out_of_place3 +func.func @scf_if_out_of_place3(%t1: tensor {bufferization.writable = true}, + %v: vector<5xf32>, %idx: index, + %cond: i1, %cond2: i1) -> (tensor, vector<10xf32>) { + %cst = arith.constant 0.0 : f32 + %r = scf.if %cond -> (tensor) { + scf.yield %t1 : tensor + } else { + // CHECK: vector.transfer_write + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "false", "none"] + %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t2 : tensor + } + %t1_alias = scf.if %cond2 -> (tensor) { + // scf.yield bufferizes to a read. That is a conflict in this example. + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t1 : tensor + } else { + // CHECK: scf.yield + // CHECK-SAME: {__inplace_operands_attr__ = ["true"]} + scf.yield %t1 : tensor + } + %v2 = vector.transfer_read %t1_alias[%idx], %cst : tensor, vector<10xf32> + return %r, %v2 : tensor, vector<10xf32> +} + +// ----- + +// CHECK-LABEL: func @write_to_same_tensor_in_loop_in_place( +func.func @write_to_same_tensor_in_loop_in_place( + %A : tensor {linalg.inplaceable = true}, + %lb : index, %ub : index, %step : index, %sz: index) + -> (tensor) +{ + // CHECK: scf.for {{.*}} { + %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { + %B = linalg.init_tensor [%sz] : tensor + %i2 = arith.index_cast %i : index to i32 + %i3 = arith.sitofp %i2 : i32 to f32 + // The tensor.insert is in-place because the %B is defined inside the loop. + // CHECK: tensor.insert + // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true", "none"]} + %B2 = tensor.insert %i3 into %B[%i] : tensor + // CHECK: tensor.insert_slice + // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none", "none"]} + %A2 = tensor.insert_slice %B2 into %t[%i][%sz][1] : tensor into tensor + scf.yield %A2 : tensor + } + // CHECK: } {__inplace_operands_attr__ = ["none", "none", "none", "true"]} + + return %r0 : tensor +} diff --git a/mlir/test/Dialect/SCF/one-shot-bufferize.mlir b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Dialect/SCF/one-shot-bufferize.mlir @@ -0,0 +1,330 @@ +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs bufferize-function-boundaries" -split-input-file | FileCheck %s + +// Run fuzzer with different seeds. +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=23 bufferize-function-boundaries" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=59 bufferize-function-boundaries" -split-input-file -o /dev/null +// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs test-analysis-only analysis-fuzzer-seed=91 bufferize-function-boundaries" -split-input-file -o /dev/null + +// Test bufferization using memref types that have no layout map. 
+// RUN: mlir-opt %s -one-shot-bufferize="allow-return-allocs fully-dynamic-layout-maps=0 bufferize-function-boundaries" -split-input-file -o /dev/null + +// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK-LABEL: func @scf_for_yield_only( +// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref, +// CHECK-SAME: %[[t:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: ) -> memref { +func.func @scf_for_yield_only( + %A : tensor {bufferization.writable = false}, + %B : tensor {bufferization.writable = true}, + %lb : index, %ub : index, %step : index) + -> (tensor, tensor) +{ + // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc + // CHECK: memref.copy %[[A]], %[[ALLOC_FOR_A]] + + // The first scf.for remains but just turns into dead code. + %r0 = scf.for %i = %lb to %ub step %step iter_args(%t = %A) -> (tensor) { + scf.yield %t : tensor + } + + // The second scf.for remains but just turns into dead code. + %r1 = scf.for %i = %lb to %ub step %step iter_args(%t = %B) -> (tensor) { + scf.yield %t : tensor + } + + // CHECK: return %[[ALLOC_FOR_A]] : memref + // CHECK-NOT: dealloc + return %r0, %r1: tensor, tensor +} + +// ----- + +// Ensure that the function bufferizes without error. This tests pre-order +// traversal of scf.for loops during bufferization. No need to check the IR, +// just want to make sure that it does not crash. + +// CHECK-LABEL: func @nested_scf_for +func.func @nested_scf_for(%A : tensor {bufferization.writable = true}, + %v : vector<5xf32>) -> tensor { + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %r1 = scf.for %i = %c0 to %c10 step %c1 iter_args(%B = %A) -> tensor { + %r2 = scf.for %j = %c0 to %c10 step %c1 iter_args(%C = %B) -> tensor { + %w = vector.transfer_write %v, %C[%c0] : vector<5xf32>, tensor + scf.yield %w : tensor + } + scf.yield %r2 : tensor + } + return %r1 : tensor +} + +// ----- + +// CHECK-DAG: #[[$map_1d_dyn:.*]] = affine_map<(d0)[s0, s1] -> (d0 * s1 + s0)> + +// CHECK-LABEL: func @scf_for_with_tensor.insert_slice +// CHECK-SAME: %[[A:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: %[[B:[a-zA-Z0-9]*]]: memref +// CHECK-SAME: %[[C:[a-zA-Z0-9]*]]: memref<4xf32, #[[$map_1d_dyn]]> +func.func @scf_for_with_tensor.insert_slice( + %A : tensor {bufferization.writable = false}, + %B : tensor {bufferization.writable = true}, + %C : tensor<4xf32> {bufferization.writable = false}, + %lb : index, %ub : index, %step : index) + -> (tensor, tensor) +{ + // CHECK: %[[ALLOC_FOR_A:.*]] = memref.alloc + // CHECK: memref.copy %[[A]], %[[ALLOC_FOR_A]] + + // CHECK: %[[svA:.*]] = memref.subview %[[ALLOC_FOR_A]][0] [4] [1] + // CHECK: %[[svB:.*]] = memref.subview %[[B]][0] [4] [1] + + // CHECK: scf.for {{.*}} + // CHECK-NOT: iter_args + %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B) + -> (tensor, tensor) + { + // %ttA bufferizes to direct copy of %BUFFER_CAST_C into %svA + // CHECK: memref.copy %[[C]], %[[svA]] + %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor + + // %ttB bufferizes to direct copy of %BUFFER_CAST_C into %BUFFER_CAST_B + // CHECK: memref.copy %[[C]], %[[svB]] + %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor + + // CHECK-NOT: scf.yield + scf.yield %ttA, %ttB : tensor, tensor + } + + // CHECK: return %[[ALLOC_FOR_A]] : memref + return %r0#0, %r0#1: tensor, tensor +} + +// ----- + +// CHECK-LABEL: func @execute_region_with_conflict( +// CHECK-SAME: %[[m1:.*]]: memref {bufferization.writable = "true"}) + -> (f32, tensor, f32) +{ + 
%f1 = arith.constant 0.0 : f32 + %idx = arith.constant 7 : index + + // scf.execute_region is canonicalized away after bufferization. So just the + // memref.store is left over. + + // CHECK: %[[alloc:.*]] = memref.alloc + // CHECK: memref.copy %[[m1]], %[[alloc]] + // CHECK: memref.store %{{.*}}, %[[alloc]][%{{.*}}] + %0, %1, %2 = scf.execute_region -> (f32, tensor, f32) { + %t2 = tensor.insert %f1 into %t1[%idx] : tensor + scf.yield %f1, %t2, %f1 : f32, tensor, f32 + } + + // CHECK: %[[casted:.*]] = memref.cast %[[alloc]] + // CHECK: %[[load:.*]] = memref.load %[[m1]] + %3 = tensor.extract %t1[%idx] : tensor + + // CHECK: return %{{.*}}, %[[casted]], %[[load]] : f32, memref, f32 + return %0, %1, %3 : f32, tensor, f32 +} + +// ----- + +// CHECK-LABEL: func @scf_if_inplace( +// CHECK-SAME: %[[cond:.*]]: i1, %[[t1:.*]]: memref, %[[v:.*]]: vector +func.func @scf_if_inplace(%cond: i1, + %t1: tensor {bufferization.writable = true}, + %v: vector<5xf32>, %idx: index) -> tensor { + + // CHECK: scf.if %[[cond]] { + // CHECK-NEXT: } else { + // CHECK-NEXT: vector.transfer_write %[[v]], %[[t1]] + // CHECK-NEXT: } + // CHECK-NEXT: return + %r = scf.if %cond -> (tensor) { + scf.yield %t1 : tensor + } else { + %t2 = vector.transfer_write %v, %t1[%idx] : vector<5xf32>, tensor + scf.yield %t2 : tensor + } + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_inside_scf_for +// CHECK-DAG: %[[c0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[c1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[c10:.*]] = arith.constant 10 : index +// CHECK: scf.for %{{.*}} = %[[c0]] to %[[c10]] step %[[c1]] { +// CHECK: scf.if %{{.*}} { +// CHECK: } else { +// CHECK: vector.transfer_write +// CHECK: } +// CHECK: } +func.func @scf_if_inside_scf_for( + %t1: tensor {bufferization.writable = true}, + %v: vector<5xf32>, %idx: index, + %cond: i1) + -> tensor +{ + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %c10 = arith.constant 10 : index + %r = scf.for %iv = %c0 to %c10 step %c1 iter_args(%bb = %t1) -> (tensor) { + %r2 = scf.if %cond -> (tensor) { + scf.yield %bb : tensor + } else { + %t2 = vector.transfer_write %v, %bb[%idx] : vector<5xf32>, tensor + scf.yield %t2 : tensor + } + scf.yield %r2 : tensor + } + return %r : tensor +} + +// ----- + +// CHECK-LABEL: func @scf_if_non_equiv_yields( +// CHECK-SAME: %[[cond:.*]]: i1, %[[A:.*]]: memref<{{.*}}>, %[[B:.*]]: memref<{{.*}}>) -> memref<{{.*}}> +func.func @scf_if_non_equiv_yields( + %b : i1, + %A : tensor<4xf32> {bufferization.writable = false}, + %B : tensor<4xf32> {bufferization.writable = false}) + -> tensor<4xf32> +{ + // CHECK: %[[r:.*]] = arith.select %[[cond]], %[[A]], %[[B]] + %r = scf.if %b -> (tensor<4xf32>) { + scf.yield %A : tensor<4xf32> + } else { + scf.yield %B : tensor<4xf32> + } + // CHECK: return %[[r]] + return %r: tensor<4xf32> +} + +// ----- + +// Note: This bufferization is inefficient, but it bufferizes correctly. 
+
+// CHECK-LABEL: func @scf_execute_region_yield_non_equivalent(
+// CHECK: %[[alloc:.*]] = memref.alloc(%{{.*}})
+// CHECK: %[[clone:.*]] = bufferization.clone %[[alloc]]
+// CHECK: memref.dealloc %[[alloc]]
+// CHECK: %[[r:.*]] = memref.load %[[clone]][%{{.*}}]
+// CHECK: memref.dealloc %[[clone]]
+// CHECK: return %[[r]]
+func.func @scf_execute_region_yield_non_equivalent(%i: index, %j: index) -> f32 {
+  %r = scf.execute_region -> (tensor) {
+    %t2 = linalg.init_tensor [%i] : tensor
+    scf.yield %t2 : tensor
+  }
+  %f = tensor.extract %r[%j] : tensor
+  return %f : f32
+}
+
+// -----
+
+// Note: This bufferizes to inefficient code, but bufferization should not see
+// such IR in the first place. The iter_arg would canonicalize away. This test
+// case is just to ensure that the bufferization generates correct code.
+
+// CHECK-LABEL: func @scf_for_yield_non_equivalent(
+// CHECK-SAME: %[[t:.*]]: memref
+func.func @scf_for_yield_non_equivalent(
+    %t: tensor, %lb : index, %ub : index, %step : index) -> tensor {
+  %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor {
+    scf.yield %t : tensor
+  }
+
+  return %r : tensor
+}
+
+// -----
+
+// Note: This bufferizes to inefficient code, but bufferization should not see
+// such IR in the first place. The iter_arg would canonicalize away. This test
+// case is just to ensure that the bufferization generates correct code.
+
+// CHECK-LABEL: func @scf_for_yield_allocation(
+// CHECK-SAME: %[[t:.*]]: memref
+func.func @scf_for_yield_allocation(
+    %t: tensor, %lb : index, %ub : index,
+    %step : index) -> tensor {
+  %r = scf.for %i = %lb to %ub step %step iter_args(%a = %t) -> tensor {
+    %t2 = linalg.init_tensor [%i] : tensor
+    scf.yield %t2 : tensor
+  }
+
+  return %r : tensor
+}
+
+// -----
+
+// TODO: The scf.yield could bufferize to 1 alloc and 2 copies (instead of
+// 2 allocs and 2 copies).
+
+// CHECK-LABEL: func @scf_for_swapping_yields(
+// CHECK-SAME: %[[A:.*]]: memref, %[[B:.*]]: memref
+func.func @scf_for_swapping_yields(
+    %A : tensor, %B : tensor {bufferization.writable = true},
+    %C : tensor<4xf32>, %lb : index, %ub : index, %step : index)
+  -> (f32, f32)
+{
+// CHECK-DAG: %[[clone1:.*]] = bufferization.clone %[[A]]
+// CHECK-DAG: %[[clone2:.*]] = bufferization.clone %[[B]]
+// CHECK: %[[for:.*]]:2 = scf.for {{.*}} iter_args(%[[iter1:.*]] = %[[clone1]], %[[iter2:.*]] = %[[clone2]])
+  %r0:2 = scf.for %i = %lb to %ub step %step iter_args(%tA = %A, %tB = %B)
+      -> (tensor, tensor)
+  {
+// CHECK: %[[sv1:.*]] = memref.subview %[[iter1]]
+// CHECK: memref.copy %{{.*}}, %[[sv1]]
+    %ttA = tensor.insert_slice %C into %tA[0][4][1] : tensor<4xf32> into tensor
+// CHECK: %[[sv2:.*]] = memref.subview %[[iter2]]
+// CHECK: memref.copy %{{.*}}, %[[sv2]]
+    %ttB = tensor.insert_slice %C into %tB[0][4][1] : tensor<4xf32> into tensor
+
+// CHECK: %[[alloc2:.*]] = memref.alloc(%{{.*}})
+// CHECK: memref.copy %[[iter2]], %[[alloc2]]
+// CHECK: memref.dealloc %[[iter2]]
+// CHECK: %[[alloc1:.*]] = memref.alloc(%{{.*}})
+// CHECK: memref.copy %[[iter1]], %[[alloc1]]
+// CHECK: memref.dealloc %[[iter1]]
+// CHECK: %[[casted1:.*]] = memref.cast %[[alloc1]]
+// CHECK: %[[casted2:.*]] = memref.cast %[[alloc2]]
+// CHECK: scf.yield %[[casted2]], %[[casted1]]
+    // Yield tensors in different order.
+ scf.yield %ttB, %ttA : tensor, tensor + } + +// CHECK: %[[r0:.*]] = memref.load %[[for]]#0 +// CHECK: memref.dealloc %[[for]]#0 +// CHECK: %[[r1:.*]] = memref.load %[[for]]#1 +// CHECK: memref.dealloc %[[for]]#1 + %f0 = tensor.extract %r0#0[%step] : tensor + %f1 = tensor.extract %r0#1[%step] : tensor +// CHECK: return %[[r0]], %[[r1]] + return %f0, %f1: f32, f32 +}