diff --git a/mlir/include/mlir/Dialect/Linalg/Passes.td b/mlir/include/mlir/Dialect/Linalg/Passes.td
--- a/mlir/include/mlir/Dialect/Linalg/Passes.td
+++ b/mlir/include/mlir/Dialect/Linalg/Passes.td
@@ -53,7 +53,11 @@
            "Use stack allocations for memrefs (for testing purposes only)">,
     Option<"analysisFuzzerSeed", "analysis-fuzzer-seed", "unsigned",
            /*default=*/"0",
-           "Analyze ops in random order with a given seed (fuzzer)">
+           "Analyze ops in random order with a given seed (fuzzer)">,
+    Option<"initTensorElimination", "init-tensor-elimination", "bool",
+           /*default=*/"false",
+           "(Experimental) Try to eliminate init_tensor operations that are "
+           "anchored at an insert_slice op">
   ];
   let constructor = "mlir::createLinalgComprehensiveModuleBufferizePass()";
 }
diff --git a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
--- a/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/ComprehensiveBufferizePass.cpp
@@ -92,8 +92,10 @@
   options->printConflicts = printConflicts;
 
   // Enable InitTensorOp elimination.
-  options->addPostAnalysisStep<
-      linalg_ext::InsertSliceAnchoredInitTensorEliminationStep>();
+  if (initTensorElimination) {
+    options->addPostAnalysisStep<
+        linalg_ext::InsertSliceAnchoredInitTensorEliminationStep>();
+  }
 
   if (!allowReturnMemref)
     options->addPostAnalysisStep();
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-init-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-init-tensor-elimination.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis-init-tensor-elimination.mlir
@@ -0,0 +1,55 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="test-analysis-only allow-return-memref init-tensor-elimination" -split-input-file | FileCheck %s
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// InitTensorOp elimination
+//===----------------------------------------------------------------------===//
+
+// CHECK-LABEL: func @buffer_forwarding_conflict
+func @buffer_forwarding_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]
+  // Instead of allocating, share buffer with some inplace bufferization?
+  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
+
+  // CHECK: linalg.fill
+  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
+  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]
+  %2 = tensor.insert_slice %1 into %arg0[0] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
+  %3 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-SAME: __equivalent_func_args__ = [-1, 0]
+  return %2, %3 : tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK-LABEL: func @buffer_forwarding_no_conflict
+func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
+  %cst = arith.constant 0.000000e+00 : f32
+  // CHECK: tensor.extract_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
+  // Instead of allocating, share buffer with some inplace bufferization?
+  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
+
+  // CHECK: linalg.fill
+  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
+  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: tensor.insert_slice
+  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
+  %2 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: return
+  // CHECK-SAME: __equivalent_func_args__ = [0, 0]
+  return %2, %2 : tensor<?xf32>, tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-analysis.mlir
@@ -1249,60 +1249,6 @@
 
 // -----
 
-//===----------------------------------------------------------------------===//
-// InitTensorOp elimination
-//===----------------------------------------------------------------------===//
-
-// CHECK-LABEL: func @buffer_forwarding_conflict
-func @buffer_forwarding_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["false", "none"]
-  // Instead of allocating, share buffer with some inplace bufferization?
-  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
-
-  // CHECK: linalg.fill
-  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
-  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "false", "none"]
-  %2 = tensor.insert_slice %1 into %arg0[0] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
-  %3 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: return
-  // CHECK-SAME: __equivalent_func_args__ = [-1, 0]
-  return %2, %3 : tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK-LABEL: func @buffer_forwarding_no_conflict
-func @buffer_forwarding_no_conflict(%arg0: tensor<?xf32> {linalg.inplaceable = true}, %arg1: index) -> (tensor<?xf32>, tensor<?xf32>) {
-  %cst = arith.constant 0.000000e+00 : f32
-  // CHECK: tensor.extract_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "none"]
-  // Instead of allocating, share buffer with some inplace bufferization?
-  %0 = linalg.init_tensor [%arg1] : tensor<?xf32>
-
-  // CHECK: linalg.fill
-  // CHECK-SAME: {__inplace_operands_attr__ = ["none", "true"]
-  %1 = linalg.fill(%cst, %0) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: tensor.insert_slice
-  // CHECK-SAME: {__inplace_operands_attr__ = ["true", "true", "none"]
-  %2 = tensor.insert_slice %1 into %arg0[42] [%arg1] [1] : tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: return
-  // CHECK-SAME: __equivalent_func_args__ = [0, 0]
-  return %2, %2 : tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
 //===----------------------------------------------------------------------===//
 // scf.if cases
 //===----------------------------------------------------------------------===//
@@ -1764,3 +1710,26 @@
   }
   return %1: tensor
 }
+
+// -----
+
+//===----------------------------------------------------------------------===//
+// InitTensorOp elimination would produce SSA violations for the example below.
+//===----------------------------------------------------------------------===//
+
+func @depthwise_conv_1d_nwc_wc(%arg0: index, %arg1: index, %arg2: tensor<8x18x32xf32>)
+  -> tensor<?x1x6x8xf32> {
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %c8 = arith.constant 8 : index
+  %0 = linalg.init_tensor [4, 1, 6, 8] : tensor<4x1x6x8xf32>
+  %1 = tensor.cast %0 : tensor<4x1x6x8xf32> to tensor<?x1x6x8xf32>
+  %2 = linalg.init_tensor [1, 6, 8] : tensor<1x6x8xf32>
+  %3 = scf.for %arg3 = %c0 to %c32 step %c8 iter_args(%arg4 = %1) -> (tensor<?x1x6x8xf32>) {
+    %4 = affine.apply affine_map<(d0) -> (d0 ceildiv 8)>(%arg3)
+    %5 = tensor.insert_slice %2 into %arg4[%4, 0, 0, 0] [1, 1, 6, 8] [1, 1, 1, 1] :
+      tensor<1x6x8xf32> into tensor<?x1x6x8xf32>
+    scf.yield %5 : tensor<?x1x6x8xf32>
+  }
+  return %3 : tensor<?x1x6x8xf32>
+}
\ No newline at end of file
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize-init-tensor-elimination.mlir
@@ -0,0 +1,64 @@
+// RUN: mlir-opt %s -linalg-comprehensive-module-bufferize="allow-return-memref init-tensor-elimination" -split-input-file | FileCheck %s
+
+// -----
+
+// CHECK: func @buffer_forwarding_conflict(
+// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
+func @buffer_forwarding_conflict(
+    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
+    %sz: index)
+  -> (tensor<?xf32>, tensor<?xf32>)
+{
+  %f0 = arith.constant 0.0: f32
+  // Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
+  // CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
+  // This allocs the whole dim to allow for a full clone of t.
+  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
+
+  // init_tensor itself does not alloc but forwards to the **second**
+  // insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
+  // extract_slice.
+  // CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
+  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
+  %a = linalg.init_tensor[%sz] : tensor<?xf32>
+
+  // CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref<?xf32>
+  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // CHECK: linalg.copy(%[[FUNC_ARG]], %[[ALLOC]]) : memref<?xf32>, memref<?xf32>
+  // CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1]
+  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]])
+  %r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
+
+  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]])
+  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
+
+  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
+}
+
+// -----
+
+// CHECK: func @buffer_forwarding_no_conflict(
+// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
+// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
+func @buffer_forwarding_no_conflict(
+    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
+    %sz: index)
+  -> (tensor<?xf32>)
+{
+  %f0 = arith.constant 0.0: f32
+
+  // init_tensor itself does not alloc but forwards to the insert_slice.
+  // InitTensorOp replaces the init_tensor with an inplace extract_slice.
+  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
+  %a = linalg.init_tensor[%sz] : tensor<?xf32>
+
+  // CHECK: linalg.fill({{.*}}, %[[T_SUBVIEW]])
+  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
+
+  // Self-copy canonicalizes away later.
+  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
+
+  return %r1: tensor<?xf32>
+}
diff --git a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
--- a/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
+++ b/mlir/test/Dialect/Linalg/comprehensive-module-bufferize.mlir
@@ -868,69 +868,6 @@
 
 // -----
 
-// CHECK: func @buffer_forwarding_conflict(
-// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
-func @buffer_forwarding_conflict(
-    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
-    %sz: index)
-  -> (tensor<?xf32>, tensor<?xf32>)
-{
-  %f0 = arith.constant 0.0: f32
-  // Alloc is needed for the **first** insert_slice (due to backward traversal during analysis).
-  // CHECK: %[[DIM:.*]] = memref.dim %[[FUNC_ARG]]
-  // This allocs the whole dim to allow for a full clone of t.
-  // CHECK: %[[ALLOC:.*]] = memref.alloc(%[[DIM]])
-
-  // init_tensor itself does not alloc but forwards to the **second**
-  // insert_slice. InitTensorOp replaces the init_tensor with an out-of-place
-  // extract_slice.
-  // CHECK: %[[EXTRACT_SLICE_ALLOC:.*]] = memref.alloc(%[[sz]])
-  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
-  %a = linalg.init_tensor[%sz] : tensor<?xf32>
-
-  // CHECK: linalg.fill({{.*}}, %[[EXTRACT_SLICE_ALLOC]]) : f32, memref<?xf32>
-  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // CHECK: linalg.copy(%[[FUNC_ARG]], %[[ALLOC]]) : memref<?xf32>, memref<?xf32>
-  // CHECK: %[[SV0_ALLOC:.*]] = memref.subview %[[ALLOC]][0] [%[[sz]]] [1]
-  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[SV0_ALLOC]])
-  %r0 = tensor.insert_slice %f into %t[0][%sz][1]: tensor<?xf32> into tensor<?xf32>
-
-  // CHECK: linalg.copy(%[[EXTRACT_SLICE_ALLOC]], %[[T_SUBVIEW]])
-  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
-
-  return %r0, %r1: tensor<?xf32>, tensor<?xf32>
-}
-
-// -----
-
-// CHECK: func @buffer_forwarding_no_conflict(
-// CHECK-SAME: %[[FUNC_ARG:[0-9a-zA-Z]*]]: memref<?xf32>
-// CHECK-SAME: %[[sz:[0-9a-zA-Z]*]]: index
-func @buffer_forwarding_no_conflict(
-    %t: tensor<?xf32> {linalg.buffer_layout = affine_map<(d0) -> (d0)>, linalg.inplaceable = true},
-    %sz: index)
-  -> (tensor<?xf32>)
-{
-  %f0 = arith.constant 0.0: f32
-
-  // init_tensor itself does not alloc but forwards to the insert_slice.
-  // InitTensorOp replaces the init_tensor with an inplace extract_slice.
-  // CHECK: %[[T_SUBVIEW:.*]] = memref.subview %[[FUNC_ARG]][42] [%[[sz]]] [1]
-  %a = linalg.init_tensor[%sz] : tensor<?xf32>
-
-  // CHECK: linalg.fill({{.*}}, %[[T_SUBVIEW]])
-  %f = linalg.fill(%f0, %a) : f32, tensor<?xf32> -> tensor<?xf32>
-
-  // Self-copy canonicalizes away later.
-  %r1 = tensor.insert_slice %f into %t[42][%sz][1]: tensor<?xf32> into tensor<?xf32>
-
-  return %r1: tensor<?xf32>
-}
-
-// -----
-
 // CHECK-LABEL: func @scf_if_inplace(
 // CHECK-SAME: %[[cond:.*]]: i1, %[[t1:.*]]: memref, %[[v:.*]]: vector
 func @scf_if_inplace(%cond: i1,