diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/LoopEmitter.cpp
@@ -15,6 +15,7 @@
 #include "mlir/Dialect/Linalg/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/Dialect/Tensor/IR/Tensor.h"
 
 using namespace mlir;
 using namespace mlir::sparse_tensor;
@@ -206,7 +207,14 @@
   Type elementType = rtp.getElementType();
   if (!enc) {
     // Non-annotated dense tensors.
-    auto denseTp = MemRefType::get(shape, elementType);
+    BaseMemRefType denseTp = MemRefType::get(shape, elementType);
+
+    // TODO: If we unconditionally use a fully dynamic layout here, it breaks
+    // some vectorization passes, which require a static stride of 1.
+    // Is it possible to run the vectorization pass after bufferization?
+    if (llvm::isa_and_nonnull<tensor::ExtractSliceOp>(tensor.getDefiningOp()))
+      denseTp = bufferization::getMemRefTypeWithFullyDynamicLayout(rtp);
+
     Value denseVal = builder.create<bufferization::ToMemrefOp>(loc, denseTp, tensor);
 
     // Dense outputs need special handling.
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_foreach_slices.mlir
@@ -70,23 +70,23 @@
     call @foreach_print_slice(%a) : (tensor<4x4xf64, #CSR_SLICE>) -> ()
     // FIXME: investigate why a tensor copy is inserted for this slice
-// %dense = tensor.extract_slice %sa[1, 1][4, 4][1, 2] : tensor<8x8xf64> to
-// tensor<4x4xf64>
-// %b = sparse_tensor.convert %dense : tensor<4x4xf64> to tensor<4x4xf64, #CSR>
-// // Foreach on sparse tensor instead of slice they should yield the same result.
-// //
-// // C_HECK-NEXT: 1
-// // C_HECK-NEXT: 0
-// // C_HECK-NEXT: 2.3
-// // C_HECK-NEXT: 2
-// // C_HECK-NEXT: 3
-// // C_HECK-NEXT: 1
-// // C_HECK-NEXT: 3
-// // C_HECK-NEXT: 2
-// // C_HECK-NEXT: 2.1
-// //
-// call @foreach_print_non_slice(%b) : (tensor<4x4xf64, #CSR>) -> ()
-// bufferization.dealloc_tensor %b : tensor<4x4xf64, #CSR>
+    %dense = tensor.extract_slice %sa[1, 1][4, 4][1, 2] : tensor<8x8xf64> to
+                                                          tensor<4x4xf64>
+    %b = sparse_tensor.convert %dense : tensor<4x4xf64> to tensor<4x4xf64, #CSR>
+    // Foreach on the sparse tensor instead of the slice; they should yield the same result.
+    //
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 0
+    // CHECK-NEXT: 2.3
+    // CHECK-NEXT: 2
+    // CHECK-NEXT: 3
+    // CHECK-NEXT: 1
+    // CHECK-NEXT: 3
+    // CHECK-NEXT: 2
+    // CHECK-NEXT: 2.1
+    //
+    call @foreach_print_non_slice(%b) : (tensor<4x4xf64, #CSR>) -> ()
+    bufferization.dealloc_tensor %b : tensor<4x4xf64, #CSR>
 
     bufferization.dealloc_tensor %tmp : tensor<8x8xf64, #CSR>
 
     return
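
For context: an extract_slice-defined tensor needs a different memref type because a
strided slice (the test uses strides [1, 2]) is not contiguous in memory, so bufferizing
it to the identity-layout memref<4x4xf64> forces a tensor copy, while a fully dynamic
strided layout lets bufferization.to_memref view the slice buffer in place. Below is a
minimal sketch contrasting the two types; it assumes the MLIR C++ API of this era, and
showLayouts and ctx are illustrative names, not part of the patch:

#include "mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/IR/MLIRContext.h"

using namespace mlir;

static void showLayouts(MLIRContext *ctx) {
  auto f64 = Float64Type::get(ctx);
  auto rtp = RankedTensorType::get({4, 4}, f64);

  // Identity layout: contiguous buffer, innermost stride statically 1.
  // Prints memref<4x4xf64>.
  MemRefType::get(rtp.getShape(), rtp.getElementType()).dump();

  // Fully dynamic layout: strides and offset become runtime values, so the
  // non-contiguous buffer behind a strided extract_slice can be used as-is.
  // Prints memref<4x4xf64, strided<[?, ?], offset: ?>>.
  bufferization::getMemRefTypeWithFullyDynamicLayout(rtp).dump();
}

Gating the dynamic layout on isa_and_nonnull<tensor::ExtractSliceOp> keeps the common
path on the identity layout, so vectorization passes that expect a static unit stride
(per the TODO in the patch) are unaffected.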