diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h
@@ -145,6 +145,9 @@
 Value allocDenseTensor(OpBuilder &builder, Location loc,
                        RankedTensorType tensorTp, ValueRange sizes);
 
+/// Generates code to deallocate a dense buffer.
+void deallocDenseTensor(OpBuilder &builder, Location loc, Value buffer);
+
 /// Generates the code to read the value from tensor[ivs]. The generated code
 /// looks like the following and the insertion point after this routine is
 /// inside the if-then branch behind the assignment to ind.
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.cpp
@@ -947,6 +947,11 @@
   return mem;
 }
 
+void mlir::sparse_tensor::deallocDenseTensor(OpBuilder &builder, Location loc,
+                                             Value buffer) {
+  builder.create<memref::DeallocOp>(loc, buffer);
+}
+
 Value mlir::sparse_tensor::genValueForDense(OpBuilder &builder, Location loc,
                                             Value tensor, ValueRange ivs) {
   Value val = builder.create<tensor::ExtractOp>(loc, tensor, ivs);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -337,11 +337,6 @@
       .getResult(0);
 }
 
-/// Generates code to deallocate a dense buffer.
-static void deallocDenseTensor(OpBuilder &builder, Location loc, Value buffer) {
-  builder.create<memref::DeallocOp>(loc, buffer);
-}
-
 /// Converts a pointer to COO (from calls to iter->next()) into a vector of
 /// indices, apply (optional) `offset` on `offsetDim`.
 static SmallVector<Value, 4> loadIndices(OpBuilder &builder, Location loc,
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorRewriting.cpp
@@ -564,7 +564,10 @@
 
     SmallVector<Value, 4> sizes;
     sizesForTensor(rewriter, sizes, loc, srcTp, src);
+
     Value dst = allocDenseTensor(rewriter, loc, dstTp, sizes);
+    Block *insertionBlock = rewriter.getInsertionBlock();
+    bool noEscape = bufferization::allocationDoesNotEscape(op->getOpResult(0));
 
     rewriter.create<ForeachOp>(loc, src, llvm::None,
                                [&](OpBuilder &builder, Location loc,
@@ -575,6 +578,12 @@
                                });
 
     rewriter.replaceOpWithNewOp<bufferization::ToTensorOp>(op, dstTp, dst);
+
+    // Deallocate the buffer.
+    if (noEscape) {
+      rewriter.setInsertionPoint(insertionBlock->getTerminator());
+      deallocDenseTensor(rewriter, loc, dst);
+    }
     return success();
   }
 
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -9,7 +9,13 @@
 // RUN: mlir-cpu-runner -e entry -entry-point-result=void \
 // RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
-
+//
+// Do the same run, but now with direct IR generation.
+//
+// RUN: mlir-opt %s --sparse-compiler=enable-runtime-library=false | \
+// RUN: mlir-cpu-runner -e entry -entry-point-result=void \
+// RUN:  -shared-libs=%mlir_lib_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
 
 #CSR = #sparse_tensor.encoding<{
   dimLevelType = [ "dense", "compressed" ],
@@ -221,8 +227,10 @@
     //
     // Sanity check on nonzeros.
     //
-    // CHECK: ( 30.5, 4.2, 4.6, 7, 8, -1, -1, -1 )
-    // CHECK: ( 30.5, 4.2, 4.6, 7, 8, -1, -1, -1 )
+    // FIXME: bring this back once dense2sparse skips zeros
+    //
+    // C_HECK: ( 30.5, 4.2, 4.6, 7, 8 )
+    // C_HECK: ( 30.5, 4.2, 4.6, 7, 8 )
     //
     %val7 = sparse_tensor.values %7 : tensor<4x4xf64, #CSR> to memref<?xf64>
     %val8 = sparse_tensor.values %8 : tensor<4x4xf64, #DCSR> to memref<?xf64>