diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
@@ -1071,6 +1071,59 @@
       rank, shape, perm.data(), sparse.data(), tensor);
 }
 
+/// Exports a sparse tensor in a COO-flavored layout built from plain C
+/// arrays. Every result is handed back through an output pointer:
+///
+///   p_rank:    receives the rank of the tensor
+///   p_nse:     receives the number of specified elements (usually nonzeros)
+///   p_shape:   receives an array with the dimension size for each rank
+///   p_values:  receives an "nse" array with the value of each element
+///   p_indices: receives a flat "nse x rank" array in which the indices of
+///              element i occupy positions [i * rank, (i + 1) * rank)
+///
+/// The three arrays are allocated here with new[] and are not released by
+/// this function. The input is a pointer to SparseTensorStorage<P, I, V>,
+/// typically returned from convertToMLIRSparseTensor.
+///
+//  TODO: Currently, values are copied from SparseTensorStorage to
+//  SparseTensorCOO, then to the output. We may want to reduce the number of
+//  copies.
+//
+//  TODO: for now f64 tensors only, no dim ordering, all dimensions compressed
+//
+void convertFromMLIRSparseTensor(void *tensor, uint64_t *p_rank,
+                                 uint64_t *p_nse, uint64_t **p_shape,
+                                 double **p_values, uint64_t **p_indices) {
+  using Storage = SparseTensorStorage<uint64_t, uint64_t, double>;
+  Storage *storage = static_cast<Storage *>(tensor);
+  const uint64_t rank = storage->getRank();
+  // Identity permutation handed to toCOO (no dim ordering for now).
+  std::vector<uint64_t> identity(rank);
+  std::iota(identity.begin(), identity.end(), 0);
+  SparseTensorCOO<double> *coo = storage->toCOO(identity.data());
+  const std::vector<Element<double>> &elements = coo->getElements();
+  const uint64_t nse = elements.size();
+
+  uint64_t *shape = new uint64_t[rank];
+  for (uint64_t d = 0; d < rank; ++d)
+    shape[d] = coo->getSizes()[d];
+
+  // Copy values and flatten the per-element indices back to back.
+  double *values = new double[nse];
+  uint64_t *indices = new uint64_t[rank * nse];
+  uint64_t *cursor = indices;
+  for (uint64_t e = 0; e < nse; ++e) {
+    values[e] = elements[e].value;
+    for (uint64_t d = 0; d < rank; ++d)
+      *cursor++ = elements[e].indices[d];
+  }
+  delete coo; // The intermediate COO copy has been fully copied out.
+  *p_rank = rank;
+  *p_nse = nse;
+  *p_shape = shape;
+  *p_values = values;
+  *p_indices = indices;
+}
 } // extern "C"
 
 #endif // MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS
diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_elementwise_add_sparse_output.py
@@ -0,0 +1,133 @@
+# RUN: SUPPORT_LIB=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext %PYTHON %s | FileCheck %s
+
+import ctypes
+import numpy as np
+import os
+import sys
+
+import mlir.all_passes_registration
+
+from mlir import ir
+from mlir import runtime as rt
+from mlir import execution_engine
+from mlir import passmanager
+from mlir.dialects import sparse_tensor as st
+from mlir.dialects import builtin
+from mlir.dialects.linalg.opdsl import lang as dsl
+
+_SCRIPT_PATH = os.path.dirname(os.path.abspath(__file__))
+sys.path.append(_SCRIPT_PATH)
+from tools import np_to_sparse_tensor as test_tools
+
+# TODO: Use linalg_structured_op to generate the kernel once it can handle
+# sparse tensor outputs.
+_KERNEL_STR = """
+#DCSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed" ]
+}>
+
+#trait_add_elt = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // A
+    affine_map<(i,j) -> (i,j)>,  // B
+    affine_map<(i,j) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel"],
+  doc = "X(i,j) = A(i,j) + B(i,j)"
+}
+
+func @sparse_add_elt(
+    %arga: tensor<3x4xf64, #DCSR>, %argb: tensor<3x4xf64, #DCSR>) -> tensor<3x4xf64, #DCSR> {
+  %c3 = arith.constant 3 : index
+  %c4 = arith.constant 4 : index
+  %argx = sparse_tensor.init [%c3, %c4] : tensor<3x4xf64, #DCSR>
+  %0 = linalg.generic #trait_add_elt
+    ins(%arga, %argb: tensor<3x4xf64, #DCSR>, tensor<3x4xf64, #DCSR>)
+    outs(%argx: tensor<3x4xf64, #DCSR>) {
+      ^bb(%a: f64, %b: f64, %x: f64):
+        %1 = arith.addf %a, %b : f64
+        linalg.yield %1 : f64
+  } -> tensor<3x4xf64, #DCSR>
+  return %0 : tensor<3x4xf64, #DCSR>
+}
+
+func @main(%ad: tensor<3x4xf64>, %bd: tensor<3x4xf64>) -> tensor<3x4xf64, #DCSR>
+  attributes { llvm.emit_c_interface } {
+  %a = sparse_tensor.convert %ad : tensor<3x4xf64> to tensor<3x4xf64, #DCSR>
+  %b = sparse_tensor.convert %bd : tensor<3x4xf64> to tensor<3x4xf64, #DCSR>
+  %0 = call @sparse_add_elt(%a, %b) : (tensor<3x4xf64, #DCSR>, tensor<3x4xf64, #DCSR>) -> tensor<3x4xf64, #DCSR>
+  return %0 : tensor<3x4xf64, #DCSR>
+}
+"""
+
+
+class _SparseCompiler:
+  """Sparse compiler passes."""
+
+  def __init__(self):
+    self.pipeline = (
+        f'sparsification,'
+        f'sparse-tensor-conversion,'
+        f'builtin.func(linalg-bufferize,convert-linalg-to-loops,convert-vector-to-scf),'
+        f'convert-scf-to-std,'
+        f'func-bufferize,'
+        f'tensor-constant-bufferize,'
+        f'builtin.func(tensor-bufferize,std-bufferize,finalizing-bufferize),'
+        f'convert-vector-to-llvm{{reassociate-fp-reductions=1 enable-index-optimizations=1}},'
+        f'lower-affine,'
+        f'convert-memref-to-llvm,'
+        f'convert-std-to-llvm,'
+        f'reconcile-unrealized-casts')
+
+  def __call__(self, module: ir.Module):
+    passmanager.PassManager.parse(self.pipeline).run(module)
+
+
+def _run_test(support_lib, kernel):
+  """Compiles, runs and checks results."""
+  module = ir.Module.parse(kernel)
+  _SparseCompiler()(module)
+  engine = execution_engine.ExecutionEngine(
+      module, opt_level=0, shared_libs=[support_lib])
+
+  # Set up numpy inputs and buffer for output.
+  a = np.array(
+      [[1.1, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 6.6, 0.0]],
+      np.float64)
+  b = np.array(
+      [[1.1, 0.0, 0.0, 2.8], [0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0]],
+      np.float64)
+
+  mem_a = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(a)))
+  mem_b = ctypes.pointer(ctypes.pointer(rt.get_ranked_memref_descriptor(b)))
+
+  # The sparse tensor output is a pointer to pointer of char.
+  out = ctypes.c_char(0)
+  mem_out = ctypes.pointer(ctypes.pointer(out))
+
+  # Invoke the kernel.
+  engine.invoke('main', mem_a, mem_b, mem_out)
+
+  # Retrieve and check the result.
+  rank, nse, shape, values, indices = test_tools.sparse_tensor_to_coo_tensor(
+      support_lib, mem_out[0], np.float64)
+
+  # CHECK: PASSED
+  np.allclose(rank, 2)
+  np.allclose(nse, 3)
+  np.allclose(shape, [3, 4])
+  np.allclose(values, [2.2, 2.8, 6.6])
+  np.allclose(indices, [[0, 0], [0, 3], [2, 2]])
+  print('PASSED')
+
+
+def test_elementwise_add():
+  # Obtain path to runtime support library.
+  support_lib = os.getenv('SUPPORT_LIB')
+  assert support_lib is not None, 'SUPPORT_LIB is undefined'
+  assert os.path.exists(support_lib), f'{support_lib} does not exist'
+  with ir.Context() as ctx, ir.Location.unknown():
+    _run_test(support_lib, _KERNEL_STR)
+
+
+test_elementwise_add()
diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/tools/lit.local.cfg b/mlir/test/Integration/Dialect/SparseTensor/python/tools/lit.local.cfg
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/python/tools/lit.local.cfg
@@ -0,0 +1,2 @@
+# Files in this directory are helper modules imported by tests, not tests.
+config.unsupported = True
diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py b/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/python/tools/np_to_sparse_tensor.py
@@ -0,0 +1,74 @@
+#  Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+#  See https://llvm.org/LICENSE.txt for license information.
+#  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#  This file contains functions to process sparse tensor outputs.
+
+import ctypes
+import functools
+import numpy as np
+
+
+@functools.lru_cache()
+def _get_c_shared_lib(lib_name: str):
+  """Loads and returns the requested C shared library.
+
+  Args:
+    lib_name: A string representing the C shared library.
+
+  Returns:
+    The C shared library.
+
+  Raises:
+    OSError: If there is any problem in loading the shared library.
+    ValueError:  If the shared library doesn't contain the needed routine.
+  """
+  # This raises OSError exception if there is any problem in loading the shared
+  # library.
+  c_lib = ctypes.CDLL(lib_name)
+
+  try:
+    c_lib.convertFromMLIRSparseTensor.restype = ctypes.c_void_p
+  except Exception as e:
+    raise ValueError('Missing function convertFromMLIRSparseTensor from '
+                     f'the C shared library: {e} ') from e
+
+  return c_lib
+
+
+def sparse_tensor_to_coo_tensor(support_lib, sparse, dtype):
+  """Converts a sparse tensor to COO-flavored format.
+
+  Args:
+     support_lib: A string for the supporting C shared library.
+     sparse: A ctypes.pointer to the sparse tensor descriptor.
+     dtype: The numpy data type for the tensor elements.
+
+  Returns:
+    A tuple that contains the following values:
+    rank: An integer for the rank of the tensor.
+    nse: An interger for the number of non-zero values in the tensor.
+    shape: A 1D numpy array of integers, for the shape of the tensor.
+    values: A 1D numpy array, for the non-zero values in the tensor.
+    indices: A 2D numpy array of integers, representing the indices for the
+      non-zero values in the tensor.
+
+  Raises:
+    OSError: If there is any problem in loading the shared library.
+    ValueError:  If the shared library doesn't contain the needed routine.
+  """
+  c_lib = _get_c_shared_lib(support_lib)
+
+  rank = ctypes.c_ulonglong(0)
+  nse = ctypes.c_ulonglong(0)
+  shape = ctypes.POINTER(ctypes.c_ulonglong)()
+  values = ctypes.POINTER(np.ctypeslib.as_ctypes_type(dtype))()
+  indices = ctypes.POINTER(ctypes.c_ulonglong)()
+  c_lib.convertFromMLIRSparseTensor(sparse, ctypes.byref(rank),
+                                    ctypes.byref(nse), ctypes.byref(shape),
+                                    ctypes.byref(values), ctypes.byref(indices))
+  # Convert the returned values to the corresponding numpy types.
+  shape = np.ctypeslib.as_array(shape, shape=[rank.value])
+  values = np.ctypeslib.as_array(values, shape=[nse.value])
+  indices = np.ctypeslib.as_array(indices, shape=[nse.value, rank.value])
+  return rank, nse, shape, values, indices