diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -104,6 +104,28 @@
   DO(C64, complex64) \
   DO(C32, complex32)
 
+// This x-macro calls `DO` on every pair of overhead and `V` types, given a
+// fixed overhead type `(INAME, I)`.
+#define MLIR_SPARSETENSOR_O_FOREVERY_V(INAME, I, DO) \
+  DO(INAME, I, F64, double) \
+  DO(INAME, I, F32, float) \
+  DO(INAME, I, F16, f16) \
+  DO(INAME, I, BF16, bf16) \
+  DO(INAME, I, I64, int64_t) \
+  DO(INAME, I, I32, int32_t) \
+  DO(INAME, I, I16, int16_t) \
+  DO(INAME, I, I8, int8_t) \
+  DO(INAME, I, C64, complex64) \
+  DO(INAME, I, C32, complex32)
+
+// This x-macro calls its argument on every pair of overhead and `V` types.
+#define MLIR_SPARSETENSOR_FOREVERY_O_V(DO) \
+  MLIR_SPARSETENSOR_O_FOREVERY_V(64, uint64_t, DO) \
+  MLIR_SPARSETENSOR_O_FOREVERY_V(32, uint32_t, DO) \
+  MLIR_SPARSETENSOR_O_FOREVERY_V(16, uint16_t, DO) \
+  MLIR_SPARSETENSOR_O_FOREVERY_V(8, uint8_t, DO) \
+  MLIR_SPARSETENSOR_O_FOREVERY_V(0, index_type, DO)
+
 constexpr bool isFloatingPrimaryType(PrimaryType valTy) {
   return PrimaryType::kF64 <= valTy && valTy <= PrimaryType::kBF16;
 }
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -249,6 +249,14 @@
     return tensor;
   }
 
+  /// Reads the COO tensor and stores the indices, values, and actual nnz
+  /// into the given buffers, which can hold at most `maxNnz` elements.
+  /// Returns whether the COO elements are sorted.
+  template <typename I, typename V>
+  bool readCOO(I *indices, V *values, uint64_t *actualNnz,
+               const uint64_t *dim2lvl, uint64_t maxNnz);
+
 private:
   /// Attempts to read a line from the file. Is private because there's
   /// no reason for client code to call it.
@@ -286,6 +294,13 @@
   void readCOOLoop(uint64_t lvlRank, detail::PermutationRef dim2lvl,
                    SparseTensorCOO<V> *lvlCOO);
 
+  /// The internal implementation of `readCOO`. We template over
+  /// `IsPattern` and `IsSymmetric` in order to perform LICM without
+  /// needing to duplicate the source code.
+  template <typename I, typename V, bool IsPattern, bool IsSymmetric>
+  bool readCOOLoop(I *indices, V *values, uint64_t *actualNnz,
+                   const uint64_t *dim2lvl, uint64_t maxNnz);
+
   /// Reads the MME header of a general sparse matrix of type real.
   void readMMEHeader();
 
@@ -365,6 +380,86 @@
   }
 }
 
+template <typename I, typename V>
+bool SparseTensorReader::readCOO(I *indices, V *values, uint64_t *actualNnz,
+                                 const uint64_t *dim2lvl, uint64_t maxNnz) {
+  assert(isValid() && "Attempt to readCOO() before readHeader()");
+  // Do some manual LICM, to avoid assertions in the for-loop.
+  const bool IsPattern = isPattern();
+  const bool IsSymmetric = (isSymmetric() && getRank() == 2);
+  bool isSorted;
+  if (IsPattern && IsSymmetric)
+    isSorted = readCOOLoop<I, V, true, true>(indices, values, actualNnz,
+                                             dim2lvl, maxNnz);
+  else if (IsPattern)
+    isSorted = readCOOLoop<I, V, true, false>(indices, values, actualNnz,
+                                              dim2lvl, maxNnz);
+  else if (IsSymmetric)
+    isSorted = readCOOLoop<I, V, false, true>(indices, values, actualNnz,
+                                              dim2lvl, maxNnz);
+  else
+    isSorted = readCOOLoop<I, V, false, false>(indices, values, actualNnz,
+                                               dim2lvl, maxNnz);
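+  // Each branch above instantiates its own copy of the loop, so the
+  // pattern/symmetric tests are evaluated once here rather than once per
+  // nonzero element.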
+  // Close the file and return isSorted.
+  closeFile();
+  return isSorted;
+}
+
+template <typename I, typename V, bool IsPattern, bool IsSymmetric>
+bool SparseTensorReader::readCOOLoop(I *indices, V *values, uint64_t *actualNnz,
+                                     const uint64_t *dim2lvl, uint64_t maxNnz) {
+  const uint64_t rank = getRank();
+  const uint64_t nnz = getNNZ();
+  detail::PermutationRef d2l(rank, dim2lvl);
+  std::vector<uint64_t> dimInd(rank);
+  // Assume unsorted for symmetric, since the mirrored elements are
+  // appended out of order.
+  bool isSorted = !IsSymmetric;
+  // We inline `readCOOElement` here in order to avoid redundant
+  // assertions, since they're guaranteed by the call to `isValid()`
+  // and the construction of `dimInd` above.
+  I *pIndices = indices;
+  V *pValues = values;
+  char *linePtr;
+  auto getOneElement = [&]() {
+    linePtr = readCOOIndices(dimInd.data());
+    d2l.pushforward(rank, dimInd.data(), pIndices);
+    *pValues = detail::readCOOValue<V, IsPattern>(&linePtr);
+  };
+  // We currently choose to deal with symmetric matrices by fully
+  // constructing them. In the future, we may want to make symmetry
+  // implicit for storage reasons.
+  auto addSymmetricDup = [&](uint64_t &n) {
+    if constexpr (IsSymmetric)
+      if (pIndices[0] != pIndices[1]) {
+        assert(n < maxNnz);
+        pIndices[2] = pIndices[1];
+        pIndices[3] = pIndices[0];
+        pValues[1] = pValues[0];
+        n++;
+      }
+  };
+  getOneElement();
+  uint64_t n = 1;
+  // Mirror the first element as well; the loop below only mirrors the
+  // elements it reads itself.
+  addSymmetricDup(n);
+  for (uint64_t k = 1; k < nnz; ++k) {
+    assert(n < maxNnz);
+    pIndices = indices + (n * rank);
+    pValues = values + n;
+    getOneElement();
+    if (isSorted) {
+      I *prev = pIndices - rank;
+      for (uint64_t d = 0; d < rank; ++d) {
+        if (prev[d] != pIndices[d]) {
+          isSorted = (prev[d] < pIndices[d]);
+          break;
+        }
+      }
+    }
+    n++;
+    addSymmetricDup(n);
+  }
+
+  *actualNnz = n;
+  return isSorted;
+}
+
 /// Writes the sparse tensor to `filename` in extended FROSTT format.
 template <typename V>
 inline void writeExtFROSTT(const SparseTensorCOO<V> &coo,
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
--- a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
@@ -283,6 +283,19 @@
 MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETNEXT)
 #undef DECL_GETNEXT
 
+/// Reads the sparse tensor and stores the indices, values, and actual nnz
+/// into the given memrefs. Returns whether the COO elements are sorted.
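+/// The indices memref must provide `rank` entries per element: the
+/// implementation asserts `size(iref) == rank * size(vref)` and uses the
+/// size of `vref` as the buffer capacity (`maxNnz`) passed to `readCOO`.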
+#define DECL_GETNEXT(INAME, I, VNAME, V) \
+  MLIR_CRUNNERUTILS_EXPORT bool \
+  _mlir_ciface_getSparseTensorReaderRead##INAME##VNAME( \
+      void *p, StridedMemRefType<I, 1> *iref, \
+      StridedMemRefType<V, 1> *vref, \
+      StridedMemRefType<index_type, 0> *nref, \
+      StridedMemRefType<index_type, 1> *dim2lvlRef);
+MLIR_SPARSETENSOR_FOREVERY_O_V(DECL_GETNEXT)
+#undef DECL_GETNEXT
+
 using SparseTensorWriter = std::ostream;
 
 /// Creates a SparseTensorWriter for outputting a sparse tensor to a file with
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -631,6 +631,35 @@
 MLIR_SPARSETENSOR_FOREVERY_V(IMPL_GETNEXT)
 #undef IMPL_GETNEXT
 
+#define IMPL_GETNEXT(INAME, I, VNAME, V) \
+  bool _mlir_ciface_getSparseTensorReaderRead##INAME##VNAME( \
+      void *p, StridedMemRefType<I, 1> *iref, \
+      StridedMemRefType<V, 1> *vref, \
+      StridedMemRefType<index_type, 0> *nref, \
+      StridedMemRefType<index_type, 1> *dim2lvlRef) { \
+    assert(p && iref && vref && nref && dim2lvlRef); \
+    auto &reader = *static_cast<SparseTensorReader *>(p); \
+    ASSERT_NO_STRIDE(iref); \
+    ASSERT_NO_STRIDE(vref); \
+    ASSERT_NO_STRIDE(dim2lvlRef); \
+    const uint64_t is = MEMREF_GET_USIZE(iref); \
+    const uint64_t vs = MEMREF_GET_USIZE(vref); \
+    const uint64_t rank = reader.getRank(); \
+    assert(is == vs * rank); \
+    const uint64_t ps = MEMREF_GET_USIZE(dim2lvlRef); \
+    assert(ps == rank); \
+    (void)is; \
+    (void)vs; \
+    (void)rank; \
+    (void)ps; \
+    I *indices = MEMREF_GET_PAYLOAD(iref); \
+    V *values = MEMREF_GET_PAYLOAD(vref); \
+    index_type *dim2lvl = MEMREF_GET_PAYLOAD(dim2lvlRef); \
+    index_type *n = MEMREF_GET_PAYLOAD(nref); \
+    return reader.readCOO(indices, values, n, dim2lvl, vs); \
+  }
+MLIR_SPARSETENSOR_FOREVERY_O_V(IMPL_GETNEXT)
+#undef IMPL_GETNEXT
+
 void *_mlir_ciface_newSparseTensorFromReader(
     void *p, StridedMemRefType<index_type, 1> *lvlSizesRef,
     StridedMemRefType<DimLevelType, 1> *lvlTypesRef,
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir
@@ -43,6 +43,9 @@
   func.func private @getSparseTensorReaderIsSymmetric(!TensorReader) -> (i1)
   func.func private @copySparseTensorReaderDimSizes(!TensorReader, memref<?xindex>)
     -> () attributes { llvm.emit_c_interface }
+  func.func private @getSparseTensorReaderRead0F32(!TensorReader,
+    memref<?xindex>, memref<?xf32>, memref<index>, memref<?xindex>)
+    -> (i1) attributes { llvm.emit_c_interface }
   func.func private @getSparseTensorReaderNextF32(!TensorReader,
     memref<?xindex>, memref<f32>) -> () attributes { llvm.emit_c_interface }
@@ -60,6 +63,14 @@
     return
   }
 
+  func.func @dumpi2(%arg0: memref<?xindex, strided<[?], offset: ?>>) {
+    %c0 = arith.constant 0 : index
+    %v = vector.transfer_read %arg0[%c0], %c0 :
+      memref<?xindex, strided<[?], offset: ?>>, vector<17xindex>
+    vector.print %v : vector<17xindex>
+    return
+  }
+
   func.func @dumpf(%arg0: memref<?xf32>) {
     %c0 = arith.constant 0 : index
     %d0 = arith.constant 0.0 : f32
@@ -70,39 +81,33 @@
 
   // Returns the indices and values of the tensor.
   func.func @readTensorFile(%tensor: !TensorReader)
-    -> (memref<?xindex>, memref<?xindex>, memref<?xf32>) {
+    -> (memref<?xindex>, memref<?xf32>, index, i1) {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
     %rank = call @getSparseTensorReaderRank(%tensor) : (!TensorReader) -> index
     %nnz = call @getSparseTensorReaderNNZ(%tensor) : (!TensorReader) -> index
 
     // Assume rank == 2.
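+    // The coordinates are packed into one flat buffer: element i occupies
+    // xs[2 * i] (dimension 0) and xs[2 * i + 1] (dimension 1).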
-    %x0s = memref.alloc(%nnz) : memref<?xindex>
-    %x1s = memref.alloc(%nnz) : memref<?xindex>
+    %isize = arith.muli %c2, %nnz : index
+    %xs = memref.alloc(%isize) : memref<?xindex>
     %vs = memref.alloc(%nnz) : memref<?xf32>
-    %indices = memref.alloc(%rank) : memref<?xindex>
-    %value = memref.alloca() : memref<f32>
-    scf.for %i = %c0 to %nnz step %c1 {
-      func.call @getSparseTensorReaderNextF32(%tensor, %indices, %value)
-        : (!TensorReader, memref<?xindex>, memref<f32>) -> ()
-      // TODO: can we use memref.subview to avoid the need for the %value
-      // buffer?
-      %v = memref.load %value[] : memref<f32>
-      memref.store %v, %vs[%i] : memref<?xf32>
-      %i0 = memref.load %indices[%c0] : memref<?xindex>
-      memref.store %i0, %x0s[%i] : memref<?xindex>
-      %i1 = memref.load %indices[%c1] : memref<?xindex>
-      memref.store %i1, %x1s[%i] : memref<?xindex>
-    }
-
-    // Release the resource for the indices.
-    memref.dealloc %indices : memref<?xindex>
-    return %x0s, %x1s, %vs : memref<?xindex>, memref<?xindex>, memref<?xf32>
+    %dim2lvl = memref.alloca(%c2) : memref<?xindex>
+    memref.store %c0, %dim2lvl[%c0] : memref<?xindex>
+    memref.store %c1, %dim2lvl[%c1] : memref<?xindex>
+    %n = memref.alloca() : memref<index>
+    %isSorted = func.call @getSparseTensorReaderRead0F32(%tensor, %xs, %vs, %n, %dim2lvl)
+      : (!TensorReader, memref<?xindex>, memref<?xf32>, memref<index>,
+         memref<?xindex>) -> (i1)
+    %nnz2 = memref.load %n[] : memref<index>
+    return %xs, %vs, %nnz2, %isSorted : memref<?xindex>, memref<?xf32>, index, i1
   }
 
   // Reads a COO tensor from the given file name and prints its content.
   func.func @readTensorFileAndDump(%fileName: !Filename) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
     %tensor = call @createSparseTensorReader(%fileName)
       : (!Filename) -> (!TensorReader)
     %rank = call @getSparseTensorReaderRank(%tensor) : (!TensorReader) -> index
@@ -116,18 +121,23 @@
     func.call @copySparseTensorReaderDimSizes(%tensor, %dimSizes)
      : (!TensorReader, memref<?xindex>) -> ()
     call @dumpi(%dimSizes) : (memref<?xindex>) -> ()
-    %x0s, %x1s, %vs = call @readTensorFile(%tensor)
-      : (!TensorReader) -> (memref<?xindex>, memref<?xindex>, memref<?xf32>)
-    call @dumpi(%x0s) : (memref<?xindex>) -> ()
-    call @dumpi(%x1s) : (memref<?xindex>) -> ()
+    %xs, %vs, %nnz2, %isSorted = call @readTensorFile(%tensor)
+      : (!TensorReader) -> (memref<?xindex>, memref<?xf32>, index, i1)
+    %x0s = memref.subview %xs[%c0][%nnz2][%c2]
+      : memref<?xindex> to memref<?xindex, strided<[?], offset: ?>>
+    %x1s = memref.subview %xs[%c1][%nnz2][%c2]
+      : memref<?xindex> to memref<?xindex, strided<[?], offset: ?>>
+    vector.print %isSorted : i1
+    vector.print %nnz2 : index
+    call @dumpi2(%x0s) : (memref<?xindex, strided<[?], offset: ?>>) -> ()
+    call @dumpi2(%x1s) : (memref<?xindex, strided<[?], offset: ?>>) -> ()
     call @dumpf(%vs) : (memref<?xf32>) -> ()
 
     // Release the resources.
     call @delSparseTensorReader(%tensor) : (!TensorReader) -> ()
     memref.dealloc %dimSizes : memref<?xindex>
-    memref.dealloc %x0s : memref<?xindex>
-    memref.dealloc %x1s : memref<?xindex>
+    memref.dealloc %xs : memref<?xindex>
     memref.dealloc %vs : memref<?xf32>
 
     return
@@ -184,6 +194,8 @@
   // CHECK: 17
   // CHECK: 0
   // CHECK: ( 4, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+  // CHECK: 1
+  // CHECK: 17
   // CHECK: ( 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 )
   // CHECK: ( 0, 126, 127, 254, 1, 253, 2, 0, 1, 3, 98, 126, 127, 128, 249, 253, 255 )
   // CHECK: ( -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17 )
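As a usage sketch (not part of the change): the C++ below shows how the new `readCOO` entry point can be driven directly. It assumes the pre-existing `SparseTensorReader` construction and `openFile()`/`readHeader()` sequence from `File.h`; the function name `loadMatrix` and the capacity bound are illustrative only.

#include <cassert>
#include <cstdint>
#include <vector>

#include "mlir/ExecutionEngine/SparseTensor/File.h"

using mlir::sparse_tensor::SparseTensorReader;

// Reads a rank-2 tensor into flat index/value buffers with an identity
// dim2lvl mapping. Returns whether the file contents were sorted.
bool loadMatrix(const char *filename, std::vector<uint64_t> &indices,
                std::vector<float> &values, uint64_t &nnz) {
  SparseTensorReader reader(filename);
  reader.openFile();
  reader.readHeader();
  const uint64_t rank = reader.getRank();
  assert(rank == 2 && "expected a matrix");
  // Symmetric inputs are expanded in full, so budget up to twice the
  // number of stored entries.
  const uint64_t maxNnz = 2 * reader.getNNZ();
  indices.resize(rank * maxNnz);
  values.resize(maxNnz);
  const uint64_t dim2lvl[2] = {0, 1}; // identity dim-to-level permutation
  // readCOO fills the buffers, stores the actual nnz, closes the file,
  // and reports whether the elements appeared in sorted order.
  return reader.readCOO(indices.data(), values.data(), &nnz, dim2lvl, maxNnz);
}

The factor of two in `maxNnz` mirrors the symmetric expansion performed in `readCOOLoop`, whose asserts check that the duplicated off-diagonal elements still fit in the buffers.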