diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/Enums.h
@@ -92,6 +92,11 @@
 };
 
 // This x-macro includes all `V` types.
+// TODO: We currently split out the non-variadic version from the variadic
+// version. Using ##__VA_ARGS__ to avoid the split gives
+//   warning: token pasting of ',' and __VA_ARGS__ is a GNU extension
+//   [-Wgnu-zero-variadic-macro-arguments]
+// and __VA_OPT__(, ) __VA_ARGS__ requires C++20.
 #define MLIR_SPARSETENSOR_FOREVERY_V(DO) \
   DO(F64, double) \
   DO(F32, float) \
@@ -104,6 +109,27 @@
   DO(C64, complex64) \
   DO(C32, complex32)
 
+// This x-macro includes all `V` types and supports variadic arguments.
+#define MLIR_SPARSETENSOR_FOREVERY_V_VAR(DO, ...) \
+  DO(F64, double, __VA_ARGS__) \
+  DO(F32, float, __VA_ARGS__) \
+  DO(F16, f16, __VA_ARGS__) \
+  DO(BF16, bf16, __VA_ARGS__) \
+  DO(I64, int64_t, __VA_ARGS__) \
+  DO(I32, int32_t, __VA_ARGS__) \
+  DO(I16, int16_t, __VA_ARGS__) \
+  DO(I8, int8_t, __VA_ARGS__) \
+  DO(C64, complex64, __VA_ARGS__) \
+  DO(C32, complex32, __VA_ARGS__)
+
+// This x-macro calls its argument on every pair of overhead and `V` types.
+#define MLIR_SPARSETENSOR_FOREVERY_V_O(DO) \
+  MLIR_SPARSETENSOR_FOREVERY_V_VAR(DO, 64, uint64_t) \
+  MLIR_SPARSETENSOR_FOREVERY_V_VAR(DO, 32, uint32_t) \
+  MLIR_SPARSETENSOR_FOREVERY_V_VAR(DO, 16, uint16_t) \
+  MLIR_SPARSETENSOR_FOREVERY_V_VAR(DO, 8, uint8_t) \
+  MLIR_SPARSETENSOR_FOREVERY_V_VAR(DO, 0, index_type)
+
 constexpr bool isFloatingPrimaryType(PrimaryType valTy) {
   return PrimaryType::kF64 <= valTy && valTy <= PrimaryType::kBF16;
 }
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
--- a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h
@@ -249,6 +249,14 @@
     return tensor;
   }
 
+  /// Reads the COO tensor from the file, stores the coordinates and values to
+  /// the given buffers, and returns a boolean value to indicate whether the
+  /// COO elements are sorted.
+  /// Precondition: the buffers should have enough space to hold the elements.
+  template <typename C, typename V>
+  bool readToBuffers(uint64_t lvlRank, const uint64_t *dim2lvl,
+                     C *lvlCoordinates, V *values);
+
 private:
   /// Attempts to read a line from the file. Is private because there's
   /// no reason for client code to call it.
@@ -287,6 +295,13 @@
   void readCOOLoop(uint64_t lvlRank, detail::PermutationRef dim2lvl,
                    SparseTensorCOO<V> *lvlCOO);
 
+  /// The internal implementation of `readToBuffers`. We template over
+  /// `IsPattern` in order to perform LICM without needing to duplicate the
+  /// source code.
+  template <typename C, typename V, bool IsPattern>
+  bool readToBuffersLoop(uint64_t lvlRank, detail::PermutationRef dim2lvl,
+                         C *lvlCoordinates, V *values);
+
   /// Reads the MME header of a general sparse matrix of type real.
   void readMMEHeader();
@@ -351,6 +366,69 @@
   }
 }
 
+template <typename C, typename V>
+bool SparseTensorReader::readToBuffers(uint64_t lvlRank,
+                                       const uint64_t *dim2lvl,
+                                       C *lvlCoordinates, V *values) {
+  assert(isValid() && "Attempt to readToBuffers() before readHeader()");
+  const uint64_t dimRank = getRank();
+  assert(lvlRank == dimRank && "Rank mismatch");
+  detail::PermutationRef d2l(dimRank, dim2lvl);
+  // Do some manual LICM, to avoid assertions in the for-loop.
+  bool isSorted =
+      isPattern()
+          ? readToBuffersLoop<C, V, true>(lvlRank, d2l, lvlCoordinates, values)
+          : readToBuffersLoop<C, V, false>(lvlRank, d2l, lvlCoordinates,
+                                           values);
+
+  // Close the file and return isSorted.
+  closeFile();
+  return isSorted;
+}
+
+template <typename C, typename V, bool IsPattern>
+bool SparseTensorReader::readToBuffersLoop(uint64_t lvlRank,
+                                           detail::PermutationRef dim2lvl,
+                                           C *lvlCoordinates, V *values) {
+  const uint64_t dimRank = getRank();
+  const uint64_t nse = getNNZ();
+  std::vector<uint64_t> dimCoords(dimRank);
+  // Read the first element with isSorted=false as a way to avoid accessing its
+  // previous element.
+  bool isSorted = false;
+  char *linePtr;
+  // We inline `readCOOElement` here in order to avoid redundant assertions,
+  // since they're guaranteed by the call to `isValid()` and the construction
+  // of `dimCoords` above.
+  auto readElement = [&]() {
+    linePtr = readCOOIndices(dimCoords.data());
+    dim2lvl.pushforward(dimRank, dimCoords.data(), lvlCoordinates);
+    *values = detail::readCOOValue<V, IsPattern>(&linePtr);
+    if (isSorted) {
+      // Note that isSorted was set to false while reading the first element,
+      // to guarantee the safeness of using prevLvlCoords.
+      C *prevLvlCoords = lvlCoordinates - lvlRank;
+      // TODO: define a new CoordsLT which is like ElementLT but doesn't have
+      // the V parameter, and use it here.
+      for (uint64_t l = 0; l < lvlRank; ++l) {
+        if (prevLvlCoords[l] != lvlCoordinates[l]) {
+          if (prevLvlCoords[l] > lvlCoordinates[l])
+            isSorted = false;
+          break;
+        }
+      }
+    }
+    lvlCoordinates += lvlRank;
+    ++values;
+  };
+  readElement();
+  isSorted = true;
+  for (uint64_t n = 1; n < nse; ++n)
+    readElement();
+
+  return isSorted;
+}
+
 /// Writes the sparse tensor to `filename` in extended FROSTT format.
 template <typename V>
 inline void writeExtFROSTT(const SparseTensorCOO<V> &coo,
diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
--- a/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
+++ b/mlir/include/mlir/ExecutionEngine/SparseTensorRuntime.h
@@ -283,6 +283,17 @@
 MLIR_SPARSETENSOR_FOREVERY_V(DECL_GETNEXT)
 #undef DECL_GETNEXT
 
+/// Reads the sparse tensor, stores the coordinates and values into the given
+/// memrefs, and returns a boolean value to indicate whether the COO elements
+/// are sorted.
+#define DECL_GETNEXT(VNAME, V, CNAME, C) \
+  MLIR_CRUNNERUTILS_EXPORT bool \
+      _mlir_ciface_getSparseTensorReaderRead##CNAME##VNAME( \
+          void *p, StridedMemRefType<index_type, 1> *dim2lvlRef, \
+          StridedMemRefType<C, 1> *iref, StridedMemRefType<V, 1> *vref);
+MLIR_SPARSETENSOR_FOREVERY_V_O(DECL_GETNEXT)
+#undef DECL_GETNEXT
+
 using SparseTensorWriter = std::ostream;
 
 /// Creates a SparseTensorWriter for outputing a sparse tensor to a file with
diff --git a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorRuntime.cpp
@@ -631,6 +631,33 @@
 MLIR_SPARSETENSOR_FOREVERY_V(IMPL_GETNEXT)
 #undef IMPL_GETNEXT
 
+#define IMPL_GETNEXT(VNAME, V, CNAME, C) \
+  bool _mlir_ciface_getSparseTensorReaderRead##CNAME##VNAME( \
+      void *p, StridedMemRefType<index_type, 1> *dim2lvlRef, \
+      StridedMemRefType<C, 1> *cref, StridedMemRefType<V, 1> *vref) { \
+    assert(p); \
+    auto &reader = *static_cast<SparseTensorReader *>(p); \
+    ASSERT_NO_STRIDE(cref); \
+    ASSERT_NO_STRIDE(vref); \
+    ASSERT_NO_STRIDE(dim2lvlRef); \
+    const uint64_t cSize = MEMREF_GET_USIZE(cref); \
+    const uint64_t vSize = MEMREF_GET_USIZE(vref); \
+    const uint64_t lvlRank = reader.getRank(); \
+    assert(vSize * lvlRank <= cSize); \
+    assert(vSize >= reader.getNNZ() && "Not enough space in buffers"); \
+    ASSERT_USIZE_EQ(dim2lvlRef, lvlRank); \
+    (void)cSize; \
+    (void)vSize; \
+    (void)lvlRank; \
+    C *lvlCoordinates = MEMREF_GET_PAYLOAD(cref); \
+    V *values = MEMREF_GET_PAYLOAD(vref); \
+    index_type *dim2lvl = MEMREF_GET_PAYLOAD(dim2lvlRef); \
+    return reader.readToBuffers<C, V>(lvlRank, dim2lvl, lvlCoordinates, \
+                                      values); \
+  }
+MLIR_SPARSETENSOR_FOREVERY_V_O(IMPL_GETNEXT)
+#undef IMPL_GETNEXT
+
 void *_mlir_ciface_newSparseTensorFromReader(
     void *p, StridedMemRefType<index_type, 1> *lvlSizesRef,
     StridedMemRefType<DimLevelType, 1> *lvlTypesRef,
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_file_io.mlir
@@ -43,6 +43,9 @@
   func.func private @getSparseTensorReaderIsSymmetric(!TensorReader) -> (i1)
   func.func private @copySparseTensorReaderDimSizes(!TensorReader,
     memref<?xindex>) -> () attributes { llvm.emit_c_interface }
+  func.func private @getSparseTensorReaderRead0F32(!TensorReader,
+    memref<?xindex>, memref<?xindex>, memref<?xf32>)
+    -> (i1) attributes { llvm.emit_c_interface }
   func.func private @getSparseTensorReaderNextF32(!TensorReader,
     memref<?xindex>, memref<f32>) -> ()
     attributes { llvm.emit_c_interface }
@@ -60,6 +63,14 @@
     return
   }
 
+  func.func @dumpi2(%arg0: memref<?xindex, strided<[?], offset: ?>>) {
+    %c0 = arith.constant 0 : index
+    %v = vector.transfer_read %arg0[%c0], %c0 :
+      memref<?xindex, strided<[?], offset: ?>>, vector<17xindex>
+    vector.print %v : vector<17xindex>
+    return
+  }
+
   func.func @dumpf(%arg0: memref<?xf32>) {
     %c0 = arith.constant 0 : index
     %d0 = arith.constant 0.0 : f32
@@ -70,39 +81,31 @@
   // Returns the indices and values of the tensor.
   func.func @readTensorFile(%tensor: !TensorReader)
-    -> (memref<?xindex>, memref<?xindex>, memref<?xf32>) {
+    -> (memref<?xindex>, memref<?xf32>, i1) {
     %c0 = arith.constant 0 : index
     %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
     %rank = call @getSparseTensorReaderRank(%tensor) : (!TensorReader) -> index
     %nnz = call @getSparseTensorReaderNNZ(%tensor) : (!TensorReader) -> index
 
     // Assume rank == 2.
-    %x0s = memref.alloc(%nnz) : memref<?xindex>
-    %x1s = memref.alloc(%nnz) : memref<?xindex>
+    %isize = arith.muli %c2, %nnz : index
+    %xs = memref.alloc(%isize) : memref<?xindex>
     %vs = memref.alloc(%nnz) : memref<?xf32>
-    %indices = memref.alloc(%rank) : memref<?xindex>
-    %value = memref.alloca() : memref<f32>
-    scf.for %i = %c0 to %nnz step %c1 {
-      func.call @getSparseTensorReaderNextF32(%tensor, %indices, %value)
-        : (!TensorReader, memref<?xindex>, memref<f32>) -> ()
-      // TODO: can we use memref.subview to avoid the need for the %value
-      // buffer?
-      %v = memref.load %value[] : memref<f32>
-      memref.store %v, %vs[%i] : memref<?xf32>
-      %i0 = memref.load %indices[%c0] : memref<?xindex>
-      memref.store %i0, %x0s[%i] : memref<?xindex>
-      %i1 = memref.load %indices[%c1] : memref<?xindex>
-      memref.store %i1, %x1s[%i] : memref<?xindex>
-    }
-
-    // Release the resource for the indices.
-    memref.dealloc %indices : memref<?xindex>
-    return %x0s, %x1s, %vs : memref<?xindex>, memref<?xindex>, memref<?xf32>
+    %dim2lvl = memref.alloca(%c2) : memref<?xindex>
+    memref.store %c0, %dim2lvl[%c0] : memref<?xindex>
+    memref.store %c1, %dim2lvl[%c1] : memref<?xindex>
+    %isSorted = func.call @getSparseTensorReaderRead0F32(%tensor, %dim2lvl, %xs, %vs)
+      : (!TensorReader, memref<?xindex>, memref<?xindex>, memref<?xf32>) -> (i1)
+    return %xs, %vs, %isSorted : memref<?xindex>, memref<?xf32>, i1
   }
 
   // Reads a COO tensor from the given file name and prints its content.
   func.func @readTensorFileAndDump(%fileName: !Filename) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
     %tensor = call @createSparseTensorReader(%fileName)
       : (!Filename) -> (!TensorReader)
     %rank = call @getSparseTensorReaderRank(%tensor) : (!TensorReader) -> index
@@ -116,18 +119,22 @@
     func.call @copySparseTensorReaderDimSizes(%tensor, %dimSizes)
       : (!TensorReader, memref<?xindex>) -> ()
     call @dumpi(%dimSizes) : (memref<?xindex>) -> ()
-    %x0s, %x1s, %vs = call @readTensorFile(%tensor)
-      : (!TensorReader) -> (memref<?xindex>, memref<?xindex>, memref<?xf32>)
-    call @dumpi(%x0s) : (memref<?xindex>) -> ()
-    call @dumpi(%x1s) : (memref<?xindex>) -> ()
+    %xs, %vs, %isSorted = call @readTensorFile(%tensor)
+      : (!TensorReader) -> (memref<?xindex>, memref<?xf32>, i1)
+    %x0s = memref.subview %xs[%c0][%nnz][%c2]
+      : memref<?xindex> to memref<?xindex, strided<[?], offset: ?>>
+    %x1s = memref.subview %xs[%c1][%nnz][%c2]
+      : memref<?xindex> to memref<?xindex, strided<[?], offset: ?>>
+    vector.print %isSorted : i1
+    call @dumpi2(%x0s) : (memref<?xindex, strided<[?], offset: ?>>) -> ()
+    call @dumpi2(%x1s) : (memref<?xindex, strided<[?], offset: ?>>) -> ()
     call @dumpf(%vs) : (memref<?xf32>) -> ()
 
     // Release the resources.
     call @delSparseTensorReader(%tensor) : (!TensorReader) -> ()
     memref.dealloc %dimSizes : memref<?xindex>
-    memref.dealloc %x0s : memref<?xindex>
-    memref.dealloc %x1s : memref<?xindex>
+    memref.dealloc %xs : memref<?xindex>
     memref.dealloc %vs : memref<?xf32>
 
     return
@@ -184,6 +191,7 @@
     // CHECK: 17
     // CHECK: 0
    // CHECK: ( 4, 256, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 )
+    // CHECK: 1
     // CHECK: ( 0, 0, 0, 0, 1, 1, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3 )
     // CHECK: ( 0, 126, 127, 254, 1, 253, 2, 0, 1, 3, 98, 126, 127, 128, 249, 253, 255 )
     // CHECK: ( -1, 2, -3, 4, -5, 6, -7, 8, -9, 10, -11, 12, -13, 14, -15, 16, -17 )
@@ -215,4 +223,4 @@
     return
   }
-}
+}
\ No newline at end of file
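
Note (illustrative, not part of the patch): to make the x-macro machinery concrete,
MLIR_SPARSETENSOR_FOREVERY_V_O(DECL_GETNEXT) instantiates DECL_GETNEXT once per
(overhead, value) type pair. The (CNAME=0, C=index_type, VNAME=F32, V=float)
instance is the _mlir_ciface_getSparseTensorReaderRead0F32 symbol called from the
test above, and expands to roughly:

  MLIR_CRUNNERUTILS_EXPORT bool _mlir_ciface_getSparseTensorReaderRead0F32(
      void *p, StridedMemRefType<index_type, 1> *dim2lvlRef,
      StridedMemRefType<index_type, 1> *iref, StridedMemRefType<float, 1> *vref);

On the MLIR side, memref<?xindex> and memref<?xf32> arguments lower to exactly
these StridedMemRefType descriptors under llvm.emit_c_interface.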
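
Note (illustrative, not part of the patch): a minimal C++ sketch of how the new
readToBuffers entry point could be driven directly, mirroring what the runtime
wrapper above does. It assumes the reader is constructed, opened, and its header
parsed the same way the existing createSparseTensorReader entry point does; the
file name, the identity dim2lvl mapping, and the helper name are made up for the
example, and buffer sizes follow the documented precondition (lvlRank coordinates
plus one value per stored element).

  #include "mlir/ExecutionEngine/SparseTensor/File.h"
  #include <cstdint>
  #include <vector>

  using namespace mlir::sparse_tensor;

  // Hypothetical helper: read a rank-2 f32 matrix straight into flat buffers.
  static bool readMatrixToBuffers(const char *filename) {
    SparseTensorReader reader(filename);
    reader.openFile();
    reader.readHeader(); // required before readToBuffers()
    const uint64_t rank = reader.getRank(); // assumed to be 2 here
    const uint64_t nse = reader.getNNZ();
    const uint64_t dim2lvl[] = {0, 1}; // identity dim-to-level mapping
    std::vector<uint64_t> coords(nse * rank); // rank coordinates per element
    std::vector<float> values(nse);           // one value per element
    // Returns whether the elements appeared in sorted order in the file.
    return reader.readToBuffers<uint64_t, float>(rank, dim2lvl, coords.data(),
                                                 values.data());
  }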