diff --git a/mlir/include/mlir/ExecutionEngine/Float16bits.h b/mlir/include/mlir/ExecutionEngine/Float16bits.h --- a/mlir/include/mlir/ExecutionEngine/Float16bits.h +++ b/mlir/include/mlir/ExecutionEngine/Float16bits.h @@ -17,23 +17,36 @@ #include #include +#ifdef _WIN32 +#ifdef mlir_float16_utils_EXPORTS // We are building this library +#define MLIR_FLOAT16_EXPORT __declspec(dllexport) +#define MLIR_FLOAT16_DEFINE_FUNCTIONS +#else // We are using this library +#define MLIR_FLOAT16_EXPORT __declspec(dllimport) +#endif // mlir_float16_utils_EXPORTS +#else // Non-windows: use visibility attributes. +#define MLIR_FLOAT16_EXPORT __attribute__((visibility("default"))) +#define MLIR_FLOAT16_DEFINE_FUNCTIONS +#endif // _WIN32 + // Implements half precision and bfloat with f16 and bf16, using the MLIR type // names. These data types are also used for c-interface runtime routines. extern "C" { -struct f16 { +struct MLIR_FLOAT16_EXPORT f16 { f16(float f = 0); uint16_t bits; }; -struct bf16 { +struct MLIR_FLOAT16_EXPORT bf16 { bf16(float f = 0); uint16_t bits; }; } // Outputs a half precision value. -std::ostream &operator<<(std::ostream &os, const f16 &f); +MLIR_FLOAT16_EXPORT std::ostream &operator<<(std::ostream &os, const f16 &f); // Outputs a bfloat value. -std::ostream &operator<<(std::ostream &os, const bf16 &d); +MLIR_FLOAT16_EXPORT std::ostream &operator<<(std::ostream &os, const bf16 &d); +#undef MLIR_FLOAT16_EXPORT #endif // MLIR_EXECUTIONENGINE_FLOAT16BITS_H_ diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/COO.h @@ -0,0 +1,163 @@ +//===- COO.h - Coordinate-scheme sparse tensor representation ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_COO_H +#define MLIR_EXECUTIONENGINE_SPARSETENSOR_COO_H + +#include +#include +#include +#include +#include + +namespace mlir { +namespace sparse_tensor { + +/// A sparse tensor element in coordinate scheme (value and indices). +/// For example, a rank-1 vector element would look like +/// ({i}, a[i]) +/// and a rank-5 tensor element like +/// ({i,j,k,l,m}, a[i,j,k,l,m]) +/// We use pointer to a shared index pool rather than e.g. a direct +/// vector since that (1) reduces the per-element memory footprint, and +/// (2) centralizes the memory reservation and (re)allocation to one place. +template +struct Element final { + Element(uint64_t *ind, V val) : indices(ind), value(val){}; + uint64_t *indices; // pointer into shared index pool + V value; +}; + +/// The type of callback functions which receive an element. We avoid +/// packaging the coordinates and value together as an `Element` object +/// because this helps keep code somewhat cleaner. 
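/// For illustration only (not part of this patch), any callable with the
/// matching signature converts to the `ElementConsumer` alias defined just
/// below; e.g. a callback that prints each element of a tensor of doubles:
///
///   auto dump = [](const std::vector<uint64_t> &ind, double val) {
///     for (uint64_t i : ind)
///       std::cout << i << " ";
///     std::cout << "-> " << val << "\n";
///   };
///   // `dump` converts to ElementConsumer<double> at the call site.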
+template +using ElementConsumer = + const std::function &, V)> &; + +/// A memory-resident sparse tensor in coordinate scheme (collection of +/// elements). This data structure is used to read a sparse tensor from +/// any external format into memory and sort the elements lexicographically +/// by indices before passing it back to the client (most packed storage +/// formats require the elements to appear in lexicographic index order). +template +struct SparseTensorCOO final { +public: + SparseTensorCOO(const std::vector &dimSizes, uint64_t capacity) + : dimSizes(dimSizes) { + if (capacity) { + elements.reserve(capacity); + indices.reserve(capacity * getRank()); + } + } + + /// Adds element as indices and value. + void add(const std::vector &ind, V val) { + assert(!iteratorLocked && "Attempt to add() after startIterator()"); + uint64_t *base = indices.data(); + uint64_t size = indices.size(); + uint64_t rank = getRank(); + assert(ind.size() == rank && "Element rank mismatch"); + for (uint64_t r = 0; r < rank; r++) { + assert(ind[r] < dimSizes[r] && "Index is too large for the dimension"); + indices.push_back(ind[r]); + } + // This base only changes if indices were reallocated. In that case, we + // need to correct all previous pointers into the vector. Note that this + // only happens if we did not set the initial capacity right, and then only + // for every internal vector reallocation (which with the doubling rule + // should only incur an amortized linear overhead). + uint64_t *newBase = indices.data(); + if (newBase != base) { + for (uint64_t i = 0, n = elements.size(); i < n; i++) + elements[i].indices = newBase + (elements[i].indices - base); + base = newBase; + } + // Add element as (pointer into shared index pool, value) pair. + elements.emplace_back(base + size, val); + } + + /// Sorts elements lexicographically by index. + void sort() { + assert(!iteratorLocked && "Attempt to sort() after startIterator()"); + // TODO: we may want to cache an `isSorted` bit, to avoid + // unnecessary/redundant sorting. + uint64_t rank = getRank(); + std::sort(elements.begin(), elements.end(), + [rank](const Element &e1, const Element &e2) { + for (uint64_t r = 0; r < rank; r++) { + if (e1.indices[r] == e2.indices[r]) + continue; + return e1.indices[r] < e2.indices[r]; + } + return false; + }); + } + + /// Get the rank of the tensor. + uint64_t getRank() const { return dimSizes.size(); } + + /// Getter for the dimension-sizes array. + const std::vector &getDimSizes() const { return dimSizes; } + + /// Getter for the elements array. + const std::vector> &getElements() const { return elements; } + + /// Switch into iterator mode. + void startIterator() { + iteratorLocked = true; + iteratorPos = 0; + } + + /// Get the next element. + const Element *getNext() { + assert(iteratorLocked && "Attempt to getNext() before startIterator()"); + if (iteratorPos < elements.size()) + return &(elements[iteratorPos++]); + iteratorLocked = false; + return nullptr; + } + + /// Factory method. Permutes the original dimensions according to + /// the given ordering and expects subsequent add() calls to honor + /// that same ordering for the given indices. The result is a + /// fully permuted coordinate scheme. + /// + /// Precondition: `dimSizes` and `perm` must be valid for `rank`. 
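  /// For illustration only (not part of this patch), a minimal usage sketch
  /// of the factory defined just below together with add()/sort()/getNext();
  /// the 2x3 shape, indices, and values are made up for the example:
  ///
  ///   const uint64_t dimSizes[2] = {2, 3};
  ///   const uint64_t perm[2] = {0, 1}; // identity permutation
  ///   auto *coo = SparseTensorCOO<double>::newSparseTensorCOO(
  ///       2, dimSizes, perm, /*capacity=*/2);
  ///   coo->add({1, 2}, 3.0);
  ///   coo->add({0, 1}, 1.5);
  ///   coo->sort(); // now in lexicographic index order
  ///   coo->startIterator();
  ///   while (const Element<double> *e = coo->getNext())
  ///     std::cout << e->indices[0] << "," << e->indices[1]
  ///               << " = " << e->value << "\n";
  ///   delete coo;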
+ static SparseTensorCOO *newSparseTensorCOO(uint64_t rank, + const uint64_t *dimSizes, + const uint64_t *perm, + uint64_t capacity = 0) { + std::vector permsz(rank); + for (uint64_t r = 0; r < rank; r++) { + assert(dimSizes[r] > 0 && "Dimension size zero has trivial storage"); + permsz[perm[r]] = dimSizes[r]; + } + return new SparseTensorCOO(permsz, capacity); + } + +private: + const std::vector dimSizes; // per-dimension sizes + std::vector> elements; // all COO elements + std::vector indices; // shared index pool + bool iteratorLocked = false; + unsigned iteratorPos = 0; +}; + +} // namespace sparse_tensor +} // namespace mlir + +#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_COO_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/CheckedMul.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/CheckedMul.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/CheckedMul.h @@ -0,0 +1,43 @@ +//===- CheckedMul.h - multiplication that checks for overflow ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header is not part of the public API. It is placed in the +// includes directory only because that's required by the implementations +// of template-classes. +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_CHECKEDMUL_H +#define MLIR_EXECUTIONENGINE_SPARSETENSOR_CHECKEDMUL_H + +#include +#include +#include + +namespace mlir { +namespace sparse_tensor { +namespace detail { + +/// A version of `operator*` on `uint64_t` which checks for overflows. +inline uint64_t checkedMul(uint64_t lhs, uint64_t rhs) { + assert((lhs == 0 || rhs <= std::numeric_limits::max() / lhs) && + "Integer overflow"); + return lhs * rhs; +} + +} // namespace detail +} // namespace sparse_tensor +} // namespace mlir + +#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_CHECKEDMUL_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Enums.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Enums.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Enums.h @@ -0,0 +1,163 @@ +//===- Enums.h - Enums shared with the runtime ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Typedefs and enums for the lightweight runtime support library for +// sparse tensor manipulations. These are required to be public so that +// they can be shared with `Transforms/SparseTensorConversion.cpp`, since +// they define the arguments to the public functions declared later on. 
+// +// This file also defines x-macros +// so that we can generate variations of the public functions for each +// supported primary- and/or overhead-type. +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_ENUMS_H +#define MLIR_EXECUTIONENGINE_SPARSETENSOR_ENUMS_H + +#include "mlir/ExecutionEngine/Float16bits.h" + +#include +#include + +#ifdef _WIN32 +#ifdef mlir_sparse_tensor_utils_EXPORTS // We are building this library +#define MLIR_SPARSETENSOR_EXPORT __declspec(dllexport) +#define MLIR_SPARSETENSOR_DEFINE_FUNCTIONS +#else // We are using this library +#define MLIR_SPARSETENSOR_EXPORT __declspec(dllimport) +#endif // mlir_sparse_tensor_utils_EXPORTS +#else // Non-windows: use visibility attributes. +#define MLIR_SPARSETENSOR_EXPORT __attribute__((visibility("default"))) +#define MLIR_SPARSETENSOR_DEFINE_FUNCTIONS +#endif // _WIN32 + +namespace mlir { +namespace sparse_tensor { + +/// This type is used in the public API at all places where MLIR expects +/// values with the built-in type "index". For now, we simply assume that +/// type is 64-bit, but targets with different "index" bit widths should +/// link with an alternatively built runtime support library. +// TODO: support such targets? +using index_type = uint64_t; + +/// Encoding of overhead types (both pointer overhead and indices +/// overhead), for "overloading" @newSparseTensor. +enum class MLIR_SPARSETENSOR_EXPORT OverheadType : uint32_t { + kIndex = 0, + kU64 = 1, + kU32 = 2, + kU16 = 3, + kU8 = 4 +}; + +// This x-macro calls its argument on every overhead type which has +// fixed-width. It excludes `index_type` because that type is often +// handled specially (e.g., by translating it into the architecture-dependent +// equivalent fixed-width overhead type). +#define FOREVERY_FIXED_O(DO) \ + DO(64, uint64_t) \ + DO(32, uint32_t) \ + DO(16, uint16_t) \ + DO(8, uint8_t) + +// This x-macro calls its argument on every overhead type, including +// `index_type`. +#define FOREVERY_O(DO) \ + FOREVERY_FIXED_O(DO) \ + DO(0, index_type) + +// These are not just shorthands but indicate the particular +// implementation used (e.g., as opposed to C99's `complex double`, +// or MLIR's `ComplexType`). +using complex64 = std::complex; +using complex32 = std::complex; + +/// Encoding of the elemental type, for "overloading" @newSparseTensor. +enum class MLIR_SPARSETENSOR_EXPORT PrimaryType : uint32_t { + kF64 = 1, + kF32 = 2, + kF16 = 3, + kBF16 = 4, + kI64 = 5, + kI32 = 6, + kI16 = 7, + kI8 = 8, + kC64 = 9, + kC32 = 10 +}; + +// This x-macro includes all `V` types. 
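// As an illustration only (not part of this patch), a client typically
// instantiates the FOREVERY_V x-macro defined just below like this, where
// the function name is hypothetical:
//
//   #define DECL_SPARSEVALUES(VNAME, V)                                      \
//     void readSparseTensorValues##VNAME(std::vector<V> **out, void *tensor);
//   FOREVERY_V(DECL_SPARSEVALUES)
//   #undef DECL_SPARSEVALUES
//
// which declares one readSparseTensorValues* overload per value type
// (F64, F32, F16, BF16, I64, ..., C32).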
+#define FOREVERY_V(DO) \ + DO(F64, double) \ + DO(F32, float) \ + DO(F16, f16) \ + DO(BF16, bf16) \ + DO(I64, int64_t) \ + DO(I32, int32_t) \ + DO(I16, int16_t) \ + DO(I8, int8_t) \ + DO(C64, complex64) \ + DO(C32, complex32) + +constexpr MLIR_SPARSETENSOR_EXPORT bool +isFloatingPrimaryType(PrimaryType valTy) { + return PrimaryType::kF64 <= valTy && valTy <= PrimaryType::kBF16; +} + +constexpr MLIR_SPARSETENSOR_EXPORT bool +isIntegralPrimaryType(PrimaryType valTy) { + return PrimaryType::kI64 <= valTy && valTy <= PrimaryType::kI8; +} + +constexpr MLIR_SPARSETENSOR_EXPORT bool isRealPrimaryType(PrimaryType valTy) { + return PrimaryType::kF64 <= valTy && valTy <= PrimaryType::kI8; +} + +constexpr MLIR_SPARSETENSOR_EXPORT bool +isComplexPrimaryType(PrimaryType valTy) { + return PrimaryType::kC64 <= valTy && valTy <= PrimaryType::kC32; +} + +/// The actions performed by @newSparseTensor. +enum class MLIR_SPARSETENSOR_EXPORT Action : uint32_t { + kEmpty = 0, + kFromFile = 1, + kFromCOO = 2, + kSparseToSparse = 3, + kEmptyCOO = 4, + kToCOO = 5, + kToIterator = 6, +}; + +/// This enum mimics `SparseTensorEncodingAttr::DimLevelType` for +/// breaking dependency cycles. `SparseTensorEncodingAttr::DimLevelType` +/// is the source of truth and this enum should be kept consistent with it. +enum class MLIR_SPARSETENSOR_EXPORT DimLevelType : uint8_t { + kDense = 0, + kCompressed = 1, + kCompressedNu = 2, + kCompressedNo = 3, + kCompressedNuNo = 4, + kSingleton = 5, + kSingletonNu = 6, + kSingletonNo = 7, + kSingletonNuNo = 8, +}; + +} // namespace sparse_tensor +} // namespace mlir + +#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_ENUMS_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h @@ -0,0 +1,42 @@ +//===- ErrorHandling.h - Helpers for errors ---------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This header is not part of the public API. It is placed in the +// includes directory only because that's required by the implementations +// of template-classes. +// +// This file defines an extremely lightweight API for fatal errors (not +// arising from assertions). The API does not attempt to be sophisticated +// in any way, it's just the usual "I give up" style of error reporting. +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_ERRORHANDLING_H +#define MLIR_EXECUTIONENGINE_SPARSETENSOR_ERRORHANDLING_H + +#include +#include + +// This macro helps minimize repetition of this idiom, as well as ensuring +// we have some additional output indicating where the error is coming from. 
+// (Since `fprintf` doesn't provide a stacktrace, this helps make it easier +// to track down whether an error is coming from our code vs somewhere else +// in MLIR.) +#define MLIR_SPARSETENSOR_FATAL(...) \ + do { \ + fprintf(stderr, "SparseTensorUtils: " __VA_ARGS__); \ + exit(1); \ + } while (0) + +#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_ERRORHANDLING_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/File.h @@ -0,0 +1,259 @@ +//===- File.h - Parsing sparse tensors from files ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements parsing and printing of files in one of the +// following external formats: +// +// (1) Matrix Market Exchange (MME): *.mtx +// https://math.nist.gov/MatrixMarket/formats.html +// +// (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns +// http://frostt.io/tensors/file-formats.html +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H +#define MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H + +#include "mlir/ExecutionEngine/SparseTensor/Storage.h" + +#include + +namespace mlir { +namespace sparse_tensor { + +/// This class abstracts over the information stored in file headers, +/// as well as providing the buffers and methods for parsing those headers. +class SparseTensorFile final { +public: + enum class ValueKind { + kInvalid = 0, + kPattern = 1, + kReal = 2, + kInteger = 3, + kComplex = 4, + kUndefined = 5 + }; + + explicit SparseTensorFile(char *filename) : filename(filename) { + assert(filename && "Received nullptr for filename"); + } + + // Disallows copying, to avoid duplicating the `file` pointer. + SparseTensorFile(const SparseTensorFile &) = delete; + SparseTensorFile &operator=(const SparseTensorFile &) = delete; + + // This dtor tries to avoid leaking the `file`. (Though it's better + // to call `closeFile` explicitly when possible, since there are + // circumstances where dtors are not called reliably.) + ~SparseTensorFile() { closeFile(); } + + /// Opens the file for reading. + void openFile(); + + /// Closes the file. + void closeFile(); + + /// Attempts to read a line from the file. + char *readLine(); + + /// Reads and parses the file's header. + void readHeader(); + + ValueKind getValueKind() const { return valueKind_; } + + bool isValid() const { return valueKind_ != ValueKind::kInvalid; } + + /// Gets the MME "pattern" property setting. Is only valid after + /// parsing the header. + bool isPattern() const { + assert(isValid() && "Attempt to isPattern() before readHeader()"); + return valueKind_ == ValueKind::kPattern; + } + + /// Gets the MME "symmetric" property setting. Is only valid after + /// parsing the header. 
+ bool isSymmetric() const { + assert(isValid() && "Attempt to isSymmetric() before readHeader()"); + return isSymmetric_; + } + + /// Gets the rank of the tensor. Is only valid after parsing the header. + uint64_t getRank() const { + assert(isValid() && "Attempt to getRank() before readHeader()"); + return idata[0]; + } + + /// Gets the number of non-zeros. Is only valid after parsing the header. + uint64_t getNNZ() const { + assert(isValid() && "Attempt to getNNZ() before readHeader()"); + return idata[1]; + } + + /// Gets the dimension-sizes array. The pointer itself is always + /// valid; however, the values stored therein are only valid after + /// parsing the header. + const uint64_t *getDimSizes() const { return idata + 2; } + + /// Safely gets the size of the given dimension. Is only valid + /// after parsing the header. + uint64_t getDimSize(uint64_t d) const { + assert(d < getRank()); + return idata[2 + d]; + } + + /// Asserts the shape subsumes the actual dimension sizes. Is only + /// valid after parsing the header. + void assertMatchesShape(uint64_t rank, const uint64_t *shape) const; + +private: + void readMMEHeader(); + void readExtFROSTTHeader(); + + static constexpr int kColWidth = 1025; + const char *filename; + FILE *file = nullptr; + ValueKind valueKind_ = ValueKind::kInvalid; + bool isSymmetric_ = false; + uint64_t idata[512]; + char line[kColWidth]; +}; + +namespace detail { + +// Adds a value to a tensor in coordinate scheme. If is_symmetric_value is true, +// also adds the value to its symmetric location. +template +inline void addValue(T *coo, V value, const std::vector indices, + bool is_symmetric_value) { + // TODO: + coo->add(indices, value); + // We currently chose to deal with symmetric matrices by fully constructing + // them. In the future, we may want to make symmetry implicit for storage + // reasons. + if (is_symmetric_value) + coo->add({indices[1], indices[0]}, value); +} + +// Reads an element of a complex type for the current indices in coordinate +// scheme. +template +inline void readCOOValue(SparseTensorCOO> *coo, + const std::vector indices, char **linePtr, + bool is_pattern, bool add_symmetric_value) { + // Read two values to make a complex. The external formats always store + // numerical values with the type double, but we cast these values to the + // sparse tensor object type. For a pattern tensor, we arbitrarily pick the + // value 1 for all entries. + V re = is_pattern ? 1.0 : strtod(*linePtr, linePtr); + V im = is_pattern ? 1.0 : strtod(*linePtr, linePtr); + std::complex value = {re, im}; + addValue(coo, value, indices, add_symmetric_value); +} + +// Reads an element of a non-complex type for the current indices in coordinate +// scheme. +template , V>::value && + !std::is_same, V>::value>::type * = nullptr> +inline void readCOOValue(SparseTensorCOO *coo, + const std::vector indices, char **linePtr, + bool is_pattern, bool is_symmetric_value) { + // The external formats always store these numerical values with the type + // double, but we cast these values to the sparse tensor object type. + // For a pattern tensor, we arbitrarily pick the value 1 for all entries. + double value = is_pattern ? 1.0 : strtod(*linePtr, linePtr); + addValue(coo, value, indices, is_symmetric_value); +} + +} // namespace detail + +/// Reads a sparse tensor with the given filename into a memory-resident +/// sparse tensor in coordinate scheme. 
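/// For illustration only (not part of this patch), a sketch of calling the
/// function defined just below; the file name "example.mtx" is hypothetical,
/// and a shape entry of 0 accepts whatever size the file header declares:
///
///   char filename[] = "example.mtx";
///   uint64_t shape[2] = {0, 0};
///   uint64_t perm[2] = {0, 1}; // identity permutation
///   SparseTensorCOO<double> *coo = openSparseTensorCOO<double>(
///       filename, /*rank=*/2, shape, perm, PrimaryType::kF64);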
+template +inline SparseTensorCOO * +openSparseTensorCOO(char *filename, uint64_t rank, const uint64_t *shape, + const uint64_t *perm, PrimaryType valTp) { + SparseTensorFile stfile(filename); + stfile.openFile(); + stfile.readHeader(); + // Check tensor element type against the value type in the input file. + SparseTensorFile::ValueKind valueKind = stfile.getValueKind(); + bool tensorIsInteger = + (valTp >= PrimaryType::kI64 && valTp <= PrimaryType::kI8); + bool tensorIsReal = (valTp >= PrimaryType::kF64 && valTp <= PrimaryType::kI8); + if ((valueKind == SparseTensorFile::ValueKind::kReal && tensorIsInteger) || + (valueKind == SparseTensorFile::ValueKind::kComplex && tensorIsReal)) { + MLIR_SPARSETENSOR_FATAL( + "Tensor element type %d not compatible with values in file %s\n", + static_cast(valTp), filename); + } + stfile.assertMatchesShape(rank, shape); + // Prepare sparse tensor object with per-dimension sizes + // and the number of nonzeros as initial capacity. + uint64_t nnz = stfile.getNNZ(); + auto *coo = SparseTensorCOO::newSparseTensorCOO(rank, stfile.getDimSizes(), + perm, nnz); + // Read all nonzero elements. + std::vector indices(rank); + for (uint64_t k = 0; k < nnz; k++) { + char *linePtr = stfile.readLine(); + for (uint64_t r = 0; r < rank; r++) { + uint64_t idx = strtoul(linePtr, &linePtr, 10); + // Add 0-based index. + indices[perm[r]] = idx - 1; + } + detail::readCOOValue(coo, indices, &linePtr, stfile.isPattern(), + stfile.isSymmetric() && indices[0] != indices[1]); + } + // Close the file and return tensor. + stfile.closeFile(); + return coo; +} + +/// Writes the sparse tensor to `dest` in extended FROSTT format. +template +inline void outSparseTensor(void *tensor, void *dest, bool sort) { + assert(tensor && dest); + auto coo = static_cast *>(tensor); + if (sort) + coo->sort(); + char *filename = static_cast(dest); + auto &dimSizes = coo->getDimSizes(); + auto &elements = coo->getElements(); + uint64_t rank = coo->getRank(); + uint64_t nnz = elements.size(); + std::fstream file; + file.open(filename, std::ios_base::out | std::ios_base::trunc); + assert(file.is_open()); + file << "; extended FROSTT format\n" << rank << " " << nnz << std::endl; + for (uint64_t r = 0; r < rank - 1; r++) + file << dimSizes[r] << " "; + file << dimSizes[rank - 1] << std::endl; + for (uint64_t i = 0; i < nnz; i++) { + auto &idx = elements[i].indices; + for (uint64_t r = 0; r < rank; r++) + file << (idx[r] + 1) << " "; + file << elements[i].value << std::endl; + } + file.flush(); + file.close(); + assert(file.good()); +} + +} // namespace sparse_tensor +} // namespace mlir + +#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_FILE_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h new file mode 100644 --- /dev/null +++ b/mlir/include/mlir/ExecutionEngine/SparseTensor/Storage.h @@ -0,0 +1,917 @@ +//===- Storage.h - TACO-flavored sparse tensor representation ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. 
However, the provided functionality is **not** part of +// core MLIR itself. +// +// This file contains definitions for the following classes: +// +// * `SparseTensorStorageBase` +// * `SparseTensorStorage` +// * `SparseTensorEnumeratorBase` +// * `SparseTensorEnumerator` +// * `SparseTensorNNZ` +// +// Ideally we would split the storage classes and enumerator classes +// into separate files, to improve legibility. But alas: because these +// are template-classes, they must therefore provide *definitions* in the +// header; and those definitions cause circular dependencies that make it +// impossible to split the file up along the desired lines. (We could +// split the base classes from the derived classes, but that doesn't +// particularly help improve legibility.) +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSOR_STORAGE_H +#define MLIR_EXECUTIONENGINE_SPARSETENSOR_STORAGE_H + +#include "mlir/ExecutionEngine/SparseTensor/COO.h" +#include "mlir/ExecutionEngine/SparseTensor/CheckedMul.h" +#include "mlir/ExecutionEngine/SparseTensor/Enums.h" +#include "mlir/ExecutionEngine/SparseTensor/ErrorHandling.h" + +namespace mlir { +namespace sparse_tensor { + +namespace detail { + +// TODO: try to unify this with `SparseTensorFile::assertMatchesShape` +// which is used by `openSparseTensorCOO`. It's easy enough to resolve +// the `std::vector` vs pointer mismatch for `dimSizes`; but it's trickier +// to resolve the presence/absence of `perm` (without introducing extra +// overhead), so perhaps the code duplication is unavoidable. +// +/// Asserts that the `dimSizes` (in target-order) under the `perm` (mapping +/// semantic-order to target-order) are a refinement of the desired `shape` +/// (in semantic-order). +/// +/// Precondition: `perm` and `shape` must be valid for `rank`. +inline void assertPermutedSizesMatchShape(const std::vector &dimSizes, + uint64_t rank, const uint64_t *perm, + const uint64_t *shape) { + assert(perm && shape); + assert(rank == dimSizes.size() && "Rank mismatch"); + for (uint64_t r = 0; r < rank; r++) + assert((shape[r] == 0 || shape[r] == dimSizes[perm[r]]) && + "Dimension size mismatch"); +} + +} // namespace detail + +// This forward decl is sufficient to split `SparseTensorStorageBase` into +// its own header, but isn't sufficient for `SparseTensorStorage` to join it. +template +class SparseTensorEnumeratorBase; + +/// Abstract base class for `SparseTensorStorage`. This class +/// takes responsibility for all the ``-independent aspects +/// of the tensor (e.g., shape, sparsity, permutation). In addition, +/// we use function overloading to implement "partial" method +/// specialization, which the C-API relies on to catch type errors +/// arising from our use of opaque pointers. +class SparseTensorStorageBase { +public: + /// Constructs a new storage object. The `perm` maps the tensor's + /// semantic-ordering of dimensions to this object's storage-order. + /// The `dimSizes` and `sparsity` arrays are already in storage-order. + /// + /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. + SparseTensorStorageBase(const std::vector &dimSizes, + const uint64_t *perm, const DimLevelType *sparsity); + + virtual ~SparseTensorStorageBase() = default; + + /// Get the rank of the tensor. + uint64_t getRank() const { return dimSizes.size(); } + + /// Getter for the dimension-sizes array, in storage-order. 
+ const std::vector &getDimSizes() const { return dimSizes; } + + /// Safely lookup the size of the given (storage-order) dimension. + uint64_t getDimSize(uint64_t d) const { + assert(d < getRank()); + return dimSizes[d]; + } + + /// Getter for the "reverse" permutation, which maps this object's + /// storage-order to the tensor's semantic-order. + const std::vector &getRev() const { return rev; } + + /// Getter for the dimension-types array, in storage-order. + const std::vector &getDimTypes() const { return dimTypes; } + + /// Safely check if the (storage-order) dimension uses dense storage. + bool isDenseDim(uint64_t d) const { + assert(d < getRank()); + return dimTypes[d] == DimLevelType::kDense; + } + + /// Safely check if the (storage-order) dimension uses compressed storage. + bool isCompressedDim(uint64_t d) const { + assert(d < getRank()); + switch (dimTypes[d]) { + case DimLevelType::kCompressed: + case DimLevelType::kCompressedNu: + case DimLevelType::kCompressedNo: + case DimLevelType::kCompressedNuNo: + return true; + default: + return false; + } + } + + /// Safely check if the (storage-order) dimension uses singleton storage. + bool isSingletonDim(uint64_t d) const { + assert(d < getRank()); + switch (dimTypes[d]) { + case DimLevelType::kSingleton: + case DimLevelType::kSingletonNu: + case DimLevelType::kSingletonNo: + case DimLevelType::kSingletonNuNo: + return true; + default: + return false; + } + } + + /// Safely check if the (storage-order) dimension is ordered. + bool isOrderedDim(uint64_t d) const { + assert(d < getRank()); + switch (dimTypes[d]) { + case DimLevelType::kCompressedNo: + case DimLevelType::kCompressedNuNo: + case DimLevelType::kSingletonNo: + case DimLevelType::kSingletonNuNo: + return false; + default: + return true; + } + } + + /// Safely check if the (storage-order) dimension is unique. + bool isUniqueDim(uint64_t d) const { + assert(d < getRank()); + switch (dimTypes[d]) { + case DimLevelType::kCompressedNu: + case DimLevelType::kCompressedNuNo: + case DimLevelType::kSingletonNu: + case DimLevelType::kSingletonNuNo: + return false; + default: + return true; + } + } + + /// Allocate a new enumerator. +#define DECL_NEWENUMERATOR(VNAME, V) \ + virtual void newEnumerator(SparseTensorEnumeratorBase **, uint64_t, \ + const uint64_t *) const; + FOREVERY_V(DECL_NEWENUMERATOR) +#undef DECL_NEWENUMERATOR + + /// Overhead storage. +#define DECL_GETPOINTERS(PNAME, P) \ + virtual void getPointers(std::vector
<P>
**, uint64_t); + FOREVERY_FIXED_O(DECL_GETPOINTERS) +#undef DECL_GETPOINTERS +#define DECL_GETINDICES(INAME, I) \ + virtual void getIndices(std::vector **, uint64_t); + FOREVERY_FIXED_O(DECL_GETINDICES) +#undef DECL_GETINDICES + + /// Primary storage. +#define DECL_GETVALUES(VNAME, V) virtual void getValues(std::vector **); + FOREVERY_V(DECL_GETVALUES) +#undef DECL_GETVALUES + + /// Element-wise insertion in lexicographic index order. +#define DECL_LEXINSERT(VNAME, V) virtual void lexInsert(const uint64_t *, V); + FOREVERY_V(DECL_LEXINSERT) +#undef DECL_LEXINSERT + + /// Expanded insertion. +#define DECL_EXPINSERT(VNAME, V) \ + virtual void expInsert(uint64_t *, V *, bool *, uint64_t *, uint64_t); + FOREVERY_V(DECL_EXPINSERT) +#undef DECL_EXPINSERT + + /// Finishes insertion. + virtual void endInsert() = 0; + +protected: + // Since this class is virtual, we must disallow public copying in + // order to avoid "slicing". Since this class has data members, + // that means making copying protected. + // + SparseTensorStorageBase(const SparseTensorStorageBase &) = default; + // Copy-assignment would be implicitly deleted (because `dimSizes` + // is const), so we explicitly delete it for clarity. + SparseTensorStorageBase &operator=(const SparseTensorStorageBase &) = delete; + +private: + const std::vector dimSizes; + std::vector rev; + const std::vector dimTypes; +}; + +// This forward decl is necessary for defining `SparseTensorStorage`, +// but isn't sufficient for splitting it off. +template +class SparseTensorEnumerator; + +/// A memory-resident sparse tensor using a storage scheme based on +/// per-dimension sparse/dense annotations. This data structure provides a +/// bufferized form of a sparse tensor type. In contrast to generating setup +/// methods for each differently annotated sparse tensor, this method provides +/// a convenient "one-size-fits-all" solution that simply takes an input tensor +/// and annotations to implement all required setup in a general manner. +template +class SparseTensorStorage final : public SparseTensorStorageBase { + /// Private constructor to share code between the other constructors. + /// Beware that the object is not necessarily guaranteed to be in a + /// valid state after this constructor alone; e.g., `isCompressedDim(d)` + /// doesn't entail `!(pointers[d].empty())`. + /// + /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. + SparseTensorStorage(const std::vector &dimSizes, + const uint64_t *perm, const DimLevelType *sparsity) + : SparseTensorStorageBase(dimSizes, perm, sparsity), pointers(getRank()), + indices(getRank()), idx(getRank()) {} + +public: + /// Constructs a sparse tensor storage scheme with the given dimensions, + /// permutation, and per-dimension dense/sparse annotations, using + /// the coordinate scheme tensor for the initial contents if provided. + /// + /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. + SparseTensorStorage(const std::vector &dimSizes, + const uint64_t *perm, const DimLevelType *sparsity, + SparseTensorCOO *coo) + : SparseTensorStorage(dimSizes, perm, sparsity) { + // Provide hints on capacity of pointers and indices. + // TODO: needs much fine-tuning based on actual sparsity; currently + // we reserve pointer/index space based on all previous dense + // dimensions, which works well up to first sparse dim; but + // we should really use nnz and dense/sparse distribution. 
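      // As a concrete illustration (added commentary only, no behavioral
      // change): for a 2-D CSR-like tensor annotated (dense, compressed)
      // with sizes {m, n}, the dense dim-0 sets sz = m, so the loop below
      // reserves m + 1 entries in pointers[1] and m entries in indices[1]
      // before resetting sz to 1.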
+ bool allDense = true; + uint64_t sz = 1; + for (uint64_t r = 0, rank = getRank(); r < rank; r++) { + if (isCompressedDim(r)) { + // TODO: Take a parameter between 1 and `dimSizes[r]`, and multiply + // `sz` by that before reserving. (For now we just use 1.) + pointers[r].reserve(sz + 1); + pointers[r].push_back(0); + indices[r].reserve(sz); + sz = 1; + allDense = false; + } else if (isSingletonDim(r)) { + indices[r].reserve(sz); + sz = 1; + allDense = false; + } else { // Dense dimension. + assert(isDenseDim(r)); + sz = detail::checkedMul(sz, getDimSizes()[r]); + } + } + // Then assign contents from coordinate scheme tensor if provided. + if (coo) { + // Ensure both preconditions of `fromCOO`. + assert(coo->getDimSizes() == getDimSizes() && "Tensor size mismatch"); + coo->sort(); + // Now actually insert the `elements`. + const std::vector> &elements = coo->getElements(); + uint64_t nnz = elements.size(); + values.reserve(nnz); + fromCOO(elements, 0, nnz, 0); + } else if (allDense) { + values.resize(sz, 0); + } + } + + /// Constructs a sparse tensor storage scheme with the given dimensions, + /// permutation, and per-dimension dense/sparse annotations, using + /// the given sparse tensor for the initial contents. + /// + /// Preconditions: + /// * `perm` and `sparsity` must be valid for `dimSizes.size()`. + /// * The `tensor` must have the same value type `V`. + SparseTensorStorage(const std::vector &dimSizes, + const uint64_t *perm, const DimLevelType *sparsity, + const SparseTensorStorageBase &tensor); + + ~SparseTensorStorage() final = default; + + /// Partially specialize these getter methods based on template types. + void getPointers(std::vector
<P>
**out, uint64_t d) final { + assert(d < getRank()); + *out = &pointers[d]; + } + void getIndices(std::vector **out, uint64_t d) final { + assert(d < getRank()); + *out = &indices[d]; + } + void getValues(std::vector **out) final { *out = &values; } + + /// Partially specialize lexicographical insertions based on template types. + void lexInsert(const uint64_t *cursor, V val) final { + // First, wrap up pending insertion path. + uint64_t diff = 0; + uint64_t top = 0; + if (!values.empty()) { + diff = lexDiff(cursor); + endPath(diff + 1); + top = idx[diff] + 1; + } + // Then continue with insertion path. + insPath(cursor, diff, top, val); + } + + /// Partially specialize expanded insertions based on template types. + /// Note that this method resets the values/filled-switch array back + /// to all-zero/false while only iterating over the nonzero elements. + void expInsert(uint64_t *cursor, V *values, bool *filled, uint64_t *added, + uint64_t count) final { + if (count == 0) + return; + // Sort. + std::sort(added, added + count); + // Restore insertion path for first insert. + const uint64_t lastDim = getRank() - 1; + uint64_t index = added[0]; + cursor[lastDim] = index; + lexInsert(cursor, values[index]); + assert(filled[index]); + values[index] = 0; + filled[index] = false; + // Subsequent insertions are quick. + for (uint64_t i = 1; i < count; i++) { + assert(index < added[i] && "non-lexicographic insertion"); + index = added[i]; + cursor[lastDim] = index; + insPath(cursor, lastDim, added[i - 1] + 1, values[index]); + assert(filled[index]); + values[index] = 0; + filled[index] = false; + } + } + + /// Finalizes lexicographic insertions. + void endInsert() final { + if (values.empty()) + finalizeSegment(0); + else + endPath(0); + } + + void newEnumerator(SparseTensorEnumeratorBase **out, uint64_t rank, + const uint64_t *perm) const final { + *out = new SparseTensorEnumerator(*this, rank, perm); + } + + /// Returns this sparse tensor storage scheme as a new memory-resident + /// sparse tensor in coordinate scheme with the given dimension order. + /// + /// Precondition: `perm` must be valid for `getRank()`. + SparseTensorCOO *toCOO(const uint64_t *perm) const { + SparseTensorEnumeratorBase *enumerator; + newEnumerator(&enumerator, getRank(), perm); + SparseTensorCOO *coo = + new SparseTensorCOO(enumerator->permutedSizes(), values.size()); + enumerator->forallElements([&coo](const std::vector &ind, V val) { + coo->add(ind, val); + }); + // TODO: This assertion assumes there are no stored zeros, + // or if there are then that we don't filter them out. + // Cf., + assert(coo->getElements().size() == values.size()); + delete enumerator; + return coo; + } + + /// Factory method. Constructs a sparse tensor storage scheme with the given + /// dimensions, permutation, and per-dimension dense/sparse annotations, + /// using the coordinate scheme tensor for the initial contents if provided. + /// In the latter case, the coordinate scheme must respect the same + /// permutation as is desired for the new sparse tensor storage. + /// + /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. 
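  /// For illustration only (not part of this patch), a sketch of using the
  /// factory defined just below to build CSR-style storage (dense outer
  /// dimension, compressed inner dimension) of doubles with 64-bit overhead
  /// types from an existing `SparseTensorCOO<double> *coo`:
  ///
  ///   const uint64_t shape[2] = {0, 0}; // 0 = accept the COO's sizes
  ///   const uint64_t perm[2] = {0, 1};  // identity permutation
  ///   const DimLevelType sparsity[2] = {DimLevelType::kDense,
  ///                                     DimLevelType::kCompressed};
  ///   auto *csr = SparseTensorStorage<uint64_t, uint64_t, double>::
  ///       newSparseTensor(2, shape, perm, sparsity, coo);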
+ static SparseTensorStorage * + newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, + const DimLevelType *sparsity, SparseTensorCOO *coo) { + SparseTensorStorage *n = nullptr; + if (coo) { + const auto &coosz = coo->getDimSizes(); + detail::assertPermutedSizesMatchShape(coosz, rank, perm, shape); + n = new SparseTensorStorage(coosz, perm, sparsity, coo); + } else { + std::vector permsz(rank); + for (uint64_t r = 0; r < rank; r++) { + assert(shape[r] > 0 && "Dimension size zero has trivial storage"); + permsz[perm[r]] = shape[r]; + } + // We pass the null `coo` to ensure we select the intended constructor. + n = new SparseTensorStorage(permsz, perm, sparsity, coo); + } + return n; + } + + /// Factory method. Constructs a sparse tensor storage scheme with + /// the given dimensions, permutation, and per-dimension dense/sparse + /// annotations, using the sparse tensor for the initial contents. + /// + /// Preconditions: + /// * `shape`, `perm`, and `sparsity` must be valid for `rank`. + /// * The `tensor` must have the same value type `V`. + static SparseTensorStorage * + newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, + const DimLevelType *sparsity, + const SparseTensorStorageBase *source) { + assert(source && "Got nullptr for source"); + SparseTensorEnumeratorBase *enumerator; + source->newEnumerator(&enumerator, rank, perm); + const auto &permsz = enumerator->permutedSizes(); + detail::assertPermutedSizesMatchShape(permsz, rank, perm, shape); + auto *tensor = + new SparseTensorStorage(permsz, perm, sparsity, *source); + delete enumerator; + return tensor; + } + +private: + /// Appends an arbitrary new position to `pointers[d]`. This method + /// checks that `pos` is representable in the `P` type; however, it + /// does not check that `pos` is semantically valid (i.e., larger than + /// the previous position and smaller than `indices[d].capacity()`). + void appendPointer(uint64_t d, uint64_t pos, uint64_t count = 1) { + assert(isCompressedDim(d)); + assert(pos <= std::numeric_limits
<P>
::max() && + "Pointer value is too large for the P-type"); + pointers[d].insert(pointers[d].end(), count, static_cast
<P>
(pos)); + } + + /// Appends index `i` to dimension `d`, in the semantically general + /// sense. For non-dense dimensions, that means appending to the + /// `indices[d]` array, checking that `i` is representable in the `I` + /// type; however, we do not verify other semantic requirements (e.g., + /// that `i` is in bounds for `dimSizes[d]`, and not previously occurring + /// in the same segment). For dense dimensions, this method instead + /// appends the appropriate number of zeros to the `values` array, + /// where `full` is the number of "entries" already written to `values` + /// for this segment (aka one after the highest index previously appended). + void appendIndex(uint64_t d, uint64_t full, uint64_t i) { + if (isCompressedDim(d) || isSingletonDim(d)) { + assert(i <= std::numeric_limits::max() && + "Index value is too large for the I-type"); + indices[d].push_back(static_cast(i)); + } else { // Dense dimension. + assert(isDenseDim(d)); + assert(i >= full && "Index was already filled"); + if (i == full) + return; // Short-circuit, since it'll be a nop. + if (d + 1 == getRank()) + values.insert(values.end(), i - full, 0); + else + finalizeSegment(d + 1, 0, i - full); + } + } + + /// Writes the given coordinate to `indices[d][pos]`. This method + /// checks that `i` is representable in the `I` type; however, it + /// does not check that `i` is semantically valid (i.e., in bounds + /// for `dimSizes[d]` and not elsewhere occurring in the same segment). + void writeIndex(uint64_t d, uint64_t pos, uint64_t i) { + assert(isCompressedDim(d)); + // Subscript assignment to `std::vector` requires that the `pos`-th + // entry has been initialized; thus we must be sure to check `size()` + // here, instead of `capacity()` as would be ideal. + assert(pos < indices[d].size() && "Index position is out of bounds"); + assert(i <= std::numeric_limits::max() && + "Index value is too large for the I-type"); + indices[d][pos] = static_cast(i); + } + + /// Computes the assembled-size associated with the `d`-th dimension, + /// given the assembled-size associated with the `(d-1)`-th dimension. + /// "Assembled-sizes" correspond to the (nominal) sizes of overhead + /// storage, as opposed to "dimension-sizes" which are the cardinality + /// of coordinates for that dimension. + /// + /// Precondition: the `pointers[d]` array must be fully initialized + /// before calling this method. + uint64_t assembledSize(uint64_t parentSz, uint64_t d) const { + if (isCompressedDim(d)) + return pointers[d][parentSz]; + // else if dense: + return parentSz * getDimSizes()[d]; + } + + /// Initializes sparse tensor storage scheme from a memory-resident sparse + /// tensor in coordinate scheme. This method prepares the pointers and + /// indices arrays under the given per-dimension dense/sparse annotations. + /// + /// Preconditions: + /// (1) the `elements` must be lexicographically sorted. + /// (2) the indices of every element are valid for `dimSizes` (equal rank + /// and pointwise less-than). + void fromCOO(const std::vector> &elements, uint64_t lo, + uint64_t hi, uint64_t d) { + uint64_t rank = getRank(); + assert(d <= rank && hi <= elements.size()); + // Once dimensions are exhausted, insert the numerical values. + if (d == rank) { + assert(lo < hi); + values.push_back(elements[lo].value); + return; + } + // Visit all elements in this interval. + uint64_t full = 0; + while (lo < hi) { // If `hi` is unchanged, then `lo < elements.size()`. + // Find segment in interval with same index elements in this dimension. 
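      // (Added commentary only: e.g., for a unique dimension at d == 0 with
      // sorted elements {(0,1), (0,2), (1,0)}, the first segment found below
      // is [lo, lo+2), since the two leading elements share index 0.)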
+ uint64_t i = elements[lo].indices[d]; + uint64_t seg = lo + 1; + bool merge = isUniqueDim(d); + while (merge && seg < hi && elements[seg].indices[d] == i) + seg++; + // Handle segment in interval for sparse or dense dimension. + appendIndex(d, full, i); + full = i + 1; + fromCOO(elements, lo, seg, d + 1); + // And move on to next segment in interval. + lo = seg; + } + // Finalize the sparse pointer structure at this dimension. + finalizeSegment(d, full); + } + + /// Finalize the sparse pointer structure at this dimension. + void finalizeSegment(uint64_t d, uint64_t full = 0, uint64_t count = 1) { + if (count == 0) + return; // Short-circuit, since it'll be a nop. + if (isCompressedDim(d)) { + appendPointer(d, indices[d].size(), count); + } else if (isSingletonDim(d)) { + return; + } else { // Dense dimension. + assert(isDenseDim(d)); + const uint64_t sz = getDimSizes()[d]; + assert(sz >= full && "Segment is overfull"); + count = detail::checkedMul(count, sz - full); + // For dense storage we must enumerate all the remaining coordinates + // in this dimension (i.e., coordinates after the last non-zero + // element), and either fill in their zero values or else recurse + // to finalize some deeper dimension. + if (d + 1 == getRank()) + values.insert(values.end(), count, 0); + else + finalizeSegment(d + 1, 0, count); + } + } + + /// Wraps up a single insertion path, inner to outer. + void endPath(uint64_t diff) { + uint64_t rank = getRank(); + assert(diff <= rank); + for (uint64_t i = 0; i < rank - diff; i++) { + const uint64_t d = rank - i - 1; + finalizeSegment(d, idx[d] + 1); + } + } + + /// Continues a single insertion path, outer to inner. + void insPath(const uint64_t *cursor, uint64_t diff, uint64_t top, V val) { + uint64_t rank = getRank(); + assert(diff < rank); + for (uint64_t d = diff; d < rank; d++) { + uint64_t i = cursor[d]; + appendIndex(d, top, i); + top = 0; + idx[d] = i; + } + values.push_back(val); + } + + /// Finds the lexicographic differing dimension. + uint64_t lexDiff(const uint64_t *cursor) const { + for (uint64_t r = 0, rank = getRank(); r < rank; r++) + if (cursor[r] > idx[r]) + return r; + else + assert(cursor[r] == idx[r] && "non-lexicographic insertion"); + assert(0 && "duplication insertion"); + return -1u; + } + + // Allow `SparseTensorEnumerator` to access the data-members (to avoid + // the cost of virtual-function dispatch in inner loops), without + // making them public to other client code. + friend class SparseTensorEnumerator; + + std::vector> pointers; + std::vector> indices; + std::vector values; + std::vector idx; // index cursor for lexicographic insertion. +}; + +/// A (higher-order) function object for enumerating the elements of some +/// `SparseTensorStorage` under a permutation. That is, the `forallElements` +/// method encapsulates the loop-nest for enumerating the elements of +/// the source tensor (in whatever order is best for the source tensor), +/// and applies a permutation to the coordinates/indices before handing +/// each element to the callback. A single enumerator object can be +/// freely reused for several calls to `forallElements`, just so long +/// as each call is sequential with respect to one another. +/// +/// N.B., this class stores a reference to the `SparseTensorStorageBase` +/// passed to the constructor; thus, objects of this class must not +/// outlive the sparse tensor they depend on. 
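/// For illustration only (not part of this patch), typical use mirrors the
/// `toCOO` method above: obtain an enumerator from a storage object and
/// visit every element under the chosen permutation:
///
///   SparseTensorEnumeratorBase<double> *enumerator;
///   tensor.newEnumerator(&enumerator, rank, perm);
///   enumerator->forallElements(
///       [](const std::vector<uint64_t> &ind, double val) {
///         // consume (ind, val); copy `ind` if it must outlive the call
///       });
///   delete enumerator;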
+/// +/// Design Note: The reason we define this class instead of simply using +/// `SparseTensorEnumerator` is because we need to hide/generalize +/// the `` template parameters from MLIR client code (to simplify the +/// type parameters used for direct sparse-to-sparse conversion). And the +/// reason we define the `SparseTensorEnumerator` subclasses rather +/// than simply using this class, is to avoid the cost of virtual-method +/// dispatch within the loop-nest. +template +class SparseTensorEnumeratorBase { +public: + /// Constructs an enumerator with the given permutation for mapping + /// the semantic-ordering of dimensions to the desired target-ordering. + /// + /// Preconditions: + /// * the `tensor` must have the same `V` value type. + /// * `perm` must be valid for `rank`. + SparseTensorEnumeratorBase(const SparseTensorStorageBase &tensor, + uint64_t rank, const uint64_t *perm) + : src(tensor), permsz(src.getRev().size()), reord(getRank()), + cursor(getRank()) { + assert(perm && "Received nullptr for permutation"); + assert(rank == getRank() && "Permutation rank mismatch"); + const auto &rev = src.getRev(); // source-order -> semantic-order + const auto &dimSizes = src.getDimSizes(); // in source storage-order + for (uint64_t s = 0; s < rank; s++) { // `s` source storage-order + uint64_t t = perm[rev[s]]; // `t` target-order + reord[s] = t; + permsz[t] = dimSizes[s]; + } + } + + virtual ~SparseTensorEnumeratorBase() = default; + + // We disallow copying to help avoid leaking the `src` reference. + // (In addition to avoiding the problem of slicing.) + SparseTensorEnumeratorBase(const SparseTensorEnumeratorBase &) = delete; + SparseTensorEnumeratorBase & + operator=(const SparseTensorEnumeratorBase &) = delete; + + /// Returns the source/target tensor's rank. (The source-rank and + /// target-rank are always equal since we only support permutations. + /// Though once we add support for other dimension mappings, this + /// method will have to be split in two.) + uint64_t getRank() const { return permsz.size(); } + + /// Returns the target tensor's dimension sizes. + const std::vector &permutedSizes() const { return permsz; } + + /// Enumerates all elements of the source tensor, permutes their + /// indices, and passes the permuted element to the callback. + /// The callback must not store the cursor reference directly, + /// since this function reuses the storage. Instead, the callback + /// must copy it if they want to keep it. + virtual void forallElements(ElementConsumer yield) = 0; + +protected: + const SparseTensorStorageBase &src; + std::vector permsz; // in target order. + std::vector reord; // source storage-order -> target order. + std::vector cursor; // in target order. +}; + +template +class SparseTensorEnumerator final : public SparseTensorEnumeratorBase { + using Base = SparseTensorEnumeratorBase; + +public: + /// Constructs an enumerator with the given permutation for mapping + /// the semantic-ordering of dimensions to the desired target-ordering. + /// + /// Precondition: `perm` must be valid for `rank`. + SparseTensorEnumerator(const SparseTensorStorage &tensor, + uint64_t rank, const uint64_t *perm) + : Base(tensor, rank, perm) {} + + ~SparseTensorEnumerator() final = default; + + void forallElements(ElementConsumer yield) final { + forallElements(yield, 0, 0); + } + +private: + /// The recursive component of the public `forallElements`. + void forallElements(ElementConsumer yield, uint64_t parentPos, + uint64_t d) { + // Recover the `` type parameters of `src`. 
+ const auto &src = + static_cast &>(this->src); + if (d == Base::getRank()) { + assert(parentPos < src.values.size() && + "Value position is out of bounds"); + // TODO: + yield(this->cursor, src.values[parentPos]); + } else if (src.isCompressedDim(d)) { + // Look up the bounds of the `d`-level segment determined by the + // `d-1`-level position `parentPos`. + const std::vector
<P>
&pointersD = src.pointers[d]; + assert(parentPos + 1 < pointersD.size() && + "Parent pointer position is out of bounds"); + const uint64_t pstart = static_cast(pointersD[parentPos]); + const uint64_t pstop = static_cast(pointersD[parentPos + 1]); + // Loop-invariant code for looking up the `d`-level coordinates/indices. + const std::vector &indicesD = src.indices[d]; + assert(pstop <= indicesD.size() && "Index position is out of bounds"); + uint64_t &cursorReordD = this->cursor[this->reord[d]]; + for (uint64_t pos = pstart; pos < pstop; pos++) { + cursorReordD = static_cast(indicesD[pos]); + forallElements(yield, pos, d + 1); + } + } else if (src.isSingletonDim(d)) { + MLIR_SPARSETENSOR_FATAL("unsupported dimension level type"); + } else { // Dense dimension. + assert(src.isDenseDim(d)); + const uint64_t sz = src.getDimSizes()[d]; + const uint64_t pstart = parentPos * sz; + uint64_t &cursorReordD = this->cursor[this->reord[d]]; + for (uint64_t i = 0; i < sz; i++) { + cursorReordD = i; + forallElements(yield, pstart + i, d + 1); + } + } + } +}; + +/// Statistics regarding the number of nonzero subtensors in +/// a source tensor, for direct sparse=>sparse conversion a la +/// . +/// +/// N.B., this class stores references to the parameters passed to +/// the constructor; thus, objects of this class must not outlive +/// those parameters. +class SparseTensorNNZ final { +public: + /// Allocate the statistics structure for the desired sizes and + /// sparsity (in the target tensor's storage-order). This constructor + /// does not actually populate the statistics, however; for that see + /// `initialize`. + /// + /// Precondition: `dimSizes` must not contain zeros. + SparseTensorNNZ(const std::vector &dimSizes, + const std::vector &sparsity); + + // We disallow copying to help avoid leaking the stored references. + SparseTensorNNZ(const SparseTensorNNZ &) = delete; + SparseTensorNNZ &operator=(const SparseTensorNNZ &) = delete; + + /// Returns the rank of the target tensor. + uint64_t getRank() const { return dimSizes.size(); } + + /// Enumerate the source tensor to fill in the statistics. The + /// enumerator should already incorporate the permutation (from + /// semantic-order to the target storage-order). + template + void initialize(SparseTensorEnumeratorBase &enumerator) { + assert(enumerator.getRank() == getRank() && "Tensor rank mismatch"); + assert(enumerator.permutedSizes() == dimSizes && "Tensor size mismatch"); + enumerator.forallElements( + [this](const std::vector &ind, V) { add(ind); }); + } + + /// The type of callback functions which receive an nnz-statistic. + using NNZConsumer = const std::function &; + + /// Lexicographically enumerates all indicies for dimensions strictly + /// less than `stopDim`, and passes their nnz statistic to the callback. + /// Since our use-case only requires the statistic not the coordinates + /// themselves, we do not bother to construct those coordinates. + void forallIndices(uint64_t stopDim, NNZConsumer yield) const; + +private: + /// Adds a new element (i.e., increment its statistics). We use + /// a method rather than inlining into the lambda in `initialize`, + /// to avoid spurious templating over `V`. And this method is private + /// to avoid needing to re-assert validity of `ind` (which is guaranteed + /// by `forallElements`). + void add(const std::vector &ind); + + /// Recursive component of the public `forallIndices`. 
+ void forallIndices(NNZConsumer yield, uint64_t stopDim, uint64_t parentPos, + uint64_t d) const; + + // All of these are in the target storage-order. + const std::vector &dimSizes; + const std::vector &dimTypes; + std::vector> nnz; +}; + +template +SparseTensorStorage::SparseTensorStorage( + const std::vector &dimSizes, const uint64_t *perm, + const DimLevelType *sparsity, const SparseTensorStorageBase &tensor) + : SparseTensorStorage(dimSizes, perm, sparsity) { + SparseTensorEnumeratorBase *enumerator; + tensor.newEnumerator(&enumerator, getRank(), perm); + { + // Initialize the statistics structure. + SparseTensorNNZ nnz(getDimSizes(), getDimTypes()); + nnz.initialize(*enumerator); + // Initialize "pointers" overhead (and allocate "indices", "values"). + uint64_t parentSz = 1; // assembled-size (not dimension-size) of `r-1`. + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + pointers[r].reserve(parentSz + 1); + pointers[r].push_back(0); + uint64_t currentPos = 0; + nnz.forallIndices(r, [this, ¤tPos, r](uint64_t n) { + currentPos += n; + appendPointer(r, currentPos); + }); + assert(pointers[r].size() == parentSz + 1 && + "Final pointers size doesn't match allocated size"); + // That assertion entails `assembledSize(parentSz, r)` + // is now in a valid state. That is, `pointers[r][parentSz]` + // equals the present value of `currentPos`, which is the + // correct assembled-size for `indices[r]`. + } + // Update assembled-size for the next iteration. + parentSz = assembledSize(parentSz, r); + // Ideally we need only `indices[r].reserve(parentSz)`, however + // the `std::vector` implementation forces us to initialize it too. + // That is, in the yieldPos loop we need random-access assignment + // to `indices[r]`; however, `std::vector`'s subscript-assignment + // only allows assigning to already-initialized positions. + if (isCompressedDim(r)) + indices[r].resize(parentSz, 0); + } + values.resize(parentSz, 0); // Both allocate and zero-initialize. + } + // The yieldPos loop + enumerator->forallElements([this](const std::vector &ind, V val) { + uint64_t parentSz = 1, parentPos = 0; + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + // If `parentPos == parentSz` then it's valid as an array-lookup; + // however, it's semantically invalid here since that entry + // does not represent a segment of `indices[r]`. Moreover, that + // entry must be immutable for `assembledSize` to remain valid. + assert(parentPos < parentSz && "Pointers position is out of bounds"); + const uint64_t currentPos = pointers[r][parentPos]; + // This increment won't overflow the `P` type, since it can't + // exceed the original value of `pointers[r][parentPos+1]` + // which was already verified to be within bounds for `P` + // when it was written to the array. + pointers[r][parentPos]++; + writeIndex(r, currentPos, ind[r]); + parentPos = currentPos; + } else if (isSingletonDim(r)) { + // the new parentPos equals the old parentPos. + } else { // Dense dimension. + assert(isDenseDim(r)); + parentPos = parentPos * getDimSizes()[r] + ind[r]; + } + parentSz = assembledSize(parentSz, r); + } + assert(parentPos < values.size() && "Value position is out of bounds"); + values[parentPos] = val; + }); + // No longer need the enumerator, so we'll delete it ASAP. 
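To make the pointer bookkeeping concrete: the yieldPos loop above and the finalizeYieldPos loop below follow the classic counting-sort idiom for assembling compressed overhead storage. A minimal standalone sketch of that idiom for one compressed dimension follows; every name in it is invented for illustration and none of it is part of the patch.

#include <cassert>
#include <cstdint>
#include <utility>
#include <vector>

int main() {
  // Lexicographically sorted COO elements of a 3x4 matrix: (row, col).
  std::vector<std::pair<uint64_t, uint64_t>> elems = {{0, 1}, {0, 3}, {2, 0}};
  const uint64_t rows = 3;
  // Pre-pass (standing in for the SparseTensorNNZ statistics): cumulative
  // per-row counts, giving pointers = {0, 2, 2, 3}.
  std::vector<uint64_t> pointers(rows + 1, 0);
  for (const auto &e : elems)
    pointers[e.first + 1]++;
  for (uint64_t r = 0; r < rows; r++)
    pointers[r + 1] += pointers[r];
  // "yieldPos" pass: scatter column indices; pointers[r] is bumped so that
  // it always names the next free slot of row r.
  std::vector<uint64_t> indices(elems.size());
  for (const auto &e : elems)
    indices[pointers[e.first]++] = e.second;
  // "finalizeYieldPos" pass: shift right by one and restore pointers[0],
  // recovering pointers = {0, 2, 2, 3} alongside indices = {1, 3, 0}.
  for (uint64_t r = rows; r > 0; r--)
    pointers[r] = pointers[r - 1];
  pointers[0] = 0;
  assert(pointers[1] == 2 && indices[0] == 1 && indices[2] == 0);
  return 0;
}

The constructor here generalizes this over all dimensions and overhead types, but the per-compressed-level arithmetic is the same scatter-then-shift scheme.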
+ delete enumerator; + // The finalizeYieldPos loop + for (uint64_t parentSz = 1, rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + assert(parentSz == pointers[r].size() - 1 && + "Actual pointers size doesn't match the expected size"); + // Can't check all of them, but at least we can check the last one. + assert(pointers[r][parentSz - 1] == pointers[r][parentSz] && + "Pointers got corrupted"); + // TODO: optimize this by using `memmove` or similar. + for (uint64_t n = 0; n < parentSz; n++) { + const uint64_t parentPos = parentSz - n; + pointers[r][parentPos] = pointers[r][parentPos - 1]; + } + pointers[r][0] = 0; + } + parentSz = assembledSize(parentSz, r); + } +} + +} // namespace sparse_tensor +} // namespace mlir + +#endif // MLIR_EXECUTIONENGINE_SPARSETENSOR_STORAGE_H diff --git a/mlir/include/mlir/ExecutionEngine/SparseTensorUtils.h b/mlir/include/mlir/ExecutionEngine/SparseTensorUtils.h --- a/mlir/include/mlir/ExecutionEngine/SparseTensorUtils.h +++ b/mlir/include/mlir/ExecutionEngine/SparseTensorUtils.h @@ -1,4 +1,4 @@ -//===- SparseTensorUtils.h - Enums shared with the runtime ------*- C++ -*-===// +//===- SparseTensorUtils.h - SparseTensor runtime support lib ---*- C++ -*-===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. @@ -7,126 +7,25 @@ //===----------------------------------------------------------------------===// // // This header file provides the enums and functions which comprise the -// public API of `ExecutionEngine/SparseUtils.cpp`. +// public API of the `ExecutionEngine/SparseTensorUtils.cpp` runtime +// support library for the SparseTensor dialect. // //===----------------------------------------------------------------------===// -#ifndef MLIR_EXECUTIONENGINE_SPARSETENSORUTILS_H_ -#define MLIR_EXECUTIONENGINE_SPARSETENSORUTILS_H_ +#ifndef MLIR_EXECUTIONENGINE_SPARSETENSORUTILS_H +#define MLIR_EXECUTIONENGINE_SPARSETENSORUTILS_H #include "mlir/ExecutionEngine/CRunnerUtils.h" -#include "mlir/ExecutionEngine/Float16bits.h" +#include "mlir/ExecutionEngine/SparseTensor/Enums.h" #include #include #include +using namespace mlir::sparse_tensor; + extern "C" { -//===----------------------------------------------------------------------===// -// -// Typedefs and enums. These are required to be public so that they -// can be shared with `Transforms/SparseTensorConversion.cpp`, since -// they define the arguments to the public functions declared later on. -// -// This section also defines x-macros -// so that we can generate variations of the public functions for each -// supported primary- and/or overhead-type. -// -//===----------------------------------------------------------------------===// - -/// This type is used in the public API at all places where MLIR expects -/// values with the built-in type "index". For now, we simply assume that -/// type is 64-bit, but targets with different "index" bit widths should -/// link with an alternatively built runtime support library. -// TODO: support such targets? -using index_type = uint64_t; - -/// Encoding of overhead types (both pointer overhead and indices -/// overhead), for "overloading" @newSparseTensor. -enum class OverheadType : uint32_t { - kIndex = 0, - kU64 = 1, - kU32 = 2, - kU16 = 3, - kU8 = 4 -}; - -// This x-macro calls its argument on every overhead type which has -// fixed-width. 
It excludes `index_type` because that type is often -// handled specially (e.g., by translating it into the architecture-dependent -// equivalent fixed-width overhead type). -#define FOREVERY_FIXED_O(DO) \ - DO(64, uint64_t) \ - DO(32, uint32_t) \ - DO(16, uint16_t) \ - DO(8, uint8_t) - -// This x-macro calls its argument on every overhead type, including -// `index_type`. -#define FOREVERY_O(DO) \ - FOREVERY_FIXED_O(DO) \ - DO(0, index_type) - -// These are not just shorthands but indicate the particular -// implementation used (e.g., as opposed to C99's `complex double`, -// or MLIR's `ComplexType`). -using complex64 = std::complex; -using complex32 = std::complex; - -/// Encoding of the elemental type, for "overloading" @newSparseTensor. -enum class PrimaryType : uint32_t { - kF64 = 1, - kF32 = 2, - kF16 = 3, - kBF16 = 4, - kI64 = 5, - kI32 = 6, - kI16 = 7, - kI8 = 8, - kC64 = 9, - kC32 = 10 -}; - -// This x-macro includes all `V` types. -#define FOREVERY_V(DO) \ - DO(F64, double) \ - DO(F32, float) \ - DO(F16, f16) \ - DO(BF16, bf16) \ - DO(I64, int64_t) \ - DO(I32, int32_t) \ - DO(I16, int16_t) \ - DO(I8, int8_t) \ - DO(C64, complex64) \ - DO(C32, complex32) - -/// The actions performed by @newSparseTensor. -enum class Action : uint32_t { - kEmpty = 0, - kFromFile = 1, - kFromCOO = 2, - kSparseToSparse = 3, - kEmptyCOO = 4, - kToCOO = 5, - kToIterator = 6, -}; - -/// This enum mimics `SparseTensorEncodingAttr::DimLevelType` for -/// breaking dependency cycles. `SparseTensorEncodingAttr::DimLevelType` -/// is the source of truth and this enum should be kept consistent with it. -enum class DimLevelType : uint8_t { - kDense = 0, - kCompressed = 1, - kCompressedNu = 2, - kCompressedNo = 3, - kCompressedNuNo = 4, - kSingleton = 5, - kSingletonNu = 6, - kSingletonNo = 7, - kSingletonNuNo = 8, -}; - //===----------------------------------------------------------------------===// // // Public functions which operate on MLIR buffers (memrefs) to interact @@ -243,7 +142,7 @@ #undef DECL_OUTSPARSETENSOR /// Releases the memory for the tensor-storage object. -void delSparseTensor(void *tensor); +MLIR_CRUNNERUTILS_EXPORT void delSparseTensor(void *tensor); /// Releases the memory for the coordinate-scheme object. 
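The FOREVERY_* x-macros removed from this header (and consumers such as DECL_DELCOO just below) follow a common idiom that is easy to misread: the caller supplies a DO(NAME, TYPE) macro and the list applies it to every entry, generating one definition per supported type. A tiny self-contained illustration with made-up names, not the library's actual macros:

#include <cstdint>
#include <cstdio>

// The "list" macro: applies DO to each supported (NAME, TYPE) pair.
#define FOREVERY_DEMO_TYPE(DO)                                                 \
  DO(I32, int32_t)                                                             \
  DO(I64, int64_t)

// A client macro that generates one function per entry.
#define DEMO_PRINT_SIZE(NAME, T)                                               \
  void printSize##NAME() { std::printf(#NAME ": %zu bytes\n", sizeof(T)); }
FOREVERY_DEMO_TYPE(DEMO_PRINT_SIZE) // Defines printSizeI32() and printSizeI64().
#undef DEMO_PRINT_SIZE

int main() {
  printSizeI32();
  printSizeI64();
  return 0;
}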
#define DECL_DELCOO(VNAME, V) \ @@ -308,4 +207,4 @@ } // extern "C" -#endif // MLIR_EXECUTIONENGINE_SPARSETENSORUTILS_H_ +#endif // MLIR_EXECUTIONENGINE_SPARSETENSORUTILS_H diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h --- a/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h +++ b/mlir/lib/Dialect/SparseTensor/Transforms/CodegenUtils.h @@ -16,7 +16,7 @@ #include "mlir/Dialect/Arith/IR/Arith.h" #include "mlir/Dialect/Complex/IR/Complex.h" #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" -#include "mlir/ExecutionEngine/SparseTensorUtils.h" +#include "mlir/ExecutionEngine/SparseTensor/Enums.h" #include "mlir/IR/Builders.h" namespace mlir { diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp @@ -28,7 +28,7 @@ #include "mlir/Dialect/SparseTensor/IR/SparseTensor.h" #include "mlir/Dialect/SparseTensor/Transforms/Passes.h" #include "mlir/Dialect/Tensor/IR/Tensor.h" -#include "mlir/ExecutionEngine/SparseTensorUtils.h" +#include "mlir/ExecutionEngine/SparseTensor/Enums.h" #include "mlir/Transforms/DialectConversion.h" using namespace mlir; diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -118,13 +118,26 @@ MLIRSupport ) +add_mlir_library(mlir_float16_utils + SHARED + Float16bits.cpp + + EXCLUDE_FROM_LIBMLIR + ) +set_property(TARGET mlir_float16_utils PROPERTY CXX_STANDARD 17) +target_compile_definitions(mlir_float16_utils PRIVATE mlir_float16_utils_EXPORTS) + +add_subdirectory(SparseTensor) + add_mlir_library(mlir_c_runner_utils SHARED CRunnerUtils.cpp - Float16bits.cpp SparseTensorUtils.cpp EXCLUDE_FROM_LIBMLIR + + LINK_LIBS PUBLIC + mlir_sparse_tensor_utils ) set_property(TARGET mlir_c_runner_utils PROPERTY CXX_STANDARD 17) target_compile_definitions(mlir_c_runner_utils PRIVATE mlir_c_runner_utils_EXPORTS) diff --git a/mlir/lib/ExecutionEngine/Float16bits.cpp b/mlir/lib/ExecutionEngine/Float16bits.cpp --- a/mlir/lib/ExecutionEngine/Float16bits.cpp +++ b/mlir/lib/ExecutionEngine/Float16bits.cpp @@ -12,6 +12,9 @@ //===----------------------------------------------------------------------===// #include "mlir/ExecutionEngine/Float16bits.h" + +#ifdef MLIR_FLOAT16_DEFINE_FUNCTIONS // We are building this library + #include #include @@ -188,3 +191,5 @@ // cases. return __truncsfbf2(static_cast(d)); } + +#endif // MLIR_FLOAT16_DEFINE_FUNCTIONS diff --git a/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt b/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/mlir/lib/ExecutionEngine/SparseTensor/CMakeLists.txt @@ -0,0 +1,23 @@ +# Unlike mlir_float16_utils, mlir_c_runner_utils, etc, we do *not* make +# this a shared library: because doing so causes issues building on Windows. 
+add_mlir_library(mlir_sparse_tensor_utils + File.cpp + NNZ.cpp + Storage.cpp + + EXCLUDE_FROM_LIBMLIR + + LINK_LIBS PUBLIC + mlir_float16_utils + ) +set_property(TARGET mlir_sparse_tensor_utils PROPERTY CXX_STANDARD 17) +target_compile_definitions(mlir_sparse_tensor_utils PRIVATE mlir_sparse_tensor_utils_EXPORTS) + +# To make sure we adhere to the style guide: +# +check_cxx_compiler_flag(-Wweak-vtables + COMPILER_SUPPORTS_WARNING_WEAK_VTABLES) +if(COMPILER_SUPPORTS_WARNING_WEAK_VTABLES) + target_compile_options(mlir_sparse_tensor_utils PUBLIC + "-Wweak-vtables") +endif() diff --git a/mlir/lib/ExecutionEngine/SparseTensor/File.cpp b/mlir/lib/ExecutionEngine/SparseTensor/File.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/ExecutionEngine/SparseTensor/File.cpp @@ -0,0 +1,163 @@ +//===- File.cpp - Parsing sparse tensors from files -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements parsing and printing of files in one of the +// following external formats: +// +// (1) Matrix Market Exchange (MME): *.mtx +// https://math.nist.gov/MatrixMarket/formats.html +// +// (2) Formidable Repository of Open Sparse Tensors and Tools (FROSTT): *.tns +// http://frostt.io/tensors/file-formats.html +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/SparseTensor/File.h" + +#ifdef MLIR_SPARSETENSOR_DEFINE_FUNCTIONS // We are building this library + +#include +#include + +using namespace mlir::sparse_tensor; + +/// Opens the file for reading. +void SparseTensorFile::openFile() { + if (file) + MLIR_SPARSETENSOR_FATAL("Already opened file %s\n", filename); + file = fopen(filename, "r"); + if (!file) + MLIR_SPARSETENSOR_FATAL("Cannot find file %s\n", filename); +} + +/// Closes the file. +void SparseTensorFile::closeFile() { + if (file) { + fclose(file); + file = nullptr; + } +} + +// TODO(wrengr/bixia): figure out how to reorganize the element-parsing +// loop of `openSparseTensorCOO` into methods of this class, so we can +// avoid leaking access to the `line` pointer (both for general hygiene +// and because we can't mark it const due to the second argument of +// `strtoul`/`strtoud` being `char * *restrict` rather than +// `char const* *restrict`). +// +/// Attempts to read a line from the file. +char *SparseTensorFile::readLine() { + if (fgets(line, kColWidth, file)) + return line; + MLIR_SPARSETENSOR_FATAL("Cannot read next line of %s\n", filename); +} + +/// Reads and parses the file's header. +void SparseTensorFile::readHeader() { + assert(file && "Attempt to readHeader() before openFile()"); + if (strstr(filename, ".mtx")) + readMMEHeader(); + else if (strstr(filename, ".tns")) + readExtFROSTTHeader(); + else + MLIR_SPARSETENSOR_FATAL("Unknown format %s\n", filename); + assert(isValid() && "Failed to read the header"); +} + +/// Asserts the shape subsumes the actual dimension sizes. 
Is only +/// valid after parsing the header. +void SparseTensorFile::assertMatchesShape(uint64_t rank, + const uint64_t *shape) const { + assert(rank == getRank() && "Rank mismatch"); + for (uint64_t r = 0; r < rank; r++) + assert((shape[r] == 0 || shape[r] == idata[2 + r]) && + "Dimension size mismatch"); +} + +/// Helper to convert string to lower case. +static inline char *toLower(char *token) { + for (char *c = token; *c; ++c) + *c = tolower(*c); + return token; +} + +/// Read the MME header of a general sparse matrix of type real. +void SparseTensorFile::readMMEHeader() { + char header[64]; + char object[64]; + char format[64]; + char field[64]; + char symmetry[64]; + // Read header line. + if (fscanf(file, "%63s %63s %63s %63s %63s\n", header, object, format, field, + symmetry) != 5) + MLIR_SPARSETENSOR_FATAL("Corrupt header in %s\n", filename); + // Process `field`, which specify pattern or the data type of the values. + if (strcmp(toLower(field), "pattern") == 0) + valueKind_ = ValueKind::kPattern; + else if (strcmp(toLower(field), "real") == 0) + valueKind_ = ValueKind::kReal; + else if (strcmp(toLower(field), "integer") == 0) + valueKind_ = ValueKind::kInteger; + else if (strcmp(toLower(field), "complex") == 0) + valueKind_ = ValueKind::kComplex; + else + MLIR_SPARSETENSOR_FATAL("Unexpected header field value in %s\n", filename); + + // Set properties. + isSymmetric_ = (strcmp(toLower(symmetry), "symmetric") == 0); + // Make sure this is a general sparse matrix. + if (strcmp(toLower(header), "%%matrixmarket") || + strcmp(toLower(object), "matrix") || + strcmp(toLower(format), "coordinate") || + (strcmp(toLower(symmetry), "general") && !isSymmetric_)) + MLIR_SPARSETENSOR_FATAL("Cannot find a general sparse matrix in %s\n", + filename); + // Skip comments. + while (true) { + readLine(); + if (line[0] != '%') + break; + } + // Next line contains M N NNZ. + idata[0] = 2; // rank + if (sscanf(line, "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n", idata + 2, idata + 3, + idata + 1) != 3) + MLIR_SPARSETENSOR_FATAL("Cannot find size in %s\n", filename); +} + +/// Read the "extended" FROSTT header. Although not part of the documented +/// format, we assume that the file starts with optional comments followed +/// by two lines that define the rank, the number of nonzeros, and the +/// dimensions sizes (one per rank) of the sparse tensor. +void SparseTensorFile::readExtFROSTTHeader() { + // Skip comments. + while (true) { + readLine(); + if (line[0] != '#') + break; + } + // Next line contains RANK and NNZ. + if (sscanf(line, "%" PRIu64 "%" PRIu64 "\n", idata, idata + 1) != 2) + MLIR_SPARSETENSOR_FATAL("Cannot find metadata in %s\n", filename); + // Followed by a line with the dimension sizes (one per rank). + for (uint64_t r = 0; r < idata[0]; r++) + if (fscanf(file, "%" PRIu64, idata + 2 + r) != 1) + MLIR_SPARSETENSOR_FATAL("Cannot find dimension size %s\n", filename); + readLine(); // end of line + // The FROSTT format does not define the data type of the nonzero elements. + valueKind_ = ValueKind::kUndefined; +} + +#endif // MLIR_SPARSETENSOR_DEFINE_FUNCTIONS diff --git a/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp b/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/ExecutionEngine/SparseTensor/NNZ.cpp @@ -0,0 +1,88 @@ +//===- NNZ.cpp - NNZ-statistics for direct sparse2sparse conversion -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains method definitions for `SparseTensorNNZ`. +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/SparseTensor/Storage.h" + +#ifdef MLIR_SPARSETENSOR_DEFINE_FUNCTIONS // We are building this library + +using namespace mlir::sparse_tensor; + +SparseTensorNNZ::SparseTensorNNZ(const std::vector &dimSizes, + const std::vector &sparsity) + : dimSizes(dimSizes), dimTypes(sparsity), nnz(getRank()) { + assert(dimSizes.size() == dimTypes.size() && "Rank mismatch"); + bool uncompressed = true; + (void)uncompressed; + uint64_t sz = 1; // the product of all `dimSizes` strictly less than `r`. + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + switch (dimTypes[r]) { + case DimLevelType::kCompressed: + assert(uncompressed && + "Multiple compressed layers not currently supported"); + uncompressed = false; + nnz[r].resize(sz, 0); // Both allocate and zero-initialize. + break; + case DimLevelType::kDense: + assert(uncompressed && "Dense after compressed not currently supported"); + break; + case DimLevelType::kSingleton: + // Singleton after Compressed causes no problems for allocating + // `nnz` nor for the yieldPos loop. This remains true even + // when adding support for multiple compressed dimensions or + // for dense-after-compressed. + break; + default: + MLIR_SPARSETENSOR_FATAL("unsupported dimension level type"); + } + sz = detail::checkedMul(sz, dimSizes[r]); + } +} + +void SparseTensorNNZ::forallIndices(uint64_t stopDim, + SparseTensorNNZ::NNZConsumer yield) const { + assert(stopDim < getRank() && "Stopping-dimension is out of bounds"); + assert(dimTypes[stopDim] == DimLevelType::kCompressed && + "Cannot look up non-compressed dimensions"); + forallIndices(yield, stopDim, 0, 0); +} + +void SparseTensorNNZ::add(const std::vector &ind) { + uint64_t parentPos = 0; + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (dimTypes[r] == DimLevelType::kCompressed) + nnz[r][parentPos]++; + parentPos = parentPos * dimSizes[r] + ind[r]; + } +} + +void SparseTensorNNZ::forallIndices(SparseTensorNNZ::NNZConsumer yield, + uint64_t stopDim, uint64_t parentPos, + uint64_t d) const { + assert(d <= stopDim); + if (d == stopDim) { + assert(parentPos < nnz[d].size() && "Cursor is out of range"); + yield(nnz[d][parentPos]); + } else { + const uint64_t sz = dimSizes[d]; + const uint64_t pstart = parentPos * sz; + for (uint64_t i = 0; i < sz; i++) + forallIndices(yield, stopDim, pstart + i, d + 1); + } +} + +#endif // MLIR_SPARSETENSOR_DEFINE_FUNCTIONS diff --git a/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp new file mode 100644 --- /dev/null +++ b/mlir/lib/ExecutionEngine/SparseTensor/Storage.cpp @@ -0,0 +1,100 @@ +//===- StorageBase.cpp - TACO-flavored sparse tensor representation -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file contains method definitions for `SparseTensorStorageBase`. +// In particular we want to ensure that the default implementations of +// the "partial method specialization" trick aren't inline (since there's +// no benefit). Though this also helps ensure that we avoid weak-vtables: +// +// +// This file is part of the lightweight runtime support library for sparse +// tensor manipulations. The functionality of the support library is meant +// to simplify benchmarking, testing, and debugging MLIR code operating on +// sparse tensors. However, the provided functionality is **not** part of +// core MLIR itself. +// +//===----------------------------------------------------------------------===// + +#include "mlir/ExecutionEngine/SparseTensor/Storage.h" + +#ifdef MLIR_SPARSETENSOR_DEFINE_FUNCTIONS // We are building this library + +using namespace mlir::sparse_tensor; + +SparseTensorStorageBase::SparseTensorStorageBase( + const std::vector &dimSizes, const uint64_t *perm, + const DimLevelType *sparsity) + : dimSizes(dimSizes), rev(getRank()), + dimTypes(sparsity, sparsity + getRank()) { + assert(perm && sparsity); + const uint64_t rank = getRank(); + // Validate parameters. + assert(rank > 0 && "Trivial shape is unsupported"); + for (uint64_t r = 0; r < rank; r++) { + assert(dimSizes[r] > 0 && "Dimension size zero has trivial storage"); + assert((isDenseDim(r) || isCompressedDim(r) || isSingletonDim(r)) && + "Unsupported DimLevelType"); + } + // Construct the "reverse" (i.e., inverse) permutation. + for (uint64_t r = 0; r < rank; r++) + rev[perm[r]] = r; +} + +// Helper macro for generating error messages when some +// `SparseTensorStorage` is cast to `SparseTensorStorageBase` +// and then the wrong "partial method specialization" is called. +#define FATAL_PIV(NAME) \ + MLIR_SPARSETENSOR_FATAL(" type mismatch for: " #NAME); + +#define IMPL_NEWENUMERATOR(VNAME, V) \ + void SparseTensorStorageBase::newEnumerator( \ + SparseTensorEnumeratorBase **, uint64_t, const uint64_t *) const { \ + FATAL_PIV("newEnumerator" #VNAME); \ + } +FOREVERY_V(IMPL_NEWENUMERATOR) +#undef IMPL_NEWENUMERATOR + +#define IMPL_GETPOINTERS(PNAME, P) \ + void SparseTensorStorageBase::getPointers(std::vector
<P>
**, uint64_t) { \ + FATAL_PIV("getPointers" #PNAME); \ + } +FOREVERY_FIXED_O(IMPL_GETPOINTERS) +#undef IMPL_GETPOINTERS + +#define IMPL_GETINDICES(INAME, I) \ + void SparseTensorStorageBase::getIndices(std::vector **, uint64_t) { \ + FATAL_PIV("getIndices" #INAME); \ + } +FOREVERY_FIXED_O(IMPL_GETINDICES) +#undef IMPL_GETINDICES + +#define IMPL_GETVALUES(VNAME, V) \ + void SparseTensorStorageBase::getValues(std::vector **) { \ + FATAL_PIV("getValues" #VNAME); \ + } +FOREVERY_V(IMPL_GETVALUES) +#undef IMPL_GETVALUES + +#define IMPL_LEXINSERT(VNAME, V) \ + void SparseTensorStorageBase::lexInsert(const uint64_t *, V) { \ + FATAL_PIV("lexInsert" #VNAME); \ + } +FOREVERY_V(IMPL_LEXINSERT) +#undef IMPL_LEXINSERT + +#define IMPL_EXPINSERT(VNAME, V) \ + void SparseTensorStorageBase::expInsert(uint64_t *, V *, bool *, uint64_t *, \ + uint64_t) { \ + FATAL_PIV("expInsert" #VNAME); \ + } +FOREVERY_V(IMPL_EXPINSERT) +#undef IMPL_EXPINSERT + +#undef FATAL_PIV + +#endif // MLIR_SPARSETENSOR_DEFINE_FUNCTIONS diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp --- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp +++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp @@ -15,6 +15,10 @@ //===----------------------------------------------------------------------===// #include "mlir/ExecutionEngine/SparseTensorUtils.h" +#include "mlir/ExecutionEngine/SparseTensor/COO.h" +#include "mlir/ExecutionEngine/SparseTensor/ErrorHandling.h" +#include "mlir/ExecutionEngine/SparseTensor/File.h" +#include "mlir/ExecutionEngine/SparseTensor/Storage.h" #ifdef MLIR_CRUNNERUTILS_DEFINE_FUNCTIONS @@ -30,6 +34,8 @@ #include #include +using namespace mlir::sparse_tensor; + //===----------------------------------------------------------------------===// // // Internal support for storing and reading sparse tensors. @@ -68,1363 +74,6 @@ namespace { -static constexpr int kColWidth = 1025; - -/// A version of `operator*` on `uint64_t` which checks for overflows. -static inline uint64_t checkedMul(uint64_t lhs, uint64_t rhs) { - assert((lhs == 0 || rhs <= std::numeric_limits::max() / lhs) && - "Integer overflow"); - return lhs * rhs; -} - -// This macro helps minimize repetition of this idiom, as well as ensuring -// we have some additional output indicating where the error is coming from. -// (Since `fprintf` doesn't provide a stacktrace, this helps make it easier -// to track down whether an error is coming from our code vs somewhere else -// in MLIR.) -#define FATAL(...) \ - do { \ - fprintf(stderr, "SparseTensorUtils: " __VA_ARGS__); \ - exit(1); \ - } while (0) - -// TODO: try to unify this with `SparseTensorFile::assertMatchesShape` -// which is used by `openSparseTensorCOO`. It's easy enough to resolve -// the `std::vector` vs pointer mismatch for `dimSizes`; but it's trickier -// to resolve the presence/absence of `perm` (without introducing extra -// overhead), so perhaps the code duplication is unavoidable. -// -/// Asserts that the `dimSizes` (in target-order) under the `perm` (mapping -/// semantic-order to target-order) are a refinement of the desired `shape` -/// (in semantic-order). -/// -/// Precondition: `perm` and `shape` must be valid for `rank`. 
-static inline void -assertPermutedSizesMatchShape(const std::vector &dimSizes, - uint64_t rank, const uint64_t *perm, - const uint64_t *shape) { - assert(perm && shape); - assert(rank == dimSizes.size() && "Rank mismatch"); - for (uint64_t r = 0; r < rank; r++) - assert((shape[r] == 0 || shape[r] == dimSizes[perm[r]]) && - "Dimension size mismatch"); -} - -/// A sparse tensor element in coordinate scheme (value and indices). -/// For example, a rank-1 vector element would look like -/// ({i}, a[i]) -/// and a rank-5 tensor element like -/// ({i,j,k,l,m}, a[i,j,k,l,m]) -/// We use pointer to a shared index pool rather than e.g. a direct -/// vector since that (1) reduces the per-element memory footprint, and -/// (2) centralizes the memory reservation and (re)allocation to one place. -template -struct Element final { - Element(uint64_t *ind, V val) : indices(ind), value(val){}; - uint64_t *indices; // pointer into shared index pool - V value; -}; - -/// The type of callback functions which receive an element. We avoid -/// packaging the coordinates and value together as an `Element` object -/// because this helps keep code somewhat cleaner. -template -using ElementConsumer = - const std::function &, V)> &; - -/// A memory-resident sparse tensor in coordinate scheme (collection of -/// elements). This data structure is used to read a sparse tensor from -/// any external format into memory and sort the elements lexicographically -/// by indices before passing it back to the client (most packed storage -/// formats require the elements to appear in lexicographic index order). -template -struct SparseTensorCOO final { -public: - SparseTensorCOO(const std::vector &dimSizes, uint64_t capacity) - : dimSizes(dimSizes) { - if (capacity) { - elements.reserve(capacity); - indices.reserve(capacity * getRank()); - } - } - - /// Adds element as indices and value. - void add(const std::vector &ind, V val) { - assert(!iteratorLocked && "Attempt to add() after startIterator()"); - uint64_t *base = indices.data(); - uint64_t size = indices.size(); - uint64_t rank = getRank(); - assert(ind.size() == rank && "Element rank mismatch"); - for (uint64_t r = 0; r < rank; r++) { - assert(ind[r] < dimSizes[r] && "Index is too large for the dimension"); - indices.push_back(ind[r]); - } - // This base only changes if indices were reallocated. In that case, we - // need to correct all previous pointers into the vector. Note that this - // only happens if we did not set the initial capacity right, and then only - // for every internal vector reallocation (which with the doubling rule - // should only incur an amortized linear overhead). - uint64_t *newBase = indices.data(); - if (newBase != base) { - for (uint64_t i = 0, n = elements.size(); i < n; i++) - elements[i].indices = newBase + (elements[i].indices - base); - base = newBase; - } - // Add element as (pointer into shared index pool, value) pair. - elements.emplace_back(base + size, val); - } - - /// Sorts elements lexicographically by index. - void sort() { - assert(!iteratorLocked && "Attempt to sort() after startIterator()"); - // TODO: we may want to cache an `isSorted` bit, to avoid - // unnecessary/redundant sorting. - uint64_t rank = getRank(); - std::sort(elements.begin(), elements.end(), - [rank](const Element &e1, const Element &e2) { - for (uint64_t r = 0; r < rank; r++) { - if (e1.indices[r] == e2.indices[r]) - continue; - return e1.indices[r] < e2.indices[r]; - } - return false; - }); - } - - /// Get the rank of the tensor. 
- uint64_t getRank() const { return dimSizes.size(); } - - /// Getter for the dimension-sizes array. - const std::vector &getDimSizes() const { return dimSizes; } - - /// Getter for the elements array. - const std::vector> &getElements() const { return elements; } - - /// Switch into iterator mode. - void startIterator() { - iteratorLocked = true; - iteratorPos = 0; - } - - /// Get the next element. - const Element *getNext() { - assert(iteratorLocked && "Attempt to getNext() before startIterator()"); - if (iteratorPos < elements.size()) - return &(elements[iteratorPos++]); - iteratorLocked = false; - return nullptr; - } - - /// Factory method. Permutes the original dimensions according to - /// the given ordering and expects subsequent add() calls to honor - /// that same ordering for the given indices. The result is a - /// fully permuted coordinate scheme. - /// - /// Precondition: `dimSizes` and `perm` must be valid for `rank`. - static SparseTensorCOO *newSparseTensorCOO(uint64_t rank, - const uint64_t *dimSizes, - const uint64_t *perm, - uint64_t capacity = 0) { - std::vector permsz(rank); - for (uint64_t r = 0; r < rank; r++) { - assert(dimSizes[r] > 0 && "Dimension size zero has trivial storage"); - permsz[perm[r]] = dimSizes[r]; - } - return new SparseTensorCOO(permsz, capacity); - } - -private: - const std::vector dimSizes; // per-dimension sizes - std::vector> elements; // all COO elements - std::vector indices; // shared index pool - bool iteratorLocked = false; - unsigned iteratorPos = 0; -}; - -// Forward. -template -class SparseTensorEnumeratorBase; - -// Helper macro for generating error messages when some -// `SparseTensorStorage` is cast to `SparseTensorStorageBase` -// and then the wrong "partial method specialization" is called. -#define FATAL_PIV(NAME) FATAL(" type mismatch for: " #NAME); - -/// Abstract base class for `SparseTensorStorage`. This class -/// takes responsibility for all the ``-independent aspects -/// of the tensor (e.g., shape, sparsity, permutation). In addition, -/// we use function overloading to implement "partial" method -/// specialization, which the C-API relies on to catch type errors -/// arising from our use of opaque pointers. -class SparseTensorStorageBase { -public: - /// Constructs a new storage object. The `perm` maps the tensor's - /// semantic-ordering of dimensions to this object's storage-order. - /// The `dimSizes` and `sparsity` arrays are already in storage-order. - /// - /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. - SparseTensorStorageBase(const std::vector &dimSizes, - const uint64_t *perm, const DimLevelType *sparsity) - : dimSizes(dimSizes), rev(getRank()), - dimTypes(sparsity, sparsity + getRank()) { - assert(perm && sparsity); - const uint64_t rank = getRank(); - // Validate parameters. - assert(rank > 0 && "Trivial shape is unsupported"); - for (uint64_t r = 0; r < rank; r++) { - assert(dimSizes[r] > 0 && "Dimension size zero has trivial storage"); - assert((dimTypes[r] == DimLevelType::kDense || - dimTypes[r] == DimLevelType::kCompressed) && - "Unsupported DimLevelType"); - } - // Construct the "reverse" (i.e., inverse) permutation. - for (uint64_t r = 0; r < rank; r++) - rev[perm[r]] = r; - } - - virtual ~SparseTensorStorageBase() = default; - - /// Get the rank of the tensor. - uint64_t getRank() const { return dimSizes.size(); } - - /// Getter for the dimension-sizes array, in storage-order. 
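The "partial" method specialization trick described in the class comment above (and given out-of-line default definitions in the new Storage.cpp) is worth a concrete illustration. The sketch below uses invented Demo* names and is not the library's actual class hierarchy: the non-template base declares one virtual overload per supported type, each defaulting to a fatal error, and the template subclass overrides only the overload matching its own value type, so a mistyped call through an opaque base pointer fails loudly at runtime.

#include <cstdio>
#include <cstdlib>
#include <vector>

struct DemoStorageBase {
  virtual ~DemoStorageBase() = default;
  // One overload per supported value type; defaults report a type mismatch.
  virtual void getValues(std::vector<float> **) { fatal("getValuesF32"); }
  virtual void getValues(std::vector<double> **) { fatal("getValuesF64"); }

protected:
  static void fatal(const char *name) {
    std::fprintf(stderr, "type mismatch for: %s\n", name);
    std::exit(1);
  }
};

template <typename V>
struct DemoStorage final : public DemoStorageBase {
  // Overrides only the overload matching V; the other defaults remain.
  void getValues(std::vector<V> **out) final { *out = &values; }
  std::vector<V> values;
};

int main() {
  DemoStorage<double> tensor;
  DemoStorageBase *opaque = &tensor; // how the C API sees the object
  std::vector<double> *vals = nullptr;
  opaque->getValues(&vals); // OK: virtual dispatch reaches DemoStorage<double>
  // std::vector<float> *wrong = nullptr;
  // opaque->getValues(&wrong); // compiles, but would hit the fatal default
  return 0;
}

This mirrors how the FATAL_PIV defaults in Storage.cpp catch value-type mismatches arriving from the C API.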
- const std::vector &getDimSizes() const { return dimSizes; } - - /// Safely lookup the size of the given (storage-order) dimension. - uint64_t getDimSize(uint64_t d) const { - assert(d < getRank()); - return dimSizes[d]; - } - - /// Getter for the "reverse" permutation, which maps this object's - /// storage-order to the tensor's semantic-order. - const std::vector &getRev() const { return rev; } - - /// Getter for the dimension-types array, in storage-order. - const std::vector &getDimTypes() const { return dimTypes; } - - /// Safely check if the (storage-order) dimension uses compressed storage. - bool isCompressedDim(uint64_t d) const { - assert(d < getRank()); - return (dimTypes[d] == DimLevelType::kCompressed); - } - - /// Allocate a new enumerator. -#define DECL_NEWENUMERATOR(VNAME, V) \ - virtual void newEnumerator(SparseTensorEnumeratorBase **, uint64_t, \ - const uint64_t *) const { \ - FATAL_PIV("newEnumerator" #VNAME); \ - } - FOREVERY_V(DECL_NEWENUMERATOR) -#undef DECL_NEWENUMERATOR - - /// Overhead storage. -#define DECL_GETPOINTERS(PNAME, P) \ - virtual void getPointers(std::vector
<P>
**, uint64_t) { \ - FATAL_PIV("getPointers" #PNAME); \ - } - FOREVERY_FIXED_O(DECL_GETPOINTERS) -#undef DECL_GETPOINTERS -#define DECL_GETINDICES(INAME, I) \ - virtual void getIndices(std::vector **, uint64_t) { \ - FATAL_PIV("getIndices" #INAME); \ - } - FOREVERY_FIXED_O(DECL_GETINDICES) -#undef DECL_GETINDICES - - /// Primary storage. -#define DECL_GETVALUES(VNAME, V) \ - virtual void getValues(std::vector **) { FATAL_PIV("getValues" #VNAME); } - FOREVERY_V(DECL_GETVALUES) -#undef DECL_GETVALUES - - /// Element-wise insertion in lexicographic index order. -#define DECL_LEXINSERT(VNAME, V) \ - virtual void lexInsert(const uint64_t *, V) { FATAL_PIV("lexInsert" #VNAME); } - FOREVERY_V(DECL_LEXINSERT) -#undef DECL_LEXINSERT - - /// Expanded insertion. -#define DECL_EXPINSERT(VNAME, V) \ - virtual void expInsert(uint64_t *, V *, bool *, uint64_t *, uint64_t) { \ - FATAL_PIV("expInsert" #VNAME); \ - } - FOREVERY_V(DECL_EXPINSERT) -#undef DECL_EXPINSERT - - /// Finishes insertion. - virtual void endInsert() = 0; - -protected: - // Since this class is virtual, we must disallow public copying in - // order to avoid "slicing". Since this class has data members, - // that means making copying protected. - // - SparseTensorStorageBase(const SparseTensorStorageBase &) = default; - // Copy-assignment would be implicitly deleted (because `dimSizes` - // is const), so we explicitly delete it for clarity. - SparseTensorStorageBase &operator=(const SparseTensorStorageBase &) = delete; - -private: - const std::vector dimSizes; - std::vector rev; - const std::vector dimTypes; -}; - -#undef FATAL_PIV - -// Forward. -template -class SparseTensorEnumerator; - -/// A memory-resident sparse tensor using a storage scheme based on -/// per-dimension sparse/dense annotations. This data structure provides a -/// bufferized form of a sparse tensor type. In contrast to generating setup -/// methods for each differently annotated sparse tensor, this method provides -/// a convenient "one-size-fits-all" solution that simply takes an input tensor -/// and annotations to implement all required setup in a general manner. -template -class SparseTensorStorage final : public SparseTensorStorageBase { - /// Private constructor to share code between the other constructors. - /// Beware that the object is not necessarily guaranteed to be in a - /// valid state after this constructor alone; e.g., `isCompressedDim(d)` - /// doesn't entail `!(pointers[d].empty())`. - /// - /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. - SparseTensorStorage(const std::vector &dimSizes, - const uint64_t *perm, const DimLevelType *sparsity) - : SparseTensorStorageBase(dimSizes, perm, sparsity), pointers(getRank()), - indices(getRank()), idx(getRank()) {} - -public: - /// Constructs a sparse tensor storage scheme with the given dimensions, - /// permutation, and per-dimension dense/sparse annotations, using - /// the coordinate scheme tensor for the initial contents if provided. - /// - /// Precondition: `perm` and `sparsity` must be valid for `dimSizes.size()`. - SparseTensorStorage(const std::vector &dimSizes, - const uint64_t *perm, const DimLevelType *sparsity, - SparseTensorCOO *coo) - : SparseTensorStorage(dimSizes, perm, sparsity) { - // Provide hints on capacity of pointers and indices. 
- // TODO: needs much fine-tuning based on actual sparsity; currently - // we reserve pointer/index space based on all previous dense - // dimensions, which works well up to first sparse dim; but - // we should really use nnz and dense/sparse distribution. - bool allDense = true; - uint64_t sz = 1; - for (uint64_t r = 0, rank = getRank(); r < rank; r++) { - if (isCompressedDim(r)) { - // TODO: Take a parameter between 1 and `dimSizes[r]`, and multiply - // `sz` by that before reserving. (For now we just use 1.) - pointers[r].reserve(sz + 1); - pointers[r].push_back(0); - indices[r].reserve(sz); - sz = 1; - allDense = false; - } else { // Dense dimension. - sz = checkedMul(sz, getDimSizes()[r]); - } - } - // Then assign contents from coordinate scheme tensor if provided. - if (coo) { - // Ensure both preconditions of `fromCOO`. - assert(coo->getDimSizes() == getDimSizes() && "Tensor size mismatch"); - coo->sort(); - // Now actually insert the `elements`. - const std::vector> &elements = coo->getElements(); - uint64_t nnz = elements.size(); - values.reserve(nnz); - fromCOO(elements, 0, nnz, 0); - } else if (allDense) { - values.resize(sz, 0); - } - } - - /// Constructs a sparse tensor storage scheme with the given dimensions, - /// permutation, and per-dimension dense/sparse annotations, using - /// the given sparse tensor for the initial contents. - /// - /// Preconditions: - /// * `perm` and `sparsity` must be valid for `dimSizes.size()`. - /// * The `tensor` must have the same value type `V`. - SparseTensorStorage(const std::vector &dimSizes, - const uint64_t *perm, const DimLevelType *sparsity, - const SparseTensorStorageBase &tensor); - - ~SparseTensorStorage() final = default; - - /// Partially specialize these getter methods based on template types. - void getPointers(std::vector
<P>
**out, uint64_t d) final { - assert(d < getRank()); - *out = &pointers[d]; - } - void getIndices(std::vector **out, uint64_t d) final { - assert(d < getRank()); - *out = &indices[d]; - } - void getValues(std::vector **out) final { *out = &values; } - - /// Partially specialize lexicographical insertions based on template types. - void lexInsert(const uint64_t *cursor, V val) final { - // First, wrap up pending insertion path. - uint64_t diff = 0; - uint64_t top = 0; - if (!values.empty()) { - diff = lexDiff(cursor); - endPath(diff + 1); - top = idx[diff] + 1; - } - // Then continue with insertion path. - insPath(cursor, diff, top, val); - } - - /// Partially specialize expanded insertions based on template types. - /// Note that this method resets the values/filled-switch array back - /// to all-zero/false while only iterating over the nonzero elements. - void expInsert(uint64_t *cursor, V *values, bool *filled, uint64_t *added, - uint64_t count) final { - if (count == 0) - return; - // Sort. - std::sort(added, added + count); - // Restore insertion path for first insert. - const uint64_t lastDim = getRank() - 1; - uint64_t index = added[0]; - cursor[lastDim] = index; - lexInsert(cursor, values[index]); - assert(filled[index]); - values[index] = 0; - filled[index] = false; - // Subsequent insertions are quick. - for (uint64_t i = 1; i < count; i++) { - assert(index < added[i] && "non-lexicographic insertion"); - index = added[i]; - cursor[lastDim] = index; - insPath(cursor, lastDim, added[i - 1] + 1, values[index]); - assert(filled[index]); - values[index] = 0; - filled[index] = false; - } - } - - /// Finalizes lexicographic insertions. - void endInsert() final { - if (values.empty()) - finalizeSegment(0); - else - endPath(0); - } - - void newEnumerator(SparseTensorEnumeratorBase **out, uint64_t rank, - const uint64_t *perm) const final { - *out = new SparseTensorEnumerator(*this, rank, perm); - } - - /// Returns this sparse tensor storage scheme as a new memory-resident - /// sparse tensor in coordinate scheme with the given dimension order. - /// - /// Precondition: `perm` must be valid for `getRank()`. - SparseTensorCOO *toCOO(const uint64_t *perm) const { - SparseTensorEnumeratorBase *enumerator; - newEnumerator(&enumerator, getRank(), perm); - SparseTensorCOO *coo = - new SparseTensorCOO(enumerator->permutedSizes(), values.size()); - enumerator->forallElements([&coo](const std::vector &ind, V val) { - coo->add(ind, val); - }); - // TODO: This assertion assumes there are no stored zeros, - // or if there are then that we don't filter them out. - // Cf., - assert(coo->getElements().size() == values.size()); - delete enumerator; - return coo; - } - - /// Factory method. Constructs a sparse tensor storage scheme with the given - /// dimensions, permutation, and per-dimension dense/sparse annotations, - /// using the coordinate scheme tensor for the initial contents if provided. - /// In the latter case, the coordinate scheme must respect the same - /// permutation as is desired for the new sparse tensor storage. - /// - /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. 
- static SparseTensorStorage * - newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, - const DimLevelType *sparsity, SparseTensorCOO *coo) { - SparseTensorStorage *n = nullptr; - if (coo) { - const auto &coosz = coo->getDimSizes(); - assertPermutedSizesMatchShape(coosz, rank, perm, shape); - n = new SparseTensorStorage(coosz, perm, sparsity, coo); - } else { - std::vector permsz(rank); - for (uint64_t r = 0; r < rank; r++) { - assert(shape[r] > 0 && "Dimension size zero has trivial storage"); - permsz[perm[r]] = shape[r]; - } - // We pass the null `coo` to ensure we select the intended constructor. - n = new SparseTensorStorage(permsz, perm, sparsity, coo); - } - return n; - } - - /// Factory method. Constructs a sparse tensor storage scheme with - /// the given dimensions, permutation, and per-dimension dense/sparse - /// annotations, using the sparse tensor for the initial contents. - /// - /// Preconditions: - /// * `shape`, `perm`, and `sparsity` must be valid for `rank`. - /// * The `tensor` must have the same value type `V`. - static SparseTensorStorage * - newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, - const DimLevelType *sparsity, - const SparseTensorStorageBase *source) { - assert(source && "Got nullptr for source"); - SparseTensorEnumeratorBase *enumerator; - source->newEnumerator(&enumerator, rank, perm); - const auto &permsz = enumerator->permutedSizes(); - assertPermutedSizesMatchShape(permsz, rank, perm, shape); - auto *tensor = - new SparseTensorStorage(permsz, perm, sparsity, *source); - delete enumerator; - return tensor; - } - -private: - /// Appends an arbitrary new position to `pointers[d]`. This method - /// checks that `pos` is representable in the `P` type; however, it - /// does not check that `pos` is semantically valid (i.e., larger than - /// the previous position and smaller than `indices[d].capacity()`). - void appendPointer(uint64_t d, uint64_t pos, uint64_t count = 1) { - assert(isCompressedDim(d)); - assert(pos <= std::numeric_limits
<P>
::max() && - "Pointer value is too large for the P-type"); - pointers[d].insert(pointers[d].end(), count, static_cast
<P>
(pos)); - } - - /// Appends index `i` to dimension `d`, in the semantically general - /// sense. For non-dense dimensions, that means appending to the - /// `indices[d]` array, checking that `i` is representable in the `I` - /// type; however, we do not verify other semantic requirements (e.g., - /// that `i` is in bounds for `dimSizes[d]`, and not previously occurring - /// in the same segment). For dense dimensions, this method instead - /// appends the appropriate number of zeros to the `values` array, - /// where `full` is the number of "entries" already written to `values` - /// for this segment (aka one after the highest index previously appended). - void appendIndex(uint64_t d, uint64_t full, uint64_t i) { - if (isCompressedDim(d)) { - assert(i <= std::numeric_limits::max() && - "Index value is too large for the I-type"); - indices[d].push_back(static_cast(i)); - } else { // Dense dimension. - assert(i >= full && "Index was already filled"); - if (i == full) - return; // Short-circuit, since it'll be a nop. - if (d + 1 == getRank()) - values.insert(values.end(), i - full, 0); - else - finalizeSegment(d + 1, 0, i - full); - } - } - - /// Writes the given coordinate to `indices[d][pos]`. This method - /// checks that `i` is representable in the `I` type; however, it - /// does not check that `i` is semantically valid (i.e., in bounds - /// for `dimSizes[d]` and not elsewhere occurring in the same segment). - void writeIndex(uint64_t d, uint64_t pos, uint64_t i) { - assert(isCompressedDim(d)); - // Subscript assignment to `std::vector` requires that the `pos`-th - // entry has been initialized; thus we must be sure to check `size()` - // here, instead of `capacity()` as would be ideal. - assert(pos < indices[d].size() && "Index position is out of bounds"); - assert(i <= std::numeric_limits::max() && - "Index value is too large for the I-type"); - indices[d][pos] = static_cast(i); - } - - /// Computes the assembled-size associated with the `d`-th dimension, - /// given the assembled-size associated with the `(d-1)`-th dimension. - /// "Assembled-sizes" correspond to the (nominal) sizes of overhead - /// storage, as opposed to "dimension-sizes" which are the cardinality - /// of coordinates for that dimension. - /// - /// Precondition: the `pointers[d]` array must be fully initialized - /// before calling this method. - uint64_t assembledSize(uint64_t parentSz, uint64_t d) const { - if (isCompressedDim(d)) - return pointers[d][parentSz]; - // else if dense: - return parentSz * getDimSizes()[d]; - } - - /// Initializes sparse tensor storage scheme from a memory-resident sparse - /// tensor in coordinate scheme. This method prepares the pointers and - /// indices arrays under the given per-dimension dense/sparse annotations. - /// - /// Preconditions: - /// (1) the `elements` must be lexicographically sorted. - /// (2) the indices of every element are valid for `dimSizes` (equal rank - /// and pointwise less-than). - void fromCOO(const std::vector> &elements, uint64_t lo, - uint64_t hi, uint64_t d) { - uint64_t rank = getRank(); - assert(d <= rank && hi <= elements.size()); - // Once dimensions are exhausted, insert the numerical values. - if (d == rank) { - assert(lo < hi); - values.push_back(elements[lo].value); - return; - } - // Visit all elements in this interval. - uint64_t full = 0; - while (lo < hi) { // If `hi` is unchanged, then `lo < elements.size()`. - // Find segment in interval with same index elements in this dimension. 
- uint64_t i = elements[lo].indices[d]; - uint64_t seg = lo + 1; - while (seg < hi && elements[seg].indices[d] == i) - seg++; - // Handle segment in interval for sparse or dense dimension. - appendIndex(d, full, i); - full = i + 1; - fromCOO(elements, lo, seg, d + 1); - // And move on to next segment in interval. - lo = seg; - } - // Finalize the sparse pointer structure at this dimension. - finalizeSegment(d, full); - } - - /// Finalize the sparse pointer structure at this dimension. - void finalizeSegment(uint64_t d, uint64_t full = 0, uint64_t count = 1) { - if (count == 0) - return; // Short-circuit, since it'll be a nop. - if (isCompressedDim(d)) { - appendPointer(d, indices[d].size(), count); - } else { // Dense dimension. - const uint64_t sz = getDimSizes()[d]; - assert(sz >= full && "Segment is overfull"); - count = checkedMul(count, sz - full); - // For dense storage we must enumerate all the remaining coordinates - // in this dimension (i.e., coordinates after the last non-zero - // element), and either fill in their zero values or else recurse - // to finalize some deeper dimension. - if (d + 1 == getRank()) - values.insert(values.end(), count, 0); - else - finalizeSegment(d + 1, 0, count); - } - } - - /// Wraps up a single insertion path, inner to outer. - void endPath(uint64_t diff) { - uint64_t rank = getRank(); - assert(diff <= rank); - for (uint64_t i = 0; i < rank - diff; i++) { - const uint64_t d = rank - i - 1; - finalizeSegment(d, idx[d] + 1); - } - } - - /// Continues a single insertion path, outer to inner. - void insPath(const uint64_t *cursor, uint64_t diff, uint64_t top, V val) { - uint64_t rank = getRank(); - assert(diff < rank); - for (uint64_t d = diff; d < rank; d++) { - uint64_t i = cursor[d]; - appendIndex(d, top, i); - top = 0; - idx[d] = i; - } - values.push_back(val); - } - - /// Finds the lexicographic differing dimension. - uint64_t lexDiff(const uint64_t *cursor) const { - for (uint64_t r = 0, rank = getRank(); r < rank; r++) - if (cursor[r] > idx[r]) - return r; - else - assert(cursor[r] == idx[r] && "non-lexicographic insertion"); - assert(0 && "duplication insertion"); - return -1u; - } - - // Allow `SparseTensorEnumerator` to access the data-members (to avoid - // the cost of virtual-function dispatch in inner loops), without - // making them public to other client code. - friend class SparseTensorEnumerator; - - std::vector> pointers; - std::vector> indices; - std::vector values; - std::vector idx; // index cursor for lexicographic insertion. -}; - -/// A (higher-order) function object for enumerating the elements of some -/// `SparseTensorStorage` under a permutation. That is, the `forallElements` -/// method encapsulates the loop-nest for enumerating the elements of -/// the source tensor (in whatever order is best for the source tensor), -/// and applies a permutation to the coordinates/indices before handing -/// each element to the callback. A single enumerator object can be -/// freely reused for several calls to `forallElements`, just so long -/// as each call is sequential with respect to one another. -/// -/// N.B., this class stores a reference to the `SparseTensorStorageBase` -/// passed to the constructor; thus, objects of this class must not -/// outlive the sparse tensor they depend on. 
-/// -/// Design Note: The reason we define this class instead of simply using -/// `SparseTensorEnumerator` is because we need to hide/generalize -/// the `` template parameters from MLIR client code (to simplify the -/// type parameters used for direct sparse-to-sparse conversion). And the -/// reason we define the `SparseTensorEnumerator` subclasses rather -/// than simply using this class, is to avoid the cost of virtual-method -/// dispatch within the loop-nest. -template -class SparseTensorEnumeratorBase { -public: - /// Constructs an enumerator with the given permutation for mapping - /// the semantic-ordering of dimensions to the desired target-ordering. - /// - /// Preconditions: - /// * the `tensor` must have the same `V` value type. - /// * `perm` must be valid for `rank`. - SparseTensorEnumeratorBase(const SparseTensorStorageBase &tensor, - uint64_t rank, const uint64_t *perm) - : src(tensor), permsz(src.getRev().size()), reord(getRank()), - cursor(getRank()) { - assert(perm && "Received nullptr for permutation"); - assert(rank == getRank() && "Permutation rank mismatch"); - const auto &rev = src.getRev(); // source-order -> semantic-order - const auto &dimSizes = src.getDimSizes(); // in source storage-order - for (uint64_t s = 0; s < rank; s++) { // `s` source storage-order - uint64_t t = perm[rev[s]]; // `t` target-order - reord[s] = t; - permsz[t] = dimSizes[s]; - } - } - - virtual ~SparseTensorEnumeratorBase() = default; - - // We disallow copying to help avoid leaking the `src` reference. - // (In addition to avoiding the problem of slicing.) - SparseTensorEnumeratorBase(const SparseTensorEnumeratorBase &) = delete; - SparseTensorEnumeratorBase & - operator=(const SparseTensorEnumeratorBase &) = delete; - - /// Returns the source/target tensor's rank. (The source-rank and - /// target-rank are always equal since we only support permutations. - /// Though once we add support for other dimension mappings, this - /// method will have to be split in two.) - uint64_t getRank() const { return permsz.size(); } - - /// Returns the target tensor's dimension sizes. - const std::vector &permutedSizes() const { return permsz; } - - /// Enumerates all elements of the source tensor, permutes their - /// indices, and passes the permuted element to the callback. - /// The callback must not store the cursor reference directly, - /// since this function reuses the storage. Instead, the callback - /// must copy it if they want to keep it. - virtual void forallElements(ElementConsumer yield) = 0; - -protected: - const SparseTensorStorageBase &src; - std::vector permsz; // in target order. - std::vector reord; // source storage-order -> target order. - std::vector cursor; // in target order. -}; - -template -class SparseTensorEnumerator final : public SparseTensorEnumeratorBase { - using Base = SparseTensorEnumeratorBase; - -public: - /// Constructs an enumerator with the given permutation for mapping - /// the semantic-ordering of dimensions to the desired target-ordering. - /// - /// Precondition: `perm` must be valid for `rank`. - SparseTensorEnumerator(const SparseTensorStorage &tensor, - uint64_t rank, const uint64_t *perm) - : Base(tensor, rank, perm) {} - - ~SparseTensorEnumerator() final = default; - - void forallElements(ElementConsumer yield) final { - forallElements(yield, 0, 0); - } - -private: - /// The recursive component of the public `forallElements`. - void forallElements(ElementConsumer yield, uint64_t parentPos, - uint64_t d) { - // Recover the `` type parameters of `src`. 
- const auto &src = - static_cast &>(this->src); - if (d == Base::getRank()) { - assert(parentPos < src.values.size() && - "Value position is out of bounds"); - // TODO: - yield(this->cursor, src.values[parentPos]); - } else if (src.isCompressedDim(d)) { - // Look up the bounds of the `d`-level segment determined by the - // `d-1`-level position `parentPos`. - const std::vector
<P>
&pointersD = src.pointers[d]; - assert(parentPos + 1 < pointersD.size() && - "Parent pointer position is out of bounds"); - const uint64_t pstart = static_cast(pointersD[parentPos]); - const uint64_t pstop = static_cast(pointersD[parentPos + 1]); - // Loop-invariant code for looking up the `d`-level coordinates/indices. - const std::vector &indicesD = src.indices[d]; - assert(pstop <= indicesD.size() && "Index position is out of bounds"); - uint64_t &cursorReordD = this->cursor[this->reord[d]]; - for (uint64_t pos = pstart; pos < pstop; pos++) { - cursorReordD = static_cast(indicesD[pos]); - forallElements(yield, pos, d + 1); - } - } else { // Dense dimension. - const uint64_t sz = src.getDimSizes()[d]; - const uint64_t pstart = parentPos * sz; - uint64_t &cursorReordD = this->cursor[this->reord[d]]; - for (uint64_t i = 0; i < sz; i++) { - cursorReordD = i; - forallElements(yield, pstart + i, d + 1); - } - } - } -}; - -/// Statistics regarding the number of nonzero subtensors in -/// a source tensor, for direct sparse=>sparse conversion a la -/// . -/// -/// N.B., this class stores references to the parameters passed to -/// the constructor; thus, objects of this class must not outlive -/// those parameters. -class SparseTensorNNZ final { -public: - /// Allocate the statistics structure for the desired sizes and - /// sparsity (in the target tensor's storage-order). This constructor - /// does not actually populate the statistics, however; for that see - /// `initialize`. - /// - /// Precondition: `dimSizes` must not contain zeros. - SparseTensorNNZ(const std::vector &dimSizes, - const std::vector &sparsity) - : dimSizes(dimSizes), dimTypes(sparsity), nnz(getRank()) { - assert(dimSizes.size() == dimTypes.size() && "Rank mismatch"); - bool uncompressed = true; - (void)uncompressed; - uint64_t sz = 1; // the product of all `dimSizes` strictly less than `r`. - for (uint64_t rank = getRank(), r = 0; r < rank; r++) { - switch (dimTypes[r]) { - case DimLevelType::kCompressed: - assert(uncompressed && - "Multiple compressed layers not currently supported"); - uncompressed = false; - nnz[r].resize(sz, 0); // Both allocate and zero-initialize. - break; - case DimLevelType::kDense: - assert(uncompressed && - "Dense after compressed not currently supported"); - break; - case DimLevelType::kSingleton: - // Singleton after Compressed causes no problems for allocating - // `nnz` nor for the yieldPos loop. This remains true even - // when adding support for multiple compressed dimensions or - // for dense-after-compressed. - break; - default: - FATAL("unsupported dimension level type"); - } - sz = checkedMul(sz, dimSizes[r]); - - } - } - - // We disallow copying to help avoid leaking the stored references. - SparseTensorNNZ(const SparseTensorNNZ &) = delete; - SparseTensorNNZ &operator=(const SparseTensorNNZ &) = delete; - - /// Returns the rank of the target tensor. - uint64_t getRank() const { return dimSizes.size(); } - - /// Enumerate the source tensor to fill in the statistics. The - /// enumerator should already incorporate the permutation (from - /// semantic-order to the target storage-order). - template - void initialize(SparseTensorEnumeratorBase &enumerator) { - assert(enumerator.getRank() == getRank() && "Tensor rank mismatch"); - assert(enumerator.permutedSizes() == dimSizes && "Tensor size mismatch"); - enumerator.forallElements( - [this](const std::vector &ind, V) { add(ind); }); - } - - /// The type of callback functions which receive an nnz-statistic. 
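// The callback type declared next is consumed as a running prefix sum:
// the storage constructor adds each per-prefix count onto the previous
// total and records that total as the next "pointers" entry of the
// compressed dimension.  A minimal sketch of that pattern, using a
// hypothetical helper over counts that have already been gathered:
//
//   static std::vector<uint64_t>
//   countsToPointers(const std::vector<uint64_t> &counts) {
//     std::vector<uint64_t> pointers;
//     pointers.reserve(counts.size() + 1);
//     pointers.push_back(0); // segment 0 starts at position 0
//     uint64_t pos = 0;
//     for (uint64_t n : counts) {
//       pos += n; // same accumulation the forallIndices callback performs
//       pointers.push_back(pos);
//     }
//     return pointers;
//   }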
- using NNZConsumer = const std::function &; - - /// Lexicographically enumerates all indicies for dimensions strictly - /// less than `stopDim`, and passes their nnz statistic to the callback. - /// Since our use-case only requires the statistic not the coordinates - /// themselves, we do not bother to construct those coordinates. - void forallIndices(uint64_t stopDim, NNZConsumer yield) const { - assert(stopDim < getRank() && "Stopping-dimension is out of bounds"); - assert(dimTypes[stopDim] == DimLevelType::kCompressed && - "Cannot look up non-compressed dimensions"); - forallIndices(yield, stopDim, 0, 0); - } - -private: - /// Adds a new element (i.e., increment its statistics). We use - /// a method rather than inlining into the lambda in `initialize`, - /// to avoid spurious templating over `V`. And this method is private - /// to avoid needing to re-assert validity of `ind` (which is guaranteed - /// by `forallElements`). - void add(const std::vector &ind) { - uint64_t parentPos = 0; - for (uint64_t rank = getRank(), r = 0; r < rank; r++) { - if (dimTypes[r] == DimLevelType::kCompressed) - nnz[r][parentPos]++; - parentPos = parentPos * dimSizes[r] + ind[r]; - } - } - - /// Recursive component of the public `forallIndices`. - void forallIndices(NNZConsumer yield, uint64_t stopDim, uint64_t parentPos, - uint64_t d) const { - assert(d <= stopDim); - if (d == stopDim) { - assert(parentPos < nnz[d].size() && "Cursor is out of range"); - yield(nnz[d][parentPos]); - } else { - const uint64_t sz = dimSizes[d]; - const uint64_t pstart = parentPos * sz; - for (uint64_t i = 0; i < sz; i++) - forallIndices(yield, stopDim, pstart + i, d + 1); - } - } - - // All of these are in the target storage-order. - const std::vector &dimSizes; - const std::vector &dimTypes; - std::vector> nnz; -}; - -template -SparseTensorStorage::SparseTensorStorage( - const std::vector &dimSizes, const uint64_t *perm, - const DimLevelType *sparsity, const SparseTensorStorageBase &tensor) - : SparseTensorStorage(dimSizes, perm, sparsity) { - SparseTensorEnumeratorBase *enumerator; - tensor.newEnumerator(&enumerator, getRank(), perm); - { - // Initialize the statistics structure. - SparseTensorNNZ nnz(getDimSizes(), getDimTypes()); - nnz.initialize(*enumerator); - // Initialize "pointers" overhead (and allocate "indices", "values"). - uint64_t parentSz = 1; // assembled-size (not dimension-size) of `r-1`. - for (uint64_t rank = getRank(), r = 0; r < rank; r++) { - if (isCompressedDim(r)) { - pointers[r].reserve(parentSz + 1); - pointers[r].push_back(0); - uint64_t currentPos = 0; - nnz.forallIndices(r, [this, ¤tPos, r](uint64_t n) { - currentPos += n; - appendPointer(r, currentPos); - }); - assert(pointers[r].size() == parentSz + 1 && - "Final pointers size doesn't match allocated size"); - // That assertion entails `assembledSize(parentSz, r)` - // is now in a valid state. That is, `pointers[r][parentSz]` - // equals the present value of `currentPos`, which is the - // correct assembled-size for `indices[r]`. - } - // Update assembled-size for the next iteration. - parentSz = assembledSize(parentSz, r); - // Ideally we need only `indices[r].reserve(parentSz)`, however - // the `std::vector` implementation forces us to initialize it too. - // That is, in the yieldPos loop we need random-access assignment - // to `indices[r]`; however, `std::vector`'s subscript-assignment - // only allows assigning to already-initialized positions. 
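// (Concretely: after `std::vector<I> v; v.reserve(n);` the expression
// `v[0] = x` is undefined behavior because no elements have been
// constructed, whereas after `v.resize(n)` all n positions are
// value-initialized and can be assigned through operator[].)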
- if (isCompressedDim(r)) - indices[r].resize(parentSz, 0); - } - values.resize(parentSz, 0); // Both allocate and zero-initialize. - } - // The yieldPos loop - enumerator->forallElements([this](const std::vector &ind, V val) { - uint64_t parentSz = 1, parentPos = 0; - for (uint64_t rank = getRank(), r = 0; r < rank; r++) { - if (isCompressedDim(r)) { - // If `parentPos == parentSz` then it's valid as an array-lookup; - // however, it's semantically invalid here since that entry - // does not represent a segment of `indices[r]`. Moreover, that - // entry must be immutable for `assembledSize` to remain valid. - assert(parentPos < parentSz && "Pointers position is out of bounds"); - const uint64_t currentPos = pointers[r][parentPos]; - // This increment won't overflow the `P` type, since it can't - // exceed the original value of `pointers[r][parentPos+1]` - // which was already verified to be within bounds for `P` - // when it was written to the array. - pointers[r][parentPos]++; - writeIndex(r, currentPos, ind[r]); - parentPos = currentPos; - } else { // Dense dimension. - parentPos = parentPos * getDimSizes()[r] + ind[r]; - } - parentSz = assembledSize(parentSz, r); - } - assert(parentPos < values.size() && "Value position is out of bounds"); - values[parentPos] = val; - }); - // No longer need the enumerator, so we'll delete it ASAP. - delete enumerator; - // The finalizeYieldPos loop - for (uint64_t parentSz = 1, rank = getRank(), r = 0; r < rank; r++) { - if (isCompressedDim(r)) { - assert(parentSz == pointers[r].size() - 1 && - "Actual pointers size doesn't match the expected size"); - // Can't check all of them, but at least we can check the last one. - assert(pointers[r][parentSz - 1] == pointers[r][parentSz] && - "Pointers got corrupted"); - // TODO: optimize this by using `memmove` or similar. - for (uint64_t n = 0; n < parentSz; n++) { - const uint64_t parentPos = parentSz - n; - pointers[r][parentPos] = pointers[r][parentPos - 1]; - } - pointers[r][0] = 0; - } - parentSz = assembledSize(parentSz, r); - } -} - -/// Helper to convert string to lower case. -static char *toLower(char *token) { - for (char *c = token; *c; c++) - *c = tolower(*c); - return token; -} - -/// This class abstracts over the information stored in file headers, -/// as well as providing the buffers and methods for parsing those headers. -class SparseTensorFile final { -public: - enum class ValueKind { - kInvalid = 0, - kPattern = 1, - kReal = 2, - kInteger = 3, - kComplex = 4, - kUndefined = 5 - }; - - explicit SparseTensorFile(char *filename) : filename(filename) { - assert(filename && "Received nullptr for filename"); - } - - // Disallows copying, to avoid duplicating the `file` pointer. - SparseTensorFile(const SparseTensorFile &) = delete; - SparseTensorFile &operator=(const SparseTensorFile &) = delete; - - // This dtor tries to avoid leaking the `file`. (Though it's better - // to call `closeFile` explicitly when possible, since there are - // circumstances where dtors are not called reliably.) - ~SparseTensorFile() { closeFile(); } - - /// Opens the file for reading. - void openFile() { - if (file) - FATAL("Already opened file %s\n", filename); - file = fopen(filename, "r"); - if (!file) - FATAL("Cannot find file %s\n", filename); - } - - /// Closes the file. 
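// A minimal usage sketch for this reader, with a hypothetical caller
// that only needs the header metadata (fprintf/PRIu64 are available
// through the headers this file already requires):
//
//   static void printHeaderInfo(char *filename) {
//     SparseTensorFile stfile(filename);
//     stfile.openFile();
//     stfile.readHeader();
//     fprintf(stderr, "rank = %" PRIu64 ", nnz = %" PRIu64 "\n",
//             stfile.getRank(), stfile.getNNZ());
//     stfile.closeFile(); // the destructor would otherwise close it
//   }
//
// The closeFile() member defined next is what both that explicit call
// and the destructor above rely on.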
- void closeFile() { - if (file) { - fclose(file); - file = nullptr; - } - } - - // TODO(wrengr/bixia): figure out how to reorganize the element-parsing - // loop of `openSparseTensorCOO` into methods of this class, so we can - // avoid leaking access to the `line` pointer (both for general hygiene - // and because we can't mark it const due to the second argument of - // `strtoul`/`strtoud` being `char * *restrict` rather than - // `char const* *restrict`). - // - /// Attempts to read a line from the file. - char *readLine() { - if (fgets(line, kColWidth, file)) - return line; - FATAL("Cannot read next line of %s\n", filename); - } - - /// Reads and parses the file's header. - void readHeader() { - assert(file && "Attempt to readHeader() before openFile()"); - if (strstr(filename, ".mtx")) - readMMEHeader(); - else if (strstr(filename, ".tns")) - readExtFROSTTHeader(); - else - FATAL("Unknown format %s\n", filename); - assert(isValid() && "Failed to read the header"); - } - - ValueKind getValueKind() const { return valueKind_; } - - bool isValid() const { return valueKind_ != ValueKind::kInvalid; } - - /// Gets the MME "pattern" property setting. Is only valid after - /// parsing the header. - bool isPattern() const { - assert(isValid() && "Attempt to isPattern() before readHeader()"); - return valueKind_ == ValueKind::kPattern; - } - - /// Gets the MME "symmetric" property setting. Is only valid after - /// parsing the header. - bool isSymmetric() const { - assert(isValid() && "Attempt to isSymmetric() before readHeader()"); - return isSymmetric_; - } - - /// Gets the rank of the tensor. Is only valid after parsing the header. - uint64_t getRank() const { - assert(isValid() && "Attempt to getRank() before readHeader()"); - return idata[0]; - } - - /// Gets the number of non-zeros. Is only valid after parsing the header. - uint64_t getNNZ() const { - assert(isValid() && "Attempt to getNNZ() before readHeader()"); - return idata[1]; - } - - /// Gets the dimension-sizes array. The pointer itself is always - /// valid; however, the values stored therein are only valid after - /// parsing the header. - const uint64_t *getDimSizes() const { return idata + 2; } - - /// Safely gets the size of the given dimension. Is only valid - /// after parsing the header. - uint64_t getDimSize(uint64_t d) const { - assert(d < getRank()); - return idata[2 + d]; - } - - /// Asserts the shape subsumes the actual dimension sizes. Is only - /// valid after parsing the header. - void assertMatchesShape(uint64_t rank, const uint64_t *shape) const { - assert(rank == getRank() && "Rank mismatch"); - for (uint64_t r = 0; r < rank; r++) - assert((shape[r] == 0 || shape[r] == idata[2 + r]) && - "Dimension size mismatch"); - } - -private: - void readMMEHeader(); - void readExtFROSTTHeader(); - - const char *filename; - FILE *file = nullptr; - ValueKind valueKind_ = ValueKind::kInvalid; - bool isSymmetric_ = false; - uint64_t idata[512]; - char line[kColWidth]; -}; - -/// Read the MME header of a general sparse matrix of type real. -void SparseTensorFile::readMMEHeader() { - char header[64]; - char object[64]; - char format[64]; - char field[64]; - char symmetry[64]; - // Read header line. - if (fscanf(file, "%63s %63s %63s %63s %63s\n", header, object, format, field, - symmetry) != 5) - FATAL("Corrupt header in %s\n", filename); - // Process `field`, which specify pattern or the data type of the values. 
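// For reference, a typical banner accepted by the checks further below is
//   %%MatrixMarket matrix coordinate real general
// i.e. header, object, format, field, and symmetry tokens; the `field`
// token handled next selects the ValueKind, and `symmetry` may instead
// be "symmetric".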
- if (strcmp(toLower(field), "pattern") == 0) - valueKind_ = ValueKind::kPattern; - else if (strcmp(toLower(field), "real") == 0) - valueKind_ = ValueKind::kReal; - else if (strcmp(toLower(field), "integer") == 0) - valueKind_ = ValueKind::kInteger; - else if (strcmp(toLower(field), "complex") == 0) - valueKind_ = ValueKind::kComplex; - else - FATAL("Unexpected header field value in %s\n", filename); - - // Set properties. - isSymmetric_ = (strcmp(toLower(symmetry), "symmetric") == 0); - // Make sure this is a general sparse matrix. - if (strcmp(toLower(header), "%%matrixmarket") || - strcmp(toLower(object), "matrix") || - strcmp(toLower(format), "coordinate") || - (strcmp(toLower(symmetry), "general") && !isSymmetric_)) - FATAL("Cannot find a general sparse matrix in %s\n", filename); - // Skip comments. - while (true) { - readLine(); - if (line[0] != '%') - break; - } - // Next line contains M N NNZ. - idata[0] = 2; // rank - if (sscanf(line, "%" PRIu64 "%" PRIu64 "%" PRIu64 "\n", idata + 2, idata + 3, - idata + 1) != 3) - FATAL("Cannot find size in %s\n", filename); -} - -/// Read the "extended" FROSTT header. Although not part of the documented -/// format, we assume that the file starts with optional comments followed -/// by two lines that define the rank, the number of nonzeros, and the -/// dimensions sizes (one per rank) of the sparse tensor. -void SparseTensorFile::readExtFROSTTHeader() { - // Skip comments. - while (true) { - readLine(); - if (line[0] != '#') - break; - } - // Next line contains RANK and NNZ. - if (sscanf(line, "%" PRIu64 "%" PRIu64 "\n", idata, idata + 1) != 2) - FATAL("Cannot find metadata in %s\n", filename); - // Followed by a line with the dimension sizes (one per rank). - for (uint64_t r = 0; r < idata[0]; r++) - if (fscanf(file, "%" PRIu64, idata + 2 + r) != 1) - FATAL("Cannot find dimension size %s\n", filename); - readLine(); // end of line - // The FROSTT format does not define the data type of the nonzero elements. - valueKind_ = ValueKind::kUndefined; -} - -// Adds a value to a tensor in coordinate scheme. If is_symmetric_value is true, -// also adds the value to its symmetric location. -template -static inline void addValue(T *coo, V value, - const std::vector indices, - bool is_symmetric_value) { - // TODO: - coo->add(indices, value); - // We currently chose to deal with symmetric matrices by fully constructing - // them. In the future, we may want to make symmetry implicit for storage - // reasons. - if (is_symmetric_value) - coo->add({indices[1], indices[0]}, value); -} - -// Reads an element of a complex type for the current indices in coordinate -// scheme. -template -static inline void readCOOValue(SparseTensorCOO> *coo, - const std::vector indices, - char **linePtr, bool is_pattern, - bool add_symmetric_value) { - // Read two values to make a complex. The external formats always store - // numerical values with the type double, but we cast these values to the - // sparse tensor object type. For a pattern tensor, we arbitrarily pick the - // value 1 for all entries. - V re = is_pattern ? 1.0 : strtod(*linePtr, linePtr); - V im = is_pattern ? 1.0 : strtod(*linePtr, linePtr); - std::complex value = {re, im}; - addValue(coo, value, indices, add_symmetric_value); -} - -// Reads an element of a non-complex type for the current indices in coordinate -// scheme. 
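// Both overloads follow the same per-line parsing convention: the indices
// are consumed with strtoul (in openSparseTensorCOO below) and the value
// with strtod (here), each call advancing the same cursor.  A standalone
// sketch for one rank-2 line such as "3 7 1.5\n" (hypothetical helper,
// 1-based indices as in the external formats):
//
//   static double parseRank2Line(char *linePtr, uint64_t &i, uint64_t &j) {
//     i = strtoul(linePtr, &linePtr, 10) - 1; // convert to 0-based
//     j = strtoul(linePtr, &linePtr, 10) - 1;
//     return strtod(linePtr, &linePtr);
//   }
//
// The non-complex overload below applies the same idea for the value
// part only, with SFINAE excluding complex element types.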
-template , V>::value && - !std::is_same, V>::value>::type * = nullptr> -static void inline readCOOValue(SparseTensorCOO *coo, - const std::vector indices, - char **linePtr, bool is_pattern, - bool is_symmetric_value) { - // The external formats always store these numerical values with the type - // double, but we cast these values to the sparse tensor object type. - // For a pattern tensor, we arbitrarily pick the value 1 for all entries. - double value = is_pattern ? 1.0 : strtod(*linePtr, linePtr); - addValue(coo, value, indices, is_symmetric_value); -} - -/// Reads a sparse tensor with the given filename into a memory-resident -/// sparse tensor in coordinate scheme. -template -static SparseTensorCOO * -openSparseTensorCOO(char *filename, uint64_t rank, const uint64_t *shape, - const uint64_t *perm, PrimaryType valTp) { - SparseTensorFile stfile(filename); - stfile.openFile(); - stfile.readHeader(); - // Check tensor element type against the value type in the input file. - SparseTensorFile::ValueKind valueKind = stfile.getValueKind(); - bool tensorIsInteger = - (valTp >= PrimaryType::kI64 && valTp <= PrimaryType::kI8); - bool tensorIsReal = (valTp >= PrimaryType::kF64 && valTp <= PrimaryType::kI8); - if ((valueKind == SparseTensorFile::ValueKind::kReal && tensorIsInteger) || - (valueKind == SparseTensorFile::ValueKind::kComplex && tensorIsReal)) { - FATAL("Tensor element type %d not compatible with values in file %s\n", - static_cast(valTp), filename); - } - stfile.assertMatchesShape(rank, shape); - // Prepare sparse tensor object with per-dimension sizes - // and the number of nonzeros as initial capacity. - uint64_t nnz = stfile.getNNZ(); - auto *coo = SparseTensorCOO::newSparseTensorCOO(rank, stfile.getDimSizes(), - perm, nnz); - // Read all nonzero elements. - std::vector indices(rank); - for (uint64_t k = 0; k < nnz; k++) { - char *linePtr = stfile.readLine(); - for (uint64_t r = 0; r < rank; r++) { - uint64_t idx = strtoul(linePtr, &linePtr, 10); - // Add 0-based index. - indices[perm[r]] = idx - 1; - } - readCOOValue(coo, indices, &linePtr, stfile.isPattern(), - stfile.isSymmetric() && indices[0] != indices[1]); - } - // Close the file and return tensor. - stfile.closeFile(); - return coo; -} - -/// Writes the sparse tensor to `dest` in extended FROSTT format. -template -static void outSparseTensor(void *tensor, void *dest, bool sort) { - assert(tensor && dest); - auto coo = static_cast *>(tensor); - if (sort) - coo->sort(); - char *filename = static_cast(dest); - auto &dimSizes = coo->getDimSizes(); - auto &elements = coo->getElements(); - uint64_t rank = coo->getRank(); - uint64_t nnz = elements.size(); - std::fstream file; - file.open(filename, std::ios_base::out | std::ios_base::trunc); - assert(file.is_open()); - file << "; extended FROSTT format\n" << rank << " " << nnz << std::endl; - for (uint64_t r = 0; r < rank - 1; r++) - file << dimSizes[r] << " "; - file << dimSizes[rank - 1] << std::endl; - for (uint64_t i = 0; i < nnz; i++) { - auto &idx = elements[i].indices; - for (uint64_t r = 0; r < rank; r++) - file << (idx[r] + 1) << " "; - file << elements[i].value << std::endl; - } - file.flush(); - file.close(); - assert(file.good()); -} - /// Initializes sparse tensor from an external COO-flavored format. 
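// In the hunk that follows, the only change to this initializer is the
// renamed fatal-error macro; the sanity check it keeps, that `perm` is a
// genuine permutation of 0..rank-1, is equivalent to this standalone
// predicate (hypothetical helper):
//
//   static bool isPermutation(const uint64_t *perm, uint64_t rank) {
//     std::vector<uint64_t> order(perm, perm + rank);
//     std::sort(order.begin(), order.end());
//     for (uint64_t i = 0; i < rank; ++i)
//       if (order[i] != i)
//         return false;
//     return true;
//   }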
template static SparseTensorStorage * @@ -1437,13 +86,14 @@ std::sort(order.begin(), order.end()); for (uint64_t i = 0; i < rank; ++i) if (i != order[i]) - FATAL("Not a permutation of 0..%" PRIu64 "\n", rank); + MLIR_SPARSETENSOR_FATAL("Not a permutation of 0..%" PRIu64 "\n", rank); // Verify that the sparsity values are supported. for (uint64_t i = 0; i < rank; ++i) if (sparsity[i] != DimLevelType::kDense && sparsity[i] != DimLevelType::kCompressed) - FATAL("Unsupported sparsity value %d\n", static_cast(sparsity[i])); + MLIR_SPARSETENSOR_FATAL("Unsupported sparsity value %d\n", + static_cast(sparsity[i])); #endif // Convert external format to internal COO. @@ -1677,9 +327,10 @@ // Unsupported case (add above if needed). // TODO: better pretty-printing of enum values! - FATAL("unsupported combination of types: \n", - static_cast(ptrTp), static_cast(indTp), - static_cast(valTp)); + MLIR_SPARSETENSOR_FATAL( + "unsupported combination of types: \n", + static_cast(ptrTp), static_cast(indTp), + static_cast(valTp)); } #undef CASE #undef CASE_SECSAME @@ -1834,7 +485,7 @@ sprintf(var, "TENSOR%" PRIu64, id); char *env = getenv(var); if (!env) - FATAL("Environment variable %s is not set\n", var); + MLIR_SPARSETENSOR_FATAL("Environment variable %s is not set\n", var); return env; } diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -2064,7 +2064,6 @@ hdrs = [ "include/mlir/Dialect/SparseTensor/Transforms/BufferizableOpInterfaceImpl.h", "include/mlir/Dialect/SparseTensor/Transforms/Passes.h", - "include/mlir/ExecutionEngine/SparseTensorUtils.h", ], includes = ["include"], deps = [ @@ -2091,7 +2090,7 @@ ":TensorDialect", ":Transforms", ":VectorDialect", - ":mlir_c_runner_utils", + ":mlir_sparse_tensor_utils", "//llvm:Support", ], ) @@ -6547,20 +6546,71 @@ deps = [":mlir_async_runtime"], ) +cc_library( + name = "_mlir_float16_utils", + srcs = ["lib/ExecutionEngine/Float16bits.cpp"], + hdrs = ["include/mlir/ExecutionEngine/Float16bits.h"], + copts = ["-Dmlir_float16_utils_EXPORTS"], + includes = ["include"], +) + +# Indirection to avoid 'libmlir_float16_utils.so' filename clash. +alias( + name = "mlir_float16_utils", + actual = "_mlir_float16_utils", +) + +cc_binary( + name = "libmlir_float16_utils.so", + linkshared = True, + linkstatic = False, + deps = [":mlir_float16_utils"], +) + +# Unlike mlir_float16_utils, mlir_c_runner_utils, etc, we do *not* make +# this a shared library: because on the CMake side, doing so causes +# issues when building on Windows. +# +# We relist Float16bits.h because Enums.h includes it; rather than +# forcing all direct-dependants state that they also directly-depend +# on :mlir_float16_utils (to satisfy the layering_check). 
+cc_library( + name = "mlir_sparse_tensor_utils", + srcs = [ + "lib/ExecutionEngine/SparseTensor/File.cpp", + "lib/ExecutionEngine/SparseTensor/NNZ.cpp", + "lib/ExecutionEngine/SparseTensor/Storage.cpp", + ], + hdrs = [ + "include/mlir/ExecutionEngine/Float16bits.h", + "include/mlir/ExecutionEngine/SparseTensor/COO.h", + "include/mlir/ExecutionEngine/SparseTensor/CheckedMul.h", + "include/mlir/ExecutionEngine/SparseTensor/Enums.h", + "include/mlir/ExecutionEngine/SparseTensor/ErrorHandling.h", + "include/mlir/ExecutionEngine/SparseTensor/File.h", + "include/mlir/ExecutionEngine/SparseTensor/Storage.h", + ], + copts = ["-Dmlir_sparse_tensor_utils_EXPORTS"], + includes = ["include"], + deps = [":mlir_float16_utils"], +) + +# We relist Enums.h because SparseTensorUtils.h includes/reexports it +# as part of the public API. cc_library( name = "_mlir_c_runner_utils", srcs = [ "lib/ExecutionEngine/CRunnerUtils.cpp", - "lib/ExecutionEngine/Float16bits.cpp", "lib/ExecutionEngine/SparseTensorUtils.cpp", ], hdrs = [ "include/mlir/ExecutionEngine/CRunnerUtils.h", - "include/mlir/ExecutionEngine/Float16bits.h", "include/mlir/ExecutionEngine/Msan.h", + "include/mlir/ExecutionEngine/SparseTensor/Enums.h", "include/mlir/ExecutionEngine/SparseTensorUtils.h", ], includes = ["include"], + deps = [":mlir_sparse_tensor_utils"], ) # Indirection to avoid 'libmlir_c_runner_utils.so' filename clash.