diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
@@ -27,6 +27,7 @@
 #include
 #include
 #include
+#include <functional>
 #include
 #include
 #include
@@ -187,6 +188,7 @@
                           const uint64_t *perm, const DimLevelType *sparsity)
       : dimSizes(szs), rev(getRank()),
         dimTypes(sparsity, sparsity + getRank()) {
+    assert(perm && sparsity);
     const uint64_t rank = getRank();
     // Validate parameters.
     assert(rank > 0 && "Trivial shape is unsupported");
@@ -290,6 +292,135 @@
   const std::vector<DimLevelType> dimTypes;
 };
 
+/// This class provides the interface required by `SparseTensorEnumerator`,
+/// and is a superclass of `SparseTensorStorage` for abstracting
+/// over the `<P,I>` templating. We need that abstraction so that direct
+/// sparse-to-sparse conversion need not take separate `<P,I>` parameters
+/// for the source vs the target (nor require that the source parameters
+/// match the target parameters).
+template <typename V>
+class EnumerableSparseTensorStorage : public SparseTensorStorageBase {
+public:
+  EnumerableSparseTensorStorage(const std::vector<uint64_t> &szs,
+                                const uint64_t *perm,
+                                const DimLevelType *sparsity)
+      : SparseTensorStorageBase(szs, perm, sparsity) {}
+
+  /// Looks up the value stored at the given position.
+  virtual V getValue(uint64_t pos) const = 0;
+
+  /// Looks up the `d`-level position/pointer stored at the given
+  /// `d-1`-level position, and converts the stored `P` into `uint64_t`.
+  virtual uint64_t getPointer(uint64_t d, uint64_t parentPos) const = 0;
+
+  /// Looks up the `d`-level coordinate/index stored at the given
+  /// `d`-level position, and converts the stored `I` into `uint64_t`.
+  virtual uint64_t getIndex(uint64_t d, uint64_t pos) const = 0;
+};
+
+/// A (higher-order) function object for enumerating the elements of some
+/// `SparseTensorStorage` under a permutation. That is, the `forallElements`
+/// method encapsulates the loop-nest for enumerating the elements of
+/// the source tensor (in whatever order is best for the source tensor),
+/// and applies a permutation to the coordinates/indices before handing
+/// each element to the callback. A single enumerator object can be
+/// freely reused for several calls to `forallElements`, just so long
+/// as each call is sequential with respect to one another.
+///
+/// N.B., this class stores a reference to the `EnumerableSparseTensorStorage`
+/// passed to the constructor; thus, objects of this class must not
+/// outlive the sparse tensor they depend on.
+template <typename V>
+class SparseTensorEnumerator {
+public:
+  /// Constructs an enumerator with the identity permutation, thus
+  /// enumerating elements with the semantic-ordering of dimensions.
+  explicit SparseTensorEnumerator(
+      const EnumerableSparseTensorStorage<V> &tensor)
+      : src(tensor), permsz(src.getRev().size()), reord(src.getRev()),
+        cursor(getRank()) {
+    const auto &sizes = src.getDimSizes();
+    for (uint64_t rank = getRank(), s = 0; s < rank; s++)
+      permsz[reord[s]] = sizes[s];
+  }
+
+  /// Constructs an enumerator with the given permutation for mapping
+  /// the semantic-ordering of dimensions to the desired target-ordering.
+  ///
+  /// Precondition: `perm` must be valid for `rank`.
+  SparseTensorEnumerator(const EnumerableSparseTensorStorage<V> &tensor,
+                         uint64_t rank, const uint64_t *perm)
+      : src(tensor), permsz(src.getRev().size()), reord(getRank()),
+        cursor(getRank()) {
+    assert(perm);
+    assert(rank == getRank() && "Permutation rank mismatch");
+    const auto &rev = src.getRev();        // source stg-order -> semantic-order
+    const auto &sizes = src.getDimSizes(); // in source storage-order
+    for (uint64_t s = 0; s < rank; s++) {  // `s` source storage-order
+      uint64_t t = perm[rev[s]];           // `t` target-order
+      reord[s] = t;
+      permsz[t] = sizes[s];
+    }
+  }
+
+  SparseTensorEnumerator(const SparseTensorEnumerator &) = delete;
+  SparseTensorEnumerator(SparseTensorEnumerator &&) = delete;
+  SparseTensorEnumerator &operator=(const SparseTensorEnumerator &) = delete;
+  SparseTensorEnumerator &operator=(SparseTensorEnumerator &&) = delete;
+
+  /// Returns the source/target tensor's rank. (The source-rank and
+  /// target-rank are always equal since we only support permutations.
+  /// Though once we add support for other dimension mappings, this
+  /// method will have to be split in two.)
+  inline uint64_t getRank() const { return permsz.size(); }
+
+  /// Returns the target tensor's dimension sizes.
+  inline const std::vector<uint64_t> &permutedSizes() const { return permsz; }
+
+  /// The type of callback functions which receive an element (in target
+  /// order). We avoid packaging the coordinates and value together
+  /// as an `Element` object because this helps keep code somewhat cleaner.
+  typedef const std::function<void(const std::vector<uint64_t> &, V)>
+      &ElementConsumer;
+
+  /// Enumerates all elements of the source tensor, permutes their
+  /// indices, and passes the permuted element to the callback.
+  /// The callback must not store the cursor reference directly, since
+  /// this function reuses the storage. Instead, the callback must copy
+  /// it if they want to keep it.
+  inline void forallElements(ElementConsumer yield) {
+    forallElements(yield, 0, 0);
+  }
+
+private:
+  /// The recursive component of the public `forallElements`.
+  void forallElements(ElementConsumer yield, uint64_t parentPos, uint64_t d) {
+    if (d == getRank()) {
+      // TODO:
+      yield(cursor, src.getValue(parentPos));
+    } else if (src.isCompressedDim(d)) {
+      const uint64_t pstart = src.getPointer(d, parentPos);
+      const uint64_t pstop = src.getPointer(d, parentPos + 1);
+      for (uint64_t pos = pstart; pos < pstop; pos++) {
+        cursor[reord[d]] = src.getIndex(d, pos);
+        forallElements(yield, pos, d + 1);
+      }
+    } else { // Dense dimension.
+      const uint64_t sz = src.getDimSizes()[d];
+      const uint64_t pstart = parentPos * sz;
+      for (uint64_t i = 0; i < sz; i++) {
+        cursor[reord[d]] = i;
+        forallElements(yield, pstart + i, d + 1);
+      }
+    }
+  }
+
+  const EnumerableSparseTensorStorage<V> &src;
+  std::vector<uint64_t> permsz; // in target order.
+  std::vector<uint64_t> reord;  // source storage-order -> target order.
+  std::vector<uint64_t> cursor; // in target order.
+};
+
 /// A memory-resident sparse tensor using a storage scheme based on
 /// per-dimension sparse/dense annotations. This data structure provides a
 /// bufferized form of a sparse tensor type. In contrast to generating setup
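[Editor's note] To make the intended client-side use of the enumerator added above concrete, here is a minimal sketch (not part of the patch). It assumes a rank-3 tensor implementing the `EnumerableSparseTensorStorage<double>` interface; the helper name `collectElements` is hypothetical.

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Sketch only: walk a rank-3 tensor under a target ordering and copy out
// all of its elements.  `perm[d]` gives the target position of semantic
// dimension `d`, matching the enumerator's permutation convention.
std::vector<std::pair<std::vector<uint64_t>, double>>
collectElements(const EnumerableSparseTensorStorage<double> &tensor) {
  const uint64_t perm[] = {2, 0, 1};
  SparseTensorEnumerator<double> enumerator(tensor, 3, perm);
  std::vector<std::pair<std::vector<uint64_t>, double>> out;
  // The `ind` reference is the enumerator's reused cursor, so it is copied
  // here (by constructing the pair) rather than stored directly.
  enumerator.forallElements([&](const std::vector<uint64_t> &ind, double val) {
    out.emplace_back(ind, val);
  });
  return out;
}
```

Note that `enumerator.permutedSizes()` would report the dimension sizes in the same target order as the yielded coordinates.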
@@ -297,7 +428,9 @@
 /// a convenient "one-size-fits-all" solution that simply takes an input tensor
 /// and annotations to implement all required setup in a general manner.
 template <typename P, typename I, typename V>
-class SparseTensorStorage : public SparseTensorStorageBase {
+class SparseTensorStorage : public EnumerableSparseTensorStorage<V> {
+  using Base = EnumerableSparseTensorStorage<V>;
+
 public:
   /// Constructs a sparse tensor storage scheme with the given dimensions,
   /// permutation, and per-dimension dense/sparse annotations, using
@@ -307,18 +440,18 @@
   SparseTensorStorage(const std::vector<uint64_t> &szs, const uint64_t *perm,
                       const DimLevelType *sparsity,
                       SparseTensorCOO<V> *coo = nullptr)
-      : SparseTensorStorageBase(szs, perm, sparsity), pointers(getRank()),
-        indices(getRank()), idx(getRank()) {
-    const uint64_t rank = getRank();
+      : Base(szs, perm, sparsity), pointers(Base::getRank()),
+        indices(Base::getRank()), idx(Base::getRank()) {
+    const uint64_t rank = Base::getRank();
     // Provide hints on capacity of pointers and indices.
     // TODO: needs fine-tuning based on sparsity
     bool allDense = true;
     uint64_t sz = 1;
     for (uint64_t r = 0; r < rank; r++) {
-      const uint64_t sz_r = getDimSizes()[r];
+      const uint64_t sz_r = Base::getDimSizes()[r];
       assert(sz_r <= std::numeric_limits<uint64_t>::max() / sz);
       sz *= sz_r;
-      if (isCompressedDim(r)) {
+      if (Base::isCompressedDim(r)) {
         pointers[r].reserve(sz + 1);
         pointers[r].push_back(0);
         indices[r].reserve(sz);
@@ -329,7 +462,7 @@
     // Then assign contents from coordinate scheme tensor if provided.
     if (coo) {
       // Ensure both preconditions of `fromCOO`.
-      assert(coo->getSizes() == Base::getDimSizes() && "Tensor size mismatch");
+      assert(coo->getSizes() == Base::getDimSizes() && "Tensor size mismatch");
      coo->sort();
       // Now actually insert the `elements`.
       const std::vector<Element<V>> &elements = coo->getElements();
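[Editor's note] The mechanical change from `getRank()` to `Base::getRank()` (and similar) throughout this class follows from the new base class depending on the template parameter `V`: members of a dependent base are not found by unqualified lookup inside the derived template. A minimal standalone illustration of the language rule, not taken from the patch:

```cpp
// Members of a base class that depends on a template parameter must be
// reached via a qualified name (or via `this->`).
template <typename T>
struct DependentBase {
  int getRank() const { return 1; }
};

template <typename T>
struct Derived : public DependentBase<T> {
  using Base = DependentBase<T>;
  int f() const {
    // return getRank();     // error: not found during template parsing
    return Base::getRank();  // OK: qualified lookup into the dependent base
  }
};
```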
@@ -345,11 +478,11 @@
   /// Partially specialize these getter methods based on template types.
   void getPointers(std::vector<P> **out, uint64_t d) override {
-    assert(d < getRank());
+    assert(d < Base::getRank());
     *out = &pointers[d];
   }
   void getIndices(std::vector<I> **out, uint64_t d) override {
-    assert(d < getRank());
+    assert(d < Base::getRank());
     *out = &indices[d];
   }
   void getValues(std::vector<V> **out) override { *out = &values; }
 
@@ -378,7 +511,7 @@
     // Sort.
     std::sort(added, added + count);
     // Restore insertion path for first insert.
-    uint64_t rank = getRank();
+    const uint64_t rank = Base::getRank();
     uint64_t index = added[0];
     cursor[rank - 1] = index;
     lexInsert(cursor, values[index]);
@@ -409,24 +542,13 @@
   /// sparse tensor in coordinate scheme with the given dimension order.
   ///
   /// Precondition: `perm` must be valid for `getRank()`.
-  SparseTensorCOO<V> *toCOO(const uint64_t *perm) {
-    // Restore original order of the dimension sizes and allocate coordinate
-    // scheme with desired new ordering specified in perm.
-    const uint64_t rank = getRank();
-    const auto &rev = getRev();
-    const auto &sizes = getDimSizes();
-    std::vector<uint64_t> orgsz(rank);
-    for (uint64_t r = 0; r < rank; r++)
-      orgsz[rev[r]] = sizes[r];
-    SparseTensorCOO<V> *coo = SparseTensorCOO<V>::newSparseTensorCOO(
-        rank, orgsz.data(), perm, values.size());
-    // Populate coordinate scheme restored from old ordering and changed with
-    // new ordering. Rather than applying both reorderings during the recursion,
-    // we compute the combine permutation in advance.
-    std::vector<uint64_t> reord(rank);
-    for (uint64_t r = 0; r < rank; r++)
-      reord[r] = perm[rev[r]];
-    toCOO(*coo, reord, 0, 0);
+  SparseTensorCOO<V> *toCOO(const uint64_t *perm) const {
+    SparseTensorEnumerator<V> enumerator(*this, Base::getRank(), perm);
+    SparseTensorCOO<V> *coo =
+        new SparseTensorCOO<V>(enumerator.permutedSizes(), values.size());
+    enumerator.forallElements([&coo](const std::vector<uint64_t> &ind, V val) {
+      coo->add(ind, val);
+    });
     // TODO: This assertion assumes there are no stored zeros,
     // or if there are then that we don't filter them out.
     // Cf.,
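[Editor's note] A sketch of the end-to-end path the rewritten `toCOO` serves: pack COO input into the annotated storage scheme, then read it back out under a different dimension ordering. The function name `roundTripExample` is hypothetical, and the enumerator names `DimLevelType::kDense`/`kCompressed` are assumed to match the runtime; the constructors and the `newSparseTensorCOO` factory are the ones visible in this patch.

```cpp
#include <cstdint>
#include <vector>

void roundTripExample() {
  const uint64_t rank = 2;
  const uint64_t perm[] = {0, 1};        // identity ordering for construction
  std::vector<uint64_t> sizes = {3, 4};  // a 3x4 matrix
  const DimLevelType sparsity[] = {DimLevelType::kDense,
                                   DimLevelType::kCompressed};

  // Source data in coordinate format (CSR-like annotation above).
  SparseTensorCOO<double> *input =
      SparseTensorCOO<double>::newSparseTensorCOO(rank, sizes.data(), perm, 2);
  input->add({0, 1}, 1.0);
  input->add({2, 3}, 2.0);

  // Pack into the per-dimension pointers/indices/values arrays.
  SparseTensorStorage<uint64_t, uint64_t, double> tensor(sizes, perm, sparsity,
                                                         input);
  delete input; // The storage copies what it needs; it does not take ownership.

  // Read the elements back out transposed: semantic dim 0 -> target dim 1.
  const uint64_t transpose[] = {1, 0};
  SparseTensorCOO<double> *transposed = tensor.toCOO(transpose);
  delete transposed;
}
```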
::max() && "Pointer value is too large for the P-type"); pointers[d].insert(pointers[d].end(), count, static_cast

(pos)); @@ -483,7 +605,7 @@ /// appends the appropriate number of zeros to the `values` array, where /// `full` indicates the highest index already written for this segment. void appendIndex(uint64_t d, uint64_t full, uint64_t i) { - if (isCompressedDim(d)) { + if (Base::isCompressedDim(d)) { assert(i <= std::numeric_limits::max() && "Index value is too large for the I-type"); indices[d].push_back(static_cast(i)); @@ -491,7 +613,7 @@ assert(i >= full && "Index was already filled"); if (i == full) return; // Short-circuit, since it'll be a nop. - if (d + 1 == getRank()) + if (d + 1 == Base::getRank()) values.insert(values.end(), i - full, 0); else finalizeSegment(d + 1, 0, i - full); @@ -508,9 +630,10 @@ /// and pointwise less-than). void fromCOO(const std::vector> &elements, uint64_t lo, uint64_t hi, uint64_t d) { + const uint64_t rank = Base::getRank(); + assert(d <= rank && hi <= elements.size()); // Once dimensions are exhausted, insert the numerical values. - assert(d <= getRank() && hi <= elements.size()); - if (d == getRank()) { + if (d == rank) { assert(lo < hi); values.push_back(elements[lo].value); return; @@ -525,7 +648,7 @@ seg++; // Handle segment in interval for sparse or dense dimension. appendIndex(d, full, i); - if (!isCompressedDim(d)) // i.e., *is* dense. + if (!Base::isCompressedDim(d)) // i.e., *is* dense. full = i + 1; fromCOO(elements, lo, seg, d + 1); // And move on to next segment in interval. @@ -535,39 +658,14 @@ finalizeSegment(d, full); } - /// Stores the sparse tensor storage scheme into a memory-resident sparse - /// tensor in coordinate scheme. - void toCOO(SparseTensorCOO &tensor, std::vector &reord, - uint64_t pos, uint64_t d) { - assert(d <= getRank()); - if (d == getRank()) { - assert(pos < values.size()); - tensor.add(idx, values[pos]); - } else if (isCompressedDim(d)) { - // Sparse dimension. - for (uint64_t ii = pointers[d][pos]; ii < pointers[d][pos + 1]; ii++) { - idx[reord[d]] = indices[d][ii]; - toCOO(tensor, reord, ii, d + 1); - } - } else { - // Dense dimension. - const uint64_t sz = getDimSizes()[d]; - const uint64_t off = pos * sz; - for (uint64_t i = 0; i < sz; i++) { - idx[reord[d]] = i; - toCOO(tensor, reord, off + i, d + 1); - } - } - } - /// Finalize the sparse pointer structure at this dimension. void finalizeSegment(uint64_t d, uint64_t full = 0, uint64_t count = 1) { if (count == 0) return; // Short-circuit, since it'll be a nop. - if (isCompressedDim(d)) { + if (Base::isCompressedDim(d)) { appendPointer(d, indices[d].size(), count); } else { // Dense dimension. - const uint64_t sz = getDimSizes()[d]; + const uint64_t sz = Base::getDimSizes()[d]; assert(sz >= full && "Segment is overfull"); // Assuming we check for overflows in the constructor, then this // multiply will never overflow. @@ -576,7 +674,7 @@ // in this dimension (i.e., coordinates after the last non-zero // element), and either fill in their zero values or else recurse // to finalize some deeper dimension. - if (d + 1 == getRank()) + if (d + 1 == Base::getRank()) values.insert(values.end(), count, 0); else finalizeSegment(d + 1, 0, count); @@ -585,7 +683,7 @@ /// Wraps up a single insertion path, inner to outer. void endPath(uint64_t diff) { - uint64_t rank = getRank(); + const uint64_t rank = Base::getRank(); assert(diff <= rank); for (uint64_t i = 0; i < rank - diff; i++) { const uint64_t d = rank - i - 1; @@ -595,7 +693,7 @@ /// Continues a single insertion path, outer to inner. 
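[Editor's note] A worked illustration (not from the patch) of the per-dimension arrays that `fromCOO`, `appendIndex`, and `finalizeSegment` build, and that the new enumerator later walks. It uses the same 3x4 matrix as the sketch above: dimension 0 dense, dimension 1 compressed, containing (0,1) -> 1.0 and (2,3) -> 2.0.

```cpp
#include <cstdint>
#include <vector>

// Row r's entries occupy positions [pointers[1][r], pointers[1][r+1]) of
// indices[1] and values.  Row 1 is empty, so its pointer segment is empty.
std::vector<uint64_t> expectedPointers1 = {0, 1, 1, 2};
std::vector<uint64_t> expectedIndices1 = {1, 3};  // column of each stored entry
std::vector<double> expectedValues = {1.0, 2.0};  // same order as indices[1]

// Over this layout, forallElements() visits value positions 0 and 1,
// yielding the cursors {0, 1} and {2, 3} (before any permutation).
```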
@@ -595,7 +693,7 @@
 
   /// Continues a single insertion path, outer to inner.
   void insPath(const uint64_t *cursor, uint64_t diff, uint64_t top, V val) {
-    uint64_t rank = getRank();
+    const uint64_t rank = Base::getRank();
     assert(diff < rank);
     for (uint64_t d = diff; d < rank; d++) {
       uint64_t i = cursor[d];
@@ -608,7 +706,7 @@
 
   /// Finds the lexicographic differing dimension.
   uint64_t lexDiff(const uint64_t *cursor) const {
-    for (uint64_t r = 0, rank = getRank(); r < rank; r++)
+    for (uint64_t r = 0, rank = Base::getRank(); r < rank; r++)
       if (cursor[r] > idx[r])
         return r;
       else
@@ -617,6 +715,26 @@
     return -1u;
   }
 
+protected:
+  // Allow `SparseTensorEnumerator` to access these methods without
+  // making them public to other client code.
+  friend class SparseTensorEnumerator<V>;
+  V getValue(uint64_t pos) const override {
+    assert(pos < values.size() && "Value position is out of bounds");
+    return values[pos];
+  }
+  uint64_t getPointer(uint64_t d, uint64_t parentPos) const override {
+    assert(Base::isCompressedDim(d)); // Entails `d < getRank()`.
+    assert(parentPos < pointers[d].size() &&
+           "Pointer position is out of bounds");
+    return pointers[d][parentPos]; // Converts the stored `P` into `uint64_t`.
+  }
+  uint64_t getIndex(uint64_t d, uint64_t pos) const override {
+    assert(Base::isCompressedDim(d)); // Entails `d < getRank()`.
+    assert(pos < indices[d].size() && "Index position is out of bounds");
+    return indices[d][pos]; // Converts the stored `I` into `uint64_t`.
+  }
+
 private:
   std::vector<std::vector<P>> pointers;
   std::vector<std::vector<I>> indices;
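[Editor's note] The class comment motivates the `V`-only abstraction with direct sparse-to-sparse conversion, which this patch itself does not implement (it only rewires `toCOO`). The sketch below, using only the interfaces introduced above, shows how a converter could enumerate a source without knowing its `<P,I>` parameters and build a target with different ones. The helper name `convertSparseToSparse`, the intermediate COO, and the parameter conventions are assumptions, not part of the change.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical: convert between storages with different pointer/index types
// (e.g. a 32-bit source and a 64-bit target) via the V-typed enumerator.
template <typename DstP, typename DstI, typename V>
SparseTensorStorage<DstP, DstI, V> *
convertSparseToSparse(const EnumerableSparseTensorStorage<V> &src,
                      uint64_t rank, const uint64_t *perm,
                      const DimLevelType *sparsity) {
  SparseTensorEnumerator<V> enumerator(src, rank, perm);
  // This sketch still stages through a COO in target order; a later change
  // could insert directly into the target storage instead.
  SparseTensorCOO<V> coo(enumerator.permutedSizes(), /*capacity=*/0);
  enumerator.forallElements(
      [&coo](const std::vector<uint64_t> &ind, V val) { coo.add(ind, val); });
  return new SparseTensorStorage<DstP, DstI, V>(enumerator.permutedSizes(),
                                                perm, sparsity, &coo);
}
```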