diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp @@ -352,6 +352,32 @@ rewriter.create(loc, elemV, tensor, ivs); } +/// Determine if the runtime library supports direct conversion to the +/// given target `dimTypes`. +static bool canUseDirectConversion( + ArrayRef dimTypes) { + bool alreadyCompressed = false; + for (uint64_t rank = dimTypes.size(), r = 0; r < rank; r++) { + switch (dimTypes[r]) { + case SparseTensorEncodingAttr::DimLevelType::Compressed: + if (alreadyCompressed) + return false; // Multiple compressed dimensions not yet supported. + alreadyCompressed = true; + break; + case SparseTensorEncodingAttr::DimLevelType::Dense: + if (alreadyCompressed) + return false; // Dense after Compressed not yet supported. + break; + case SparseTensorEncodingAttr::DimLevelType::Singleton: + // Although Singleton isn't generally supported yet, the direct + // conversion method doesn't have any particular problems with + // singleton after compressed. + break; + } + } + return true; +} + //===----------------------------------------------------------------------===// // Conversion rules. //===----------------------------------------------------------------------===// @@ -489,19 +515,39 @@ SmallVector params; ShapedType stp = srcType.cast(); sizesFromPtr(rewriter, sizes, op, encSrc, stp, src); - // Set up encoding with right mix of src and dst so that the two - // method calls can share most parameters, while still providing - // the correct sparsity information to either of them. - auto enc = SparseTensorEncodingAttr::get( - op->getContext(), encDst.getDimLevelType(), encDst.getDimOrdering(), - encSrc.getPointerBitWidth(), encSrc.getIndexBitWidth()); - newParams(rewriter, params, op, stp, enc, Action::kToCOO, sizes, src); - Value coo = genNewCall(rewriter, op, params); - params[3] = constantPointerTypeEncoding(rewriter, loc, encDst); - params[4] = constantIndexTypeEncoding(rewriter, loc, encDst); - params[6] = constantAction(rewriter, loc, Action::kFromCOO); - params[7] = coo; - rewriter.replaceOp(op, genNewCall(rewriter, op, params)); + bool useDirectConversion; + switch (options.sparseToSparseStrategy) { + case SparseToSparseConversionStrategy::kViaCOO: + useDirectConversion = false; + break; + case SparseToSparseConversionStrategy::kDirect: + useDirectConversion = true; + assert(canUseDirectConversion(encDst.getDimLevelType()) && + "Unsupported target for direct sparse-to-sparse conversion"); + break; + case SparseToSparseConversionStrategy::kAuto: + useDirectConversion = canUseDirectConversion(encDst.getDimLevelType()); + break; + } + if (useDirectConversion) { + newParams(rewriter, params, op, stp, encDst, Action::kSparseToSparse, + sizes, src); + rewriter.replaceOp(op, genNewCall(rewriter, op, params)); + } else { // use via-COO conversion. + // Set up encoding with right mix of src and dst so that the two + // method calls can share most parameters, while still providing + // the correct sparsity information to either of them. 
+      auto enc = SparseTensorEncodingAttr::get(
+          op->getContext(), encDst.getDimLevelType(), encDst.getDimOrdering(),
+          encSrc.getPointerBitWidth(), encSrc.getIndexBitWidth());
+      newParams(rewriter, params, op, stp, enc, Action::kToCOO, sizes, src);
+      Value coo = genNewCall(rewriter, op, params);
+      params[3] = constantPointerTypeEncoding(rewriter, loc, encDst);
+      params[4] = constantIndexTypeEncoding(rewriter, loc, encDst);
+      params[6] = constantAction(rewriter, loc, Action::kFromCOO);
+      params[7] = coo;
+      rewriter.replaceOp(op, genNewCall(rewriter, op, params));
+    }
     return success();
   }
   if (!encDst && encSrc) {
diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
@@ -73,6 +73,22 @@
 static constexpr int kColWidth = 1025;
+// TODO(wrengr): adjust this so it can be used by `openSparseTensorCOO`
+// too. That version doesn't have the permutation, and the `sizes`
+// are a pointer/C-array rather than `std::vector`.
+/// Precondition: `perm` and `shape` must be valid for `rank`.
+static inline void assertPermutedSizesMatchShape(
+    const std::vector<uint64_t> &sizes, // in target order.
+    const uint64_t rank,
+    const uint64_t *perm,    // semantic order -> target order.
+    const uint64_t *shape) { // in semantic order.
+  assert(rank == sizes.size() && "Rank mismatch");
+  for (uint64_t r = 0; r < rank; r++) {
+    const uint64_t s = shape[r];
+    assert((s == 0 || s == sizes[perm[r]]) && "Dimension size mismatch");
+  }
+}
+
 /// A sparse tensor element in coordinate scheme (value and indices).
 /// For example, a rank-1 vector element would look like
 ///   ({i}, a[i])
@@ -171,6 +187,7 @@
   unsigned iteratorPos;
 };
+//===----------------------------------------------------------------------===//
 /// Abstract base class of sparse tensor storage. Note that we use
 /// function overloading to implement "partial" method specialization.
 class SparseTensorStorageBase {
@@ -373,6 +390,132 @@
   }
 }; // class SparseTensorEnumerator
+//===----------------------------------------------------------------------===//
+// Forward declaration.
+template <typename P, typename I, typename V>
+class SparseTensorStorage;
+
+/// Statistics regarding the number of nonzero subtensors in
+/// a source tensor, for direct sparse=>sparse conversion.
+///
+/// N.B., this class stores a reference to the dimension-sizes passed
+/// to the constructor; thus, objects of this class must not outlive
+/// the dimension-sizes they depend on.
+class SparseTensorNNZ {
+  // All of these are in the target storage-order.
+  const std::vector<uint64_t> &dimSizes;
+  const std::vector<DimLevelType> dimTypes;
+  std::vector<std::vector<uint64_t>> nnz;
+
+public:
+  SparseTensorNNZ(const SparseTensorNNZ &) = delete;
+  SparseTensorNNZ(SparseTensorNNZ &&) = delete;
+  SparseTensorNNZ &operator=(const SparseTensorNNZ &) = delete;
+  SparseTensorNNZ &operator=(SparseTensorNNZ &&) = delete;
+
+  /// Returns the rank of the target tensor.
+  uint64_t getRank() const { return dimSizes.size(); }
+
+  /// Allocate the statistics structure for the desired sizes and
+  /// sparsity (in the target tensor's storage-order).
+  ///
+  /// Preconditions:
+  /// (1) `sparsity` must be valid for `szs.size()`
+  /// (2) `szs` must not contain zeros.
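+  ///
+  /// For example, with dimension sizes (2, 3) and dimension types
+  /// (dense, compressed), only `nnz[1]` gets any counters: one per row
+  /// (two counters total), each recording how many entries of that row
+  /// are stored.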
+  SparseTensorNNZ(const std::vector<uint64_t> &szs,
+                  const DimLevelType *sparsity)
+      : dimSizes(szs), dimTypes(sparsity, sparsity + getRank()),
+        nnz(getRank()) {
+    bool uncompressed = true;
+    uint64_t sz = 1; // sz_r := product(dimSizes[0:r-1]);
+    for (uint64_t rank = getRank(), r = 0; r < rank; r++) {
+      switch (dimTypes[r]) {
+      case DimLevelType::kCompressed:
+        assert(uncompressed &&
+               "Multiple compressed layers not currently supported");
+        uncompressed = false;
+        nnz[r].resize(sz, 0); // Both allocate and zero-initialize.
+        break;
+      case DimLevelType::kDense:
+        assert(uncompressed &&
+               "Dense after compressed not currently supported");
+        break;
+      case DimLevelType::kSingleton:
+        // Singleton after Compressed causes no problems for allocating
+        // `nnz` nor for the yieldPos loop. This remains true even
+        // when adding support for multiple compressed dimensions or
+        // for dense-after-compressed.
+        break;
+      }
+      // TODO(wrengr): this division is expensive, but replacing it
+      // with ffs/clz or intrinsics is non-portable; does mlir have
+      // any cpp macro to enable this only when we want "paranoia mode"
+      // for extra debugging info? Or should we just rely on Clang's
+      // -fsanitize=integer ?
+      assert(dimSizes[r] <= std::numeric_limits<uint64_t>::max() / sz);
+      sz *= dimSizes[r];
+    }
+  }
+
+  /// Enumerate the source tensor to fill in the statistics. The
+  /// enumerator should already incorporate the permutation of dimensions
+  /// from the semantic-order to the target tensor's storage-order.
+  template <typename V>
+  void initialize(SparseTensorEnumerator<V> &enumerator) {
+    assert(enumerator.getRank() == getRank() && "Tensor rank mismatch");
+    assert(enumerator.permutedSizes() == dimSizes && "Tensor size mismatch");
+    enumerator.forallElements(
+        [this](const std::vector<uint64_t> &ind, V) { add(ind); });
+  }
+
+private:
+  /// Adds a new element (i.e., increment its statistics). We use
+  /// a method rather than inlining into the lambda in `initialize`,
+  /// to avoid spurious templating over `V`. And this method is private
+  /// to avoid needing to re-assert validity of `ind` (which is guaranteed
+  /// by `forallElements`).
+  void add(const std::vector<uint64_t> &ind) {
+    uint64_t parentPos = 0;
+    for (uint64_t rank = getRank(), r = 0; r < rank; r++) {
+      if (dimTypes[r] == DimLevelType::kCompressed)
+        nnz[r][parentPos]++;
+      parentPos = parentPos * dimSizes[r] + ind[r];
+    }
+  }
+
+public:
+  /// The type of callback functions which receive an nnz-statistic.
+  typedef const std::function<void(uint64_t)> &NNZConsumer;
+
+  /// Lexicographically enumerates all indices for dimensions strictly
+  /// less than `stopDim`, and passes their nnz statistic to the callback.
+  /// Since our use case only requires the statistic not the coordinates
+  /// themselves, we do not bother to construct them.
+  void forallIndices(uint64_t stopDim, NNZConsumer yield) const {
+    assert(stopDim < getRank() && "Stopping-dimension is out of bounds");
+    assert(dimTypes[stopDim] == DimLevelType::kCompressed &&
+           "Cannot look up non-compressed dimensions");
+    forallIndices(yield, stopDim, 0, 0);
+  }
+
+private:
+  /// Recursive component of the public `forallIndices`.
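+  /// Here `parentPos` is the linearized index over the coordinates already
+  /// chosen for dimensions `[0, d)`; once `d` reaches `stopDim`, it selects
+  /// which nnz counter to yield.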
+ void forallIndices(NNZConsumer yield, uint64_t stopDim, uint64_t parentPos, + uint64_t d) const { + assert(d <= stopDim); + if (d == stopDim) { + assert(parentPos < nnz[d].size() && "Cursor is out of range"); + yield(nnz[d][parentPos]); + } else { + const uint64_t sz = dimSizes[d]; + const uint64_t pstart = parentPos * sz; + for (uint64_t i = 0; i < sz; i++) + forallIndices(yield, stopDim, pstart + i, d + 1); + } + } +}; // class SparseTensorNNZ + //===----------------------------------------------------------------------===// /// A memory-resident sparse tensor using a storage scheme based on /// per-dimension sparse/dense annotations. This data structure provides a @@ -382,40 +525,58 @@ /// and annotations to implement all required setup in a general manner. template class SparseTensorStorage : protected EnumerableSparseTensorStorage { -public: - /// Constructs a sparse tensor storage scheme with the given dimensions, - /// permutation, and per-dimension dense/sparse annotations, using - /// the coordinate scheme tensor for the initial contents if provided. +private: + /// Partial constructor for sharing code between the public + /// constructors. This only does the bare minimum setup for the + /// other constructors, thus the resulting object is not guaranteed + /// to be in a valid state. In particular `isCompressedDim` (and + /// hence `appendCurrentPointer` etc) will not work. /// /// Precondition: `perm` and `sparsity` must be valid for `szs.size()`. SparseTensorStorage(const std::vector &szs, const uint64_t *perm, - const DimLevelType *sparsity, - SparseTensorCOO *tensor = nullptr) + const DimLevelType *sparsity) : sizes(szs), rev(getRank()), idx(getRank()), pointers(getRank()), indices(getRank()) { - uint64_t rank = getRank(); + // Validate parameters. + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + assert(sizes[r] > 0 && "Dimension size zero has trivial storage"); + assert((sparsity[r] == DimLevelType::kDense || + sparsity[r] == DimLevelType::kCompressed) && + "Unsupported DimLevelType"); + } // Store "reverse" permutation. - for (uint64_t r = 0; r < rank; r++) + for (uint64_t rank = getRank(), r = 0; r < rank; r++) rev[perm[r]] = r; + } + +public: + /// Constructs a sparse tensor storage scheme with the given dimensions, + /// permutation, and per-dimension dense/sparse annotations, using + /// the coordinate scheme tensor for the initial contents if provided. + /// + /// Precondition: `perm` and `sparsity` must be valid for `szs.size()`. + SparseTensorStorage(const std::vector &szs, const uint64_t *perm, + const DimLevelType *sparsity, SparseTensorCOO *tensor) + : SparseTensorStorage(szs, perm, sparsity) { // Provide hints on capacity of pointers and indices. // TODO: needs fine-tuning based on sparsity bool allDense = true; uint64_t sz = 1; - for (uint64_t r = 0; r < rank; r++) { - assert(sizes[r] > 0 && "Dimension size zero has trivial storage"); + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + // TODO(wrengr): this division is expensive, but replacing it + // with ffs/clz or intrinsics is non-portable; does mlir have + // any cpp macro to enable this only when we want "paranoia mode" + // for extra debugging info? Or should we just rely on Clang's + // -fsanitize=integer ? + assert(sizes[r] <= std::numeric_limits::max() / sz); sz *= sizes[r]; if (sparsity[r] == DimLevelType::kCompressed) { pointers[r].reserve(sz + 1); indices[r].reserve(sz); sz = 1; allDense = false; - // Prepare the pointer structure. 
We cannot use `appendPointer` - // here, because `isCompressedDim` won't work until after this - // preparation has been done. + // Can't use `isCompressedDim` until after we push this zero. pointers[r].push_back(0); - } else { - assert(sparsity[r] == DimLevelType::kDense && - "singleton not yet supported"); } } // Then assign contents from coordinate scheme tensor if provided. @@ -433,6 +594,114 @@ } } + /// Constructs a sparse tensor storage scheme with the given dimensions, + /// permutation, and per-dimension dense/sparse annotations, using + /// the given sparse tensor for the initial contents. + /// + /// Precondition: `perm` and `sparsity` must be valid for `szs.size()`. + SparseTensorStorage(const std::vector &szs, const uint64_t *perm, + const DimLevelType *sparsity, + const EnumerableSparseTensorStorage &tensor) + : SparseTensorStorage(szs, perm, sparsity) { + SparseTensorEnumerator enumerator(tensor, getRank(), perm); + { + // Initialize the statistics structure. + SparseTensorNNZ nnz(sizes, sparsity); + nnz.initialize(enumerator); + // Initialize "pointers" overhead (and allocate "indices", "values"). + uint64_t parentSz = 1; // assembled-size (not dimension-size) of `r-1`. + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (sparsity[r] == DimLevelType::kCompressed) { + pointers[r].reserve(parentSz + 1); + // Can't use `isCompressedDim` until after we push this zero. + pointers[r].push_back(0); + uint64_t currentPos = 0; + nnz.forallIndices(r, [this, ¤tPos, r](uint64_t n) { + currentPos += n; + appendPointer(r, currentPos); + }); + assert(pointers[r].size() == parentSz + 1 && + "Final pointers size doesn't match allocated size"); + // That assertion entails `assembledSize(parentSz, r)` + // is now in a valid state. That is, `pointers[r][parentSz]` + // equals the present value of `currentPos`, which is the + // correct assembled-size for `indices[r]`. + } + // Update assembled-size for the next iteration. + parentSz = assembledSize(parentSz, r); + // Ideally we need only `indices[r].reserve(parentSz)`, however + // the `std::vector` implementation forces us to initialize it too. + // That is, in the yieldPos loop we need random-access assignment + // to `indices[r]`; however, `std::vector`'s subscript-assignment + // only allows assigning to already-initialized positions. + if (isCompressedDim(r)) + indices[r].resize(parentSz, 0); + } + values.resize(parentSz, 0); // Both allocate and zero-initialize. + } // end initializing "pointers" overhead + // The yieldPos loop + enumerator.forallElements([this](const std::vector &ind, V val) { + uint64_t parentSz = 1, parentPos = 0; + for (uint64_t rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + // If `parentPos == parentSz` then it's valid as an array-lookup; + // however, it's semantically invalid here since that entry + // does not represent a segment of `indices[r]`. Moreover, that + // entry must be immutable for `assembledSize` to remain valid. + assert(parentPos < parentSz && "Pointers position is out of bounds"); + const uint64_t currentPos = pointers[r][parentPos]; + // This increment won't overflow the `P` type, since it can't + // exceed the original value of `pointers[r][parentPos+1]` + // which was already verified to be within bounds for `P` + // when it was written to the array. 
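+          // For example, if the assembled pointers were [0, 2, 3], then
+          // these cursor advances leave them as [2, 3, 3] once every
+          // element has been visited, and the finalizeYieldPos loop
+          // below shifts them back to [0, 2, 3].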
+ pointers[r][parentPos]++; + writeIndex(r, currentPos, ind[r]); + parentPos = currentPos; + } else { // if dense + parentPos = parentPos * sizes[r] + ind[r]; + } // else if singleton: the new `parentPos` is the same as the old one. + parentSz = assembledSize(parentSz, r); + } + assert(parentPos < values.size() && "Value position is out of bounds"); + values[parentPos] = val; + }); // end yieldPos loop + // The finalizeYieldPos loop + for (uint64_t parentSz = 1, rank = getRank(), r = 0; r < rank; r++) { + if (isCompressedDim(r)) { + assert(parentSz == pointers[r].size() - 1 && + "Actual pointers size doesn't match the expected size"); + // Can't check all of them, but at least we can check the last one. + assert(pointers[r][parentSz - 1] == pointers[r][parentSz] && + "Pointers got corrupted"); + // TODO: optimize this by using `memmove` or similar. + for (uint64_t n = 0; n < parentSz; n++) { + uint64_t parentPos = parentSz - n; + pointers[r][parentPos] = pointers[r][parentPos - 1]; + } + pointers[r][0] = 0; + } + parentSz = assembledSize(parentSz, r); + } // end finalizeYieldPos loop + } + +private: + /// Computes the assembled-size associated with the `d`-th dimension, + /// given the assembled-size associated with the `(d-1)`-th dimension. + /// "Assembled-sizes" correspond to the (nominal) sizes of overhead + /// storage, as opposed to "dimension-sizes" which are the cardinality + /// of coordinates for that dimension. + /// + /// Precondition: the `pointers[d]` array must be fully initialized + /// before calling this method. + inline uint64_t assembledSize(uint64_t parentSz, uint64_t d) const { + if (isCompressedDim(d)) + return pointers[d][parentSz]; + // else if dense: + return parentSz * sizes[d]; + // else if singleton: `return parentSz;` + } + +public: ~SparseTensorStorage() override = default; /// Get the rank of the tensor. @@ -533,39 +802,71 @@ /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. static SparseTensorStorage * newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm, - const DimLevelType *sparsity, SparseTensorCOO *tensor) { + const DimLevelType *sparsity, SparseTensorCOO *coo) { SparseTensorStorage *n = nullptr; - if (tensor) { - assert(tensor->getRank() == rank && "Tensor rank mismatch"); - for (uint64_t r = 0; r < rank; r++) - assert(shape[r] == 0 || shape[r] == tensor->getSizes()[perm[r]]); - n = new SparseTensorStorage(tensor->getSizes(), perm, sparsity, - tensor); - delete tensor; + if (coo) { + const auto &coosz = coo->getSizes(); + assertPermutedSizesMatchShape(coosz, rank, perm, shape); + n = new SparseTensorStorage(coosz, perm, sparsity, coo); + delete coo; } else { std::vector permsz(rank); for (uint64_t r = 0; r < rank; r++) { assert(shape[r] > 0 && "Dimension size zero has trivial storage"); permsz[perm[r]] = shape[r]; } - n = new SparseTensorStorage(permsz, perm, sparsity); + // We pass the null `coo` to ensure we select the intended constructor. + n = new SparseTensorStorage(permsz, perm, sparsity, coo); } return n; } + /// Factory method. Constructs a sparse tensor storage scheme with + /// the given dimensions, permutation, and per-dimension dense/sparse + /// annotations, using the sparse tensor for the initial contents. + /// + /// Precondition: `shape`, `perm`, and `sparsity` must be valid for `rank`. 
+  static SparseTensorStorage<P, I, V> *
+  newSparseTensor(uint64_t rank, const uint64_t *shape, const uint64_t *perm,
+                  const DimLevelType *sparsity,
+                  const EnumerableSparseTensorStorage<V> *source) {
+    assert(source && "Got nullptr for source");
+    SparseTensorEnumerator<V> enumerator(*source, rank, perm);
+    const auto &permsz = enumerator.permutedSizes();
+    assertPermutedSizesMatchShape(permsz, rank, perm, shape);
+    // N.B., do *not* `delete source`. The other `newSparseTensor`
+    // only deletes the COO because the end-user has no way to do so
+    // themselves. Whereas if we delete the source here, then we'll
+    // get a double-free error in `delSparseTensor`.
+    return new SparseTensorStorage<P, I, V>(permsz, perm, sparsity, *source);
+  }
+
 private:
   /// Appends the next free position of `indices[d]` to `pointers[d]`.
   /// Thus, when called after inserting the last element of a segment,
   /// it will append the position where the next segment begins.
-  inline void appendPointer(uint64_t d) {
+  inline void appendCurrentPointer(uint64_t d) {
     assert(isCompressedDim(d)); // Entails `d < getRank()`.
-    uint64_t p = indices[d].size();
-    assert(p <= std::numeric_limits<P>::max() &&
+    appendPointer(d, indices[d].size());
+  }
+
+  /// Appends an arbitrary new position to `pointers[d]`. This method
+  /// checks that `pos` is representable in the `P` type; however, it
+  /// does not check that `pos` is semantically valid (i.e., larger than
+  /// the previous position and smaller than `indices[d].capacity()`).
+  inline void appendPointer(uint64_t d, uint64_t pos) {
+    // This first assertion is trivially satisfied by both use sites;
+    // but we leave it in just to be safe.
+    assert(isCompressedDim(d)); // Entails `d < getRank()`.
+    assert(pos <= std::numeric_limits<P>
::max() && "Pointer value is too large for the P-type"); - pointers[d].push_back(p); // Here is where we convert to `P`. + pointers[d].push_back(pos); // Here is where we convert to `P`. } - /// Appends the given index to `indices[d]`. + /// Appends the given index to `indices[d]`. This method checks that + /// `i` is representable in the `I` type; however, it does not check + /// that `i` is semantically valid (i.e., in bounds for `sizes[d]` + /// and not previously occurring in the same segment). inline void appendIndex(uint64_t d, uint64_t i) { assert(isCompressedDim(d)); // Entails `d < getRank()`. assert(i <= std::numeric_limits::max() && @@ -573,6 +874,23 @@ indices[d].push_back(i); // Here is where we convert to `I`. } + /// Writes the given coordinate to `indices[d][pos]`. This method + /// checks that `i` is representable in the `I` type; however, it + /// does not check that `i` is semantically valid (i.e., in bounds + /// for `sizes[d]` and not elsewhere occurring in the same segment). + inline void writeIndex(uint64_t d, uint64_t pos, uint64_t i) { + // This first assertion trivially entailed at the one call-site; + // but just to be safe. + assert(isCompressedDim(d)); // Entails `d < getRank()`. + // Subscript assignment to `std::vector` requires that the `pos`-th + // entry has been initialized; thus we must be sure to check `size()` + // here, instead of `capacity()` as would be ideal. + assert(pos < indices[d].size() && "Index position is out of bounds"); + assert(i <= std::numeric_limits::max() && + "Index value is too large for the I-type"); + indices[d][pos] = i; // Here is where we convert to `I`. + } + /// Initializes sparse tensor storage scheme from a memory-resident sparse /// tensor in coordinate scheme. This method prepares the pointers and /// indices arrays under the given per-dimension dense/sparse annotations. @@ -615,7 +933,7 @@ } // Finalize the sparse pointer structure at this dimension. if (isCompressedDim(d)) { - appendPointer(d); + appendCurrentPointer(d); } else { // For dense storage we must fill in all the zero values after // the last element. @@ -630,7 +948,7 @@ if (d == getRank()) { values.push_back(0); } else if (isCompressedDim(d)) { - appendPointer(d); + appendCurrentPointer(d); } else { for (uint64_t full = 0, sz = sizes[d]; full < sz; full++) endDim(d + 1); @@ -644,7 +962,7 @@ for (uint64_t i = 0; i < rank - diff; i++) { uint64_t d = rank - i - 1; if (isCompressedDim(d)) { - appendPointer(d); + appendCurrentPointer(d); } else { for (uint64_t full = idx[d] + 1, sz = sizes[d]; full < sz; full++) endDim(d + 1); @@ -718,6 +1036,7 @@ std::vector values; }; +//===----------------------------------------------------------------------===// /// Helper to convert string to lower case. static char *toLower(char *token) { for (char *c = token; *c; c++) @@ -832,7 +1151,7 @@ "dimension size mismatch"); SparseTensorCOO *tensor = SparseTensorCOO::newSparseTensorCOO(rank, idata + 2, perm, nnz); - // Read all nonzero elements. + // Read all nonzero elements. 
 std::vector<uint64_t> indices(rank);
   for (uint64_t k = 0; k < nnz; k++) {
     if (!fgets(line, kColWidth, file)) {
@@ -987,28 +1306,34 @@
 #define CASE(p, i, v, P, I, V)                                                 \
   if (ptrTp == (p) && indTp == (i) && valTp == (v)) {                          \
-    SparseTensorCOO<V> *tensor = nullptr;                                      \
+    SparseTensorCOO<V> *coo = nullptr;                                         \
     if (action <= Action::kFromCOO) {                                          \
       if (action == Action::kFromFile) {                                       \
         char *filename = static_cast<char *>(ptr);                             \
-        tensor = openSparseTensorCOO<V>(filename, rank, shape, perm);          \
+        coo = openSparseTensorCOO<V>(filename, rank, shape, perm);             \
       } else if (action == Action::kFromCOO) {                                 \
-        tensor = static_cast<SparseTensorCOO<V> *>(ptr);                       \
+        coo = static_cast<SparseTensorCOO<V> *>(ptr);                          \
       } else {                                                                 \
         assert(action == Action::kEmpty);                                      \
       }                                                                        \
+      return SparseTensorStorage<P, I, V>::newSparseTensor(rank, shape, perm,  \
+                                                           sparsity, coo);     \
+    }                                                                          \
+    if (action == Action::kSparseToSparse) {                                   \
+      EnumerableSparseTensorStorage<V> *tensor =                               \
+          static_cast<EnumerableSparseTensorStorage<V> *>(ptr);                \
       return SparseTensorStorage<P, I, V>::newSparseTensor(rank, shape, perm,  \
                                                            sparsity, tensor);  \
     }                                                                          \
     if (action == Action::kEmptyCOO)                                           \
       return SparseTensorCOO<V>::newSparseTensorCOO(rank, shape, perm);        \
-    tensor = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);    \
+    coo = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);       \
     if (action == Action::kToIterator) {                                       \
-      tensor->startIterator();                                                 \
+      coo->startIterator();                                                    \
    } else {                                                                    \
       assert(action == Action::kToCOO);                                        \
     }                                                                          \
-    return tensor;                                                             \
+    return coo;                                                                \
   }

 #define CASE_SECSAME(p, v, P, V) CASE(p, p, v, P, P, V)
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -1,4 +1,10 @@
-// RUN: mlir-opt %s --sparse-tensor-conversion --canonicalize --cse | FileCheck %s
+// First use with `kViaCOO` for sparse2sparse conversion (the old way).
+// RUN: mlir-opt %s --sparse-tensor-conversion="s2s-strategy=1" \ +// RUN: --canonicalize --cse | FileCheck %s +// +// Now again with `kAuto` (the new default) +// RUN: mlir-opt %s --sparse-tensor-conversion="s2s-strategy=0" \ +// RUN: --canonicalize --cse | FileCheck %s -check-prefix=CHECKAUTO #SparseVector = #sparse_tensor.encoding<{ dimLevelType = ["compressed"] @@ -191,15 +197,28 @@ // CHECK-LABEL: func @sparse_convert_1d_ss( // CHECK-SAME: %[[A:.*]]: !llvm.ptr) +// CHECK-DAG: %[[ToCOO:.*]] = arith.constant 5 : i32 +// CHECK-DAG: %[[FromCOO:.*]] = arith.constant 2 : i32 // CHECK-DAG: %[[P:.*]] = memref.alloca() : memref<1xi8> // CHECK-DAG: %[[Q:.*]] = memref.alloca() : memref<1xindex> // CHECK-DAG: %[[R:.*]] = memref.alloca() : memref<1xindex> // CHECK-DAG: %[[X:.*]] = memref.cast %[[P]] : memref<1xi8> to memref // CHECK-DAG: %[[Y:.*]] = memref.cast %[[Q]] : memref<1xindex> to memref // CHECK-DAG: %[[Z:.*]] = memref.cast %[[R]] : memref<1xindex> to memref -// CHECK: %[[C:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[A]]) -// CHECK: %[[T:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[C]]) +// CHECK: %[[C:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %[[ToCOO]], %[[A]]) +// CHECK: %[[T:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %[[FromCOO]], %[[C]]) // CHECK: return %[[T]] : !llvm.ptr +// CHECKAUTO-LABEL: func @sparse_convert_1d_ss( +// CHECKAUTO-SAME: %[[A:.*]]: !llvm.ptr) +// CHECKAUTO-DAG: %[[SparseToSparse:.*]] = arith.constant 3 : i32 +// CHECKAUTO-DAG: %[[P:.*]] = memref.alloca() : memref<1xi8> +// CHECKAUTO-DAG: %[[Q:.*]] = memref.alloca() : memref<1xindex> +// CHECKAUTO-DAG: %[[R:.*]] = memref.alloca() : memref<1xindex> +// CHECKAUTO-DAG: %[[X:.*]] = memref.cast %[[P]] : memref<1xi8> to memref +// CHECKAUTO-DAG: %[[Y:.*]] = memref.cast %[[Q]] : memref<1xindex> to memref +// CHECKAUTO-DAG: %[[Z:.*]] = memref.cast %[[R]] : memref<1xindex> to memref +// CHECKAUTO: %[[T:.*]] = call @newSparseTensor(%[[X]], %[[Y]], %[[Z]], %{{.*}}, %{{.*}}, %{{.*}}, %[[SparseToSparse]], %[[A]]) +// CHECKAUTO: return %[[T]] : !llvm.ptr func @sparse_convert_1d_ss(%arg0: tensor) -> tensor { %0 = sparse_tensor.convert %arg0 : tensor to tensor return %0 : tensor diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir new file mode 100644 --- /dev/null +++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion_sparse2sparse.mlir @@ -0,0 +1,102 @@ +// Force this file to use the kDirect method for sparse2sparse +// RUN: mlir-opt %s --sparse-compiler="s2s-strategy=2" | \ +// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ +// RUN: -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \ +// RUN: FileCheck %s + +#Tensor1 = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "dense", "compressed" ] +}> + +// NOTE: dense after compressed is not currently supported for the target +// of direct-sparse2sparse conversion. (It's fine for the source though.) +#Tensor2 = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "compressed", "dense" ] +}> + +#Tensor3 = #sparse_tensor.encoding<{ + dimLevelType = [ "dense", "dense", "compressed" ], + dimOrdering = affine_map<(i,j,k) -> (i,k,j)> +}> + +module { + // + // Utilities for output and releasing memory. 
+ // + func @dump(%arg0: tensor<2x3x4xf64>) { + %c0 = arith.constant 0 : index + %d0 = arith.constant -1.0 : f64 + %0 = vector.transfer_read %arg0[%c0, %c0, %c0], %d0: tensor<2x3x4xf64>, vector<2x3x4xf64> + vector.print %0 : vector<2x3x4xf64> + return + } + func @dumpAndRelease_234(%arg0: tensor<2x3x4xf64>) { + call @dump(%arg0) : (tensor<2x3x4xf64>) -> () + %1 = bufferization.to_memref %arg0 : memref<2x3x4xf64> + memref.dealloc %1 : memref<2x3x4xf64> + return + } + + // + // Main driver. + // + func @entry() { + // + // Initialize a 3-dim dense tensor. + // + %src = arith.constant dense<[ + [ [ 1.0, 2.0, 3.0, 4.0 ], + [ 5.0, 6.0, 7.0, 8.0 ], + [ 9.0, 10.0, 11.0, 12.0 ] ], + [ [ 13.0, 14.0, 15.0, 16.0 ], + [ 17.0, 18.0, 19.0, 20.0 ], + [ 21.0, 22.0, 23.0, 24.0 ] ] + ]> : tensor<2x3x4xf64> + + // + // Convert dense tensor directly to various sparse tensors. + // + %s1 = sparse_tensor.convert %src : tensor<2x3x4xf64> to tensor<2x3x4xf64, #Tensor1> + %s2 = sparse_tensor.convert %src : tensor<2x3x4xf64> to tensor<2x3x4xf64, #Tensor2> + %s3 = sparse_tensor.convert %src : tensor<2x3x4xf64> to tensor<2x3x4xf64, #Tensor3> + + // + // Convert sparse tensor directly to another sparse format. + // + %t13 = sparse_tensor.convert %s1 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64, #Tensor3> + %t21 = sparse_tensor.convert %s2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64, #Tensor1> + %t23 = sparse_tensor.convert %s2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64, #Tensor3> + %t31 = sparse_tensor.convert %s3 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64, #Tensor1> + + // + // Convert sparse tensor back to dense. + // + %d13 = sparse_tensor.convert %t13 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64> + %d21 = sparse_tensor.convert %t21 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64> + %d23 = sparse_tensor.convert %t23 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64> + %d31 = sparse_tensor.convert %t31 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64> + + // + // Check round-trip equality. And release dense tensors. + // + // CHECK-COUNT-5: ( ( ( 1, 2, 3, 4 ), ( 5, 6, 7, 8 ), ( 9, 10, 11, 12 ) ), ( ( 13, 14, 15, 16 ), ( 17, 18, 19, 20 ), ( 21, 22, 23, 24 ) ) ) + call @dump(%src) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d13) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d21) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d23) : (tensor<2x3x4xf64>) -> () + call @dumpAndRelease_234(%d31) : (tensor<2x3x4xf64>) -> () + + // + // Release sparse tensors. + // + sparse_tensor.release %t13 : tensor<2x3x4xf64, #Tensor3> + sparse_tensor.release %t21 : tensor<2x3x4xf64, #Tensor1> + sparse_tensor.release %t23 : tensor<2x3x4xf64, #Tensor3> + sparse_tensor.release %t31 : tensor<2x3x4xf64, #Tensor1> + sparse_tensor.release %s1 : tensor<2x3x4xf64, #Tensor1> + sparse_tensor.release %s2 : tensor<2x3x4xf64, #Tensor2> + sparse_tensor.release %s3 : tensor<2x3x4xf64, #Tensor3> + + return + } +} diff --git a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py --- a/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py +++ b/mlir/test/Integration/Dialect/SparseTensor/python/test_stress.py @@ -189,11 +189,17 @@ vec = 0 vl = 1 e = False + # Disable direct sparse2sparse conversion, because it doubles the time! 
+ # TODO: While direct s2s is far too slow for per-commit testing, + # we should have some framework ensure that we run this test with + # `s2s=0` on a regular basis, to ensure that it does continue to work. + s2s = 1 sparsification_options = ( f'parallelization-strategy={par} ' f'vectorization-strategy={vec} ' f'vl={vl} ' - f'enable-simd-index32={e}') + f'enable-simd-index32={e} ' + f's2s-strategy={s2s}') compiler = sparse_compiler.SparseCompiler(options=sparsification_options) f64 = ir.F64Type.get() # Be careful about increasing this because