diff --git a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/Transforms/Passes.td
@@ -106,6 +106,7 @@
   let dependentDialects = [
     "arith::ArithmeticDialect",
     "LLVM::LLVMDialect",
+    "linalg::LinalgDialect",
     "memref::MemRefDialect",
     "scf::SCFDialect",
     "sparse_tensor::SparseTensorDialect",
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -275,6 +275,56 @@
       params);
 }
 
+/// Generates a call to `SparseTensorCOO<V>::Iterator::getNext()`.
+/// To avoid needing to handle multiple outputs and avoid defining
+/// a bunch of new MLIR types for `Element<V>`, we instead have both
+/// `ind` and `elemPtr` serve as out-parameters and return a bool
+/// to indicate whether those out-parameters are filled or whether we
+/// have no more elements to iterate.
+///
+/// \param [in] elemTp The type of the tensor's elements/values, called
+///             `V` below.
+/// \param [in] iter A value of MLIR-type `!llvm.ptr<i8>` which we
+///             static-cast to C++-type `SparseTensorCOO<V>::Iterator*`.
+/// \param [out] ind A value of `memref<?xindex>` type, where the dynamic
+///             size matches the iterator/sparse-tensor.
+/// \param [out] elemPtr A value of MLIR-type `memref<V>`.
+///
+/// \returns `i1` indicating whether `ind` and `elemPtr` were filled.
+//
+// TODO(wrengr): once we're sure this is actually working as
+// intended/desired, then we ought to be able to deduce the `elemTp`
+// directly from the `elemPtr`.
+static Value genGetNextCall(ConversionPatternRewriter &rewriter, Operation *op,
+                            Type elemTp, Value iter, Value ind, Value elemPtr) {
+  Location loc = op->getLoc();
+  StringRef name;
+  if (elemTp.isF64())
+    name = "getNextF64";
+  else if (elemTp.isF32())
+    name = "getNextF32";
+  else if (elemTp.isInteger(64))
+    name = "getNextI64";
+  else if (elemTp.isInteger(32))
+    name = "getNextI32";
+  else if (elemTp.isInteger(16))
+    name = "getNextI16";
+  else if (elemTp.isInteger(8))
+    name = "getNextI8";
+  else
+    llvm_unreachable("Unknown element type");
+  SmallVector<Value, 3> params;
+  params.push_back(iter);
+  params.push_back(ind);
+  params.push_back(elemPtr);
+  // TODO(wrengr): Need we do anything extra to massage between C++'s
+  // `bool` type and MLIR's `i1` type?
+  Type i1 = rewriter.getI1Type();
+  auto call = rewriter.create<CallOp>(
+      loc, i1, getFunc(op, name, i1, params, /*emitCInterface=*/true), params);
+  return call.getResult(0);
+}
+
 /// If the tensor is a sparse constant, generates and returns the pair of
 /// the constants for the indices and the values.
 static Optional<std::pair<Value, Value>>
@@ -313,6 +363,10 @@
 /// is the given `rank`. This array is intended to serve as a reusable
 /// buffer for storing the indices of a single tensor element, to avoid
 /// allocation in the body of loops.
+// TODO(wrengr): can we push the type-erasure of the rank down into
+// where we actually generate calls to the ExecutionEngine, instead
+// of doing them here? (so that the intervening generated code can
+// still be type-checked wrt the rank)
 static Value allocaIndices(ConversionPatternRewriter &rewriter, Location loc,
                            int64_t rank) {
   auto indexTp = rewriter.getIndexType();
@@ -321,6 +375,71 @@
   return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{arg});
 }
 
+/// Generates code to stack-allocate room for one value of the given
+/// type, and return the pointer to that memory. At present we leave
+/// the memory for that value uninitialized.
+static Value allocaValue(ConversionPatternRewriter &rewriter, Location loc,
+                         Type t) {
+  return rewriter.create<memref::AllocaOp>(loc, MemRefType::get({}, t));
+}
+
+/// Generates code to allocate a tensor of the given type, and zero
+/// initialize it. This function assumes the TensorType is fully
+/// specified (i.e., has static rank and sizes).
+static Value allocDenseTensor(ConversionPatternRewriter &rewriter, Location loc,
+                              RankedTensorType tensorTp) {
+  /* DO_NOT_SUBMIT(wrengr): {notes to self}
+  // This code works (a) to generate `%0 = constant dense<0> : $tensorTp`
+  // which MLIR calls a "splat tensor", and (b) not cause mlir-opt to
+  // crash like it used to with tensor::GenerateOp (which was misleadingly
+  // attributed to an unlegalized builtin.unrealized_conversion_cast).
+  //
+  // Unfortunately, splat-tensors may only live in read-only buffers
+  // eventually; thus we must instead revert back to using linalg::FillOp
+  // like we had originally. But it looks like we're allowed to have that
+  // work on the `tensor<>` type instead of just `memref<>`.
+  //
+  // cf.
+  // re overhead of using attributes.
+  // SmallVector does have emplace_back() after all; see line 1419 of that same file.
+  //
+  // There's also `SparseElementsAttr::getZeroValue()` but that
+  // requires us to have the `T` at C++ compile-time, rather than the
+  // `RankedTensorType` at C++ runtime / MLIR codegen time. We could
+  // paper over this staging issue via templates, sure, but I'm not sure
+  // the benefit would be worth the boilerplate to do so.
+  //
+  DenseIntOrFPElementsAttr zeroAttr;
+  Type elemTp = tensorTp.getElementType();
+  if (auto floatTp = elemTp.dyn_cast<FloatType>()) {
+    // Cribbed from the implementation of SparseElementsAttr::getZeroAPFloat()
+    APFloat zeroElem(floatTp.getFloatSemantics());
+    zeroAttr = DenseFPElementsAttr::get(tensorTp, zeroElem);
+  } else if (auto intTp = elemTp.dyn_cast<IntegerType>()) {
+    // Cribbed from the implementation of SparseElementsAttr::getZeroAPInt()
+    APInt zeroElem = APInt::getZero(intTp.getWidth());
+    zeroAttr = DenseIntElementsAttr::get(tensorTp, zeroElem);
+  } else
+    llvm_unreachable("The element type is not an IntOrFP type");
+  // DO_NOT_SUBMIT(wrengr): this assertion isn't dynamic, so we can/should
+  // probably remove it before landing the CL. I just have it here to make
+  // sure I didn't mess up the C++ code above.
+  assert(zeroAttr.isSplat() && "Failed to codegen a 'splat' tensor");
+  return rewriter.create<arith::ConstantOp>(loc, tensorTp, zeroAttr);
+  */
+  Type elemTp = tensorTp.getElementType();
+  // TODO(wrengr): support dynamic sizes too.
+  // BUG(wrengr): Why won't this legalize?
+  Value initTensor =
+      rewriter.create<linalg::InitTensorOp>(loc, tensorTp.getShape(), elemTp)
+          .getResult();
+  Value zero = constantZero(rewriter, loc, elemTp);
+  // Re legalization, beware that linalg::FillOp implicitly uses linalg::YieldOp
+  return rewriter.create<linalg::FillOp>(loc, zero, initTensor).getResult(0);
+}
+
 //===----------------------------------------------------------------------===//
 // Conversion rules.
 //===----------------------------------------------------------------------===//
@@ -406,8 +525,71 @@
       rewriter.replaceOp(op, genNewCall(rewriter, op, encDst, 1, perm, coo));
       return success();
     }
-    if (!encDst || encSrc) {
-      // TODO: sparse => dense
+    if (!encDst && encSrc) {
+      // This is sparse => dense conversion, which is handled as follows:
+      //   dst = new Tensor(0);
+      //   iter = src->toCOO()->getIterator();
+      //   while (elem = iter->getNext()) {
+      //     dst[elem.indices] = elem.value;
+      //   }
+      // While it would be more efficient to inline the iterator logic
+      // directly rather than allocating an object and calling methods,
+      // this is good enough for now.
+      Location loc = op->getLoc();
+      ShapedType shapeTp = resType.cast<ShapedType>();
+      unsigned rank = shapeTp.getRank();
+      Type elemTp = shapeTp.getElementType();
+      Value dst =
+          allocDenseTensor(rewriter, loc, resType.cast<RankedTensorType>());
+      Value ind = allocaIndices(rewriter, loc, rank);
+      Value elemPtr = allocaValue(rewriter, loc, elemTp);
+      Value perm;
+      Value iter = genNewCall(rewriter, op, encSrc, 4, perm, src);
+      // Generate the while-loop Op itself.
+      // (We cannot call `argTypes` a "TypeRange" here, lest mlir-opt segfault.)
+      SmallVector<Type, 1> argTypes{resType};
+      auto whileOp =
+          rewriter.create<scf::WhileOp>(loc, argTypes, ValueRange{dst});
+      Block *before = rewriter.createBlock(&whileOp.before(), {}, argTypes);
+      Block *after = rewriter.createBlock(&whileOp.after(), {}, argTypes);
+      // Build the while-loop's "before" region.
+      {
+        rewriter.setInsertionPointToEnd(before);
+        Value cond = genGetNextCall(rewriter, op, elemTp, iter, ind, elemPtr);
+        rewriter.create<scf::ConditionOp>(loc, cond, before->getArguments());
+      }
+      // Build the while-loop's "after" region.
+      // BUG(wrengr): this part is still causing mlir-opt to segfault,
+      // just in a new/different way than before.
+      {
+        rewriter.setInsertionPointToStart(after);
+        // Can't pass `ind` directly to tensor::InsertOp::build();
+        // instead must explicitly convert it into a ValueRange `ivs`.
+        // (Though, of course, we cannot call it a "ValueRange" here,
+        // lest mlir-opt segfault.)
+        SmallVector<Value> ivs;
+        ivs.reserve(rank);
+        for (unsigned i = 0; i < rank; i++) {
+          Value idx = constantIndex(rewriter, loc, i);
+          ivs.push_back(rewriter.create<memref::LoadOp>(loc, ind, idx));
+        }
+        Value elemV = rewriter.create<memref::LoadOp>(loc, elemPtr);
+        Value dstIn = after->getArgument(0);
+        Value dstOut =
+            rewriter.create<tensor::InsertOp>(loc, elemV, dstIn, ivs);
+        // BUG(wrengr): evidently, the TypeRange argument to
+        // scf::YieldOp::build() is only allowed to be passed when it's
+        // empty: because they assert(resultTypes.size() == 0u) for
+        // some bizarre reason.
+        rewriter.create<scf::YieldOp>(loc, ValueRange{dstOut});
+      }
+      // Finish up.
+      rewriter.setInsertionPointAfter(whileOp);
+      rewriter.replaceOp(op, whileOp.getResult(0));
+      return success();
+    }
+    if (!encDst && !encSrc) {
+      // dense => dense
       return failure();
     }
     // This is a dense => sparse conversion or a sparse constant in COO =>
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorPasses.cpp
@@ -114,11 +114,15 @@
     });
     // The following operations and dialects may be introduced by the
     // rewriting rules, and are therefore marked as legal.
+    // BUG(wrengr): why aren't linalg::InitTensorOp and tensor::InsertOp getting
+    // legalized?!
-    target.addLegalOp<arith::CmpFOp, arith::CmpIOp, tensor::CastOp,
-                      tensor::ExtractOp>();
+    target.addLegalOp<arith::CmpFOp, arith::CmpIOp, tensor::CastOp,
+                      tensor::ExtractOp, tensor::InsertOp, linalg::InitTensorOp,
+                      linalg::FillOp, linalg::YieldOp>();
-    target.addLegalDialect<LLVM::LLVMDialect, memref::MemRefDialect,
-                           scf::SCFDialect>();
+    target.addLegalDialect<LLVM::LLVMDialect, scf::SCFDialect,
+                           linalg::LinalgDialect, memref::MemRefDialect,
+                           tensor::TensorDialect>();
     // Populate with rules and apply rewriting rules.
     populateFuncOpTypeConversionPattern(patterns, converter);
     populateCallOpTypeConversionPattern(patterns, converter);
diff --git a/mlir/lib/ExecutionEngine/SparseUtils.cpp b/mlir/lib/ExecutionEngine/SparseUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseUtils.cpp
@@ -107,6 +107,12 @@
   /// Getter for elements array.
   const std::vector<Element<V>> &getElements() const { return elements; }
 
+  // Forward declaration of the class required by getIterator. We make
+  // it a nested class so that it can access the private fields.
+  class Iterator;
+  /// Returns an iterator over the elements of a SparseTensorCOO.
+  Iterator *getIterator() const { return new Iterator(*this); }
+
   /// Factory method. Permutes the original dimensions according to
   /// the given ordering and expects subsequent add() calls to honor
   /// that same ordering for the given indices. The result is a
@@ -132,10 +138,40 @@
     }
     return false;
   }
-  std::vector<uint64_t> sizes; // per-rank dimension sizes
+  const std::vector<uint64_t> sizes; // per-rank dimension sizes
   std::vector<Element<V>> elements;
 };
 
+/// An iterator over the elements of a sparse tensor in coordinate-scheme
+/// format. This iterator is not designed for use by C++ code itself,
+/// but rather by generated MLIR code (i.e., by calls to `IMPL_COO_GETNEXT`);
+/// hence it may look idiosyncratic or unconventional when compared to
+/// ordinary C++ iterators.
+template <typename V>
+class SparseTensorCOO<V>::Iterator {
+  // TODO(wrengr): really this class should be a thin wrapper/subclass
+  // of the std::vector, rather than needing to do a dereference every
+  // time a method is called; but we don't want to actually copy the whole
+  // contents of the underlying array(s) when this class is initialized.
+  // Maybe we should be a thin wrapper/subclass of SparseTensorCOO?
+  // Or have a variant of SparseTensorStorage::toCOO() to construct this
+  // iterator directly?
+  const std::vector<Element<V>> &elements;
+  unsigned pos;
+
+public:
+  // TODO(wrengr): to guarantee safety we'd either need to consume the
+  // SparseTensorCOO (e.g., requiring an rvalue-reference) or get notified
+  // somehow whenever the SparseTensorCOO adds new elements, sorts, etc.
+  Iterator(const SparseTensorCOO<V> &coo) : elements(coo.elements), pos(0) {}
+
+  const Element<V> *getNext() {
+    if (pos < elements.size())
+      return &(elements[pos++]);
+    return nullptr;
+  }
+};
+
 /// Abstract base class of sparse tensor storage. Note that we use
 /// function overloading to implement "partial" method specialization.
 class SparseTensorStorageBase {
@@ -518,7 +554,13 @@
   kI8 = 6
 };
 
-enum Action : uint32_t { kFromFile = 0, kFromCOO = 1, kNewCOO = 2, kToCOO = 3 };
+enum Action : uint32_t {
+  kFromFile = 0,
+  kFromCOO = 1,
+  kNewCOO = 2,
+  kToCOO = 3,
+  kToIter = 4
+};
 
 #define CASE(p, i, v, P, I, V)                                                 \
   if (ptrTp == (p) && indTp == (i) && valTp == (v)) {                          \
@@ -530,10 +572,15 @@
       tensor = static_cast<SparseTensorCOO<V> *>(ptr);                         \
     else if (action == kNewCOO)                                                \
       return SparseTensorCOO<V>::newSparseTensorCOO(size, sizes, perm);        \
-    else if (action == kToCOO)                                                 \
-      return static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);    \
-    else                                                                       \
-      assert(0);                                                               \
+    else {                                                                     \
+      tensor = static_cast<SparseTensorStorage<P, I, V> *>(ptr)->toCOO(perm);  \
+      if (action == kToCOO)                                                    \
+        return tensor;                                                         \
+      else if (action == kToIter)                                              \
+        return tensor->getIterator();                                          \
+      else                                                                     \
+        assert(0);                                                             \
+    }                                                                          \
     return SparseTensorStorage<P, I, V>::newSparseTensor(tensor, sparsity,     \
                                                          perm);                \
   }
@@ -582,6 +629,29 @@
     return tensor;                                                             \
   }
 
+/// Calls `SparseTensorCOO<V>::Iterator::getNext()` with the following semantics.
+/// To avoid needing to handle multiple outputs and avoid defining
+/// a bunch of new MLIR types for `Element<V>`, we instead have both
+/// `iref` and `vref` serve as out-parameters and return a bool to
+/// indicate whether those out-parameters are filled or whether we have
+/// no more elements to iterate.
+#define IMPL_COO_GETNEXT(NAME, V)                                              \
+  bool _mlir_ciface_##NAME(void *ptr, StridedMemRefType<uint64_t, 1> *iref,    \
+                           StridedMemRefType<V, 0> *vref) {                    \
+    assert(iref->strides[0] == 1);                                             \
+    uint64_t *indx = iref->data + iref->offset;                                \
+    V *value = vref->data + vref->offset;                                      \
+    const uint64_t isize = iref->sizes[0];                                     \
+    auto iter = static_cast<SparseTensorCOO<V>::Iterator *>(ptr);              \
+    const Element<V> *elem = iter->getNext();                                  \
+    if (elem == nullptr)                                                       \
+      return false;                                                            \
+    for (uint64_t r = 0; r < isize; r++)                                       \
+      indx[r] = elem->indices[r];                                              \
+    *value = elem->value;                                                      \
+    return true;                                                               \
+  }
+
 /// Constructs a new sparse tensor. This is the "swiss army knife"
 /// method for materializing sparse tensors into the computation.
 ///
@@ -590,6 +660,8 @@
 ///   kFromCOO = ptr contains coordinate scheme to assign to new storage
 ///   kNewCOO = returns empty coordinate scheme to fill and use with kFromCOO
 ///   kToCOO = returns coordinate scheme from storage in ptr to use with kFromCOO
+///   kToIter = returns iterator from storage in ptr (call IMPL_COO_GETNEXT to
+///             use)
 void *
 _mlir_ciface_newSparseTensor(StridedMemRefType<DimLevelType, 1> *aref, // NOLINT
                              StridedMemRefType<index_t, 1> *sref,
@@ -687,10 +759,19 @@
 IMPL3(addEltI16, int16_t)
 IMPL3(addEltI8, int8_t)
 
+/// Helper to enumerate elements of coordinate scheme, one per value type.
+IMPL_COO_GETNEXT(getNextF64, double)
+IMPL_COO_GETNEXT(getNextF32, float)
+IMPL_COO_GETNEXT(getNextI64, int64_t)
+IMPL_COO_GETNEXT(getNextI32, int32_t)
+IMPL_COO_GETNEXT(getNextI16, int16_t)
+IMPL_COO_GETNEXT(getNextI8, int8_t)
+
 #undef CASE
 #undef IMPL1
 #undef IMPL2
 #undef IMPL3
+#undef IMPL_COO_GETNEXT
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_conversion.mlir
@@ -1,11 +1,14 @@
 // RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
 // RUN:   --sparsification --sparse-tensor-conversion \
+// RUN:   --linalg-bufferize --convert-linalg-to-loops \
 // RUN:   --convert-vector-to-scf --convert-scf-to-std \
 // RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
-// RUN:   --std-bufferize --finalizing-bufferize  \
-// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN:   --std-bufferize --finalizing-bufferize \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
 // RUN: mlir-cpu-runner \
-// RUN:  -e entry -entry-point-result=void  \
+// RUN:   -e entry -entry-point-result=void \
 // RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
 // RUN: FileCheck %s
 
@@ -75,6 +78,27 @@
     }
     return
   }
+  func @checkTensor(%arg0: tensor<2x3x4xf64>, %arg1: tensor<2x3x4xf64>) {
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c2 = arith.constant 2 : index
+    %c3 = arith.constant 3 : index
+    %c4 = arith.constant 4 : index
+    // Same content?
+    scf.for %i = %c0 to %c2 step %c1 {
+      scf.for %j = %c0 to %c3 step %c1 {
+        scf.for %k = %c0 to %c4 step %c1 {
+          %a = tensor.extract %arg0[%i, %j, %k] : tensor<2x3x4xf64>
+          %b = tensor.extract %arg1[%i, %j, %k] : tensor<2x3x4xf64>
+          %c = arith.cmpf une, %a, %b : f64
+          scf.if %c {
+            call @exit(%c1) : (index) -> ()
+          }
+        }
+      }
+    }
+    return
+  }
 
   //
   // Output utility.
  //
@@ -132,6 +156,13 @@
     %h = sparse_tensor.convert %2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64, #Tensor3>
     %i = sparse_tensor.convert %3 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64, #Tensor3>
 
+    //
+    // Convert sparse tensor back to dense.
+    //
+    %x = sparse_tensor.convert %1 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64>
+    %y = sparse_tensor.convert %2 : tensor<2x3x4xf64, #Tensor2> to tensor<2x3x4xf64>
+    %z = sparse_tensor.convert %3 : tensor<2x3x4xf64, #Tensor3> to tensor<2x3x4xf64>
+
     //
     // Check values equality.
     //
@@ -160,6 +191,10 @@
     call @checkf64(%v3, %hv) : (memref<?xf64>, memref<?xf64>) -> ()
     call @checkf64(%v3, %iv) : (memref<?xf64>, memref<?xf64>) -> ()
 
+    call @checkTensor(%t, %x) : (tensor<2x3x4xf64>, tensor<2x3x4xf64>) -> ()
+    call @checkTensor(%t, %y) : (tensor<2x3x4xf64>, tensor<2x3x4xf64>) -> ()
+    call @checkTensor(%t, %z) : (tensor<2x3x4xf64>, tensor<2x3x4xf64>) -> ()
+
     //
     // Check index equality.
     //
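Note (editorial sketch, not part of the patch): to make the new rewrite easier to review, the IR below shows roughly what the sparse => dense path is intended to emit for the test's `%x = sparse_tensor.convert %1 : tensor<2x3x4xf64, #Tensor1> to tensor<2x3x4xf64>`, before bufferization. All SSA names, the wrapper function, and the declaration of the runtime entry point are illustrative assumptions; the elided `newSparseTensor` call (Action kToIter = 4) is not shown, and the authoritative logic is the C++ in SparseTensorConversion.cpp above.

    // Hypothetical runtime declaration (generated with emitCInterface); the
    // iterator pointer would come from a newSparseTensor call with kToIter.
    func private @getNextF64(!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1

    func @sparse_to_dense_sketch(%iter: !llvm.ptr<i8>) -> tensor<2x3x4xf64> {
      %c0 = arith.constant 0 : index
      %c1 = arith.constant 1 : index
      %c2 = arith.constant 2 : index
      %c3 = arith.constant 3 : index
      %zero = arith.constant 0.0 : f64
      // allocDenseTensor: zero-initialized destination; this is why
      // linalg::InitTensorOp/linalg::FillOp are now marked legal and why the
      // RUN line gained the extra linalg passes.
      %init = linalg.init_tensor [2, 3, 4] : tensor<2x3x4xf64>
      %dst = linalg.fill(%zero, %init) : f64, tensor<2x3x4xf64> -> tensor<2x3x4xf64>
      // allocaIndices/allocaValue: reusable buffers filled by each getNext call.
      %ind = memref.alloca(%c3) : memref<?xindex>
      %val = memref.alloca() : memref<f64>
      // Iterate until getNextF64 returns false, inserting one element per step.
      %res = scf.while (%arg = %dst) : (tensor<2x3x4xf64>) -> tensor<2x3x4xf64> {
        %ok = call @getNextF64(%iter, %ind, %val)
            : (!llvm.ptr<i8>, memref<?xindex>, memref<f64>) -> i1
        scf.condition(%ok) %arg : tensor<2x3x4xf64>
      } do {
      ^bb0(%arg: tensor<2x3x4xf64>):
        %i = memref.load %ind[%c0] : memref<?xindex>
        %j = memref.load %ind[%c1] : memref<?xindex>
        %k = memref.load %ind[%c2] : memref<?xindex>
        %v = memref.load %val[] : memref<f64>
        %t = tensor.insert %v into %arg[%i, %j, %k] : tensor<2x3x4xf64>
        scf.yield %t : tensor<2x3x4xf64>
      }
      return %res : tensor<2x3x4xf64>
    }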