diff --git a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
--- a/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
+++ b/mlir/include/mlir/Dialect/SparseTensor/IR/SparseTensorOps.td
@@ -219,6 +219,81 @@
                        " type($tensor) `,` type($indices) `,` type($value)";
 }
 
+def SparseTensor_ExpandOp : SparseTensor_Op<"expand", []>,
+    Arguments<(ins AnyTensor:$tensor)>,
+    Results<(outs AnyStridedMemRefOfRank<1>:$values,
+                  StridedMemRefRankOf<[I1],[1]>:$filled,
+                  StridedMemRefRankOf<[Index],[1]>:$added,
+                  Index:$count)> {
+  string summary = "Expands an access pattern for insertion";
+  string description = [{
+    Performs an access pattern expansion for the innermost dimensions of the
+    given tensor. This operation is useful to implement kernels in which a
+    sparse tensor appears as output. This technique is known under several
+    different names and has several alternative implementations,
+    for example, phase counter [Gustavson71], expanded or switch array
+    [Pissanetzky84], in phase scan [Duff90], access pattern expansion [Bik96],
+    and workspaces [Kjolstad2018].
+
+    The values and filled arrays have a size that suffices for a *dense* innermost
+    dimension (e.g. a full row for matrices). The added array and count are used
+    to store new indices when a false value is encountered in the filled array.
+    All arrays should be allocated before the loop (possibly even shared between
+    loops in a future optimization) so that their *dense* initialization can be
+    amortized over many iterations. Setting and resetting the dense arrays in
+    the loop nest itself is kept *sparse* by only iterating over set elements
+    through an indirection using the added array, so that the operations are
+    kept proportional to the number of nonzeros.
+
+    Note that this operation is "impure" in the sense that its behavior
+    is solely defined by side-effects and not SSA values. The semantics
+    may be refined over time as our sparse abstractions evolve.
+
+    Example:
+
+    ```mlir
+    %values, %filled, %added, %count = sparse_tensor.expand %0
+      : tensor<4x4xf64, #CSR> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+    ```
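+
+    A sketch of the typical pairing with `sparse_tensor.compress` is shown
+    below; the cursor %lexIdx and the final count %n are placeholders
+    produced by the surrounding kernel, not by this operation:
+
+    ```mlir
+    %values, %filled, %added, %count = sparse_tensor.expand %0
+      : tensor<4x4xf64, #CSR> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+    // ... insertions update %values, %filled, %added and the running count %n ...
+    sparse_tensor.compress %0, %lexIdx, %values, %filled, %added, %n
+      : tensor<4x4xf64, #CSR>, memref<?xindex>, memref<?xf64>,
+        memref<?xi1>, memref<?xindex>, index
+    ```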
+  }];
+  let assemblyFormat = "$tensor attr-dict `:` type($tensor) `to` type($values)"
+                       " `,` type($filled) `,` type($added) `,` type($count)";
+}
+
+def SparseTensor_CompressOp : SparseTensor_Op<"compress", []>,
+    Arguments<(ins AnyTensor:$tensor,
+                   StridedMemRefRankOf<[Index],[1]>:$indices,
+                   AnyStridedMemRefOfRank<1>:$values,
+                   StridedMemRefRankOf<[I1],[1]>:$filled,
+                   StridedMemRefRankOf<[Index],[1]>:$added,
+                   Index:$count)> {
+  string summary = "Compressed an access pattern for insertion";
+  string description = [{
+    Finishes a single access pattern expansion by moving the inserted
+    elements into the sparse storage scheme. The indices array supplies the
+    lexicographic insertion cursor; its innermost entry is overwritten with
+    each index taken from the added array. The values and filled arrays are
+    reset in a *sparse* fashion by only iterating over set elements through
+    an indirection using the added array, so that the operations are kept
+    proportional to the number of nonzeros. See the 'expand' operation
+    for more details.
+
+    Note that this operation is "impure" in the sense that its behavior
+    is solely defined by side-effects and not SSA values. The semantics
+    may be refined over time as our sparse abstractions evolve.
+
+    Example:
+
+    ```mlir
+    sparse_tensor.compress %0, %1, %values, %filled, %added, %2
+        : tensor<4x4xf64, #CSR>, memref<?xindex>, memref<?xf64>,
+          memref<?xi1>, memref<?xindex>, index
+    ```
+  }];
+  let assemblyFormat = "$tensor `,` $indices `,` $values `,` $filled `,`"
+                       " $added `,` $count attr-dict `:` type($tensor) `,`"
+                       " type($indices) `,` type($values) `,` type($filled) `,`"
+                       " type($added) `,` type($count)";
+}
+
 def SparseTensor_LoadOp : SparseTensor_Op<"load", [SameOperandsAndResultType]>,
     Arguments<(ins AnyTensor:$tensor, UnitAttr:$hasInserts)>,
     Results<(outs AnyTensor:$result)> {
diff --git a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
--- a/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
+++ b/mlir/lib/Dialect/SparseTensor/IR/SparseTensorDialect.cpp
@@ -305,6 +305,18 @@
   return success();
 }
 
+static LogicalResult verify(ExpandOp op) {
+  if (!getSparseTensorEncoding(op.tensor().getType()))
+    return op.emitError("expected a sparse tensor for expansion");
+  return success();
+}
+
+static LogicalResult verify(CompressOp op) {
+  if (!getSparseTensorEncoding(op.tensor().getType()))
+    return op.emitError("expected a sparse tensor for compression");
+  return success();
+}
+
 static LogicalResult verify(LoadOp op) {
   if (!getSparseTensorEncoding(op.tensor().getType()))
     return op.emitError("expected a sparse tensor to materialize");
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseTensorConversion.cpp
@@ -253,10 +253,17 @@
 /// type, but returns it as type `memref<? x $tp>` (rather than as type
 /// `memref<$sz x $tp>`).
 static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
-                       unsigned sz, Type tp) {
+                       Value sz, Type tp) {
   auto memTp = MemRefType::get({ShapedType::kDynamicSize}, tp);
-  Value a = constantIndex(rewriter, loc, sz);
-  return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{a});
+  return rewriter.create<memref::AllocaOp>(loc, memTp, ValueRange{sz});
+}
+
+/// Generates an uninitialized temporary buffer of the given size and
+/// type, but returns it as type `memref<? x $tp>` (rather than as type
+/// `memref<$sz x $tp>`).
+static Value genAlloca(ConversionPatternRewriter &rewriter, Location loc,
+                       unsigned sz, Type tp) {
+  return genAlloca(rewriter, loc, constantIndex(rewriter, loc, sz), tp);
 }
 
 /// Generates an uninitialized temporary buffer with room for one value
@@ -906,6 +913,78 @@
   }
 };
 
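+/// Sparse conversion rule for the expansion operator. As a rough sketch
+/// (names illustrative, mirroring the conversion test), the rewriting emits
+/// the following right after the output tensor is defined, i.e. on entry
+/// of the loop nest, for an f64 tensor:
+///
+///   %sz     = call @sparseDimSize(...)
+///   %values = memref.alloca(%sz) : memref<?xf64>
+///   %filled = memref.alloca(%sz) : memref<?xi1>
+///   %added  = memref.alloca(%sz) : memref<?xindex>
+///   linalg.fill(%f0, %values) : f64, memref<?xf64>
+///   linalg.fill(%false, %filled) : i1, memref<?xi1>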
+class SparseTensorExpandConverter : public OpConversionPattern<ExpandOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(ExpandOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    Location loc = op->getLoc();
+    ShapedType srcType = op.tensor().getType().cast<ShapedType>();
+    Type eltType = srcType.getElementType();
+    Type boolType = rewriter.getIntegerType(1);
+    Type idxType = rewriter.getIndexType();
+    // All initialization should be done on entry of the loop nest.
+    rewriter.setInsertionPointAfter(op.tensor().getDefiningOp());
+    // Determine the size for access expansion.
+    auto enc = getSparseTensorEncoding(srcType);
+    Value src = adaptor.getOperands()[0];
+    Value sz = genDimSizeCall(rewriter, op, enc, src, srcType.getRank() - 1);
+    // Allocate temporary stack buffers for values, filled-switch, and indices.
+    Value values = genAlloca(rewriter, loc, sz, eltType);
+    Value filled = genAlloca(rewriter, loc, sz, boolType);
+    Value indices = genAlloca(rewriter, loc, sz, idxType);
+    Value zero = constantZero(rewriter, loc, idxType);
+    // Reset the values/filled-switch to all-zero/false. Note that this
+    // introduces an O(N) operation into the computation, but this reset
+    // operation is amortized over the innermost loops for the access
+    // pattern expansion.
+    rewriter.create<linalg::FillOp>(loc, constantZero(rewriter, loc, eltType),
+                                    values);
+    rewriter.create<linalg::FillOp>(loc, constantZero(rewriter, loc, boolType),
+                                    filled);
+    // Replace expansion op with these buffers and initial index.
+    assert(op.getNumResults() == 4);
+    rewriter.replaceOp(op, {values, filled, indices, zero});
+    return success();
+  }
+};
+
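+/// Sparse conversion rule for the compression operator. The operation is
+/// lowered to a call into the runtime support library, e.g. (illustrative,
+/// matching the conversion test for an f64 element type):
+///
+///   call @expInsertF64(%tensor, %indices, %values, %filled, %added, %count)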
+class SparseTensorCompressConverter : public OpConversionPattern<CompressOp> {
+public:
+  using OpConversionPattern::OpConversionPattern;
+  LogicalResult
+  matchAndRewrite(CompressOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override {
+    // Note that this method call resets the values/filled-switch back to
+    // all-zero/false by only iterating over the set elements, so the
+    // complexity remains proportional to the sparsity of the expanded
+    // access pattern.
+    Type srcType = op.tensor().getType();
+    Type eltType = srcType.cast<ShapedType>().getElementType();
+    StringRef name;
+    if (eltType.isF64())
+      name = "expInsertF64";
+    else if (eltType.isF32())
+      name = "expInsertF32";
+    else if (eltType.isInteger(64))
+      name = "expInsertI64";
+    else if (eltType.isInteger(32))
+      name = "expInsertI32";
+    else if (eltType.isInteger(16))
+      name = "expInsertI16";
+    else if (eltType.isInteger(8))
+      name = "expInsertI8";
+    else
+      return failure();
+    TypeRange noTp;
+    auto fn =
+        getFunc(op, name, noTp, adaptor.getOperands(), /*emitCInterface=*/true);
+    rewriter.replaceOpWithNewOp<CallOp>(op, noTp, fn, adaptor.getOperands());
+    return success();
+  }
+};
+
 } // namespace
 
 //===----------------------------------------------------------------------===//
@@ -921,6 +1000,7 @@
                SparseTensorInitConverter, SparseTensorConvertConverter,
                SparseTensorReleaseConverter, SparseTensorToPointersConverter,
                SparseTensorToIndicesConverter, SparseTensorToValuesConverter,
-               SparseTensorLoadConverter, SparseTensorLexInsertConverter>(
+               SparseTensorLoadConverter, SparseTensorLexInsertConverter,
+               SparseTensorExpandConverter, SparseTensorCompressConverter>(
       typeConverter, patterns.getContext());
 }
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/Sparsification.cpp
@@ -54,7 +54,8 @@
         pidxs(numTensors, std::vector<Value>(numLoops)),
         idxs(numTensors, std::vector<Value>(numLoops)), redExp(-1u), redVal(),
         redKind(kNoReduc), sparseOut(op), outerParNest(nest), lexIdx(),
-        curVecLength(1), curVecMask() {}
+        expValues(), expFilled(), expAdded(), expCount(), curVecLength(1),
+        curVecMask() {}
   /// Sparsification options.
   SparsificationOptions options;
   /// Universal dense indices and upper bounds (by index). The loops array
@@ -81,10 +82,15 @@
   Reduction redKind;
   // Sparse tensor as output. Implemented either through direct injective
   // insertion in lexicographic index order (where indices are updated
-  // in the temporary array `lexIdx`) or TODO: access pattern expansion
+  // in the temporary array `lexIdx`) or through access pattern expansion
+  // in the innermost loop nest (`expValues` through `expCount`).
   OpOperand *sparseOut;
   unsigned outerParNest;
   Value lexIdx;
+  Value expValues;
+  Value expFilled;
+  Value expAdded;
+  Value expCount;
   // Current vector length and mask.
   unsigned curVecLength;
   Value curVecMask;
@@ -334,8 +340,8 @@
     }
     // Determine admissable dynamic insertion situations:
     // (1) fully injective, since there are no reductions,
-    // (2) admissable 1-d expansion in innermost dimension. TODO: accept
-    if (nest == op.getRank(lhs)) {
+    // (2) admissable 1-d expansion in innermost dimension.
+    if (nest >= op.getRank(lhs) - 1) {
       *sparseOut = lhs;
       outerParNest = nest;
       return true;
@@ -680,6 +686,16 @@
   }
 }
 
+/// Generates index for load/store on a sparse tensor.
+static Value genIndex(CodeGen &codegen, linalg::GenericOp op, OpOperand *t) {
+  auto map = op.getTiedIndexingMap(t);
+  auto enc = getSparseTensorEncoding(t->get().getType());
+  AffineExpr a = map.getResult(perm(enc, map.getNumResults() - 1));
+  assert(a.getKind() == AffineExprKind::DimId);
+  unsigned idx = a.cast<AffineDimExpr>().getPosition();
+  return codegen.loops[idx];
+}
+
 /// Generates subscript for load/store on a dense or sparse tensor.
 static Value genSubscript(CodeGen &codegen, PatternRewriter &rewriter,
                           linalg::GenericOp op, OpOperand *t,
@@ -705,6 +721,62 @@
   return codegen.buffers[tensor];
 }
 
+/// Generates insertion code to implement dynamic tensor load.
+static Value genInsertionLoad(CodeGen &codegen, PatternRewriter &rewriter,
+                              linalg::GenericOp op, OpOperand *t) {
+  Location loc = op.getLoc();
+  // Direct lexicographic index order, tensor loads as zero.
+  if (!codegen.expValues) {
+    Type tp = getElementTypeOrSelf(t->get().getType());
+    return rewriter.create<arith::ConstantOp>(loc, tp,
+                                              rewriter.getZeroAttr(tp));
+  }
+  // Load from expanded access pattern.
+  Value index = genIndex(codegen, op, t);
+  return rewriter.create<memref::LoadOp>(loc, codegen.expValues, index);
+}
+
+/// Generates insertion code to implement dynamic tensor store.
+static void genInsertionStore(CodeGen &codegen, PatternRewriter &rewriter,
+                              linalg::GenericOp op, OpOperand *t, Value rhs) {
+  Location loc = op.getLoc();
+  // Direct insertion in lexicographic index order.
+  if (!codegen.expValues) {
+    rewriter.create<LexInsertOp>(loc, t->get(), codegen.lexIdx, rhs);
+    return;
+  }
+  // Generates insertion code along expanded access pattern.
+  //   if (!expFilled[i]) then
+  //     expFilled[i] = true
+  //     expAdded[inserts++] = i
+  //   endif
+  //   values[i] = rhs
+  Value index = genIndex(codegen, op, t);
+  Value fval = rewriter.create<arith::ConstantIntOp>(loc, 0, 1); // false
+  Value tval = rewriter.create<arith::ConstantIntOp>(loc, 1, 1); // true
+  // If statement.
+  Value filled = rewriter.create<memref::LoadOp>(loc, codegen.expFilled, index);
+  Value cond = rewriter.create<arith::CmpIOp>(loc, arith::CmpIPredicate::eq,
+                                              filled, fval);
+  scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, rewriter.getIndexType(),
+                                              cond, /*else=*/true);
+  // True branch.
+  rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
+  rewriter.create<memref::StoreOp>(loc, tval, codegen.expFilled, index);
+  rewriter.create<memref::StoreOp>(loc, index, codegen.expAdded,
+                                   codegen.expCount);
+  Value one = rewriter.create<arith::ConstantIndexOp>(loc, 1);
+  Value add = rewriter.create<arith::AddIOp>(loc, codegen.expCount, one);
+  rewriter.create<scf::YieldOp>(loc, add);
+  // False branch.
+  rewriter.setInsertionPointToStart(&ifOp.elseRegion().front());
+  rewriter.create<scf::YieldOp>(loc, codegen.expCount);
+  rewriter.setInsertionPointAfter(ifOp);
+  // Value assignment.
+  codegen.expCount = ifOp.getResult(0);
+  rewriter.create<memref::StoreOp>(loc, rhs, codegen.expValues, index);
+}
+
 /// Generates a load on a dense or sparse tensor.
 static Value genTensorLoad(Merger &merger, CodeGen &codegen,
                            PatternRewriter &rewriter, linalg::GenericOp op,
@@ -716,13 +788,10 @@
       return genVectorInvariantValue(codegen, rewriter, val);
     return val;
   }
-  // Insertion (a sparse tensor output "loads" as zero).
+  // Load during insertion.
   OpOperand *t = op.getInputAndOutputOperands()[merger.exp(exp).tensor];
-  if (t == codegen.sparseOut) {
-    Type tp = getElementTypeOrSelf(t->get().getType());
-    return rewriter.create<arith::ConstantOp>(op.getLoc(), tp,
-                                              rewriter.getZeroAttr(tp));
-  }
+  if (t == codegen.sparseOut)
+    return genInsertionLoad(codegen, rewriter, op, t);
   // Actual load.
   SmallVector<Value, 4> args;
   Value ptr = genSubscript(codegen, rewriter, op, t, args);
@@ -744,10 +813,10 @@
     updateReduc(merger, codegen, rhs);
     return;
   }
-  // Insertion.
+  // Store during insertion.
   OpOperand *t = op.getOutputOperand(0);
   if (t == codegen.sparseOut) {
-    rewriter.create<LexInsertOp>(loc, t->get(), codegen.lexIdx, rhs);
+    genInsertionStore(codegen, rewriter, op, t, rhs);
     return;
   }
   // Actual store.
@@ -916,6 +985,42 @@
   }
 }
 
+/// Generates an expanded access pattern in innermost dimension.
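+/// For a typical 2-d kernel with a single outer parallel loop, the emitted
+/// structure is roughly as follows (an illustrative sketch, not literal IR):
+///
+///   scf.for %i = ... {                        // outer loop(s)
+///     %v, %f, %a, %cnt = sparse_tensor.expand %dst ...
+///     ... inner loops insert through %v/%f/%a, threading the running count ...
+///     sparse_tensor.compress %dst, %lexIdx, %v, %f, %a, %newCnt ...
+///   }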
+static void genExpansion(Merger &merger, CodeGen &codegen,
+                         PatternRewriter &rewriter, linalg::GenericOp op,
+                         unsigned at, bool atStart) {
+  OpOperand *lhs = codegen.sparseOut;
+  if (!lhs || codegen.outerParNest != op.getRank(lhs) - 1 ||
+      at != codegen.outerParNest)
+    return; // not needed at this level
+  // Generate start or end of an expanded access pattern.
+  Value tensor = lhs->get();
+  Location loc = op.getLoc();
+  if (atStart) {
+    auto dynShape = {ShapedType::kDynamicSize};
+    Type etp = tensor.getType().cast<ShapedType>().getElementType();
+    Type t1 = MemRefType::get(dynShape, etp);
+    Type t2 = MemRefType::get(dynShape, genIntType(rewriter, 1));
+    Type t3 = MemRefType::get(dynShape, genIntType(rewriter, 0));
+    Type t4 = rewriter.getIndexType();
+    auto res =
+        rewriter.create<ExpandOp>(loc, TypeRange({t1, t2, t3, t4}), tensor);
+    assert(res.getNumResults() == 4);
+    assert(!codegen.expValues);
+    codegen.expValues = res.getResult(0);
+    codegen.expFilled = res.getResult(1);
+    codegen.expAdded = res.getResult(2);
+    codegen.expCount = res.getResult(3);
+  } else {
+    assert(codegen.expValues);
+    rewriter.create<CompressOp>(loc, tensor, codegen.lexIdx, codegen.expValues,
+                                codegen.expFilled, codegen.expAdded,
+                                codegen.expCount);
+    codegen.expValues = codegen.expFilled = codegen.expAdded =
+        codegen.expCount = Value();
+  }
+}
+
 /// Generates initialization code for the subsequent loop sequence at
 /// current index level. Returns true if the loop sequence needs to
 /// maintain the universal index.
@@ -1069,9 +1174,13 @@
     }
     operands.push_back(codegen.redVal);
   }
+  if (codegen.expValues)
+    operands.push_back(codegen.expCount);
   scf::ForOp forOp = rewriter.create<scf::ForOp>(loc, lo, hi, step, operands);
   if (codegen.redVal)
     updateReduc(merger, codegen, forOp.getRegionIterArgs().front());
+  if (codegen.expValues)
+    codegen.expCount = forOp.getRegionIterArgs().back();
   // Assign induction variable to sparse or dense index.
   Value iv = forOp.getInductionVar();
   if (isSparse)
@@ -1106,6 +1215,10 @@
     types.push_back(codegen.redVal.getType());
     operands.push_back(codegen.redVal);
   }
+  if (codegen.expValues) {
+    types.push_back(indexType);
+    operands.push_back(codegen.expCount);
+  }
   if (needsUniv) {
     types.push_back(indexType);
     operands.push_back(codegen.loops[idx]);
@@ -1135,6 +1248,8 @@
   }
   if (codegen.redVal)
     updateReduc(merger, codegen, after->getArgument(o++));
+  if (codegen.expValues)
+    codegen.expCount = after->getArgument(o++);
   if (needsUniv)
     codegen.loops[idx] = after->getArgument(o++);
   assert(o == operands.size());
@@ -1215,8 +1330,9 @@
     }
   }
 
-  // Move the insertion indices in lexicographic index order.
-  if (codegen.sparseOut) {
+  // Move the insertion indices in lexicographic index order. During access
+  // pattern expansion, we can skip setting the innermost dimension.
+  if (codegen.sparseOut && !codegen.expValues) {
     Value pos = rewriter.create<arith::ConstantIndexOp>(loc, at);
     rewriter.create<memref::StoreOp>(loc, codegen.loops[idx], codegen.lexIdx,
                                      pos);
@@ -1231,11 +1347,21 @@
                               scf::WhileOp whileOp) {
   Location loc = op.getLoc();
   // Finalize each else branch of all if statements.
-  if (codegen.redVal) {
+  if (codegen.redVal || codegen.expValues) {
     while (auto ifOp = dyn_cast_or_null<scf::IfOp>(
                rewriter.getInsertionBlock()->getParentOp())) {
-      rewriter.create<scf::YieldOp>(loc, codegen.redVal);
-      updateReduc(merger, codegen, ifOp.getResult(0));
+      unsigned y = 0;
+      SmallVector<Value, 4> yields;
+      if (codegen.redVal) {
+        yields.push_back(codegen.redVal);
+        updateReduc(merger, codegen, ifOp.getResult(y++));
+      }
+      if (codegen.expValues) {
+        yields.push_back(codegen.expCount);
+        codegen.expCount = ifOp->getResult(y++);
+      }
+      assert(y == yields.size());
+      rewriter.create<scf::YieldOp>(loc, yields);
       rewriter.setInsertionPointAfter(ifOp);
     }
   }
@@ -1266,6 +1392,10 @@
     operands.push_back(codegen.redVal);
     updateReduc(merger, codegen, whileOp->getResult(o++));
   }
+  if (codegen.expValues) {
+    operands.push_back(codegen.expCount);
+    codegen.expCount = whileOp->getResult(o++);
+  }
   if (needsUniv) {
     operands.push_back(
         rewriter.create<arith::AddIOp>(loc, codegen.loops[idx], one));
@@ -1287,6 +1417,10 @@
     operands.push_back(codegen.redVal);
     updateReduc(merger, codegen, loop->getResult(o++));
   }
+  if (codegen.expValues) {
+    operands.push_back(codegen.expCount);
+    codegen.expCount = loop->getResult(o++);
+  }
   assert(o == operands.size());
   if (o > 0)
     rewriter.create<scf::YieldOp>(loc, operands);
@@ -1318,6 +1452,8 @@
   }
   if (codegen.redVal)
     types.push_back(codegen.redVal.getType());
+  if (codegen.expValues)
+    types.push_back(rewriter.getIndexType());
   scf::IfOp ifOp = rewriter.create<scf::IfOp>(loc, types, cond, /*else=*/true);
   rewriter.setInsertionPointToStart(&ifOp.thenRegion().front());
   return ifOp;
@@ -1325,11 +1461,19 @@
 
 /// Generates end of true branch of if-statement within a while-loop.
 static void endIf(Merger &merger, CodeGen &codegen, PatternRewriter &rewriter,
-                  linalg::GenericOp op, scf::IfOp ifOp, Value ifInput) {
+                  linalg::GenericOp op, scf::IfOp ifOp, Operation *loop,
+                  Value redInput, Value cntInput) {
+  SmallVector<Value, 4> operands;
   if (codegen.redVal) {
-    rewriter.create<scf::YieldOp>(op.getLoc(), codegen.redVal);
-    updateReduc(merger, codegen, ifInput);
+    operands.push_back(codegen.redVal);
+    updateReduc(merger, codegen, redInput);
+  }
+  if (codegen.expValues) {
+    operands.push_back(codegen.expCount);
+    codegen.expCount = cntInput;
   }
+  if (!operands.empty())
+    rewriter.create<scf::YieldOp>(op.getLoc(), operands);
   rewriter.setInsertionPointToStart(&ifOp.elseRegion().front());
 }
 
@@ -1348,6 +1492,8 @@
   assert(!codegen.loops[idx]);
   // Emit invariants at this loop sequence level.
   genInvariants(merger, codegen, rewriter, op, exp, ldx, /*atStart=*/true);
+  // Emit access pattern expansion for sparse tensor output.
+  genExpansion(merger, codegen, rewriter, op, at, /*atStart=*/true);
   // Emit further intitialization at this loop sequence level.
   unsigned l0 = merger.set(lts)[0];
   bool needsUniv =
@@ -1399,7 +1545,7 @@
 /// Ends a loop sequence at given level.
 static void endLoopSeq(Merger &merger, CodeGen &codegen,
                        PatternRewriter &rewriter, linalg::GenericOp op,
-                       unsigned exp, unsigned idx, unsigned ldx) {
+                       unsigned exp, unsigned at, unsigned idx, unsigned ldx) {
   assert(codegen.curVecLength == 1);
   codegen.loops[idx] = Value();
   // Bring a pending reduction back from SIMD form when sequence ends.
@@ -1409,6 +1555,8 @@
                   genVectorReducEnd(codegen, rewriter, op.getLoc(), vtp));
   // Unmark bookkeeping of invariants and loop index.
   genInvariants(merger, codegen, rewriter, op, exp, ldx, /*atStart=*/false);
+  // Finalize access pattern expansion for sparse tensor output.
+  genExpansion(merger, codegen, rewriter, op, at, /*atStart=*/false);
 }
 
 /// Recursively generates code while computing iteration lattices in order
@@ -1443,7 +1591,8 @@
 
     // Visit all lattices points with Li >= Lj to generate the
     // loop-body, possibly with if statements for coiteration.
-    Value ifInput = codegen.redVal;
+    Value redInput = codegen.redVal;
+    Value cntInput = codegen.expCount;
     bool isWhile = dyn_cast<scf::WhileOp>(loop) != nullptr;
     for (unsigned j = 0; j < lsize; j++) {
       unsigned lj = merger.set(lts)[j];
@@ -1454,7 +1603,7 @@
           scf::IfOp ifOp =
               genIf(merger, codegen, rewriter, op, idx, merger.lat(lj).simple);
           genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
-          endIf(merger, codegen, rewriter, op, ifOp, ifInput);
+          endIf(merger, codegen, rewriter, op, ifOp, loop, redInput, cntInput);
         } else {
           genStmt(merger, codegen, rewriter, op, topSort, ej, at + 1);
         }
@@ -1467,7 +1616,7 @@
   }
 
   // End a loop sequence.
-  endLoopSeq(merger, codegen, rewriter, op, exp, idx, ldx);
+  endLoopSeq(merger, codegen, rewriter, op, exp, at, idx, ldx);
 }
 
 /// Converts the result computed by the sparse kernel into the required form.
diff --git a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
--- a/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
+++ b/mlir/lib/ExecutionEngine/SparseTensorUtils.cpp
@@ -163,10 +163,10 @@
 /// function overloading to implement "partial" method specialization.
 class SparseTensorStorageBase {
 public:
-  // Dimension size query.
+  /// Dimension size query.
   virtual uint64_t getDimSize(uint64_t) = 0;
 
-  // Overhead storage.
+  /// Overhead storage.
   virtual void getPointers(std::vector<uint64_t> **, uint64_t) { fatal("p64"); }
   virtual void getPointers(std::vector<uint32_t> **, uint64_t) { fatal("p32"); }
   virtual void getPointers(std::vector<uint16_t> **, uint64_t) { fatal("p16"); }
@@ -176,7 +176,7 @@
   virtual void getIndices(std::vector<uint16_t> **, uint64_t) { fatal("i16"); }
   virtual void getIndices(std::vector<uint8_t> **, uint64_t) { fatal("i8"); }
 
-  // Primary storage.
+  /// Primary storage.
   virtual void getValues(std::vector<double> **) { fatal("valf64"); }
   virtual void getValues(std::vector<float> **) { fatal("valf32"); }
   virtual void getValues(std::vector<int64_t> **) { fatal("vali64"); }
@@ -184,13 +184,35 @@
   virtual void getValues(std::vector<int16_t> **) { fatal("vali16"); }
   virtual void getValues(std::vector<int8_t> **) { fatal("vali8"); }
 
-  // Element-wise insertion in lexicographic index order.
+  /// Element-wise insertion in lexicographic index order.
   virtual void lexInsert(uint64_t *, double) { fatal("insf64"); }
   virtual void lexInsert(uint64_t *, float) { fatal("insf32"); }
   virtual void lexInsert(uint64_t *, int64_t) { fatal("insi64"); }
   virtual void lexInsert(uint64_t *, int32_t) { fatal("insi32"); }
   virtual void lexInsert(uint64_t *, int16_t) { fatal("ins16"); }
   virtual void lexInsert(uint64_t *, int8_t) { fatal("insi8"); }
+
+  /// Expanded insertion.
+  virtual void expInsert(uint64_t *, double *, bool *, uint64_t *, uint64_t) {
+    fatal("expf64");
+  }
+  virtual void expInsert(uint64_t *, float *, bool *, uint64_t *, uint64_t) {
+    fatal("expf32");
+  }
+  virtual void expInsert(uint64_t *, int64_t *, bool *, uint64_t *, uint64_t) {
+    fatal("expi64");
+  }
+  virtual void expInsert(uint64_t *, int32_t *, bool *, uint64_t *, uint64_t) {
+    fatal("expi32");
+  }
+  virtual void expInsert(uint64_t *, int16_t *, bool *, uint64_t *, uint64_t) {
+    fatal("expi16");
+  }
+  virtual void expInsert(uint64_t *, int8_t *, bool *, uint64_t *, uint64_t) {
+    fatal("expi8");
+  }
+
+  /// Finishes insertion.
   virtual void endInsert() = 0;
 
   virtual ~SparseTensorStorageBase() {}
@@ -289,6 +311,35 @@
     insPath(cursor, diff, top, val);
   }
 
+  /// Partially specialize expanded insertions based on template types.
+  /// Note that this method resets the values/filled-switch array back
+  /// to all-zero/false while only iterating over the nonzero elements.
+  void expInsert(uint64_t *cursor, V *values, bool *filled, uint64_t *added,
+                 uint64_t count) override {
+    if (count == 0)
+      return;
+    // Sort.
+    std::sort(added, added + count);
+    // Restore insertion path for first insert.
+    uint64_t rank = getRank();
+    uint64_t index = added[0];
+    cursor[rank - 1] = index;
+    lexInsert(cursor, values[index]);
+    assert(filled[index]);
+    values[index] = 0;
+    filled[index] = false;
+    // Subsequent insertions are quick.
+    for (uint64_t i = 1; i < count; i++) {
+      assert(index < added[i] && "non-lexicographic insertion");
+      index = added[i];
+      cursor[rank - 1] = index;
+      insPath(cursor, rank - 1, added[i - 1] + 1, values[index]);
+      assert(filled[index]);
+      values[index] = 0;
+      filled[index] = false;
+    }
+  }
+
   /// Finalizes lexicographic insertions.
   void endInsert() override {
     if (values.empty())
@@ -683,8 +734,7 @@
 
 #define IMPL_SPARSEVALUES(NAME, TYPE, LIB)                                     \
   void _mlir_ciface_##NAME(StridedMemRefType<TYPE, 1> *ref, void *tensor) {    \
-    assert(ref);                                                               \
-    assert(tensor);                                                            \
+    assert(ref &&tensor);                                                      \
     std::vector<TYPE> *v;                                                      \
     static_cast<SparseTensorStorageBase *>(tensor)->LIB(&v);                   \
     ref->basePtr = ref->data = v->data();                                      \
@@ -696,8 +746,7 @@
 #define IMPL_GETOVERHEAD(NAME, TYPE, LIB)                                      \
   void _mlir_ciface_##NAME(StridedMemRefType<TYPE, 1> *ref, void *tensor,      \
                            index_t d) {                                        \
-    assert(ref);                                                               \
-    assert(tensor);                                                            \
+    assert(ref &&tensor);                                                      \
     std::vector<TYPE> *v;                                                      \
     static_cast<SparseTensorStorageBase *>(tensor)->LIB(&v, d);                \
     ref->basePtr = ref->data = v->data();                                      \
@@ -710,9 +759,7 @@
   void *_mlir_ciface_##NAME(void *tensor, TYPE value,                          \
                             StridedMemRefType<index_t, 1> *iref,               \
                             StridedMemRefType<index_t, 1> *pref) {             \
-    assert(tensor);                                                            \
-    assert(iref);                                                              \
-    assert(pref);                                                              \
+    assert(tensor &&iref &&pref);                                              \
     assert(iref->strides[0] == 1 && pref->strides[0] == 1);                    \
     assert(iref->sizes[0] == pref->sizes[0]);                                  \
     const index_t *indx = iref->data + iref->offset;                           \
@@ -726,10 +773,11 @@
   }
 
 #define IMPL_GETNEXT(NAME, V)                                                  \
-  bool _mlir_ciface_##NAME(void *tensor, StridedMemRefType<uint64_t, 1> *iref, \
+  bool _mlir_ciface_##NAME(void *tensor, StridedMemRefType<index_t, 1> *iref,  \
                            StridedMemRefType<V, 0> *vref) {                    \
+    assert(tensor &&iref &&vref);                                              \
     assert(iref->strides[0] == 1);                                             \
-    uint64_t *indx = iref->data + iref->offset;                                \
+    index_t *indx = iref->data + iref->offset;                                 \
     V *value = vref->data + vref->offset;                                      \
     const uint64_t isize = iref->sizes[0];                                     \
     auto iter = static_cast<SparseTensorCOO<V> *>(tensor);                     \
@@ -747,12 +795,32 @@
 #define IMPL_LEXINSERT(NAME, V)                                                \
   void _mlir_ciface_##NAME(void *tensor, StridedMemRefType<index_t, 1> *cref,  \
                            V val) {                                            \
+    assert(tensor &&cref);                                                     \
     assert(cref->strides[0] == 1);                                             \
-    uint64_t *cursor = cref->data + cref->offset;                              \
+    index_t *cursor = cref->data + cref->offset;                               \
     assert(cursor);                                                            \
     static_cast<SparseTensorStorageBase *>(tensor)->lexInsert(cursor, val);    \
   }
 
+#define IMPL_EXPINSERT(NAME, V)                                                \
+  void _mlir_ciface_##NAME(                                                    \
+      void *tensor, StridedMemRefType<index_t, 1> *cref,                       \
+      StridedMemRefType<V, 1> *vref, StridedMemRefType<bool, 1> *fref,         \
+      StridedMemRefType<index_t, 1> *aref, index_t count) {                    \
+    assert(tensor &&cref &&vref &&fref &&aref);                                \
+    assert(cref->strides[0] == 1);                                             \
+    assert(vref->strides[0] == 1);                                             \
+    assert(fref->strides[0] == 1);                                             \
+    assert(aref->strides[0] == 1);                                             \
+    assert(vref->sizes[0] == fref->sizes[0]);                                  \
+    index_t *cursor = cref->data + cref->offset;                               \
+    V *values = vref->data + vref->offset;                                     \
+    bool *filled = fref->data + fref->offset;                                  \
+    index_t *added = aref->data + aref->offset;                                \
+    static_cast<SparseTensorStorageBase *>(tensor)->expInsert(                 \
+        cursor, values, filled, added, count);                                 \
+  }
+
 /// Constructs a new sparse tensor. This is the "swiss army knife"
 /// method for materializing sparse tensors into the computation.
 ///
@@ -912,12 +980,21 @@
 IMPL_LEXINSERT(lexInsertI16, int16_t)
 IMPL_LEXINSERT(lexInsertI8, int8_t)
 
+/// Helper to insert using expansion, one per value type.
+IMPL_EXPINSERT(expInsertF64, double)
+IMPL_EXPINSERT(expInsertF32, float)
+IMPL_EXPINSERT(expInsertI64, int64_t)
+IMPL_EXPINSERT(expInsertI32, int32_t)
+IMPL_EXPINSERT(expInsertI16, int16_t)
+IMPL_EXPINSERT(expInsertI8, int8_t)
+
 #undef CASE
 #undef IMPL_SPARSEVALUES
 #undef IMPL_GETOVERHEAD
 #undef IMPL_ADDELT
 #undef IMPL_GETNEXT
-#undef IMPL_INSERTLEX
+#undef IMPL_LEXINSERT
+#undef IMPL_EXPINSERT
 
 //===----------------------------------------------------------------------===//
 //
diff --git a/mlir/test/Dialect/SparseTensor/conversion.mlir b/mlir/test/Dialect/SparseTensor/conversion.mlir
--- a/mlir/test/Dialect/SparseTensor/conversion.mlir
+++ b/mlir/test/Dialect/SparseTensor/conversion.mlir
@@ -441,3 +441,30 @@
   return
 }
 
+// CHECK-LABEL: func @sparse_expansion()
+//       CHECK: %[[S:.*]] = call @sparseDimSize
+//       CHECK: %[[V:.*]] = memref.alloca(%[[S]]) : memref<?xf64>
+//       CHECK: %[[F:.*]] = memref.alloca(%[[S]]) : memref<?xi1>
+//       CHECK: %[[A:.*]] = memref.alloca(%[[S]]) : memref<?xindex>
+//       CHECK: linalg.fill(%{{.*}}, %[[V]]) : f64, memref<?xf64>
+//       CHECK: linalg.fill(%{{.*}}, %[[F]]) : i1, memref<?xi1>
+//       CHECK: return
+func @sparse_expansion() {
+  %c = arith.constant 8 : index
+  %0 = sparse_tensor.init [%c, %c] : tensor<8x8xf64, #SparseMatrix>
+  %values, %filled, %added, %count = sparse_tensor.expand %0
+    : tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+  return
+}
+
+// CHECK-LABEL: func @sparse_compression(
+//  CHECK-SAME: %[[A:.*]]: !llvm.ptr<i8>,
+//       CHECK: call @expInsertF64(%[[A]],
+//       CHECK: return
+func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>,
+                         %arg1: memref<?xindex>, %arg2: memref<?xf64>, %arg3: memref<?xi1>,
+                         %arg4: memref<?xindex>, %arg5: index) {
+  sparse_tensor.compress %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
+    : tensor<8x8xf64, #SparseMatrix>, memref<?xindex>, memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+  return
+}
diff --git a/mlir/test/Dialect/SparseTensor/invalid.mlir b/mlir/test/Dialect/SparseTensor/invalid.mlir
--- a/mlir/test/Dialect/SparseTensor/invalid.mlir
+++ b/mlir/test/Dialect/SparseTensor/invalid.mlir
@@ -160,6 +160,25 @@
 
 // -----
 
+func @sparse_unannotated_expansion(%arg0: tensor<128xf64>) {
+  // expected-error@+1 {{expected a sparse tensor for expansion}}
+  %values, %filled, %added, %count = sparse_tensor.expand %arg0
+    : tensor<128xf64> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+  return
+}
+
+// -----
+
+func @sparse_unannotated_compression(%arg0: tensor<128xf64>, %arg1: memref<?xindex>,
+                                     %arg2: memref<?xf64>, %arg3: memref<?xi1>,
+                                     %arg4: memref<?xindex>, %arg5: index) {
+  // expected-error@+1 {{expected a sparse tensor for compression}}
+  sparse_tensor.compress %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
+    : tensor<128xf64>, memref<?xindex>, memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+  return
+}
+
+// -----
+
 func @sparse_convert_unranked(%arg0: tensor<*xf32>) -> tensor<10xf32> {
   // expected-error@+1 {{unexpected type in convert}}
   %0 = sparse_tensor.convert %arg0 : tensor<*xf32> to tensor<10xf32>
diff --git a/mlir/test/Dialect/SparseTensor/roundtrip.mlir b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
--- a/mlir/test/Dialect/SparseTensor/roundtrip.mlir
+++ b/mlir/test/Dialect/SparseTensor/roundtrip.mlir
@@ -149,3 +149,33 @@
   sparse_tensor.lex_insert %arg0, %arg1, %arg2 : tensor<128xf64, #SparseVector>, memref<?xindex>, f64
   return
 }
+
+// -----
+
+#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["compressed", "compressed"]}>
+
+// CHECK-LABEL: func @sparse_expansion(
+//  CHECK-SAME: %[[A:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{{.*}}>>)
+//       CHECK: sparse_tensor.expand %[[A]]
+//       CHECK: return
+func @sparse_expansion(%arg0: tensor<8x8xf64, #SparseMatrix>) {
+  %values, %filled, %added, %count = sparse_tensor.expand %arg0
+    : tensor<8x8xf64, #SparseMatrix> to memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+  return
+}
+
+// -----
+
+#SparseMatrix = #sparse_tensor.encoding<{dimLevelType = ["compressed", "compressed"]}>
+
+// CHECK-LABEL: func @sparse_compression(
+//  CHECK-SAME: %[[A:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{{.*}}>>,
+//       CHECK: sparse_tensor.compress %[[A]]
+//       CHECK: return
+func @sparse_compression(%arg0: tensor<8x8xf64, #SparseMatrix>,
+                         %arg1: memref<?xindex>, %arg2: memref<?xf64>, %arg3: memref<?xi1>,
+                         %arg4: memref<?xindex>, %arg5: index) {
+  sparse_tensor.compress %arg0, %arg1, %arg2, %arg3, %arg4, %arg5
+    : tensor<8x8xf64, #SparseMatrix>, memref<?xindex>, memref<?xf64>, memref<?xi1>, memref<?xindex>, index
+  return
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_matmul.mlir
@@ -0,0 +1,274 @@
+// RUN: mlir-opt %s \
+// RUN:   --linalg-generalize-named-ops --linalg-fuse-elementwise-ops \
+// RUN:   --sparsification --sparse-tensor-conversion \
+// RUN:   --linalg-bufferize --convert-linalg-to-loops \
+// RUN:   --convert-vector-to-scf --convert-scf-to-std \
+// RUN:   --func-bufferize --tensor-constant-bufferize --tensor-bufferize \
+// RUN:   --std-bufferize --finalizing-bufferize --lower-affine \
+// RUN:   --convert-vector-to-llvm --convert-memref-to-llvm \
+// RUN:   --convert-std-to-llvm --reconcile-unrealized-casts | \
+// RUN: mlir-cpu-runner \
+// RUN:  -e entry -entry-point-result=void  \
+// RUN:  -shared-libs=%mlir_integration_test_dir/libmlir_c_runner_utils%shlibext | \
+// RUN: FileCheck %s
+//
+
+#CSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "dense", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (i,j)>
+}>
+
+#DCSR = #sparse_tensor.encoding<{
+  dimLevelType = [ "compressed", "compressed" ],
+  dimOrdering = affine_map<(i,j) -> (i,j)>
+}>
+
+module {
+  //
+  // Computes C = A x B with all matrices dense.
+  //
+  func @matmul1(%A: tensor<4x8xf64>,
+                %B: tensor<8x4xf64>) -> tensor<4x4xf64> {
+    %C = arith.constant dense<0.0> : tensor<4x4xf64>
+    %D = linalg.matmul
+      ins(%A, %B: tensor<4x8xf64>, tensor<8x4xf64>)
+         outs(%C: tensor<4x4xf64>) -> tensor<4x4xf64>
+    return %D: tensor<4x4xf64>
+  }
+
+  //
+  // Computes C = A x B with all matrices sparse (SpMSpM) in CSR.
+  //
+  func @matmul2(%A: tensor<4x8xf64, #CSR>,
+                %B: tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR> {
+    %c4 = arith.constant 4 : index
+    %C = sparse_tensor.init [%c4, %c4] : tensor<4x4xf64, #CSR>
+    %D = linalg.matmul
+      ins(%A, %B: tensor<4x8xf64, #CSR>, tensor<8x4xf64, #CSR>)
+         outs(%C: tensor<4x4xf64, #CSR>) -> tensor<4x4xf64, #CSR>
+    return %D: tensor<4x4xf64, #CSR>
+  }
+
+  //
+  // Computes C = A x B with all matrices sparse (SpMSpM) in DCSR.
+  //
+  func @matmul3(%A: tensor<4x8xf64, #DCSR>,
+                %B: tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR> {
+    %c4 = arith.constant 4 : index
+    %C = sparse_tensor.init [%c4, %c4] : tensor<4x4xf64, #DCSR>
+    %D = linalg.matmul
+      ins(%A, %B: tensor<4x8xf64, #DCSR>, tensor<8x4xf64, #DCSR>)
+         outs(%C: tensor<4x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR>
+    return %D: tensor<4x4xf64, #DCSR>
+  }
+
+  //
+  // Main driver.
+  //
+  func @entry() {
+    %c0 = arith.constant 0 : index
+    %d1 = arith.constant -1.0 : f64
+
+    // Initialize various matrices, dense for stress testing,
+    // and sparse to verify correct nonzero structure.
+    %da = arith.constant dense<[
+        [ 1.1, 2.1, 3.1, 4.1, 5.1, 6.1, 7.1, 8.1 ],
+        [ 1.2, 2.2, 3.2, 4.2, 5.2, 6.2, 7.2, 8.2 ],
+        [ 1.3, 2.3, 3.3, 4.3, 5.3, 6.3, 7.3, 8.3 ],
+        [ 1.4, 2.4, 3.4, 4.4, 5.4, 6.4, 7.4, 8.4 ]
+    ]> : tensor<4x8xf64>
+    %db = arith.constant dense<[
+        [ 10.1, 11.1, 12.1, 13.1 ],
+        [ 10.2, 11.2, 12.2, 13.2 ],
+        [ 10.3, 11.3, 12.3, 13.3 ],
+        [ 10.4, 11.4, 12.4, 13.4 ],
+        [ 10.5, 11.5, 12.5, 13.5 ],
+        [ 10.6, 11.6, 12.6, 13.6 ],
+        [ 10.7, 11.7, 12.7, 13.7 ],
+        [ 10.8, 11.8, 12.8, 13.8 ]
+    ]> : tensor<8x4xf64>
+    %sa = arith.constant dense<[
+        [ 0.0, 2.1, 0.0, 0.0, 0.0, 6.1, 0.0, 0.0 ],
+        [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ],
+        [ 0.0, 2.3, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0 ],
+        [ 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0 ]
+    ]> : tensor<4x8xf64>
+    %sb = arith.constant dense<[
+        [ 0.0, 0.0, 0.0, 1.0 ],
+        [ 0.0, 0.0, 2.0, 0.0 ],
+        [ 0.0, 3.0, 0.0, 0.0 ],
+        [ 4.0, 0.0, 0.0, 0.0 ],
+        [ 0.0, 0.0, 0.0, 0.0 ],
+        [ 0.0, 5.0, 0.0, 0.0 ],
+        [ 0.0, 0.0, 6.0, 0.0 ],
+        [ 0.0, 0.0, 7.0, 8.0 ]
+    ]> : tensor<8x4xf64>
+
+    // Convert all these matrices to sparse format.
+    %a1 = sparse_tensor.convert %da : tensor<4x8xf64> to tensor<4x8xf64, #CSR>
+    %a2 = sparse_tensor.convert %da : tensor<4x8xf64> to tensor<4x8xf64, #DCSR>
+    %a3 = sparse_tensor.convert %sa : tensor<4x8xf64> to tensor<4x8xf64, #CSR>
+    %a4 = sparse_tensor.convert %sa : tensor<4x8xf64> to tensor<4x8xf64, #DCSR>
+    %b1 = sparse_tensor.convert %db : tensor<8x4xf64> to tensor<8x4xf64, #CSR>
+    %b2 = sparse_tensor.convert %db : tensor<8x4xf64> to tensor<8x4xf64, #DCSR>
+    %b3 = sparse_tensor.convert %sb : tensor<8x4xf64> to tensor<8x4xf64, #CSR>
+    %b4 = sparse_tensor.convert %sb : tensor<8x4xf64> to tensor<8x4xf64, #DCSR>
+
+    // Call kernels with dense.
+    %0 = call @matmul1(%da, %db)
+       : (tensor<4x8xf64>, tensor<8x4xf64>) -> tensor<4x4xf64>
+    %1 = call @matmul2(%a1, %b1)
+       : (tensor<4x8xf64, #CSR>,
+          tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR>
+    %2 = call @matmul3(%a2, %b2)
+       : (tensor<4x8xf64, #DCSR>,
+          tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR>
+
+    // Call kernels with one sparse.
+    %3 = call @matmul1(%sa, %db)
+       : (tensor<4x8xf64>, tensor<8x4xf64>) -> tensor<4x4xf64>
+    %4 = call @matmul2(%a3, %b1)
+       : (tensor<4x8xf64, #CSR>,
+          tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR>
+    %5 = call @matmul3(%a4, %b2)
+       : (tensor<4x8xf64, #DCSR>,
+          tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR>
+
+    // Call kernels with sparse.
+    %6 = call @matmul1(%sa, %sb)
+       : (tensor<4x8xf64>, tensor<8x4xf64>) -> tensor<4x4xf64>
+    %7 = call @matmul2(%a3, %b3)
+       : (tensor<4x8xf64, #CSR>,
+          tensor<8x4xf64, #CSR>) -> tensor<4x4xf64, #CSR>
+    %8 = call @matmul3(%a4, %b4)
+       : (tensor<4x8xf64, #DCSR>,
+          tensor<8x4xf64, #DCSR>) -> tensor<4x4xf64, #DCSR>
+
+    //
+    // CHECK:    ( ( 388.76, 425.56, 462.36, 499.16 ),
+    // CHECK-SAME: ( 397.12, 434.72, 472.32, 509.92 ),
+    // CHECK-SAME: ( 405.48, 443.88, 482.28, 520.68 ),
+    // CHECK-SAME: ( 413.84, 453.04, 492.24, 531.44 ) )
+    //
+    %m0 = bufferization.to_memref %0 : memref<4x4xf64>
+    %v0 = vector.transfer_read %m0[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v0 : vector<4x4xf64>
+
+    //
+    // CHECK:    ( ( 388.76, 425.56, 462.36, 499.16 ),
+    // CHECK-SAME: ( 397.12, 434.72, 472.32, 509.92 ),
+    // CHECK-SAME: ( 405.48, 443.88, 482.28, 520.68 ),
+    // CHECK-SAME: ( 413.84, 453.04, 492.24, 531.44 ) )
+    //
+    %c1 = sparse_tensor.convert %1 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
+    %m1 = bufferization.to_memref %c1 : memref<4x4xf64>
+    %v1 = vector.transfer_read %m1[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v1 : vector<4x4xf64>
+
+    //
+    // CHECK:    ( ( 388.76, 425.56, 462.36, 499.16 ),
+    // CHECK-SAME: ( 397.12, 434.72, 472.32, 509.92 ),
+    // CHECK-SAME: ( 405.48, 443.88, 482.28, 520.68 ),
+    // CHECK-SAME: ( 413.84, 453.04, 492.24, 531.44 ) )
+    //
+    %c2 = sparse_tensor.convert %2 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
+    %m2 = bufferization.to_memref %c2 : memref<4x4xf64>
+    %v2 = vector.transfer_read %m2[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v2 : vector<4x4xf64>
+
+    //
+    // CHECK:    ( ( 86.08, 94.28, 102.48, 110.68 ),
+    // CHECK-SAME: ( 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 23.46, 25.76, 28.06, 30.36 ),
+    // CHECK-SAME: ( 10.8, 11.8, 12.8, 13.8 ) )
+    //
+    %m3 = bufferization.to_memref %3 : memref<4x4xf64>
+    %v3 = vector.transfer_read %m3[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v3 : vector<4x4xf64>
+
+    //
+    // CHECK:    ( ( 86.08, 94.28, 102.48, 110.68 ),
+    // CHECK-SAME: ( 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 23.46, 25.76, 28.06, 30.36 ),
+    // CHECK-SAME: ( 10.8, 11.8, 12.8, 13.8 ) )
+    //
+    %c4 = sparse_tensor.convert %4 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
+    %m4 = bufferization.to_memref %c4 : memref<4x4xf64>
+    %v4 = vector.transfer_read %m4[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v4 : vector<4x4xf64>
+
+    //
+    // CHECK:    ( ( 86.08, 94.28, 102.48, 110.68 ),
+    // CHECK-SAME: ( 0, 0, 0, 0 ),
+    // CHECK-SAME: ( 23.46, 25.76, 28.06, 30.36 ),
+    // CHECK-SAME: ( 10.8, 11.8, 12.8, 13.8 ) )
+    //
+    %c5 = sparse_tensor.convert %5 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
+    %m5 = bufferization.to_memref %c5 : memref<4x4xf64>
+    %v5 = vector.transfer_read %m5[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v5 : vector<4x4xf64>
+
+    //
+    // CHECK: ( ( 0, 30.5, 4.2, 0 ), ( 0, 0, 0, 0 ), ( 0, 0, 4.6, 0 ), ( 0, 0, 7, 8 ) )
+    //
+    %m6 = bufferization.to_memref %6 : memref<4x4xf64>
+    %v6 = vector.transfer_read %m6[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v6 : vector<4x4xf64>
+
+    //
+    // CHECK: ( ( 0, 30.5, 4.2, 0 ), ( 0, 0, 0, 0 ), ( 0, 0, 4.6, 0 ), ( 0, 0, 7, 8 ) )
+    //
+    %c7 = sparse_tensor.convert %7 : tensor<4x4xf64, #CSR> to tensor<4x4xf64>
+    %m7 = bufferization.to_memref %c7 : memref<4x4xf64>
+    %v7 = vector.transfer_read %m7[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v7 : vector<4x4xf64>
+
+    //
+    // CHECK: ( ( 0, 30.5, 4.2, 0 ), ( 0, 0, 0, 0 ), ( 0, 0, 4.6, 0 ), ( 0, 0, 7, 8 ) )
+    //
+    %c8 = sparse_tensor.convert %8 : tensor<4x4xf64, #DCSR> to tensor<4x4xf64>
+    %m8 = bufferization.to_memref %c8 : memref<4x4xf64>
+    %v8 = vector.transfer_read %m8[%c0, %c0], %d1 : memref<4x4xf64>, vector<4x4xf64>
+    vector.print %v8 : vector<4x4xf64>
+
+    //
+    // Sanity check on nonzeros.
+    //
+    // CHECK: ( 30.5, 4.2, 4.6, 7, 8, -1, -1, -1 )
+    // CHECK: ( 30.5, 4.2, 4.6, 7, 8, -1, -1, -1 )
+    //
+    %val7 = sparse_tensor.values %7 : tensor<4x4xf64, #CSR> to memref<?xf64>
+    %val8 = sparse_tensor.values %8 : tensor<4x4xf64, #DCSR> to memref<?xf64>
+    %nz7 = vector.transfer_read %val7[%c0], %d1 : memref<?xf64>, vector<8xf64>
+    %nz8 = vector.transfer_read %val8[%c0], %d1 : memref<?xf64>, vector<8xf64>
+    vector.print %nz7 : vector<8xf64>
+    vector.print %nz8 : vector<8xf64>
+
+    // Release the resources.
+    sparse_tensor.release %a1 : tensor<4x8xf64, #CSR>
+    sparse_tensor.release %a2 : tensor<4x8xf64, #DCSR>
+    sparse_tensor.release %a3 : tensor<4x8xf64, #CSR>
+    sparse_tensor.release %a4 : tensor<4x8xf64, #DCSR>
+    sparse_tensor.release %b1 : tensor<8x4xf64, #CSR>
+    sparse_tensor.release %b2 : tensor<8x4xf64, #DCSR>
+    sparse_tensor.release %b3 : tensor<8x4xf64, #CSR>
+    sparse_tensor.release %b4 : tensor<8x4xf64, #DCSR>
+    sparse_tensor.release %1 : tensor<4x4xf64, #CSR>
+    sparse_tensor.release %2 : tensor<4x4xf64, #DCSR>
+    sparse_tensor.release %4 : tensor<4x4xf64, #CSR>
+    sparse_tensor.release %5 : tensor<4x4xf64, #DCSR>
+    sparse_tensor.release %7 : tensor<4x4xf64, #CSR>
+    sparse_tensor.release %8 : tensor<4x4xf64, #DCSR>
+    memref.dealloc %m0 : memref<4x4xf64>
+    memref.dealloc %m1 : memref<4x4xf64>
+    memref.dealloc %m2 : memref<4x4xf64>
+    memref.dealloc %m3 : memref<4x4xf64>
+    memref.dealloc %m4 : memref<4x4xf64>
+    memref.dealloc %m5 : memref<4x4xf64>
+    memref.dealloc %m6 : memref<4x4xf64>
+    memref.dealloc %m7 : memref<4x4xf64>
+    memref.dealloc %m8 : memref<4x4xf64>
+
+    return
+  }
+}