diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -605,6 +605,125 @@
   return success();
 }
 
+/// Match and rewrite SDDMM kernel.
+static LogicalResult rewriteSDDMM(PatternRewriter &rewriter, linalg::GenericOp op,
+                     bool enableRT) {
+  Location loc = op.getLoc();
+  Value a = op.getOperand(1);
+  Value b = op.getOperand(2);
+  Value c = op.getOperand(0); // we have C = AB
+  SmallVector<Value> tokens;
+
+  // Only admissible sparse matrix format and dense matrices.
+  bool isCOO = false;
+  SparseTensorType aTp = getSparseTensorType(a);
+  SparseTensorType bTp = getSparseTensorType(b);
+  SparseTensorType cTp = getSparseTensorType(c);
+  if (!areAdmissibleTypes(aTp, bTp, cTp, enableRT, isCOO))
+    return failure();
+
+  // TODO: duplicate C and update its user later than this GenericOp line if it
+  // is used by other ops.
+
+  // the following does the in-place operation. If the sparse matrix C is
+  // reused, we may need to copy it before the operation so that users could use
+  // the new copy instead. Start sparse kernel and copy data from host to
+  // device.
+  // TODO: assert no other uses of c is later than this GenericOp operator
+
+  //   a : bufA           -> matA
+  //   b : bufB           -> matA
+  //   c : memR/memC/memV -> rowC,colC,valC
+  Value nseC = rewriter.create<NumberOfEntriesOp>(loc, a);
+  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
+  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
+  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
+  Value bufA = genTensorToMemref(rewriter, loc, a);
+  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+  Value bufB = genTensorToMemref(rewriter, loc, b);
+  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memV = genToValues(rewriter, loc, c);
+  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
+  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
+  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Create sparse environment and sparse matrix/dense matrix handles.
+  Type indexTp = rewriter.getIndexType();
+  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
+  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
+  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
+  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
+  Value token = genFirstWait(rewriter, loc);
+  auto env =
+      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
+  Value handle = env.getResult(0);
+  token = env.getAsyncToken();
+
+  auto dmatA = rewriter.create<gpu::CreateDnMatOp>(loc, dnMatHandleTp, tokenTp,
+                                                   token, szm, szk, matA);
+  Value dnA = dmatA.getResult(0);
+  token = dmatA.getAsyncToken();
+  auto dmatB = rewriter.create<gpu::CreateDnMatOp>(loc, dnMatHandleTp, tokenTp,
+                                                   token, szk, szn, matB);
+  Value dnB = dmatB.getResult(0);
+  token = dmatB.getAsyncToken();
+
+  Operation *spGenC =
+      genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szn, nseC,
+               rowC, colC, valC, isCOO, enableRT);
+  Value spMatC = spGenC->getResult(0);
+  token = spGenC->getResult(1);
+
+  // Precompute buffersize for SDDMM.
+  auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
+      loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC);
+  Value bufferSz = bufferComp.getResult(0);
+  token = bufferComp.getAsyncToken();
+  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
+  Value buffer = buf.getResult(0);
+  token = buf.getAsyncToken();
+
+  // Perform the SDDMM.
+  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(loc, tokenTp, token, handle,
+                                                 dnA, dnB, spMatC, buffer);
+  token = sddmmComp.getAsyncToken();
+
+  // Copy data back to host and free all the resoures.
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnA)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
+              .getAsyncToken();
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+  token = genFirstWait(rewriter, loc);
+  token = genCopyMemRef(rewriter, loc, memR, rowC, token);
+  token = genCopyMemRef(rewriter, loc, memC, colC, token);
+  token = genCopyMemRef(rewriter, loc, memV, valC, token);
+  token = genDeallocMemRef(rewriter, loc, buffer, token);
+  token = genDeallocMemRef(rewriter, loc, matA, token);
+  token = genDeallocMemRef(rewriter, loc, matB, token);
+  token = genDeallocMemRef(rewriter, loc, rowC, token);
+  if (colC)
+    token = genDeallocMemRef(rewriter, loc, colC, token);
+  token = genDeallocMemRef(rewriter, loc, valC, token);
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Done.
+  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Rewriting rules for direct code generation.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -0,0 +1,102 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN:             --sparsification="enable-gpu-libgen" | FileCheck %s
+
+#trait_sampled_dense_dense = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j)>,  // S
+    affine_map<(i,j,k) -> (i,k)>,  // A
+    affine_map<(i,j,k) -> (k,j)>,  // B
+    affine_map<(i,j,k) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+}
+
+#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>
+
+module {
+
+//
+// A kernel that computes a direct sampled matrix matrix multiplication
+// (with sparse result).
+// Compute SDDMM C = C\spy AB
+// VAL_0 is C
+//
+// CHECK-LABEL:   func.func @sampled_dense_dense_matmul(
+// CHECK-SAME:      %[[VAL_0:.*]]: tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>,
+// CHECK-SAME:      %[[VAL_1:.*]]: tensor<?x?xf64>,
+// CHECK-SAME:      %[[VAL_2:.*]]: tensor<?x?xf64>) -> tensor<?x?xf64> {
+// CHECK-DAG:       %[[VAL_3:.*]] = arith.constant 0 : index
+// CHECK-DAG:       %[[VAL_4:.*]] = arith.constant 1 : index
+// CHECK-DAG:       %[[VAL_5:.*]] = sparse_tensor.number_of_entries %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG:       %[[VAL_6:.*]] = tensor.dim %[[VAL_0]], %[[VAL_3]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG:       %[[VAL_7:.*]] = tensor.dim %[[VAL_0]], %[[VAL_4]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>>
+// CHECK-DAG:       %[[VAL_8:.*]] = tensor.dim %[[VAL_1]], %[[VAL_4]] : tensor<?x?xf64>
+// CHECK-DAG:       %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK-DAG:       %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xindex>
+// CHECK-DAG:       %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<?x?xf64, #sparse_tensor.encoding<{{{.*}}}>> to memref<?xf64>
+// CHECK:           %[[VAL_12:.*]] = gpu.wait async
+// CHECK:           %[[VAL_13:.*]] = memref.dim %[[VAL_9]], %[[VAL_3]] : memref<?xindex>
+// CHECK:           %[[VAL_14:.*]], %[[VAL_15:.*]] = gpu.alloc async {{\[}}%[[VAL_12]]] (%[[VAL_13]]) : memref<?xindex>
+// CHECK:           %[[VAL_16:.*]] = gpu.memcpy async {{\[}}%[[VAL_15]]] %[[VAL_14]], %[[VAL_9]] : memref<?xindex>, memref<?xindex>
+// CHECK:           %[[VAL_17:.*]] = gpu.wait async
+// CHECK:           %[[VAL_18:.*]] = memref.dim %[[VAL_10]], %[[VAL_3]] : memref<?xindex>
+// CHECK:           %[[VAL_19:.*]], %[[VAL_20:.*]] = gpu.alloc async {{\[}}%[[VAL_17]]] (%[[VAL_18]]) : memref<?xindex>
+// CHECK:           %[[VAL_21:.*]] = gpu.memcpy async {{\[}}%[[VAL_20]]] %[[VAL_19]], %[[VAL_10]] : memref<?xindex>, memref<?xindex>
+// CHECK:           %[[VAL_22:.*]] = gpu.wait async
+// CHECK:           %[[VAL_23:.*]] = memref.dim %[[VAL_11]], %[[VAL_3]] : memref<?xf64>
+// CHECK:           %[[VAL_24:.*]], %[[VAL_25:.*]] = gpu.alloc async {{\[}}%[[VAL_22]]] (%[[VAL_23]]) : memref<?xf64>
+// CHECK:           %[[VAL_26:.*]] = gpu.memcpy async {{\[}}%[[VAL_25]]] %[[VAL_24]], %[[VAL_11]] : memref<?xf64>, memref<?xf64>
+// CHECK:           %[[VAL_27:.*]] = bufferization.to_memref %[[VAL_1]] : memref<?x?xf64>
+// CHECK:           %[[VAL_28:.*]] = gpu.wait async
+// CHECK:           %[[VAL_29:.*]] = memref.dim %[[VAL_27]], %[[VAL_3]] : memref<?x?xf64>
+// CHECK:           %[[VAL_30:.*]] = memref.dim %[[VAL_27]], %[[VAL_4]] : memref<?x?xf64>
+// CHECK:           %[[VAL_31:.*]], %[[VAL_32:.*]] = gpu.alloc async {{\[}}%[[VAL_28]]] (%[[VAL_29]], %[[VAL_30]]) : memref<?x?xf64>
+// CHECK:           %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_27]] : memref<?x?xf64>, memref<?x?xf64>
+// CHECK:           %[[VAL_34:.*]] = bufferization.to_memref %[[VAL_2]] : memref<?x?xf64>
+// CHECK:           %[[VAL_35:.*]] = gpu.wait async
+// CHECK:           %[[VAL_36:.*]] = memref.dim %[[VAL_34]], %[[VAL_3]] : memref<?x?xf64>
+// CHECK:           %[[VAL_37:.*]] = memref.dim %[[VAL_34]], %[[VAL_4]] : memref<?x?xf64>
+// CHECK:           %[[VAL_38:.*]], %[[VAL_39:.*]] = gpu.alloc async {{\[}}%[[VAL_35]]] (%[[VAL_36]], %[[VAL_37]]) : memref<?x?xf64>
+// CHECK:           %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref<?x?xf64>, memref<?x?xf64>
+// CHECK:           gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
+// CHECK:           %[[VAL_41:.*]] = gpu.wait async
+// CHECK:           %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
+// CHECK:           %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK:           %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
+// CHECK:           %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
+// CHECK:           %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
+// CHECK:           %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
+// CHECK:           %[[VAL_54:.*]] = gpu.sddmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
+// CHECK:           %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
+// CHECK:           %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
+// CHECK:           %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
+// CHECK:           %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
+// CHECK:           %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref<?xindex>
+// CHECK:           %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref<?xindex>
+// CHECK:           %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref<?xf64>
+// CHECK:           %[[VAL_62:.*]] = gpu.dealloc async {{\[}}%[[VAL_61]]] %[[VAL_52]] : memref<?xi8>
+// CHECK:           %[[VAL_63:.*]] = gpu.dealloc async {{\[}}%[[VAL_62]]] %[[VAL_31]] : memref<?x?xf64>
+// CHECK:           %[[VAL_64:.*]] = gpu.memcpy async {{\[}}%[[VAL_63]]] %[[VAL_34]], %[[VAL_38]] : memref<?x?xf64>, memref<?x?xf64>
+// CHECK:           %[[VAL_65:.*]] = gpu.dealloc async {{\[}}%[[VAL_64]]] %[[VAL_38]] : memref<?x?xf64>
+// CHECK:           gpu.wait {{\[}}%[[VAL_65]]]
+// CHECK:           %[[VAL_66:.*]] = bufferization.to_tensor %[[VAL_34]] : memref<?x?xf64>
+// CHECK:           return %[[VAL_66]] : tensor<?x?xf64>
+// CHECK:         }
+func.func @sparse_sampled_dd(%args: tensor<8x8xf64, #CSR>,
+                              %arga: tensor<8x8xf64>,
+                              %argb: tensor<8x8xf64>) -> tensor<8x8xf64, #CSR> {
+  %1 = bufferization.alloc_tensor() : tensor<8x8xf64, #CSR>
+  %2 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #CSR>,
+                              tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%1: tensor<8x8xf64, #CSR>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64 
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #CSR>
+  return %2 : tensor<8x8xf64, #CSR>
+}
+}