diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -625,6 +625,122 @@
   return success();
 }
 
+/// Match and rewrite SDDMM kernel.
+static LogicalResult rewriteSDDMM(PatternRewriter &rewriter,
+                                  linalg::GenericOp op, bool enableRT) {
+  Location loc = op.getLoc();
+  Value a = op.getOperand(1);
+  Value b = op.getOperand(2);
+  Value c = op.getOperand(0); // we have C = spy(C) o (A * B)
+  SmallVector<Value> tokens;
+
+  // Only admissible sparse matrix format and dense matrices.
+  bool isCOO = false;
+  SparseTensorType aTp = getSparseTensorType(a);
+  SparseTensorType bTp = getSparseTensorType(b);
+  SparseTensorType cTp = getSparseTensorType(c);
+  if (!areAdmissibleTypes(cTp, bTp, aTp, enableRT, isCOO))
+    return failure();
+
+  // SDDMM operates on C in place. If the sparse matrix C is reused, e.g.,
+  // later on when we remove redundant memcpys, we may need to duplicate it
+  // before the operation so that other users can read the new copy instead.
+  // Start sparse kernel and copy data from host to device.
+  //   a : bufA -> matA
+  //   b : bufB -> matB
+  //   c : memR/memC/memV -> rowC,colC,valC
+  Value nseC = rewriter.create<NumberOfEntriesOp>(loc, c);
+  Value szm = linalg::createOrFoldDimOp(rewriter, loc, a, 0);
+  Value szk = linalg::createOrFoldDimOp(rewriter, loc, a, 1);
+  Value szn = linalg::createOrFoldDimOp(rewriter, loc, b, 1);
+  Value bufA = genTensorToMemref(rewriter, loc, a);
+  Value matA = genAllocCopy(rewriter, loc, bufA, tokens);
+  Value bufB = genTensorToMemref(rewriter, loc, b);
+  Value matB = genAllocCopy(rewriter, loc, bufB, tokens);
+  Value memR = genFirstPosOrCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memC = genSecondCrds(rewriter, loc, c, isCOO, enableRT);
+  Value memV = genToValues(rewriter, loc, c);
+  Value rowC = genAllocCopy(rewriter, loc, memR, tokens);
+  Value colC = memC ? genAllocCopy(rewriter, loc, memC, tokens) : Value();
+  Value valC = genAllocCopy(rewriter, loc, memV, tokens);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Create sparse environment and sparse matrix/dense matrix handles.
+  Type indexTp = rewriter.getIndexType();
+  Type envHandleTp = rewriter.getType<gpu::SparseEnvHandleType>();
+  Type dnMatHandleTp = rewriter.getType<gpu::SparseDnMatHandleType>();
+  Type spMatHandleTp = rewriter.getType<gpu::SparseSpMatHandleType>();
+  Type tokenTp = rewriter.getType<gpu::AsyncTokenType>();
+  Value token = genFirstWait(rewriter, loc);
+  auto env =
+      rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
+  Value handle = env.getResult(0);
+  token = env.getAsyncToken();
+
+  auto dmatA = rewriter.create<gpu::CreateDnMatOp>(
+      loc, dnMatHandleTp, tokenTp, handle, token, szm, szk, matA);
+  Value dnA = dmatA.getResult(0);
+  token = dmatA.getAsyncToken();
+  auto dmatB = rewriter.create<gpu::CreateDnMatOp>(
+      loc, dnMatHandleTp, tokenTp, handle, token, szk, szn, matB);
+  Value dnB = dmatB.getResult(0);
+  token = dmatB.getAsyncToken();
+
+  Operation *spGenC =
+      genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szn, nseC,
+               rowC, colC, valC, isCOO, enableRT);
+  Value spMatC = spGenC->getResult(0);
+  token = spGenC->getResult(1);
+
+  auto dnCType = llvm::cast<ShapedType>(c.getType()).getElementType();
+  // Precompute buffer size for SDDMM.
+  auto bufferComp = rewriter.create<gpu::SDDMMBufferSizeOp>(
+      loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType);
+  Value bufferSz = bufferComp.getResult(0);
+  token = bufferComp.getAsyncToken();
+  auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
+  Value buffer = buf.getResult(0);
+  token = buf.getAsyncToken();
+
+  // Perform the SDDMM.
+  auto sddmmComp = rewriter.create<gpu::SDDMMOp>(
+      loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer);
+  token = sddmmComp.getAsyncToken();
+
+  // Copy data back to host and free all the resources.
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnA)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroyDnMatOp>(loc, tokenTp, token, dnB)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySpMatOp>(loc, tokenTp, token, spMatC)
+              .getAsyncToken();
+  token = rewriter.create<gpu::DestroySparseEnvOp>(loc, tokenTp, token, handle)
+              .getAsyncToken();
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+  token = genFirstWait(rewriter, loc);
+  token = genCopyMemRef(rewriter, loc, memR, rowC, token);
+  if (memC)
+    token = genCopyMemRef(rewriter, loc, memC, colC, token);
+  token = genCopyMemRef(rewriter, loc, memV, valC, token);
+  token = genDeallocMemRef(rewriter, loc, buffer, token);
+  token = genDeallocMemRef(rewriter, loc, matA, token);
+  token = genDeallocMemRef(rewriter, loc, matB, token);
+  token = genDeallocMemRef(rewriter, loc, rowC, token);
+  if (colC)
+    token = genDeallocMemRef(rewriter, loc, colC, token);
+  token = genDeallocMemRef(rewriter, loc, valC, token);
+  tokens.push_back(token);
+  genBlockingWait(rewriter, loc, tokens);
+  tokens.clear();
+
+  // Done.
+  rewriter.replaceOp(op, op.getDpsInitOperand(0)->get());
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // Rewriting rules for direct code generation.
 //===----------------------------------------------------------------------===//
@@ -776,6 +891,17 @@
     return rewriteSpMM(rewriter, op, enableRT);
   }
 
+  // Recognize a SDDMM kernel.
+  // TODO: check numLoops == 3 && numTensors == 4 &&
+  //       maps == infer({{i, j}, {i, k}, {k, j}, {i, j}})
+  // TODO: add transposed {i, k}, {k, j}
+  // TODO: maybe add transposed {i, j} in future
+  if (linalg::isParallelIterator(iteratorTypes[0]) &&
+      linalg::isParallelIterator(iteratorTypes[1]) &&
+      linalg::isReductionIterator(iteratorTypes[2])) {
+    return rewriteSDDMM(rewriter, op, enableRT);
+  }
+
   return failure();
 }
 
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
new file
--- /dev/null
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -0,0 +1,202 @@
+// RUN: mlir-opt %s --linalg-generalize-named-ops \
+// RUN: --sparsification="enable-gpu-libgen" | FileCheck %s
+
+#trait_sampled_dense_dense = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j)>,  // S
+    affine_map<(i,j,k) -> (i,k)>,  // A
+    affine_map<(i,j,k) -> (k,j)>,  // B
+    affine_map<(i,j,k) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+}
+
+#trait_vec_op = {
+  indexing_maps = [
+    affine_map<(i,j) -> (i,j)>,  // a (in)
+    affine_map<(i,j) -> (i,j)>,  // b (in)
+    affine_map<(i,j) -> (i,j)>   // x (out)
+  ],
+  iterator_types = ["parallel", "parallel"]
+}
+
+#CSR = #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>
+
+#SortedCOO = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed-nu", "singleton" ]
+}>
+
+module {
+
+// CHECK-LABEL: func.func @sparse_sampled_dd(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<8x8xf64>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<8x8xf64>) -> tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> {
+// CHECK: %[[VAL_3:.*]] = arith.constant 8 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant false
+// CHECK: %[[VAL_7:.*]] = arith.constant true
+// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>
+// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
+// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_16:.*]] = %[[VAL_8]]) -> (tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) {
+// CHECK: %[[VAL_17:.*]], %[[VAL_18:.*]], %[[VAL_19:.*]], %[[VAL_20:.*]] = sparse_tensor.expand %[[VAL_8]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>, memref<?xi1>, memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (index) {
+// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_15]], %[[VAL_22]]] : memref<8x8xf64>
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_15]], %[[VAL_5]] : index
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_26]]] : memref<?xindex>
+// CHECK: %[[VAL_28:.*]] = scf.for %[[VAL_29:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_5]] iter_args(%[[VAL_30:.*]] = %[[VAL_23]]) -> (index) {
+// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_29]]] : memref<?xf64>
+// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<8x8xf64>
+// CHECK: %[[VAL_35:.*]] = arith.mulf %[[VAL_24]], %[[VAL_34]] : f64
+// CHECK: %[[VAL_36:.*]] = arith.mulf %[[VAL_33]], %[[VAL_35]] : f64
+// CHECK: %[[VAL_37:.*]] = arith.addf %[[VAL_32]], %[[VAL_36]] : f64
+// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_38]], %[[VAL_6]] : i1
+// CHECK: %[[VAL_40:.*]] = scf.if %[[VAL_39]] -> (index) {
+// CHECK: memref.store %[[VAL_7]], %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: memref.store %[[VAL_31]], %[[VAL_19]]{{\[}}%[[VAL_30]]] : memref<?xindex>
+// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_30]], %[[VAL_5]] : index
+// CHECK: scf.yield %[[VAL_41]] : index
+// CHECK: } else {
+// CHECK: scf.yield %[[VAL_30]] : index
+// CHECK: }
+// CHECK: memref.store %[[VAL_37]], %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: scf.yield %[[VAL_42:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: scf.yield %[[VAL_43:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_44:.*]] = sparse_tensor.compress %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_45:.*]] into %[[VAL_16]]{{\[}}%[[VAL_15]]] : memref<?xf64>, memref<?xi1>, memref<?xindex>, tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: scf.yield %[[VAL_44]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_46:.*]] = sparse_tensor.load %[[VAL_47:.*]] hasInserts : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: return %[[VAL_46]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: }
+//
+// A kernel that computes a direct sampled matrix-matrix multiplication
+// (with sparse result).
+// Compute SDDMM C = C\spy AB
+// (in the CHECK lines above, VAL_0 is the sparse matrix C)
+func.func @sparse_sampled_dd(%args: tensor<8x8xf64, #SortedCOO>, %arga: tensor<8x8xf64>, %argb: tensor<8x8xf64>) -> tensor<8x8xf64, #SortedCOO> {
+  %1 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %2 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #SortedCOO>,
+                             tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%1: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+  return %2 : tensor<8x8xf64, #SortedCOO>
+}
+
+// CHECK-LABEL: func.func @sparse_sampled_dd_with_reuse(
+// CHECK-SAME: %[[VAL_0:.*]]: tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>,
+// CHECK-SAME: %[[VAL_1:.*]]: tensor<8x8xf64>,
+// CHECK-SAME: %[[VAL_2:.*]]: tensor<8x8xf64>) -> tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> {
+// CHECK: %[[VAL_3:.*]] = arith.constant 8 : index
+// CHECK: %[[VAL_4:.*]] = arith.constant 0 : index
+// CHECK: %[[VAL_5:.*]] = arith.constant 1 : index
+// CHECK: %[[VAL_6:.*]] = arith.constant false
+// CHECK: %[[VAL_7:.*]] = arith.constant true
+// CHECK: %[[VAL_8:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_9:.*]] = sparse_tensor.positions %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_10:.*]] = sparse_tensor.coordinates %[[VAL_0]] {level = 1 : index} : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xindex>
+// CHECK: %[[VAL_11:.*]] = sparse_tensor.values %[[VAL_0]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>
+// CHECK: %[[VAL_12:.*]] = bufferization.to_memref %[[VAL_1]] : memref<8x8xf64>
+// CHECK: %[[VAL_13:.*]] = bufferization.to_memref %[[VAL_2]] : memref<8x8xf64>
+// CHECK: %[[VAL_14:.*]] = scf.for %[[VAL_15:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_16:.*]] = %[[VAL_8]]) -> (tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>) {
+// CHECK: %[[VAL_17:.*]], %[[VAL_18:.*]], %[[VAL_19:.*]], %[[VAL_20:.*]] = sparse_tensor.expand %[[VAL_8]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>> to memref<?xf64>, memref<?xi1>, memref<?xindex>
+// CHECK: %[[VAL_21:.*]] = scf.for %[[VAL_22:.*]] = %[[VAL_4]] to %[[VAL_3]] step %[[VAL_5]] iter_args(%[[VAL_23:.*]] = %[[VAL_20]]) -> (index) {
+// CHECK: %[[VAL_24:.*]] = memref.load %[[VAL_12]]{{\[}}%[[VAL_15]], %[[VAL_22]]] : memref<8x8xf64>
+// CHECK: %[[VAL_25:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_15]]] : memref<?xindex>
+// CHECK: %[[VAL_26:.*]] = arith.addi %[[VAL_15]], %[[VAL_5]] : index
+// CHECK: %[[VAL_27:.*]] = memref.load %[[VAL_9]]{{\[}}%[[VAL_26]]] : memref<?xindex>
+// CHECK: %[[VAL_28:.*]] = scf.for %[[VAL_29:.*]] = %[[VAL_25]] to %[[VAL_27]] step %[[VAL_5]] iter_args(%[[VAL_30:.*]] = %[[VAL_23]]) -> (index) {
+// CHECK: %[[VAL_31:.*]] = memref.load %[[VAL_10]]{{\[}}%[[VAL_29]]] : memref<?xindex>
+// CHECK: %[[VAL_32:.*]] = memref.load %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: %[[VAL_33:.*]] = memref.load %[[VAL_11]]{{\[}}%[[VAL_29]]] : memref<?xf64>
+// CHECK: %[[VAL_34:.*]] = memref.load %[[VAL_13]]{{\[}}%[[VAL_22]], %[[VAL_31]]] : memref<8x8xf64>
+// CHECK: %[[VAL_35:.*]] = arith.mulf %[[VAL_24]], %[[VAL_34]] : f64
+// CHECK: %[[VAL_36:.*]] = arith.mulf %[[VAL_33]], %[[VAL_35]] : f64
+// CHECK: %[[VAL_37:.*]] = arith.addf %[[VAL_32]], %[[VAL_36]] : f64
+// CHECK: %[[VAL_38:.*]] = memref.load %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: %[[VAL_39:.*]] = arith.cmpi eq, %[[VAL_38]], %[[VAL_6]] : i1
+// CHECK: %[[VAL_40:.*]] = scf.if %[[VAL_39]] -> (index) {
+// CHECK: memref.store %[[VAL_7]], %[[VAL_18]]{{\[}}%[[VAL_31]]] : memref<?xi1>
+// CHECK: memref.store %[[VAL_31]], %[[VAL_19]]{{\[}}%[[VAL_30]]] : memref<?xindex>
+// CHECK: %[[VAL_41:.*]] = arith.addi %[[VAL_30]], %[[VAL_5]] : index
+// CHECK: scf.yield %[[VAL_41]] : index
+// CHECK: } else {
+// CHECK: scf.yield %[[VAL_30]] : index
+// CHECK: }
+// CHECK: memref.store %[[VAL_37]], %[[VAL_17]]{{\[}}%[[VAL_31]]] : memref<?xf64>
+// CHECK: scf.yield %[[VAL_42:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: scf.yield %[[VAL_43:.*]] : index
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_44:.*]] = sparse_tensor.compress %[[VAL_17]], %[[VAL_18]], %[[VAL_19]], %[[VAL_45:.*]] into %[[VAL_16]]{{\[}}%[[VAL_15]]] : memref<?xf64>, memref<?xi1>, memref<?xindex>, tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: scf.yield %[[VAL_44]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: } {"Emitted from" = "linalg.generic"}
+// CHECK: %[[VAL_46:.*]] = sparse_tensor.load %[[VAL_47:.*]] hasInserts : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_48:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: %[[VAL_49:.*]] = bufferization.alloc_tensor() : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: return %[[VAL_46]] : tensor<8x8xf64, #sparse_tensor.encoding<{ lvlTypes = [ "dense", "compressed" ] }>>
+// CHECK: }
+func.func @sparse_sampled_dd_with_reuse(%args: tensor<8x8xf64, #SortedCOO>, %arga: tensor<8x8xf64>, %argb: tensor<8x8xf64>) -> tensor<8x8xf64, #SortedCOO> {
+  %1 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %2 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #SortedCOO>,
+                             tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%1: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+
+  // Reuse the input by doing the computation again.
+  %3 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %4 = linalg.generic #trait_sampled_dense_dense
+    ins(%args, %arga, %argb: tensor<8x8xf64, #SortedCOO>,
+                             tensor<8x8xf64>, tensor<8x8xf64>)
+    outs(%3: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%s: f64, %a: f64, %b: f64, %x: f64):
+        %p = arith.mulf %a, %b : f64
+        %q = arith.mulf %s, %p : f64
+        %r = arith.addf %x, %q : f64
+        linalg.yield %r : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+
+  // Elementwise-min operation (its result is currently unused).
+  %5 = bufferization.alloc_tensor() : tensor<8x8xf64, #SortedCOO>
+  %6 = linalg.generic #trait_vec_op
+    ins(%2, %4: tensor<8x8xf64, #SortedCOO>, tensor<8x8xf64, #SortedCOO>)
+    outs(%5: tensor<8x8xf64, #SortedCOO>) {
+      ^bb(%a: f64, %b: f64, %x: f64):
+        %d0 = sparse_tensor.binary %a, %b : f64, f64 to f64
+          overlap={
+            ^bb0(%a0: f64, %b0: f64):
+              %c0 = arith.minf %a0, %b0: f64
+              sparse_tensor.yield %c0 : f64
+          }
+          left=identity
+          right=identity
+        linalg.yield %d0 : f64
+  } -> tensor<8x8xf64, #SortedCOO>
+
+  return %2 : tensor<8x8xf64, #SortedCOO>
+}
+
+}
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
new file
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sparse-sampled-matmul-lib.mlir
@@ -0,0 +1,119 @@
+//
+// NOTE: this test requires gpu-sm80
+//
+//
+// RUN: mlir-opt %s \
+// RUN: --sparse-compiler="enable-runtime-library=true enable-gpu-libgen gpu-triple=nvptx64-nvidia-cuda gpu-chip=sm_80 gpu-features=+ptx71" \
+// RUN: | mlir-cpu-runner \
+// RUN: --shared-libs=%mlir_cuda_runtime \
+// RUN: --shared-libs=%mlir_c_runner_utils \
+// RUN: --e entry --entry-point-result=void \
+// RUN: | FileCheck %s
+//
+
+!Filename = !llvm.ptr
+
+#SparseMatrix = #sparse_tensor.encoding<{
+  lvlTypes = [ "compressed", "compressed" ],
+  posWidth = 32,
+  crdWidth = 32
+}>
+
+#trait_sampled_dense_dense = {
+  indexing_maps = [
+    affine_map<(i,j,k) -> (i,j)>,  // S
+    affine_map<(i,j,k) -> (i,k)>,  // A
+    affine_map<(i,j,k) -> (k,j)>,  // B
+    affine_map<(i,j,k) -> (i,j)>   // X (out)
+  ],
+  iterator_types = ["parallel", "parallel", "reduction"],
+  doc = "X(i,j) += S(i,j) SUM_k A(i,k) B(k,j)"
+}
+
+//
+// Integration test that lowers a kernel annotated as sparse to
+// actual sparse code, initializes a matching sparse storage scheme
+// from file, and runs the resulting code with the JIT compiler.
+//
+module {
+  //
+  // A kernel that computes a sampled matrix-matrix multiplication.
+  //
+  func.func @sampled_dense_dense(%args: tensor<?x?xf32, #SparseMatrix>,
+                                 %arga: tensor<?x?xf32>,
+                                 %argb: tensor<?x?xf32>,
+                                 %argx: tensor<?x?xf32>) -> tensor<?x?xf32> {
+    %0 = linalg.generic #trait_sampled_dense_dense
+      ins(%args, %arga, %argb: tensor<?x?xf32, #SparseMatrix>, tensor<?x?xf32>, tensor<?x?xf32>)
+      outs(%argx: tensor<?x?xf32>) {
+        ^bb(%s: f32, %a: f32, %b: f32, %x: f32):
+          %0 = arith.mulf %a, %b : f32
+          %1 = arith.mulf %s, %0 : f32
+          %2 = arith.addf %x, %1 : f32
+          linalg.yield %2 : f32
+    } -> tensor<?x?xf32>
+    return %0 : tensor<?x?xf32>
+  }
+
+  func.func private @getTensorFilename(index) -> (!Filename)
+
+  //
+  // Main driver that reads matrix from file and calls the sparse kernel.
+  //
+  func.func @entry() {
+    %d0 = arith.constant 0.0 : f32
+    %c0 = arith.constant 0 : index
+    %c1 = arith.constant 1 : index
+    %c5 = arith.constant 5 : index
+    %c10 = arith.constant 10 : index
+
+    // Initialize dense matrices.
+    %x = tensor.generate %c5, %c5 {
+    ^bb0(%i : index, %j : index):
+      tensor.yield %d0 : f32
+    } : tensor<?x?xf32>
+
+    %a = tensor.generate %c5, %c10 {
+    ^bb0(%i: index, %j: index):
+      %p = arith.addi %i, %c1 : index
+      %q = arith.index_cast %p : index to i32
+      %d = arith.sitofp %q : i32 to f32
+      tensor.yield %d : f32
+    } : tensor<?x?xf32>
+
+    %b = tensor.generate %c10, %c5 {
+    ^bb0(%i: index, %j: index):
+      %p = arith.addi %j, %c1 : index
+      %q = arith.index_cast %p : index to i32
+      %d = arith.sitofp %q : i32 to f32
+      tensor.yield %d : f32
+    } : tensor<?x?xf32>
+
+    // Read the sparse matrix from file, construct sparse storage.
+    %fileName = call @getTensorFilename(%c0) : (index) -> (!Filename)
+    %s = sparse_tensor.new %fileName : !Filename to tensor<?x?xf32, #SparseMatrix>
+
+    // Call the kernel.
+    %0 = call @sampled_dense_dense(%s, %a, %b, %x)
+       : (tensor<?x?xf32, #SparseMatrix>,
+          tensor<?x?xf32>, tensor<?x?xf32>, tensor<?x?xf32>) -> tensor<?x?xf32>
+
+    // Print the result for verification.
+    //
+    // CHECK: ( 10, 0, 0, 56, 0 )
+    // CHECK: ( 0, 80, 0, 0, 250 )
+    // CHECK: ( 0, 0, 270, 0, 0 )
+    // CHECK: ( 164, 0, 0, 640, 0 )
+    // CHECK: ( 0, 520, 0, 0, 1250 )
+    //
+    scf.for %i = %c0 to %c5 step %c1 {
+      %v = vector.transfer_read %0[%i, %c0], %d0: tensor<?x?xf32>, vector<5xf32>
+      vector.print %v : vector<5xf32>
+    }
+
+    // Release the resources.
+    bufferization.dealloc_tensor %s : tensor<?x?xf32, #SparseMatrix>
+
+    return
+  }
+}
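
Reviewer note (not part of the patch): the guard that dispatches to rewriteSDDMM only inspects the iterator kinds of the linalg.generic. The following is a minimal standalone C++ sketch of that same check, useful for experimenting with the matching logic in isolation; the helper name looksLikeSDDMM and its placement are assumptions, not code from this change, and it relies only on the public Linalg utilities already used in the patch.

#include "mlir/Dialect/Linalg/IR/Linalg.h"
#include "mlir/Dialect/Linalg/Utils/Utils.h"

using namespace mlir;

// Returns true if `op` has the SDDMM iteration shape used as the dispatch
// condition above: two parallel dimensions (i, j) over the sampling matrix
// and one reduction dimension (k) over the shared inner dimension.
static bool looksLikeSDDMM(linalg::GenericOp op) {
  SmallVector<utils::IteratorType> iteratorTypes = op.getIteratorTypesArray();
  return iteratorTypes.size() == 3 &&
         linalg::isParallelIterator(iteratorTypes[0]) &&
         linalg::isParallelIterator(iteratorTypes[1]) &&
         linalg::isReductionIterator(iteratorTypes[2]);
}

As the TODOs in the patch note, a production-quality match would additionally verify the number of tensors and the indexing maps ({i, j}, {i, k}, {k, j}, {i, j}); this sketch deliberately mirrors only what the current guard checks.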