diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -2111,4 +2111,131 @@
   }];
 }
 
+def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]> {
+  let summary = "Precompute buffer size for SDDMM operation";
+  let description = [{
+    The `gpu.sddmm_buffer_size` operation returns the buffer size required
+    to perform the SDDMM operation on the given sparse and dense matrices.
+    The operation expects handles returned by previous sparse operations
+    to construct an environment and the operands for SDDMM.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer size.
+
+    Example:
+
+    ```mlir
+    %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
+    ```
+
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseEnvHandle:$env,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
+                       GPU_SparseDnMatHandle:$dnmatA,
+                       GPU_SparseDnMatHandle:$dnmatB,
+                       GPU_SparseSpMatHandle:$spmatC,
+                       TypeAttr:$computeType);
+  let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);
+
+  let builders = [OpBuilder<(ins
+      "Type":$bufferSz,
+      "Type":$asyncToken,
+      "ValueRange":$asyncDependencies,
+      "Value":$env,
+      "Value":$dnmatA,
+      "Value":$dnmatB,
+      "Value":$spmatC,
+      "Type":$computeType), [{
+    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
+                 env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]>
+  ];
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType
+  }];
+}
+
+// TODO: cusparseSpGEMM_createDescr, cusparseSpGEMM_destroyDescr,
+// cusparseSpGEMM_workEstimation, cusparseSpGEMM_estimateMemory,
+// cusparseSpGEMM_compute, cusparseSpMatGetSize, cusparseSpGEMM_copy
+def GPU_SpGEMMCreateDescrOp : GPU_Op<"spgemm_create_descr"> {
+  let summary = "SpGEMM create descriptor operation";
+  let description = [{
+    The `gpu.spgemm_create_descr` operation creates a descriptor for the
+    SpGEMM computation. The descriptor holds the internal state of the
+    computation and is passed to the subsequent SpGEMM operations.
+
+    Example:
+
+    ```mlir
+    %descriptor = gpu.spgemm_create_descr
+    ```
+  }];
+
+  let results = (outs GPU_SpGEMMDescriptor:$desc);
+}
+
+// def GPU_SpGEMMOp : GPU_Op<"spgemm_compute", [GPU_AsyncOpInterface]> {
+//   let summary = "SpGEMM operation";
+//   let description = [{
+//     The `gpu.spgemm` operation performs the SpGEMM operation on the given
+//     sparse matrices and buffer. The operation expects handles returned by
+//     previous sparse operations to construct an environment and the operands
+//     for SpGEMM. The buffer must have been allocated on the device.
+//
+//     C' = alpha * op(A) * op(B) + beta * C
+//
+//     If the `async` keyword is present, the op is executed asynchronously
+//     (i.e. it does not block until the execution has finished on the
+//     device). In that case, it returns a !gpu.async.token.
+//
+//     Example:
+//
+//     ```mlir
+//     %token = gpu.spgemm async [%dep] %spmatA{TRANSPOSE}, %spmatB{TRANSPOSE}, %spmatC, %buffer into f32
+//     ```
+//
+//     The matrix arguments can also be associated with one of the following
+//     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default
+//     value is NON_TRANSPOSE.
+//   }];
+//
+//   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+//                        GPU_TransposeModeAttr:$modeA,
+//                        GPU_TransposeModeAttr:$modeB,
+//                        GPU_SparseSpMatHandle:$spmatA,
+//                        GPU_SparseSpMatHandle:$spmatB,
+//                        GPU_SparseSpMatHandle:$spmatC,
+//                        TypeAttr:$computeType,
+//                        AnyMemRef:$buffer);
+//   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+//
+//   let builders = [OpBuilder<(ins
+//       "Type":$asyncToken,
+//       "ValueRange":$asyncDependencies,
+//       "Value":$spmatA,
+//       "Value":$spmatB,
+//       "Value":$spmatC,
+//       "Type":$computeType,
+//       "Value":$buffer), [{
+//     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+//     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+//     return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
+//                  modeB, spmatA, spmatB, spmatC, computeType, buffer);}]>
+//   ];
+//
+//   let assemblyFormat = [{
+//     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+//     $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType
+//   }];
+// }
+
 #endif // GPU_OPS
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -478,6 +478,59 @@
       CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
 
+// TODO: add support for passing alpha and beta as arguments
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpGEMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b,
+                     void *c, int32_t ctp, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpGEMMDescr_t spgemmDesc;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  size_t bufferSize = 0;
+  // Ask how many bytes of external memory the work estimation needs.
+  // TODO: the descriptor created above currently leaks; it must be shared
+  // with the compute step instead of each wrapper creating its own.
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
+      handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize, nullptr))
+  return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
+}
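+
+// A runtime counterpart for the new `gpu.spgemm_create_descr` op is still
+// missing. A minimal sketch of what such wrappers could look like, following
+// this file's mgpu* conventions (the mgpuSpGEMMCreateDescr /
+// mgpuSpGEMMDestroyDescr names are illustrative, not part of this patch):
+//
+//   extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+//   mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
+//     cusparseSpGEMMDescr_t spgemmDesc = nullptr;
+//     CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+//     return reinterpret_cast<void *>(spgemmDesc);
+//   }
+//
+//   extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+//   mgpuSpGEMMDestroyDescr(void *d, CUstream /*stream*/) {
+//     auto spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(d);
+//     CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
+//   }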
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpGEMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+           int32_t ctp, void *buf, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpGEMMDescr_t spgemmDesc;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  // Run the computation with the caller-provided external buffer.
+  // TODO: reuse the descriptor from the work-estimation step and thread the
+  // actual size of `buf` through to this call instead of a fresh query.
+  size_t bufferSize = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
+      handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize, buf))
+}
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSELT
 ///
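Note on the remaining work: the TODO at the top of the GPUOps.td hunk lists the full set of cuSPARSE entry points. As a reference for wiring them up, here is a sketch of the complete two-phase SpGEMM sequence, modeled on the cuSPARSE documentation; it assumes the `handle`/`matA`/`matB`/`matC`/`modeA`/`modeB`/`cTp`/`alphap`/`betap` locals are unpacked as in the wrappers above, and the `dRowPtrC`/`dColIndC`/`dValuesC` output arrays are placeholders the caller would allocate:

```c++
// Sketch only: full cuSPARSE SpGEMM flow (work estimation, compute,
// size query, copy), reusing this file's CUSPARSE_REPORT_IF_ERROR and
// CUDA_REPORT_IF_ERROR helpers.
size_t bufSz1 = 0, bufSz2 = 0;
CUdeviceptr buf1 = 0, buf2 = 0;
cusparseSpGEMMDescr_t spgemmDesc;
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))

// Phase 1: query the work-estimation buffer size, allocate, then run it.
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz1, nullptr))
CUDA_REPORT_IF_ERROR(cuMemAlloc(&buf1, bufSz1))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz1,
    reinterpret_cast<void *>(buf1)))

// Phase 2: same query/allocate/run pattern for the actual computation.
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz2, nullptr))
CUDA_REPORT_IF_ERROR(cuMemAlloc(&buf2, bufSz2))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz2,
    reinterpret_cast<void *>(buf2)))

// Read back the size of C, point matC at caller-allocated CSR arrays,
// and copy the result out of the descriptor.
int64_t rowsC, colsC, nnzC;
CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matC, &rowsC, &colsC, &nnzC))
// Placeholders: real code allocates rowsC + 1 offsets and nnzC entries.
void *dRowPtrC = nullptr, *dColIndC = nullptr, *dValuesC = nullptr;
CUSPARSE_REPORT_IF_ERROR(
    cusparseCsrSetPointers(matC, dRowPtrC, dColIndC, dValuesC))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_copy(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
```

The key design consequence for the ops above is that the descriptor and both buffers must stay alive from work estimation through the final copy, which is why `gpu.spgemm_create_descr` returns a handle instead of each wrapper creating its own descriptor.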