diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -287,11 +287,11 @@
       {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType,
        llvmPointerType /* void *stream */}};
-  FunctionCallBuilder AssertSparseLTEnvHandleSizeCallBuilder = {
-      "mgpuAssertSparseLTEnvHandleSize", llvmVoidType, {}};
-  FunctionCallBuilder AssertSparseLTSpMatHandleSizeCallBuilder = {
-      "mgpuAssertSparseLTSpMatHandleSize", llvmVoidType, {}};
-  FunctionCallBuilder AssertSparseLTDnMatHandleSizeCallBuilder = {
+  FunctionCallBuilder AssertSparseLtEnvHandleSizeCallBuilder = {
+      "mgpuAssertSparseLtEnvHandleSize", llvmVoidType, {}};
+  FunctionCallBuilder AssertSparseLtSpMatHandleSizeCallBuilder = {
+      "mgpuAssertSparseLtSpMatHandleSize", llvmVoidType, {}};
+  FunctionCallBuilder AssertSparseLtDnMatHandleSizeCallBuilder = {
       "mgpuAssertSparseLtDnMatHandleSize", llvmVoidType, {}};
   FunctionCallBuilder createSparseLtEnvCallBuilder = {
       "mgpuCreateSparseLtEnv",
@@ -322,7 +322,8 @@
   FunctionCallBuilder cuSparseLtSpmmBufferSizeBuilder = {
       "mgpuCuSparseLtSpMMBufferSize",
       llvmVoidType,
-      {llvmPointerType, llvmPointerType, llvmPointerType,
+      {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
+       llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type,
        llvmPointerType /*void *stream*/}};
   FunctionCallBuilder cuSparseLtSpmmBuilder = {
       "mgpuCuSparseLtSpMM",
@@ -1437,7 +1438,7 @@
   Value handle;
   if (isSpMMCusparseLtOp(op.getEnv())) {
     // Assert the size is 11024 bytes
-    AssertSparseLTEnvHandleSizeCallBuilder.create(loc, rewriter, {});
+    AssertSparseLtEnvHandleSizeCallBuilder.create(loc, rewriter, {});
     auto handleSz = rewriter.create<LLVM::ConstantOp>(
         loc, getIndexType(), rewriter.getIndexAttr(11024));
     handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
@@ -1532,7 +1533,7 @@
   Value handle;
   if (isSpMMCusparseLtOp(op.getDmat())) {
     auto envHandle = adaptor.getEnv();
-    AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
+    AssertSparseLtDnMatHandleSizeCallBuilder.create(loc, rewriter, {});
     auto handleSz = rewriter.create<LLVM::ConstantOp>(
         loc, getIndexType(), rewriter.getIndexAttr(11032));
     handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
@@ -1695,7 +1696,7 @@
   auto dtp = genConstInt32From(rewriter, loc, getCuSparseLtDataTypeFrom(dType));
   auto envHandle = adaptor.getEnv();
 
-  AssertSparseLTSpMatHandleSizeCallBuilder.create(loc, rewriter, {});
+  AssertSparseLtSpMatHandleSizeCallBuilder.create(loc, rewriter, {});
   auto handleSz = rewriter.create<LLVM::ConstantOp>(
       loc, getIndexType(), rewriter.getIndexAttr(44104));
   Value handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType,
@@ -1785,10 +1786,11 @@
   auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA());
   auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB());
   auto stream = adaptor.getAsyncDependencies().front();
-  auto computeType =
-      genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType());
   Value bufferSize;
   if (is2To4Sparsity(op.getSpmatA())) {
+
+    auto computeType = genConstInt32From(
+        rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType()));
     auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
                                                    rewriter.getIndexAttr(3));
     bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType,
@@ -1798,10 +1800,14 @@
 
     cuSparseLtSpmmBufferSizeBuilder
         .create(loc, rewriter,
-                {bufferSize, adaptor.getEnv(), adaptor.getSpmatA(), stream})
+                {bufferSize, adaptor.getEnv(), modeA, modeB,
+                 adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(),
+                 computeType, stream})
         .getResult();
     rewriter.replaceOp(op, {bufferSize, stream});
   } else {
+    auto computeType =
+        genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType());
     bufferSize = spMMBufferSizeCallBuilder
                      .create(loc, rewriter,
                              {adaptor.getEnv(), modeA, modeB,
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -482,7 +482,7 @@
   void *values{nullptr};
 };
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLTEnvHandleSize() {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtEnvHandleSize() {
   assert(sizeof(cusparseLtHandle_t) == 11024);
 }
 
@@ -490,11 +490,11 @@
   return assert(sizeof(cusparseLtSpMatHandleAndData) == 44104);
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSparseLtDnMatHandleSize() {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtDnMatHandleSize() {
   return assert(sizeof(cusparseLtDnMatHandleAndData) == 11032);
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) {
   // note that cuSparseLt still uses cusparseStatus_t
   CUSPARSE_REPORT_IF_ERROR(
@@ -510,15 +510,14 @@
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols,
-                          void *values, int32_t dw, CUstream /*stream*/) {
-  cusparseLtMatDescriptor_t mat;
+                          void *values, int32_t dtp, CUstream /*stream*/) {
   auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
   auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh);
-  cudaDataType_t dtp = dataTp(dw);
+  auto dTp = static_cast<cudaDataType_t>(dtp);
   // assuming row-major when deciding lda
   CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
-      handle, &(dh->mat), rows, cols, /*lda=*/cols,
-      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW))
+      handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
+      /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW))
   dnmat_handle->values = values;
 }
 
@@ -526,56 +525,65 @@
 // cusparseLt
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
-  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData>(m);
+  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) {
-  auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData>(m);
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat)))
+  auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols,
-                              void *values, int32_t dw, CUstream /*stream*/) {
+                              void *values, int32_t dtp, CUstream /*stream*/) {
   auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh);
   spmat_handle->values = values;
   auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
-  cudaDataType_t dtp = dataTp_cusparseLt(dw);
+  auto dTp = static_cast<cudaDataType_t>(dtp);
   // assuming row-major when deciding lda
   CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
-      handle, &(sh->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, dtp,
-      CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
+      handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16,
+      dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT))
 }
 
 // Several things are being done in this stage, algorithm selection, planning,
 // and returning workspace and compressed matrices data buffer sizes.
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCuSparseLtSpMMBufferSize(void *workspace_size, void *compressed_size,
-                             void *compressed_buffer_size, void *h, void *a,
+mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb,
+                             void *a, void *b, void *c, int32_t ctp,
                              CUstream /*stream*/) {
   // TODO: support more advanced settings, e.g., the input right operand is a
   // sparse matrix assuming matA is the sparse matrix
   auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
   auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  auto workspace_size = reinterpret_cast<size_t *>(bs); // bs[0]
+  auto compressed_size = workspace_size + 1;            // bs[1]
+  auto compressed_buffer_size = workspace_size + 2;     // bs[2]
+  auto cTp = static_cast<cusparseComputeType>(ctp);
 
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  // The matmul descriptor must be initialized before algorithm selection.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
+      handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat),
+      &(matC->mat), &(matC->mat), cTp))
-  CHECK_CUSPARSE(cusparseLtMatmulAlgSelectionInit(
-      handle, &(matWithData.alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
+      handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT))
   int alg = 0;
-  CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute(
-      handle, &(matWithData.alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
+      handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
       sizeof(alg)))
-  // TODO: add transpose support
-  CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit(
-      handle, &(matA.matmul), c, CUSPARSE_OPERATION_NON_TRANSPOSE, &(matA->mat),
-      &matB, &matC, &matC, compute_type))
-  CHECK_CUSPARSE(cusparseLtMatmulPlanInit(handle, &(matWithData.plan), &matmul,
-                                          &(matWithData.alg_sel)))
-
-  CHECK_CUSPARSE(
-      cusparseLtMatmulGetWorkspace(handle, &(matA.plan), workspace_size))
-  CHECK_CUSPARSE(cusparseLtSpMMACompressedSize(
-      handle, &(matA.plan), compressed_size, compressed_buffer_size))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
+      handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
+
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseLtMatmulGetWorkspace(handle, &(matA->plan), workspace_size))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
+      handle, &(matA->plan), compressed_size, compressed_buffer_size))
 
   // avoid zero-alloc
   *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size);
@@ -586,34 +594,31 @@
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuCuSparseLtSpMM(void *alg_sel, void *plan, void *matmul, void *h, void *a,
-                   void *b, void *c, int32_t dw, void *buf, void *dA_compressed,
+mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, int32_t ctp,
+                   void *d_workspace, void *dA_compressed,
                    void *dA_compressedBuffer, CUstream stream) {
   auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
   auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
   auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
   auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
 
-  cusparseLtMatmulAlgSelection_t alg_sel;
-  cusparseLtMatmulPlan_t plan;
-  cusparseLtMatmulDescriptor_t matmul;
-
-  ALPHABETA(dw, alpha, beta)
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
 
-  CHECK_CUSPARSE(cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values),
-                                         dA_compressed, dA_compressedBuffer,
-                                         stream))
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseLtSpMMACompress(handle, &(matA->plan), matA->values,
+                              dA_compressed, dA_compressedBuffer, stream))
 
   // TODO: add support to multi-stream execution
   // Perform the matrix multiplication. D = A*B+C using C==D for now
-  CHECK_CUSPARSE(
-      cusparseLtMatmul(handle, reinterpret_cast<cusparseLtMatmulPlan_t *>(plan),
-                       &alpha, dA_compressed, dB, &beta, matC->values,
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed,
+                       matB->values, betap, matC->values,
                        /*dD*/ matC->values, d_workspace, &stream, 1))
 
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat)))
   // destroy the plan associated with the sparse matrix
-  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(mat->plan)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan)))
 }
 
 #endif // MLIR_ENABLE_CUDA_CUSPARSELT