diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -287,11 +287,11 @@ {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder AssertSparseLTEnvHandleSizeCallBuilder = { - "mgpuAssertSparseLTEnvHandleSize", llvmVoidType, {}}; - FunctionCallBuilder AssertSparseLTSpMatHandleSizeCallBuilder = { - "mgpuAssertSparseLTSpMatHandleSize", llvmVoidType, {}}; - FunctionCallBuilder AssertSparseLTDnMatHandleSizeCallBuilder = { + FunctionCallBuilder AssertSparseLtEnvHandleSizeCallBuilder = { + "mgpuAssertSparseLtEnvHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLtSpMatHandleSizeCallBuilder = { + "mgpuAssertSparseLtSpMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLtDnMatHandleSizeCallBuilder = { "mgpuAssertSparseLtDnMatHandleSize", llvmVoidType, {}}; FunctionCallBuilder createSparseLtEnvCallBuilder = { "mgpuCreateSparseLtEnv", @@ -322,13 +322,14 @@ FunctionCallBuilder cuSparseLtSpmmBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmPointerType, + {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder cuSparseLtSpmmBuilder = { "mgpuCuSparseLtSpMM", llvmVoidType, {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, - llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}}; }; @@ -1417,13 +1418,6 @@ static_cast<int32_t>(TValue)); } -static Value genConstInt32FromComputeMode(OpBuilder &builder, Location loc, Type computeType) { 
- auto computeTypeInt = getCuSparseDataTypeFrom(computeType); - auto computeTypeConst = genConstInt32From(builder, loc, computeTypeInt); - return computeTypeConst; - } - LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::CreateSparseEnvOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1437,7 +1431,7 @@ Value handle; if (isSpMMCusparseLtOp(op.getEnv())) { // Assert the size is 11024 bytes - AssertSparseLTEnvHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtEnvHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(11024)); handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1532,7 +1526,7 @@ Value handle; if (isSpMMCusparseLtOp(op.getDmat())) { auto envHandle = adaptor.getEnv(); - AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1692,10 +1686,10 @@ pMat = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pMat); Type dType = llvm::cast<MemRefType>(op.getMemref().getType()).getElementType(); - auto dtp = genConstInt32From(rewriter, loc, getCuSparseLtDataTypeFrom(dType)); + auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); auto envHandle = adaptor.getEnv(); - AssertSparseLTSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(44104)); Value handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1739,8 +1733,8 @@ return failure(); Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, op.getModeA()); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + 
rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); auto bufferSize = spMVBufferSizeCallBuilder @@ -1760,8 +1754,8 @@ return failure(); Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); Value pBuf = MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); @@ -1785,10 +1779,11 @@ auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); auto stream = adaptor.getAsyncDependencies().front(); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); Value bufferSize; if (is2To4Sparsity(op.getSpmatA())) { + + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType())); auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(), rewriter.getIndexAttr(3)); bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType, @@ -1798,10 +1793,14 @@ cuSparseLtSpmmBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), adaptor.getSpmatA(), stream}) + {bufferSize, adaptor.getEnv(), modeA, modeB, + adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, stream}) .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } else { + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); bufferSize = spMMBufferSizeCallBuilder .create(loc, rewriter, {adaptor.getEnv(), modeA, modeB, @@ -1822,8 +1821,8 @@ Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); - auto computeType 
= - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); auto bufferSize = SDDMMBufferSizeCallBuilder .create(loc, rewriter, @@ -1844,8 +1843,8 @@ Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); @@ -1861,8 +1860,7 @@ cuSparseLtSpmmBuilder.create(loc, rewriter, {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), - computeType, pBufs[0], pBufs[1], pBufs[2], - stream}); + pBufs[0], pBufs[1], pBufs[2], stream}); } else { Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) .allocatedPtr(rewriter, loc); @@ -1892,8 +1890,8 @@ failed(isAsyncWithOneDependency(rewriter, op))) return failure(); Location loc = op.getLoc(); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); auto stream = adaptor.getAsyncDependencies().front(); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -468,13 +468,13 @@ struct cusparseLtSpMatHandleAndData { cusparseLtMatDescriptor_t mat; - void *values{nullptr}; - // TODO: the following is associated with the SpMM 
operator rather than the - // sparse matrix. Create workspace buffers and pass them to the SpMM + // TODO: the following three are associated with the SpMM operator rather than + // the sparse matrix. Create workspace buffers and pass them to the SpMM // execution. cusparseLtMatmulAlgSelection_t alg_sel; cusparseLtMatmulPlan_t plan; cusparseLtMatmulDescriptor_t matmul; + void *values{nullptr}; }; struct cusparseLtDnMatHandleAndData { @@ -482,7 +482,7 @@ void *values{nullptr}; }; -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLTEnvHandleSize() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtEnvHandleSize() { assert(sizeof(cusparseLtHandle_t) == 11024); } @@ -490,11 +490,11 @@ return assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSparseLtDnMatHandleSize() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtDnMatHandleSize() { return assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { // note that cuSparseLt still uses cusparseStatus_t CUSPARSE_REPORT_IF_ERROR( @@ -510,15 +510,15 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dw, CUstream /*stream*/) { - cusparseLtMatDescriptor_t mat; + void *values, int32_t dtp, CUstream /*stream*/) { auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); + memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh); - cudaDataType_t dtp = dataTp(dw); + auto dTp = static_cast<cudaDataType_t>(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dh->mat), rows, cols, /*lda=*/cols, - /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW)) + handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) 
dnmat_handle->values = values; } @@ -526,56 +526,66 @@ // cusparseLt extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m); + auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m); - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dw, CUstream /*stream*/) { + void *values, int32_t dtp, CUstream /*stream*/) { auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh); + memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); - cudaDataType_t dtp = dataTp_cusparseLt(dw); + auto dTp = static_cast<cudaDataType_t>(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(sh->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, dtp, - CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, + dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. 
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *workspace_size, void *compressed_size, - void *compressed_buffer_size, void *h, void *a, +mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a, + void *b, void *c, int32_t ctp, CUstream /*stream*/) { // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a); + auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b); + auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c); + auto workspace_size = reinterpret_cast<size_t *>(bs); + auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]); + auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]); + auto cTp = static_cast<cusparseComputeType>(ctp); - CHECK_CUSPARSE(cusparseLtMatmulAlgSelectionInit( - handle, &(matWithData.alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) + cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); + cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( + handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), + &(matC->mat), &(matC->mat), cTp)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( + handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - handle, &(matWithData.alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( + handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - // TODO: add transpose support - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - handle, &(matA.matmul), c, CUSPARSE_OPERATION_NON_TRANSPOSE, &(matA->mat), - &matB, &matC, &matC, compute_type)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(handle, &(matWithData.plan), &matmul, - &(matWithData.alg_sel))) - - CHECK_CUSPARSE( - cusparseLtMatmulGetWorkspace(handle, &(matA.plan), workspace_size)) - 
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize( - handle, &(matA.plan), compressed_size, compressed_buffer_size)) + + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( + handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + + CUSPARSE_REPORT_IF_ERROR( + cusparseLtMatmulGetWorkspace(handle, &(matA->plan), workspace_size)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( + handle, &(matA->plan), compressed_size, compressed_buffer_size)) // avoid zero-alloc *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size); @@ -586,34 +596,29 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *alg_sel, void *plan, void *matmul, void *h, void *a, - void *b, void *c, int32_t dw, void *buf, void *dA_compressed, - void *dA_compressedBuffer, CUstream stream) { +mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace, + void *dA_compressed, void *dA_compressedBuffer, + CUstream stream) { auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a); auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b); auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c); - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - cusparseLtMatmulDescriptor_t matmul; - - ALPHABETA(dw, alpha, beta) - - CHECK_CUSPARSE(cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values), - dA_compressed, dA_compressedBuffer, - stream)) + ALPHABETA(CUDA_R_32F, alpha, beta) + CUSPARSE_REPORT_IF_ERROR( + cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values), + dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. 
D = A*B+C using C==D for now - CHECK_CUSPARSE( - cusparseLtMatmul(handle, reinterpret_cast<cusparseLtMatmulPlan_t *>(plan), - &alpha, dA_compressed, dB, &beta, matC->values, - /*dD*/ matC->values, d_workspace, &stream, 1)) + CUSPARSE_REPORT_IF_ERROR( + cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + matB->values, betap, matC->values, + /*dD*/ matC->values, d_workspace, nullptr, 0)) - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat))) // destroy the plan associated with the sparse matrix - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(mat->plan))) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan))) } #endif // MLIR_ENABLE_CUDA_CUSPARSELT