diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1789,6 +1789,40 @@
   }];
 }
 
+def GPU_Create2To4SpMatOp : GPU_Op<"create_2to4_spmat", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix with 2:4 sparsity operation";
+  let description = [{
+    The `gpu.create_2to4_spmat` operation initializes a sparse matrix in dense
+    format with 2:4 sparsity.
+    The buffers must already be copied from the host to the device prior to
+    using this operation. The operation returns a handle to the sparse
+    matrix descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the sparse matrix
+    handle.
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Index:$rows,
+                       Index:$cols,
+                       AnyMemRef:$memref);
+  let results = (outs Res<GPU_SparseSpMatHandle>:$spMat,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+  }];
+}
+
 def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
   let summary = "Destroy sparse matrix operation";
   let description = [{
@@ -1960,7 +1994,7 @@
     Example:
 
     ```mlir
-    %buffersz, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC
+    %bufferszs, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64
     ```
   }];
 
@@ -1971,11 +2005,12 @@
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC);
-  let results = (outs Res<Index>:$bufferSz,
+  let results = (outs Res<AnyTypeOf<[Index, Variadic<Index>]>>:$bufferSzs,
                       Optional<GPU_AsyncToken>:$asyncToken);
 
   let builders = [OpBuilder<(ins
-      "::mlir::Type":$bufferSz,
+      "::mlir::Type":$bufferSzs,
       "::mlir::Type":$asyncToken,
       "::mlir::ValueRange":$asyncDependencies,
       "::mlir::Value":$env,
@@ -1984,17 +2019,17 @@
       "::mlir::Value":$dnmatC), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
+    return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies,
                  env, modeA, modeB, spmatA, dnmatB, dnmatC);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs)
   }];
 }
 
-def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> {
+def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface, AttrSizedOperandSegments]> {
   let summary = "SpMM operation";
   let description = [{
     The `gpu.spmm` operation performs the SpMM operation on the given sparse and
@@ -2024,7 +2059,7 @@
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC,
-                       AnyMemRef:$buffer);
+                       Variadic<AnyMemRef>:$buffers);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
 
   let builders = [OpBuilder<(ins
@@ -2034,16 +2069,16 @@
       "::mlir::Value":$spmatA,
       "::mlir::Value":$dnmatB,
       "::mlir::Value":$dnmatC,
-      "::mlir::Value":$buffer), [{
+      "::mlir::ValueRange":$buffers), [{
     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
     return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA,
-                 modeB, spmatA, dnmatB, dnmatC, buffer);}]>
+                 modeB, spmatA, dnmatB, dnmatC, buffers);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers)
   }];
 }
 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -230,6 +230,14 @@
       {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type,
        llvmInt32Type, llvmPointerType /* void *stream */}};
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+  FunctionCallBuilder create2To4SpMatCallBuilder = {
+      "mgpuCusparseLtCreate2To4SpMat",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmPointerType, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
+#endif
   FunctionCallBuilder destroySpMatCallBuilder = {
       "mgpuDestroySpMat",
       llvmVoidType,
@@ -559,6 +567,20 @@
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::Create2To4SpMatOp> {
+public:
+  ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::Create2To4SpMatOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::Create2To4SpMatOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 class ConvertDestroySpMatOpToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroySpMatOp> {
 public:
@@ -688,6 +710,29 @@
   return builder.create<LLVM::CallOp>(loc, function, arguments);
 }
 
+static bool is2To4Sparsity(Value spMat) {
+  // TODO: DefaultValuedAttr
+  if (auto op = spMat.getDefiningOp<gpu::Create2To4SpMatOp>())
+    return true;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCooOp>())
+    return false;
+  if (auto op = spMat.getDefiningOp<gpu::CreateCsrOp>())
+    return false;
+  llvm_unreachable("cannot find spmat def");
+}
+
+static const char *inferSpMMType(Operation *op) {
+  for (Operation *user : op->getUsers()) {
+    auto spmmOp = dyn_cast<gpu::SpMMOp>(user);
+    if (!spmmOp)
+      continue;
+    // If the sparse operand has 2:4 (50%) sparsity, lower to cusparseLt.
+    if (is2To4Sparsity(spmmOp.getSpmatA()))
+      return "cusparseLt";
+  }
+  return "cusparse";
+}
+
 // Returns whether all operands are of LLVM type.
 static LogicalResult areAllLLVMTypes(Operation *op, ValueRange operands,
                                      ConversionPatternRewriter &rewriter) {
@@ -1287,6 +1332,11 @@
       llvm::cast<MemRefType>(op.getMemref().getType()).getElementType();
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
+  // For now, we track the use of the handle and lower it to a cusparse or
+  // cusparseLt call accordingly. If both cusparse and cusparseLt are used in
+  // the same block, two separate environment creation ops are required for
+  // the lowering to be correct. In the future, we may support using a single
+  // handle for both cusparse and cusparseLt in the sparse tensor / GPU dialects.
   auto handle =
       createDnVecCallBuilder
           .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream})
@@ -1424,6 +1474,11 @@
   return success();
 }
 
+// TODO: implement the lowering to the cusparseLt wrappers; bail out for now.
+LogicalResult ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::Create2To4SpMatOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  return failure();
+}
+
 LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::DestroySpMatOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1542,8 +1597,8 @@
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
   auto stream = adaptor.getAsyncDependencies().front();
-  Value pBuf =
-      MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
+  Value pBuf = MemRefDescriptor(adaptor.getBuffers().front())
+                   .allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
   spMMCallBuilder.create(loc, rewriter,
@@ -1615,6 +1670,7 @@
                ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
                ConvertCreateCooOpToGpuRuntimeCallPattern,
                ConvertCreateCsrOpToGpuRuntimeCallPattern,
+               ConvertCreate2To4SpMatOpToGpuRuntimeCallPattern,
                ConvertDestroySpMatOpToGpuRuntimeCallPattern,
                ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
                ConvertSpMVOpToGpuRuntimeCallPattern,
diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -200,15 +200,36 @@
     EXCLUDE_FROM_LIBMLIR
   )
   set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)
-  target_include_directories(mlir_cuda_runtime
-    PRIVATE
-    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-  )
-  target_link_libraries(mlir_cuda_runtime
-    PRIVATE
-    ${CUDA_RUNTIME_LIBRARY}
-    ${CUDA_CUSPARSE_LIBRARY}
-  )
+
+  # cusparseLt is required for 2:4 sparsity support.
+  # As of its pre-1.0 releases, we assume cusparseLt has been downloaded as an
+  # archive and extracted into a dedicated directory CUDA_CUSPARSELT_DIR,
+  # rather than installed by a package manager. This matches the NVIDIA
+  # examples.
+  if(DEFINED CUDA_CUSPARSELT_DIR)
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+      ${CUDA_CUSPARSELT_DIR}/include
+    )
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+      ${CUDA_CUSPARSELT_DIR}/lib64/libcusparseLt.so
+    )
+    add_definitions(-DMLIR_CUDA_CUSPARSELT_ENABLED=1)
+  else()
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+    )
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+    )
+    add_definitions(-DMLIR_CUDA_CUSPARSELT_ENABLED=0)
+  endif()
 endif()
 
 if(MLIR_ENABLE_ROCM_RUNNER)
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -19,6 +19,17 @@
 #include "cuda.h"
 #include "cusparse.h"
 
+// TODO: this is a compile-time (of the MLIR compiler) disablement. We may also
+// want a run-time (of the MLIR compiler) disablement or warning: cusparseLt
+// currently does not work on CUDA architectures below 8.0 and triggers a
+// runtime (of the CUDA program) error there. When lowering the GPU sparse
+// dialect to LLVM calls we should at least emit a warning, or disable the
+// cusparseLt path, when the target architecture is below 8.0 and the user
+// still requests cusparseLt.
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+#include "cusparseLt.h"
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED
+
 #ifdef _WIN32
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
@@ -438,3 +449,136 @@
                                       matB, betap, matC, dtp,
                                       CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
+
+///
+/// Wrapper methods for the cuSparseLt library.
+///
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+struct cusparseLtSpMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  void *values{nullptr};
+  // TODO: the following is associated with the SpMM operator rather than the
+  // sparse matrix. Create workspace buffers and pass them to the SpMM
+  // execution.
+  cusparseLtMatmulAlgSelection_t alg_sel;
+  cusparseLtMatmulPlan_t plan;
+};
+struct cusparseLtDnMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  void *values{nullptr};
+};
+struct cusparseLtWorkspaceSizes {
+  size_t workspace_size;
+  size_t compressed_size;
+  size_t compressed_buffer_size;
+};
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateSparseLtEnv(CUstream /*stream*/) {
+  auto handle = new cusparseLtHandle_t();
+  // Note that cuSparseLt still uses cusparseStatus_t.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(handle))
+  return reinterpret_cast<void *>(handle);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) {
+  auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle))
+  delete handle;
+}
+
+// TODO: pass the cusparseLt handle pointer; `handlePtr` is not plumbed in yet.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCuSparseLtDnMat(intptr_t rows, intptr_t cols, void *values,
+                          int32_t dw, CUstream /*stream*/) {
+  auto matWithData = new cusparseLtDnMatHandleAndData;
+  matWithData->values = values;
+  cudaDataType_t dtp = dataTp(dw);
+  // Assume row-major storage when deciding the leading dimension.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
+      handlePtr, &(matWithData->mat), rows, cols, /*ld=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW))
+  return reinterpret_cast<void *>(matWithData);
+}
+
+// This can be used to destroy both dense matrices and sparse matrices in
+// cusparseLt.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
+  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+  // Destroy the plan associated with the sparse matrix.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matAndData->plan)))
+  delete matAndData;
+}
+
+// TODO: pass the cusparseLt handle pointer; `handlePtr` is not plumbed in yet.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCusparseLtCreate2To4SpMat(intptr_t rows, intptr_t cols, void *values,
+                              int32_t dw, CUstream /*stream*/) {
+  auto matWithData = new cusparseLtSpMatHandleAndData;
+  matWithData->values = values;
+  cudaDataType_t dtp = dataTp_cusparseLt(dw);
+  // Assume row-major storage when deciding the leading dimension.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
+      handlePtr, &(matWithData->mat), rows, cols, /*ld=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW,
+      CUSPARSELT_SPARSITY_50_PERCENT))
+  return reinterpret_cast<void *>(matWithData);
+}
+
+// Several things are done in this stage: algorithm selection, planning, and
+// returning the workspace and compressed matrix data buffer sizes.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuCuSparseLtSpMMBufferSize(void *h, void *a, CUstream /*stream*/) {
+  // TODO: support more advanced settings, e.g., a sparse right-hand operand.
+  // For now we assume matA is the sparse operand.
+  auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto sizes = new cusparseLtWorkspaceSizes;
+
+  // TODO: the matmul descriptor still needs to be initialized from the
+  // operand descriptors (cusparseLtMatmulDescriptorInit) before these calls.
+  cusparseLtMatmulDescriptor_t matmul;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
+      handle, &(matA->alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT))
+  int alg = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
+      handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg,
+      sizeof(alg)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(handle, &(matA->plan),
+                                                    &matmul, &(matA->alg_sel)))
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
+      handle, &(matA->plan), &(sizes->workspace_size)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
+      handle, &(matA->plan), &(sizes->compressed_size),
+      &(sizes->compressed_buffer_size)))
+  // Avoid zero-size allocations.
+  sizes->workspace_size =
+      (sizes->workspace_size == 0 ? 1 : sizes->workspace_size);
+  sizes->compressed_size =
+      (sizes->compressed_size == 0 ? 1 : sizes->compressed_size);
+  sizes->compressed_buffer_size =
+      (sizes->compressed_buffer_size == 0 ? 1 : sizes->compressed_buffer_size);
+  // TODO: operator-specific state such as the plan needs to be passed along
+  // to the SpMM execution as well.
+  return reinterpret_cast<intptr_t>(sizes);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuCuSparseLtSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                   int32_t dw, void *buf, CUstream stream) {
+  auto handle = reinterpret_cast<cusparseLtHandle_t *>(h);
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  ALPHABETA(dw, alpha, beta)
+
+  // TODO: the compressed matrix, the compression scratch buffer, and the
+  // workspace whose sizes are reported by mgpuCuSparseLtSpMMBufferSize still
+  // need to be allocated by the caller and passed in; nullptr placeholders
+  // keep this stub compiling until that plumbing lands.
+  void *dA_compressed = nullptr;
+  void *dA_compressedBuffer = nullptr;
+  void *d_workspace = nullptr;
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompress(
+      handle, &(matA->plan), matA->values, dA_compressed, dA_compressedBuffer,
+      stream))
+
+  // TODO: add support for multi-stream execution.
+  // Perform the matrix multiplication.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmul(
+      handle, &(matA->plan), alphap, dA_compressed, matB->values, betap,
+      matC->values, /*dD=*/matC->values, d_workspace, &stream, 1))
+}
+
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED
\ No newline at end of file
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -53,7 +53,7 @@
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
     %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref<?xf64>
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index
     %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -341,7 +341,7 @@
     // CHECK: gpu.create_dn_mat async
     %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref<?xf64>
     // CHECK: gpu.spmm_buffer_size async
-    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat
+    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index
    // CHECK: gpu.spmm async
    %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
    // CHECK: gpu.sddmm_buffer_size async
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -49,7 +49,7 @@
// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?xf64>
// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?xf64>
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
// CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
// CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
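
Note for reviewers: below is a minimal usage sketch of how the new pieces are intended to compose once the cusparseLt lowering is implemented. Everything in it is illustrative: the SSA names, dimensions, and element types are made up, and the choice of exactly three buffer sizes / buffers (workspace, compressed matrix, compression scratch) mirrors the `cusparseLtWorkspaceSizes` struct in the runtime wrappers but is an assumption, not something this patch pins down; the same goes for the exact result-list syntax accepted by the custom parser.

```mlir
// Hypothetical async chain using the 2:4 path (not a test in this patch).
%env, %t0   = gpu.create_sparse_env async [%dep]
%spmat, %t1 = gpu.create_2to4_spmat async [%t0] %m, %k, %memA : memref<?xf16>
%dnB, %t2   = gpu.create_dn_mat async [%t1] %k, %n, %memB : memref<?xf16>
%dnC, %t3   = gpu.create_dn_mat async [%t2] %m, %n, %memC : memref<?xf16>
// With cusparseLt, the buffer-size query would report several sizes ...
%szs:3, %t4 = gpu.spmm_buffer_size async [%t3] %env, %spmat, %dnB, %dnC : index, index, index
%buf0, %t5  = gpu.alloc async [%t4] (%szs#0) : memref<?xi8>
%buf1, %t6  = gpu.alloc async [%t5] (%szs#1) : memref<?xi8>
%buf2, %t7  = gpu.alloc async [%t6] (%szs#2) : memref<?xi8>
// ... and gpu.spmm consumes the matching variadic buffer list.
%t8 = gpu.spmm async [%t7] %env, %spmat, %dnB, %dnC, %buf0, %buf1, %buf2
        : memref<?xi8>, memref<?xi8>, memref<?xi8>
%t9 = gpu.destroy_sp_mat async [%t8] %spmat
```

The plain cusparse path stays as before except for the trailing types (`: index` on gpu.spmm_buffer_size and the buffer type list on gpu.spmm), as the updated tests above show.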