diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -200,15 +200,42 @@
     EXCLUDE_FROM_LIBMLIR
   )
   set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)
-  target_include_directories(mlir_cuda_runtime
-    PRIVATE
-    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-  )
-  target_link_libraries(mlir_cuda_runtime
-    PRIVATE
-    ${CUDA_RUNTIME_LIBRARY}
-    ${CUDA_CUSPARSE_LIBRARY}
-  )
+
+  # We need cuSparseLt to provide 2:4 sparsity support.
+  # As of the pre-1.0 version, we assume cuSparseLt was downloaded as an
+  # archive and extracted into an exclusive directory CUDA_CUSPARSELT_DIR,
+  # rather than installed by the package manager. This is the same layout
+  # the Nvidia examples use.
+  if(DEFINED CUDA_CUSPARSELT_DIR)
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+      ${CUDA_CUSPARSELT_DIR}/include
+    )
+    # Link the library file itself, not the lib64 directory.
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+      ${CUDA_CUSPARSELT_DIR}/lib64/libcusparseLt.so
+    )
+    # CMake does not evaluate C-preprocessor `defined(...)`; pass an explicit
+    # 0/1 value, scoped to this target instead of global add_definitions().
+    target_compile_definitions(mlir_cuda_runtime
+      PRIVATE MLIR_CUDA_CUSPARSELT_ENABLED=1)
+  else()
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+    )
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+    )
+    target_compile_definitions(mlir_cuda_runtime
+      PRIVATE MLIR_CUDA_CUSPARSELT_ENABLED=0)
+  endif()
 endif()
 
 if(MLIR_ENABLE_ROCM_RUNNER)
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -19,6 +19,10 @@
 #include "cuda.h"
 #include "cusparse.h"
 
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+#include "cusparseLt.h"
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED
+
 #ifdef _WIN32
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
@@ -438,3 +442,159 @@
                             matB, betap, matC, dtp,
                             CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
+
+///
+/// Wrapper methods for the cuSparseLt library.
+///
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+
+// cusparseLtHandle_t is an opaque struct, not a pointer type, so the
+// environment keeps one static handle that the create/destroy wrappers
+// manage. Note that cuSparseLt still uses cusparseStatus_t, so the
+// cuSPARSE error-reporting macro applies.
+static cusparseLtHandle_t cusparseLt_env;
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateSparseLtEnv(CUstream /*stream*/) {
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env))
+  return reinterpret_cast<void *>(&cusparseLt_env);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroySparseLtEnv(void * /*h*/, CUstream /*stream*/) {
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env))
+}
+
+struct cusparseLtSpMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  // TODO: the matmul descriptor, algorithm selection, plan, and buffer
+  // sizes belong to the SpMM operator rather than to the sparse matrix;
+  // move them once the operator carries its own handle.
+  cusparseLtMatmulDescriptor_t matmul;
+  cusparseLtMatmulAlgSelection_t alg_sel;
+  cusparseLtMatmulPlan_t plan;
+  size_t workspace_size{0};
+  size_t compressed_size{0};
+  void *values{nullptr};
+};
+
+struct cusparseLtDnMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  void *values{nullptr};
+};
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCuSparseLtDnMat(intptr_t rows, intptr_t cols, void *values,
+                          int32_t dw, CUstream /*stream*/) {
+  // The handle must outlive this call, so it is heap allocated.
+  auto dnmat_handle = new cusparseLtDnMatHandleAndData;
+  cudaDataType_t dtp = dataTp(dw);
+  // Assuming row-major when deciding lda.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
+      &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW))
+  dnmat_handle->values = values;
+  return reinterpret_cast<void *>(dnmat_handle);
+}
+
+// This can be used to destroy both dense matrices and sparse matrices in
+// cusparseLt; both handle structs lead with the matrix descriptor.
+// TODO(review): freeing a dense handle through the sparse struct type is
+// not strictly well-defined; give dense matrices their own destroy.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
+  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+  delete matAndData;
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuCusparseLtCreate224SpMat(
+    intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos, void *colIdxs,
+    void *values, int32_t pw, int32_t iw, int32_t dw, CUstream /*stream*/) {
+  // The 2:4 structured format keeps its sparsity metadata inside the
+  // compressed representation, so the COO/CSR-style arguments are unused.
+  (void)nnz;
+  (void)rowPos;
+  (void)colIdxs;
+  (void)pw;
+  (void)iw;
+  auto spmat_handle = new cusparseLtSpMatHandleAndData;
+  cudaDataType_t dtp = dataTp(dw);
+  // Assuming row-major when deciding lda.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
+      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW,
+      CUSPARSELT_SPARSITY_50_PERCENT))
+  spmat_handle->values = values;
+  return reinterpret_cast<void *>(spmat_handle);
+}
+
+// Several things are done in this stage: algorithm selection, planning, and
+// sizing of the workspace, the compressed matrix, and the compression
+// scratch. The caller allocates the returned number of bytes as a single
+// device buffer and passes it back through mgpuCuSparseLtSpMM().
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuCuSparseLtSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b,
+                             void *c, int32_t dw, CUstream /*stream*/) {
+  (void)h;
+  (void)dw;
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  // TODO: derive the compute type from the data type; 32-bit float
+  // accumulation is assumed for now (enum name per cusparseLt >= 0.6).
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
+      &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat),
+      &(matB->mat), &(matC->mat), &(matC->mat), CUSPARSE_COMPUTE_32F))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
+      &cusparseLt_env, &(matA->alg_sel), &(matA->matmul),
+      CUSPARSELT_MATMUL_ALG_DEFAULT))
+  int alg = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
+      &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID,
+      &alg, sizeof(alg)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
+      &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
+      &cusparseLt_env, &(matA->plan), &(matA->workspace_size)))
+  size_t compressed_buffer_size = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
+      &cusparseLt_env, &(matA->plan), &(matA->compressed_size),
+      &compressed_buffer_size))
+  intptr_t total = static_cast<intptr_t>(
+      matA->workspace_size + matA->compressed_size + compressed_buffer_size);
+  return total == 0 ? 1 : total; // avoid zero-alloc
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuCuSparseLtSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                   int32_t dw, void *buf, CUstream stream) {
+  (void)h;
+  (void)ma;
+  (void)mb;
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  ALPHABETA(dw, alpha, beta)
+  // Partition the caller-provided buffer exactly as it was sized by
+  // mgpuCuSparseLtSpMMBufferSize(): workspace, then the compressed matrix,
+  // then the compression scratch.
+  char *base = reinterpret_cast<char *>(buf);
+  void *d_workspace = base;
+  void *dA_compressed = base + matA->workspace_size;
+  void *dA_compressedBuffer =
+      base + matA->workspace_size + matA->compressed_size;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompress(
+      &cusparseLt_env, &(matA->plan), matA->values, dA_compressed,
+      dA_compressedBuffer, stream))
+  // Perform the matrix multiplication: C = alpha * A * B + beta * C, with
+  // the output D aliasing C.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmul(
+      &cusparseLt_env, &(matA->plan), alphap, dA_compressed, matB->values,
+      betap, matC->values, /*dD=*/matC->values, d_workspace, &stream,
+      /*numStreams=*/1))
+}
+
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED