diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt
--- a/mlir/lib/ExecutionEngine/CMakeLists.txt
+++ b/mlir/lib/ExecutionEngine/CMakeLists.txt
@@ -200,15 +200,42 @@
     EXCLUDE_FROM_LIBMLIR
   )
   set_property(TARGET mlir_cuda_runtime PROPERTY CXX_STANDARD 14)
-  target_include_directories(mlir_cuda_runtime
-    PRIVATE
-    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
-  )
-  target_link_libraries(mlir_cuda_runtime
-    PRIVATE
-    ${CUDA_RUNTIME_LIBRARY}
-    ${CUDA_CUSPARSE_LIBRARY}
-  )
+
+  # We need cuSparseLt to provide 2:4 sparsity support.
+  # As of the pre-1.0 version, we assume cuSparseLt was downloaded as an
+  # archive and extracted into an exclusive directory CUDA_CUSPARSELT_DIR,
+  # rather than installed by the package manager. This is the same layout
+  # the Nvidia examples use.
+  if(DEFINED CUDA_CUSPARSELT_DIR)
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+      ${CUDA_CUSPARSELT_DIR}/include
+    )
+    # Link the library file itself, not the lib64 directory.
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+      ${CUDA_CUSPARSELT_DIR}/lib64/libcusparseLt.so
+    )
+    # CMake does not evaluate C-preprocessor `defined(...)`; pass an explicit
+    # 0/1 value, scoped to this target instead of global add_definitions().
+    target_compile_definitions(mlir_cuda_runtime
+      PRIVATE MLIR_CUDA_CUSPARSELT_ENABLED=1)
+  else()
+    target_include_directories(mlir_cuda_runtime
+      PRIVATE
+      ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+    )
+    target_link_libraries(mlir_cuda_runtime
+      PRIVATE
+      ${CUDA_RUNTIME_LIBRARY}
+      ${CUDA_CUSPARSE_LIBRARY}
+    )
+    target_compile_definitions(mlir_cuda_runtime
+      PRIVATE MLIR_CUDA_CUSPARSELT_ENABLED=0)
+  endif()
 endif()
 
 if(MLIR_ENABLE_ROCM_RUNNER)
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -19,6 +19,10 @@
 #include "cuda.h"
 #include "cusparse.h"
 
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+#include "cusparseLt.h"
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED
+
 #ifdef _WIN32
 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport)
 #else
@@ -438,3 +442,159 @@
                             matB, betap, matC, dtp,
                             CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
+
+///
+/// Wrapper methods for the cuSparseLt library.
+///
+#if MLIR_CUDA_CUSPARSELT_ENABLED
+
+// cusparseLtHandle_t is an opaque struct, not a pointer type, so the
+// environment keeps one static handle that the create/destroy wrappers
+// manage. Note that cuSparseLt still uses cusparseStatus_t, so the
+// cuSPARSE error-reporting macro applies.
+static cusparseLtHandle_t cusparseLt_env;
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateSparseLtEnv(CUstream /*stream*/) {
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&cusparseLt_env))
+  return reinterpret_cast<void *>(&cusparseLt_env);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroySparseLtEnv(void * /*h*/, CUstream /*stream*/) {
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&cusparseLt_env))
+}
+
+struct cusparseLtSpMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  // TODO: the matmul descriptor, algorithm selection, plan, and buffer
+  // sizes belong to the SpMM operator rather than to the sparse matrix;
+  // move them once the operator carries its own handle.
+  cusparseLtMatmulDescriptor_t matmul;
+  cusparseLtMatmulAlgSelection_t alg_sel;
+  cusparseLtMatmulPlan_t plan;
+  size_t workspace_size{0};
+  size_t compressed_size{0};
+  void *values{nullptr};
+};
+
+struct cusparseLtDnMatHandleAndData {
+  cusparseLtMatDescriptor_t mat;
+  void *values{nullptr};
+};
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuCreateCuSparseLtDnMat(intptr_t rows, intptr_t cols, void *values,
+                          int32_t dw, CUstream /*stream*/) {
+  // The handle must outlive this call, so it is heap allocated.
+  auto dnmat_handle = new cusparseLtDnMatHandleAndData;
+  cudaDataType_t dtp = dataTp(dw);
+  // Assuming row-major when deciding lda.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit(
+      &cusparseLt_env, &(dnmat_handle->mat), rows, cols, /*lda=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW))
+  dnmat_handle->values = values;
+  return reinterpret_cast<void *>(dnmat_handle);
+}
+
+// This can be used to destroy both dense matrices and sparse matrices in
+// cusparseLt; both handle structs lead with the matrix descriptor.
+// TODO(review): freeing a dense handle through the sparse struct type is
+// not strictly well-defined; give dense matrices their own destroy.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) {
+  auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m);
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat)))
+  delete matAndData;
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuCusparseLtCreate224SpMat(
+    intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos, void *colIdxs,
+    void *values, int32_t pw, int32_t iw, int32_t dw, CUstream /*stream*/) {
+  // The 2:4 structured format keeps its sparsity metadata inside the
+  // compressed representation, so the COO/CSR-style arguments are unused.
+  (void)nnz;
+  (void)rowPos;
+  (void)colIdxs;
+  (void)pw;
+  (void)iw;
+  auto spmat_handle = new cusparseLtSpMatHandleAndData;
+  cudaDataType_t dtp = dataTp(dw);
+  // Assuming row-major when deciding lda.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit(
+      &cusparseLt_env, &(spmat_handle->mat), rows, cols, /*ld=*/cols,
+      /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW,
+      CUSPARSELT_SPARSITY_50_PERCENT))
+  spmat_handle->values = values;
+  return reinterpret_cast<void *>(spmat_handle);
+}
+
+// Several things are done in this stage: algorithm selection, planning, and
+// sizing of the workspace, the compressed matrix, and the compression
+// scratch. The caller allocates the returned number of bytes as a single
+// device buffer and passes it back through mgpuCuSparseLtSpMM().
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuCuSparseLtSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b,
+                             void *c, int32_t dw, CUstream /*stream*/) {
+  (void)h;
+  (void)dw;
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  // TODO: derive the compute type from the data type; 32-bit float
+  // accumulation is assumed for now (enum name per cusparseLt >= 0.6).
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit(
+      &cusparseLt_env, &(matA->matmul), modeA, modeB, &(matA->mat),
+      &(matB->mat), &(matC->mat), &(matC->mat), CUSPARSE_COMPUTE_32F))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit(
+      &cusparseLt_env, &(matA->alg_sel), &(matA->matmul),
+      CUSPARSELT_MATMUL_ALG_DEFAULT))
+  int alg = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute(
+      &cusparseLt_env, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID,
+      &alg, sizeof(alg)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit(
+      &cusparseLt_env, &(matA->plan), &(matA->matmul), &(matA->alg_sel)))
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace(
+      &cusparseLt_env, &(matA->plan), &(matA->workspace_size)))
+  size_t compressed_buffer_size = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize(
+      &cusparseLt_env, &(matA->plan), &(matA->compressed_size),
+      &compressed_buffer_size))
+  intptr_t total = static_cast<intptr_t>(
+      matA->workspace_size + matA->compressed_size + compressed_buffer_size);
+  return total == 0 ? 1 : total; // avoid zero-alloc
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuCuSparseLtSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                   int32_t dw, void *buf, CUstream stream) {
+  (void)h;
+  (void)ma;
+  (void)mb;
+  auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a);
+  auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b);
+  auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c);
+  ALPHABETA(dw, alpha, beta)
+  // Partition the caller-provided buffer exactly as it was sized by
+  // mgpuCuSparseLtSpMMBufferSize(): workspace, then the compressed matrix,
+  // then the compression scratch.
+  char *base = reinterpret_cast<char *>(buf);
+  void *d_workspace = base;
+  void *dA_compressed = base + matA->workspace_size;
+  void *dA_compressedBuffer =
+      base + matA->workspace_size + matA->compressed_size;
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompress(
+      &cusparseLt_env, &(matA->plan), matA->values, dA_compressed,
+      dA_compressedBuffer, stream))
+  // Perform the matrix multiplication: C = alpha * A * B + beta * C, with
+  // the output D aliasing C.
+  CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmul(
+      &cusparseLt_env, &(matA->plan), alphap, dA_compressed, matB->values,
+      betap, matC->values, /*dD=*/matC->values, d_workspace, &stream,
+      /*numStreams=*/1))
+}
+
+#endif // MLIR_CUDA_CUSPARSELT_ENABLED