diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -287,11 +287,11 @@ {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder AssertSparseLTEnvHandleSizeCallBuilder = { - "mgpuAssertSparseLTEnvHandleSize", llvmVoidType, {}}; - FunctionCallBuilder AssertSparseLTSpMatHandleSizeCallBuilder = { - "mgpuAssertSparseLTSpMatHandleSize", llvmVoidType, {}}; - FunctionCallBuilder AssertSparseLTDnMatHandleSizeCallBuilder = { + FunctionCallBuilder AssertSparseLtEnvHandleSizeCallBuilder = { + "mgpuAssertSparseLtEnvHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLtSpMatHandleSizeCallBuilder = { + "mgpuAssertSparseLtSpMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLtDnMatHandleSizeCallBuilder = { "mgpuAssertSparseLtDnMatHandleSize", llvmVoidType, {}}; FunctionCallBuilder createSparseLtEnvCallBuilder = { "mgpuCreateSparseLtEnv", @@ -322,7 +322,8 @@ FunctionCallBuilder cuSparseLtSpmmBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmPointerType, + {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder cuSparseLtSpmmBuilder = { "mgpuCuSparseLtSpMM", @@ -1437,7 +1438,7 @@ Value handle; if (isSpMMCusparseLtOp(op.getEnv())) { // Assert the size is 11024 bytes - AssertSparseLTEnvHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtEnvHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(11024)); handle = rewriter.create(loc, llvmInt8PointerType, @@ -1532,7 +1533,7 @@ Value handle; if (isSpMMCusparseLtOp(op.getDmat())) { auto envHandle = adaptor.getEnv(); - AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create(loc, llvmInt8PointerType, @@ -1695,7 +1696,7 @@ auto dtp = genConstInt32From(rewriter, loc, getCuSparseLtDataTypeFrom(dType)); auto envHandle = adaptor.getEnv(); - AssertSparseLTSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create( loc, getIndexType(), rewriter.getIndexAttr(44104)); Value handle = rewriter.create(loc, llvmInt8PointerType, @@ -1785,10 +1786,11 @@ auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); auto stream = adaptor.getAsyncDependencies().front(); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); Value bufferSize; if (is2To4Sparsity(op.getSpmatA())) { + + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType())); auto three = rewriter.create(loc, getIndexType(), rewriter.getIndexAttr(3)); bufferSize = rewriter.create(loc, llvmInt64PointerType, @@ -1798,10 +1800,14 @@ cuSparseLtSpmmBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), adaptor.getSpmatA(), stream}) + {bufferSize, adaptor.getEnv(), modeA, modeB, + adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, stream}) .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } else { + auto computeType = + genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); bufferSize = spMMBufferSizeCallBuilder .create(loc, rewriter, {adaptor.getEnv(), modeA, modeB, diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -482,7 +482,7 @@ void *values{nullptr}; }; -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLTEnvHandleSize() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtEnvHandleSize() { assert(sizeof(cusparseLtHandle_t) == 11024); } @@ -490,11 +490,11 @@ return assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSparseLtDnMatHandleSize() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtDnMatHandleSize() { return assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { // note that cuSparseLt still uses cusparseStatus_t CUSPARSE_REPORT_IF_ERROR( @@ -510,15 +510,14 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dw, CUstream /*stream*/) { - cusparseLtMatDescriptor_t mat; + void *values, int32_t dtp, CUstream /*stream*/) { auto handle = reinterpret_cast(h); auto dnmat_handle = reinterpret_cast(dh); - cudaDataType_t dtp = dataTp(dw); + auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dh->mat), rows, cols, /*lda=*/cols, - /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW)) + handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) dnmat_handle->values = values; } @@ -526,56 +525,65 @@ // cusparseLt extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast(m); + auto matAndData = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast(m); - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + auto matAndData = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dw, CUstream /*stream*/) { + void *values, int32_t dtp, CUstream /*stream*/) { auto spmat_handle = reinterpret_cast(sh); spmat_handle->values = values; auto handle = reinterpret_cast(h); - cudaDataType_t dtp = dataTp_cusparseLt(dw); + auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(sh->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, dtp, - CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, + dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *workspace_size, void *compressed_size, - void *compressed_buffer_size, void *h, void *a, +mgpuCuSparseLtSpMMBufferSize(void *ws, void *cs, void *cbs, void *h, int32_t ma, + int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix auto handle = reinterpret_cast(h); auto matA = reinterpret_cast(a); + auto matB = reinterpret_cast(b); + auto matC = reinterpret_cast(c); + auto workspace_size = reinterpret_cast(ws); + auto compressed_size = reinterpret_cast(cs); + auto compressed_buffer_size = reinterpret_cast(cbs); + auto cTp = static_cast(ctp); - CHECK_CUSPARSE(cusparseLtMatmulAlgSelectionInit( - handle, &(matWithData.alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( + handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - handle, &(matWithData.alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( + handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - // TODO: add transpose support - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - handle, &(matA.matmul), c, CUSPARSE_OPERATION_NON_TRANSPOSE, &(matA->mat), - &matB, &matC, &matC, compute_type)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(handle, &(matWithData.plan), &matmul, - &(matWithData.alg_sel))) - - CHECK_CUSPARSE( - cusparseLtMatmulGetWorkspace(handle, &(matA.plan), workspace_size)) - CHECK_CUSPARSE(cusparseLtSpMMACompressedSize( - handle, &(matA.plan), compressed_size, compressed_buffer_size)) + + cusparseOperation_t modeA = static_cast(ma); + cusparseOperation_t modeB = static_cast(mb); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( + handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), + &(matC->mat), &(matC->mat), cTp)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( + handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + + CUSPARSE_REPORT_IF_ERROR( + cusparseLtMatmulGetWorkspace(handle, &(matA->plan), workspace_size)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( + handle, &(matA->plan), compressed_size, compressed_buffer_size)) // avoid zero-alloc *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size); @@ -586,34 +594,31 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *alg_sel, void *plan, void *matmul, void *h, void *a, - void *b, void *c, int32_t dw, void *buf, void *dA_compressed, +mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, int32_t ctp, + void *d_workspace, void *dA_compressed, void *dA_compressedBuffer, CUstream stream) { auto handle = reinterpret_cast(h); auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - cusparseLtMatmulDescriptor_t matmul; - - ALPHABETA(dw, alpha, beta) + auto cTp = static_cast(ctp); + ALPHABETA(cTp, alpha, beta) - CHECK_CUSPARSE(cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values), - dA_compressed, dA_compressedBuffer, - stream)) + CUSPARSE_REPORT_IF_ERROR( + cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values), + dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. D = A*B+C using C==D for now - CHECK_CUSPARSE( - cusparseLtMatmul(handle, reinterpret_cast(plan), - &alpha, dA_compressed, dB, &beta, matC->values, + CUSPARSE_REPORT_IF_ERROR( + cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + matB->values, betap, matC->values, /*dD*/ matC->values, d_workspace, &stream, 1)) - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat))) // destroy the plan associated with the sparse matrix - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(mat->plan))) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan))) } #endif // MLIR_ENABLE_CUDA_CUSPARSELT