diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -287,11 +287,11 @@ {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; - FunctionCallBuilder AssertSparseLTEnvHandleSizeCallBuilder = { - "mgpuAssertSparseLTEnvHandleSize", llvmVoidType, {}}; - FunctionCallBuilder AssertSparseLTSpMatHandleSizeCallBuilder = { - "mgpuAssertSparseLTSpMatHandleSize", llvmVoidType, {}}; - FunctionCallBuilder AssertSparseLTDnMatHandleSizeCallBuilder = { + FunctionCallBuilder AssertSparseLtEnvHandleSizeCallBuilder = { + "mgpuAssertSparseLtEnvHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLtSpMatHandleSizeCallBuilder = { + "mgpuAssertSparseLtSpMatHandleSize", llvmVoidType, {}}; + FunctionCallBuilder AssertSparseLtDnMatHandleSizeCallBuilder = { "mgpuAssertSparseLtDnMatHandleSize", llvmVoidType, {}}; FunctionCallBuilder createSparseLtEnvCallBuilder = { "mgpuCreateSparseLtEnv", @@ -322,13 +322,14 @@ FunctionCallBuilder cuSparseLtSpmmBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmPointerType, + {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder cuSparseLtSpmmBuilder = { "mgpuCuSparseLtSpMM", llvmVoidType, {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, - llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}}; }; @@ -1417,13 +1418,6 @@ static_cast<int32_t>(TValue)); } -static Value genConstInt32FromComputeMode(OpBuilder &builder, Location loc, Type computeType) { 
- auto computeTypeInt = getCuSparseDataTypeFrom(computeType); - auto computeTypeConst = genConstInt32From(builder, loc, computeTypeInt); - return computeTypeConst; - } - LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::CreateSparseEnvOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1437,7 +1431,7 @@ Value handle; if (isSpMMCusparseLtOp(op.getEnv())) { // Assert the size is 11024 bytes - AssertSparseLTEnvHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtEnvHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(11024)); handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1532,7 +1526,7 @@ Value handle; if (isSpMMCusparseLtOp(op.getDmat())) { auto envHandle = adaptor.getEnv(); - AssertSparseLTDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtDnMatHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1692,10 +1686,10 @@ pMat = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pMat); Type dType = llvm::cast<MemRefType>(op.getMemref().getType()).getElementType(); - auto dtp = genConstInt32From(rewriter, loc, getCuSparseLtDataTypeFrom(dType)); + auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); auto envHandle = adaptor.getEnv(); - AssertSparseLTSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); + AssertSparseLtSpMatHandleSizeCallBuilder.create(loc, rewriter, {}); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(44104)); Value handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1739,8 +1733,8 @@ return failure(); Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, op.getModeA()); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + 
rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); auto bufferSize = spMVBufferSizeCallBuilder @@ -1760,8 +1754,8 @@ return failure(); Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); Value pBuf = MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); @@ -1785,10 +1779,11 @@ auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); auto stream = adaptor.getAsyncDependencies().front(); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); Value bufferSize; if (is2To4Sparsity(op.getSpmatA())) { + + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseLtDataTypeFrom(adaptor.getComputeType())); auto three = rewriter.create<LLVM::ConstantOp>(loc, getIndexType(), rewriter.getIndexAttr(3)); bufferSize = rewriter.create<LLVM::AllocaOp>(loc, llvmInt64PointerType, @@ -1798,10 +1793,14 @@ cuSparseLtSpmmBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), adaptor.getSpmatA(), stream}) + {bufferSize, adaptor.getEnv(), modeA, modeB, + adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, stream}) .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } else { + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); bufferSize = spMMBufferSizeCallBuilder .create(loc, rewriter, {adaptor.getEnv(), modeA, modeB, @@ -1822,8 +1821,8 @@ Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); - auto computeType 
= - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); auto bufferSize = SDDMMBufferSizeCallBuilder .create(loc, rewriter, @@ -1844,8 +1843,8 @@ Location loc = op.getLoc(); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); @@ -1861,8 +1860,7 @@ cuSparseLtSpmmBuilder.create(loc, rewriter, {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), - computeType, pBufs[0], pBufs[1], pBufs[2], - stream}); + pBufs[0], pBufs[1], pBufs[2], stream}); } else { Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) .allocatedPtr(rewriter, loc); @@ -1892,8 +1890,8 @@ failed(isAsyncWithOneDependency(rewriter, op))) return failure(); Location loc = op.getLoc(); - auto computeType = - genConstInt32FromComputeMode(rewriter, loc, adaptor.getComputeType()); + auto computeType = genConstInt32From( + rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA()); auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB()); auto stream = adaptor.getAsyncDependencies().front(); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -468,13 +468,13 @@ struct cusparseLtSpMatHandleAndData { cusparseLtMatDescriptor_t mat; - void *values{nullptr}; - // TODO: the following is associated with the SpMM 
operator rather than the - // sparse matrix. Create workspace buffers and pass them to the SpMM + // TODO: the following three are associated with the SpMM operator rather than + // the sparse matrix. Create workspace buffers and pass them to the SpMM // execution. cusparseLtMatmulAlgSelection_t alg_sel; cusparseLtMatmulPlan_t plan; cusparseLtMatmulDescriptor_t matmul; + void *values{nullptr}; }; struct cusparseLtDnMatHandleAndData { @@ -482,7 +482,7 @@ void *values{nullptr}; }; -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLTEnvHandleSize() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtEnvHandleSize() { assert(sizeof(cusparseLtHandle_t) == 11024); } @@ -490,11 +490,11 @@ return assert(sizeof(cusparseLtSpMatHandleAndData) == 44104); } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSparseLtDnMatHandleSize() { +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuAssertSparseLtDnMatHandleSize() { return assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { // note that cuSparseLt still uses cusparseStatus_t CUSPARSE_REPORT_IF_ERROR( @@ -510,15 +510,15 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dw, CUstream /*stream*/) { - cusparseLtMatDescriptor_t mat; + void *values, int32_t dtp, CUstream /*stream*/) { auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); + memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast<cusparseLtDnMatHandleAndData *>(dh); - cudaDataType_t dtp = dataTp(dw); + auto dTp = static_cast<cudaDataType_t>(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dh->mat), rows, cols, /*lda=*/cols, - /*alignment=*/16, dtp, CUSPARSE_ORDER_ROW)) + handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) 
dnmat_handle->values = values; } @@ -526,56 +526,66 @@ // cusparseLt extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroyCuSparseLtSpMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m); + auto matAndData = reinterpret_cast<cusparseLtSpMatHandleAndData *>(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuDestroyCuSparseLtDnMat(void *m, CUstream /*stream*/) { - auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m); - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + auto matAndData = reinterpret_cast<cusparseLtDnMatHandleAndData *>(m); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matAndData->mat))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dw, CUstream /*stream*/) { + void *values, int32_t dtp, CUstream /*stream*/) { auto spmat_handle = reinterpret_cast<cusparseLtSpMatHandleAndData *>(sh); + memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); - cudaDataType_t dtp = dataTp_cusparseLt(dw); + auto dTp = static_cast<cudaDataType_t>(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(sh->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, dtp, - CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, + dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. 
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *workspace_size, void *compressed_size, - void *compressed_buffer_size, void *h, void *a, +mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a, + void *b, void *c, int32_t ctp, CUstream /*stream*/) { // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a); + auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b); + auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c); + auto workspace_size = reinterpret_cast<size_t *>(bs); + auto compressed_size = &(reinterpret_cast<size_t *>(bs)[1]); + auto compressed_buffer_size = &(reinterpret_cast<size_t *>(bs)[2]); + auto cTp = static_cast<cusparseComputeType>(ctp); - CHECK_CUSPARSE(cusparseLtMatmulAlgSelectionInit( - handle, &(matWithData.alg_sel), &matmul, CUSPARSELT_MATMUL_ALG_DEFAULT)) + cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma); + cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb); + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( + handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), + &(matC->mat), &(matC->mat), cTp)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( + handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; - CHECK_CUSPARSE(cusparseLtMatmulAlgSetAttribute( - handle, &(matWithData.alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( + handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) - // TODO: add transpose support - CHECK_CUSPARSE(cusparseLtMatmulDescriptorInit( - handle, &(matA.matmul), c, CUSPARSE_OPERATION_NON_TRANSPOSE, &(matA->mat), - &matB, &matC, &matC, compute_type)) - CHECK_CUSPARSE(cusparseLtMatmulPlanInit(handle, &(matWithData.plan), &matmul, - &(matWithData.alg_sel))) - - CHECK_CUSPARSE( - cusparseLtMatmulGetWorkspace(handle, &(matA.plan), workspace_size)) - 
CHECK_CUSPARSE(cusparseLtSpMMACompressedSize( - handle, &(matA.plan), compressed_size, compressed_buffer_size)) + + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( + handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + + CUSPARSE_REPORT_IF_ERROR( + cusparseLtMatmulGetWorkspace(handle, &(matA->plan), workspace_size)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( + handle, &(matA->plan), compressed_size, compressed_buffer_size)) // avoid zero-alloc *workspace_size = (*workspace_size == 0 ? 1 : *workspace_size); @@ -586,34 +596,29 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *alg_sel, void *plan, void *matmul, void *h, void *a, - void *b, void *c, int32_t dw, void *buf, void *dA_compressed, - void *dA_compressedBuffer, CUstream stream) { +mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace, + void *dA_compressed, void *dA_compressedBuffer, + CUstream stream) { auto handle = reinterpret_cast<cusparseLtHandle_t *>(h); auto matA = reinterpret_cast<cusparseLtSpMatHandleAndData *>(a); auto matB = reinterpret_cast<cusparseLtDnMatHandleAndData *>(b); auto matC = reinterpret_cast<cusparseLtDnMatHandleAndData *>(c); - cusparseLtMatmulAlgSelection_t alg_sel; - cusparseLtMatmulPlan_t plan; - cusparseLtMatmulDescriptor_t matmul; - - ALPHABETA(dw, alpha, beta) - - CHECK_CUSPARSE(cusparseLtSpMMACompress(handle, &(matA->plan), &(matA->values), - dA_compressed, dA_compressedBuffer, - stream)) + ALPHABETA(CUDA_R_32F, alpha, beta) + CUSPARSE_REPORT_IF_ERROR( + cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values), + dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. 
D = A*B+C using C==D for now - CHECK_CUSPARSE( - cusparseLtMatmul(handle, reinterpret_cast<cusparseLtMatmulPlan_t *>(plan), - &alpha, dA_compressed, dB, &beta, matC->values, - /*dD*/ matC->values, d_workspace, &stream, 1)) + CUSPARSE_REPORT_IF_ERROR( + cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + matB->values, betap, matC->values, + /*dD*/ matC->values, d_workspace, nullptr, 0)) - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(mat->mat))) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatDescriptorDestroy(&(matA->mat))) // destroy the plan associated with the sparse matrix - CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(mat->plan))) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanDestroy(&(matA->plan))) } #endif // MLIR_ENABLE_CUDA_CUSPARSELT