diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -2111,4 +2111,131 @@
   }];
 }
 
+def GPU_SDDMMBufferSizeOp : GPU_Op<"sddmm_buffer_size", [GPU_AsyncOpInterface]> {
+  let summary = "Precompute buffer size for SDDMM operation";
+  let description = [{
+    The `gpu.sddmm_buffer_size` operation returns the buffer size required
+    to perform the SDDMM operation on the given sparse and dense matrices.
+    The operation expects handles returned by previous sparse operations
+    to construct an environment and the operands for SDDMM.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer size.
+
+    Example:
+
+    ```mlir
+    %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32
+    ```
+
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseEnvHandle:$env,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
+                       GPU_SparseDnMatHandle:$dnmatA,
+                       GPU_SparseDnMatHandle:$dnmatB,
+                       GPU_SparseSpMatHandle:$spmatC,
+                       TypeAttr:$computeType);
+  let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);
+
+  let builders = [OpBuilder<(ins
+      "Type":$bufferSz,
+      "Type":$asyncToken,
+      "ValueRange":$asyncDependencies,
+      "Value":$env,
+      "Value":$dnmatA,
+      "Value":$dnmatB,
+      "Value":$spmatC,
+      "Type":$computeType), [{
+    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+    return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies,
+                 env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]>
+  ];
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType
+  }];
+}
+
+// TODO: cusparseSpGEMM_createDescr, cusparseSpGEMM_destroyDescr,
+// cusparseSpGEMM_workEstimation, cusparseSpGEMM_estimateMemory,
+// cusparseSpGEMM_compute, cusparseSpMatGetSize, cusparseSpGEMM_copy
+def GPU_SpGEMMCreateDescrOp : GPU_Op<"spgemm_create_descr"> {
+  let summary = "SpGEMM create descriptor operation";
+  let description = [{
+    The `gpu.spgemm_create_descr` operation creates a descriptor for the
+    SpGEMM computation. The descriptor holds the internal state of the
+    computation and is passed to the subsequent SpGEMM operations.
+
+    Example:
+
+    ```mlir
+    %descriptor = gpu.spgemm_create_descr
+    ```
+  }];
+
+  let results = (outs GPU_SpGEMMDescriptor:$desc);
+}
+
+// def GPU_SpGEMMOp : GPU_Op<"spgemm_compute", [GPU_AsyncOpInterface]> {
+//   let summary = "SpGEMM operation";
+//   let description = [{
+//     The `gpu.spgemm` operation performs the SpGEMM operation on the given
+//     sparse matrices and buffer. The operation expects handles returned by
+//     previous sparse operations to construct an environment and the operands
+//     for SpGEMM. The buffer must have been allocated on the device.
+//
+//     C' = alpha * op(A) * op(B) + beta * C
+//
+//     If the `async` keyword is present, the op is executed asynchronously
+//     (i.e. it does not block until the execution has finished on the
+//     device). In that case, it returns a !gpu.async.token.
+//
+//     Example:
+//
+//     ```mlir
+//     %token = gpu.spgemm async [%dep] %spmatA{TRANSPOSE}, %spmatB{TRANSPOSE}, %spmatC, %buffer into f32
+//     ```
+//
+//     The matrix arguments can also be associated with one of the following
+//     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default
+//     value is NON_TRANSPOSE.
+//   }];
+//
+//   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+//                        GPU_TransposeModeAttr:$modeA,
+//                        GPU_TransposeModeAttr:$modeB,
+//                        GPU_SparseSpMatHandle:$spmatA,
+//                        GPU_SparseSpMatHandle:$spmatB,
+//                        GPU_SparseSpMatHandle:$spmatC,
+//                        TypeAttr:$computeType,
+//                        AnyMemRef:$buffer);
+//   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+//
+//   let builders = [OpBuilder<(ins
+//       "Type":$asyncToken,
+//       "ValueRange":$asyncDependencies,
+//       "Value":$spmatA,
+//       "Value":$spmatB,
+//       "Value":$spmatC,
+//       "Type":$computeType,
+//       "Value":$buffer), [{
+//     auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+//     auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+//     return build($_builder, $_state, asyncToken, asyncDependencies, modeA,
+//                  modeB, spmatA, spmatB, spmatC, computeType, buffer);}]>
+//   ];
+//
+//   let assemblyFormat = [{
+//     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+//     $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType
+//   }];
+// }
+
 #endif // GPU_OPS
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -478,6 +478,59 @@
       CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
 
+// TODO: add support for passing alpha and beta as arguments
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpGEMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b,
+                     void *c, int32_t ctp, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpGEMMDescr_t spgemmDesc;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  size_t bufferSize = 0;
+  // Ask how many bytes of external memory the work estimation needs.
+  // TODO: the descriptor created above currently leaks; it must be shared
+  // with the compute step instead of each wrapper creating its own.
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
+      handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize, nullptr))
+  return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
+}
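+
+// A runtime counterpart for the new `gpu.spgemm_create_descr` op is still
+// missing. A minimal sketch of what such wrappers could look like, following
+// this file's mgpu* conventions (the mgpuSpGEMMCreateDescr /
+// mgpuSpGEMMDestroyDescr names are illustrative, not part of this patch):
+//
+//   extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+//   mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
+//     cusparseSpGEMMDescr_t spgemmDesc = nullptr;
+//     CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+//     return reinterpret_cast<void *>(spgemmDesc);
+//   }
+//
+//   extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+//   mgpuSpGEMMDestroyDescr(void *d, CUstream /*stream*/) {
+//     auto spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(d);
+//     CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
+//   }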
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpGEMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+           int32_t ctp, void *buf, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpGEMMDescr_t spgemmDesc;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  // Run the computation with the caller-provided external buffer.
+  // TODO: reuse the descriptor from the work-estimation step and thread the
+  // actual size of `buf` through to this call instead of a fresh query.
+  size_t bufferSize = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
+      handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufferSize, buf))
+}
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSELT
 ///
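Note on the remaining work: the TODO at the top of the GPUOps.td hunk lists the full set of cuSPARSE entry points. As a reference for wiring them up, here is a sketch of the complete two-phase SpGEMM sequence, modeled on the cuSPARSE documentation; it assumes the `handle`/`matA`/`matB`/`matC`/`modeA`/`modeB`/`cTp`/`alphap`/`betap` locals are unpacked as in the wrappers above, and the `dRowPtrC`/`dColIndC`/`dValuesC` output arrays are placeholders the caller would allocate:

```c++
// Sketch only: full cuSPARSE SpGEMM flow (work estimation, compute,
// size query, copy), reusing this file's CUSPARSE_REPORT_IF_ERROR and
// CUDA_REPORT_IF_ERROR helpers.
size_t bufSz1 = 0, bufSz2 = 0;
CUdeviceptr buf1 = 0, buf2 = 0;
cusparseSpGEMMDescr_t spgemmDesc;
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))

// Phase 1: query the work-estimation buffer size, allocate, then run it.
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz1, nullptr))
CUDA_REPORT_IF_ERROR(cuMemAlloc(&buf1, bufSz1))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz1,
    reinterpret_cast<void *>(buf1)))

// Phase 2: same query/allocate/run pattern for the actual computation.
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz2, nullptr))
CUDA_REPORT_IF_ERROR(cuMemAlloc(&buf2, bufSz2))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &bufSz2,
    reinterpret_cast<void *>(buf2)))

// Read back the size of C, point matC at caller-allocated CSR arrays,
// and copy the result out of the descriptor.
int64_t rowsC, colsC, nnzC;
CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matC, &rowsC, &colsC, &nnzC))
// Placeholders: real code allocates rowsC + 1 offsets and nnzC entries.
void *dRowPtrC = nullptr, *dColIndC = nullptr, *dValuesC = nullptr;
CUSPARSE_REPORT_IF_ERROR(
    cusparseCsrSetPointers(matC, dRowPtrC, dColIndC, dValuesC))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_copy(
    handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
    CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
```

The key design consequence for the ops above is that the descriptor and both buffers must stay alive from work estimation through the final copy, which is why `gpu.spgemm_create_descr` returns a handle instead of each wrapper creating its own descriptor.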