diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td
@@ -122,6 +122,12 @@
               "sparse matrix handle type">,
     BuildableType<"mlir::gpu::SparseSpMatHandleType::get($_builder.getContext())">;

+def GPU_SparseSpGEMMOpHandle :
+    DialectType<GPU_Dialect,
+      CPred<"llvm::isa<::mlir::gpu::SparseSpGEMMOpHandleType>($_self)">,
+                "SpGEMM operation handle type">,
+    BuildableType<"mlir::gpu::SparseSpGEMMOpHandleType::get($_builder.getContext())">;
+
 //===----------------------------------------------------------------------===//
 // GPU Interfaces.
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h
@@ -165,7 +165,7 @@
 void addAsyncDependency(Operation *op, Value token);

 // Handle types for sparse.
-enum class SparseHandleKind { SpMat, DnTensor };
+enum class SparseHandleKind { SpMat, DnTensor, SpGEMMOp };

 template <SparseHandleKind K>
 class SparseHandleType
@@ -178,6 +178,7 @@

 using SparseDnTensorHandleType = SparseHandleType<SparseHandleKind::DnTensor>;
 using SparseSpMatHandleType = SparseHandleType<SparseHandleKind::SpMat>;
+using SparseSpGEMMOpHandleType = SparseHandleType<SparseHandleKind::SpGEMMOp>;

 } // namespace gpu
 } // namespace mlir
diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1799,6 +1799,22 @@
   let defaultValue = "TransposeMode::NON_TRANSPOSE";
 }

+def GPU_SpGEMMAlg : I32EnumAttr<"SpGEMMAlg",
+    "selected algorithm for sparse matrix SpGEMM ops",
+    [
+      I32EnumAttrCase<"ALG1", 0>,
+      I32EnumAttrCase<"ALG2", 1>,
+      I32EnumAttrCase<"ALG3", 2>
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = GPU_Dialect.cppNamespace;
+}
+
+def GPU_SpGEMMAlgAttr : EnumAttr<GPU_Dialect, GPU_SpGEMMAlg, "spgemm_alg"> {
+  let defaultValue = "SpGEMMAlg::ALG2";
+}
+
 def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
   let summary = "Precompute buffersize for SpMV operation";
   let description = [{
@@ -2111,4 +2127,316 @@
   }];
 }

+// TODO: cusparseSpMatGetSize
+
+def GPU_SpGEMMCreateDescrOp : GPU_Op<"spgemm_create_descr", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM create descriptor operation";
+  let description = [{
+    The `gpu.spgemm_create_descr` operation creates a descriptor that carries
+    the internal state of a SpGEMM computation across its stages (work
+    estimation, memory estimation, compute, and copy).
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the descriptor.
+
+    Example:
+
+    ```mlir
+    %desc, %token = gpu.spgemm_create_descr async [%dep]
+    ```
+  }];
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
+  let results = (outs GPU_SparseSpGEMMOpHandle:$desc,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    attr-dict
+  }];
+}
+
+def GPU_SpGEMMDestroyDescrOp : GPU_Op<"spgemm_destroy_descr", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM destroy descriptor operation";
+  let description = [{
+    The `gpu.spgemm_destroy_descr` operation releases the resources associated
+    with a SpGEMM descriptor previously created by `gpu.spgemm_create_descr`.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.spgemm_destroy_descr async [%dep] %desc
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseSpGEMMOpHandle:$desc);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $desc attr-dict
+  }];
+}
+
+def GPU_SpGEMMWorkEstimationOp : GPU_Op<"spgemm_work_estimation", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM work estimation operation";
+  let description = [{
+    The `gpu.spgemm_work_estimation` operation performs the work-estimation
+    stage of the SpGEMM computation and returns the size of the temporary
+    device buffer required by the subsequent stages. The operation expects
+    handles returned by previous sparse operations as its operands.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer size.
+
+    Example:
+
+    ```mlir
+    %bufferSzNew, %token = gpu.spgemm_work_estimation async [%dep] %spmatA,
+        %spmatB, %spmatC, ALG2, %desc, %bufferSz, %buffer : memref<?xi8>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseSpGEMMOpHandle:$desc,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
+                       GPU_SparseSpMatHandle:$spmatA,
+                       GPU_SparseSpMatHandle:$spmatB,
+                       GPU_SparseSpMatHandle:$spmatC,
+                       Index:$bufferSz,
+                       GPU_SpGEMMAlgAttr:$alg,
+                       AnyMemRef:$buffer);
+  let results = (outs Index:$bufferSzNew,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let builders = [OpBuilder<(ins
+      "Type":$bufferSzNew,
+      "Type":$asyncToken,
+      "ValueRange":$asyncDependencies,
+      "Value":$desc,
+      "Value":$spmatA,
+      "Value":$spmatB,
+      "Value":$spmatC,
+      "Value":$bufferSz,
+      "Value":$buffer), [{
+    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+    auto alg = gpu::SpGEMMAlg::ALG2;
+    return build($_builder, $_state, bufferSzNew, asyncToken, asyncDependencies,
+                 desc, modeA, modeB, spmatA, spmatB, spmatC, bufferSz, alg,
+                 buffer);}]>
+  ];
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,`
+    $alg `,` $desc `,` $bufferSz `,` $buffer attr-dict `:` type($buffer)
+  }];
+}
+
+def GPU_SpGEMMEstimateMemoryOp : GPU_Op<"spgemm_estimate_memory", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM estimate memory operation";
+  let description = [{
+    The `gpu.spgemm_estimate_memory` operation performs the memory-estimation
+    stage of the SpGEMM computation and returns the sizes of the temporary
+    device buffers needed by the subsequent stages.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer sizes.
+
+    Example:
+
+    ```mlir
+    %bufferSz3New, %bufferSz2New, %token = gpu.spgemm_estimate_memory
+        async [%dep] %spmatA, %spmatB, %spmatC, ALG2, %desc, %bufferSz3,
+        %bufferSz2, %buffer3 : f32 into memref<?xi8>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseSpGEMMOpHandle:$desc,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
+                       GPU_SparseSpMatHandle:$spmatA,
+                       GPU_SparseSpMatHandle:$spmatB,
+                       GPU_SparseSpMatHandle:$spmatC,
+                       TypeAttr:$computeType,
+                       GPU_SpGEMMAlgAttr:$alg,
+                       Index:$bufferSz3,
+                       AnyMemRef:$buffer3,
+                       Index:$bufferSz2);
+  let results = (outs Index:$bufferSz3New,
+                      Index:$bufferSz2New,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let builders = [OpBuilder<(ins
+      "Type":$bufferSz3New,
+      "Type":$bufferSz2New,
+      "Type":$asyncToken,
+      "ValueRange":$asyncDependencies,
+      "Value":$desc,
+      "Value":$spmatA,
+      "Value":$spmatB,
+      "Value":$spmatC,
+      "Type":$computeType,
+      "Value":$bufferSz3,
+      "Value":$buffer3,
+      "Value":$bufferSz2), [{
+    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+    auto alg = gpu::SpGEMMAlg::ALG2;
+    return build($_builder, $_state, bufferSz3New, bufferSz2New, asyncToken,
+                 asyncDependencies, desc, modeA, modeB, spmatA, spmatB, spmatC,
+                 computeType, alg, bufferSz3, buffer3, bufferSz2);}]>
+  ];
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,`
+    $alg `,` $desc `,` $bufferSz3 `,` $bufferSz2 `,` $buffer3 attr-dict
+    `:` $computeType `into` type($buffer3)
+  }];
+}
+
+def GPU_SpGEMMComputeOp : GPU_Op<"spgemm_compute", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM compute operation";
+  let description = [{
+    The `gpu.spgemm_compute` operation performs the compute stage of the
+    SpGEMM operation on the given sparse matrices and buffer, computing
+
+    C' = alpha * op(A) * op(B) + beta * C
+
+    The operation expects handles returned by previous sparse operations as
+    its operands, and the buffer must have been allocated on the device.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to its results.
+
+    Example:
+
+    ```mlir
+    %bufferSz2New, %token = gpu.spgemm_compute async [%dep] %spmatA, %spmatB,
+        %spmatC, ALG2, %desc, %bufferSz2, %buffer2 : f32 into memref<?xi8>
+    ```
+
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseSpGEMMOpHandle:$desc,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
+                       GPU_SparseSpMatHandle:$spmatA,
+                       GPU_SparseSpMatHandle:$spmatB,
+                       GPU_SparseSpMatHandle:$spmatC,
+                       TypeAttr:$computeType,
+                       GPU_SpGEMMAlgAttr:$alg,
+                       Index:$bufferSz2,
+                       AnyMemRef:$buffer2);
+  let results = (outs Index:$bufferSz2New,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let builders = [OpBuilder<(ins
+      "Type":$bufferSz2New,
+      "Type":$asyncToken,
+      "ValueRange":$asyncDependencies,
+      "Value":$desc,
+      "Value":$spmatA,
+      "Value":$spmatB,
+      "Value":$spmatC,
+      "Type":$computeType,
+      "Value":$bufferSz2,
+      "Value":$buffer2), [{
+    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+    auto alg = gpu::SpGEMMAlg::ALG2;
+    return build($_builder, $_state, bufferSz2New, asyncToken,
+                 asyncDependencies, desc, modeA, modeB, spmatA, spmatB, spmatC,
+                 computeType, alg, bufferSz2, buffer2);}]>
+  ];
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,`
+    $alg `,` $desc `,` $bufferSz2 `,` $buffer2 attr-dict
+    `:` $computeType `into` type($buffer2)
+  }];
+}
+
+def GPU_SpGEMMCopyOp : GPU_Op<"spgemm_copy", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM copy operation";
+  let description = [{
+    The `gpu.spgemm_copy` operation copies the result of the SpGEMM
+    computation into the sparse matrix `spmatC`. It is the final stage of the
+    SpGEMM algorithm and must be issued after `gpu.spgemm_compute`. The
+    operation expects handles returned by previous sparse operations as its
+    operands.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.spgemm_copy async [%dep] %spmatA, %spmatB, %spmatC, ALG2, %desc : f32
+    ```
+
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseSpGEMMOpHandle:$desc,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
+                       GPU_SparseSpMatHandle:$spmatA,
+                       GPU_SparseSpMatHandle:$spmatB,
+                       GPU_SparseSpMatHandle:$spmatC,
+                       TypeAttr:$computeType,
+                       GPU_SpGEMMAlgAttr:$alg);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let builders = [OpBuilder<(ins
+      "Type":$asyncToken,
+      "ValueRange":$asyncDependencies,
+      "Value":$desc,
+      "Value":$spmatA,
+      "Value":$spmatB,
+      "Value":$spmatC,
+      "Type":$computeType), [{
+    auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+    auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
+    auto alg = gpu::SpGEMMAlg::ALG2;
+    return build($_builder, $_state, asyncToken, asyncDependencies, desc,
+                 modeA, modeB, spmatA, spmatB, spmatC, computeType, alg);}]>
+  ];
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)?
+    `,` $spmatC `,` $alg `,` $desc attr-dict `:` $computeType
+  }];
+}
+
+def GPU_SpGEMMGetSizeOp : GPU_Op<"spgemm_get_size", [GPU_AsyncOpInterface]> {
+  let summary = "SpGEMM get size operation";
+  let description = [{
+    The `gpu.spgemm_get_size` operation retrieves the number of rows, number
+    of columns, and number of non-zero elements of a sparse matrix.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the sizes.
+
+    Example:
+
+    ```mlir
+    %rows, %cols, %nnz, %token = gpu.spgemm_get_size async [%dep] %spmatC
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseSpMatHandle:$spmat);
+  let results = (outs Index:$rows,
+                      Index:$cols,
+                      Index:$nnz,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $spmat attr-dict
+  }];
+}
 #endif // GPU_OPS
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -1784,6 +1784,7 @@
   addOpaquePointerConversion<gpu::AsyncTokenType>(converter);
   addOpaquePointerConversion<gpu::SparseDnTensorHandleType>(converter);
   addOpaquePointerConversion<gpu::SparseSpMatHandleType>(converter);
+  addOpaquePointerConversion<gpu::SparseSpGEMMOpHandleType>(converter);

   patterns.add<
diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
--- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
+++ b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp
   addTypes<SparseDnTensorHandleType>();
   addTypes<SparseSpMatHandleType>();
+  addTypes<SparseSpGEMMOpHandleType>();
   addOperations<
 #define GET_OP_LIST
 #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc"
@@ -165,6 +166,8 @@
     return "sparse.dntensor_handle";
   case SparseHandleKind::SpMat:
     return "sparse.spmat_handle";
+  case SparseHandleKind::SpGEMMOp:
+    return "sparse.spgemmop_handle";
   }
   llvm_unreachable("unknown sparse handle kind");
   return "";
@@ -217,6 +220,8 @@
     return SparseDnTensorHandleType::get(context);
   if (keyword == getSparseHandleKeyword(SparseHandleKind::SpMat))
     return SparseSpMatHandleType::get(context);
+  if (keyword == getSparseHandleKeyword(SparseHandleKind::SpGEMMOp))
+    return SparseSpGEMMOpHandleType::get(context);

   parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword);
   return Type();
@@ -231,6 +236,9 @@
       })
      .Case<SparseSpMatHandleType>(
          [&](Type) { os << getSparseHandleKeyword(SparseHandleKind::SpMat); })
+      .Case<SparseSpGEMMOpHandleType>([&](Type) {
+        os << getSparseHandleKeyword(SparseHandleKind::SpGEMMOp);
+      })
      .Case<MMAMatrixType>([&](MMAMatrixType fragTy) {
        os << "mma_matrix<";
        auto shape = fragTy.getShape();
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -478,6 +478,116 @@
       CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }

+// TODO: add support for passing alpha and beta as arguments
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
+    void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
+    void *buf, intptr_t bs, CUstream /*stream*/) {
+  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  size_t newBufferSize = bs;
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
+      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
+  return newBufferSize == 0 ? 1 : newBufferSize; // avoid zero-alloc
+}
+
+// TODO: add support for passing alpha and beta as arguments
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpGEMMEstimateMemory(void *nbs3, void *nbs2, void *s, int32_t ma,
+                         int32_t mb, void *a, void *b, void *c, int32_t ctp,
+                         int32_t alg, intptr_t bs3, void *buf3, intptr_t bs2,
+                         CUstream /*stream*/) {
+  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  size_t *newBufferSize2 = reinterpret_cast<size_t *>(nbs2);
+  size_t *newBufferSize3 = reinterpret_cast<size_t *>(nbs3);
+  *newBufferSize2 = bs2;
+  *newBufferSize3 = bs3;
+  cusparseSpGEMMAlg_t algorithm = static_cast<cusparseSpGEMMAlg_t>(alg);
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_estimateMemory(
+      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      algorithm, spgemmDesc, newBufferSize3, buf3, newBufferSize2))
+  // avoid zero-alloc
+  if (*newBufferSize2 == 0)
+    *newBufferSize2 = 1;
+  if (*newBufferSize3 == 0)
+    *newBufferSize3 = 1;
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                  int32_t ctp, intptr_t bs, void *buf, CUstream /*stream*/) {
+  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+  size_t newBufferSize = bs;
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
+      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
+  return newBufferSize == 0 ? 1 : newBufferSize; // avoid zero-alloc
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
+               int32_t ctp, CUstream /*stream*/) {
+  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
+  cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
+  auto cTp = static_cast<cudaDataType_t>(ctp);
+  ALPHABETA(cTp, alpha, beta)
+
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_copy(
+      cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *mgpuSpGEMMCreateDescr() {
+  // cusparseSpGEMMDescr_t is a pointer type
+  cusparseSpGEMMDescr_t spgemmDesc = nullptr;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+  return reinterpret_cast<void *>(spgemmDesc);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpGEMMDestroyDescr(void *s) {
+  // cusparseSpGEMMDescr_t is a pointer type
+  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpGEMMGetSize(void *mc, void *rc, void *cc, void *nc,
+                  CUstream /*stream*/) {
+  cusparseConstSpMatDescr_t matCDescr =
+      reinterpret_cast<cusparseConstSpMatDescr_t>(mc);
+  int64_t *rowsC = reinterpret_cast<int64_t *>(rc);
+  int64_t *colsC = reinterpret_cast<int64_t *>(cc);
+  int64_t *nnzC = reinterpret_cast<int64_t *>(nc);
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMatGetSize(matCDescr, rowsC, colsC, nnzC))
+}
+
 #ifdef MLIR_ENABLE_CUDA_CUSPARSELT
 ///
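
For reviewers, here is a rough sketch of how the new ops are intended to chain together, based only on the assembly formats declared above. It is illustrative rather than a verified test: the creation of the sparse matrix handles, the buffer allocations, the `%c0`/`%empty` placeholder values, and the printed `ALG2` enum spelling are all assumptions.

```mlir
// Create the SpGEMM descriptor that carries state across the stages.
%desc, %t0 = gpu.spgemm_create_descr async [%t]

// Stage 1: work estimation returns the required size of the first buffer.
// %c0 and %empty stand for a zero size and an empty placeholder buffer.
%bufSz1, %t1 = gpu.spgemm_work_estimation async [%t0] %spmatA, %spmatB,
    %spmatC, ALG2, %desc, %c0, %empty : memref<0xi8>
// ... allocate %buf1 of size %bufSz1 and repeat work estimation with it ...

// Stage 2: memory estimation returns the sizes of buffers 3 and 2.
%bufSz3, %bufSz2, %t2 = gpu.spgemm_estimate_memory async [%t1] %spmatA,
    %spmatB, %spmatC, ALG2, %desc, %c0, %c0, %empty : f32 into memref<0xi8>
// ... allocate %buf2 of size %bufSz2 ...

// Stage 3: the actual computation, using buffer 2.
%bufSz2n, %t3 = gpu.spgemm_compute async [%t2] %spmatA, %spmatB, %spmatC,
    ALG2, %desc, %bufSz2, %buf2 : f32 into memref<?xi8>

// Query the result size, copy the result into %spmatC, and clean up.
%rows, %cols, %nnz, %t4 = gpu.spgemm_get_size async [%t3] %spmatC
%t5 = gpu.spgemm_copy async [%t4] %spmatA, %spmatB, %spmatC, ALG2, %desc : f32
%t6 = gpu.spgemm_destroy_descr async [%t5] %desc
```

This mirrors the staged cuSPARSE SpGEMM flow (`cusparseSpGEMM_workEstimation`, `cusparseSpGEMM_estimateMemory`, `cusparseSpGEMM_compute`, `cusparseSpMatGetSize`, `cusparseSpGEMM_copy`) that the runtime wrappers above invoke.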