diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -2391,10 +2391,10 @@
   }];
 }
 
-def GPU_SpGEMMGetSizeOp : GPU_Op<"spgemm_get_size", [GPU_AsyncOpInterface]> {
-  let summary = "SpGEMM get size operation";
+def GPU_SpMatGetSizeOp : GPU_Op<"spmat_get_size", [GPU_AsyncOpInterface]> {
+  let summary = "SpMat get size operation";
   let description = [{
-    The `gpu.spgemm_get_size` operation retrieves the number of rows, number of
+    The `gpu.spmat_get_size` operation retrieves the number of rows, number of
     columns, and number of non-zero elements of a sparse matrix.
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
@@ -2404,7 +2404,7 @@
     Example:
 
     ```mlir
-    %rows, %cols, %nnz, %token = gpu.spgemm_get_size async [%dep] %spmatC
+    %rows, %cols, %nnz, %token = gpu.spmat_get_size async [%dep] %spmatC
     ```
   }];
 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -296,6 +296,14 @@
       llvmVoidType,
       {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
        llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}};
+  FunctionCallBuilder createSpGEMMCreateDescrBuilder = {
+      "mgpuSpGEMMCreateDescr",
+      llvmPointerType,
+      {llvmPointerType /*void *stream*/}};
+  FunctionCallBuilder createSpGEMMDestroyDescrBuilder = {
+      "mgpuSpGEMMDestroyDescr",
+      llvmVoidType,
+      {llvmPointerType /*s*/, llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMWorkEstimationBuilder = {
       "mgpuSpGEMMWorkEstimation",
       llvmIntPtrType,
@@ -316,16 +324,8 @@
       {llvmPointerType /*s*/, llvmInt32Type /*ma*/, llvmInt32Type /*mb*/,
        llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
       llvmInt32Type /*ctp*/, llvmPointerType /*void *stream*/}};
-  FunctionCallBuilder createSpGEMMCreateDescrBuilder = {
-      "mgpuSpGEMMCreateDescr",
-      llvmPointerType,
-      {llvmPointerType /*void *stream*/}};
-  FunctionCallBuilder createSpGEMMDestroyDescrBuilder = {
-      "mgpuSpGEMMDestroyDescr",
-      llvmVoidType,
-      {llvmPointerType /*s*/, llvmPointerType /*void *stream*/}};
-  FunctionCallBuilder createSpGEMMGetSizeBuilder = {
-      "mgpuSpGEMMGetSize",
+  FunctionCallBuilder createSpMatGetSizeBuilder = {
+      "mgpuSpMatGetSize",
       llvmVoidType,
       {llvmPointerType /*mc*/, llvmPointerType /*rc*/, llvmPointerType /*cc*/,
        llvmPointerType /*nc*/, llvmPointerType /*void *stream*/}};
@@ -564,7 +564,7 @@
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMDestroyDescrOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMWorkEstimationOrComputeOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMCopyOp)
-DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpGEMMGetSizeOp)
+DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SpMatGetSizeOp)
 DECLARE_CONVERT_OP_TO_GPU_RUNTIME_CALL_PATTERN(SetCsrPointersOp)
 
 } // namespace
@@ -1852,8 +1852,8 @@
   return success();
 }
 
-LogicalResult ConvertSpGEMMGetSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
-    gpu::SpGEMMGetSizeOp op, OpAdaptor adaptor,
+LogicalResult ConvertSpMatGetSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::SpMatGetSizeOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
   if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
       failed(isAsyncWithOneDependency(rewriter, op)))
@@ -1878,7 +1878,7 @@
       loc, llvmInt64PointerType, llvmInt64PointerType, buffer,
       ValueRange{rewriter.create<LLVM::ConstantOp>(loc, getIndexType(),
                                                    rewriter.getIndexAttr(2))});
-  createSpGEMMGetSizeBuilder.create(
+  createSpMatGetSizeBuilder.create(
       loc, rewriter, {adaptor.getSpmat(), rowsPtr, colsPtr, nnzsPtr, stream});
   auto rows = rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, rowsPtr);
   auto cols = rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, colsPtr);
@@ -1950,7 +1950,7 @@
       ConvertSpGEMMDestroyDescrOpToGpuRuntimeCallPattern,
       ConvertSpGEMMWorkEstimationOrComputeOpToGpuRuntimeCallPattern,
       ConvertSpGEMMCopyOpToGpuRuntimeCallPattern,
-      ConvertSpGEMMGetSizeOpToGpuRuntimeCallPattern,
+      ConvertSpMatGetSizeOpToGpuRuntimeCallPattern,
       ConvertSetCsrPointersOpToGpuRuntimeCallPattern>(converter);
   patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
       converter, gpuBinaryAnnotation, kernelBarePtrCallConv, cachedModuleTable);
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -844,7 +844,7 @@
   token = compute2->getResult(1);
 
   // Get sizes.
-  Operation *sizes = rewriter.create<gpu::SpGEMMGetSizeOp>(
+  Operation *sizes = rewriter.create<gpu::SpMatGetSizeOp>(
       loc, indexTp, indexTp, indexTp, tokenTp, token, spMatC);
   Value nnz = sizes->getResult(2);
   token = sizes->getResult(3);
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -603,6 +603,19 @@
                                       CUSPARSE_SDDMM_ALG_DEFAULT, buf))
 }
 
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
+mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
+  cusparseSpGEMMDescr_t spgemmDesc = nullptr;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
+  return reinterpret_cast<void *>(spgemmDesc);
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) {
+  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
+}
+
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
     void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
     intptr_t bs, void *buf, CUstream /*stream*/) {
@@ -655,21 +668,8 @@
                                       matC, cTp, CUSPARSE_SPGEMM_DEFAULT,
                                       spgemmDesc))
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
-mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
-  cusparseSpGEMMDescr_t spgemmDesc = nullptr;
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
-  return reinterpret_cast<void *>(spgemmDesc);
-}
-
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) {
-  cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
-}
-
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSpGEMMGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) {
+mgpuSpMatGetSize(void *m, void *r, void *c, void *n, CUstream /*stream*/) {
   cusparseConstSpMatDescr_t matDescr =
       reinterpret_cast<cusparseConstSpMatDescr_t>(m);
   int64_t *rows = reinterpret_cast<int64_t *>(r);
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -64,7 +64,7 @@
   // CHECK: llvm.call @mgpuSpGEMMCreateDescr
   // CHECK: llvm.call @mgpuSpGEMMWorkEstimation
   // CHECK: llvm.call @mgpuSpGEMMCompute
-  // CHECK: llvm.call @mgpuSpGEMMGetSize
+  // CHECK: llvm.call @mgpuSpMatGetSize
   // CHECK: llvm.call @mgpuSetCsrPointers
   // CHECK: llvm.call @mgpuSpGEMMCopy
   // CHECK: llvm.call @mgpuSpGEMMDestroyDescr
@@ -91,7 +91,7 @@
                           [%token7]{COMPUTE}
                           %spmatA, %spmatB, %spmatC,
                           %spgemmDesc, %c0, %alloc: f32 into memref<0xi8>
-  %rows, %cols, %nnz, %token9 = gpu.spgemm_get_size async [%token8] %spmatC
+  %rows, %cols, %nnz, %token9 = gpu.spmat_get_size async [%token8] %spmatC
   %token10 = gpu.set_csr_pointers async [%token8] %spmatC, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf32>
   %token11 = gpu.spgemm_copy async [%token10] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
   %token12 = gpu.spgemm_destroy_descr async [%token11] %spgemmDesc
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -64,7 +64,7 @@
   // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_create_descr async [%{{.*}}]
   // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ WORK_ESTIMATION} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
   // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{ COMPUTE} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
-  // CHECK: %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_get_size async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} = gpu.spmat_get_size async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.set_csr_pointers async [%{{.*}}] %{{.*}}, {{.*}}, {{.*}}, {{.*}} : memref<?xindex>, memref<?xindex>, memref<?xf32>
   // CHECK: %{{.*}} = gpu.spgemm_copy async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
   // CHECK: %{{.*}} = gpu.spgemm_destroy_descr async [%{{.*}}] %{{.*}}
@@ -91,7 +91,7 @@
                           [%token7]{COMPUTE}
                           %spmatA, %spmatB, %spmatC,
                           %spgemmDesc, %c0, %alloc: f32 into memref<0xi8>
-  %rows, %cols, %nnz, %token9 = gpu.spgemm_get_size async [%token8] %spmatC
+  %rows, %cols, %nnz, %token9 = gpu.spmat_get_size async [%token8] %spmatC
   %token10 = gpu.set_csr_pointers async [%token8] %spmatC, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf32>
   %token11 = gpu.spgemm_copy async [%token10] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
   %token12 = gpu.spgemm_destroy_descr async [%token11] %spgemmDesc
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_spgemm_lib.mlir
@@ -57,7 +57,7 @@
 // CHECK:           %[[VAL_65:.*]], %[[VAL_66:.*]] = gpu.spgemm_work_estimation_or_compute async {{\[}}%[[VAL_64]]]{ COMPUTE} %[[VAL_45]], %[[VAL_47]], %[[VAL_55]], %[[VAL_57]], %[[VAL_3]], %[[VAL_53]] : f32 into memref<0xi8>
 // CHECK:           %[[VAL_67:.*]], %[[VAL_68:.*]] = gpu.alloc async {{\[}}%[[VAL_66]]] (%[[VAL_65]]) : memref<?xi8>
 // CHECK:           %[[VAL_69:.*]], %[[VAL_70:.*]] = gpu.spgemm_work_estimation_or_compute async {{\[}}%[[VAL_68]]]{ COMPUTE} %[[VAL_45]], %[[VAL_47]], %[[VAL_55]], %[[VAL_57]], %[[VAL_65]], %[[VAL_67]] : f32 into memref<?xi8>
-// CHECK:           %[[VAL_71:.*]], %[[VAL_72:.*]], %[[VAL_73:.*]], %[[VAL_74:.*]] = gpu.spgemm_get_size async {{\[}}%[[VAL_70]]] %[[VAL_55]]
+// CHECK:           %[[VAL_71:.*]], %[[VAL_72:.*]], %[[VAL_73:.*]], %[[VAL_74:.*]] = gpu.spmat_get_size async {{\[}}%[[VAL_70]]] %[[VAL_55]]
 // CHECK:           %[[VAL_75:.*]], %[[VAL_76:.*]] = gpu.alloc async {{\[}}%[[VAL_74]]] (%[[VAL_73]]) : memref<?xindex>
 // CHECK:           %[[VAL_77:.*]], %[[VAL_78:.*]] = gpu.alloc async {{\[}}%[[VAL_76]]] (%[[VAL_73]]) : memref<?xf32>
 // CHECK:           %[[VAL_79:.*]] = gpu.set_csr_pointers async {{\[}}%[[VAL_78]]] %[[VAL_55]], %[[VAL_49]], %[[VAL_75]], %[[VAL_77]] : memref<?xindex>, memref<?xindex>, memref<?xf32>
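For context (not part of the patch): the renamed runtime wrapper mirrors the underlying cuSPARSE entry point, `cusparseSpMatGetSize`, which queries the dimensions and nnz of any `cusparseSpMatDescr_t`, not only the result of an SpGEMM; hence dropping the SpGEMM prefix from the op and wrapper names. A minimal host-side sketch of that behavior, assuming cuSPARSE from CUDA 11+ and pre-populated device buffers `dRowOffsets`, `dColIdxs`, `dValues` (hypothetical names, not from the patch):

```cpp
#include <cusparse.h>
#include <cstdint>
#include <cstdio>

// Query the size of a plain CSR descriptor, outside any SpGEMM flow.
// Sketch only: return statuses are ignored for brevity.
void querySize(int64_t m, int64_t n, int64_t nnz,
               void *dRowOffsets, void *dColIdxs, void *dValues) {
  cusparseHandle_t handle;
  cusparseCreate(&handle);

  // Build a CSR descriptor from already-populated device buffers.
  cusparseSpMatDescr_t mat;
  cusparseCreateCsr(&mat, m, n, nnz, dRowOffsets, dColIdxs, dValues,
                    CUSPARSE_INDEX_64I, CUSPARSE_INDEX_64I,
                    CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);

  // Works on any sparse matrix descriptor, not just an SpGEMM result.
  int64_t rows, cols, nonzeros;
  cusparseSpMatGetSize(mat, &rows, &cols, &nonzeros);
  printf("rows=%lld cols=%lld nnz=%lld\n", (long long)rows, (long long)cols,
         (long long)nonzeros);

  cusparseDestroySpMat(mat);
  cusparseDestroy(handle);
}
```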