diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1623,7 +1623,7 @@
 def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
   let summary = "Destroy dense vector operation";
   let description = [{
-    The `gpu.destroy_sparse_env` operation releases all resources of a dense
+    The `gpu.destroy_dn_vec` operation releases all resources of a dense
     vector represented by a handle that was previously created by a
     `gpu.create_dn_vec` operation.
 
@@ -1647,6 +1647,64 @@
   }];
 }
 
+def GPU_CreateDnMatOp : GPU_Op<"create_dn_mat", [GPU_AsyncOpInterface]> {
+  let summary = "Create dense matrix operation";
+  let description = [{
+    The `gpu.create_dn_mat` operation initializes a dense matrix from
+    the given values buffer and sizes. The buffer must already be copied
+    from the host to the device prior to using this operation. The
+    operation returns a handle to the dense matrix descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the handle.
+
+    Example:
+
+    ```mlir
+    %dmat, %token = gpu.create_dn_mat async [%dep] %rows, %cols, %mem : memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Index:$rows,
+                       Index:$cols,
+                       AnyMemRef:$memref);
+  let results = (outs Res<GPU_SparseHandle>:$dmat,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $memref attr-dict `:` type($memref)
+  }];
+}
+
+def GPU_DestroyDnMatOp : GPU_Op<"destroy_dn_mat", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy dense matrix operation";
+  let description = [{
+    The `gpu.destroy_dn_mat` operation releases all resources of a dense
+    matrix represented by a handle that was previously created by a
+    `gpu.create_dn_mat` operation.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.destroy_dn_mat async [%dep] %dmat
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseHandle>:$dmat);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dmat attr-dict
+  }];
+}
+
 def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
   let summary = "Create sparse matrix in COO format operation";
   let description = [{
@@ -1812,4 +1870,69 @@
   }];
 }
 
+def GPU_SpMMBufferSizeOp : GPU_Op<"spmm_buffer_size", [GPU_AsyncOpInterface]> {
+  let summary = "Precompute buffer size for SpMM operation";
+  let description = [{
+    The `gpu.spmm_buffer_size` operation returns the buffer size required
+    to perform the SpMM operation on the given sparse and dense matrices.
+    The operation expects handles returned by previous sparse operations
+    to construct the environment and the operands for SpMM.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer size.
+
+    Example:
+
+    ```mlir
+    %buffersz, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA, %dnmatB, %dnmatC
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseHandle:$env,
+                       GPU_SparseHandle:$spmatA,
+                       GPU_SparseHandle:$dnmatB,
+                       GPU_SparseHandle:$dnmatC);
+  let results = (outs Res<Index>:$bufferSz,
+                      Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $spmatA `,` $dnmatB `,` $dnmatC attr-dict
+  }];
+}
+
+def GPU_SpMMOp : GPU_Op<"spmm", [GPU_AsyncOpInterface]> {
+  let summary = "SpMM operation";
+  let description = [{
+    The `gpu.spmm` operation performs the SpMM operation on the given sparse
+    matrix, dense matrices, and buffer. The operation expects handles returned
+    by previous sparse operations to construct the environment and the operands
+    for SpMM. The buffer must have been allocated on the device.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.spmm async [%dep] %env, %spmatA, %dnmatB, %dnmatC, %buffer : memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseHandle:$env,
+                       GPU_SparseHandle:$spmatA,
+                       GPU_SparseHandle:$dnmatB,
+                       GPU_SparseHandle:$dnmatC,
+                       AnyMemRef:$buffer);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $spmatA `,` $dnmatB `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
+  }];
+}
+
 #endif // GPU_OPS
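Taken together, the four new ops mirror cuSPARSE's two-phase generic API: query the required workspace size, allocate a buffer of at least that size, then compute. A minimal sketch of the intended protocol (a hypothetical snippet, not part of this patch; descriptor creation and teardown are elided, and the `matmul` test later in this patch simply reuses an existing buffer instead of allocating from `%bufferSz`):

```mlir
%bufferSz, %t1 = gpu.spmm_buffer_size async [%t0] %env, %spmatA, %dnmatB, %dnmatC
%buffer, %t2 = gpu.alloc async [%t1] (%bufferSz) : memref<?xi8>
%t3 = gpu.spmm async [%t2] %env, %spmatA, %dnmatB, %dnmatC, %buffer : memref<?xi8>
```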
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -209,6 +209,15 @@
       "mgpuDestroyDnVec", llvmVoidType,
       {llvmPointerType, llvmPointerType /* void *stream */}};
+  FunctionCallBuilder createDnMatCallBuilder = {
+      "mgpuCreateDnMat",
+      llvmPointerType,
+      {llvmIntPtrType, llvmIntPtrType, llvmPointerType, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
+  FunctionCallBuilder destroyDnMatCallBuilder = {
+      "mgpuDestroyDnMat",
+      llvmVoidType,
+      {llvmPointerType, llvmPointerType /* void *stream */}};
   FunctionCallBuilder createCooCallBuilder = {
       "mgpuCreateCoo",
       llvmPointerType,
@@ -235,6 +244,16 @@
       llvmVoidType,
       {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
        llvmPointerType, llvmPointerType /* void *stream */}};
+  FunctionCallBuilder spMMBufferSizeCallBuilder = {
+      "mgpuSpMMBufferSize",
+      llvmIntPtrType,
+      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
+       llvmPointerType /* void *stream */}};
+  FunctionCallBuilder spMMCallBuilder = {
+      "mgpuSpMM",
+      llvmVoidType,
+      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
+       llvmPointerType, llvmPointerType /* void *stream */}};
 };
 
 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -477,6 +496,30 @@
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertCreateDnMatOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp> {
+public:
+  ConvertCreateDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::CreateDnMatOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::CreateDnMatOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertDestroyDnMatOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp> {
+public:
+  ConvertDestroyDnMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::DestroyDnMatOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 class ConvertCreateCooOpToGpuRuntimeCallPattern
     : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooOp> {
 public:
@@ -539,6 +582,32 @@
                   ConversionPatternRewriter &rewriter) const override;
 };
 
+class ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMMBufferSizeOp> {
+public:
+  ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern(
+      LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::SpMMBufferSizeOp>(
+            typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::SpMMBufferSizeOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
+class ConvertSpMMOpToGpuRuntimeCallPattern
+    : public ConvertOpToGpuRuntimeCallPattern<gpu::SpMMOp> {
+public:
+  ConvertSpMMOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter)
+      : ConvertOpToGpuRuntimeCallPattern<gpu::SpMMOp>(typeConverter) {}
+
+private:
+  LogicalResult
+  matchAndRewrite(gpu::SpMMOp op, OpAdaptor adaptor,
+                  ConversionPatternRewriter &rewriter) const override;
+};
+
 } // namespace
 
 void GpuToLLVMConversionPass::runOnOperation() {
@@ -1180,6 +1249,43 @@
   return success();
 }
 
+LogicalResult ConvertCreateDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::CreateDnMatOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pMat =
+      MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers())
+    pMat = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pMat);
+  Type dType = op.getMemref().getType().cast<MemRefType>().getElementType();
+  auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+                                              dType.getIntOrFloatBitWidth());
+  auto handle =
+      createDnMatCallBuilder
+          .create(loc, rewriter,
+                  {adaptor.getRows(), adaptor.getCols(), pMat, dw, stream})
+          .getResult();
+  rewriter.replaceOp(op, {handle, stream});
+  return success();
+}
+
+LogicalResult ConvertDestroyDnMatOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::DestroyDnMatOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  destroyDnMatCallBuilder.create(loc, rewriter, {adaptor.getDmat(), stream});
+  rewriter.replaceOp(op, {stream});
+  return success();
+}
+
 LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::CreateCooOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1302,6 +1408,44 @@
   return success();
 }
 
+LogicalResult ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::SpMMBufferSizeOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  auto bufferSize =
+      spMMBufferSizeCallBuilder
+          .create(loc, rewriter,
+                  {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(),
+                   adaptor.getDnmatC(), stream})
+          .getResult();
+  rewriter.replaceOp(op, {bufferSize, stream});
+  return success();
+}
+
+LogicalResult ConvertSpMMOpToGpuRuntimeCallPattern::matchAndRewrite(
+    gpu::SpMMOp op, OpAdaptor adaptor,
+    ConversionPatternRewriter &rewriter) const {
+  if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) ||
+      failed(isAsyncWithOneDependency(rewriter, op)))
+    return failure();
+  Location loc = op.getLoc();
+  auto stream = adaptor.getAsyncDependencies().front();
+  Value pBuf =
+      MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
+  if (!getTypeConverter()->useOpaquePointers())
+    pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
+  spMMCallBuilder.create(loc, rewriter,
+                         {adaptor.getEnv(), adaptor.getSpmatA(),
+                          adaptor.getDnmatB(), adaptor.getDnmatC(), pBuf,
+                          stream});
+  rewriter.replaceOp(op, {stream});
+  return success();
+}
+
 void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter,
                                                RewritePatternSet &patterns,
                                                StringRef gpuBinaryAnnotation,
@@ -1329,11 +1473,15 @@
       ConvertDestroySparseEnvOpToGpuRuntimeCallPattern,
       ConvertCreateDnVecOpToGpuRuntimeCallPattern,
       ConvertDestroyDnVecOpToGpuRuntimeCallPattern,
+      ConvertCreateDnMatOpToGpuRuntimeCallPattern,
+      ConvertDestroyDnMatOpToGpuRuntimeCallPattern,
       ConvertCreateCooOpToGpuRuntimeCallPattern,
       ConvertCreateCsrOpToGpuRuntimeCallPattern,
       ConvertDestroySpMatOpToGpuRuntimeCallPattern,
       ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern,
-      ConvertSpMVOpToGpuRuntimeCallPattern>(converter);
+      ConvertSpMVOpToGpuRuntimeCallPattern,
+      ConvertSpMMBufferSizeOpToGpuRuntimeCallPattern,
+      ConvertSpMMOpToGpuRuntimeCallPattern>(converter);
   patterns.add<ConvertLaunchFuncOpToGpuRuntimeCallPattern>(
       converter, gpuBinaryAnnotation, kernelBarePtrCallConv);
   patterns.add<EraseGpuModuleOpPattern>(&converter.getContext());
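With these patterns registered, each new op lowers to a single runtime call. For example, `gpu.spmm_buffer_size` becomes roughly the following (a sketch assuming opaque pointers and a 64-bit `llvmIntPtrType`; value names are illustrative):

```mlir
%bufferSz = llvm.call @mgpuSpMMBufferSize(%env, %spmatA, %dnmatB, %dnmatC, %stream)
    : (!llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr, !llvm.ptr) -> i64
```

The trailing stream argument is the op's single async dependency, which `isAsyncWithOneDependency` enforces; the same stream value also replaces the op's `!gpu.async.token` result.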
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -338,7 +338,7 @@
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
-mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) {
+mgpuSpMV(void *h, void *a, void *x, void *y, void *buf, CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
@@ -347,5 +347,35 @@
   double beta = 1.0;
   CUSPARSE_REPORT_IF_ERROR(
       cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
-                   &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, b))
+                   &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, buf))
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMMBufferSize(void *h, void *a, void *b, void *c, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
+  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
+  double alpha = 1.0;
+  double beta = 1.0;
+  size_t bufferSize = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
+      handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+      CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB, &beta, matC,
+      CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
+  return bufferSize == 0 ? 1 : bufferSize; // avoid zero-size allocation
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpMM(void *h, void *a, void *b, void *c, void *buf, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
+  cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
+  double alpha = 1.0;
+  double beta = 1.0;
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
+                   CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, matB, &beta,
+                   matC, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, buf))
 }
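Note that the `mgpuCreateDnMat`/`mgpuDestroyDnMat` entry points referenced by the new call builders are not shown in the hunk above. A plausible sketch in the style of the surrounding wrappers (an assumption, not part of this patch; in particular the bit-width-to-type mapping and the row-major `CUSPARSE_ORDER_ROW` layout are guesses):

```cpp
extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dw,
                CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = nullptr;
  // Map the element bit width passed in by the lowering to a cuSPARSE type.
  cudaDataType_t dtp = (dw == 32) ? CUDA_R_32F : CUDA_R_64F;
  // For a row-major matrix the leading dimension is the number of columns.
  CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(
      &mat, rows, cols, /*ld=*/cols, values, dtp, CUSPARSE_ORDER_ROW))
  return reinterpret_cast<void *>(mat);
}

extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
mgpuDestroyDnMat(void *m, CUstream /*stream*/) {
  cusparseDnMatDescr_t mat = reinterpret_cast<cusparseDnMatDescr_t>(m);
  CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat))
}
```

The parameter list matches `createDnMatCallBuilder` above (`llvmIntPtrType, llvmIntPtrType, llvmPointerType, llvmInt32Type` plus the stream pointer), so the lowered `llvm.call` operations resolve against these symbols.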
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -10,7 +10,7 @@
   // CHECK: llvm.call @mgpuCreateCoo
   // CHECK: llvm.call @mgpuCreateDnVec
   // CHECK: llvm.call @mgpuSpMVBufferSize
-  // CHECK: llvm.call @mgpuSpM
+  // CHECK: llvm.call @mgpuSpMV
   // CHECK: llvm.call @mgpuDestroySpMat
   // CHECK: llvm.call @mgpuDestroyDnVec
   // CHECK: llvm.call @mgpuDestroySparseEnv
@@ -32,6 +32,36 @@
     return
   }
 
+  // CHECK-LABEL: func @matmul
+  // CHECK: llvm.call @mgpuStreamCreate
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuCreateSparseEnv
+  // CHECK: llvm.call @mgpuCreateCsr
+  // CHECK: llvm.call @mgpuCreateDnMat
+  // CHECK: llvm.call @mgpuSpMMBufferSize
+  // CHECK: llvm.call @mgpuSpMM
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuDestroyDnMat
+  // CHECK: llvm.call @mgpuDestroySparseEnv
+  // CHECK: llvm.call @mgpuStreamSynchronize
+  // CHECK: llvm.call @mgpuStreamDestroy
+  func.func @matmul(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    %env, %token3 = gpu.create_sparse_env async [%token2]
+    %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref<?xf64>
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat
+    %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
+    %token8 = gpu.destroy_sp_mat async [%token7] %spmat
+    %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
+    %token10 = gpu.destroy_sparse_env async [%token9] %env
+    gpu.wait [%token10]
+    return
+  }
+
 }
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -338,14 +338,22 @@
     %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec
     // CHECK: gpu.spmv async
     %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+    // CHECK: gpu.create_dn_mat async
+    %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref<?xf64>
+    // CHECK: gpu.spmm_buffer_size async
+    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat
+    // CHECK: gpu.spmm async
+    %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
+    // CHECK: gpu.destroy_dn_mat async
+    %token12 = gpu.destroy_dn_mat async [%token11] %dnmat
     // CHECK: gpu.destroy_sp_mat async
-    %token9 = gpu.destroy_sp_mat async [%token8] %spmat
+    %token13 = gpu.destroy_sp_mat async [%token12] %spmat
     // CHECK: gpu.destroy_dn_vec async
-    %token10 = gpu.destroy_dn_vec async [%token9] %dnvec
+    %token14 = gpu.destroy_dn_vec async [%token13] %dnvec
     // CHECK: gpu.destroy_sparse_env async
-    %token11 = gpu.destroy_sparse_env async [%token10] %env
+    %token15 = gpu.destroy_sparse_env async [%token14] %env
     // CHECK: gpu.wait
-    gpu.wait [%token11]
+    gpu.wait [%token15]
     return
   }
 }