diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUBase.td @@ -109,6 +109,11 @@ "$_self.cast<::mlir::gpu::MMAMatrixType>().getElementType()", "gpu.mma_matrix", "::mlir::gpu::MMAMatrixType">; +// Generic type for all sparse handles (could be refined). +def GPU_SparseHandle : DialectType< + GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::SparseHandleType>()">, "sparse handle type">, + BuildableType<"mlir::gpu::SparseHandleType::get($_builder.getContext())">; + //===----------------------------------------------------------------------===// // GPU Interfaces. //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h --- a/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUDialect.h @@ -163,6 +163,14 @@ // Adds a `gpu.async.token` to the front of the argument list. void addAsyncDependency(Operation *op, Value token); +// Represents any sparse handle. +class SparseHandleType + : public Type::TypeBase { +public: + // Used for generic hooks in TypeBase. + using Base::Base; +}; + } // namespace gpu } // namespace mlir diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1533,4 +1533,320 @@ }]; } +// +// Operation on sparse matrices, called from the host +// (currently lowers to cuSparse for CUDA only, no ROCM lowering). +// + +def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> { + let summary = "Create sparse environment operation"; + let description = [{ + The `gpu.create_sparse_env` operation initializes a sparse environment. + It must be executed prior to any other sparse operation. The operation + returns a handle to the new sparse environment. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. + + Example: + + ```mlir + %env, %token = gpu.create_sparse_env async [%dep] + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies); + let results = (outs Res:$env, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) attr-dict + }]; +} + +def GPU_DestroySparseEnvOp : GPU_Op<"destroy_sparse_env", [GPU_AsyncOpInterface]> { + let summary = "Destroy sparse environment operation"; + let description = [{ + The `gpu.destroy_sparse_env` operation releases all resources of a sparse + environment represented by a handle that was previously created by a + `gpu.create_sparse_env` operation. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. 
+
+    Example:
+
+    ```mlir
+    %token = gpu.destroy_sparse_env async [%dep] %env
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseHandle>:$env);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $env attr-dict
+  }];
+}
+
+def GPU_CreateDnVecOp : GPU_Op<"create_dn_vec", [GPU_AsyncOpInterface]> {
+  let summary = "Create dense vector operation";
+  let description = [{
+    The `gpu.create_dn_vec` operation initializes a dense vector from
+    the given values buffer and size. The buffer must already be copied
+    from the host to the device prior to using this operation. The
+    operation returns a handle to the dense vector descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the vector
+    descriptor.
+
+    Example:
+
+    ```mlir
+    %dvec, %token = gpu.create_dn_vec async [%dep] %mem, %size : memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       AnyMemRef:$memref, Index:$size);
+  let results = (outs Res<GPU_SparseHandle>:$dvec, Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $memref `,` $size attr-dict `:` type($memref)
+  }];
+}
+
+def GPU_DestroyDnVecOp : GPU_Op<"destroy_dn_vec", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy dense vector operation";
+  let description = [{
+    The `gpu.destroy_dn_vec` operation releases all resources of a dense
+    vector represented by a handle that was previously created by a
+    `gpu.create_dn_vec` operation.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.destroy_dn_vec async [%dep] %dvec
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseHandle>:$dvec);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dvec attr-dict
+  }];
+}
+
+def GPU_CreateCooOp : GPU_Op<"create_coo", [GPU_AsyncOpInterface]> {
+  let summary = "Create sparse matrix in COO format operation";
+  let description = [{
+    The `gpu.create_coo` operation initializes a sparse matrix in COO format
+    with the given sizes from the given index and values buffers. The buffers
+    must already be copied from the host to the device prior to using this
+    operation. The operation returns a handle to the sparse matrix descriptor.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the matrix
+    descriptor.
+ + Example: + + ```mlir + %spmat, %token = gpu.create_coo async [%dep] %rows, %cols, %nnz, %rowIdx, + %colIdx, %values : memref, memref, memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Index:$rows, + Index:$cols, + Index:$nnz, + AnyMemRef:$rowIdxs, + AnyMemRef:$colIdxs, + AnyMemRef:$values); + let results = (outs Res:$spmat, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $rows `,` $cols `,` $nnz `,` $rowIdxs `,` $colIdxs `,` $values attr-dict + `:` type($rowIdxs) `,` type($colIdxs) `,` type($values) + }]; +} + +def GPU_CreateCooAoSOp : GPU_Op<"create_coo_aos", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix in COO format operation (AoS)"; + let description = [{ + The `gpu.create_coo_aos` operation initializes a sparse matrix in COO format + with the given sizes from the given index and values buffers. The buffers + must already be copied from the host to the device prior to using this + operation. The operation returns a handle to the sparse matrix descriptor. + Unlike the default `gpu.create_coo` operation, this operation builds the + COO format from a single index buffer in AoS format. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. + + Example: + + ```mlir + %spmat, %token = gpu.create_coo_aos async [%dep] %rows, %cols, %nnz, %idxs, + %values : memref, memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + Index:$rows, + Index:$cols, + Index:$nnz, + AnyMemRef:$idxs, + AnyMemRef:$values); + let results = (outs Res:$spmat, Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $rows `,` $cols `,` $nnz `,` $idxs `,` $values attr-dict + `:` type($idxs) `,` type($values) + }]; +} + +def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix in CSR format operation"; + let description = [{ + The `gpu.create_csr` operation initializes a sparse matrix in CSR format + with the given sizes from the given position, index, and values buffers. + The buffers must already be copied from the host to the device prior to + using this operation. The operation returns a handle to the sparse + matrix descriptor. + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. 
+
+    Example:
+
+    ```mlir
+    %spmat, %token = gpu.create_csr async [%dep] %rows, %cols, %nnz, %rowPos,
+        %colIdx, %values : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Index:$rows,
+                       Index:$cols,
+                       Index:$nnz,
+                       AnyMemRef:$rowPos,
+                       AnyMemRef:$colIdxs,
+                       AnyMemRef:$values);
+  let results = (outs Res<GPU_SparseHandle>:$spmat, Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $rows `,` $cols `,` $nnz `,` $rowPos `,` $colIdxs `,` $values attr-dict
+    `:` type($rowPos) `,` type($colIdxs) `,` type($values)
+  }];
+}
+
+def GPU_DestroySpMatOp : GPU_Op<"destroy_sp_mat", [GPU_AsyncOpInterface]> {
+  let summary = "Destroy sparse matrix operation";
+  let description = [{
+    The `gpu.destroy_sp_mat` operation releases all resources of a sparse
+    matrix represented by a handle that was previously created by one of
+    the sparse matrix creation operations.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+
+    Example:
+
+    ```mlir
+    %token = gpu.destroy_sp_mat async [%dep] %spmat
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       Arg<GPU_SparseHandle>:$spmat);
+  let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $spmat attr-dict
+  }];
+}
+
+def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
+  let summary = "Precompute buffer size for SpMV operation";
+  let description = [{
+    The `gpu.spmv_buffer_size` operation returns the buffer size required
+    to perform the SpMV operation on the given sparse matrix and dense vectors.
+    The operation expects handles returned by previous sparse operations
+    to construct an environment and the operands for SpMV.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token in addition to the buffer size.
+
+    Example:
+
+    ```mlir
+    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA, %dnX, %dnY
+    ```
+  }];
+
+  let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
+                       GPU_SparseHandle:$env,
+                       GPU_SparseHandle:$spmatA,
+                       GPU_SparseHandle:$dnX,
+                       GPU_SparseHandle:$dnY);
+  let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken);
+
+  let assemblyFormat = [{
+    custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
+    $env `,` $spmatA `,` $dnX `,` $dnY attr-dict
+  }];
+}
+
+def GPU_SpMVOp : GPU_Op<"spmv", [GPU_AsyncOpInterface]> {
+  let summary = "SpMV operation";
+  let description = [{
+    The `gpu.spmv` operation performs the SpMV operation on the given sparse
+    matrix, dense vectors, and buffer. The operation expects handles returned
+    by previous sparse operations to construct an environment and the operands
+    for SpMV. The buffer must have been allocated on the device.
+
+    If the `async` keyword is present, the op is executed asynchronously (i.e.
+    it does not block until the execution has finished on the device). In
+    that case, it returns a !gpu.async.token.
+ + Example: + + ```mlir + %token = gpu.spmv async [%dep] %env, %spmatA, %dnX, %dnY : memref + ``` + }]; + + let arguments = (ins Variadic:$asyncDependencies, + GPU_SparseHandle:$env, + GPU_SparseHandle:$spmatA, + GPU_SparseHandle:$dnX, + GPU_SparseHandle:$dnY, + AnyMemRef:$buffer); + let results = (outs Optional:$asyncToken); + + let assemblyFormat = [{ + custom(type($asyncToken), $asyncDependencies) + $env `,` $spmatA `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) + }]; +} + #endif // GPU_OPS diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -192,6 +192,55 @@ "mgpuSetDefaultDevice", llvmVoidType, {llvmInt32Type /* uint32_t devIndex */}}; + FunctionCallBuilder createSparseEnvCallBuilder = { + "mgpuCreateSparseEnv", + llvmPointerType, + {llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroySparseEnvCallBuilder = { + "mgpuDestroySparseEnv", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createDnVecCallBuilder = { + "mgpuCreateDnVec", + llvmPointerType, + {llvmIntPtrType, llvmPointerType, llvmInt32Type, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroyDnVecCallBuilder = { + "mgpuDestroyDnVec", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createCooCallBuilder = { + "mgpuCreateCoo", + llvmPointerType, + {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder createCooAoSCallBuilder = { + "mgpuCreateCooAoS", + llvmPointerType, + {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder createCsrCallBuilder = { + "mgpuCreateCsr", + llvmPointerType, + {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder destroySpMatCallBuilder = { + "mgpuDestroySpMat", + llvmVoidType, + {llvmPointerType, llvmPointerType /* void *stream */}}; + FunctionCallBuilder spMVBufferSizeCallBuilder = { + "mgpuSpMVBufferSize", + llvmIntPtrType, + {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType /* void *stream */}}; + FunctionCallBuilder spMVCallBuilder = { + "mgpuSpMV", + llvmVoidType, + {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, + llvmPointerType, llvmPointerType /* void *stream */}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -381,6 +430,133 @@ matchAndRewrite(gpu::SetDefaultDeviceOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override; }; + +class ConvertCreateSparseEnvOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateSparseEnvOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateSparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertDestroySparseEnvOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + 
ConvertDestroySparseEnvOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern( + typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::DestroySparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateDnVecOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertDestroyDnVecOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertDestroyDnVecOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::DestroyDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateCooOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateCooOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateCooOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateCooAoSOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateCooAoSOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateCooAoSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertCreateCsrOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertCreateCsrOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateCsrOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertDestroySpMatOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertDestroySpMatOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::DestroySpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern( + LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) { + } + +private: + LogicalResult + matchAndRewrite(gpu::SpMVBufferSizeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + +class ConvertSpMVOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern { +public: + ConvertSpMVOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::SpMVOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + } // namespace void GpuToLLVMConversionPass::runOnOperation() { @@ -959,6 +1135,221 @@ return success(); } +LogicalResult 
ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateSparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + auto handle = + createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertDestroySparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroySparseEnvOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + destroySparseEnvCallBuilder.create(loc, rewriter, {adaptor.getEnv(), stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + +LogicalResult ConvertCreateDnVecOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pVec = + MemRefDescriptor(adaptor.getMemref()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pVec = rewriter.create(loc, llvmPointerType, pVec); + Type dType = op.getMemref().getType().cast().getElementType(); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = + createDnVecCallBuilder + .create(loc, rewriter, {adaptor.getSize(), pVec, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertDestroyDnVecOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroyDnVecOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + destroyDnVecCallBuilder.create(loc, rewriter, {adaptor.getDvec(), stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + +LogicalResult ConvertCreateCooOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateCooOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pRowIdxs = + MemRefDescriptor(adaptor.getRowIdxs()).allocatedPtr(rewriter, loc); + Value pColIdxs = + MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc); + Value pValues = + MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) { + pRowIdxs = rewriter.create(loc, llvmPointerType, pRowIdxs); + pColIdxs = rewriter.create(loc, llvmPointerType, pColIdxs); + pValues = rewriter.create(loc, llvmPointerType, pValues); + } + Type iType = op.getColIdxs().getType().cast().getElementType(); + Type dType = 
op.getValues().getType().cast().getElementType(); + auto iw = rewriter.create( + loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth()); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = + createCooCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(), + pRowIdxs, pColIdxs, pValues, iw, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertCreateCooAoSOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateCooAoSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pIdxs = MemRefDescriptor(adaptor.getIdxs()).allocatedPtr(rewriter, loc); + Value pValues = + MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) { + pIdxs = rewriter.create(loc, llvmPointerType, pIdxs); + pValues = rewriter.create(loc, llvmPointerType, pValues); + } + Type iType = op.getIdxs().getType().cast().getElementType(); + Type dType = op.getValues().getType().cast().getElementType(); + auto iw = rewriter.create( + loc, llvmInt32Type, iType.isIndex() ? 64 : iType.getIntOrFloatBitWidth()); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = createCooAoSCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), + adaptor.getNnz(), pIdxs, pValues, iw, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateCsrOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pRowPos = + MemRefDescriptor(adaptor.getRowPos()).allocatedPtr(rewriter, loc); + Value pColIdxs = + MemRefDescriptor(adaptor.getColIdxs()).allocatedPtr(rewriter, loc); + Value pValues = + MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) { + pRowPos = rewriter.create(loc, llvmPointerType, pRowPos); + pColIdxs = rewriter.create(loc, llvmPointerType, pColIdxs); + pValues = rewriter.create(loc, llvmPointerType, pValues); + } + Type pType = op.getRowPos().getType().cast().getElementType(); + Type iType = op.getColIdxs().getType().cast().getElementType(); + Type dType = op.getValues().getType().cast().getElementType(); + auto pw = rewriter.create( + loc, llvmInt32Type, pType.isIndex() ? 64 : pType.getIntOrFloatBitWidth()); + auto iw = rewriter.create( + loc, llvmInt32Type, iType.isIndex() ? 
64 : iType.getIntOrFloatBitWidth()); + auto dw = rewriter.create(loc, llvmInt32Type, + dType.getIntOrFloatBitWidth()); + auto handle = + createCsrCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(), + pRowPos, pColIdxs, pValues, pw, iw, dw, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + +LogicalResult ConvertDestroySpMatOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::DestroySpMatOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + destroySpMatCallBuilder.create(loc, rewriter, {adaptor.getSpmat(), stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + +LogicalResult ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::SpMVBufferSizeOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + auto bufferSize = spMVBufferSizeCallBuilder + .create(loc, rewriter, + {adaptor.getEnv(), adaptor.getSpmatA(), + adaptor.getDnX(), adaptor.getDnY(), stream}) + .getResult(); + rewriter.replaceOp(op, {bufferSize, stream}); + return success(); +} + +LogicalResult ConvertSpMVOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::SpMVOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pBuf = + MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) + pBuf = rewriter.create(loc, llvmPointerType, pBuf); + spMVCallBuilder.create(loc, rewriter, + {adaptor.getEnv(), adaptor.getSpmatA(), + adaptor.getDnX(), adaptor.getDnY(), pBuf, stream}); + rewriter.replaceOp(op, {stream}); + return success(); +} + void mlir::populateGpuToLLVMConversionPatterns(LLVMTypeConverter &converter, RewritePatternSet &patterns, StringRef gpuBinaryAnnotation, @@ -967,6 +1358,11 @@ return converter.getPointerType( IntegerType::get(&converter.getContext(), 8)); }); + converter.addConversion([&converter](gpu::SparseHandleType type) -> Type { + return converter.getPointerType( + IntegerType::get(&converter.getContext(), 8)); + }); + patterns.add(converter); + ConvertAsyncYieldToGpuRuntimeCallPattern, + ConvertCreateSparseEnvOpToGpuRuntimeCallPattern, + ConvertDestroySparseEnvOpToGpuRuntimeCallPattern, + ConvertCreateDnVecOpToGpuRuntimeCallPattern, + ConvertDestroyDnVecOpToGpuRuntimeCallPattern, + ConvertCreateCooOpToGpuRuntimeCallPattern, + ConvertCreateCooAoSOpToGpuRuntimeCallPattern, + ConvertCreateCsrOpToGpuRuntimeCallPattern, + ConvertDestroySpMatOpToGpuRuntimeCallPattern, + ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern, + ConvertSpMVOpToGpuRuntimeCallPattern>(converter); patterns.add( converter, gpuBinaryAnnotation, kernelBarePtrCallConv); patterns.add(&converter.getContext()); diff --git a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp --- a/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp +++ 
b/mlir/lib/Dialect/GPU/IR/GPUDialect.cpp @@ -146,6 +146,7 @@ void GPUDialect::initialize() { addTypes(); addTypes(); + addTypes(); addOperations< #define GET_OP_LIST #include "mlir/Dialect/GPU/IR/GPUOps.cpp.inc" @@ -200,6 +201,9 @@ shape, elementType, operand); } + if (keyword == "sparse.handle") + return SparseHandleType::get(context); + parser.emitError(parser.getNameLoc(), "unknown gpu type: " + keyword); return Type(); } @@ -207,6 +211,7 @@ void GPUDialect::printType(Type type, DialectAsmPrinter &os) const { TypeSwitch(type) .Case([&](Type) { os << "async.token"; }) + .Case([&](Type) { os << "sparse.handle"; }) .Case([&](MMAMatrixType fragTy) { os << "mma_matrix<"; auto shape = fragTy.getShape(); diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -190,6 +190,9 @@ # We need the libcuda.so library. find_library(CUDA_RUNTIME_LIBRARY cuda) + # We need the libcusparse.so library. + find_library(CUDA_CUSPARSE_LIBRARY cusparse HINTS ${CMAKE_CUDA_IMPLICIT_LINK_DIRECTORIES}) + add_mlir_library(mlir_cuda_runtime SHARED CudaRuntimeWrappers.cpp @@ -204,6 +207,7 @@ target_link_libraries(mlir_cuda_runtime PRIVATE ${CUDA_RUNTIME_LIBRARY} + ${CUDA_CUSPARSE_LIBRARY} ) endif() diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -17,6 +17,7 @@ #include #include "cuda.h" +#include "cusparse.h" #ifdef _WIN32 #define MLIR_CUDA_WRAPPERS_EXPORT __declspec(dllexport) @@ -35,6 +36,15 @@ fprintf(stderr, "'%s' failed with '%s'\n", #expr, name); \ }(expr) +#define CUSPARSE_REPORT_IF_ERROR(expr) \ + { \ + cusparseStatus_t status = (expr); \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, "cuSPARSE '%s' failed with '%s'\n", #expr, \ + cusparseGetErrorString(status)); \ + } \ + } + thread_local static int32_t defaultDevice = 0; // Make the primary context of the current default device current for the @@ -211,3 +221,138 @@ extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSetDefaultDevice(int32_t device) { defaultDevice = device; } + +/// Wrapper methods for the cuSparse library. 
+ +static inline cudaDataType_t dataTp(int32_t width) { + switch (width) { + case 32: + return CUDA_R_32F; + default: + return CUDA_R_64F; + } +} + +static inline cusparseIndexType_t idxTp(int32_t width) { + switch (width) { + case 32: + return CUSPARSE_INDEX_32I; + default: + return CUSPARSE_INDEX_64I; + } +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateSparseEnv(CUstream /*stream*/) { + cusparseHandle_t handle = nullptr; + CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle)) + return reinterpret_cast(handle); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroySparseEnv(void *h, CUstream /*stream*/) { + cusparseHandle_t handle = reinterpret_cast(h); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateDnVec(intptr_t size, void *values, int32_t dw, CUstream /*stream*/) { + cusparseDnVecDescr_t vec = nullptr; + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnVec(&vec, size, values, dtp)) + return reinterpret_cast(vec); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyDnVec(void *v, CUstream /*stream*/) { + cusparseDnVecDescr_t vec = reinterpret_cast(v); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnVec(vec)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateDnMat(intptr_t rows, intptr_t cols, void *values, int32_t dw, + CUstream /*stream*/) { + cusparseDnMatDescr_t mat = nullptr; + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateDnMat(&mat, rows, cols, /*ld=*/cols, + values, dtp, CUSPARSE_ORDER_ROW)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroyDnMat(void *m, CUstream /*stream*/) { + cusparseDnMatDescr_t mat = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroyDnMat(mat)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateCoo(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowIdxs, + void *colIdxs, void *values, int32_t iw, int32_t dw, + CUstream /*stream*/) { + cusparseSpMatDescr_t mat = nullptr; + cusparseIndexType_t itp = idxTp(iw); + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateCoo(&mat, rows, cols, nnz, rowIdxs, + colIdxs, values, itp, + CUSPARSE_INDEX_BASE_ZERO, dtp)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs, + void *values, int32_t iw, int32_t dw, CUstream /*stream*/) { + cusparseSpMatDescr_t mat = nullptr; + cusparseIndexType_t itp = idxTp(iw); + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS( + &mat, rows, cols, nnz, idxs, values, itp, CUSPARSE_INDEX_BASE_ZERO, dtp)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos, + void *colIdxs, void *values, int32_t pw, int32_t iw, int32_t dw, + CUstream /*stream*/) { + cusparseSpMatDescr_t mat = nullptr; + cusparseIndexType_t ptp = idxTp(pw); + cusparseIndexType_t itp = idxTp(iw); + cudaDataType_t dtp = dataTp(dw); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateCsr(&mat, rows, cols, nnz, rowPos, + colIdxs, values, ptp, itp, + CUSPARSE_INDEX_BASE_ZERO, dtp)) + return reinterpret_cast(mat); +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void +mgpuDestroySpMat(void *m, CUstream /*stream*/) { + cusparseSpMatDescr_t mat = reinterpret_cast(m); + CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat)) +} + +extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t 
+mgpuSpMVBufferSize(void *h, void *a, void *x, void *y, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
+  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
+  double alpha = 1.0;
+  double beta = 1.0;
+  size_t bufferSize = 0;
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
+      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX, &beta, vecY,
+      CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
+  return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
+}
+
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpMV(void *h, void *a, void *x, void *y, void *b, CUstream /*stream*/) {
+  cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
+  cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
+  cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
+  double alpha = 1.0;
+  double beta = 1.0;
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, &alpha, matA, vecX,
+                   &beta, vecY, CUDA_R_64F, CUSPARSE_SPMV_ALG_DEFAULT, b))
+}
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -0,0 +1,37 @@
+// RUN: mlir-opt %s --gpu-to-llvm='use-opaque-pointers=1' | FileCheck %s
+
+module attributes {gpu.container_module} {
+
+  // CHECK-LABEL: func @matvec
+  // CHECK: llvm.call @mgpuStreamCreate
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuMemAlloc
+  // CHECK: llvm.call @mgpuCreateSparseEnv
+  // CHECK: llvm.call @mgpuCreateCoo
+  // CHECK: llvm.call @mgpuCreateDnVec
+  // CHECK: llvm.call @mgpuSpMVBufferSize
+  // CHECK: llvm.call @mgpuSpMV
+  // CHECK: llvm.call @mgpuDestroySpMat
+  // CHECK: llvm.call @mgpuDestroyDnVec
+  // CHECK: llvm.call @mgpuDestroySparseEnv
+  // CHECK: llvm.call @mgpuStreamSynchronize
+  // CHECK: llvm.call @mgpuStreamDestroy
+  func.func @matvec(%arg0: index) {
+    %token0 = gpu.wait async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    %env, %token3 = gpu.create_sparse_env async [%token2]
+    %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref<?xf64>
+    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec
+    %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+    %token8 = gpu.destroy_sp_mat async [%token7] %spmat
+    %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
+    %token10 = gpu.destroy_sparse_env async [%token9] %env
+    gpu.wait [%token10]
+    return
+  }
+
+}
+
+
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -317,6 +317,39 @@
     gpu.set_default_device %arg0
     return
   }
+
+  // CHECK-LABEL: func @sparse_ops
+  func.func @sparse_ops(%arg0: index) {
+    // CHECK: gpu.wait async
+    %token0 = gpu.wait async
+    // CHECK: gpu.alloc async
+    %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref<?xindex>
+    // CHECK: gpu.alloc async
+    %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref<?xf64>
+    // CHECK: gpu.create_sparse_env async
+    %env, %token3 = gpu.create_sparse_env async [%token2]
+    // CHECK: gpu.create_coo async
+    %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    // CHECK: gpu.create_coo_aos async
+    %spmat1, %token5 = gpu.create_coo_aos async [%token4] %arg0, %arg0, %arg0, %mem1, %mem2 : memref<?xindex>, memref<?xf64>
+    // CHECK: gpu.create_csr async
+    %spmat2, %token6 = gpu.create_csr async [%token5] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
+    // CHECK: gpu.create_dn_vec async
+    %dnvec, %token7 = gpu.create_dn_vec async [%token6] %mem2, %arg0 : memref<?xf64>
+    // CHECK: gpu.spmv_buffer_size async
+    %bufferSz, %token8 = gpu.spmv_buffer_size async [%token7] %env, %spmat, %dnvec, %dnvec
+    // CHECK: gpu.spmv async
+    %token9 = gpu.spmv async [%token8] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+    // CHECK: gpu.destroy_sp_mat async
+    %token10 = gpu.destroy_sp_mat async [%token9] %spmat
+    // CHECK: gpu.destroy_dn_vec async
+    %token11 = gpu.destroy_dn_vec async [%token10] %dnvec
+    // CHECK: gpu.destroy_sparse_env async
+    %token12 = gpu.destroy_sparse_env async [%token11] %env
+    // CHECK: gpu.wait
+    gpu.wait [%token12]
+    return
+  }
 }

 // Just check that this doesn't crash.
diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
--- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -7757,6 +7757,7 @@
         ":LLVMSupportHeaders",
         ":mlir_c_runner_utils",
        "@cuda//:cuda_headers",
+        "@cuda//:cusparse_static",
        "@cuda//:libcuda",
    ],
)
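For context only (not part of the patch): after `--gpu-to-llvm`, the `@matvec` test above is reduced to a host-side call sequence against the `mgpu*` wrappers added in CudaRuntimeWrappers.cpp. The sketch below illustrates that sequence; the helper name `runMatvec`, the device buffers `dRowIdx`/`dColIdx`/`dValues`/`dY`/`dBuffer`, and the stream handling are assumptions, while the wrapper signatures are taken from the diff.

```c++
#include <cstdint>
#include "cuda.h"

// Wrapper signatures as added in CudaRuntimeWrappers.cpp above.
extern "C" void *mgpuCreateSparseEnv(CUstream);
extern "C" void mgpuDestroySparseEnv(void *, CUstream);
extern "C" void *mgpuCreateDnVec(intptr_t, void *, int32_t, CUstream);
extern "C" void mgpuDestroyDnVec(void *, CUstream);
extern "C" void *mgpuCreateCoo(intptr_t, intptr_t, intptr_t, void *, void *,
                               void *, int32_t, int32_t, CUstream);
extern "C" void mgpuDestroySpMat(void *, CUstream);
extern "C" intptr_t mgpuSpMVBufferSize(void *, void *, void *, void *, CUstream);
extern "C" void mgpuSpMV(void *, void *, void *, void *, void *, CUstream);

// Hypothetical driver: the device buffers are assumed to have been allocated
// (gpu.alloc) and populated already; error handling is elided.
void runMatvec(CUstream stream, intptr_t n, intptr_t nnz, void *dRowIdx,
               void *dColIdx, void *dValues, void *dY, void *dBuffer) {
  void *env = mgpuCreateSparseEnv(stream);
  // iw/dw are the index/data bit widths the conversion patterns derive from
  // the memref element types (index -> 64-bit, f64 -> 64-bit).
  void *spmat = mgpuCreateCoo(n, n, nnz, dRowIdx, dColIdx, dValues,
                              /*iw=*/64, /*dw=*/64, stream);
  void *dnvec = mgpuCreateDnVec(n, dY, /*dw=*/64, stream);
  intptr_t bufferSz = mgpuSpMVBufferSize(env, spmat, dnvec, dnvec, stream);
  (void)bufferSz; // the generated IR would gpu.alloc dBuffer with this size
  mgpuSpMV(env, spmat, dnvec, dnvec, dBuffer, stream);
  mgpuDestroySpMat(spmat, stream);
  mgpuDestroyDnVec(dnvec, stream);
  mgpuDestroySparseEnv(env, stream);
}
```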