diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1560,7 +1560,7 @@ }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); - let results = (outs Res<GPU_SparseEnvHandle>:$env, + let results = (outs Res<GPU_SparseEnvHandle>:$env, Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict @@ -1568,7 +1568,7 @@ } def GPU_DestroySparseEnvOp : GPU_Op< - "destroy_sparse_env", + "destroy_sparse_env", [GPU_AsyncOpInterface]> { let summary = "Destroy sparse environment operation"; let description = [{ @@ -1592,7 +1592,7 @@ let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ - custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $env attr-dict }]; } @@ -1618,7 +1618,7 @@ let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, AnyMemRef:$memref, Index:$size); - let results = (outs Res<GPU_SparseDnVecHandle>:$dvec, + let results = (outs Res<GPU_SparseDnVecHandle>:$dvec, Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ @@ -1650,7 +1650,7 @@ let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ - custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dvec attr-dict }]; } @@ -1709,7 +1709,7 @@ let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ - custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $dmat attr-dict }]; } @@ -1721,6 +1721,7 @@ with the given sizes from the given index and values buffers. The buffers must already be copied from the host to the device prior to using this operation. The operation returns a handle to the sparse matrix descriptor. + Note that this operation builds the COO in SoA format. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device). 
In @@ -1741,7 +1742,7 @@ AnyMemRef:$rowIdxs, AnyMemRef:$colIdxs, AnyMemRef:$values); - let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, + let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ @@ -1751,6 +1752,45 @@ }]; } +def GPU_CreateCooAoSOp : GPU_Op<"create_coo_aos", [GPU_AsyncOpInterface]> { + let summary = "Create sparse matrix in COO format operation (AoS)"; + let description = [{ + The `gpu.create_coo_aos` operation initializes a sparse matrix in COO format + with the given sizes from the given index and values buffers. The buffers + must already be copied from the host to the device prior to using this + operation. The operation returns a handle to the sparse matrix descriptor. + Unlike the default `gpu.create_coo` operation, this operation builds the + COO format from a single index buffer in AoS format (note that this + feature has been deprecated in cuSparse 11.2). + + If the `async` keyword is present, the op is executed asynchronously (i.e. + it does not block until the execution has finished on the device). In + that case, it returns a !gpu.async.token in addition to the environment. 
+ + Example: + + ```mlir + %spmat, %token = gpu.create_coo_aos async [%dep] %rows, %cols, %nnz, %idxs, + %values : memref<?xindex>, memref<?xf64> + ``` + }]; + + let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, + Index:$rows, + Index:$cols, + Index:$nnz, + AnyMemRef:$idxs, + AnyMemRef:$values); + let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, + Optional<GPU_AsyncToken>:$asyncToken); + + let assemblyFormat = [{ + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) + $rows `,` $cols `,` $nnz `,` $idxs `,` $values attr-dict + `:` type($idxs) `,` type($values) + }]; +} + def GPU_CreateCsrOp : GPU_Op<"create_csr", [GPU_AsyncOpInterface]> { let summary = "Create sparse matrix in CSR format operation"; let description = [{ @@ -1779,7 +1819,7 @@ AnyMemRef:$rowPos, AnyMemRef:$colIdxs, AnyMemRef:$values); - let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, + let results = (outs Res<GPU_SparseSpMatHandle>:$spmat, Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ @@ -1816,8 +1856,8 @@ }]; } -// To avoid coupling this dialect with cusparse.h specifics, we hardcoded magic -// literals in this enum. Note that this should be kept in sync with +// To avoid coupling this dialect with cusparse.h specifics, we hardcoded magic +// literals in this enum. Note that this should be kept in sync with // cusparseOperation_t in cusparse.h: // typedef enum { // CUSPARSE_OPERATION_NON_TRANSPOSE = 0, @@ -1828,8 +1868,8 @@ def GPU_TransposeMode : I32EnumAttr<"TransposeMode", "transpose mode of sparse matrix supported by sparse tensor ops", [ - I32EnumAttrCase<"NON_TRANSPOSE", 0>, - I32EnumAttrCase<"TRANSPOSE", 1>, + I32EnumAttrCase<"NON_TRANSPOSE", 0>, + I32EnumAttrCase<"TRANSPOSE", 1>, I32EnumAttrCase<"CONJUGATE_TRANSPOSE", 2>, ]> { let genSpecializedAttr = 0; @@ -1853,7 +1893,7 @@ it does not block until the execution has finished on the device). In that case, it returns a !gpu.async.token in addition to the environment. 
- The matrix arguments can also be associated with one of the following + The matrix arguments can also be associated with one of the following operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value is NON_TRANSPOSE. @@ -1870,7 +1910,7 @@ GPU_SparseDnVecHandle:$dnX, GPU_SparseDnVecHandle:$dnY, TypeAttr:$computeType); - let results = (outs Res<Index>:$bufferSz, + let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken); let builders = [OpBuilder<(ins @@ -1884,7 +1924,7 @@ "Type":$computeType) , [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, + return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, env, modeA, spmatA, dnX, dnY, computeType);}]> ]; @@ -1906,7 +1946,7 @@ it does not block until the execution has finished on the device). In that case, it returns a !gpu.async.token in addition to the environment. - The matrix arguments can also be associated with one of the following + The matrix arguments can also be associated with one of the following operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value is NON_TRANSPOSE. @@ -1958,7 +1998,7 @@ it does not block until the execution has finished on the device). In that case, it returns a !gpu.async.token in addition to the environment. - The matrix arguments can also be associated with one of the following + The matrix arguments can also be associated with one of the following operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value is NON_TRANSPOSE. 
@@ -1977,7 +2017,7 @@ GPU_SparseDnMatHandle:$dnmatB, GPU_SparseDnMatHandle:$dnmatC, TypeAttr:$computeType); - let results = (outs Res<Index>:$bufferSz, + let results = (outs Res<Index>:$bufferSz, Optional<GPU_AsyncToken>:$asyncToken); let builders = [OpBuilder<(ins @@ -1991,7 +2031,7 @@ "Type":$computeType), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, + return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; @@ -2013,7 +2053,7 @@ it does not block until the execution has finished on the device). In that case, it returns a !gpu.async.token in addition to the environment. - The matrix arguments can also be associated with one of the following + The matrix arguments can also be associated with one of the following operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value is NON_TRANSPOSE. @@ -2046,7 +2086,7 @@ "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType, buffer);}]> ]; @@ -2074,7 +2114,7 @@ %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 ``` - The matrix arguments can also be associated with one of the following + The matrix arguments can also be associated with one of the following operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value is NON_TRANSPOSE. 
}]; @@ -2100,7 +2140,7 @@ "Type":$computeType), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, + return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> ]; @@ -2128,7 +2168,7 @@ %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 ``` - The matrix arguments can also be associated with one of the following + The matrix arguments can also be associated with one of the following operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value is NON_TRANSPOSE. }]; @@ -2155,7 +2195,7 @@ "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]> ]; diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -224,6 +224,12 @@ {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType /* void *stream */}}; + FunctionCallBuilder createCooAoSCallBuilder = { + "mgpuCreateCooAoS", // deprecated in cuSPARSE 11.2 + llvmPointerType, + {llvmIntPtrType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmInt32Type, + llvmPointerType /* void *stream */}}; FunctionCallBuilder createCsrCallBuilder = { "mgpuCreateCsr", llvmPointerType, @@ -547,6 +553,18 @@ ConversionPatternRewriter &rewriter) const override; }; +class 
ConvertCreateCooAoSOpToGpuRuntimeCallPattern + : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooAoSOp> { +public: + ConvertCreateCooAoSOpToGpuRuntimeCallPattern(LLVMTypeConverter &typeConverter) + : ConvertOpToGpuRuntimeCallPattern<gpu::CreateCooAoSOp>(typeConverter) {} + +private: + LogicalResult + matchAndRewrite(gpu::CreateCooAoSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override; +}; + class ConvertCreateCsrOpToGpuRuntimeCallPattern : public ConvertOpToGpuRuntimeCallPattern<gpu::CreateCsrOp> { public: @@ -1421,6 +1439,37 @@ return success(); } +LogicalResult ConvertCreateCooAoSOpToGpuRuntimeCallPattern::matchAndRewrite( + gpu::CreateCooAoSOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const { + if (failed(areAllLLVMTypes(op, adaptor.getOperands(), rewriter)) || + failed(isAsyncWithOneDependency(rewriter, op))) + return failure(); + Location loc = op.getLoc(); + auto stream = adaptor.getAsyncDependencies().front(); + Value pIdxs = MemRefDescriptor(adaptor.getIdxs()).allocatedPtr(rewriter, loc); + Value pValues = + MemRefDescriptor(adaptor.getValues()).allocatedPtr(rewriter, loc); + if (!getTypeConverter()->useOpaquePointers()) { + pIdxs = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pIdxs); + pValues = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pValues); + } + Type iType = + llvm::cast<MemRefType>(op.getIdxs().getType()).getElementType(); + Type dType = + llvm::cast<MemRefType>(op.getValues().getType()).getElementType(); + auto itp = genConstInt32From(rewriter, loc, getCuSparseIndexTypeFrom(iType)); + auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); + auto handle = + createCooAoSCallBuilder + .create(loc, rewriter, + {adaptor.getRows(), adaptor.getCols(), adaptor.getNnz(), + pIdxs, pValues, itp, dtp, stream}) + .getResult(); + rewriter.replaceOp(op, {handle, stream}); + return success(); +} + LogicalResult ConvertCreateCsrOpToGpuRuntimeCallPattern::matchAndRewrite( gpu::CreateCsrOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const { @@ -1645,6 +1694,7 @@ 
ConvertCreateDnMatOpToGpuRuntimeCallPattern, ConvertDestroyDnMatOpToGpuRuntimeCallPattern, ConvertCreateCooOpToGpuRuntimeCallPattern, + ConvertCreateCooAoSOpToGpuRuntimeCallPattern, ConvertCreateCsrOpToGpuRuntimeCallPattern, ConvertDestroySpMatOpToGpuRuntimeCallPattern, ConvertSpMVBufferSizeOpToGpuRuntimeCallPattern, diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -355,7 +355,11 @@ return false; if (isAdmissibleCOO(aTp)) { isCOO = true; - return enableRT; // TODO: CreateCooAoSOp was deprecated, find another way +#ifdef CUSPARSE_COO_AOS + return true; +#else + return enableRT; +#endif } return isAdmissibleCSR(aTp); } @@ -393,7 +397,13 @@ return builder.create<gpu::CreateCooOp>(loc, handleTp, tokenTp, token, sz1, sz2, nseA, rowA, colA, valA); } +#ifdef CUSPARSE_COO_AOS + assert(!colA); + return builder.create<gpu::CreateCooAoSOp>(loc, handleTp, tokenTp, token, + sz1, sz2, nseA, rowA, valA); +#else llvm_unreachable("gpu::CreateCooAoSOp is deprecated"); +#endif } assert(colA); return builder.create<gpu::CreateCsrOp>(loc, handleTp, tokenTp, token, sz1, diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -312,6 +312,19 @@ return reinterpret_cast<void *>(mat); } +#ifdef CUSPARSE_COO_AOS // deprecated in cuSPARSE 11.2 +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +mgpuCreateCooAoS(intptr_t rows, intptr_t cols, intptr_t nnz, void *idxs, + void *values, int32_t itp, int32_t dtp, CUstream /*stream*/) { + cusparseSpMatDescr_t mat = nullptr; + auto iTp = static_cast<cusparseIndexType_t>(itp); + auto dTp = static_cast<cudaDataType_t>(dtp); + CUSPARSE_REPORT_IF_ERROR(cusparseCreateCooAoS( + &mat, rows, cols, nnz, idxs, values, iTp, CUSPARSE_INDEX_BASE_ZERO, dTp)) + return 
reinterpret_cast<void *>(mat); +} +#endif // CUSPARSE_COO_AOS + extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * mgpuCreateCsr(intptr_t rows, intptr_t cols, intptr_t nnz, void *rowPos, void *colIdxs, void *values, int32_t ptp, int32_t itp,