diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1813,6 +1813,30 @@
   }];
 }
 
+// The values are hardcoded to avoid coupling this dialect with cusparse.h.
+// They must be kept in sync with cusparseOperation_t in cusparse.h:
+// typedef enum {
+//   CUSPARSE_OPERATION_NON_TRANSPOSE = 0,
+//   CUSPARSE_OPERATION_TRANSPOSE = 1,
+//   CUSPARSE_OPERATION_CONJUGATE_TRANSPOSE = 2
+// } cusparseOperation_t;
+// TODO: find a proper way to keep them in sync.
+def GPU_TransposeMode : I32EnumAttr<"TransposeMode",
+    "transpose mode of sparse matrix supported by sparse tensor ops",
+    [
+      I32EnumAttrCase<"NON_TRANSPOSE", 0>,
+      I32EnumAttrCase<"TRANSPOSE", 1>,
+      I32EnumAttrCase<"CONJUGATE_TRANSPOSE", 2>,
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = GPU_Dialect.cppNamespace;
+}
+
+def GPU_TransposeModeAttr : EnumAttr<GPU_Dialect, GPU_TransposeMode,
+                                     "mat_transpose_mode">{
+  let defaultValue = "TransposeMode::NON_TRANSPOSE";
+}
+
 def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
   let summary = "Precompute buffersize for SpMV operation";
   let description = [{
@@ -1825,14 +1849,19 @@
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+
     Example:
 
     ```mlir
-    %buffersz, %token = gpu.spmv_buffersize async [%dep] %env, %spmatA, %dnX, %dnY
+    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnX, %dnY
     ```
   }];
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       GPU_TransposeModeAttr:$modeA,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnVecHandle:$dnX,
                        GPU_SparseDnVecHandle:$dnY);
@@ -1841,7 +1870,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnX `,` $dnY attr-dict
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict
   }];
 }
 
@@ -1857,14 +1886,19 @@
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+
     Example:
 
     ```mlir
-    %token = gpu.spmv async [%dep] %env, %spmatA, %dnX, %dnY : memref<?xf64>
+    %token = gpu.spmv async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnX, %dnY : memref<?xf64>
     ```
   }];
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       GPU_TransposeModeAttr:$modeA,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnVecHandle:$dnX,
                        GPU_SparseDnVecHandle:$dnY,
@@ -1873,7 +1907,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer)
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer)
   }];
 }
 
@@ -1889,15 +1923,21 @@
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+
     Example:
 
     ```mlir
-    %buffersz, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA, %spmatB, %spmatC
+    %buffersz, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnmatB{NON_TRANSPOSE}, %dnmatC
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC);
@@ -1906,7 +1946,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnmatB `,` $dnmatC attr-dict
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict
   }];
 }
 
@@ -1922,15 +1962,21 @@
     it does not block until the execution has finished on the device). In
     that case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
+    is NON_TRANSPOSE.
+
     Example:
 
     ```mlir
-    %token = gpu.spmm async [%dep] %env, %spmatA, %spmatB, %spmatC, %buffer
+    %token = gpu.spmm async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnmatB{NON_TRANSPOSE}, %dnmatC, %buffer
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       GPU_TransposeModeAttr:$modeA,
+                       GPU_TransposeModeAttr:$modeB,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC,
@@ -1939,7 +1985,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnmatB `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
+    $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
   }];
 }
 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -237,23 +237,26 @@
   FunctionCallBuilder spMVBufferSizeCallBuilder = {
       "mgpuSpMVBufferSize",
       llvmIntPtrType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType,
+       llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
   FunctionCallBuilder spMVCallBuilder = {
       "mgpuSpMV",
       llvmVoidType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType,
+       llvmPointerType, llvmInt32Type, llvmPointerType,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder spMMBufferSizeCallBuilder = {
       "mgpuSpMMBufferSize",
       llvmIntPtrType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
+       llvmPointerType, llvmPointerType, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder spMMCallBuilder = {
       "mgpuSpMM",
       llvmVoidType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
+       llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType,
+       llvmPointerType /* void *stream */}};
 };
 
 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -1196,6 +1199,15 @@
   llvm_unreachable("cannot find spmat def");
 }
 
+/// Builds an i32 LLVM constant holding the integer value of `mode`.
+static LLVM::ConstantOp genConstFrom(ConversionPatternRewriter &rewriter,
+                                     Location loc, gpu::TransposeMode mode) {
+  MLIRContext *context = rewriter.getContext();
+  Type llvmInt32Type = IntegerType::get(context, 32);
+  return rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
+                                           static_cast<int32_t>(mode));
+}
+
 LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1389,6 +1401,7 @@
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
+  auto modeA = genConstFrom(rewriter, loc, adaptor.getModeA());
   Type dType = getSpMatElemType(op.getSpmatA());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
@@ -1396,8 +1409,8 @@
   auto bufferSize =
       spMVBufferSizeCallBuilder
           .create(loc, rewriter,
-                  {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnX(),
-                   adaptor.getDnY(), dw, stream})
+                  {adaptor.getEnv(), modeA, adaptor.getSpmatA(),
+                   adaptor.getDnX(), adaptor.getDnY(), dw, stream})
           .getResult();
   rewriter.replaceOp(op, {bufferSize, stream});
   return success();
@@ -1411,6 +1424,7 @@
     return failure();
   Location loc = op.getLoc();
   Type dType = getSpMatElemType(op.getSpmatA());
+  auto modeA = genConstFrom(rewriter, loc, adaptor.getModeA());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
   auto stream = adaptor.getAsyncDependencies().front();
@@ -1419,7 +1433,7 @@
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
   spMVCallBuilder.create(loc, rewriter,
-                         {adaptor.getEnv(), adaptor.getSpmatA(),
+                         {adaptor.getEnv(), modeA, adaptor.getSpmatA(),
                           adaptor.getDnX(), adaptor.getDnY(), dw, pBuf,
                           stream});
   rewriter.replaceOp(op, {stream});
@@ -1434,14 +1448,16 @@
     return failure();
   Location loc = op.getLoc();
   Type dType = getSpMatElemType(op.getSpmatA());
+  auto modeA = genConstFrom(rewriter, loc, adaptor.getModeA());
+  auto modeB = genConstFrom(rewriter, loc, adaptor.getModeB());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
   auto stream = adaptor.getAsyncDependencies().front();
   auto bufferSize =
       spMMBufferSizeCallBuilder
           .create(loc, rewriter,
-                  {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(),
-                   adaptor.getDnmatC(), dw, stream})
+                  {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(),
+                   adaptor.getDnmatB(), adaptor.getDnmatC(), dw, stream})
           .getResult();
   rewriter.replaceOp(op, {bufferSize, stream});
   return success();
@@ -1457,13 +1473,15 @@
   Type dType = getSpMatElemType(op.getSpmatA());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
+  auto modeA = genConstFrom(rewriter, loc, adaptor.getModeA());
+  auto modeB = genConstFrom(rewriter, loc, adaptor.getModeB());
   auto stream = adaptor.getAsyncDependencies().front();
   Value pBuf =
       MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
   spMMCallBuilder.create(loc, rewriter,
-                         {adaptor.getEnv(), adaptor.getSpmatA(),
+                         {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(),
                           adaptor.getDnmatB(), adaptor.getDnmatC(), dw, pBuf,
                           stream});
   rewriter.replaceOp(op, {stream});
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -447,6 +447,7 @@
   auto env =
       rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
   Value handle = env.getResult(0);
+  auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
   token = env.getAsyncToken();
   Operation *spGenA =
       genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA,
@@ -464,7 +465,7 @@
 
   // Precompute buffersize for SpMV.
   auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY);
+      loc, indexTp, tokenTp, token, handle, modeA, spMatA, dnX, dnY);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
@@ -473,7 +474,7 @@
 
   // Perform the SpMV.
   auto spmvComp = rewriter.create<gpu::SpMVOp>(loc, tokenTp, token, handle,
-                                               spMatA, dnX, dnY, buffer);
+                                               modeA, spMatA, dnX, dnY, buffer);
   token = spmvComp.getAsyncToken();
 
   // Copy data back to host and free all the resoures.
@@ -551,6 +552,8 @@
       rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
   Value handle = env.getResult(0);
   token = env.getAsyncToken();
+  auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
+  auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
   Operation *spGenA =
       genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk, nseA,
                rowA, colA, valA, isCOO, enableRT);
@@ -567,7 +570,7 @@
 
   // Precompute buffersize for SpMM.
   auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC);
+      loc, indexTp, tokenTp, token, handle, modeA, modeB, spMatA, dnB, dnC);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
@@ -575,8 +578,8 @@
   token = buf.getAsyncToken();
 
   // Perform the SpMM.
-  auto spmmComp = rewriter.create<gpu::SpMMOp>(loc, tokenTp, token, handle,
-                                               spMatA, dnB, dnC, buffer);
+  auto spmmComp = rewriter.create<gpu::SpMMOp>(
+      loc, tokenTp, token, handle, modeA, modeB, spMatA, dnB, dnC, buffer);
   token = spmmComp.getAsyncToken();
 
   // Copy data back to host and free all the resoures.
@@ -740,6 +743,7 @@
     if (numLoops == 2 && numTensors == 3 &&
         linalg::isParallelIterator(iteratorTypes[0]) &&
         linalg::isReductionIterator(iteratorTypes[1]) &&
+        // TODO: add transposed {i, j}
         maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
       return rewriteSpMV(rewriter, op, enableRT);
     }
@@ -749,6 +753,8 @@
         linalg::isParallelIterator(iteratorTypes[0]) &&
         linalg::isParallelIterator(iteratorTypes[1]) &&
         linalg::isReductionIterator(iteratorTypes[2]) &&
+        // TODO: add transposed {i, k}, {k, j}
+        // TODO: maybe add transposed {i, j} in future
         maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
       return rewriteSpMM(rewriter, op, enableRT);
     }
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -338,39 +338,46 @@
   CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
-    void *h, void *a, void *x, void *y, int32_t dw, CUstream /*stream*/) {
+// `ma`/`mb` below are cusparseOperation_t values passed as int32_t.
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t dw,
+                   CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
   cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
   size_t bufferSize = 0;
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
-      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, vecX, betap, vecY,
-      dtp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseSpMV_bufferSize(handle, modeA, alphap, matA, vecX, betap, vecY,
+                              dtp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
   return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, void *a, void *x,
-                                                   void *y, int32_t dw,
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t ma, void *a,
+                                                   void *x, void *y, int32_t dw,
                                                    void *buf,
                                                    CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
   cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, vecX,
-                   betap, vecY, dtp, CUSPARSE_SPMV_ALG_DEFAULT, buf))
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, modeA, alphap, matA, vecX,
+                                        betap, vecY, dtp,
+                                        CUSPARSE_SPMV_ALG_DEFAULT, buf))
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMMBufferSize(
-    void *h, void *a, void *b, void *c, int32_t dw, CUstream /*stream*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                   int32_t dw, CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
   cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
@@ -378,24 +385,23 @@
   ALPHABETA(dw, alpha, beta)
   size_t bufferSize = 0;
   CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
-      handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-      CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, matB, betap, matC, dtp,
+      handle, modeA, modeB, alphap, matA, matB, betap, matC, dtp,
       CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
   return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(void *h, void *a, void *b,
-                                                   void *c, int32_t dw,
-                                                   void *buf,
-                                                   CUstream /*stream*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t dw,
+         void *buf, CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
+  cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
   cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                   CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, matB, betap,
-                   matC, dtp, CUSPARSE_SPMM_ALG_DEFAULT, buf))
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, modeA, modeB, alphap, matA,
+                                        matB, betap, matC, dtp,
+                                        CUSPARSE_SPMM_ALG_DEFAULT, buf))
 }