diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -1813,6 +1813,22 @@
   }];
 }
 
+def GPU_MatTransposeOp : I32EnumAttr<"MatTransposeOp",
+    "transpose mode of sparse matrix supported by sparse tensor ops",
+    [
+      I32EnumAttrCase<"NON_TRANSPOSE", 0>,
+      I32EnumAttrCase<"TRANSPOSE", 1>,
+      I32EnumAttrCase<"CONJUGATE_TRANSPOSE", 2>
+    ]> {
+  let genSpecializedAttr = 0;
+  let cppNamespace = GPU_Dialect.cppNamespace;
+}
+
+def GPU_MatTransposeOpAttr :
+    EnumAttr<GPU_Dialect, GPU_MatTransposeOp, "mat_transpose_op">;
+def Default_GPU_MatTransposeOpAttr : DefaultValuedAttr<
+    GPU_MatTransposeOpAttr, "::mlir::gpu::MatTransposeOp::NON_TRANSPOSE">;
+
 def GPU_SpMVBufferSizeOp : GPU_Op<"spmv_buffer_size", [GPU_AsyncOpInterface]> {
   let summary = "Precompute buffersize for SpMV operation";
   let description = [{
@@ -1825,14 +1841,18 @@
     it does not block until the execution has finished on the device). In that
     case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE.
+
     Example:
 
     ```mlir
-    %buffersz, %token = gpu.spmv_buffersize async [%dep] %env, %spmatA, %dnX, %dnY
+    %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnX, %dnY
     ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       Default_GPU_MatTransposeOpAttr:$tOpA,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnVecHandle:$dnX,
                        GPU_SparseDnVecHandle:$dnY)
@@ -1841,7 +1861,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnX `,` $dnY attr-dict
+    $env `,` $spmatA `{` $tOpA `}` `,` $dnX `,` $dnY attr-dict
   }];
 }
 
@@ -1857,14 +1877,18 @@
     it does not block until the execution has finished on the device). In that
     case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE.
+
     Example:
 
    ```mlir
-    %token = gpu.spmv async [%dep] %env, %spmatA, %dnX, %dnY : memref<?xf64>
+    %token = gpu.spmv async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnX, %dnY : memref<?xf64>
    ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       Default_GPU_MatTransposeOpAttr:$tOpA,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnVecHandle:$dnX,
                        GPU_SparseDnVecHandle:$dnY,
@@ -1873,7 +1897,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer)
+    $env `,` $spmatA `{` $tOpA `}` `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer)
   }];
 }
 
@@ -1889,15 +1913,21 @@
     it does not block until the execution has finished on the device). In that
     case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE.
+
+
     Example:
 
    ```mlir
-    %buffersz, %token = gpu.spmm_buffersize async [%dep] %env, %spmatA, %spmatB, %spmatC
+    %buffersz, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnmatB{NON_TRANSPOSE}, %dnmatC
    ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       Default_GPU_MatTransposeOpAttr:$tOpA,
+                       Default_GPU_MatTransposeOpAttr:$tOpB,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC)
@@ -1906,7 +1936,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnmatB `,` $dnmatC attr-dict
+    $env `,` $spmatA `{` $tOpA `}` `,` $dnmatB `{` $tOpB `}` `,` $dnmatC attr-dict
   }];
 }
 
@@ -1922,15 +1952,20 @@
     it does not block until the execution has finished on the device). In that
     case, it returns a !gpu.async.token in addition to the environment.
 
+    The matrix arguments can also be associated with one of the following
+    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE.
+
     Example:
 
    ```mlir
-    %token = gpu.spmm async [%dep] %env, %spmatA, %spmatB, %spmatC, %buffer
+    %token = gpu.spmm async [%dep] %env, %spmatA{NON_TRANSPOSE}, %dnmatB{NON_TRANSPOSE}, %dnmatC, %buffer
    ```
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
                        GPU_SparseEnvHandle:$env,
+                       Default_GPU_MatTransposeOpAttr:$tOpA,
+                       Default_GPU_MatTransposeOpAttr:$tOpB,
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseDnMatHandle:$dnmatB,
                        GPU_SparseDnMatHandle:$dnmatC,
@@ -1939,7 +1974,7 @@
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $env `,` $spmatA `,` $dnmatB `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
+    $env `,` $spmatA `{` $tOpA `}` `,` $dnmatB `{` $tOpB `}` `,` $dnmatC `,` $buffer attr-dict `:` type($buffer)
   }];
 }
 
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -237,23 +237,26 @@
   FunctionCallBuilder spMVBufferSizeCallBuilder = {
       "mgpuSpMVBufferSize",
       llvmIntPtrType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType,
+       llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}};
   FunctionCallBuilder spMVCallBuilder = {
       "mgpuSpMV",
       llvmVoidType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType,
+       llvmPointerType, llvmInt32Type, llvmPointerType,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder spMMBufferSizeCallBuilder = {
       "mgpuSpMMBufferSize",
       llvmIntPtrType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
+       llvmPointerType, llvmPointerType, llvmInt32Type,
+       llvmPointerType /* void *stream */}};
   FunctionCallBuilder spMMCallBuilder = {
       "mgpuSpMM",
       llvmVoidType,
-      {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType,
-       llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}};
+      {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType,
+       llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType,
+       llvmPointerType /* void *stream */}};
 };
 
 /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime
@@ -1196,6 +1199,13 @@
   llvm_unreachable("cannot find spmat def");
 }
 
+static LLVM::ConstantOp
+genConstFromMatTransposeOp(ConversionPatternRewriter &rewriter, Location loc,
+                           Type int32Type, gpu::MatTransposeOp tOp) {
+  return rewriter.create<LLVM::ConstantOp>(loc, int32Type,
+                                           static_cast<int32_t>(tOp));
+}
+
 LogicalResult ConvertCreateSparseEnvOpToGpuRuntimeCallPattern::matchAndRewrite(
     gpu::CreateSparseEnvOp op, OpAdaptor adaptor,
     ConversionPatternRewriter &rewriter) const {
@@ -1389,6 +1399,8 @@
       failed(isAsyncWithOneDependency(rewriter, op)))
     return failure();
   Location loc = op.getLoc();
+  auto tOpA =
+      genConstFromMatTransposeOp(rewriter, loc, llvmInt32Type, op.getTOpA());
   Type dType = getSpMatElemType(op.getSpmatA());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
@@ -1396,8 +1408,8 @@
   auto bufferSize =
       spMVBufferSizeCallBuilder
           .create(loc, rewriter,
-                  {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnX(),
-                   adaptor.getDnY(), dw, stream})
+                  {adaptor.getEnv(), tOpA, adaptor.getSpmatA(),
+                   adaptor.getDnX(), adaptor.getDnY(), dw, stream})
           .getResult();
   rewriter.replaceOp(op, {bufferSize, stream});
   return success();
@@ -1411,6 +1423,8 @@
     return failure();
   Location loc = op.getLoc();
   Type dType = getSpMatElemType(op.getSpmatA());
+  auto tOpA = genConstFromMatTransposeOp(rewriter, loc, llvmInt32Type,
+                                         adaptor.getTOpA());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
   auto stream = adaptor.getAsyncDependencies().front();
@@ -1419,7 +1433,7 @@
   Value pBuf = MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
   spMVCallBuilder.create(loc, rewriter,
-                         {adaptor.getEnv(), adaptor.getSpmatA(),
+                         {adaptor.getEnv(), tOpA, adaptor.getSpmatA(),
                           adaptor.getDnX(), adaptor.getDnY(), dw, pBuf, stream});
   rewriter.replaceOp(op, {stream});
@@ -1434,14 +1448,18 @@
   Location loc = op.getLoc();
   Type dType = getSpMatElemType(op.getSpmatA());
+  auto tOpA = genConstFromMatTransposeOp(rewriter, loc, llvmInt32Type,
+                                         adaptor.getTOpA());
+  auto tOpB = genConstFromMatTransposeOp(rewriter, loc, llvmInt32Type,
+                                         adaptor.getTOpB());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
   auto stream = adaptor.getAsyncDependencies().front();
   auto bufferSize =
       spMMBufferSizeCallBuilder
           .create(loc, rewriter,
-                  {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(),
-                   adaptor.getDnmatC(), dw, stream})
+                  {adaptor.getEnv(), tOpA, tOpB, adaptor.getSpmatA(),
+                   adaptor.getDnmatB(), adaptor.getDnmatC(), dw, stream})
           .getResult();
   rewriter.replaceOp(op, {bufferSize, stream});
   return success();
@@ -1457,13 +1475,17 @@
   Type dType = getSpMatElemType(op.getSpmatA());
   auto dw = rewriter.create<LLVM::ConstantOp>(loc, llvmInt32Type,
                                               dType.getIntOrFloatBitWidth());
+  auto tOpA = genConstFromMatTransposeOp(rewriter, loc, llvmInt32Type,
+                                         adaptor.getTOpA());
+  auto tOpB = genConstFromMatTransposeOp(rewriter, loc, llvmInt32Type,
+                                         adaptor.getTOpB());
   auto stream = adaptor.getAsyncDependencies().front();
   Value pBuf = MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc);
   if (!getTypeConverter()->useOpaquePointers())
     pBuf = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, pBuf);
   spMMCallBuilder.create(loc, rewriter,
-                         {adaptor.getEnv(), adaptor.getSpmatA(),
+                         {adaptor.getEnv(), tOpA, tOpB, adaptor.getSpmatA(),
                           adaptor.getDnmatB(), adaptor.getDnmatC(), dw, pBuf,
                           stream});
   rewriter.replaceOp(op, {stream});
diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
--- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
+++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp
@@ -447,6 +447,7 @@
   auto env =
       rewriter.create<gpu::CreateSparseEnvOp>(loc, envHandleTp, tokenTp, token);
   Value handle = env.getResult(0);
+  auto tOpA = gpu::MatTransposeOp::NON_TRANSPOSE;
   token = env.getAsyncToken();
   Operation *spGenA =
       genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA,
@@ -464,7 +465,7 @@
 
   // Precompute buffersize for SpMV.
   auto bufferComp = rewriter.create<gpu::SpMVBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY);
+      loc, indexTp, tokenTp, token, handle, tOpA, spMatA, dnX, dnY);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
@@ -473,7 +474,7 @@
 
   // Perform the SpMV.
   auto spmvComp = rewriter.create<gpu::SpMVOp>(loc, tokenTp, token, handle,
-                                               spMatA, dnX, dnY, buffer);
+                                               tOpA, spMatA, dnX, dnY, buffer);
   token = spmvComp.getAsyncToken();
 
   // Copy data back to host and free all the resources.
@@ -567,7 +568,9 @@
 
   // Precompute buffersize for SpMM.
+  auto tOpA = gpu::MatTransposeOp::NON_TRANSPOSE;
+  auto tOpB = gpu::MatTransposeOp::NON_TRANSPOSE;
   auto bufferComp = rewriter.create<gpu::SpMMBufferSizeOp>(
-      loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC);
+      loc, indexTp, tokenTp, token, handle, tOpA, tOpB, spMatA, dnB, dnC);
   Value bufferSz = bufferComp.getResult(0);
   token = bufferComp.getAsyncToken();
   auto buf = genAllocBuffer(rewriter, loc, bufferSz, token);
@@ -575,8 +578,8 @@
   token = buf.getAsyncToken();
 
   // Perform the SpMM.
-  auto spmmComp = rewriter.create<gpu::SpMMOp>(loc, tokenTp, token, handle,
-                                               spMatA, dnB, dnC, buffer);
+  auto spmmComp = rewriter.create<gpu::SpMMOp>(
+      loc, tokenTp, token, handle, tOpA, tOpB, spMatA, dnB, dnC, buffer);
   token = spmmComp.getAsyncToken();
 
   // Copy data back to host and free all the resources.
@@ -740,6 +743,7 @@
   if (numLoops == 2 && numTensors == 3 &&
       linalg::isParallelIterator(iteratorTypes[0]) &&
       linalg::isReductionIterator(iteratorTypes[1]) &&
+      // TODO: add transposed {i, j}
      maps == infer({{i, j}, {j}, {i}}) && matchSumOfMultOfArgs(op)) {
     return rewriteSpMV(rewriter, op, enableRT);
   }
@@ -749,6 +753,8 @@
   if (numLoops == 3 && numTensors == 3 &&
       linalg::isParallelIterator(iteratorTypes[0]) &&
       linalg::isParallelIterator(iteratorTypes[1]) &&
       linalg::isReductionIterator(iteratorTypes[2]) &&
+      // TODO: add transposed {i, k}, {k, j}
+      // TODO: maybe add transposed {i, j} in the future
      maps == infer({{i, k}, {k, j}, {i, j}}) && matchSumOfMultOfArgs(op)) {
     return rewriteSpMM(rewriter, op, enableRT);
   }
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -338,64 +338,72 @@
   CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat))
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize(
-    void *h, void *a, void *x, void *y, int32_t dw, CUstream /*stream*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMVBufferSize(void *h, int32_t oa, void *a, void *x, void *y, int32_t dw,
+                   CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t opA = static_cast<cusparseOperation_t>(oa);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
   cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
   size_t bufferSize = 0;
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize(
-      handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, vecX, betap, vecY,
-      dtp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseSpMV_bufferSize(handle, opA, alphap, matA, vecX, betap, vecY, dtp,
+                              CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize))
   return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, void *a, void *x,
-                                                   void *y, int32_t dw,
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t oa, void *a,
+                                                   void *x, void *y, int32_t dw,
                                                    void *buf,
                                                    CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t opA = static_cast<cusparseOperation_t>(oa);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnVecDescr_t vecX = reinterpret_cast<cusparseDnVecDescr_t>(x);
   cusparseDnVecDescr_t vecY = reinterpret_cast<cusparseDnVecDescr_t>(y);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseSpMV(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, vecX,
-                   betap, vecY, dtp, CUSPARSE_SPMV_ALG_DEFAULT, buf))
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, opA, alphap, matA, vecX, betap,
+                                        vecY, dtp, CUSPARSE_SPMV_ALG_DEFAULT,
+                                        buf))
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMMBufferSize(
-    void *h, void *a, void *b, void *c, int32_t dw, CUstream /*stream*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpMMBufferSize(void *h, int32_t oa, int32_t ob, void *a, void *b, void *c,
+                   int32_t dw, CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t opA = static_cast<cusparseOperation_t>(oa);
+  cusparseOperation_t opB = static_cast<cusparseOperation_t>(ob);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
   cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
   size_t bufferSize = 0;
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize(
-      handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-      CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, matB, betap, matC, dtp,
-      CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseSpMM_bufferSize(handle, opA, opB, alphap, matA, matB, betap, matC,
+                              dtp, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize))
   return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(void *h, void *a, void *b,
-                                                   void *c, int32_t dw,
-                                                   void *buf,
-                                                   CUstream /*stream*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
+mgpuSpMM(void *h, int32_t oa, int32_t ob, void *a, void *b, void *c, int32_t dw,
+         void *buf, CUstream /*stream*/) {
   cusparseHandle_t handle = reinterpret_cast<cusparseHandle_t>(h);
+  cusparseOperation_t opA = static_cast<cusparseOperation_t>(oa);
+  cusparseOperation_t opB = static_cast<cusparseOperation_t>(ob);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseDnMatDescr_t matB = reinterpret_cast<cusparseDnMatDescr_t>(b);
   cusparseDnMatDescr_t matC = reinterpret_cast<cusparseDnMatDescr_t>(c);
   cudaDataType_t dtp = dataTp(dw);
   ALPHABETA(dw, alpha, beta)
-  CUSPARSE_REPORT_IF_ERROR(
-      cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE,
-                   CUSPARSE_OPERATION_NON_TRANSPOSE, alphap, matA, matB, betap,
-                   matC, dtp, CUSPARSE_SPMM_ALG_DEFAULT, buf))
+  CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, opA, opB, alphap, matA, matB,
+                                        betap, matC, dtp,
+                                        CUSPARSE_SPMM_ALG_DEFAULT, buf))
 }
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -23,8 +23,8 @@
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
     %dnvec, %token5 = gpu.create_dn_vec async [%token4] %mem2, %arg0 : memref<?xf64>
-    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec
-    %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+    %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat{NON_TRANSPOSE}, %dnvec, %dnvec
+    %token7 = gpu.spmv async [%token6] %env, %spmat{NON_TRANSPOSE}, %dnvec, %dnvec, %mem2 : memref<?xf64>
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_vec async [%token8] %dnvec
     %token10 = gpu.destroy_sparse_env async [%token9] %env
@@ -53,8 +53,8 @@
     %env, %token3 = gpu.create_sparse_env async [%token2]
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref<?xindex>, memref<?xindex>, memref<?xf64>
     %dnmat, %token5 = gpu.create_dn_mat async [%token4] %arg0, %arg0, %mem2 : memref<?xf64>
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat
-    %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat
+    %token7 = gpu.spmm async [%token6] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat, %mem2 : memref<?xf64>
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_mat async [%token8] %dnmat
     %token10 = gpu.destroy_sparse_env async [%token9] %env
diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir
--- a/mlir/test/Dialect/GPU/ops.mlir
+++ b/mlir/test/Dialect/GPU/ops.mlir
@@ -335,15 +335,15 @@
     // CHECK: gpu.create_dn_vec async
     %dnvec, %token6 = gpu.create_dn_vec async [%token5] %mem2, %arg0 : memref<?xf64>
     // CHECK: gpu.spmv_buffer_size async
-    %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec
+    %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat{NON_TRANSPOSE}, %dnvec, %dnvec
     // CHECK: gpu.spmv async
-    %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref<?xf64>
+    %token8 = gpu.spmv async [%token7] %env, %spmat{NON_TRANSPOSE}, %dnvec, %dnvec, %mem2 : memref<?xf64>
     // CHECK: gpu.create_dn_mat async
     %dnmat, %token9 = gpu.create_dn_mat async [%token8] %arg0, %arg0, %mem2 : memref<?xf64>
     // CHECK: gpu.spmm_buffer_size async
-    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat
+    %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat
     // CHECK: gpu.spmm async
-    %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref<?xf64>
+    %token11 = gpu.spmm async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat, %mem2 : memref<?xf64>
     // CHECK: gpu.destroy_dn_mat async
     %token12 = gpu.destroy_dn_mat async [%token11] %dnmat
     // CHECK: gpu.destroy_sp_mat async
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -46,12 +46,12 @@
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
 // CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
-// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_7]], %[[VAL_8]], %[[VAL_31]] : memref<?x?xf64>
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_38]] : memref<?x?xf64>
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]]
+// CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_8]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_45]]] %[[VAL_8]], %[[VAL_7]], %[[VAL_31]] : memref<?x?xf64>
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_mat async {{\[}}%[[VAL_47]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_38]] : memref<?x?xf64>
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]]{ NON_TRANSPOSE}, %[[VAL_46]]{ NON_TRANSPOSE}, %[[VAL_48]]
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref<?xi8>
-// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
+// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]]{ NON_TRANSPOSE}, %[[VAL_46]]{ NON_TRANSPOSE}, %[[VAL_48]], %[[VAL_52]] : memref<?xi8>
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_mat async {{\[}}%[[VAL_56]]] %[[VAL_48]]
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -47,9 +47,9 @@
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref<?xindex>, memref<?xindex>, memref<?xf64>
 // CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : memref<?xf64>
 // CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_vec async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : memref<?xf64>
-// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
+// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]]{ NON_TRANSPOSE}, %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref<?xi8>
-// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
+// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]]{ NON_TRANSPOSE}, %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref<?xi8>
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_vec async {{\[}}%[[VAL_53]]] %[[VAL_45]]
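To make the new surface syntax concrete, the sketch below strings the pieces together for a transposed SpMV, y = A^T x, the one mode the tests above do not exercise. It is illustrative only: all SSA names, sizes, and buffers are made up and not taken from this patch, though the op forms follow the assembly formats defined above. The `{TRANSPOSE}` operator is forwarded by the runtime wrappers to cuSPARSE as `CUSPARSE_OPERATION_TRANSPOSE`, so no explicit transpose of A is ever materialized.

```mlir
// Hypothetical IR: %rows/%cols/%nnz and the device-side CSR buffers
// (%rowPos, %colIdx, %values) are assumed to exist already.
%env, %t1 = gpu.create_sparse_env async [%t0]
%spmatA, %t2 = gpu.create_csr async [%t1] %rows, %cols, %nnz, %rowPos, %colIdx, %values
    : memref<?xindex>, memref<?xindex>, memref<?xf64>
// For y = A^T x, the input vector has length %rows and the output %cols.
%dnX, %t3 = gpu.create_dn_vec async [%t2] %x, %rows : memref<?xf64>
%dnY, %t4 = gpu.create_dn_vec async [%t3] %y, %cols : memref<?xf64>
// The buffer-size query and the SpMV itself must agree on the transpose
// mode, since cuSPARSE sizes the workspace for that specific operation.
%bufSz, %t5 = gpu.spmv_buffer_size async [%t4] %env, %spmatA{TRANSPOSE}, %dnX, %dnY
%buf, %t6 = gpu.alloc async [%t5] (%bufSz) : memref<?xi8>
%t7 = gpu.spmv async [%t6] %env, %spmatA{TRANSPOSE}, %dnX, %dnY, %buf : memref<?xi8>
```

Note that the SparseGPUCodegen rewrites above still hard-code NON_TRANSPOSE; the TODOs on the affine-map matching mark where transposed access patterns would select TRANSPOSE instead.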