diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
--- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
+++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td
@@ -2149,23 +2149,6 @@
   }];
 }
 
-// ALG1, ALG2, ALG3 use 3--5 to align with cusparseSpGEMMAlg_t in cusparse.h.
-def GPU_SpGEMMAlg : I32EnumAttr<"SpGEMMAlg",
-    "selected algorithm for sparse matrix SpGEMM",
-    [
-      I32EnumAttrCase<"ALG1", 3>,
-      I32EnumAttrCase<"ALG2", 4>,
-      I32EnumAttrCase<"ALG3", 5>,
-    ]> {
-  let genSpecializedAttr = 0;
-  let cppNamespace = GPU_Dialect.cppNamespace;
-  let defaultValue = "SpGEMMAlg::ALG1";
-}
-
-def GPU_SpGEMMAlgAttr : EnumAttr<GPU_Dialect, GPU_SpGEMMAlg, "spgemm_alg"> {
-  let defaultValue = GPU_SpGEMMAlg.defaultValue;
-}
-
 def GPU_SpGEMMWorkEstimationOrComputeKind : I32EnumAttr<"SpGEMMWorkEstimationOrComputeKind",
     "choose whether spgemm_work_estimation_or_compute does work estimation or compute",
     [
@@ -2195,9 +2178,8 @@
     Example:
 
     ```mlir
-    %desc, %token = gpu.spgemm_create_descr async [%dep]
+    %desc, %token = gpu.spgemm_create_descr async [%dep]
     ```
-
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies);
   let results = (outs GPU_SparseSpGEMMOpHandle:$desc,
@@ -2222,7 +2204,6 @@
     ```mlir
     %token = gpu.spgemm_destroy_descr async [%dep] %desc
     ```
-
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
@@ -2234,7 +2215,6 @@
   }];
 }
 
-
 def GPU_SpGEMMWorkEstimationOrComputeOp : GPU_Op<"spgemm_work_estimation_or_compute", [GPU_AsyncOpInterface]> {
   let summary = "SpGEMM work estimation operation";
   let description = [{
@@ -2245,7 +2225,6 @@
     construct an environment and the operands for SpGEMM.
     The buffer must have been allocated on the device.
 
-
     C' = alpha * op(A) * op(B) + beta * C
 
     If the `async` keyword is present, the op is executed asynchronously (i.e.
@@ -2264,7 +2243,6 @@
     The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
-
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
@@ -2276,7 +2254,6 @@
                        GPU_SparseSpMatHandle:$spmatC,
                        TypeAttr:$computeType,
                        Index:$bufferSz,
-                       GPU_SpGEMMAlgAttr:$alg,
                        AnyMemRef:$buffer,
                        GPU_SpGEMMWorkEstimationOrComputeKindAttr:$kind);
   let results = (outs Res<Index>:$bufferSzNew,
@@ -2295,19 +2272,17 @@
                    "Value":$buffer), [{
       auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
       auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-      auto alg = gpu::SpGEMMAlg::ALG1;
       auto kind = gpu::SpGEMMWorkEstimationOrComputeKind::WORK_ESTIMATION;
       return build($_builder, $_state, bufferSzNew, asyncToken, asyncDependencies, desc,
-                   modeA, modeB, spmatA, spmatB, spmatC, computeType, bufferSz, alg, buffer, kind);}]>
+                   modeA, modeB, spmatA, spmatB, spmatC, computeType, bufferSz, buffer, kind);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    `{` $kind `}` $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $alg `,` $desc `,` $bufferSz `,` $buffer attr-dict `:` $computeType `into` type($buffer)
+    `{` $kind `}` $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc `,` $bufferSz `,` $buffer attr-dict `:` $computeType `into` type($buffer)
   }];
 }
 
-
 def GPU_SpGEMMEstimateMemoryOp : GPU_Op<"spgemm_estimate_memory", [GPU_AsyncOpInterface]> {
   let summary = "SpGEMM estimate memory operation";
   let description = [{
@@ -2323,7 +2298,6 @@
     ```mlir
     %bufferSz3, %dummy, %token = gpu.spgemm_estimate_memory async [%dep] %spmatA, %spmatB, %spmatC, ALG2, %spgemmDesc, %c0, %c0, %alloc: f32 into memref<0xi8>
    ```
-
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
@@ -2334,7 +2308,6 @@
                        GPU_SparseSpMatHandle:$spmatB,
                        GPU_SparseSpMatHandle:$spmatC,
                        TypeAttr:$computeType,
-                       GPU_SpGEMMAlgAttr:$alg,
                        Index:$bufferSz3,
                        AnyMemRef:$buffer3,
                        Index:$bufferSz2);
@@ -2357,19 +2330,17 @@
                    "Value":$bufferSz2), [{
      auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
       auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-      auto alg = gpu::SpGEMMAlg::ALG1;
       return build($_builder, $_state, bufferSz3New, bufferSz2New, asyncToken,
                    asyncDependencies, desc, modeA, modeB, spmatA, spmatB, spmatC,
-                   computeType, alg, bufferSz3, buffer3, bufferSz2);}]>
+                   computeType, bufferSz3, buffer3, bufferSz2);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $alg `,` $desc `,` $bufferSz3 `,` $bufferSz2 `,` $buffer3 attr-dict `:` $computeType `into` type($buffer3)
+    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc `,` $bufferSz3 `,` $bufferSz2 `,` $buffer3 attr-dict `:` $computeType `into` type($buffer3)
   }];
 }
 
-
 def GPU_SpGEMMCopyOp : GPU_Op<"spgemm_copy", [GPU_AsyncOpInterface]> {
   let summary = "SpGEMM copy operation";
   let description = [{
@@ -2389,7 +2360,6 @@
     The matrix arguments can also be associated with one of the following
     operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
     is NON_TRANSPOSE.
-
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
@@ -2399,8 +2369,7 @@
                        GPU_SparseSpMatHandle:$spmatA,
                        GPU_SparseSpMatHandle:$spmatB,
                        GPU_SparseSpMatHandle:$spmatC,
-                       TypeAttr:$computeType,
-                       GPU_SpGEMMAlgAttr:$alg);
+                       TypeAttr:$computeType);
   let results = (outs Optional<GPU_AsyncToken>:$asyncToken);
   let builders = [OpBuilder<(ins
@@ -2413,18 +2382,16 @@
                    "Type":$computeType), [{
       auto modeA = gpu::TransposeMode::NON_TRANSPOSE;
       auto modeB = gpu::TransposeMode::NON_TRANSPOSE;
-      auto alg = gpu::SpGEMMAlg::ALG1;
       return build($_builder, $_state, asyncToken, asyncDependencies, desc,
-                   modeA, modeB, spmatA, spmatB, spmatC, computeType, alg);}]>
+                   modeA, modeB, spmatA, spmatB, spmatC, computeType);}]>
   ];
 
   let assemblyFormat = [{
     custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
-    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $alg `,` $desc attr-dict `:` $computeType
+    $spmatA (`{` $modeA^ `}`)? `,` $spmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $desc attr-dict `:` $computeType
   }];
 }
 
-
 def GPU_SpGEMMGetSizeOp : GPU_Op<"spgemm_get_size", [GPU_AsyncOpInterface]> {
   let summary = "SpGEMM get size operation";
   let description = [{
@@ -2440,11 +2407,6 @@
     ```mlir
     %rows, %cols, %nnz, %token = gpu.spgemm_get_size async [%dep] %spmatC
     ```
-
-    The matrix arguments can also be associated with one of the following
-    operators: NON_TRANSPOSE, TRANSPOSE, CONJUGATE_TRANSPOSE. The default value
-    is NON_TRANSPOSE.
-
   }];
 
   let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies,
diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
--- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
+++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp
@@ -300,32 +300,30 @@
       llvmIntPtrType,
       {llvmPointerType /*s*/, llvmInt32Type /*ma*/, llvmInt32Type /*mb*/,
        llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
-       llvmInt32Type /*ctp*/, llvmInt32Type /*alg*/, llvmIntPtrType /*bs*/,
-       llvmPointerType /*buf*/, llvmPointerType /*void *stream*/}};
+       llvmInt32Type /*ctp*/, llvmIntPtrType /*bs*/, llvmPointerType /*buf*/,
+       llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMEstimateMemoryBuilder = {
       "mgpuSpGEMMEstimateMemory",
       llvmVoidType,
      {llvmPointerType /*nbs3*/, llvmPointerType /*nbs2*/,
        llvmPointerType /*s*/, llvmInt32Type /*ma*/, llvmInt32Type /*mb*/,
        llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
-       llvmInt32Type /*ctp*/, llvmInt32Type /*alg*/,
-       llvmFloat32Type /*chunk_fraction*/, llvmIntPtrType /*bs3*/,
-       llvmPointerType /*buf3*/, llvmIntPtrType /*bs2*/,
+       llvmInt32Type /*ctp*/, llvmFloat32Type /*chunk_fraction*/,
+       llvmIntPtrType /*bs3*/, llvmPointerType /*buf3*/, llvmIntPtrType /*bs2*/,
        llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMComputeBuilder = {
       "mgpuSpGEMMCompute",
       llvmIntPtrType,
       {llvmPointerType /*s*/, llvmInt32Type /*ma*/, llvmInt32Type /*mb*/,
        llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
-       llvmInt32Type /*ctp*/, llvmInt32Type /*alg*/, llvmIntPtrType /*bs*/,
-       llvmPointerType /*buf*/, llvmPointerType /*void *stream*/}};
+       llvmInt32Type /*ctp*/, llvmIntPtrType /*bs*/, llvmPointerType /*buf*/,
+       llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMCopyBuilder = {
      "mgpuSpGEMMCopy",
       llvmVoidType,
       {llvmPointerType /*s*/, llvmInt32Type /*ma*/, llvmInt32Type /*mb*/,
        llvmPointerType /*a*/, llvmPointerType /*b*/, llvmPointerType /*c*/,
-       llvmInt32Type /*ctp*/, llvmInt32Type /*alg*/,
-       llvmPointerType /*void *stream*/}};
+       llvmInt32Type /*ctp*/, llvmPointerType /*void *stream*/}};
   FunctionCallBuilder createSpGEMMCreateDescrBuilder = {
       "mgpuSpGEMMCreateDescr",
       llvmPointerType,
@@ -1735,7 +1733,6 @@
       rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
   auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA());
   auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB());
-  auto alg = genConstInt32From(rewriter, loc, adaptor.getAlg());
   auto stream = adaptor.getAsyncDependencies().front();
 
   Value pBuf =
@@ -1751,7 +1748,7 @@
         createSpGEMMWorkEstimationBuilder
             .create(loc, rewriter,
                     {adaptor.getDesc(), modeA, modeB, adaptor.getSpmatA(),
-                     adaptor.getSpmatB(), adaptor.getSpmatC(), computeType, alg,
+                     adaptor.getSpmatB(), adaptor.getSpmatC(), computeType,
                      adaptor.getBufferSz(), pBuf, stream})
             .getResult();
   } else {
@@ -1759,7 +1756,7 @@
         createSpGEMMComputeBuilder
             .create(loc, rewriter,
                     {adaptor.getDesc(), modeA, modeB, adaptor.getSpmatA(),
-                     adaptor.getSpmatB(), adaptor.getSpmatC(), computeType, alg,
+                     adaptor.getSpmatB(), adaptor.getSpmatC(), computeType,
                      adaptor.getBufferSz(), pBuf, stream})
             .getResult();
   }
@@ -1777,7 +1774,6 @@
   Location loc = op.getLoc();
   auto computeType = genConstInt32From(
       rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
-  auto alg = genConstInt32From(rewriter, loc, adaptor.getAlg());
   auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA());
   auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB());
   auto stream = adaptor.getAsyncDependencies().front();
@@ -1806,7 +1802,7 @@
       loc, rewriter,
       {bufferSizePtr3, bufferSizePtr2, adaptor.getDesc(), modeA, modeB,
        adaptor.getSpmatA(), adaptor.getSpmatB(), adaptor.getSpmatC(),
-       computeType, alg, chunkFraction, adaptor.getBufferSz3(), pBuf3,
+       computeType, chunkFraction, adaptor.getBufferSz3(), pBuf3,
        adaptor.getBufferSz2(), stream});
   auto bufferSize2 =
       rewriter.create<LLVM::LoadOp>(loc, llvmInt64Type, bufferSizePtr2);
@@ -1828,12 +1824,11 @@
       rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType()));
   auto modeA = genConstInt32From(rewriter, loc, adaptor.getModeA());
   auto modeB = genConstInt32From(rewriter, loc, adaptor.getModeB());
-  auto alg = genConstInt32From(rewriter, loc, adaptor.getAlg());
   auto stream = adaptor.getAsyncDependencies().front();
-  createSpGEMMCopyBuilder.create(
-      loc, rewriter,
-      {adaptor.getDesc(), modeA, modeB, adaptor.getSpmatA(),
-       adaptor.getSpmatB(), adaptor.getSpmatC(), computeType, alg, stream});
+  createSpGEMMCopyBuilder.create(loc, rewriter,
+                                 {adaptor.getDesc(), modeA, modeB,
+                                  adaptor.getSpmatA(), adaptor.getSpmatB(),
+                                  adaptor.getSpmatC(), computeType, stream});
   rewriter.replaceOp(op, {stream});
   return success();
 }
diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
--- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
+++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp
@@ -605,11 +605,10 @@
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMWorkEstimation(
     void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
-    int32_t alg, intptr_t bs, void *buf, CUstream /*stream*/) {
+    intptr_t bs, void *buf, CUstream /*stream*/) {
   cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
-  cusparseSpGEMMAlg_t algorithm = static_cast<cusparseSpGEMMAlg_t>(alg);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
   cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
@@ -619,15 +618,15 @@
 
   CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_workEstimation(
       cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
-      algorithm, spgemmDesc, &newBufferSize, buf))
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize, buf))
   return newBufferSize == 0 ? 1 : newBufferSize; // avoid zero-alloc
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuSpGEMMEstimateMemory(void *nbs3, void *nbs2, void *s, int32_t ma,
                          int32_t mb, void *a, void *b, void *c, int32_t ctp,
-                         int32_t alg, float chunk_fraction, intptr_t bs3,
-                         void *buf3, intptr_t bs2, CUstream /*stream*/) {
+                         float chunk_fraction, intptr_t bs3, void *buf3,
+                         intptr_t bs2, CUstream /*stream*/) {
   cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
@@ -640,11 +639,10 @@
   size_t *newBufferSize3 = reinterpret_cast<size_t *>(nbs3);
   *newBufferSize2 = bs2;
   *newBufferSize3 = bs3;
-  auto algorithm = static_cast<cusparseSpGEMMAlg_t>(alg);
 
   CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_estimateMemory(
       cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
-      algorithm, spgemmDesc, chunk_fraction, newBufferSize3, buf3,
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, chunk_fraction, newBufferSize3, buf3,
       newBufferSize2))
   // avoid zero-alloc
   if (*newBufferSize2 == 0) {
@@ -656,13 +654,12 @@
   return;
 }
 
-extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpGEMMCompute(
-    void *s, int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp,
-    int32_t alg, intptr_t bsz2, void *buf2, CUstream /*stream*/) {
+extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t
+mgpuSpGEMMCompute(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
+                  int32_t ctp, intptr_t bsz2, void *buf2, CUstream /*stream*/) {
   cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
-  cusparseSpGEMMAlg_t algorithm = static_cast<cusparseSpGEMMAlg_t>(alg);
   cusparseSpMatDescr_t matA = reinterpret_cast<cusparseSpMatDescr_t>(a);
   cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
   cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
@@ -671,13 +668,13 @@
   size_t newBufferSize2 = bsz2;
   CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_compute(
       cusparse_env, modeA, modeB, alphap, matA, matB, betap, matC, cTp,
-      algorithm, spgemmDesc, &newBufferSize2, buf2))
+      CUSPARSE_SPGEMM_DEFAULT, spgemmDesc, &newBufferSize2, buf2))
   return newBufferSize2 == 0 ? 1 : newBufferSize2; // avoid zero-alloc
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuSpGEMMCopy(void *s, int32_t ma, int32_t mb, void *a, void *b, void *c,
-               int32_t ctp, int32_t alg, CUstream /*stream*/) {
+               int32_t ctp, CUstream /*stream*/) {
   cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
   cusparseOperation_t modeA = static_cast<cusparseOperation_t>(ma);
   cusparseOperation_t modeB = static_cast<cusparseOperation_t>(mb);
@@ -685,17 +682,15 @@
   cusparseSpMatDescr_t matB = reinterpret_cast<cusparseSpMatDescr_t>(b);
   cusparseSpMatDescr_t matC = reinterpret_cast<cusparseSpMatDescr_t>(c);
   auto cTp = static_cast<cudaDataType_t>(ctp);
-  auto algorithm = static_cast<cusparseSpGEMMAlg_t>(alg);
   ALPHABETA(cTp, alpha, beta)
-  CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_copy(cusparse_env, modeA, modeB,
-                                               alphap, matA, matB, betap, matC,
-                                               cTp, algorithm, spgemmDesc))
+  CUSPARSE_REPORT_IF_ERROR(
+      cusparseSpGEMM_copy(cusparse_env, modeA, modeB, alphap, matA, matB, betap,
+                          matC, cTp, CUSPARSE_SPGEMM_DEFAULT, spgemmDesc))
 }
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void *
 mgpuSpGEMMCreateDescr(CUstream /*stream*/) {
-  // cusparseSpGEMMDescr_t is a pointer type
   cusparseSpGEMMDescr_t spgemmDesc = nullptr;
   CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_createDescr(&spgemmDesc))
   return reinterpret_cast<void *>(spgemmDesc);
@@ -703,7 +698,6 @@
 
 extern "C" MLIR_CUDA_WRAPPERS_EXPORT void
 mgpuSpGEMMDestroyDescr(void *s, CUstream /*stream*/) {
-  // cusparseSpGEMMDescr_t is a pointer type
   cusparseSpGEMMDescr_t spgemmDesc = reinterpret_cast<cusparseSpGEMMDescr_t>(s);
   CUSPARSE_REPORT_IF_ERROR(cusparseSpGEMM_destroyDescr(spgemmDesc))
 }
@@ -902,4 +896,3 @@
 }
 
 #endif // MLIR_ENABLE_CUDA_CUSPARSELT
-#endif // MLIR_ENABLE_CUDA_CUSPARSE
diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
--- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
+++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir
@@ -120,36 +120,36 @@
     // Used as nullptr
     %alloc = memref.alloc() : memref<0xi8>
     %c0 = arith.constant 0 : index
-    %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
+    %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
                           [%token6]{WORK_ESTIMATION}
-                          %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
-                          %spmatC, ALG2, %spgemmDesc, %c0,
+                          %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
+                          %spmatC, %spgemmDesc, %c0,
                           %alloc: f32 into memref<0xi8>
     %buf1, %token8 = gpu.alloc async [%token7] (%bufferSz1) : memref<?xi8>
-    %bufferSz1_1, %token9 = gpu.spgemm_work_estimation_or_compute async
-                            [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
-                            %spmatC, ALG2, %spgemmDesc, %bufferSz1,
+    %bufferSz1_1, %token9 = gpu.spgemm_work_estimation_or_compute async
+                            [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
+                            %spmatC, %spgemmDesc, %bufferSz1,
                             %buf1: f32 into memref<?xi8>
-    %bufferSz3, %dummy, %token10 = gpu.spgemm_estimate_memory async [%token9]
-                                   %spmatA, %spmatB, %spmatC, ALG2,
-                                   %spgemmDesc, %c0, %c0,
+    %bufferSz3, %dummy, %token10 = gpu.spgemm_estimate_memory async [%token9]
+                                   %spmatA, %spmatB, %spmatC,
+                                   %spgemmDesc, %c0, %c0,
                                    %alloc: f32 into memref<0xi8>
     %buf3, %token11 = gpu.alloc async [%token10] (%bufferSz3) : memref<?xi8>
-    %bufferSz3_2, %bufferSz2, %token12 = gpu.spgemm_estimate_memory async
+    %bufferSz3_2, %bufferSz2, %token12 = gpu.spgemm_estimate_memory async
                                          [%token11] %spmatA, %spmatB, %spmatC,
-                                         ALG2, %spgemmDesc, %bufferSz3, %c0,
+                                         %spgemmDesc, %bufferSz3, %c0,
                                          %buf3: f32 into memref<?xi8>
     %buf2, %token13 = gpu.alloc async [%token12] (%bufferSz2) : memref<?xi8>
-    %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
-                             [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
-                             ALG2, %spgemmDesc, %bufferSz2,
+    %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
+                             [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
+                             %spgemmDesc, %bufferSz2,
                              %buf2: f32 into memref<?xi8>
     %rows, %cols, %nnz, %token15 = gpu.spgemm_get_size async [%token14] %spmatC
     %mem_columns, %token16 = gpu.alloc async [%token15] (%cols) : memref<?xi32>
     %mem_values, %token17 = gpu.alloc async [%token16] (%nnz) : memref<?xf32>
     gpu.wait [%token17]
     %token18 = gpu.wait async
-    %token19 = gpu.spgemm_copy async [%token18] %spmatA, %spmatB, %spmatC, ALG2, %spgemmDesc: f32
+    %token19 = gpu.spgemm_copy async [%token18] %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
     %token20 = gpu.destroy_sp_mat async [%token19] %spmatA
     %token21 = gpu.destroy_sp_mat async [%token20] %spmatB
     %token22 = gpu.destroy_sp_mat async [%token21] %spmatC
@@ -158,5 +158,3 @@
   }
 }
 
-
-
diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
--- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
+++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir
@@ -64,19 +64,19 @@
     // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_create_descr async [%{{.*}}]
     // CHECK: %{{.*}} = memref.alloc() : memref<0xi8>
     // CHECK: %{{.*}} = arith.constant 0 : index
-    // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, ALG2, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
+    // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
     // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi8>
-    // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, ALG2, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
-    // CHECK: %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_estimate_memory async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, ALG2, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
+    // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
+    // CHECK: %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_estimate_memory async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<0xi8>
     // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi8>
-    // CHECK: %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_estimate_memory async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, ALG2, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
+    // CHECK: %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_estimate_memory async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
     // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi8>
-    // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, ALG2, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
+    // CHECK: %{{.*}}, %{{.*}} = gpu.spgemm_work_estimation_or_compute async [%{{.*}}]{{{.*}}} %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32 into memref<?xi8>
     // CHECK: %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} = gpu.spgemm_get_size async [%{{.*}}] %{{.*}}
     // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xi32>
     // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref<?xf32>
     // CHECK: gpu.wait [%{{.*}}]
-    // CHECK: gpu.spgemm_copy %{{.*}}, %{{.*}}, %{{.*}}, ALG2, %{{.*}} : f32
+    // CHECK: gpu.spgemm_copy %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : f32
     // CHECK: gpu.destroy_sp_mat %{{.*}}
     // CHECK: gpu.destroy_sp_mat %{{.*}}
     // CHECK: gpu.destroy_sp_mat %{{.*}}
@@ -92,35 +92,35 @@
     // Used as nullptr
     %alloc = memref.alloc() : memref<0xi8>
    %c0 = arith.constant 0 : index
-    %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
+    %bufferSz1, %token7 = gpu.spgemm_work_estimation_or_compute async
                           [%token6]{WORK_ESTIMATION}
-                          %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
-                          %spmatC, ALG2, %spgemmDesc, %c0,
+                          %spmatA{NON_TRANSPOSE}, %spmatB{NON_TRANSPOSE},
+                          %spmatC, %spgemmDesc, %c0,
                           %alloc: f32 into memref<0xi8>
     %buf1, %token8 = gpu.alloc async [%token7] (%bufferSz1) : memref<?xi8>
-    %bufferSz1_1, %token9 = gpu.spgemm_work_estimation_or_compute async
-                            [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
-                            %spmatC, ALG2, %spgemmDesc, %bufferSz1,
+    %bufferSz1_1, %token9 = gpu.spgemm_work_estimation_or_compute async
+                            [%token8]{WORK_ESTIMATION} %spmatA, %spmatB,
+                            %spmatC, %spgemmDesc, %bufferSz1,
                             %buf1: f32 into memref<?xi8>
-    %bufferSz3, %dummy, %token10 = gpu.spgemm_estimate_memory async [%token9]
-                                   %spmatA, %spmatB, %spmatC, ALG2,
-                                   %spgemmDesc, %c0, %c0,
+    %bufferSz3, %dummy, %token10 = gpu.spgemm_estimate_memory async [%token9]
+                                   %spmatA, %spmatB, %spmatC,
+                                   %spgemmDesc, %c0, %c0,
                                    %alloc: f32 into memref<0xi8>
     %buf3, %token11 = gpu.alloc async [%token10] (%bufferSz3) : memref<?xi8>
-    %bufferSz3_2, %bufferSz2, %token12 = gpu.spgemm_estimate_memory async
+    %bufferSz3_2, %bufferSz2, %token12 = gpu.spgemm_estimate_memory async
                                          [%token11] %spmatA, %spmatB, %spmatC,
-                                         ALG2, %spgemmDesc, %bufferSz3, %c0,
+                                         %spgemmDesc, %bufferSz3, %c0,
                                          %buf3: f32 into memref<?xi8>
     %buf2, %token13 = gpu.alloc async [%token12] (%bufferSz2) : memref<?xi8>
-    %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
-                             [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
-                             ALG2, %spgemmDesc, %bufferSz2,
+    %bufferSz2_2, %token14 = gpu.spgemm_work_estimation_or_compute async
+                             [%token13]{COMPUTE} %spmatA, %spmatB, %spmatC,
+                             %spgemmDesc, %bufferSz2,
                              %buf2: f32 into memref<?xi8>
     %rows, %cols, %nnz, %token15 = gpu.spgemm_get_size async [%token14] %spmatC
     %mem_columns, %token16 = gpu.alloc async [%token15] (%cols) : memref<?xi32>
     %mem_values, %token17 = gpu.alloc async [%token16] (%nnz) : memref<?xf32>
     gpu.wait [%token17]
-    gpu.spgemm_copy %spmatA, %spmatB, %spmatC, ALG2, %spgemmDesc: f32
+    gpu.spgemm_copy %spmatA, %spmatB, %spmatC, %spgemmDesc: f32
     gpu.destroy_sp_mat %spmatA
     gpu.destroy_sp_mat %spmatB
     gpu.destroy_sp_mat %spmatC
@@ -154,5 +154,3 @@
   }
 }
 
-
-