diff --git a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td --- a/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td +++ b/mlir/include/mlir/Dialect/GPU/IR/GPUOps.td @@ -1540,13 +1540,28 @@ // Operation on sparse matrices, called from the host // (currently lowers to cuSparse for CUDA only, no ROCM lowering). // +def GPU_RtLibMode : I32EnumAttr<"RtLibMode", + "optional GPU runtime libraries to be enabled to support sparse ops", + [ + I32EnumAttrCase<"CUSPARSE_AND_CUSPARSE_LT", 0>, + I32EnumAttrCase<"CUSPARSE", 1> + ]> { + let genSpecializedAttr = 0; + let cppNamespace = GPU_Dialect.cppNamespace; +} + +def GPU_RtLibModeAttr : EnumAttr<GPU_Dialect, GPU_RtLibMode, "rt_lib_mode"> { + let defaultValue = "RtLibMode::CUSPARSE_AND_CUSPARSE_LT"; +} def GPU_CreateSparseEnvOp : GPU_Op<"create_sparse_env", [GPU_AsyncOpInterface]> { let summary = "Create sparse environment operation"; let description = [{ The `gpu.create_sparse_env` operation initializes a sparse environment. It must be executed prior to any other sparse operation. The operation - returns a handle to the new sparse environment. + takes an `RtLibMode` attribute indicating which runtime libraries to + initialize: cuSparse alone, or both cuSparse and cuSparseLt. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device). In @@ -1555,15 +1570,15 @@ Example: ```mlir - %env, %token = gpu.create_sparse_env async [%dep] + %token = gpu.create_sparse_env async [%dep] CUSPARSE_AND_CUSPARSE_LT ``` }]; - let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies); - let results = (outs Res<GPU_SparseEnvHandle>:$env, - Optional<GPU_AsyncToken>:$asyncToken); + let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, + Arg<GPU_RtLibModeAttr>:$rtLibMode); + let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ - custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) attr-dict + custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) $rtLibMode attr-dict }]; } @@ -1573,8 +1588,8 @@ let summary = "Destroy sparse environment operation"; let description = [{ The `gpu.destroy_sparse_env` operation releases all resources of a sparse - environment represented by a handle that was previously created by a - `gpu.create_sparse_env` operation. + environment, as selected by its `RtLibMode` attribute: the cuSparse + environment alone, or both the cuSparse and cuSparseLt environments. If the `async` keyword is present, the op is executed asynchronously (i.e. it does not block until the execution has finished on the device).
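For orientation, the mode semantics introduced here can be sketched in plain C++. This is an illustrative sketch only, not code from the patch; `initCuSparse` and `initCuSparseLt` are hypothetical stand-ins for the `mgpuCreateSparseEnv` and `mgpuCreateSparseLtEnv` wrappers defined later in this patch:

```c++
#include <cstdio>

// Hypothetical stand-ins for the mgpuCreateSparseEnv / mgpuCreateSparseLtEnv
// runtime wrappers; the real ones live in CudaRuntimeWrappers.cpp.
static void initCuSparse() { std::puts("cuSPARSE initialized"); }
static void initCuSparseLt() { std::puts("cuSPARSELt initialized"); }

// Mirrors GPU_RtLibMode above: case 0 enables both libraries, case 1 enables
// cuSPARSE only. Both modes initialize plain cuSPARSE.
enum class RtLibMode { CUSPARSE_AND_CUSPARSE_LT = 0, CUSPARSE = 1 };

static void createSparseEnv(RtLibMode mode) {
  if (mode == RtLibMode::CUSPARSE_AND_CUSPARSE_LT)
    initCuSparseLt(); // extra cuSPARSELt setup for 2:4 structured sparsity
  initCuSparse();
}

int main() { createSparseEnv(RtLibMode::CUSPARSE); }
```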
In @@ -1583,17 +1598,17 @@ Example: ```mlir - %token = gpu.destroy_sparse_env async [%dep] %env + %token = gpu.destroy_sparse_env async [%dep] CUSPARSE ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - Arg<GPU_SparseEnvHandle>:$env); + Arg<GPU_RtLibModeAttr>:$rtLibMode); let results = (outs Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env attr-dict + $rtLibMode attr-dict }]; } @@ -1612,19 +1627,18 @@ Example: ```mlir - %dmat, %token = gpu.create_dn_tensor async [%dep] %env, %mem, %dims : index, index into memref + %dmat, %token = gpu.create_dn_tensor async [%dep] %mem, %dims : index, index into memref ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - GPU_SparseEnvHandle:$env, AnyMemRef:$memref, Variadic<Index>:$dims); let results = (outs Res<GPU_SparseDnTensorHandle>:$dnTensor, Optional<GPU_AsyncToken>:$asyncToken); let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env `,` $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) + $memref `,` $dims attr-dict `:` type($dims) `into` type($memref) }]; } @@ -1788,12 +1802,11 @@ Example: ```mlir - %spmat, %token = gpu.create_2to4_spmat async [%dep] %env, %rows, %cols, %mem : memref + %spmat, %token = gpu.create_2to4_spmat async [%dep] %rows, %cols, %mem : memref ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - GPU_SparseEnvHandle:$env, Index:$rows, Index:$cols, AnyMemRef:$memref); @@ -1802,7 +1815,7 @@ let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env `,` $rows `,` $cols `,` $memref attr-dict `:` type($memref) + $rows `,` $cols `,` $memref attr-dict `:` type($memref) }]; } @@ -1877,11 +1890,10 @@ Example: ```mlir - %buffersz, %token = gpu.spmv_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY into f32 + %buffersz, %token = gpu.spmv_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY into f32 ``` }]; let arguments = (ins Variadic<GPU_AsyncToken>:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1894,7 +1906,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, @@ -1902,12 +1913,12 @@ , [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, spmatA, dnX, dnY, computeType);}]> + modeA, spmatA, dnX, dnY, computeType);}]> ]; let assemblyFormat = [{ custom<AsyncDependencies>(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY attr-dict `into` $computeType + $spmatA (`{` $modeA^ `}`)? 
`,` $dnX `,` $dnY attr-dict `into` $computeType }]; } @@ -1930,11 +1941,10 @@ Example: ```mlir - %token = gpu.spmv async [%dep] %env, %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 + %token = gpu.spmv async [%dep] %spmatA{TRANSPOSE}, %dnX, %dnY : memref into bf16 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_SparseSpMatHandle:$spmatA, GPU_SparseDnTensorHandle:$dnX, @@ -1946,20 +1956,19 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnX, "Value":$dnY, "Type":$computeType, "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, spmatA, dnX, dnY, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnX `,` $dnY `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } @@ -1982,12 +1991,11 @@ Example: ```mlir - %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 + %bufferszs, %token = gpu.spmm_buffer_size async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC : i64 into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2001,7 +2009,6 @@ "Type":$bufferSzs, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2009,12 +2016,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSzs, asyncToken, asyncDependencies, - env, modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> + modeA, modeB, spmatA, dnmatB, dnmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $dnmatC attr-dict `:` type($bufferSzs) `into` $computeType }]; } @@ -2037,12 +2044,11 @@ Example: ```mlir - %token = gpu.spmm async [%dep] %env, %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 + %token = gpu.spmm async [%dep] %spmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %dnmatC, %buffers : type($buffers) into f32 ``` }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseSpMatHandle:$spmatA, @@ -2055,7 +2061,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$spmatA, "Value":$dnmatB, "Value":$dnmatC, @@ -2063,13 +2068,13 @@ "ValueRange":$buffers), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, spmatA, dnmatB, dnmatC, computeType, buffers);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType + $spmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $dnmatC `,` $buffers attr-dict `:` type($buffers) `into` $computeType }]; } @@ -2088,7 +2093,7 @@ Example: ```mlir - %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 + %buffersz, %token = gpu.sddmm_buffer_size async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC into f32 ``` The matrix arguments can also be associated with one of the following @@ -2097,7 +2102,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2110,7 +2114,6 @@ "Type":$bufferSz, "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2118,12 +2121,12 @@ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; return build($_builder, $_state, bufferSz, asyncToken, asyncDependencies, - env, modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> + modeA, modeB, dnmatA, dnmatB, spmatC, computeType);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC attr-dict `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? 
`,` $spmatC attr-dict `into` $computeType }]; } @@ -2142,7 +2145,7 @@ Example: ```mlir - %token = gpu.sddmm async [%dep] %env, %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 + %token = gpu.sddmm async [%dep] %dnmatA{TRANSPOSE}, %dnmatB{TRANSPOSE}, %spmatC, %buffer into f32 ``` The matrix arguments can also be associated with one of the following @@ -2151,7 +2154,6 @@ }]; let arguments = (ins Variadic:$asyncDependencies, - GPU_SparseEnvHandle:$env, GPU_TransposeModeAttr:$modeA, GPU_TransposeModeAttr:$modeB, GPU_SparseDnTensorHandle:$dnmatA, @@ -2164,7 +2166,6 @@ let builders = [OpBuilder<(ins "Type":$asyncToken, "ValueRange":$asyncDependencies, - "Value":$env, "Value":$dnmatA, "Value":$dnmatB, "Value":$spmatC, @@ -2172,13 +2173,13 @@ "Value":$buffer), [{ auto modeA = gpu::TransposeMode::NON_TRANSPOSE; auto modeB = gpu::TransposeMode::NON_TRANSPOSE; - return build($_builder, $_state, asyncToken, asyncDependencies, env, modeA, + return build($_builder, $_state, asyncToken, asyncDependencies, modeA, modeB, dnmatA, dnmatB, spmatC, computeType, buffer);}]> ]; let assemblyFormat = [{ custom(type($asyncToken), $asyncDependencies) - $env `,` $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType + $dnmatA (`{` $modeA^ `}`)? `,` $dnmatB (`{` $modeB^ `}`)? `,` $spmatC `,` $buffer attr-dict `:` type($buffer) `into` $computeType }]; } diff --git a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp --- a/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp +++ b/mlir/lib/Conversion/GPUCommon/GPUToLLVMConversion.cpp @@ -206,12 +206,12 @@ {llvmInt32Type /* uint32_t devIndex */}}; FunctionCallBuilder createSparseEnvCallBuilder = { "mgpuCreateSparseEnv", - llvmPointerType, + llvmVoidType, {llvmPointerType /* void *stream */}}; FunctionCallBuilder destroySparseEnvCallBuilder = { "mgpuDestroySparseEnv", llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; + {llvmPointerType /* void *stream */}}; FunctionCallBuilder createDnVecCallBuilder = { "mgpuCreateDnVec", llvmPointerType, @@ -255,51 +255,48 @@ FunctionCallBuilder spMVBufferSizeCallBuilder = { "mgpuSpMVBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder spMVCallBuilder = { "mgpuSpMV", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType, - llvmPointerType, llvmInt32Type, llvmPointerType, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmPointerType, llvmPointerType, llvmPointerType, + llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMBufferSizeCallBuilder = { "mgpuSpMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSpMMCallBuilder = { "mgpuSpMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, 
llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMBufferSizeCallBuilder = { "mgpuSDDMMBufferSize", llvmIntPtrType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, - llvmPointerType /* void *stream */}}; + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSDDMMCallBuilder = { "mgpuSDDMM", llvmVoidType, - {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, - llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType, + {llvmInt32Type, llvmInt32Type, llvmPointerType, llvmPointerType, + llvmPointerType, llvmInt32Type, llvmPointerType, llvmPointerType /* void *stream */}}; FunctionCallBuilder createSparseLtEnvCallBuilder = { "mgpuCreateSparseLtEnv", llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; + {llvmPointerType /* void *stream */}}; FunctionCallBuilder destroySparseLtEnvCallBuilder = { "mgpuDestroySparseLtEnv", llvmVoidType, - {llvmPointerType, llvmPointerType /* void *stream */}}; + {llvmPointerType /* void *stream */}}; FunctionCallBuilder createLtDnMatCallBuilder = { "mgpuCreateCuSparseLtDnMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder destroyCuSparseLtSpMatBuilder = { "mgpuDestroyCuSparseLtSpMat", llvmVoidType, @@ -311,20 +308,19 @@ FunctionCallBuilder create2To4SpMatCallBuilder = { "mgpuCusparseLtCreate2To4SpMat", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmIntPtrType, llvmIntPtrType, - llvmPointerType, llvmInt32Type, llvmPointerType /* void *stream */}}; + {llvmPointerType, llvmIntPtrType, llvmIntPtrType, llvmPointerType, + llvmInt32Type, llvmPointerType /* void *stream */}}; FunctionCallBuilder createCuSparseLtSpMMBufferSizeBuilder = { "mgpuCuSparseLtSpMMBufferSize", llvmVoidType, - {llvmPointerType, llvmPointerType, llvmInt32Type, llvmInt32Type, - llvmPointerType, llvmPointerType, llvmPointerType, llvmInt32Type, + {llvmPointerType, llvmInt32Type, llvmInt32Type, llvmPointerType, + llvmPointerType, llvmPointerType, llvmInt32Type, llvmPointerType /*void *stream*/}}; FunctionCallBuilder createCuSparseLtSpMMBuilder = { "mgpuCuSparseLtSpMM", llvmVoidType, {llvmPointerType, llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType, llvmPointerType, llvmPointerType, - llvmPointerType /*void *stream*/}}; + llvmPointerType, llvmPointerType, llvmPointerType /*void *stream*/}}; }; /// A rewrite pattern to convert gpu.host_register operations into a GPU runtime @@ -1403,21 +1399,15 @@ auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt create call if the dnmat is used with spmat with - // 2:4 sparsity - Value handle; - if (isSpMMCusparseLtOp(op.getEnv())) { + // Initialize cuSparseLt only when the rtLibMode attribute requests it. + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT) { - // CUDA runner asserts the size is 11024 bytes.
- auto handleSz = rewriter.create<LLVM::ConstantOp>( - loc, getIndexType(), rewriter.getIndexAttr(11024)); - handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, - llvmInt8Type, handleSz); - handle = rewriter.create<LLVM::BitcastOp>(loc, llvmPointerType, handle); - createSparseLtEnvCallBuilder.create(loc, rewriter, {handle, stream}) - .getResult(); - } else { - handle = - createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + createSparseLtEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); } - rewriter.replaceOp(op, {handle, stream}); + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT || + op.getRtLibMode() == gpu::RtLibMode::CUSPARSE) { + createSparseEnvCallBuilder.create(loc, rewriter, {stream}).getResult(); + } + rewriter.replaceOp(op, {stream}); return success(); } @@ -1431,12 +1421,12 @@ auto stream = adaptor.getAsyncDependencies().front(); - // Use the cusparseLt destroy call if the dnmat is used with spmat with - // 2:4 sparsity - if (isSpMMCusparseLtOp(op.getEnv())) { - destroySparseLtEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); - } else { - destroySparseEnvCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), stream}); + // Tear down only the environments that the rtLibMode attribute enabled. + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT) { + destroySparseLtEnvCallBuilder.create(loc, rewriter, {stream}); + } + if (op.getRtLibMode() == gpu::RtLibMode::CUSPARSE_AND_CUSPARSE_LT || + op.getRtLibMode() == gpu::RtLibMode::CUSPARSE) { + destroySparseEnvCallBuilder.create(loc, rewriter, {stream}); } rewriter.replaceOp(op, {stream}); return success(); } @@ -1471,7 +1461,6 @@ // the dnmat is used with spmat with 2:4 sparsity if (dims.size() == 2) { if (isSpMMCusparseLtOp(op.getDnTensor())) { - auto envHandle = adaptor.getEnv(); auto handleSz = rewriter.create<LLVM::ConstantOp>( loc, getIndexType(), rewriter.getIndexAttr(11032)); handle = rewriter.create<LLVM::AllocaOp>(loc, llvmInt8PointerType, @@ -1480,7 +1469,7 @@ createLtDnMatCallBuilder .create(loc, rewriter, - {handle, envHandle, dims[0], dims[1], pTensor, dtp, stream}) + {handle, dims[0], dims[1], pTensor, dtp, stream}) .getResult(); } else { handle = @@ -1648,7 +1637,6 @@ Type dType = llvm::cast<MemRefType>(op.getMemref().getType()).getElementType(); auto dtp = genConstInt32From(rewriter, loc, getCuSparseDataTypeFrom(dType)); - auto envHandle = adaptor.getEnv(); // CUDA runner asserts the size is 44104 bytes.
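As an aside on the 11024/44104-byte comments above: cuSparseLt descriptors cross the generated-code boundary as raw byte buffers of a fixed size, allocated by compiled IR and zero-initialized by the runtime wrappers. Below is a minimal sketch of that contract; `SpMatHandleAndData` is a hypothetical stand-in for the real `cusparseLtSpMatHandleAndData`, and the 44104 figure is taken from the comment in this patch:

```c++
#include <cstring>

// Stand-in for cusparseLtSpMatHandleAndData: an opaque blob holding the
// cusparseLt descriptor, plan, and data pointers.
struct SpMatHandleAndData {
  unsigned char opaque[44104];
};

// Compiled IR allocas exactly this many bytes; the size is pinned at compile
// time, mirroring the static_assert checks in CudaRuntimeWrappers.cpp.
static_assert(sizeof(SpMatHandleAndData) == 44104, "descriptor size drifted");

// A wrapper receives the caller-allocated buffer and zero-initializes the
// descriptor in place, as the cuSparseLt wrappers in this patch do.
void initSpMatHandle(void *sh) {
  std::memset(sh, 0, sizeof(SpMatHandleAndData));
}

int main() {
  SpMatHandleAndData buf; // generated code performs an llvm.alloca instead
  initSpMatHandle(&buf);
}
```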
auto handleSz = rewriter.create( @@ -1659,8 +1647,7 @@ create2To4SpMatCallBuilder .create(loc, rewriter, - {handle, envHandle, adaptor.getRows(), adaptor.getCols(), pMat, - dtp, stream}) + {handle, adaptor.getRows(), adaptor.getCols(), pMat, dtp, stream}) .getResult(); rewriter.replaceOp(op, {handle, stream}); return success(); @@ -1697,12 +1684,11 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = - spMVBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, stream}) - .getResult(); + auto bufferSize = spMVBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1723,9 +1709,8 @@ if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); spMVCallBuilder.create(loc, rewriter, - {adaptor.getEnv(), modeA, adaptor.getSpmatA(), - adaptor.getDnX(), adaptor.getDnY(), computeType, pBuf, - stream}); + {modeA, adaptor.getSpmatA(), adaptor.getDnX(), + adaptor.getDnY(), computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } @@ -1750,9 +1735,8 @@ llvmInt64Type, three); createCuSparseLtSpMMBufferSizeBuilder .create(loc, rewriter, - {bufferSize, adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), - computeType, stream}) + {bufferSize, modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, stream}) .getResult(); auto bufferSizePtr1 = rewriter.create( @@ -1774,12 +1758,12 @@ } else { auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); - bufferSize = createSpMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), computeType, stream}) - .getResult(); + bufferSize = + createSpMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), adaptor.getDnmatB(), + adaptor.getDnmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); } return success(); @@ -1797,12 +1781,12 @@ auto computeType = genConstInt32From( rewriter, loc, getCuSparseDataTypeFrom(adaptor.getComputeType())); auto stream = adaptor.getAsyncDependencies().front(); - auto bufferSize = createSDDMMBufferSizeCallBuilder - .create(loc, rewriter, - {adaptor.getEnv(), modeA, modeB, - adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, stream}) - .getResult(); + auto bufferSize = + createSDDMMBufferSizeCallBuilder + .create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), + adaptor.getSpmatC(), computeType, stream}) + .getResult(); rewriter.replaceOp(op, {bufferSize, stream}); return success(); } @@ -1832,17 +1816,17 @@ } createCuSparseLtSpMMBuilder.create( loc, rewriter, - {adaptor.getEnv(), adaptor.getSpmatA(), adaptor.getDnmatB(), - adaptor.getDnmatC(), pBufs[0], pBufs[1], pBufs[2], stream}); + {adaptor.getSpmatA(), adaptor.getDnmatB(), adaptor.getDnmatC(), + pBufs[0], pBufs[1], pBufs[2], stream}); } else { Value pBuf = MemRefDescriptor(adaptor.getBuffers().front()) .allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, 
llvmPointerType, pBuf); - createSpMMCallBuilder.create( - loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getSpmatA(), - adaptor.getDnmatB(), adaptor.getDnmatC(), computeType, pBuf, stream}); + createSpMMCallBuilder.create(loc, rewriter, + {modeA, modeB, adaptor.getSpmatA(), + adaptor.getDnmatB(), adaptor.getDnmatC(), + computeType, pBuf, stream}); } rewriter.replaceOp(op, {stream}); return success(); @@ -1872,10 +1856,10 @@ MemRefDescriptor(adaptor.getBuffer()).allocatedPtr(rewriter, loc); if (!getTypeConverter()->useOpaquePointers()) pBuf = rewriter.create(loc, llvmPointerType, pBuf); - createSDDMMCallBuilder.create( - loc, rewriter, - {adaptor.getEnv(), modeA, modeB, adaptor.getDnmatA(), adaptor.getDnmatB(), - adaptor.getSpmatC(), computeType, pBuf, stream}); + createSDDMMCallBuilder.create(loc, rewriter, + {modeA, modeB, adaptor.getDnmatA(), + adaptor.getDnmatB(), adaptor.getSpmatC(), + computeType, pBuf, stream}); rewriter.replaceOp(op, {stream}); return success(); } diff --git a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp --- a/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp +++ b/mlir/lib/Dialect/SparseTensor/Transforms/SparseGPUCodegen.cpp @@ -494,14 +494,12 @@ // Create sparse environment and sparse matrix/dense vector handles. Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnTensorHandleTp = rewriter.getType(); Type spmatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); + auto env = rewriter.create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE); token = env.getAsyncToken(); Operation *spGenA = genSpMat(rewriter, loc, spmatHandleTp, tokenTp, token, szY, szX, nseA, @@ -509,11 +507,11 @@ Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dvecX = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, vecX, szX); + loc, dnTensorHandleTp, tokenTp, token, vecX, szX); Value dnX = dvecX.getResult(0); token = dvecX.getAsyncToken(); auto dvecY = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, vecY, szY); + loc, dnTensorHandleTp, tokenTp, token, vecY, szY); Value dnY = dvecY.getResult(0); token = dvecY.getAsyncToken(); @@ -521,7 +519,7 @@ // Precompute buffersize for SpMV. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, spMatA, dnX, dnY, + loc, indexTp, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); @@ -530,9 +528,8 @@ token = buf.getAsyncToken(); // Perform the SpMV. - auto spmvComp = - rewriter.create(loc, tokenTp, token, handle, spMatA, dnX, - dnY, /*computeType=*/dnYType, buffer); + auto spmvComp = rewriter.create( + loc, tokenTp, token, spMatA, dnX, dnY, /*computeType=*/dnYType, buffer); token = spmvComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -542,7 +539,9 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, dnY) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) + token = rewriter + .create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE) .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, rowA, token); if (colA) @@ -601,14 +600,12 @@ // Create sparse environment and sparse matrix/dense matrix handles. 
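With this change, creating the sparse environment no longer yields an SSA handle; the library handles live in lazily initialized static storage inside the runtime wrappers (CudaRuntimeWrappers.cpp, further below). A runnable sketch of that storage pattern, with `FakeHandle`, `fakeCreate`, and `fakeDestroy` as hypothetical stand-ins for `cusparseHandle_t`, `cusparseCreate`, and `cusparseDestroy`:

```c++
#include <cstdio>

// Hypothetical stand-ins for cusparseHandle_t / cusparseCreate / cusparseDestroy.
using FakeHandle = int;
static void fakeCreate(FakeHandle *h) { *h = 1; std::puts("handle created"); }
static void fakeDestroy(FakeHandle) { std::puts("handle destroyed"); }

// Mirrors ScopedCuSparseHandleStorage below: the handle is a class-level
// static, created the first time any instance is constructed and reused by
// later instances. (Like the patch, this sketch is not thread-safe; a
// function-local static would give thread-safe initialization.)
class ScopedHandleStorage {
public:
  static FakeHandle env;
  static bool initiated;
  ScopedHandleStorage() {
    if (!initiated) {
      fakeCreate(&env);
      initiated = true;
    }
  }
};
FakeHandle ScopedHandleStorage::env = 0;
bool ScopedHandleStorage::initiated = false;

int main() {
  ScopedHandleStorage a; // first use: creates the handle
  ScopedHandleStorage b; // later uses: reuse the same handle
  fakeDestroy(ScopedHandleStorage::env); // mgpuDestroySparseEnv analogue
  ScopedHandleStorage::initiated = false;
}
```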
Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnTensorHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); + auto env = rewriter.create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE); token = env.getAsyncToken(); Operation *spGenA = genSpMat(rewriter, loc, spMatHandleTp, tokenTp, token, szm, szk, nseA, @@ -616,12 +613,12 @@ Value spMatA = spGenA->getResult(0); token = spGenA->getResult(1); auto dmatB = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, matB, + loc, dnTensorHandleTp, tokenTp, token, matB, SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); auto dmatC = rewriter.create( - loc, dnTensorHandleTp, tokenTp, token, handle, matC, + loc, dnTensorHandleTp, tokenTp, token, matC, SmallVector{szm, szn}); Value dnC = dmatC.getResult(0); token = dmatC.getAsyncToken(); @@ -630,7 +627,7 @@ // Precompute buffersize for SpMM. auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, spMatA, dnB, dnC, + loc, indexTp, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dmatCType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); @@ -641,9 +638,8 @@ auto dnCType = llvm::cast(c.getType()).getElementType(); // Perform the SpMM. - auto spmmComp = - rewriter.create(loc, tokenTp, token, handle, spMatA, dnB, - dnC, /*computeType=*/dnCType, buffer); + auto spmmComp = rewriter.create( + loc, tokenTp, token, spMatA, dnB, dnC, /*computeType=*/dnCType, buffer); token = spmmComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -653,7 +649,9 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, dnC) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) + token = rewriter + .create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE) .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, rowA, token); if (colA) @@ -715,24 +713,20 @@ // Create sparse environment and sparse matrix/dense matrix handles. Type indexTp = rewriter.getIndexType(); - Type envHandleTp = rewriter.getType(); Type dnMatHandleTp = rewriter.getType(); Type spMatHandleTp = rewriter.getType(); Type tokenTp = rewriter.getType(); Value token = genFirstWait(rewriter, loc); - auto env = - rewriter.create(loc, envHandleTp, tokenTp, token); - Value handle = env.getResult(0); + auto env = rewriter.create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE); token = env.getAsyncToken(); auto dmatA = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, matA, - SmallVector{szm, szk}); + loc, dnMatHandleTp, tokenTp, token, matA, SmallVector{szm, szk}); Value dnA = dmatA.getResult(0); token = dmatA.getAsyncToken(); auto dmatB = rewriter.create( - loc, dnMatHandleTp, tokenTp, token, handle, matB, - SmallVector{szk, szn}); + loc, dnMatHandleTp, tokenTp, token, matB, SmallVector{szk, szn}); Value dnB = dmatB.getResult(0); token = dmatB.getAsyncToken(); @@ -745,7 +739,7 @@ auto dnCType = llvm::cast(c.getType()).getElementType(); // Precompute buffersize for SDDMM. 
auto bufferComp = rewriter.create( - loc, indexTp, tokenTp, token, handle, dnA, dnB, spMatC, dnCType); + loc, indexTp, tokenTp, token, dnA, dnB, spMatC, dnCType); Value bufferSz = bufferComp.getResult(0); token = bufferComp.getAsyncToken(); auto buf = genAllocBuffer(rewriter, loc, bufferSz, token); @@ -753,8 +747,8 @@ token = buf.getAsyncToken(); // Perform the SDDMM. - auto sddmmComp = rewriter.create( - loc, tokenTp, token, handle, dnA, dnB, spMatC, dnCType, buffer); + auto sddmmComp = rewriter.create(loc, tokenTp, token, dnA, dnB, + spMatC, dnCType, buffer); token = sddmmComp.getAsyncToken(); // Copy data back to host and free all the resoures. @@ -764,7 +758,9 @@ .getAsyncToken(); token = rewriter.create(loc, tokenTp, token, spMatC) .getAsyncToken(); - token = rewriter.create(loc, tokenTp, token, handle) + token = rewriter + .create(loc, tokenTp, token, + gpu::RtLibMode::CUSPARSE) .getAsyncToken(); token = genDeallocMemRef(rewriter, loc, buffer, token); token = genDeallocMemRef(rewriter, loc, matA, token); diff --git a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp --- a/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp +++ b/mlir/lib/ExecutionEngine/CudaRuntimeWrappers.cpp @@ -79,6 +79,49 @@ ~ScopedContext() { CUDA_REPORT_IF_ERROR(cuCtxPopCurrent(nullptr)); } }; +#ifdef MLIR_ENABLE_CUDA_CUSPARSE +// Create the cusparse handles once for the duration of the instance +class ScopedCuSparseHandleStorage { +public: + static cusparseHandle_t env; + static bool initiated; + ScopedCuSparseHandleStorage() { + // Static reference to CUDA cuSparse environment handle + if (!initiated) { + CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&env)); + initiated = true; + } + } + + ~ScopedCuSparseHandleStorage() {} +}; + +cusparseHandle_t ScopedCuSparseHandleStorage::env = nullptr; +bool ScopedCuSparseHandleStorage::initiated = false; + +#ifdef MLIR_ENABLE_CUDA_CUSPARSELT +class ScopedCuSparseLtHandleStorage { +public: + static cusparseLtHandle_t env; + static bool initiated; + ScopedCuSparseLtHandleStorage() { + // Static reference to CUDA cuSparseLt environment handle + if (!initiated) { + initiated = true; + // note that cuSparseLt still uses cusparseStatus_t + CUSPARSE_REPORT_IF_ERROR(cusparseLtInit(&env)); + } + } + + ~ScopedCuSparseLtHandleStorage() {} +}; + +cusparseLtHandle_t ScopedCuSparseLtHandleStorage::env; +bool ScopedCuSparseLtHandleStorage::initiated = false; + +#endif // MLIR_ENABLE_CUDA_CUSPARSELT +#endif // MLIR_ENABLE_CUDA_CUSPARSE + extern "C" MLIR_CUDA_WRAPPERS_EXPORT CUmodule mgpuModuleLoad(void *data) { ScopedContext scopedContext; CUmodule module = nullptr; @@ -270,17 +313,17 @@ (beta##p) = reinterpret_cast(&(beta##d)); \ } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuCreateSparseEnv(CUstream /*stream*/) { - cusparseHandle_t handle = nullptr; - CUSPARSE_REPORT_IF_ERROR(cusparseCreate(&handle)) - return reinterpret_cast(handle); + ScopedCuSparseHandleStorage hstorage; + return; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseEnv(void *h, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(handle)) +mgpuDestroySparseEnv(CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; + CUSPARSE_REPORT_IF_ERROR(cusparseDestroy(hstorage.env)) + hstorage.initiated = false; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void * @@ -359,10 +402,10 @@ CUSPARSE_REPORT_IF_ERROR(cusparseDestroySpMat(mat)) } -extern "C" 
MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSpMVBufferSize(void *h, int32_t ma, void *a, void *x, void *y, int32_t ctp, - CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t mgpuSpMVBufferSize( + int32_t ma, void *a, void *x, void *y, int32_t ctp, CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; + cusparseOperation_t modeA = static_cast(ma); cusparseSpMatDescr_t matA = reinterpret_cast(a); cusparseDnVecDescr_t vecX = reinterpret_cast(x); @@ -370,32 +413,33 @@ cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; - CUSPARSE_REPORT_IF_ERROR( - cusparseSpMV_bufferSize(handle, modeA, alphap, matA, vecX, betap, vecY, - cTp, CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV_bufferSize( + hstorage.env, modeA, alphap, matA, vecX, betap, vecY, cTp, + CUSPARSE_SPMV_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(void *h, int32_t ma, void *a, - void *x, void *y, - int32_t ctp, void *buf, +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMV(int32_t ma, void *a, void *x, + void *y, int32_t ctp, + void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseSpMatDescr_t matA = reinterpret_cast(a); cusparseDnVecDescr_t vecX = reinterpret_cast(x); cusparseDnVecDescr_t vecY = reinterpret_cast(y); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(handle, modeA, alphap, matA, vecX, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMV(hstorage.env, modeA, alphap, matA, vecX, betap, vecY, cTp, CUSPARSE_SPMV_ALG_DEFAULT, buf)) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSpMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSpMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -405,15 +449,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSpMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + hstorage.env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSpMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSpMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseSpMatDescr_t matA = reinterpret_cast(a); @@ -421,16 +466,16 @@ cusparseDnMatDescr_t matC = reinterpret_cast(c); cudaDataType_t cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSpMM(hstorage.env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SPMM_ALG_DEFAULT, buf)) } // TODO: add support to passing alpha and beta as arguments extern "C" MLIR_CUDA_WRAPPERS_EXPORT intptr_t -mgpuSDDMMBufferSize(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, +mgpuSDDMMBufferSize(int32_t ma, int32_t mb, void *a, void *b, void *c, int32_t ctp, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -440,15 +485,16 @@ ALPHABETA(cTp, alpha, beta) size_t bufferSize = 0; CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM_bufferSize( - handle, modeA, modeB, alphap, matA, matB, betap, matC, cTp, + hstorage.env, modeA, modeB, alphap, matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, &bufferSize)) return bufferSize == 0 ? 
1 : bufferSize; // avoid zero-alloc } -extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuSDDMM(void *h, int32_t ma, int32_t mb, void *a, void *b, void *c, - int32_t ctp, void *buf, CUstream /*stream*/) { - cusparseHandle_t handle = reinterpret_cast(h); +extern "C" MLIR_CUDA_WRAPPERS_EXPORT void mgpuSDDMM(int32_t ma, int32_t mb, + void *a, void *b, void *c, + int32_t ctp, void *buf, + CUstream /*stream*/) { + ScopedCuSparseHandleStorage hstorage; cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); cusparseDnMatDescr_t matA = reinterpret_cast(a); @@ -456,8 +502,8 @@ cusparseSpMatDescr_t matC = reinterpret_cast(c); auto cTp = static_cast(ctp); ALPHABETA(cTp, alpha, beta) - CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(handle, modeA, modeB, alphap, matA, - matB, betap, matC, cTp, + CUSPARSE_REPORT_IF_ERROR(cusparseSDDMM(hstorage.env, modeA, modeB, alphap, + matA, matB, betap, matC, cTp, CUSPARSE_SDDMM_ALG_DEFAULT, buf)) } @@ -488,29 +534,28 @@ static_assert(sizeof(cusparseLtDnMatHandleAndData) == 11032); extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateSparseLtEnv(void *h, CUstream /*stream*/) { - // note that cuSparseLt still uses cusparseStatus_t - CUSPARSE_REPORT_IF_ERROR( - cusparseLtInit(reinterpret_cast(h))) +mgpuCreateSparseLtEnv(CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuDestroySparseLtEnv(void *h, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); - CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(handle)) +mgpuDestroySparseLtEnv(CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; + CUSPARSE_REPORT_IF_ERROR(cusparseLtDestroy(&(hstorage.env))) } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCreateCuSparseLtDnMat(void *dh, void *h, intptr_t rows, intptr_t cols, - void *values, int32_t dtp, CUstream /*stream*/) { - auto handle = reinterpret_cast(h); +mgpuCreateCuSparseLtDnMat(void *dh, intptr_t rows, intptr_t cols, void *values, + int32_t dtp, CUstream /*stream*/) { + ScopedCuSparseLtHandleStorage hstorage; + // CusparseLt expects the descriptors to be zero-initialized. memset(dh, 0, sizeof(cusparseLtDnMatHandleAndData)); auto dnmat_handle = reinterpret_cast(dh); auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtDenseDescriptorInit( - handle, &(dnmat_handle->mat), rows, cols, /*lda=*/cols, + &(hstorage.env), &(dnmat_handle->mat), rows, cols, /*lda=*/cols, /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW)) dnmat_handle->values = values; } @@ -530,29 +575,29 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCusparseLtCreate2To4SpMat(void *sh, void *h, intptr_t rows, intptr_t cols, +mgpuCusparseLtCreate2To4SpMat(void *sh, intptr_t rows, intptr_t cols, void *values, int32_t dtp, CUstream /*stream*/) { auto spmat_handle = reinterpret_cast(sh); // CusparseLt expects the descriptors to be zero-initialized. 
memset(spmat_handle, 0, sizeof(cusparseLtSpMatHandleAndData)); spmat_handle->values = values; - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto dTp = static_cast(dtp); // assuming row-major when deciding lda CUSPARSE_REPORT_IF_ERROR(cusparseLtStructuredDescriptorInit( - handle, &(spmat_handle->mat), rows, cols, /*ld=*/cols, /*alignment=*/16, - dTp, CUSPARSE_ORDER_ROW, CUSPARSELT_SPARSITY_50_PERCENT)) + &(hstorage.env), &(spmat_handle->mat), rows, cols, /*ld=*/cols, + /*alignment=*/16, dTp, CUSPARSE_ORDER_ROW, + CUSPARSELT_SPARSITY_50_PERCENT)) } // Several things are being done in this stage, algorithm selection, planning, // and returning workspace and compressed matrices data buffer sizes. extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMMBufferSize(void *bs, void *h, int32_t ma, int32_t mb, void *a, - void *b, void *c, int32_t ctp, - CUstream /*stream*/) { +mgpuCuSparseLtSpMMBufferSize(void *bs, int32_t ma, int32_t mb, void *a, void *b, + void *c, int32_t ctp, CUstream /*stream*/) { // TODO: support more advanced settings, e.g., the input right operand is a // sparse matrix assuming matA is the sparse matrix - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); @@ -565,22 +610,25 @@ cusparseOperation_t modeA = static_cast(ma); cusparseOperation_t modeB = static_cast(mb); CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulDescriptorInit( - handle, &(matA->matmul), modeA, modeB, &(matA->mat), &(matB->mat), - &(matC->mat), &(matC->mat), cTp)) + &(hstorage.env), &(matA->matmul), modeA, modeB, &(matA->mat), + &(matB->mat), &(matC->mat), &(matC->mat), cTp)) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSelectionInit( - handle, &(matA->alg_sel), &(matA->matmul), CUSPARSELT_MATMUL_ALG_DEFAULT)) + &(hstorage.env), &(matA->alg_sel), &(matA->matmul), + CUSPARSELT_MATMUL_ALG_DEFAULT)) int alg = 0; CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulAlgSetAttribute( - handle, &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, + &(hstorage.env), &(matA->alg_sel), CUSPARSELT_MATMUL_ALG_CONFIG_ID, &alg, sizeof(alg))) CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulPlanInit( - handle, &(matA->plan), &(matA->matmul), &(matA->alg_sel))) + &(hstorage.env), &(matA->plan), &(matA->matmul), &(matA->alg_sel))) - CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmulGetWorkspace(handle, &(matA->plan), &workspace_size_)) + CUSPARSE_REPORT_IF_ERROR(cusparseLtMatmulGetWorkspace( + &(hstorage.env), &(matA->plan), &workspace_size_)) CUSPARSE_REPORT_IF_ERROR(cusparseLtSpMMACompressedSize( - handle, &(matA->plan), &compressed_size_, &compressed_buffer_size_)) + &(hstorage.env), &(matA->plan), &compressed_size_, + &compressed_buffer_size_)) + // avoid zero-alloc *workspace_size = (workspace_size_ == 0 ? 1 : workspace_size_); *compressed_size = (compressed_size_ == 0 ? 
1 : compressed_size_); @@ -589,23 +637,23 @@ } extern "C" MLIR_CUDA_WRAPPERS_EXPORT void -mgpuCuSparseLtSpMM(void *h, void *a, void *b, void *c, void *d_workspace, +mgpuCuSparseLtSpMM(void *a, void *b, void *c, void *d_workspace, void *dA_compressed, void *dA_compressedBuffer, CUstream stream) { - auto handle = reinterpret_cast(h); + ScopedCuSparseLtHandleStorage hstorage; auto matA = reinterpret_cast(a); auto matB = reinterpret_cast(b); auto matC = reinterpret_cast(c); ALPHABETA(CUDA_R_32F, alpha, beta) CUSPARSE_REPORT_IF_ERROR( - cusparseLtSpMMACompress(handle, &(matA->plan), (matA->values), + cusparseLtSpMMACompress(&(hstorage.env), &(matA->plan), (matA->values), dA_compressed, dA_compressedBuffer, stream)) // TODO: add support to multi-stream execution // Perform the matrix multiplication. D = A*B+C using C==D for now CUSPARSE_REPORT_IF_ERROR( - cusparseLtMatmul(handle, &(matA->plan), alphap, dA_compressed, + cusparseLtMatmul(&(hstorage.env), &(matA->plan), alphap, dA_compressed, matB->values, betap, matC->values, /*dD*/ matC->values, d_workspace, nullptr, 0)) diff --git a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-2to4-sparse-to-gpu-runtime-calls.mlir @@ -20,14 +20,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] - %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %env, %arg0, %arg0, %mem1: memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index,index,index into f16 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE_AND_CUSPARSE_LT + %spmat, %token4 = gpu.create_2to4_spmat async [%token3] %arg0, %arg0, %mem1: memref + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz0, %bufferSz1, %bufferSz2, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index,index,index into f16 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2, %mem2, %mem2 : memref,memref,memref into f16 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE_AND_CUSPARSE_LT gpu.wait [%token10] return } diff --git a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir --- a/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir +++ b/mlir/test/Conversion/GPUCommon/lower-sparse-to-gpu-runtime-calls.mlir @@ -20,14 +20,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 
: memref, memref, memref - %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref - %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 - %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64 + %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } @@ -50,14 +50,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64 - %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64 + %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } @@ -80,14 +80,14 @@ %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref - %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64 - %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 + %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref + %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64 + %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } diff --git a/mlir/test/Dialect/GPU/ops.mlir b/mlir/test/Dialect/GPU/ops.mlir --- a/mlir/test/Dialect/GPU/ops.mlir +++ b/mlir/test/Dialect/GPU/ops.mlir @@ -327,27 +327,27 @@ // CHECK: gpu.alloc async %mem2, %token2 = gpu.alloc async [%token1] 
(%arg0) : memref // CHECK: gpu.create_sparse_env async - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE // CHECK: gpu.create_coo async %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_csr async %spmat2, %token5 = gpu.create_csr async [%token4] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref // CHECK: gpu.create_dn_tensor async - %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %env, %mem2, %arg0 : index into memref + %dnvec, %token6 = gpu.create_dn_tensor async [%token5] %mem2, %arg0 : index into memref // CHECK: gpu.spmv_buffer_size async - %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %env, %spmat, %dnvec, %dnvec into f64 + %bufferSz, %token7 = gpu.spmv_buffer_size async [%token6] %spmat, %dnvec, %dnvec into f64 // CHECK: gpu.spmv async - %token8 = gpu.spmv async [%token7] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %token8 = gpu.spmv async [%token7] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 // CHECK: gpu.create_dn_tensor async - %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %mem2, %arg0, %arg0 : index, index into memref + %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %mem2, %arg0, %arg0 : index, index into memref // CHECK: gpu.spmm_buffer_size async - %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %env, %spmat, %dnmat, %dnmat : index into f64 + %bufferSz2, %token10 = gpu.spmm_buffer_size async [%token9] %spmat, %dnmat, %dnmat : index into f64 // CHECK: gpu.spmm async - %token11 = gpu.spmm async [%token10] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64 + %token11 = gpu.spmm async [%token10] %spmat, %dnmat, %dnmat, %mem2 : memref into f64 // CHECK: gpu.sddmm_buffer_size async - %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %env, %dnmat, %dnmat, %spmat into f64 + %bufferSz3, %token12 = gpu.sddmm_buffer_size async [%token11] %dnmat, %dnmat, %spmat into f64 // CHECK: gpu.sddmm async - %token13 = gpu.sddmm async [%token12] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64 + %token13 = gpu.sddmm async [%token12] %dnmat, %dnmat, %spmat, %mem2 : memref into f64 // CHECK: gpu.destroy_dn_tensor async %token14 = gpu.destroy_dn_tensor async [%token13] %dnmat // CHECK: gpu.destroy_sp_mat async @@ -355,7 +355,7 @@ // CHECK: gpu.destroy_dn_tensor async %token16 = gpu.destroy_dn_tensor async [%token15] %dnvec // CHECK: gpu.destroy_sparse_env async - %token17 = gpu.destroy_sparse_env async [%token16] %env + %token17 = gpu.destroy_sparse_env async [%token16] CUSPARSE // CHECK: gpu.wait gpu.wait [%token17] return diff --git a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir --- a/mlir/test/Dialect/GPU/sparse-roundtrip.mlir +++ b/mlir/test/Dialect/GPU/sparse-roundtrip.mlir @@ -6,28 +6,28 @@ // CHECK: %{{.*}} = gpu.wait async // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] + // CHECK: %{{.*}} = gpu.create_sparse_env async [%{{.*}}] CUSPARSE // CHECK: %{{.*}}, %{{.*}} = gpu.create_coo async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index into memref - // CHECK: %{{.*}}, %{{.*}} = 
gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 - // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}} : index into memref + // CHECK: %{{.*}}, %{{.*}} = gpu.spmv_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64 + // CHECK: %{{.*}} = gpu.spmv async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] CUSPARSE // CHECK: gpu.wait [%{{.*}}] // CHECK: return func.func @matvec(%arg0: index) { %token0 = gpu.wait async %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref - %env, %token3 = gpu.create_sparse_env async [%token2] + %token3 = gpu.create_sparse_env async [%token2] CUSPARSE %spmat, %token4 = gpu.create_coo async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref - %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0 : index into memref - %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %env, %spmat, %dnvec, %dnvec into f64 - %token7 = gpu.spmv async [%token6] %env, %spmat, %dnvec, %dnvec, %mem2 : memref into f64 + %dnvec, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0 : index into memref + %bufferSz, %token6 = gpu.spmv_buffer_size async [%token5] %spmat, %dnvec, %dnvec into f64 + %token7 = gpu.spmv async [%token6] %spmat, %dnvec, %dnvec, %mem2 : memref into f64 %token8 = gpu.destroy_sp_mat async [%token7] %spmat %token9 = gpu.destroy_dn_tensor async [%token8] %dnvec - %token10 = gpu.destroy_sparse_env async [%token9] %env + %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE gpu.wait [%token10] return } @@ -36,28 +36,28 @@ // CHECK: %{{.*}} = gpu.wait async // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}] + // CHECK: %{{.*}} = gpu.create_sparse_env async [%{{.*}}] CUSPARSE // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref - // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref - // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64 - // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 + // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref + // CHECK: %{{.*}}, %{{.*}} = gpu.spmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64 + // CHECK: %{{.*}} = gpu.spmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64 // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}} // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}} - // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}} + // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] CUSPARSE // CHECK: gpu.wait [%{{.*}}] // CHECK: return func.func @matmul(%arg0: index) { %token0 = gpu.wait async 
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
+    %token3 = gpu.create_sparse_env async [%token2] CUSPARSE
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref
-    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %env, %spmat, %dnmat, %dnmat : index into f64
-    %token7 = gpu.spmm async [%token6] %env, %spmat, %dnmat, %dnmat, %mem2 : memref into f64
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.spmm_buffer_size async [%token5] %spmat, %dnmat, %dnmat : index into f64
+    %token7 = gpu.spmm async [%token6] %spmat, %dnmat, %dnmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
+    %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE
     gpu.wait [%token10]
     return
   }
@@ -66,28 +66,28 @@
   // CHECK: %{{.*}} = gpu.wait async
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
   // CHECK: %{{.*}}, %{{.*}} = gpu.alloc async [%{{.*}}] (%{{.*}}) : memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_sparse_env async [%{{.*}}]
+  // CHECK: %{{.*}} = gpu.create_sparse_env async [%{{.*}}] CUSPARSE
   // CHECK: %{{.*}}, %{{.*}} = gpu.create_csr async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref, memref, memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
-  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} into f64
-  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
+  // CHECK: %{{.*}}, %{{.*}} = gpu.create_dn_tensor async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} : index, index into memref
+  // CHECK: %{{.*}}, %{{.*}} = gpu.sddmm_buffer_size async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}} into f64
+  // CHECK: %{{.*}} = gpu.sddmm async [%{{.*}}] %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref into f64
   // CHECK: %{{.*}} = gpu.destroy_sp_mat async [%{{.*}}] %{{.*}}
   // CHECK: %{{.*}} = gpu.destroy_dn_tensor async [%{{.*}}] %{{.*}}
-  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] %{{.*}}
+  // CHECK: %{{.*}} = gpu.destroy_sparse_env async [%{{.*}}] CUSPARSE
   // CHECK: gpu.wait [%{{.*}}]
   // CHECK: return
   func.func @sddmm(%arg0: index) {
     %token0 = gpu.wait async
     %mem1, %token1 = gpu.alloc async [%token0] (%arg0) : memref
     %mem2, %token2 = gpu.alloc async [%token1] (%arg0) : memref
-    %env, %token3 = gpu.create_sparse_env async [%token2]
+    %token3 = gpu.create_sparse_env async [%token2] CUSPARSE
     %spmat, %token4 = gpu.create_csr async [%token3] %arg0, %arg0, %arg0, %mem1, %mem1, %mem2 : memref, memref, memref
-    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %env, %mem2, %arg0, %arg0 : index, index into memref
-    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %env, %dnmat, %dnmat, %spmat into f64
-    %token7 = gpu.sddmm async [%token6] %env, %dnmat, %dnmat, %spmat, %mem2 : memref into f64
+    %dnmat, %token5 = gpu.create_dn_tensor async [%token4] %mem2, %arg0, %arg0 : index, index into memref
+    %bufferSz, %token6 = gpu.sddmm_buffer_size async [%token5] %dnmat, %dnmat, %spmat into f64
+    %token7 = gpu.sddmm async [%token6] %dnmat, %dnmat, %spmat, %mem2 : memref into f64
     %token8 = gpu.destroy_sp_mat async [%token7] %spmat
     %token9 = gpu.destroy_dn_tensor async [%token8] %dnmat
-    %token10 = gpu.destroy_sparse_env async [%token9] %env
+    %token10 = gpu.destroy_sparse_env async [%token9] CUSPARSE
     gpu.wait [%token10]
     return
   }
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matmul_lib.mlir
@@ -45,17 +45,17 @@
 // CHECK: %[[VAL_40:.*]] = gpu.memcpy async {{\[}}%[[VAL_39]]] %[[VAL_38]], %[[VAL_34]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_16]], %[[VAL_21]], %[[VAL_26]], %[[VAL_33]], %[[VAL_40]]]
 // CHECK: %[[VAL_41:.*]] = gpu.wait async
-// CHECK: %[[VAL_42:.*]], %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]]
+// CHECK: %[[VAL_43:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_41]]] CUSPARSE
 // CHECK: %[[VAL_44:.*]], %[[VAL_45:.*]] = gpu.create_csr async {{\[}}%[[VAL_43]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_14]], %[[VAL_19]], %[[VAL_24]] : memref, memref, memref
-// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_42]], %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref
-// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_42]], %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref
-// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
+// CHECK: %[[VAL_46:.*]], %[[VAL_47:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_45]]] %[[VAL_31]], %[[VAL_7]], %[[VAL_8]] : index, index into memref
+// CHECK: %[[VAL_48:.*]], %[[VAL_49:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_38]], %[[VAL_6]], %[[VAL_8]] : index, index into memref
+// CHECK: %[[VAL_50:.*]], %[[VAL_51:.*]] = gpu.spmm_buffer_size async {{\[}}%[[VAL_49]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]] : index
 // CHECK: %[[VAL_52:.*]], %[[VAL_53:.*]] = gpu.alloc async {{\[}}%[[VAL_51]]] (%[[VAL_50]]) : memref
-// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_42]], %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref
+// CHECK: %[[VAL_54:.*]] = gpu.spmm async {{\[}}%[[VAL_53]]] %[[VAL_44]], %[[VAL_46]], %[[VAL_48]], %[[VAL_52]] : memref
 // CHECK: %[[VAL_55:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_54]]] %[[VAL_44]]
 // CHECK: %[[VAL_56:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_55]]] %[[VAL_46]]
 // CHECK: %[[VAL_57:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_56]]] %[[VAL_48]]
-// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] %[[VAL_42]]
+// CHECK: %[[VAL_58:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_57]]] CUSPARSE
 // CHECK: %[[VAL_59:.*]] = gpu.dealloc async {{\[}}%[[VAL_58]]] %[[VAL_14]] : memref
 // CHECK: %[[VAL_60:.*]] = gpu.dealloc async {{\[}}%[[VAL_59]]] %[[VAL_19]] : memref
 // CHECK: %[[VAL_61:.*]] = gpu.dealloc async {{\[}}%[[VAL_60]]] %[[VAL_24]] : memref
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_matvec_lib.mlir
@@ -43,17 +43,17 @@
 // CHECK: %[[VAL_37:.*]] = gpu.memcpy async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_32]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_15]], %[[VAL_20]], %[[VAL_25]], %[[VAL_31]], %[[VAL_37]]]
 // CHECK: %[[VAL_38:.*]] = gpu.wait async
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]]
+// CHECK: %[[VAL_40:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_38]]] CUSPARSE
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_coo async {{\[}}%[[VAL_40]]] %[[VAL_6]], %[[VAL_7]], %[[VAL_5]], %[[VAL_13]], %[[VAL_18]], %[[VAL_23]] : memref, memref, memref
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_39:.*]], %[[VAL_29]], %[[VAL_7]] : index into memref
-// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_39:.*]], %[[VAL_35]], %[[VAL_6]] : index into memref
-// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_42]]] %[[VAL_29]], %[[VAL_7]] : index into memref
+// CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_44]]] %[[VAL_35]], %[[VAL_6]] : index into memref
+// CHECK: %[[VAL_47:.*]], %[[VAL_48:.*]] = gpu.spmv_buffer_size async {{\[}}%[[VAL_46]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]]
 // CHECK: %[[VAL_49:.*]], %[[VAL_50:.*]] = gpu.alloc async {{\[}}%[[VAL_48]]] (%[[VAL_47]]) : memref
-// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_39]], %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref
+// CHECK: %[[VAL_51:.*]] = gpu.spmv async {{\[}}%[[VAL_50]]] %[[VAL_41]], %[[VAL_43]], %[[VAL_45]], %[[VAL_49]] : memref
 // CHECK: %[[VAL_52:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_51]]] %[[VAL_41]]
 // CHECK: %[[VAL_53:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_52]]] %[[VAL_43]]
 // CHECK: %[[VAL_54:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_53]]] %[[VAL_45]]
-// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] %[[VAL_39]]
+// CHECK: %[[VAL_55:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_54]]] CUSPARSE
 // CHECK: %[[VAL_56:.*]] = gpu.dealloc async {{\[}}%[[VAL_55]]] %[[VAL_13]] : memref
 // CHECK: %[[VAL_57:.*]] = gpu.dealloc async {{\[}}%[[VAL_56]]] %[[VAL_18]] : memref
 // CHECK: %[[VAL_58:.*]] = gpu.dealloc async {{\[}}%[[VAL_57]]] %[[VAL_23]] : memref
diff --git a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
--- a/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
+++ b/mlir/test/Dialect/SparseTensor/GPU/gpu_sampled_matmul_lib.mlir
@@ -53,17 +53,17 @@
 // CHECK: %[[VAL_33:.*]] = gpu.memcpy async {{\[}}%[[VAL_32]]] %[[VAL_31]], %[[VAL_18]] : memref, memref
 // CHECK: gpu.wait {{\[}}%[[VAL_10]], %[[VAL_15]], %[[VAL_23]], %[[VAL_28]], %[[VAL_33]]]
 // CHECK: %[[VAL_34:.*]] = gpu.wait async
-// CHECK: %[[VAL_35:.*]], %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]]
-// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_35]], %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
-// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_35]], %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_36:.*]] = gpu.create_sparse_env async {{\[}}%[[VAL_34]]] CUSPARSE
+// CHECK: %[[VAL_37:.*]], %[[VAL_38:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_36]]] %[[VAL_8]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
+// CHECK: %[[VAL_39:.*]], %[[VAL_40:.*]] = gpu.create_dn_tensor async {{\[}}%[[VAL_38]]] %[[VAL_13]], %[[VAL_3]], %[[VAL_3]] : index, index into memref<8x8xf64>
 // CHECK: %[[VAL_41:.*]], %[[VAL_42:.*]] = gpu.create_csr async {{\[}}%[[VAL_40]]] %[[VAL_3]], %[[VAL_3]], %[[VAL_5]], %[[VAL_21]], %[[VAL_26]], %[[VAL_31]] : memref, memref, memref
-// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
+// CHECK: %[[VAL_43:.*]], %[[VAL_44:.*]] = gpu.sddmm_buffer_size async {{\[}}%[[VAL_42]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]] into f64
 // CHECK: %[[VAL_45:.*]], %[[VAL_46:.*]] = gpu.alloc async {{\[}}%[[VAL_44]]] (%[[VAL_43]]) : memref
-// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_35]], %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref into f64
+// CHECK: %[[VAL_47:.*]] = gpu.sddmm async {{\[}}%[[VAL_46]]] %[[VAL_37]], %[[VAL_39]], %[[VAL_41]], %[[VAL_45]] : memref into f64
 // CHECK: %[[VAL_48:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_47]]] %[[VAL_37]]
 // CHECK: %[[VAL_49:.*]] = gpu.destroy_dn_tensor async {{\[}}%[[VAL_48]]] %[[VAL_39]]
 // CHECK: %[[VAL_50:.*]] = gpu.destroy_sp_mat async {{\[}}%[[VAL_49]]] %[[VAL_41]]
-// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] %[[VAL_35]]
+// CHECK: %[[VAL_51:.*]] = gpu.destroy_sparse_env async {{\[}}%[[VAL_50]]] CUSPARSE
 // CHECK: %[[VAL_52:.*]] = gpu.dealloc async {{\[}}%[[VAL_51]]] %[[VAL_45]] : memref
 // CHECK: %[[VAL_53:.*]] = gpu.dealloc async {{\[}}%[[VAL_52]]] %[[VAL_8]] : memref<8x8xf64>
 // CHECK: %[[VAL_54:.*]] = gpu.dealloc async {{\[}}%[[VAL_53]]] %[[VAL_13]] : memref<8x8xf64>
diff --git a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
--- a/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/GPU/CUDA/sm80-lt/sparse-matmul-2-4-lib.mlir
@@ -28,18 +28,18 @@
     %token4 = gpu.memcpy async [%token3] %d_a, %a : memref<16x32xf16>, memref<16x32xf16>
     %token5 = gpu.memcpy async [%token4] %d_b, %b : memref<32x16xf16>, memref<32x16xf16>
     %token6 = gpu.memcpy async [%token5] %d_c, %c : memref<16x16xf16>, memref<16x16xf16>
-    %env, %token7 = gpu.create_sparse_env async [%token6]
-    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %env, %c16, %c32, %d_a: memref<16x32xf16>
-    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %env, %d_b, %c32, %c16: index, index into memref<32x16xf16>
-    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %env, %d_c, %c16, %c16: index, index into memref<16x16xf16>
-    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index,index into f16
+    %token7 = gpu.create_sparse_env async [%token6] CUSPARSE
+    %spmat, %token8 = gpu.create_2to4_spmat async [%token7] %c16, %c32, %d_a : memref<16x32xf16>
+    %dnmat, %token9 = gpu.create_dn_tensor async [%token8] %d_b, %c32, %c16 : index, index into memref<32x16xf16>
+    %dnmat2, %token10 = gpu.create_dn_tensor async [%token9] %d_c, %c16, %c16 : index, index into memref<16x16xf16>
+    %bufferSz0, %bufferSz1, %bufferSz2, %token11 = gpu.spmm_buffer_size async [%token10] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2 : index, index, index into f16
     %mem1, %token12 = gpu.alloc async [%token11] (%bufferSz0) : memref
     %mem2, %token13 = gpu.alloc async [%token12] (%bufferSz1) : memref
     %mem3, %token14 = gpu.alloc async [%token13] (%bufferSz2) : memref
-    %token15 = gpu.spmm async [%token14] %env, %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref,memref into f16
+    %token15 = gpu.spmm async [%token14] %spmat{NON_TRANSPOSE}, %dnmat{NON_TRANSPOSE}, %dnmat2, %mem1, %mem2, %mem3 : memref, memref, memref into f16
     %token16 = gpu.destroy_sp_mat async [%token15] %spmat
     %token17 = gpu.destroy_dn_tensor async [%token16] %dnmat
-    %token18 = gpu.destroy_sparse_env async [%token17] %env
+    %token18 = gpu.destroy_sparse_env async [%token17] CUSPARSE
     %token19 = gpu.memcpy async [%token18] %c, %d_c : memref<16x16xf16>, memref<16x16xf16>
     %token20 = gpu.dealloc async [%token19] %d_c : memref<16x16xf16>
    %token21 = gpu.dealloc async [%token20] %d_b : memref<32x16xf16>